131 lines
4.4 KiB
Plaintext
131 lines
4.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "plaintext"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.vectorstores import Chroma\n",
|
|
"from langchain.docstore.document import Document\n",
|
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
|
"from pathlib import Path\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
"# Folder holding the test-step CSV exports (searched recursively)\n",
"input_dir = Path(\"failures_ds_csv\")  # Replace with your actual CSV folder name\n",
"\n",
"# Step 1: Load all .csv files recursively and convert each one to a Document\n",
"documents = []\n",
"doc_ids = []  # one stable id per Document, kept parallel to `documents`\n",
"\n",
"# rglob yields paths in arbitrary order; sorting keeps the load order reproducible\n",
"for csv_path in sorted(input_dir.rglob(\"*.csv\")):\n",
"    df = pd.read_csv(csv_path)\n",
"\n",
"    if \"Step\" in df.columns and \"Description\" in df.columns:\n",
"        # Preferred layout: one \"Step N: description\" line per row that has a description\n",
"        steps = [\n",
"            f\"Step {step}: {description}\"\n",
"            for step, description in zip(df[\"Step\"], df[\"Description\"])\n",
"            if pd.notna(description)\n",
"        ]\n",
"    else:\n",
"        # Fallback: join the cells of every row, skipping missing values so the\n",
"        # literal text \"nan\" is not embedded, and dropping rows left empty\n",
"        row_texts = (\n",
"            \" \".join(str(cell) for cell in row if pd.notna(cell))\n",
"            for _, row in df.iterrows()\n",
"        )\n",
"        steps = [text for text in row_texts if text]\n",
"\n",
"    content = \"\\n\".join(steps).strip()\n",
"\n",
"    # Files that produce no text are skipped entirely\n",
"    if content:\n",
"        source = str(csv_path.relative_to(input_dir))\n",
"        documents.append(Document(page_content=content, metadata={\"source\": source}))\n",
"        doc_ids.append(source)\n",
"\n",
"print(f\"✅ Loaded {len(documents)} CSV-based test documents.\")\n",
"\n",
"# Fail with a clear message instead of handing an empty list to the vector store\n",
"if not documents:\n",
"    raise ValueError(f\"No usable CSV content found under {input_dir}\")\n",
"\n",
"# Step 2: Load the embedding model\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"\n",
"# Step 3: Create Chroma vectorstore (skip chunking)\n",
"# The relative CSV path is passed as the id of each document so that a re-run\n",
"# addresses the same entries instead of generating fresh random ids for the same\n",
"# files (NOTE(review): relies on the installed Chroma wrapper upserting by id - confirm).\n",
"db_path = \"chroma_test_step_vectors\"\n",
"vectorstore = Chroma.from_documents(\n",
"    documents=documents,\n",
"    embedding=embedding_model,\n",
"    ids=doc_ids,\n",
"    persist_directory=db_path,\n",
")\n",
"vectorstore.persist()\n",
"\n",
"print(f\"✅ Vectorstore created with {vectorstore._collection.count()} test cases at {db_path}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"## Visualize the stored embeddings as a 2D scatter plot (t-SNE projection)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "plaintext"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Step 1: Load the Chroma DB\n",
|
|
"from langchain.vectorstores import Chroma\n",
|
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
|
"from sklearn.manifold import TSNE\n",
|
|
"import plotly.express as px\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
"# Re-open the persisted collection with the same embedding model that built it\n",
"persist_path = \"chroma_test_step_vectors\"\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n",
"\n",
"# Request the embeddings explicitly, together with metadata and document text\n",
"result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])\n",
"all_docs = result['documents']\n",
"all_metas = result['metadatas']\n",
"all_embeddings = result['embeddings']\n",
"\n",
"# Convert to a numpy array and report its shape\n",
"X = np.array(all_embeddings)\n",
"print(\"Shape of X:\", X.shape)\n",
"\n",
"# The perplexity below is n_samples - 1 for small collections, which is 0 or\n",
"# negative when fewer than two vectors are stored; stop early with a clear error\n",
"n_samples = X.shape[0] if X.ndim == 2 else 0\n",
"if n_samples < 2:\n",
"    raise ValueError(\n",
"        f\"t-SNE needs at least 2 embeddings, found {n_samples} in {persist_path!r}\"\n",
"    )\n",
"\n",
"# Keep perplexity below the number of samples; the fixed seed makes the layout repeatable\n",
"X_2d = TSNE(n_components=2, perplexity=min(30, n_samples - 1), random_state=42).fit_transform(X)\n",
|
|
"\n",
|
|
"# Prepare Plotly data\n",
|
|
"from pathlib import Path\n",
|
"def extract_test_id(path_str):\n",
"    \"\"\"Return the file name of path_str without its directory or final extension.\"\"\"\n",
"    source_path = Path(path_str)\n",
"    return source_path.stem\n",
|
|
"\n",
|
"# Label for every point: the CSV file name stored in that document's metadata\n",
"sources = [extract_test_id(entry['source']) for entry in all_metas]\n",
"\n",
"# Hover text: the first 200 characters of each document\n",
"texts = [body[:200] for body in all_docs]\n",
"\n",
"# Column-oriented data for Plotly: 2D coordinates plus label and preview\n",
"df_data = dict(\n",
"    x=X_2d[:, 0],\n",
"    y=X_2d[:, 1],\n",
"    source=sources,\n",
"    preview=texts,\n",
")\n",
"\n",
"# Scatter plot coloured by source file, with the preview shown on hover\n",
"fig = px.scatter(df_data, x=\"x\", y=\"y\", color=\"source\", hover_data=[\"preview\"])\n",
"layout_options = {\"title\": \"2D Visualization of Chroma Embeddings\", \"width\": 1000, \"height\": 700}\n",
"fig.update_layout(**layout_options)\n",
"fig.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|