Files
2025-06-21 20:26:48 +03:00

131 lines
4.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"\n",
"# Path to your test step CSVs\n",
"input_dir = Path(\"failures_ds_csv\")  # Replace with your actual CSV folder name\n",
"\n",
"\n",
"def _frame_to_text(df):\n",
"    \"\"\"Flatten one test-case DataFrame into newline-separated text.\n",
"\n",
"    A frame with both 'Step' and 'Description' columns yields one\n",
"    'Step <n>: <description>' line per row whose description is present.\n",
"    Any other frame falls back to joining each row's non-missing cells\n",
"    with single spaces.\n",
"    \"\"\"\n",
"    if \"Step\" in df.columns and \"Description\" in df.columns:\n",
"        # zip over the two columns instead of iterrows(): no per-row Series is built\n",
"        # and the Step values are not upcast to a common row dtype.\n",
"        lines = [\n",
"            f\"Step {step}: {description}\"\n",
"            for step, description in zip(df[\"Step\"], df[\"Description\"])\n",
"            if pd.notna(description)\n",
"        ]\n",
"    else:\n",
"        # fallback: join all cells of each row, skipping missing ones so the\n",
"        # literal text \"nan\" is never embedded\n",
"        lines = [\n",
"            \" \".join(str(cell) for cell in row if pd.notna(cell))\n",
"            for row in df.itertuples(index=False, name=None)\n",
"        ]\n",
"    # Drop rows that ended up empty, then trim the block as a whole.\n",
"    return \"\\n\".join(line for line in lines if line).strip()\n",
"\n",
"\n",
"# Fail early: otherwise a wrong folder name silently produces zero documents.\n",
"if not input_dir.is_dir():\n",
"    raise FileNotFoundError(f\"CSV folder not found: {input_dir.resolve()}\")\n",
"\n",
"# Step 1: Load all .csv files recursively and convert to Documents\n",
"documents = []\n",
"\n",
"# sorted() keeps the document order stable across runs and file systems.\n",
"for csv_path in sorted(input_dir.rglob(\"*.csv\")):\n",
"    try:\n",
"        df = pd.read_csv(csv_path)\n",
"    except pd.errors.EmptyDataError:\n",
"        # A CSV with no parsable content has nothing to index; skip it\n",
"        # instead of aborting the whole load.\n",
"        print(f\"Skipping empty CSV: {csv_path}\")\n",
"        continue\n",
"\n",
"    content = _frame_to_text(df)\n",
"\n",
"    if content:\n",
"        documents.append(Document(\n",
"            page_content=content,\n",
"            metadata={\"source\": str(csv_path.relative_to(input_dir))}\n",
"        ))\n",
"\n",
"print(f\"✅ Loaded {len(documents)} CSV-based test documents.\")\n",
"\n",
"# Step 2: Load the embedding model\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"\n",
"# Step 3: Create Chroma vectorstore (skip chunking)\n",
"db_path = \"chroma_test_step_vectors\"\n",
"\n",
"# An empty list means the load step found no usable CSVs; report that\n",
"# directly instead of letting the vectorstore fail on empty input.\n",
"if not documents:\n",
"    raise ValueError(\"No documents to index - check the CSV folder before building the vectorstore.\")\n",
"\n",
"# Use each CSV's relative path as a stable ID. Without explicit IDs every run\n",
"# gets freshly generated ones, so re-running this cell against the same\n",
"# persist directory would store every test case again.\n",
"# NOTE(review): relies on Chroma upserting existing IDs - confirm for the\n",
"# installed langchain/chromadb versions. Entries for CSVs that were deleted\n",
"# since the last run are not removed.\n",
"document_ids = [doc.metadata[\"source\"] for doc in documents]\n",
"\n",
"vectorstore = Chroma.from_documents(\n",
"    documents=documents,\n",
"    embedding=embedding_model,\n",
"    ids=document_ids,\n",
"    persist_directory=db_path,\n",
")\n",
"vectorstore.persist()\n",
"\n",
"print(f\"✅ Vectorstore created with {vectorstore._collection.count()} test cases at {db_path}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Visualize the stored embeddings as a 2D scatter plot (t-SNE projection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"# Step 1: Load the Chroma DB\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from sklearn.manifold import TSNE\n",
"import plotly.express as px\n",
"import numpy as np\n",
"\n",
"persist_path = \"chroma_test_step_vectors\"\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n",
"\n",
"# ✅ Get embeddings explicitly\n",
"result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])  # Include documents ✅\n",
"all_docs = result['documents']\n",
"all_metas = result['metadatas']\n",
"all_embeddings = result['embeddings']\n",
"\n",
"# ✅ Convert to numpy array and verify shape\n",
"X = np.array(all_embeddings)\n",
"print(\"Shape of X:\", X.shape)\n",
"\n",
"# t-SNE needs 0 < perplexity < n_samples, so it cannot run on fewer than two\n",
"# vectors. An empty or missing store also lands here (X is then not 2-D).\n",
"if X.ndim != 2 or X.shape[0] < 2:\n",
"    raise ValueError(\n",
"        f\"Need at least 2 stored embeddings for t-SNE, got array of shape {X.shape} \"\n",
"        f\"from {persist_path!r}. Run the indexing cell first.\"\n",
"    )\n",
"\n",
"# ✅ Adjust perplexity to be < number of samples\n",
"perplexity = min(30, X.shape[0] - 1)\n",
"X_2d = TSNE(n_components=2, perplexity=perplexity, random_state=42).fit_transform(X)\n",
"\n",
"# Prepare Plotly data\n",
"from pathlib import Path\n",
"\n",
"\n",
"def extract_test_id(path_str):\n",
"    \"\"\"Return the final component of ``path_str`` without its last suffix.\"\"\"\n",
"    source_path = Path(path_str)\n",
"    return source_path.stem\n",
"\n",
"\n",
"# One label per stored document, taken from its 'source' metadata entry.\n",
"sources = [extract_test_id(entry['source']) for entry in all_metas]\n",
"\n",
"# Hover preview: at most the first 200 characters of each document.\n",
"texts = [document[:200] for document in all_docs]\n",
"\n",
"df_data = dict(\n",
"    x=X_2d[:, 0],\n",
"    y=X_2d[:, 1],\n",
"    source=sources,\n",
"    preview=texts,\n",
")\n",
"\n",
"# Plot\n",
"fig = px.scatter(\n",
"    df_data,\n",
"    x=\"x\",\n",
"    y=\"y\",\n",
"    color=\"source\",\n",
"    hover_data=[\"preview\"],\n",
")\n",
"fig.update_layout(\n",
"    title=\"2D Visualization of Chroma Embeddings\",\n",
"    width=1000,\n",
"    height=700,\n",
")\n",
"fig.show()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}