{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "\n",
    "# Folder holding the test-step CSVs (replace with your actual CSV folder name)\n",
    "input_dir = Path(\"failures_ds_csv\")\n",
    "if not input_dir.is_dir():\n",
    "    raise FileNotFoundError(f\"CSV input folder not found: {input_dir.resolve()}\")\n",
    "\n",
    "\n",
    "def csv_to_text(df):\n",
    "    \"\"\"Flatten one test-case DataFrame into newline-separated text.\n",
    "\n",
    "    When both \"Step\" and \"Description\" columns exist, every row with a\n",
    "    non-missing description becomes \"Step <Step>: <Description>\".\n",
    "    Otherwise each row is rendered as its non-missing cells joined by spaces.\n",
    "    Returns an empty string when nothing usable is found.\n",
    "    \"\"\"\n",
    "    if \"Step\" in df.columns and \"Description\" in df.columns:\n",
    "        lines = [\n",
    "            f\"Step {step}: {description}\"\n",
    "            for step, description in zip(df[\"Step\"], df[\"Description\"])\n",
    "            if pd.notna(description)\n",
    "        ]\n",
    "    else:\n",
    "        # Fallback: join the cells of each row, skipping missing values so the\n",
    "        # literal string \"nan\" does not leak into the embedded text.\n",
    "        joined_rows = (\n",
    "            \" \".join(str(cell) for cell in row if pd.notna(cell))\n",
    "            for row in df.itertuples(index=False, name=None)\n",
    "        )\n",
    "        lines = [text for text in joined_rows if text]\n",
    "    return \"\\n\".join(lines).strip()\n",
    "\n",
    "\n",
    "# Step 1: Load all .csv files recursively and convert them to Documents\n",
    "documents = []\n",
    "\n",
    "# Sorted so the document order is reproducible across runs and platforms.\n",
    "for csv_path in sorted(input_dir.rglob(\"*.csv\")):\n",
    "    try:\n",
    "        df = pd.read_csv(csv_path)\n",
    "    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:\n",
    "        # One empty or malformed file should not abort the whole indexing run.\n",
    "        print(f\"⚠️ Skipping unreadable CSV {csv_path}: {exc}\")\n",
    "        continue\n",
    "\n",
    "    content = csv_to_text(df)\n",
    "\n",
    "    if content:\n",
    "        documents.append(Document(\n",
    "            page_content=content,\n",
    "            # POSIX-style relative path keeps the metadata portable between OSes.\n",
    "            metadata={\"source\": csv_path.relative_to(input_dir).as_posix()}\n",
    "        ))\n",
    "\n",
    "print(f\"✅ Loaded {len(documents)} CSV-based test documents.\")\n",
    "\n",
    "if not documents:\n",
    "    raise ValueError(f\"No non-empty CSV test documents found under {input_dir}\")\n",
    "\n",
    "# Step 2: Load the embedding model\n",
    "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "\n",
    "# Step 3: Create the Chroma vectorstore (one vector per CSV, no chunking)\n",
    "db_path = \"chroma_test_step_vectors\"\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=documents,\n",
    "    embedding=embedding_model,\n",
    "    # Stable ids (the relative CSV path) so re-running this cell does not add\n",
    "    # every test case to the persisted collection again under a new random id.\n",
    "    ids=[doc.metadata[\"source\"] for doc in documents],\n",
    "    persist_directory=db_path,\n",
    ")\n",
    "vectorstore.persist()\n",
    "\n",
    "print(f\"✅ Vectorstore created with {vectorstore._collection.count()} test cases at {db_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize the embeddings in 2D\n",
    "\n",
    "Project the stored test-case embeddings to two dimensions with t-SNE and show them as an interactive scatter plot."
   ]
  },
  {
"cell_type": "code", "execution_count": null, "metadata": { "vscode": { "languageId": "plaintext" } }, "outputs": [], "source": [ "# Step 1: Load the Chroma DB\n", "from langchain.vectorstores import Chroma\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "from sklearn.manifold import TSNE\n", "import plotly.express as px\n", "import numpy as np\n", "\n", "persist_path = \"chroma_test_step_vectors\"\n", "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", "vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n", "\n", "# ✅ Get embeddings explicitly\n", "result = vectorstore.get(include=['embeddings', 'metadatas', 'documents']) # Include documents ✅\n", "all_docs = result['documents']\n", "all_metas = result['metadatas']\n", "all_embeddings = result['embeddings']\n", "\n", "# ✅ Convert to numpy array and verify shape\n", "X = np.array(all_embeddings)\n", "print(\"Shape of X:\", X.shape)\n", "\n", "# ✅ Adjust perplexity to be < number of samples\n", "X_2d = TSNE(n_components=2, perplexity=min(30, X.shape[0] - 1), random_state=42).fit_transform(X)\n", "\n", "# Prepare Plotly data\n", "from pathlib import Path\n", "def extract_test_id(path_str):\n", " return Path(path_str).stem\n", "\n", "sources = [extract_test_id(meta['source']) for meta in all_metas]\n", "\n", "texts = [doc[:200] for doc in all_docs]\n", "df_data = {\n", " \"x\": X_2d[:, 0],\n", " \"y\": X_2d[:, 1],\n", " \"source\": sources,\n", " \"preview\": texts,\n", "}\n", "\n", "# Plot\n", "fig = px.scatter(df_data, x=\"x\", y=\"y\", color=\"source\", hover_data=[\"preview\"])\n", "fig.update_layout(title=\"2D Visualization of Chroma Embeddings\", width=1000, height=700)\n", "fig.show()" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }