{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is an example of how to process log files in a simple RAG system."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "\n",
    "# Path to your logs directory\n",
    "input_dir = Path(\"failures_ds\")\n",
    "\n",
    "# Step 1: Load all .log files recursively.\n",
    "# sorted() keeps the load order (and therefore the chunk order) the same on every run.\n",
    "documents = []\n",
    "for log_path in sorted(input_dir.rglob(\"*.log\")):\n",
    "    # errors=\"replace\": a single undecodable byte in one log must not abort the whole load.\n",
    "    content = log_path.read_text(encoding=\"utf-8\", errors=\"replace\").strip()\n",
    "    if content:  # skip empty / whitespace-only files\n",
    "        documents.append(Document(\n",
    "            page_content=content,\n",
    "            metadata={\"source\": str(log_path.relative_to(input_dir))}  # path relative to input_dir\n",
    "        ))\n",
    "\n",
    "print(f\"Loaded {len(documents)} log documents.\")\n",
    "# Fail here with a clear message instead of letting the vectorstore fail on an empty input.\n",
    "if not documents:\n",
    "    raise FileNotFoundError(f\"No non-empty .log files found under {input_dir.resolve()}\")\n",
    "\n",
    "# Step 2: Load the embedding model\n",
    "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "\n",
    "# Step 3: Split the logs into overlapping chunks.\n",
    "# CharacterTextSplitter splits on \"\\n\\n\" by default; log files rarely contain blank lines,\n",
    "# so split on single newlines to make chunk_size actually take effect.\n",
    "text_splitter = CharacterTextSplitter(separator=\"\\n\", chunk_size=1000, chunk_overlap=200)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "# Step 4: Create the Chroma vectorstore.\n",
    "# Give every chunk a stable id of the form \"<relative path>#<chunk index within that file>\"\n",
    "# so that re-running this cell reuses the same ids instead of storing a second copy of every chunk.\n",
    "# NOTE(review): relies on LangChain's Chroma wrapper upserting by id - confirm for the installed version.\n",
    "chunks_seen_per_source = Counter()\n",
    "chunk_ids = []\n",
    "for chunk in chunks:\n",
    "    source = chunk.metadata[\"source\"]\n",
    "    chunk_ids.append(f\"{source}#{chunks_seen_per_source[source]}\")\n",
    "    chunks_seen_per_source[source] += 1\n",
    "\n",
    "db_path = \"chroma_failures_ds\"\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=chunks,\n",
    "    embedding=embedding_model,\n",
    "    ids=chunk_ids,\n",
    "    persist_directory=db_path,\n",
    ")\n",
    "vectorstore.persist()\n",
    "print(f\"✅ Vectorstore created with {vectorstore._collection.count()} documents at {db_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display the embeddings in 2D in order to understand what was stored in Chroma."
   ]
  },
{
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "# Step 1: Load the Chroma DB\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from sklearn.manifold import TSNE\n",
    "import plotly.express as px\n",
    "import numpy as np\n",
    "\n",
    "persist_path = \"chroma_failures_ds\"\n",
    "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n",
    "\n",
    "# Step 2: Fetch the stored vectors together with their text and metadata.\n",
    "# Embeddings are requested explicitly via `include`.\n",
    "result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])\n",
    "all_docs = result['documents']\n",
    "all_metas = result['metadatas']\n",
    "all_embeddings = result['embeddings']\n",
    "\n",
    "# Convert to a numpy array and verify its shape\n",
    "X = np.array(all_embeddings)\n",
    "print(\"Shape of X:\", X.shape)\n",
    "\n",
    "# t-SNE needs at least two points: with fewer, min(30, n - 1) below would yield a\n",
    "# perplexity of 0 or -1 and fail with a confusing error, so stop with a clear one.\n",
    "n_samples = X.shape[0]\n",
    "if n_samples < 2:\n",
    "    raise ValueError(\n",
    "        f\"Need at least 2 embedded chunks for t-SNE, found {n_samples} in '{persist_path}'; \"\n",
    "        \"run the indexing cell first.\"\n",
    "    )\n",
    "\n",
    "# Step 3: Project to 2D. Perplexity must stay below the number of samples.\n",
    "X_2d = TSNE(n_components=2, perplexity=min(30, n_samples - 1), random_state=42).fit_transform(X)\n",
    "\n",
    "# Step 4: Prepare Plotly data.\n",
    "# Tolerate chunks stored without metadata or text instead of raising on them.\n",
    "sources = [(meta or {}).get('source', 'unknown') for meta in all_metas]\n",
    "texts = [(doc or '')[:200] for doc in all_docs]  # short preview shown on hover\n",
    "df_data = {\n",
    "    \"x\": X_2d[:, 0],\n",
    "    \"y\": X_2d[:, 1],\n",
    "    \"source\": sources,\n",
    "    \"preview\": texts,\n",
    "}\n",
    "\n",
    "# Step 5: Plot, one colour per source log file\n",
    "fig = px.scatter(df_data, x=\"x\", y=\"y\", color=\"source\", hover_data=[\"preview\"])\n",
    "fig.update_layout(title=\"2D Visualization of Chroma Embeddings\", width=1000, height=700)\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}