LLM_Engineering_OLD/week5/community-contributions/muawiya/rag_logs.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is an example on how to process log files in a simple rag system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from pathlib import Path\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "\n",
    "# Path to your logs directory\n",
    "input_dir = Path(\"failures_ds\")\n",
    "\n",
    "# Step 1: Load all .log files recursively\n",
    "documents = []\n",
    "for log_path in input_dir.rglob(\"*.log\"):\n",
    "    with open(log_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        content = f.read().strip()\n",
    "        if content:\n",
    "            documents.append(Document(\n",
    "                page_content=content,\n",
    "                metadata={\"source\": str(log_path.relative_to(input_dir))}  # optional: store relative path\n",
    "            ))\n",
    "\n",
    "print(f\"Loaded {len(documents)} log documents.\")\n",
    "\n",
    "# Step 2: Load the embedding model\n",
    "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "\n",
    "# Step 3: Create the Chroma vectorstore\n",
    "\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "db_path = \"chroma_failures_ds\"\n",
    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory=db_path)\n",
    "vectorstore.persist()\n",
    "print(f\"✅ Vectorstore created with {vectorstore._collection.count()} documents at {db_path}\")\n",
    "\n",
    "print(f\"✅ Vectorstore created with {vectorstore._collection.count()} documents at {db_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display in 2D in order to understand what happened in chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "# Step 1: Load the Chroma DB\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from sklearn.manifold import TSNE\n",
    "import plotly.express as px\n",
    "import numpy as np\n",
    "\n",
    "persist_path = \"chroma_failures_ds\"\n",
    "embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n",
    "\n",
    "# ✅ Get embeddings explicitly\n",
    "result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])  # Include documents ✅\n",
    "all_docs = result['documents']\n",
    "all_metas = result['metadatas']\n",
    "all_embeddings = result['embeddings']\n",
    "\n",
    "# ✅ Convert to numpy array and verify shape\n",
    "X = np.array(all_embeddings)\n",
    "print(\"Shape of X:\", X.shape)\n",
    "\n",
    "# ✅ Adjust perplexity to be < number of samples\n",
    "X_2d = TSNE(n_components=2, perplexity=min(30, X.shape[0] - 1), random_state=42).fit_transform(X)\n",
    "\n",
    "# Prepare Plotly data\n",
    "sources = [meta['source'] for meta in all_metas]\n",
    "texts = [doc[:200] for doc in all_docs]\n",
    "df_data = {\n",
    "    \"x\": X_2d[:, 0],\n",
    "    \"y\": X_2d[:, 1],\n",
    "    \"source\": sources,\n",
    "    \"preview\": texts,\n",
    "}\n",
    "\n",
    "# Plot\n",
    "fig = px.scatter(df_data, x=\"x\", y=\"y\", color=\"source\", hover_data=[\"preview\"])\n",
    "fig.update_layout(title=\"2D Visualization of Chroma Embeddings\", width=1000, height=700)\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}