Files
LLM_Engineering_OLD/week5/community-contributions/muawiya/rag_logs.ipynb
2025-06-21 20:26:48 +03:00

125 lines
4.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is an example of how to process log files in a simple RAG system."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from pathlib import Path\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"\n",
"# Path to your logs directory\n",
"input_dir = Path(\"failures_ds\")\n",
"\n",
"# Step 1: Load all .log files recursively.\n",
"# sorted() fixes the traversal order so repeated runs build the chunks in the same sequence.\n",
"documents = []\n",
"for log_path in sorted(input_dir.rglob(\"*.log\")):\n",
"    # errors=\"replace\" stops a stray non-UTF-8 byte in one log from aborting the whole load\n",
"    content = log_path.read_text(encoding=\"utf-8\", errors=\"replace\").strip()\n",
"    if content:  # skip empty log files\n",
"        documents.append(Document(\n",
"            page_content=content,\n",
"            metadata={\"source\": str(log_path.relative_to(input_dir))}  # path relative to input_dir\n",
"        ))\n",
"\n",
"print(f\"Loaded {len(documents)} log documents.\")\n",
"if not documents:\n",
"    # Fail early with a clear message instead of building an empty vectorstore\n",
"    raise FileNotFoundError(f\"No non-empty .log files found under {input_dir}\")\n",
"\n",
"# Step 2: Load the embedding model\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"\n",
"# Step 3: Split the logs into overlapping chunks.\n",
"# CharacterTextSplitter splits on blank lines by default; logs are line-oriented and rarely\n",
"# contain blank lines, so split on single newlines to let chunk_size actually take effect.\n",
"text_splitter = CharacterTextSplitter(separator=\"\\n\", chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)\n",
"\n",
"# Step 4: Create the Chroma vectorstore and persist it to disk.\n",
"# NOTE(review): re-running this cell against an existing db_path presumably adds the chunks\n",
"# again rather than replacing them - delete the directory first for a clean rebuild; confirm.\n",
"db_path = \"chroma_failures_ds\"\n",
"vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory=db_path)\n",
"vectorstore.persist()\n",
"print(f\"✅ Vectorstore created with {vectorstore._collection.count()} documents at {db_path}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Visualize the stored embeddings in 2D to understand what was saved in Chroma."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"# Step 1: Load the Chroma DB\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from sklearn.manifold import TSNE\n",
"import plotly.express as px\n",
"import numpy as np\n",
"\n",
"persist_path = \"chroma_failures_ds\"\n",
"embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)\n",
"\n",
"# Step 2: Fetch the stored vectors together with their text and metadata.\n",
"# Embeddings must be requested explicitly through `include`.\n",
"result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])\n",
"all_docs = result['documents']\n",
"all_metas = result['metadatas']\n",
"all_embeddings = result['embeddings']\n",
"\n",
"# Convert to a numpy array and verify its shape\n",
"X = np.array(all_embeddings)\n",
"print(\"Shape of X:\", X.shape)\n",
"\n",
"# Step 3: Project to 2D with t-SNE.\n",
"# Perplexity must be positive and smaller than the number of samples, which is\n",
"# impossible with fewer than 2 points, so stop with a clear message in that case.\n",
"n_samples = X.shape[0] if X.ndim == 2 else 0\n",
"if n_samples < 2:\n",
"    raise ValueError(f\"Need at least 2 embeddings for t-SNE, found {n_samples} in {persist_path}\")\n",
"\n",
"X_2d = TSNE(n_components=2, perplexity=min(30, n_samples - 1), random_state=42).fit_transform(X)\n",
"\n",
"# Step 4: Prepare Plotly data.\n",
"# Entries without a 'source' in their metadata are grouped under 'unknown' instead of raising.\n",
"sources = [(meta or {}).get('source', 'unknown') for meta in all_metas]\n",
"texts = [doc[:200] for doc in all_docs]  # short preview shown on hover\n",
"df_data = {\n",
"    \"x\": X_2d[:, 0],\n",
"    \"y\": X_2d[:, 1],\n",
"    \"source\": sources,\n",
"    \"preview\": texts,\n",
"}\n",
"\n",
"# Step 5: Plot, one colour per source log file\n",
"fig = px.scatter(df_data, x=\"x\", y=\"y\", color=\"source\", hover_data=[\"preview\"])\n",
"fig.update_layout(title=\"2D Visualization of Chroma Embeddings\", width=1000, height=700)\n",
"fig.show()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}