Merge pull request #347 from Adriana394/week-exercises
create week 5 exercise
week5/community-contributions/colabnotebook_rag_assisstant.ipynb (new file, 410 additions)
@@ -0,0 +1,410 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f3ce7a00-62c7-4cee-bed6-a89bf052e167",
"metadata": {},
"source": [
"# Colab Notebook RAG Assistant\n",
"\n",
"Short Notebook Description:\n",
"\n",
"This Jupyter/Colab notebook builds a Retrieval-Augmented Generation (RAG) assistant over your own collection of .ipynb files in Google Colab. It:\n",
"\n",
"1. Loads all notebooks from a local folder or mounted Google Drive.\n",
"\n",
"2. Chunks their content into manageable pieces.\n",
"\n",
"3. Embeds each chunk with OpenAI embeddings and stores them in a persistent Chroma vector database.\n",
"\n",
"4. Provides a ConversationalRetrievalChain with memory and a Gradio chat interface.\n",
"\n",
"5. For any user question, it returns both an answer and the names of the exact notebooks where the relevant information was found.\n",
"\n",
"This setup lets you query your entire notebook history, whether local or in Colab, just like a personal knowledge base."
]
},
{
"cell_type": "markdown",
"id": "1a7a0225-800c-4088-9bd2-ac98dbbb55c9",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe7e9772-171f-4ff6-bd3b-e77aa82b19d3",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"\n",
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain.document_loaders import DirectoryLoader, NotebookLoader\n",
"from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
"from langchain_chroma import Chroma\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.chains import RetrievalQA\n",
"\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "8dfd2b57-3b3b-4fc2-bb2d-40be7aba3a4a",
"metadata": {},
"source": [
"## Configuration & Environment Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d502b28-1d33-43bc-8797-41fed26d5fa0",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override = True)\n",
"\n",
"OPENAI_KEY = os.getenv('OPENAI_API_KEY')\n",
"NOTEBOOKS_DIR = os.getenv('NOTEBOOKS_DIR')\n",
"VECTOR_DB_DIR = os.getenv('VECTOR_DB_DIR')"
]
},
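{
"cell_type": "markdown",
"id": "env-check-note",
"metadata": {},
"source": [
"The next cell is an optional sanity check added for clarity (a minimal sketch, not part of the original flow). It assumes your `.env` file defines the three variables read above; the sample values in the comments are hypothetical."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "env-check-code",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: fail early with a clear message if the API key is missing.\n",
"# Example .env entries (hypothetical values):\n",
"#   OPENAI_API_KEY=sk-...\n",
"#   NOTEBOOKS_DIR=~/ColabNotebooks\n",
"#   VECTOR_DB_DIR=./colab_vector_db\n",
"if not OPENAI_KEY:\n",
"    raise ValueError(\"OPENAI_API_KEY is not set; add it to your .env file.\")"
]
},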
{
"cell_type": "code",
"execution_count": null,
"id": "af7f5fa2-78f8-45bf-a0e0-e1f93cc98f4b",
"metadata": {},
"outputs": [],
"source": [
"MODEL = 'gpt-4o-mini'\n",
"CHUNK_SIZE = 1000\n",
"CHUNK_OVERLAP = 200"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82f7b583-d176-448b-b762-28618f05c660",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Colab\n",
"    from google.colab import drive\n",
"    print(\"Running in Colab: mounting Google Drive...\")\n",
"    drive.mount('/content/drive')\n",
"    is_colab = True\n",
"\n",
"    # Colab defaults\n",
"    NOTEBOOKS_DIR = '/content/drive/MyDrive/ColabNotebooks'\n",
"    DB_DIR = VECTOR_DB_DIR or '/content/drive/MyDrive/colab_vector_db'\n",
"\n",
"except ImportError:\n",
"    # Local Jupyter Lab:\n",
"    print(\"Not in Colab: using local notebooks directory.\")\n",
"    NOTEBOOKS_DIR = os.path.expanduser(NOTEBOOKS_DIR or '')\n",
"    DB_DIR = VECTOR_DB_DIR\n",
"\n",
"    # Verify the local notebooks directory exists\n",
"    if not os.path.isdir(NOTEBOOKS_DIR):\n",
"        raise FileNotFoundError(\n",
"            f\"Local notebooks directory '{NOTEBOOKS_DIR}' not found.\"\n",
"            \"\\nPlease sync your Google Drive folder (e.g., via Drive for Desktop) \"\n",
"            \"or set NOTEBOOKS_DIR in your .env to the correct path.\"\n",
"        )\n",
"\n",
"# Confirm final paths\n",
"# print(f\"Indexing notebooks from: {NOTEBOOKS_DIR}\")\n",
"# print(f\"Chroma will store embeddings in: {DB_DIR}\")"
]
},
{
"cell_type": "markdown",
"id": "5eefd329-712f-4c43-b7aa-0322c0cd7c41",
"metadata": {},
"source": [
"## Read in Notebook Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4468cbc-8f04-47c7-a583-1cb81cdb17fb",
"metadata": {},
"outputs": [],
"source": [
"notebooks = glob.glob(\n",
"    os.path.join(NOTEBOOKS_DIR, \"**\", \"*.ipynb\"),\n",
"    recursive=True\n",
")\n",
"print(f\"Notebooks found: {len(notebooks)}\")\n",
"\n",
"loader = DirectoryLoader(\n",
"    NOTEBOOKS_DIR,\n",
"    glob = '**/*.ipynb',\n",
"    loader_cls = NotebookLoader\n",
")\n",
"docs = loader.load()"
]
},
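{
"cell_type": "markdown",
"id": "loader-kwargs-note",
"metadata": {},
"source": [
"Optional variant (a sketch, not used by the rest of the notebook): `DirectoryLoader` accepts `loader_kwargs` that are passed through to `NotebookLoader`, so truncated cell outputs can be indexed as well. The values below are illustrative assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "loader-kwargs-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: also index (truncated) cell outputs, not just code and markdown.\n",
"loader_with_outputs = DirectoryLoader(\n",
"    NOTEBOOKS_DIR,\n",
"    glob = '**/*.ipynb',\n",
"    loader_cls = NotebookLoader,\n",
"    loader_kwargs = {'include_outputs': True, 'max_output_length': 30, 'remove_newline': True}\n",
")\n",
"# docs_with_outputs = loader_with_outputs.load()  # swap in for docs above if desired"
]
},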
{
"cell_type": "code",
"execution_count": null,
"id": "3f8c0ce3-e3a9-4271-9824-e7fa42c8867d",
"metadata": {},
"outputs": [],
"source": [
"splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size = CHUNK_SIZE,\n",
"    chunk_overlap = CHUNK_OVERLAP,\n",
"    separators=[\"\\n## \", \"\\n### \", \"\\n#### \", \"\\n\\n\", \"\\n\", \" \", \"\"]\n",
")\n",
"\n",
"chunks = splitter.split_documents(docs)\n",
"print(f'Created {len(chunks)} chunks from your notebooks')"
]
},
{
"cell_type": "markdown",
"id": "d73f8869-020b-48ac-bce9-82fadd58e04b",
"metadata": {},
"source": [
"## Embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27d6269c-88ac-4da7-8e87-691308d9e473",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)\n",
"\n",
"# Delete any existing collection so the index is rebuilt from scratch\n",
"if os.path.exists(DB_DIR):\n",
"    Chroma(persist_directory = DB_DIR, embedding_function = embeddings).delete_collection()\n",
"\n",
"vectorstore = Chroma.from_documents(\n",
"    documents = chunks,\n",
"    embedding = embeddings,\n",
"    persist_directory = DB_DIR\n",
")\n",
"\n",
"vector_count = vectorstore._collection.count()\n",
"print(f\"Vectorstore contains {vector_count} vectors.\")"
]
},
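{
"cell_type": "markdown",
"id": "reload-store-note",
"metadata": {},
"source": [
"On later runs you can reuse the persisted index instead of re-embedding everything. This is a small sketch added for clarity; it assumes `DB_DIR` already contains the Chroma collection created above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "reload-store-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: reopen the persisted Chroma store without recomputing embeddings.\n",
"existing_store = Chroma(persist_directory = DB_DIR, embedding_function = embeddings)\n",
"print(f\"Reloaded store holds {existing_store._collection.count()} vectors.\")"
]
},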
{
"cell_type": "code",
"execution_count": null,
"id": "8d795bc7-82dc-4ad5-be39-97e08c033a4c",
"metadata": {},
"outputs": [],
"source": [
"sample_embedding = vectorstore._collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
"dimensions = len(sample_embedding)\n",
"print(f\"There are {vectorstore._collection.count():,} vectors with {dimensions:,} dimensions in the vector store.\")"
]
},
{
"cell_type": "markdown",
"id": "bae1ab40-c22d-4815-bec4-840d69cf702b",
"metadata": {},
"source": [
"## Visualize in 3D"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3e30379-eb8f-469d-841e-cf95a542595b",
"metadata": {},
"outputs": [],
"source": [
"result = vectorstore._collection.get(include=['embeddings', 'documents'])\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"colors = ['blue'] * len(vectors)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b0c61e2-1d5d-429d-ace7-e883e051fdd2",
"metadata": {},
"outputs": [],
"source": [
"tsne = TSNE(n_components=3, random_state=42)\n",
"reduced_vectors = tsne.fit_transform(vectors)\n",
"\n",
"fig = go.Figure(data=[go.Scatter3d(\n",
"    x=reduced_vectors[:, 0],\n",
"    y=reduced_vectors[:, 1],\n",
"    z=reduced_vectors[:, 2],\n",
"    mode='markers',\n",
"    marker=dict(size=4, color=colors, opacity=0.8),\n",
"    text=[d[:100] + \"...\" for d in documents],\n",
"    hoverinfo='text'\n",
")])\n",
"\n",
"fig.update_layout(\n",
"    title='3D t-SNE of Notebook Chunks',\n",
"    scene=dict(\n",
"        xaxis_title=\"TSNE-1\",\n",
"        yaxis_title=\"TSNE-2\",\n",
"        zaxis_title=\"TSNE-3\"\n",
"    ),\n",
"    width=800,\n",
"    height=600,\n",
"    margin=dict(r=10, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "831684d5-5694-488f-aaed-e219d57b909c",
"metadata": {},
"source": [
"## Build the LLM and Retrieval Chains"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02197e43-c958-4f70-be38-666ee4c1c4ae",
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(model_name = MODEL, temperature = 0.6)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3aabba9e-e447-4597-a86e-fe3fc5b8babe",
"metadata": {},
"outputs": [],
"source": [
"qa = RetrievalQA.from_chain_type(\n",
"    llm = llm,\n",
"    chain_type=\"stuff\",\n",
"    retriever=vectorstore.as_retriever(search_kwargs={\"k\": 4}),\n",
"    return_source_documents=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a71daee6-7d82-4212-ae4f-a2553f1d3c8a",
"metadata": {},
"outputs": [],
"source": [
"memory = ConversationBufferMemory(\n",
"    memory_key = 'chat_history',\n",
"    return_messages = True\n",
")\n",
"\n",
"conv_chain = ConversationalRetrievalChain.from_llm(\n",
"    llm = llm,\n",
"    retriever = vectorstore.as_retriever(search_kwargs = {'k': 10}),\n",
"    memory = memory\n",
")"
]
},
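{
"cell_type": "markdown",
"id": "single-chain-note",
"metadata": {},
"source": [
"Alternative setup (a sketch, not used by the chat function below): `ConversationalRetrievalChain` can return its own source documents, which removes the need for the separate `RetrievalQA` lookup. With `return_source_documents=True`, the memory needs `output_key='answer'` so it knows which output to store."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "single-chain-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: one chain that yields both the answer and its source documents.\n",
"memory_with_sources = ConversationBufferMemory(\n",
"    memory_key = 'chat_history',\n",
"    return_messages = True,\n",
"    output_key = 'answer'\n",
")\n",
"\n",
"conv_chain_with_sources = ConversationalRetrievalChain.from_llm(\n",
"    llm = llm,\n",
"    retriever = vectorstore.as_retriever(search_kwargs = {'k': 10}),\n",
"    memory = memory_with_sources,\n",
"    return_source_documents = True\n",
")\n",
"# result = conv_chain_with_sources.invoke({'question': 'your question here'})\n",
"# result['answer'], result['source_documents']"
]
},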
{
"cell_type": "code",
"execution_count": null,
"id": "8d8e2b9e-bb50-4bda-9832-c6a2779526e3",
"metadata": {},
"outputs": [],
"source": [
"def chat_with_memory_and_sources(message, chat_history):\n",
"    # Get the conversational answer (memory included)\n",
"    conv_res = conv_chain.invoke({\n",
"        \"question\": message,\n",
"        \"chat_history\": chat_history\n",
"    })\n",
"    answer = conv_res[\"answer\"]\n",
"\n",
"    # Retrieve source documents\n",
"    src_res = qa.invoke({\"query\": message})\n",
"    src_docs = src_res[\"source_documents\"]\n",
"\n",
"    # Extract and dedupe notebook filenames from metadata\n",
"    notebooks = [\n",
"        os.path.basename(doc.metadata.get(\"source\", \"\"))\n",
"        for doc in src_docs\n",
"        if doc.metadata.get(\"source\")\n",
"    ]\n",
"    unique = []\n",
"    for n in notebooks:\n",
"        if n not in unique:\n",
"            unique.append(n)\n",
"\n",
"    # Append the list of notebook filenames\n",
"    if unique:\n",
"        answer += \"\\n\\n**Found Notebooks:**\\n\" + \"\\n\".join(f\"- {n}\" for n in unique)\n",
"    else:\n",
"        answer += \"\\n\\n_No Notebooks found._\"\n",
"\n",
"    return answer"
]
},
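{
"cell_type": "markdown",
"id": "smoke-test-note",
"metadata": {},
"source": [
"Optional quick check before launching the UI (a sketch added for clarity): call the chat function directly once. The question is just an illustrative example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "smoke-test-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: exercise the chat function once outside Gradio.\n",
"print(chat_with_memory_and_sources(\"Which notebooks use Chroma?\", []))"
]
},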
{
"cell_type": "markdown",
"id": "3d5c53ec-4fe3-46cc-9c17-7326294d24ef",
"metadata": {},
"source": [
"## Gradio UI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "371e32ee-df20-4ec5-91eb-5023fc4b70b2",
"metadata": {},
"outputs": [],
"source": [
"view = gr.ChatInterface(\n",
"    chat_with_memory_and_sources,\n",
"    title=\"Notebook RAG Assistant with Memory & Sources\",\n",
"    type='messages'\n",
").launch(inbrowser=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}