From 200e3ab2abefd87325984a6d302dbbd4f887d425 Mon Sep 17 00:00:00 2001 From: Adriana394 <158718290+Adriana394@users.noreply.github.com> Date: Fri, 25 Apr 2025 09:46:20 +0200 Subject: [PATCH] create week 5 exercise --- .../colabnotebook_rag_assisstant.ipynb | 410 ++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 week5/community-contributions/colabnotebook_rag_assisstant.ipynb diff --git a/week5/community-contributions/colabnotebook_rag_assisstant.ipynb b/week5/community-contributions/colabnotebook_rag_assisstant.ipynb new file mode 100644 index 0000000..1ec7875 --- /dev/null +++ b/week5/community-contributions/colabnotebook_rag_assisstant.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f3ce7a00-62c7-4cee-bed6-a89bf052e167", + "metadata": {}, + "source": [ + "# Colab Notebook RAG Assistant\n", + "\n", + "Short Notebook Description:\n", + "\n", + "This Jupyter/Colab notebook builds a Retrieval-Augmented Generation (RAG) assistant over your own collection of .ipynb files in Google Colab. It:\n", + "\n", + "1. Loads all notebooks from a local folder or mounted Google Drive.\n", + "\n", + "2. Chunks their content into manageable pieces.\n", + "\n", + "3. Embeds each chunk with OpenAI embeddings and stores them in a persistent Chroma vector database.\n", + "\n", + "4. Provides a ConversationalRetrievalChain with memory and a Gradio chat interface.\n", + "\n", + "5. For any user question, it returns both an answer and the names of the exact notebooks where the relevant information was found.\n", + "\n", + "This setup lets you query your entire notebook history—whether local or in Colab—just like a personal knowledge base." + ] + }, + { + "cell_type": "markdown", + "id": "1a7a0225-800c-4088-9bd2-ac98dbbb55c9", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe7e9772-171f-4ff6-bd3b-e77aa82b19d3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain.document_loaders import DirectoryLoader, NotebookLoader\n", + "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", + "from langchain_chroma import Chroma\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.chains import RetrievalQA\n", + "\n", + "from sklearn.manifold import TSNE\n", + "import plotly.graph_objects as go\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "8dfd2b57-3b3b-4fc2-bb2d-40be7aba3a4a", + "metadata": {}, + "source": [ + "## Configuration & Set Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d502b28-1d33-43bc-8797-41fed26d5fa0", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override = True)\n", + "\n", + "OPENAI_KEY = os.getenv('OPENAI_API_KEY')\n", + "NOTEBOOKS_DIR = os.getenv('NOTEBOOKS_DIR')\n", + "VECTOR_DB_DIR = os.getenv('VECTOR_DB_DIR')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af7f5fa2-78f8-45bf-a0e0-e1f93cc98f4b", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL = 'gpt-4o-mini'\n", + "CHUNK_SIZE = 1000\n", + "CHUNK_OVERLAP = 200" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82f7b583-d176-448b-b762-28618f05c660", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " # Colab\n", + " from google.colab import drive\n", + " print(\"Running in Colab: mounting Google Drive...\")\n", + " drive.mount('/content/drive')\n", + " is_colab = True\n", + "\n", + " # Colab defaults\n", + " NOTEBOOKS_DIR = '/content/drive/MyDrive/ColabNotebooks'\n", + " DB_DIR = VECTOR_DB_DIR or '/content/drive/MyDrive/colab_vector_db'\n", + "\n", + "except ImportError:\n", + " # Local Jupyter Lab:\n", + " print(\"Not in Colab: using local notebooks directory.\")\n", + " NOTEBOOKS_DIR = os.path.expanduser(NOTEBOOKS_DIR)\n", + " DB_DIR = VECTOR_DB_DIR\n", + "\n", + " # Verify the local notebooks directory exists\n", + " if not os.path.isdir(NOTEBOOKS_DIR):\n", + " raise FileNotFoundError(\n", + " f\"Local notebooks directory '{NOTEBOOKS_DIR}' not found.\" \n", + " \"\\nPlease sync your Google Drive folder (e.g., via Drive for Desktop) \"\n", + " \"or set NOTEBOOKS_DIR in your .env to the correct path.\"\n", + " )\n", + "# Confirm final paths\n", + "# print(f\"Indexing notebooks from: {NOTEBOOKS_DIR}\")\n", + "# print(f\"Chroma will store embeddings in: {DB_DIR}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5eefd329-712f-4c43-b7aa-0322c0cd7c41", + "metadata": {}, + "source": [ + "## Read in Notebook files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4468cbc-8f04-47c7-a583-1cb81cdb17fb", + "metadata": {}, + "outputs": [], + "source": [ + "notebooks = glob.glob(\n", + " os.path.join(NOTEBOOKS_DIR, \"**\", \"*.ipynb\"),\n", + " recursive=True\n", + ")\n", + "print(f\"Notebooks found: {len(notebooks)}\")\n", + "\n", + "\n", + "loader = DirectoryLoader(NOTEBOOKS_DIR,\n", + " glob = '**/*.ipynb',\n", + " loader_cls = NotebookLoader\n", + " )\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f8c0ce3-e3a9-4271-9824-e7fa42c8867d", + "metadata": {}, + "outputs": [], + "source": [ + "splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size = CHUNK_SIZE, \n", + " chunk_overlap = CHUNK_OVERLAP, \n", + " separators=[\"\\n## \", \"\\n### \", \"\\n#### \", \"\\n\\n\", \"\\n\", \" \", \"\"]\n", + ")\n", + "\n", + "chunks = splitter.split_documents(docs)\n", + "print(f'Created {len(chunks)} chunks from your notebooks')" + ] + }, + { + "cell_type": "markdown", + "id": "d73f8869-020b-48ac-bce9-82fadd58e04b", + "metadata": {}, + "source": [ + "## Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d6269c-88ac-4da7-8e87-691308d9e473", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)\n", + "\n", + "if os.path.exists(DB_DIR):\n", + " Chroma(persist_directory = DB_DIR, embedding_function = embeddings).delete_collection()\n", + "\n", + "\n", + "vectorstore = Chroma.from_documents(\n", + " documents = chunks,\n", + " embedding = embeddings,\n", + " persist_directory = VECTOR_DB_DIR\n", + ")\n", + "\n", + "vector_count = vectorstore._collection.count()\n", + "print(f\"Vectorstore contains {vector_count} vectors.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d795bc7-82dc-4ad5-be39-97e08c033a4c", + "metadata": {}, + "outputs": [], + "source": [ + "sample_embedding = vectorstore._collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", + "dimensions = len(sample_embedding)\n", + "print(f\"There are {vectorstore._collection.count():,} vectors with {dimensions:,} dimensions in the vector store.\")" + ] + }, + { + "cell_type": "markdown", + "id": "bae1ab40-c22d-4815-bec4-840d69cf702b", + "metadata": {}, + "source": [ + "## Visualize in 3D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3e30379-eb8f-469d-841e-cf95a542595b", + "metadata": {}, + "outputs": [], + "source": [ + "result = vectorstore._collection.get(include=['embeddings', 'documents',])\n", + "vectors = np.array(result['embeddings'])\n", + "documents = result['documents']\n", + "colors = ['blue'] * len(vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b0c61e2-1d5d-429d-ace7-e883e051fdd2", + "metadata": {}, + "outputs": [], + "source": [ + "tsne = TSNE(n_components=3, random_state=42)\n", + "reduced_vectors = tsne.fit_transform(vectors)\n", + "\n", + "fig = go.Figure(data=[go.Scatter3d(\n", + " x=reduced_vectors[:, 0],\n", + " y=reduced_vectors[:, 1],\n", + " z=reduced_vectors[:, 2],\n", + " mode='markers',\n", + " marker=dict(size=4, color=colors, opacity=0.8),\n", + " text=[d[:100] + \"...\" for d in documents],\n", + " hoverinfo='text'\n", + ")])\n", + "\n", + "fig.update_layout(\n", + " title='3D TSNE of Notebook-Chunks',\n", + " scene=dict(\n", + " xaxis_title=\"TSNE-1\",\n", + " yaxis_title=\"TSNE-2\",\n", + " zaxis_title=\"TSNE-3\"\n", + " ),\n", + " width=800,\n", + " height=600,\n", + " margin=dict(r=10, b=10, l=10, t=40)\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "831684d5-5694-488f-aaed-e219d57b909c", + "metadata": {}, + "source": [ + "## Build LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02197e43-c958-4f70-be38-666ee4c1c4ae", + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(model_name = MODEL, temperature = 0.6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aabba9e-e447-4597-a86e-fe3fc5b8babe", + "metadata": {}, + "outputs": [], + "source": [ + "qa = RetrievalQA.from_chain_type(\n", + " llm = llm,\n", + " chain_type=\"stuff\",\n", + " retriever=vectorstore.as_retriever(search_kwargs={\"k\": 4}),\n", + " return_source_documents=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71daee6-7d82-4212-ae4f-a2553f1d3c8a", + "metadata": {}, + "outputs": [], + "source": [ + "memory = ConversationBufferMemory(\n", + " memory_key = 'chat_history',\n", + " return_messages = True\n", + ")\n", + "\n", + "conv_chain = ConversationalRetrievalChain.from_llm(\n", + " llm = llm,\n", + " retriever = vectorstore.as_retriever(search_kwargs = {'k': 10}),\n", + " memory = memory\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d8e2b9e-bb50-4bda-9832-c6a2779526e3", + "metadata": {}, + "outputs": [], + "source": [ + "def chat_with_memory_and_sources(message, chat_history):\n", + " # Get the conversational answer (memory included)\n", + " conv_res = conv_chain.invoke({\n", + " \"question\": message,\n", + " \"chat_history\": chat_history\n", + " })\n", + " answer = conv_res[\"answer\"]\n", + "\n", + " # Retrieve source documents \n", + " src_res = qa({\"query\": message})\n", + " src_docs = src_res[\"source_documents\"]\n", + "\n", + " # Extract and dedupe notebook filenames from metadata\n", + " notebooks = [\n", + " os.path.basename(doc.metadata.get(\"source\", \"\"))\n", + " for doc in src_docs\n", + " if doc.metadata.get(\"source\")\n", + " ]\n", + " unique = []\n", + " for n in notebooks:\n", + " if n not in unique:\n", + " unique.append(n)\n", + "\n", + " # Append the list of notebook filenames\n", + " if unique:\n", + " answer += \"\\n\\n**Found Notebooks:**\\n\" + \"\\n\".join(f\"- {n}\" for n in unique)\n", + " else:\n", + " answer += \"\\n\\n_No Notebooks found._\"\n", + "\n", + " return answer" + ] + }, + { + "cell_type": "markdown", + "id": "3d5c53ec-4fe3-46cc-9c17-7326294d24ef", + "metadata": {}, + "source": [ + "## Gradio UI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "371e32ee-df20-4ec5-91eb-5023fc4b70b2", + "metadata": {}, + "outputs": [], + "source": [ + "view = gr.ChatInterface(chat_with_memory_and_sources, \n", + " title=\"Notebook-RAG-Assistant mit Memory & Quellen\",\n", + " type = 'messages').launch(inbrowser=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}