From ecf60e48285ae8aab06f07ff0c845abb4d00c83d Mon Sep 17 00:00:00 2001
From: Krabulek
Date: Thu, 2 Oct 2025 19:01:59 +0200
Subject: [PATCH] Week 5 Exercise - DocSearch with Gemini for Notion exported
 workspace

---
 .../DocSearch_notion_gemini.ipynb | 333 ++++++++++++++++++
 1 file changed, 333 insertions(+)
 create mode 100644 week5/community-contributions/DocSearch_Notion/DocSearch_notion_gemini.ipynb

diff --git a/week5/community-contributions/DocSearch_Notion/DocSearch_notion_gemini.ipynb b/week5/community-contributions/DocSearch_Notion/DocSearch_notion_gemini.ipynb
new file mode 100644
index 0000000..1f233c0
--- /dev/null
+++ b/week5/community-contributions/DocSearch_Notion/DocSearch_notion_gemini.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Personal Knowledge Worker\n",
+    "\n",
+    "Search through your exported Notion workspace with Gemini models using RAG.\n",
+    "\n",
+    "How to export the content from Notion: https://www.notion.com/help/export-your-content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports and Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -U -q langchain-google-genai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import re\n",
+    "import glob\n",
+    "from dotenv import load_dotenv\n",
+    "import gradio as gr\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.schema import Document\n",
+    "from langchain_chroma import Chroma\n",
+    "from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain.chains import ConversationalRetrievalChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LLM_MODEL = \"gemini-2.5-flash-lite\"\n",
+    "EMBEDDINGS_MODEL = \"models/gemini-embedding-001\"\n",
+    "db_name = \"vector_db\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "load_dotenv()\n",
+    "os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Vector DB Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean up and Load Documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clean up the Notion export: strip the hex ID suffixes that Notion appends to file and directory names\n",
+    "\n",
+    "# Root directory of your export\n",
+    "root_dir = \"notion_export\"\n",
+    "\n",
+    "# Regex to match the ID suffix: a space followed by 16-32 hex chars, optionally ending in \"_all\"\n",
+    "hash_pattern = re.compile(r\"\\s[0-9a-f]{16,32}(_all)?\")\n",
+    "\n",
+    "# Walk bottom-up (topdown=False) so children are renamed before their parent directories\n",
+    "for dirpath, dirnames, filenames in os.walk(root_dir, topdown=False):\n",
+    "    # Rename files\n",
+    "    for filename in filenames:\n",
+    "        new_name = re.sub(hash_pattern, \"\", filename)\n",
+    "        if new_name != filename:\n",
+    "            old_path = os.path.join(dirpath, filename)\n",
+    "            new_path = os.path.join(dirpath, new_name)\n",
+    "            print(f\"Renaming file: {old_path} -> {new_path}\")\n",
+    "            os.rename(old_path, new_path)\n",
+    "\n",
+    "    # Rename directories\n",
+    "    for dirname in dirnames:\n",
+    "        new_name = re.sub(hash_pattern, \"\", dirname)\n",
+    "        if new_name != dirname:\n",
+    "            old_path = os.path.join(dirpath, dirname)\n",
+    "            new_path = os.path.join(dirpath, new_name)\n",
+    "            print(f\"Renaming dir: {old_path} -> {new_path}\")\n",
+    "            os.rename(old_path, new_path)\n"
+   ]
+  },
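+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick check of `hash_pattern` on made-up names shaped like a typical Notion export (the hex IDs below are invented for illustration; verify against your own export before trusting the rename loop):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example names - the 32-char hex IDs are invented\n",
+    "samples = [\n",
+    "    \"Meeting Notes 0a1b2c3d4e5f60718293a4b5c6d7e8f9.md\",\n",
+    "    \"Projects 0a1b2c3d4e5f60718293a4b5c6d7e8f9_all.csv\",\n",
+    "]\n",
+    "for name in samples:\n",
+    "    print(f\"{name} -> {re.sub(hash_pattern, '', name)}\")"
+   ]
+  },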
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read in documents using LangChain's loaders\n",
+    "\n",
+    "documents = []\n",
+    "for dirpath, dirnames, filenames in os.walk(root_dir):\n",
+    "    # Define doc_type relative to root_dir\n",
+    "    doc_type = os.path.relpath(dirpath, root_dir)\n",
+    "\n",
+    "    # for main pages in Notion\n",
+    "    if doc_type == \".\":\n",
+    "        doc_type = \"Main\"\n",
+    "\n",
+    "    loader = DirectoryLoader(\n",
+    "        dirpath,\n",
+    "        glob=\"*.md\",  # non-recursive: os.walk already visits each subdirectory, so a recursive glob would load nested files more than once\n",
+    "        loader_cls=TextLoader\n",
+    "    )\n",
+    "\n",
+    "    folder_docs = loader.load()\n",
+    "    for doc in folder_docs:\n",
+    "        doc.metadata[\"doc_type\"] = doc_type\n",
+    "        documents.append(doc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create Chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)\n",
+    "chunks = text_splitter.split_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(chunks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
+    "print(f\"Document types found: {', '.join(doc_types)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDINGS_MODEL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If you don't want to recreate the collection, load the existing vectorstore here and skip the next cell\n",
+    "\n",
+    "vectorstore = Chroma(embedding_function=embeddings, persist_directory=db_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check if a Chroma datastore already exists - if so, delete the collection to start from scratch\n",
+    "\n",
+    "if os.path.exists(db_name):\n",
+    "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
+    "\n",
+    "# Create our Chroma vectorstore!\n",
+    "\n",
+    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
+    "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get one vector and find how many dimensions it has\n",
+    "\n",
+    "collection = vectorstore._collection\n",
+    "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
+    "dimensions = len(sample_embedding)\n",
+    "print(f\"The vectors have {dimensions:,} dimensions\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RAG pipeline using LangChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new Chat with ChatGoogleGenerativeAI\n",
+    "llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.7)\n",
+    "\n",
+    "# set up the conversation memory for the chat\n",
+    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
+    "\n",
+    "# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
+    "retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n",
+    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
+   ]
+  },
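+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check before launching the UI: peek at what the retriever pulls back for a query, then try one question end-to-end. The query strings below are placeholders - swap in phrases that actually appear in your workspace. (Note that the test question is also stored in the conversation memory.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Placeholder queries - substitute text from your own Notion pages\n",
+    "for doc in vectorstore.similarity_search(\"project roadmap\", k=3):\n",
+    "    print(doc.metadata.get(\"doc_type\"), \"|\", doc.page_content[:100].replace(\"\\n\", \" \"))\n",
+    "\n",
+    "result = conversation_chain.invoke({\"question\": \"What do my notes say about the project roadmap?\"})\n",
+    "print(result[\"answer\"])"
+   ]
+  },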
"source": [ + "## RAG pipeline using LangChain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new Chat with ChatGoogleGenerativeAI\n", + "llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.7)\n", + "\n", + "# set up the conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# the retriever is an abstraction over the VectorStore that will be used during RAG\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gradio User Interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def chat(message, history):\n", + " result = conversation_chain.invoke({\"question\": message})\n", + " return result[\"answer\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}