{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Personal Knowledge Worker\n", "\n", "Search your exported Notion workspace with Gemini models using RAG.\n", "\n", "How to export your content from Notion: https://www.notion.com/help/export-your-content" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports and Setup" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -U -q langchain langchain-community langchain-chroma langchain-google-genai gradio python-dotenv" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "from dotenv import load_dotenv\n", "import gradio as gr" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n", "from langchain_text_splitters import CharacterTextSplitter\n", "from langchain_chroma import Chroma\n", "from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI\n", "from langchain.memory import ConversationBufferMemory\n", "from langchain.chains import ConversationalRetrievalChain" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "LLM_MODEL = \"gemini-2.5-flash-lite\"\n", "EMBEDDINGS_MODEL = \"models/gemini-embedding-001\"\n", "db_name = \"vector_db\"" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "load_dotenv()\n", "os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Vector DB Setup" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean up and Load Documents" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clean up the Notion export: strip the hex ID suffixes Notion appends to file and directory names\n", "\n", "# Root directory of your export\n", "root_dir = \"notion_export\"\n", "\n", "# Regex to match the ID suffix: a space followed by 16-32 hex chars, optionally ending in _all\n", "hash_pattern = re.compile(r\"\\s[0-9a-f]{16,32}(_all)?\")\n", "\n", "for dirpath, dirnames, filenames in os.walk(root_dir, topdown=False):\n", "    # Rename files\n", "    for filename in filenames:\n", "        new_name = re.sub(hash_pattern, \"\", filename)\n", "        if new_name != filename:\n", "            old_path = os.path.join(dirpath, filename)\n", "            new_path = os.path.join(dirpath, new_name)\n", "            print(f\"Renaming file: {old_path} -> {new_path}\")\n", "            os.rename(old_path, new_path)\n", "\n", "    # Rename directories\n", "    for dirname in dirnames:\n", "        new_name = re.sub(hash_pattern, \"\", dirname)\n", "        if new_name != dirname:\n", "            old_path = os.path.join(dirpath, dirname)\n", "            new_path = os.path.join(dirpath, new_name)\n", "            print(f\"Renaming dir: {old_path} -> {new_path}\")\n", "            os.rename(old_path, new_path)\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read in documents using LangChain's loaders\n", "\n", "documents = []\n", "for dirpath, dirnames, filenames in os.walk(root_dir):\n", "    # Define doc_type relative to root_dir\n", "    doc_type = os.path.relpath(dirpath, root_dir)\n", "\n", "    # top-level pages in the export root\n", "    if doc_type == \".\":\n", "        doc_type = \"Main\"\n", "\n", "    loader = DirectoryLoader(\n", "        dirpath,\n", "        glob=\"*.md\",  # non-recursive: os.walk already visits every subdirectory, so a recursive glob would load nested files more than once\n", "        loader_cls=TextLoader,\n", "        loader_kwargs={\"encoding\": \"utf-8\"}\n", "    )\n", "\n", "    folder_docs = loader.load()\n", "    for doc in folder_docs:\n", "        doc.metadata[\"doc_type\"] = doc_type\n", "        documents.append(doc)" ] },
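 { "cell_type": "markdown", "metadata": {}, "source": [ "Optional sanity check - a minimal sketch, not part of the pipeline itself: confirm the loaders picked up your pages and that `doc_type` mirrors your top-level Notion folders." ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional sanity check: how many documents were loaded, and what one looks like\n", "print(f\"Loaded {len(documents)} documents\")\n", "if documents:\n", "    print(documents[0].metadata)" ] },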
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Create chunks" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(chunks)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", "print(f\"Document types found: {', '.join(doc_types)}\")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Embeddings" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDINGS_MODEL)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If you don't want to recreate the collection, load the existing vectorstore here and skip the next cell\n", "\n", "vectorstore = Chroma(embedding_function=embeddings, persist_directory=db_name)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check if a Chroma datastore already exists - if so, delete the collection to start from scratch\n", "\n", "if os.path.exists(db_name):\n", "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n", "\n", "# Create our Chroma vectorstore!\n", "\n", "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n", "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get one vector and find how many dimensions it has\n", "\n", "collection = vectorstore._collection\n", "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", "dimensions = len(sample_embedding)\n", "print(f\"The vectors have {dimensions:,} dimensions\")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## RAG pipeline using LangChain" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create a new Chat with ChatGoogleGenerativeAI\n", "llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.7)\n", "\n", "# set up the conversation memory for the chat\n", "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", "\n", "# the retriever is an abstraction over the VectorStore that will be used during RAG\n", "retriever = vectorstore.as_retriever()\n", "\n", "# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n", "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" ] },
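 { "cell_type": "markdown", "metadata": {}, "source": [ "Before wiring up the UI, it can help to sanity-check retrieval on its own. This is a minimal sketch: the query string below is a placeholder, so swap in a topic you know appears in your workspace." ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional: inspect what the retriever returns for a sample query\n", "# (the query is a placeholder - use a topic from your own workspace)\n", "sample_docs = retriever.invoke(\"What projects am I working on?\")\n", "for doc in sample_docs:\n", "    print(doc.metadata.get(\"doc_type\"), \"-\", doc.page_content[:100])" ] },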
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Gradio User Interface" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def chat(message, history):\n", "    result = conversation_chain.invoke({\"question\": message})\n", "    return result[\"answer\"]" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 4 }