{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
   "metadata": {},
   "source": [
    "## Personal Knowledge Worker for Sameer Khadatkar\n",
    "\n",
    "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.\n",
    "\n",
    "This first implementation will use a simple, brute-force type of RAG."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import glob\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports for langchain, plotly and Chroma\n",
    "\n",
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema import Document\n",
    "from langchain.schema import SystemMessage\n",
    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
    "from langchain_chroma import Chroma\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.manifold import TSNE\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain.embeddings import HuggingFaceEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58c85082-e417-4708-9efe-81a5d55d1424",
   "metadata": {},
   "outputs": [],
   "source": [
    "# price is a factor, so we're going to use a low cost model\n",
    "\n",
    "MODEL = \"gpt-4o-mini\"\n",
    "db_name = \"vector_db\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee78efcb-60fe-449e-a944-40bab26261af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in documents using LangChain's loaders\n",
    "# Take everything in all the sub-folders of our knowledgebase\n",
    "\n",
    "# keep only directories: every entry is handed to DirectoryLoader below,\n",
    "# so a stray file sitting next to the sub-folders must not end up in this list\n",
    "folders = [path for path in glob.glob(\"sameer-db/*\") if os.path.isdir(path)]\n",
    "\n",
    "def add_metadata(doc, doc_type):\n",
    "    \"\"\"Record doc_type in the document's metadata and return the same document.\"\"\"\n",
    "    doc.metadata[\"doc_type\"] = doc_type\n",
    "    return doc\n",
    "\n",
    "text_loader_kwargs = {'encoding': 'utf-8'}\n",
    "\n",
    "documents = []\n",
    "for folder in folders:\n",
    "    # the sub-folder name doubles as the document type\n",
    "    doc_type = os.path.basename(folder)\n",
    "    loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
    "    folder_docs = loader.load()\n",
    "    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])\n",
    "\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
    "chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "print(f\"Total number of chunks: {len(chunks)}\")\n",
    "print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78998399-ac17-4e28-b15f-0b5f51e6ee23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n",
    "# Chroma is a popular open source Vector Database based on SQLite\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "# start from an empty collection so re-running this cell does not add the chunks twice\n",
    "if os.path.exists(db_name):\n",
    "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
    "\n",
    "# Create vectorstore\n",
    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
    "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff2e7687-60d4-4920-a1d7-a34b9f70a250",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's investigate the vectors\n",
    "\n",
    "collection = vectorstore._collection\n",
    "count = collection.count()\n",
    "\n",
    "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
    "dimensions = len(sample_embedding)\n",
    "print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0d45462-a818-441c-b010-b85b32bcf618",
   "metadata": {},
   "source": [
    "## Visualizing the Vector Store\n",
    "\n",
    "Let's take a minute to look at the documents and their embedding vectors to see what's going on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull every stored chunk back out of the collection, together with its vector and metadata\n",
    "stored = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
    "vectors = np.array(stored['embeddings'])\n",
    "chunk_texts = stored['documents']\n",
    "metadatas = stored['metadatas']\n",
    "doc_types = [metadata['doc_type'] for metadata in metadatas]\n",
    "\n",
    "# One colour per document type: 'personal' stays green and 'profile' stays red,\n",
    "# and any other folder name gets the next palette colour instead of raising ValueError\n",
    "palette = ['green', 'red', 'blue', 'orange', 'purple', 'brown', 'gray']\n",
    "known_types = ['personal', 'profile']\n",
    "type_order = known_types + sorted(set(doc_types) - set(known_types))\n",
    "color_for_type = {t: palette[i % len(palette)] for i, t in enumerate(type_order)}\n",
    "colors = [color_for_type[t] for t in doc_types]\n",
    "\n",
    "# Hover label shared by the 2D and 3D plots: the type plus the first 100 characters of the chunk\n",
    "hover_text = [f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, chunk_texts)]\n",
    "\n",
    "def reduce_with_tsne(embedding_vectors, n_components):\n",
    "    \"\"\"Project the embedding vectors down to n_components dimensions using t-SNE.\n",
    "\n",
    "    The perplexity is 5, lowered when there are fewer than 6 vectors because\n",
    "    t-SNE needs the perplexity to be smaller than the number of samples.\n",
    "    The random state is fixed so that repeated runs give the same layout.\n",
    "    \"\"\"\n",
    "    perplexity = min(5, len(embedding_vectors) - 1)\n",
    "    tsne = TSNE(n_components=n_components, random_state=42, perplexity=perplexity)\n",
    "    return tsne.fit_transform(embedding_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "427149d5-e5d8-4abd-bb6f-7ef0333cca21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We humans find it easier to visualize things in 2D!\n",
    "# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
    "# (t-distributed stochastic neighbor embedding)\n",
    "\n",
    "reduced_vectors = reduce_with_tsne(vectors, n_components=2)\n",
    "\n",
    "# Create the 2D scatter plot\n",
    "fig = go.Figure(data=[go.Scatter(\n",
    "    x=reduced_vectors[:, 0],\n",
    "    y=reduced_vectors[:, 1],\n",
    "    mode='markers',\n",
    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
    "    text=hover_text,\n",
    "    hoverinfo='text'\n",
    ")])\n",
    "\n",
    "# for a 2D figure the axis titles are set on the layout itself; `scene` is only for 3D plots\n",
    "fig.update_layout(\n",
    "    title='2D Chroma Vector Store Visualization',\n",
    "    xaxis_title='x',\n",
    "    yaxis_title='y',\n",
    "    width=800,\n",
    "    height=600,\n",
    "    margin=dict(r=20, b=10, l=10, t=40)\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try 3D!\n",
    "\n",
    "reduced_vectors = reduce_with_tsne(vectors, n_components=3)\n",
    "\n",
    "# Create the 3D scatter plot\n",
    "fig = go.Figure(data=[go.Scatter3d(\n",
    "    x=reduced_vectors[:, 0],\n",
    "    y=reduced_vectors[:, 1],\n",
    "    z=reduced_vectors[:, 2],\n",
    "    mode='markers',\n",
    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
    "    text=hover_text,\n",
    "    hoverinfo='text'\n",
    ")])\n",
    "\n",
    "fig.update_layout(\n",
    "    title='3D Chroma Vector Store Visualization',\n",
    "    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
    "    width=900,\n",
    "    height=700,\n",
    "    margin=dict(r=20, b=10, l=10, t=40)\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9468860b-86a2-41df-af01-b2400cc985be",
   "metadata": {},
   "source": [
    "## Time to use LangChain to bring it all together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45c0fb93-0a16-4e55-857b-1f9fd61ec24c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a new Chat with OpenAI\n",
    "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
    "\n",
    "# instructions placed at the start of every conversation memory\n",
    "SYSTEM_PROMPT = \"\"\"You are an AI Assistant specialized in providing accurate information about Sameer Khadatkar. Only respond when the question explicitly asks for information. \n",
    " Keep your answers brief, factual, and based solely on the information provided. Do not speculate or fabricate details. \n",
    " For example, if the user simply says \"hi,\" respond with: \"How can I help you?\"\n",
    " \"\"\"\n",
    "\n",
    "def new_memory():\n",
    "    \"\"\"Return a fresh conversation memory whose first message is the system prompt.\n",
    "\n",
    "    NOTE(review): the instructions live in the chat history rather than in the\n",
    "    answering prompt - confirm this is where they are meant to take effect.\n",
    "    \"\"\"\n",
    "    fresh_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
    "    fresh_memory.chat_memory.messages.insert(0, SystemMessage(content=SYSTEM_PROMPT))\n",
    "    return fresh_memory\n",
    "\n",
    "# set up the conversation memory for the chat\n",
    "memory = new_memory()\n",
    "\n",
    "# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
    "# the number of chunks to fetch is a search setting, so it is passed through search_kwargs\n",
    "retriever = vectorstore.as_retriever(search_kwargs={\"k\": 4})\n",
    "\n",
    "# putting it together: set up the conversation chain with the LLM chosen in MODEL, the vector store and memory\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "968e7bf2-e862-4679-a11f-6c1efb6ec8ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try a simple question\n",
    "\n",
    "query = \"Who are you?\"\n",
    "result = conversation_chain.invoke({\"question\": query})\n",
    "print(result[\"answer\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b5a9013-d5d4-4e25-9e7c-cdbb4f33e319",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up a new conversation memory for the chat: the history starts again,\n",
    "# and new_memory() puts the system prompt back in as its first message\n",
    "memory = new_memory()\n",
    "\n",
    "# putting it together: set up the conversation chain with the LLM chosen in MODEL, the vector store and memory\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bbbcb659-13ce-47ab-8a5e-01b930494964",
   "metadata": {},
   "source": [
    "## Now we will bring this up in Gradio using the Chat interface -\n",
    "\n",
    "A quick and easy way to prototype a chat with an LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3536590-85c7-4155-bd87-ae78a1467670",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapping that in a function\n",
    "\n",
    "def chat(question, history):\n",
    "    \"\"\"Send one question through the conversation chain and return the answer text.\n",
    "\n",
    "    The history argument is accepted because the chat interface supplies it,\n",
    "    but it is not used here: the chain is built with its own memory object.\n",
    "    \"\"\"\n",
    "    result = conversation_chain.invoke({\"question\": question})\n",
    "    return result[\"answer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b252d8c1-61a8-406d-b57a-8f708a62b014",
   "metadata": {},
   "outputs": [],
   "source": [
    "# And in Gradio:\n",
    "\n",
    "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}