diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb new file mode 100644 index 0000000..75b190e --- /dev/null +++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb @@ -0,0 +1,498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", + "metadata": {}, + "source": [ + "## Expert Knowledge Worker\n", + "\n", + "### A question answering agent that is an expert knowledge worker\n", + "### To be used by employees of Insurellm, an Insurance Tech company\n", + "### The agent needs to be accurate and the solution should be low cost.\n", + "\n", + "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78743444-cae7-4fad-bf66-dcfabbe73335", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U -q imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "802137aa-8a74-45e0-a487-d1974927d7ca", + "metadata": {}, + "outputs": [], + "source": [ + "# # imports for langchain\n", + "\n", + "# from langchain.document_loaders import DirectoryLoader, TextLoader\n", + "# from langchain.text_splitter import CharacterTextSplitter\n", + "# from langchain.schema import Document\n", + "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "# from langchain_chroma import Chroma\n", + "# import numpy as np\n", + "# from sklearn.manifold import TSNE\n", + "# import plotly.graph_objects as go\n", + "# from langchain.memory import ConversationBufferMemory\n", + "# from langchain.chains import ConversationalRetrievalChain" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841", + "metadata": {}, + "outputs": [], + "source": [ + "# # imports for langchain\n", + "\n", + "# imports for langchain (modular setup)\n", + "from langchain_core.documents import Document\n", + "from langchain_core.messages import HumanMessage\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "from langchain_core.callbacks import StdOutCallbackHandler\n", + "# Other imports\n", + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "import plotly.graph_objects as go\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "58c85082-e417-4708-9efe-81a5d55d1424", + "metadata": {}, + "outputs": [], + "source": [ + "# price is a factor for our company, so we're going to use a low cost model\n", + "\n", + "MODEL = \"gpt-4o-mini\"\n", + "db_name = \"vector_db\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ee78efcb-60fe-449e-a944-40bab26261af", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'DirectoryLoader' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n", + "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined" + ] + } + ], + "source": [ + "# Read in documents using LangChain's loaders\n", + "# Take everything in all the sub-folders of our knowledgebase\n", + "\n", + "folders = glob.glob(\"knowledge-base/*\")\n", + "\n", + "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", + "text_loader_kwargs = {'encoding': 'utf-8'}\n", + "# If that doesn't work, some Windows users might need to uncomment the next line instead\n", + "# text_loader_kwargs={'autodetect_encoding': True}\n", + "\n", + "documents = []\n", + "for folder in folders:\n", + " doc_type = os.path.basename(folder)\n", + " loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", + " folder_docs = loader.load()\n", + " for doc in folder_docs:\n", + " doc.metadata[\"doc_type\"] = doc_type\n", + " documents.append(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", + "metadata": {}, + "outputs": [], + "source": [ + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + "chunks = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", + "metadata": {}, + "outputs": [], + "source": [ + "len(chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", + "metadata": {}, + "outputs": [], + "source": [ + "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", + "print(f\"Document types found: {', '.join(doc_types)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "77f7d2a6-ccfa-425b-a1c3-5e55b23bd013", + "metadata": {}, + "source": [ + "## A sidenote on Embeddings, and \"Auto-Encoding LLMs\"\n", + "\n", + "We will be mapping each chunk of text into a Vector that represents the meaning of the text, known as an embedding.\n", + "\n", + "OpenAI offers a model to do this, which we will use by calling their API with some LangChain code.\n", + "\n", + "This model is an example of an \"Auto-Encoding LLM\" which generates an output given a complete input.\n", + "It's different to all the other LLMs we've discussed today, which are known as \"Auto-Regressive LLMs\", and generate future tokens based only on past context.\n", + "\n", + "Another example of an Auto-Encoding LLMs is BERT from Google. In addition to embedding, Auto-encoding LLMs are often used for classification.\n", + "\n", + "### Sidenote\n", + "\n", + "In week 8 we will return to RAG and vector embeddings, and we will use an open-source vector encoder so that the data never leaves our computer - that's an important consideration when building enterprise systems and the data needs to remain internal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78998399-ac17-4e28-b15f-0b5f51e6ee23", + "metadata": {}, + "outputs": [], + "source": [ + "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n", + "# Chroma is a popular open source Vector Database based on SQLLite\n", + "\n", + "# embeddings = OpenAIEmbeddings()\n", + "\n", + "# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers\n", + "# Then replace embeddings = OpenAIEmbeddings()\n", + "# with:\n", + "# from langchain.embeddings import HuggingFaceEmbeddings\n", + "# embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\",show_progress=False # you can set this False to hide the download bar)\n", + "\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " \n", + "# Delete if already exists\n", + "\n", + "if os.path.exists(db_name):\n", + " Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n", + "\n", + "# Create vectorstore\n", + "\n", + "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n", + "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "057868f6-51a6-4087-94d1-380145821550", + "metadata": {}, + "outputs": [], + "source": [ + "# Get one vector and find how many dimensions it has\n", + "\n", + "collection = vectorstore._collection\n", + "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", + "dimensions = len(sample_embedding)\n", + "print(f\"The vectors have {dimensions:,} dimensions\")" + ] + }, + { + "cell_type": "markdown", + "id": "b0d45462-a818-441c-b010-b85b32bcf618", + "metadata": {}, + "source": [ + "## Visualizing the Vector Store\n", + "\n", + "Let's take a minute to look at the documents and their embedding vectors to see what's going on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b", + "metadata": {}, + "outputs": [], + "source": [ + "# Prework\n", + "\n", + "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", + "vectors = np.array(result['embeddings'])\n", + "documents = result['documents']\n", + "doc_types = [metadata['doc_type'] for metadata in result['metadatas']]\n", + "colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "427149d5-e5d8-4abd-bb6f-7ef0333cca21", + "metadata": {}, + "outputs": [], + "source": [ + "# We humans find it easier to visalize things in 2D!\n", + "# Reduce the dimensionality of the vectors to 2D using t-SNE\n", + "# (t-distributed stochastic neighbor embedding)\n", + "\n", + "tsne = TSNE(n_components=2, random_state=42)\n", + "reduced_vectors = tsne.fit_transform(vectors)\n", + "\n", + "# Create the 2D scatter plot\n", + "fig = go.Figure(data=[go.Scatter(\n", + " x=reduced_vectors[:, 0],\n", + " y=reduced_vectors[:, 1],\n", + " mode='markers',\n", + " marker=dict(size=5, color=colors, opacity=0.8),\n", + " text=[f\"Type: {t}
Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", + " hoverinfo='text'\n", + ")])\n", + "\n", + "fig.update_layout(\n", + " title='2D Chroma Vector Store Visualization',\n", + " scene=dict(xaxis_title='x',yaxis_title='y'),\n", + " width=800,\n", + " height=600,\n", + " margin=dict(r=20, b=10, l=10, t=40)\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's try 3D!\n", + "\n", + "tsne = TSNE(n_components=3, random_state=42)\n", + "reduced_vectors = tsne.fit_transform(vectors)\n", + "\n", + "# Create the 3D scatter plot\n", + "fig = go.Figure(data=[go.Scatter3d(\n", + " x=reduced_vectors[:, 0],\n", + " y=reduced_vectors[:, 1],\n", + " z=reduced_vectors[:, 2],\n", + " mode='markers',\n", + " marker=dict(size=5, color=colors, opacity=0.8),\n", + " text=[f\"Type: {t}
Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", + " hoverinfo='text'\n", + ")])\n", + "\n", + "fig.update_layout(\n", + " title='3D Chroma Vector Store Visualization',\n", + " scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n", + " width=900,\n", + " height=700,\n", + " margin=dict(r=20, b=10, l=10, t=40)\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9468860b-86a2-41df-af01-b2400cc985be", + "metadata": {}, + "source": [ + "# Time to use LangChain to bring it all together" + ] + }, + { + "cell_type": "markdown", + "id": "8ba8a5e7-965d-4770-a12d-532aff50c4b5", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

PLEASE READ ME! Ignoring the Deprecation Warning

\n", + " When you run the next cell, you will get a LangChainDeprecationWarning \n", + " about the simple way we use LangChain memory. They ask us to migrate to their new approach for memory. \n", + " I feel quite conflicted about this. The new approach involves moving to LangGraph and getting deep into their ecosystem.\n", + " There's a fair amount of learning and coding in LangGraph, frankly without much benefit in our case.

\n", + " I'm going to think about whether/how to incorporate it in the course, but for now please ignore the Depreciation Warning and\n", + " use the code as is; LangChain are not expected to remove ConversationBufferMemory any time soon.\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "3dd0a478-bde4-41f8-8fe8-a35d3246e1b6", + "metadata": {}, + "source": [ + "## Alternative: to use a free open-source model instead of OpenAI in the next cell\n", + "\n", + "First run this in a cell: `!pip install langchain-ollama`\n", + "\n", + "Then replace `llm = ChatOpenAI(temperature=0.7, model_name=MODEL)` with:\n", + "\n", + "```python\n", + "from langchain_ollama import ChatOllama\n", + "llm = ChatOllama(temperature=0.7, model=\"llama3.2\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "129c7d1e-0094-4479-9459-f9360b95f244", + "metadata": {}, + "outputs": [], + "source": [ + "# create a new Chat with OpenAI\n", + "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n", + "\n", + "# set up the conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# the retriever is an abstraction over the VectorStore that will be used during RAG\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "968e7bf2-e862-4679-a11f-6c1efb6ec8ca", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Can you describe Insurellm in a few sentences\"\n", + "result = conversation_chain.invoke({\"question\":query})\n", + "print(result[\"answer\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6eb99fb-33ec-4025-ab92-b634ede03647", + "metadata": {}, + "outputs": [], + "source": [ + "# set up a new conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" + ] + }, + { + "cell_type": "markdown", + "id": "bbbcb659-13ce-47ab-8a5e-01b930494964", + "metadata": {}, + "source": [ + "## Now we will bring this up in Gradio using the Chat interface -\n", + "\n", + "A quick and easy way to prototype a chat with an LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3536590-85c7-4155-bd87-ae78a1467670", + "metadata": {}, + "outputs": [], + "source": [ + "# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain\n", + "\n", + "def chat(message, history):\n", + " result = conversation_chain.invoke({\"question\": message})\n", + " return result[\"answer\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b252d8c1-61a8-406d-b57a-8f708a62b014", + "metadata": {}, + "outputs": [], + "source": [ + "# And in Gradio:\n", + "\n", + "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}