From cc7bf8bf5dfbb727379d5fc9a2c2a1362267c6a6 Mon Sep 17 00:00:00 2001 From: Bharat Puri Date: Fri, 24 Oct 2025 11:16:38 +0530 Subject: [PATCH 1/3] Added file based kb --- .../files_based_knowledge_base.ipynb | 498 ++++++++++++++++++ 1 file changed, 498 insertions(+) create mode 100644 week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb new file mode 100644 index 0000000..75b190e --- /dev/null +++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb @@ -0,0 +1,498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", + "metadata": {}, + "source": [ + "## Expert Knowledge Worker\n", + "\n", + "### A question answering agent that is an expert knowledge worker\n", + "### To be used by employees of Insurellm, an Insurance Tech company\n", + "### The agent needs to be accurate and the solution should be low cost.\n", + "\n", + "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78743444-cae7-4fad-bf66-dcfabbe73335", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U -q imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "802137aa-8a74-45e0-a487-d1974927d7ca", + "metadata": {}, + "outputs": [], + "source": [ + "# # imports for langchain\n", + "\n", + "# from langchain.document_loaders import DirectoryLoader, TextLoader\n", + "# from langchain.text_splitter import CharacterTextSplitter\n", + "# from langchain.schema import Document\n", + "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "# from langchain_chroma import Chroma\n", + "# import numpy as np\n", + "# from sklearn.manifold import TSNE\n", + "# import plotly.graph_objects as go\n", + "# from langchain.memory import ConversationBufferMemory\n", + "# from langchain.chains import ConversationalRetrievalChain" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841", + "metadata": {}, + "outputs": [], + "source": [ + "# # imports for langchain\n", + "\n", + "# imports for langchain (modular setup)\n", + "from langchain_core.documents import Document\n", + "from langchain_core.messages import HumanMessage\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "from langchain_core.callbacks import StdOutCallbackHandler\n", + "# Other imports\n", + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "import plotly.graph_objects as go\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "58c85082-e417-4708-9efe-81a5d55d1424", + "metadata": {}, + "outputs": [], + "source": [ + "# price is a factor for 
our company, so we're going to use a low cost model\n", + "\n", + "MODEL = \"gpt-4o-mini\"\n", + "db_name = \"vector_db\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ee78efcb-60fe-449e-a944-40bab26261af", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'DirectoryLoader' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n", + "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined" + ] + } + ], + "source": [ + "# Read in documents using LangChain's loaders\n", + "# Take everything in all the sub-folders of our knowledgebase\n", + "\n", + "folders = glob.glob(\"knowledge-base/*\")\n", + "\n", + "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", + "text_loader_kwargs = {'encoding': 'utf-8'}\n", + "# If that doesn't work, some Windows users might need to uncomment the next line instead\n", + "# text_loader_kwargs={'autodetect_encoding': True}\n", + "\n", + "documents = []\n", + "for folder in folders:\n", + " doc_type = os.path.basename(folder)\n", + " loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", + " folder_docs = loader.load()\n", + " for doc in folder_docs:\n", + " doc.metadata[\"doc_type\"] = doc_type\n", + " documents.append(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", + "metadata": {}, + "outputs": [], + "source": [ + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + "chunks = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", + "metadata": {}, + "outputs": [], + "source": [ + "len(chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", + "metadata": {}, + "outputs": [], + "source": [ + "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", + "print(f\"Document types found: {', '.join(doc_types)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "77f7d2a6-ccfa-425b-a1c3-5e55b23bd013", + "metadata": {}, + "source": [ + "## A sidenote on Embeddings, and \"Auto-Encoding LLMs\"\n", + "\n", + "We will be mapping each chunk of text into a Vector that 
represents the meaning of the text, known as an embedding.\n", + "\n", + "OpenAI offers a model to do this, which we will use by calling their API with some LangChain code.\n", + "\n", + "This model is an example of an \"Auto-Encoding LLM\" which generates an output given a complete input.\n", + "It's different to all the other LLMs we've discussed today, which are known as \"Auto-Regressive LLMs\", and generate future tokens based only on past context.\n", + "\n", + "Another example of an Auto-Encoding LLMs is BERT from Google. In addition to embedding, Auto-encoding LLMs are often used for classification.\n", + "\n", + "### Sidenote\n", + "\n", + "In week 8 we will return to RAG and vector embeddings, and we will use an open-source vector encoder so that the data never leaves our computer - that's an important consideration when building enterprise systems and the data needs to remain internal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78998399-ac17-4e28-b15f-0b5f51e6ee23", + "metadata": {}, + "outputs": [], + "source": [ + "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n", + "# Chroma is a popular open source Vector Database based on SQLLite\n", + "\n", + "# embeddings = OpenAIEmbeddings()\n", + "\n", + "# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers\n", + "# Then replace embeddings = OpenAIEmbeddings()\n", + "# with:\n", + "# from langchain.embeddings import HuggingFaceEmbeddings\n", + "# embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\",show_progress=False # you can set this False to hide the download bar)\n", + "\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " \n", + "# Delete if already exists\n", + "\n", + "if os.path.exists(db_name):\n", + " Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n", + "\n", + "# Create vectorstore\n", + "\n", + "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n", + "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "057868f6-51a6-4087-94d1-380145821550", + "metadata": {}, + "outputs": [], + "source": [ + "# Get one vector and find how many dimensions it has\n", + "\n", + "collection = vectorstore._collection\n", + "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", + "dimensions = len(sample_embedding)\n", + "print(f\"The vectors have {dimensions:,} dimensions\")" + ] + }, + { + "cell_type": "markdown", + "id": "b0d45462-a818-441c-b010-b85b32bcf618", + "metadata": {}, + "source": [ + "## Visualizing the Vector Store\n", + "\n", + "Let's take a minute to look at the documents and their embedding vectors to see what's going on." 
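+    ,"\n",
+    "Before the plots, it can help to sanity-check the store with a direct similarity query. A minimal sketch (assuming the `vectorstore` built above; the query text is just an example):\n",
+    "\n",
+    "```python\n",
+    "# Return the 3 chunks whose embeddings sit closest to the query's embedding\n",
+    "docs = vectorstore.similarity_search(\"What products does Insurellm offer?\", k=3)\n",
+    "for doc in docs:\n",
+    "    print(doc.metadata[\"doc_type\"], \"-\", doc.page_content[:80])\n",
+    "```"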
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prework\n",
+    "\n",
+    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
+    "vectors = np.array(result['embeddings'])\n",
+    "documents = result['documents']\n",
+    "doc_types = [metadata['doc_type'] for metadata in result['metadatas']]\n",
+    "colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427149d5-e5d8-4abd-bb6f-7ef0333cca21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We humans find it easier to visualize things in 2D!\n",
+    "# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
+    "# (t-distributed stochastic neighbor embedding)\n",
+    "\n",
+    "tsne = TSNE(n_components=2, random_state=42)\n",
+    "reduced_vectors = tsne.fit_transform(vectors)\n",
+    "\n",
+    "# Create the 2D scatter plot\n",
+    "fig = go.Figure(data=[go.Scatter(\n",
+    "    x=reduced_vectors[:, 0],\n",
+    "    y=reduced_vectors[:, 1],\n",
+    "    mode='markers',\n",
+    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
+    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
+    "    hoverinfo='text'\n",
+    ")])\n",
+    "\n",
+    "fig.update_layout(\n",
+    "    title='2D Chroma Vector Store Visualization',\n",
+    "    xaxis_title='x',\n",
+    "    yaxis_title='y',\n",
+    "    width=800,\n",
+    "    height=600,\n",
+    "    margin=dict(r=20, b=10, l=10, t=40)\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's try 3D!\n",
+    "\n",
+    "tsne = TSNE(n_components=3, random_state=42)\n",
+    "reduced_vectors = tsne.fit_transform(vectors)\n",
+    "\n",
+    "# Create the 3D scatter plot\n",
+    "fig = go.Figure(data=[go.Scatter3d(\n",
+    "    x=reduced_vectors[:, 0],\n",
+    "    y=reduced_vectors[:, 1],\n",
+    "    z=reduced_vectors[:, 2],\n",
+    "    mode='markers',\n",
+    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
+    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
+    "    hoverinfo='text'\n",
+    ")])\n",
+    "\n",
+    "fig.update_layout(\n",
+    "    title='3D Chroma Vector Store Visualization',\n",
+    "    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
+    "    width=900,\n",
+    "    height=700,\n",
+    "    margin=dict(r=20, b=10, l=10, t=40)\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9468860b-86a2-41df-af01-b2400cc985be",
+   "metadata": {},
+   "source": [
+    "# Time to use LangChain to bring it all together"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ba8a5e7-965d-4770-a12d-532aff50c4b5",
+   "metadata": {},
+   "source": [
\n", + " \n", + " \n", + "

PLEASE READ ME! Ignoring the Deprecation Warning

\n", + " When you run the next cell, you will get a LangChainDeprecationWarning \n", + " about the simple way we use LangChain memory. They ask us to migrate to their new approach for memory. \n", + " I feel quite conflicted about this. The new approach involves moving to LangGraph and getting deep into their ecosystem.\n", + " There's a fair amount of learning and coding in LangGraph, frankly without much benefit in our case.

\n", + " I'm going to think about whether/how to incorporate it in the course, but for now please ignore the Depreciation Warning and\n", + " use the code as is; LangChain are not expected to remove ConversationBufferMemory any time soon.\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "3dd0a478-bde4-41f8-8fe8-a35d3246e1b6", + "metadata": {}, + "source": [ + "## Alternative: to use a free open-source model instead of OpenAI in the next cell\n", + "\n", + "First run this in a cell: `!pip install langchain-ollama`\n", + "\n", + "Then replace `llm = ChatOpenAI(temperature=0.7, model_name=MODEL)` with:\n", + "\n", + "```python\n", + "from langchain_ollama import ChatOllama\n", + "llm = ChatOllama(temperature=0.7, model=\"llama3.2\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "129c7d1e-0094-4479-9459-f9360b95f244", + "metadata": {}, + "outputs": [], + "source": [ + "# create a new Chat with OpenAI\n", + "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n", + "\n", + "# set up the conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# the retriever is an abstraction over the VectorStore that will be used during RAG\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "968e7bf2-e862-4679-a11f-6c1efb6ec8ca", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Can you describe Insurellm in a few sentences\"\n", + "result = conversation_chain.invoke({\"question\":query})\n", + "print(result[\"answer\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6eb99fb-33ec-4025-ab92-b634ede03647", + "metadata": {}, + "outputs": [], + "source": [ + "# set up a new conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" + ] + }, + { + "cell_type": "markdown", + "id": "bbbcb659-13ce-47ab-8a5e-01b930494964", + "metadata": {}, + "source": [ + "## Now we will bring this up in Gradio using the Chat interface -\n", + "\n", + "A quick and easy way to prototype a chat with an LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3536590-85c7-4155-bd87-ae78a1467670", + "metadata": {}, + "outputs": [], + "source": [ + "# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain\n", + "\n", + "def chat(message, history):\n", + " result = conversation_chain.invoke({\"question\": message})\n", + " return result[\"answer\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b252d8c1-61a8-406d-b57a-8f708a62b014", + "metadata": {}, + "outputs": [], + "source": [ + "# And in Gradio:\n", + "\n", + "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 
fe122c223d6a12678a8d908d94580373af170f82 Mon Sep 17 00:00:00 2001 From: Bharat Puri Date: Fri, 24 Oct 2025 11:24:06 +0530 Subject: [PATCH 2/3] Fix langchain module and updated kb path --- .../files_based_knowledge_base.ipynb | 105 +++++++++--------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb index 75b190e..554f6c6 100644 --- a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb +++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb @@ -5,11 +5,14 @@ "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", "metadata": {}, "source": [ - "## Expert Knowledge Worker\n", + "## Expert Files based Knowledge Worker\n", + "\n", + "Submitted By: Bharat Puri\n", "\n", "### A question answering agent that is an expert knowledge worker\n", "### To be used by employees of Insurellm, an Insurance Tech company\n", "### The agent needs to be accurate and the solution should be low cost.\n", + "### Fixes to the LangChain \n", "\n", "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy." ] @@ -26,12 +29,22 @@ "import os\n", "import glob\n", "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import sys\n", + "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n", + "# LLM APIs\n", + "from openai import OpenAI\n", + "\n", + "# HuggingFace\n", + "from huggingface_hub import login\n", + "\n", + "# Gradio\n", "import gradio as gr" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "78743444-cae7-4fad-bf66-dcfabbe73335", "metadata": {}, "outputs": [], @@ -41,35 +54,13 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "id": "802137aa-8a74-45e0-a487-d1974927d7ca", "metadata": {}, "outputs": [], "source": [ - "# # imports for langchain\n", + "# LangChain v1.0+ imports\n", "\n", - "# from langchain.document_loaders import DirectoryLoader, TextLoader\n", - "# from langchain.text_splitter import CharacterTextSplitter\n", - "# from langchain.schema import Document\n", - "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", - "# from langchain_chroma import Chroma\n", - "# import numpy as np\n", - "# from sklearn.manifold import TSNE\n", - "# import plotly.graph_objects as go\n", - "# from langchain.memory import ConversationBufferMemory\n", - "# from langchain.chains import ConversationalRetrievalChain" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841", - "metadata": {}, - "outputs": [], - "source": [ - "# # imports for langchain\n", - "\n", - "# imports for langchain (modular setup)\n", "from langchain_core.documents import Document\n", "from langchain_core.messages import HumanMessage\n", "from langchain_text_splitters import CharacterTextSplitter\n", @@ -77,10 +68,7 @@ "from langchain_chroma import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_core.callbacks import StdOutCallbackHandler\n", - "# Other imports\n", - "import numpy as np\n", - "from sklearn.manifold import TSNE\n", - "import plotly.graph_objects as go\n" + "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n" ] }, { @@ -111,27 +99,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", 
"metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'DirectoryLoader' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n", - "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined" - ] - } - ], + "outputs": [], "source": [ "# Read in documents using LangChain's loaders\n", "# Take everything in all the sub-folders of our knowledgebase\n", "\n", - "folders = glob.glob(\"knowledge-base/*\")\n", + "folders = glob.glob(\"../../knowledge-base/*\")\n", "\n", "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", "text_loader_kwargs = {'encoding': 'utf-8'}\n", @@ -150,10 +126,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Created a chunk of size 1088, which is longer than the specified 1000\n" + ] + } + ], "source": [ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)" @@ -161,20 +145,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "123" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "len(chunks)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document types found: company, employees, contracts, products\n" + ] + } + ], "source": [ "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", "print(f\"Document types found: {', '.join(doc_types)}\")" From 54d717fbd217bf46284ab77fa735dfe67961ede2 Mon Sep 17 00:00:00 2001 From: Bharat Puri Date: Fri, 24 Oct 2025 12:14:48 +0530 Subject: [PATCH 3/3] fixed memory method of langchain --- .../files_based_knowledge_base.ipynb | 100 ++++++++++-------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb index 554f6c6..7fd4999 100644 --- a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb +++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb @@ -44,17 +44,30 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, 
"id": "78743444-cae7-4fad-bf66-dcfabbe73335", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "!pip install -U -q imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers" + "# !pip install -U imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, + "id": "71924170-e73a-4e98-a34a-c5c0567f39da", + "metadata": {}, + "outputs": [], + "source": [ + "## Install specific version of langchain to avoid future issues\n", + "!pip install -U -q imapclient langchain==1.0.2 langchain-openai==1.0.1 langchain-chroma==1.0.0 langchain-community==0.4 langchain-core==1.0.0 langchain-text-splitters==1.0.0 langchain-huggingface==1.0.0 langchain-classic==1.0.0 chromadb==1.2.1 sentence-transformers==5.1.2" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "id": "802137aa-8a74-45e0-a487-d1974927d7ca", "metadata": {}, "outputs": [], @@ -68,7 +81,10 @@ "from langchain_chroma import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_core.callbacks import StdOutCallbackHandler\n", - "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n" + "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n", + "from langchain_classic.memory import ConversationBufferMemory\n", + "from langchain_classic.chains import ConversationalRetrievalChain\n", + "\n" ] }, { @@ -99,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", "metadata": {}, "outputs": [], @@ -126,18 +142,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Created a chunk of size 1088, which is longer than the specified 1000\n" - ] - } - ], + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)" @@ -145,28 +155,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "123" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(chunks)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", "metadata": {}, "outputs": [ @@ -174,7 +173,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Document types found: company, employees, contracts, products\n" + "Document types found: employees, products, contracts, company\n" ] } ], @@ -208,13 +207,15 @@ "cell_type": "code", "execution_count": null, "id": "78998399-ac17-4e28-b15f-0b5f51e6ee23", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n", "# Chroma is a popular open source Vector Database based on SQLLite\n", "\n", - "# embeddings = OpenAIEmbeddings()\n", + "embeddings = 
OpenAIEmbeddings()\n", "\n", "# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers\n", "# Then replace embeddings = OpenAIEmbeddings()\n", @@ -222,7 +223,8 @@ "# from langchain.embeddings import HuggingFaceEmbeddings\n", "# embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\",show_progress=False # you can set this False to hide the download bar)\n", "\n", - "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "# embeddings = HuggingFaceEmbeddings(\n", + "# model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", " \n", "# Delete if already exists\n", "\n", @@ -239,7 +241,9 @@ "cell_type": "code", "execution_count": null, "id": "057868f6-51a6-4087-94d1-380145821550", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Get one vector and find how many dimensions it has\n", @@ -262,12 +266,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b", "metadata": {}, "outputs": [], "source": [ "# Prework\n", + "import numpy as np\n", "\n", "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", "vectors = np.array(result['embeddings'])\n", @@ -287,6 +292,9 @@ "# Reduce the dimensionality of the vectors to 2D using t-SNE\n", "# (t-distributed stochastic neighbor embedding)\n", "\n", + "from sklearn.manifold import TSNE\n", + "import plotly.graph_objects as go\n", + "\n", "tsne = TSNE(n_components=2, random_state=42)\n", "reduced_vectors = tsne.fit_transform(vectors)\n", "\n", @@ -315,7 +323,9 @@ "cell_type": "code", "execution_count": null, "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Let's try 3D!\n", @@ -396,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "129c7d1e-0094-4479-9459-f9360b95f244", "metadata": {}, "outputs": [], @@ -418,7 +428,9 @@ "cell_type": "code", "execution_count": null, "id": "968e7bf2-e862-4679-a11f-6c1efb6ec8ca", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "query = \"Can you describe Insurellm in a few sentences\"\n", @@ -428,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "e6eb99fb-33ec-4025-ab92-b634ede03647", "metadata": {}, "outputs": [], @@ -452,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "c3536590-85c7-4155-bd87-ae78a1467670", "metadata": {}, "outputs": [], @@ -468,7 +480,9 @@ "cell_type": "code", "execution_count": null, "id": "b252d8c1-61a8-406d-b57a-8f708a62b014", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# And in Gradio:\n",
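+    "# Optional launch tweaks (illustrative; both are documented Gradio launch() parameters):\n",
+    "#   .launch(inbrowser=True, share=True) also tunnels a temporary public URL\n",
+    "#   .launch(inbrowser=True, auth=(\"user\", \"password\")) adds a simple login prompt\n",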