Fix langchain module imports and update kb path
@@ -5,11 +5,14 @@
    "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
    "metadata": {},
    "source": [
-    "## Expert Knowledge Worker\n",
+    "## Expert Files based Knowledge Worker\n",
     "\n",
+    "Submitted By: Bharat Puri\n",
+    "\n",
     "### A question answering agent that is an expert knowledge worker\n",
     "### To be used by employees of Insurellm, an Insurance Tech company\n",
     "### The agent needs to be accurate and the solution should be low cost.\n",
+    "### Fixes to the LangChain \n",
     "\n",
     "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
    ]
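The markdown cell above frames the project as a RAG pipeline: embed the knowledge base into a vector store, retrieve the most relevant chunks for each question, and let a chat model answer from them. A minimal sketch of that answering loop, assuming the conversational-retrieval pieces from the commented-out legacy imports later in this diff and an already-populated Chroma store; the persist directory, model name, and example question are illustrative assumptions, not code from this notebook:

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Assumed, already-populated vector store of knowledge-base chunks
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(persist_directory="vector_db", embedding_function=embeddings)

llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")  # model choice is an assumption
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)

result = chain.invoke({"question": "Who is the CEO of Insurellm?"})
print(result["answer"])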
@@ -26,12 +29,22 @@
     "import os\n",
     "import glob\n",
     "from dotenv import load_dotenv\n",
     "import gradio as gr\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
+    "# LLM APIs\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "# HuggingFace\n",
+    "from huggingface_hub import login\n",
+    "\n",
+    "# Gradio\n",
+    "import gradio as gr"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "id": "78743444-cae7-4fad-bf66-dcfabbe73335",
    "metadata": {},
    "outputs": [],
@@ -41,35 +54,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 11,
    "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# # imports for langchain\n",
+    "# LangChain v1.0+ imports\n",
     "\n",
-    "# from langchain.document_loaders import DirectoryLoader, TextLoader\n",
-    "# from langchain.text_splitter import CharacterTextSplitter\n",
-    "# from langchain.schema import Document\n",
-    "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
-    "# from langchain_chroma import Chroma\n",
-    "# import numpy as np\n",
-    "# from sklearn.manifold import TSNE\n",
-    "# import plotly.graph_objects as go\n",
-    "# from langchain.memory import ConversationBufferMemory\n",
-    "# from langchain.chains import ConversationalRetrievalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# # imports for langchain\n",
-    "\n",
-    "# imports for langchain (modular setup)\n",
     "from langchain_core.documents import Document\n",
     "from langchain_core.messages import HumanMessage\n",
     "from langchain_text_splitters import CharacterTextSplitter\n",
@@ -77,10 +68,7 @@
     "from langchain_chroma import Chroma\n",
     "from langchain_huggingface import HuggingFaceEmbeddings\n",
     "from langchain_core.callbacks import StdOutCallbackHandler\n",
-    "# Other imports\n",
-    "import numpy as np\n",
-    "from sklearn.manifold import TSNE\n",
-    "import plotly.graph_objects as go\n"
+    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n"
    ]
   },
   {
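The net effect of the two hunks above is the fix named in the commit title: the loader classes now come from langchain_community rather than the legacy top-level langchain package, which clears the NameError shown (and removed) in the next hunk. A consolidated, runnable sketch of the post-fix import layout plus a one-folder load; the sample folder path is an assumption:

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter

# Load one knowledge-base folder to confirm the loaders resolve (path is illustrative)
loader = DirectoryLoader("../../knowledge-base/products", glob="**/*.md",
                         loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"})
docs = loader.load()
print(len(docs), all(isinstance(d, Document) for d in docs))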
@@ -111,27 +99,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'DirectoryLoader' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n",
-      "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Read in documents using LangChain's loaders\n",
     "# Take everything in all the sub-folders of our knowledgebase\n",
     "\n",
-    "folders = glob.glob(\"knowledge-base/*\")\n",
+    "folders = glob.glob(\"../../knowledge-base/*\")\n",
     "\n",
     "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
     "text_loader_kwargs = {'encoding': 'utf-8'}\n",
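Only the glob path changes in this hunk, but the removed traceback shows the rest of the loading loop, so the cell presumably reads roughly as below once DirectoryLoader is imported; the doc_type tagging is inferred from the metadata lookup in the last hunk:

import os, glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader

folders = glob.glob("../../knowledge-base/*")
text_loader_kwargs = {"encoding": "utf-8"}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)          # e.g. "products", "employees"
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader,
                             loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type      # tag each document with its folder name
        documents.append(doc)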
@@ -150,10 +126,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Created a chunk of size 1088, which is longer than the specified 1000\n"
+     ]
+    }
+   ],
    "source": [
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
     "chunks = text_splitter.split_documents(documents)"
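The new stderr output is expected behaviour rather than an error: CharacterTextSplitter first cuts on its separator (a blank line by default) and never splits inside a single piece, so one long paragraph can yield a chunk above chunk_size, hence the 1088-character warning. A small check reusing the same settings, assuming `documents` from the loading step above:

from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)   # documents from the loader loop
print(max(len(chunk.page_content) for chunk in chunks))  # can exceed 1000 for long paragraphs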
@@ -161,20 +145,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "123"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "len(chunks)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document types found: company, employees, contracts, products\n"
+     ]
+    }
+   ],
    "source": [
     "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
     "print(f\"Document types found: {', '.join(doc_types)}\")"