Fix langchain module and update kb path

This commit is contained in:
Bharat Puri
2025-10-24 11:24:06 +05:30
parent cc7bf8bf5d
commit fe122c223d

View File

@@ -5,11 +5,14 @@
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6", "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Expert Knowledge Worker\n", "## Expert Files based Knowledge Worker\n",
"\n",
"Submitted By: Bharat Puri\n",
"\n", "\n",
"### A question answering agent that is an expert knowledge worker\n", "### A question answering agent that is an expert knowledge worker\n",
"### To be used by employees of Insurellm, an Insurance Tech company\n", "### To be used by employees of Insurellm, an Insurance Tech company\n",
"### The agent needs to be accurate and the solution should be low cost.\n", "### The agent needs to be accurate and the solution should be low cost.\n",
"### Fixes to the LangChain \n",
"\n", "\n",
"This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy." "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
] ]
@@ -26,12 +29,22 @@
"import os\n", "import os\n",
"import glob\n", "import glob\n",
"from dotenv import load_dotenv\n", "from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import sys\n",
"sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
"# LLM APIs\n",
"from openai import OpenAI\n",
"\n",
"# HuggingFace\n",
"from huggingface_hub import login\n",
"\n",
"# Gradio\n",
"import gradio as gr" "import gradio as gr"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 2,
"id": "78743444-cae7-4fad-bf66-dcfabbe73335", "id": "78743444-cae7-4fad-bf66-dcfabbe73335",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -41,35 +54,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 11,
"id": "802137aa-8a74-45e0-a487-d1974927d7ca", "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# # imports for langchain\n", "# LangChain v1.0+ imports\n",
"\n", "\n",
"# from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"# from langchain.text_splitter import CharacterTextSplitter\n",
"# from langchain.schema import Document\n",
"# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"# from langchain_chroma import Chroma\n",
"# import numpy as np\n",
"# from sklearn.manifold import TSNE\n",
"# import plotly.graph_objects as go\n",
"# from langchain.memory import ConversationBufferMemory\n",
"# from langchain.chains import ConversationalRetrievalChain"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e0e47493-d020-4fe4-bac9-6dc42dee1841",
"metadata": {},
"outputs": [],
"source": [
"# # imports for langchain\n",
"\n",
"# imports for langchain (modular setup)\n",
"from langchain_core.documents import Document\n", "from langchain_core.documents import Document\n",
"from langchain_core.messages import HumanMessage\n", "from langchain_core.messages import HumanMessage\n",
"from langchain_text_splitters import CharacterTextSplitter\n", "from langchain_text_splitters import CharacterTextSplitter\n",
@@ -77,10 +68,7 @@
"from langchain_chroma import Chroma\n", "from langchain_chroma import Chroma\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_huggingface import HuggingFaceEmbeddings\n",
"from langchain_core.callbacks import StdOutCallbackHandler\n", "from langchain_core.callbacks import StdOutCallbackHandler\n",
"# Other imports\n", "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n"
"import numpy as np\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go\n"
] ]
}, },
{ {
@@ -111,27 +99,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 12,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"ename": "NameError",
"evalue": "name 'DirectoryLoader' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n",
"\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined"
]
}
],
"source": [ "source": [
"# Read in documents using LangChain's loaders\n", "# Read in documents using LangChain's loaders\n",
"# Take everything in all the sub-folders of our knowledgebase\n", "# Take everything in all the sub-folders of our knowledgebase\n",
"\n", "\n",
"folders = glob.glob(\"knowledge-base/*\")\n", "folders = glob.glob(\"../../knowledge-base/*\")\n",
"\n", "\n",
"# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
"text_loader_kwargs = {'encoding': 'utf-8'}\n", "text_loader_kwargs = {'encoding': 'utf-8'}\n",
@@ -150,10 +126,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 1088, which is longer than the specified 1000\n"
]
}
],
"source": [ "source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)" "chunks = text_splitter.split_documents(documents)"
@@ -161,20 +145,39 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 14,
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"123"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"len(chunks)" "len(chunks)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document types found: company, employees, contracts, products\n"
]
}
],
"source": [ "source": [
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
"print(f\"Document types found: {', '.join(doc_types)}\")" "print(f\"Document types found: {', '.join(doc_types)}\")"