Fix LangChain module imports and update knowledge-base path
This commit is contained in:
@@ -5,11 +5,14 @@
|
|||||||
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
|
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Expert Knowledge Worker\n",
|
"## Expert File-Based Knowledge Worker\n",
|
||||||
|
"\n",
|
||||||
|
"Submitted By: Bharat Puri\n",
|
||||||
"\n",
|
"\n",
|
||||||
"### A question answering agent that is an expert knowledge worker\n",
|
"### A question answering agent that is an expert knowledge worker\n",
|
||||||
"### To be used by employees of Insurellm, an Insurance Tech company\n",
|
"### To be used by employees of Insurellm, an Insurance Tech company\n",
|
||||||
"### The agent needs to be accurate and the solution should be low cost.\n",
|
"### The agent needs to be accurate and the solution should be low cost.\n",
|
||||||
|
"### Includes fixes for LangChain v1.0+ imports\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
|
"This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
|
||||||
]
|
]
|
||||||
@@ -26,12 +29,22 @@
|
|||||||
"import os\n",
|
"import os\n",
|
||||||
"import glob\n",
|
"import glob\n",
|
||||||
"from dotenv import load_dotenv\n",
|
"from dotenv import load_dotenv\n",
|
||||||
|
"import gradio as gr\n",
|
||||||
|
"import sys\n",
|
||||||
|
"sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
|
||||||
|
"# LLM APIs\n",
|
||||||
|
"from openai import OpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"# HuggingFace\n",
|
||||||
|
"from huggingface_hub import login\n",
|
||||||
|
"\n",
|
||||||
|
"# Gradio\n",
|
||||||
"import gradio as gr"
|
"import gradio as gr"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 2,
|
||||||
"id": "78743444-cae7-4fad-bf66-dcfabbe73335",
|
"id": "78743444-cae7-4fad-bf66-dcfabbe73335",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -41,35 +54,13 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 27,
|
"execution_count": 11,
|
||||||
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# # imports for langchain\n",
|
"# LangChain v1.0+ imports\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# from langchain.document_loaders import DirectoryLoader, TextLoader\n",
|
|
||||||
"# from langchain.text_splitter import CharacterTextSplitter\n",
|
|
||||||
"# from langchain.schema import Document\n",
|
|
||||||
"# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
|
|
||||||
"# from langchain_chroma import Chroma\n",
|
|
||||||
"# import numpy as np\n",
|
|
||||||
"# from sklearn.manifold import TSNE\n",
|
|
||||||
"# import plotly.graph_objects as go\n",
|
|
||||||
"# from langchain.memory import ConversationBufferMemory\n",
|
|
||||||
"# from langchain.chains import ConversationalRetrievalChain"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "e0e47493-d020-4fe4-bac9-6dc42dee1841",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# # imports for langchain\n",
|
|
||||||
"\n",
|
|
||||||
"# imports for langchain (modular setup)\n",
|
|
||||||
"from langchain_core.documents import Document\n",
|
"from langchain_core.documents import Document\n",
|
||||||
"from langchain_core.messages import HumanMessage\n",
|
"from langchain_core.messages import HumanMessage\n",
|
||||||
"from langchain_text_splitters import CharacterTextSplitter\n",
|
"from langchain_text_splitters import CharacterTextSplitter\n",
|
||||||
@@ -77,10 +68,7 @@
|
|||||||
"from langchain_chroma import Chroma\n",
|
"from langchain_chroma import Chroma\n",
|
||||||
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
||||||
"from langchain_core.callbacks import StdOutCallbackHandler\n",
|
"from langchain_core.callbacks import StdOutCallbackHandler\n",
|
||||||
"# Other imports\n",
|
"from langchain_community.document_loaders import DirectoryLoader, TextLoader\n"
|
||||||
"import numpy as np\n",
|
|
||||||
"from sklearn.manifold import TSNE\n",
|
|
||||||
"import plotly.graph_objects as go\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -111,27 +99,15 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 12,
|
||||||
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'DirectoryLoader' is not defined",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
||||||
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n",
|
|
||||||
"\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"# Read in documents using LangChain's loaders\n",
|
"# Read in documents using LangChain's loaders\n",
|
||||||
"# Take everything in all the sub-folders of our knowledgebase\n",
|
"# Take everything in all the sub-folders of our knowledgebase\n",
|
||||||
"\n",
|
"\n",
|
||||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
"folders = glob.glob(\"../../knowledge-base/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
|
"# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
|
||||||
"text_loader_kwargs = {'encoding': 'utf-8'}\n",
|
"text_loader_kwargs = {'encoding': 'utf-8'}\n",
|
||||||
@@ -150,10 +126,18 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 13,
|
||||||
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Created a chunk of size 1088, which is longer than the specified 1000\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
||||||
"chunks = text_splitter.split_documents(documents)"
|
"chunks = text_splitter.split_documents(documents)"
|
||||||
@@ -161,20 +145,39 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 14,
|
||||||
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"123"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"len(chunks)"
|
"len(chunks)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 15,
|
||||||
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
|
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Document types found: company, employees, contracts, products\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
|
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
|
||||||
"print(f\"Document types found: {', '.join(doc_types)}\")"
|
"print(f\"Document types found: {', '.join(doc_types)}\")"
|
||||||
|
|||||||
Reference in New Issue
Block a user