Fix langchain module imports and update kb path
@@ -5,11 +5,14 @@
    "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
    "metadata": {},
    "source": [
-    "## Expert Knowledge Worker\n",
+    "## Expert Files based Knowledge Worker\n",
     "\n",
+    "Submitted By: Bharat Puri\n",
+    "\n",
     "### A question answering agent that is an expert knowledge worker\n",
     "### To be used by employees of Insurellm, an Insurance Tech company\n",
     "### The agent needs to be accurate and the solution should be low cost.\n",
+    "### Fixes to the LangChain \n",
     "\n",
     "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy."
    ]
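The markdown cell above frames the project as a RAG pipeline: embed the knowledge base into a vector store, retrieve the most relevant chunks for each question, and let a chat model answer from them. A minimal sketch of that answering loop, assuming the conversational-retrieval pieces from the commented-out legacy imports later in this diff and an already-populated Chroma store; the persist directory, model name, and example question are illustrative assumptions, not code from this notebook:

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Assumed, already-populated vector store of knowledge-base chunks
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(persist_directory="vector_db", embedding_function=embeddings)

llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")  # model choice is an assumption
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)

result = chain.invoke({"question": "Who is the CEO of Insurellm?"})
print(result["answer"])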
@@ -26,12 +29,22 @@
     "import os\n",
     "import glob\n",
     "from dotenv import load_dotenv\n",
     "import gradio as gr\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
+    "# LLM APIs\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "# HuggingFace\n",
+    "from huggingface_hub import login\n",
+    "\n",
+    "# Gradio\n",
+    "import gradio as gr"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "id": "78743444-cae7-4fad-bf66-dcfabbe73335",
    "metadata": {},
    "outputs": [],
@@ -41,35 +54,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 11,
    "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# # imports for langchain\n",
+    "# LangChain v1.0+ imports\n",
     "\n",
-    "# from langchain.document_loaders import DirectoryLoader, TextLoader\n",
-    "# from langchain.text_splitter import CharacterTextSplitter\n",
-    "# from langchain.schema import Document\n",
-    "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
-    "# from langchain_chroma import Chroma\n",
-    "# import numpy as np\n",
-    "# from sklearn.manifold import TSNE\n",
-    "# import plotly.graph_objects as go\n",
-    "# from langchain.memory import ConversationBufferMemory\n",
-    "# from langchain.chains import ConversationalRetrievalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# # imports for langchain\n",
-    "\n",
-    "# imports for langchain (modular setup)\n",
     "from langchain_core.documents import Document\n",
     "from langchain_core.messages import HumanMessage\n",
     "from langchain_text_splitters import CharacterTextSplitter\n",
@@ -77,10 +68,7 @@
     "from langchain_chroma import Chroma\n",
     "from langchain_huggingface import HuggingFaceEmbeddings\n",
     "from langchain_core.callbacks import StdOutCallbackHandler\n",
-    "# Other imports\n",
-    "import numpy as np\n",
-    "from sklearn.manifold import TSNE\n",
-    "import plotly.graph_objects as go\n"
+    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n"
    ]
   },
   {
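The net effect of the two hunks above is the fix named in the commit title: the loader classes now come from langchain_community rather than the legacy top-level langchain package, which clears the NameError shown (and removed) in the next hunk. A consolidated, runnable sketch of the post-fix import layout plus a one-folder load; the sample folder path is an assumption:

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter

# Load one knowledge-base folder to confirm the loaders resolve (path is illustrative)
loader = DirectoryLoader("../../knowledge-base/products", glob="**/*.md",
                         loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"})
docs = loader.load()
print(len(docs), all(isinstance(d, Document) for d in docs))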
@@ -111,27 +99,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
    "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'DirectoryLoader' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n",
-      "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Read in documents using LangChain's loaders\n",
     "# Take everything in all the sub-folders of our knowledgebase\n",
     "\n",
-    "folders = glob.glob(\"knowledge-base/*\")\n",
+    "folders = glob.glob(\"../../knowledge-base/*\")\n",
     "\n",
     "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
     "text_loader_kwargs = {'encoding': 'utf-8'}\n",
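Only the glob path changes in this hunk, but the removed traceback shows the rest of the loading loop, so the cell presumably reads roughly as below once DirectoryLoader is imported; the doc_type tagging is inferred from the metadata lookup in the last hunk:

import os, glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader

folders = glob.glob("../../knowledge-base/*")
text_loader_kwargs = {"encoding": "utf-8"}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)          # e.g. "products", "employees"
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader,
                             loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type      # tag each document with its folder name
        documents.append(doc)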
@@ -150,10 +126,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Created a chunk of size 1088, which is longer than the specified 1000\n"
+     ]
+    }
+   ],
    "source": [
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
     "chunks = text_splitter.split_documents(documents)"
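The new stderr output is expected behaviour rather than an error: CharacterTextSplitter first cuts on its separator (a blank line by default) and never splits inside a single piece, so one long paragraph can yield a chunk above chunk_size, hence the 1088-character warning. A small check reusing the same settings, assuming `documents` from the loading step above:

from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)   # documents from the loader loop
print(max(len(chunk.page_content) for chunk in chunks))  # can exceed 1000 for long paragraphs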
@@ -161,20 +145,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "123"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "len(chunks)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document types found: company, employees, contracts, products\n"
+     ]
+    }
+   ],
    "source": [
     "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
     "print(f\"Document types found: {', '.join(doc_types)}\")"