diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb index 75b190e..554f6c6 100644 --- a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb +++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb @@ -5,11 +5,14 @@ "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", "metadata": {}, "source": [ - "## Expert Knowledge Worker\n", + "## Expert Files based Knowledge Worker\n", + "\n", + "Submitted By: Bharat Puri\n", "\n", "### A question answering agent that is an expert knowledge worker\n", "### To be used by employees of Insurellm, an Insurance Tech company\n", "### The agent needs to be accurate and the solution should be low cost.\n", + "### Fixes to the LangChain \n", "\n", "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy." ] @@ -26,12 +29,22 @@ "import os\n", "import glob\n", "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import sys\n", + "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n", + "# LLM APIs\n", + "from openai import OpenAI\n", + "\n", + "# HuggingFace\n", + "from huggingface_hub import login\n", + "\n", + "# Gradio\n", "import gradio as gr" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "78743444-cae7-4fad-bf66-dcfabbe73335", "metadata": {}, "outputs": [], @@ -41,35 +54,13 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "id": "802137aa-8a74-45e0-a487-d1974927d7ca", "metadata": {}, "outputs": [], "source": [ - "# # imports for langchain\n", + "# LangChain v1.0+ imports\n", "\n", - "# from langchain.document_loaders import DirectoryLoader, TextLoader\n", - "# from langchain.text_splitter import CharacterTextSplitter\n", - "# from langchain.schema import Document\n", - "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", - "# from langchain_chroma import Chroma\n", - "# import numpy as np\n", - "# from sklearn.manifold import TSNE\n", - "# import plotly.graph_objects as go\n", - "# from langchain.memory import ConversationBufferMemory\n", - "# from langchain.chains import ConversationalRetrievalChain" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e0e47493-d020-4fe4-bac9-6dc42dee1841", - "metadata": {}, - "outputs": [], - "source": [ - "# # imports for langchain\n", - "\n", - "# imports for langchain (modular setup)\n", "from langchain_core.documents import Document\n", "from langchain_core.messages import HumanMessage\n", "from langchain_text_splitters import CharacterTextSplitter\n", @@ -77,10 +68,7 @@ "from langchain_chroma import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_core.callbacks import StdOutCallbackHandler\n", - "# Other imports\n", - "import numpy as np\n", - "from sklearn.manifold import TSNE\n", - "import plotly.graph_objects as go\n" + "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n" ] }, { @@ -111,27 +99,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'DirectoryLoader' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m folder \u001b[38;5;129;01min\u001b[39;00m folders:\n\u001b[32m 13\u001b[39m doc_type = os.path.basename(folder)\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m loader = \u001b[43mDirectoryLoader\u001b[49m(folder, glob=\u001b[33m\"\u001b[39m\u001b[33m**/*.md\u001b[39m\u001b[33m\"\u001b[39m, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n\u001b[32m 15\u001b[39m folder_docs = loader.load()\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m folder_docs:\n", - "\u001b[31mNameError\u001b[39m: name 'DirectoryLoader' is not defined" - ] - } - ], + "outputs": [], "source": [ "# Read in documents using LangChain's loaders\n", "# Take everything in all the sub-folders of our knowledgebase\n", "\n", - "folders = glob.glob(\"knowledge-base/*\")\n", + "folders = glob.glob(\"../../knowledge-base/*\")\n", "\n", "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", "text_loader_kwargs = {'encoding': 'utf-8'}\n", @@ -150,10 +126,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Created a chunk of size 1088, which is longer than the specified 1000\n" + ] + } + ], "source": [ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)" @@ -161,20 +145,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "123" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "len(chunks)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document types found: company, employees, contracts, products\n" + ] + } + ], "source": [ "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", "print(f\"Document types found: {', '.join(doc_types)}\")"