diff --git a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb
index 554f6c6..7fd4999 100644
--- a/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb
+++ b/week5/community-contributions/bharat_puri/files_based_knowledge_base.ipynb
@@ -44,17 +44,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "78743444-cae7-4fad-bf66-dcfabbe73335",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "!pip install -U -q imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers"
+    "# !pip install -U imapclient langchain langchain-openai langchain-chroma langchain-community langchain-core langchain-text-splitters langchain-huggingface chromadb sentence-transformers"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
+   "id": "71924170-e73a-4e98-a34a-c5c0567f39da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Install specific version of langchain to avoid future issues\n",
+    "!pip install -U -q imapclient langchain==1.0.2 langchain-openai==1.0.1 langchain-chroma==1.0.0 langchain-community==0.4 langchain-core==1.0.0 langchain-text-splitters==1.0.0 langchain-huggingface==1.0.0 langchain-classic==1.0.0 chromadb==1.2.1 sentence-transformers==5.1.2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
    "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
    "metadata": {},
    "outputs": [],
@@ -68,7 +81,10 @@
    "from langchain_chroma import Chroma\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from langchain_core.callbacks import StdOutCallbackHandler\n",
-    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n"
+    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n",
+    "from langchain_classic.memory import ConversationBufferMemory\n",
+    "from langchain_classic.chains import ConversationalRetrievalChain\n",
+    "\n"
    ]
   },
   {
@@ -99,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 9,
    "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
    "metadata": {},
    "outputs": [],
@@ -126,18 +142,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Created a chunk of size 1088, which is longer than the specified 1000\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
    "source": [
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
     "chunks = text_splitter.split_documents(documents)"
@@ -145,28 +155,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "123"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "len(chunks)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 11,
    "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
    "metadata": {},
    "outputs": [
@@ -174,7 +173,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Document types found: company, employees, contracts, products\n"
+      "Document types found: employees, products, contracts, company\n"
      ]
     }
    ],
@@ -208,13 +207,15 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "78998399-ac17-4e28-b15f-0b5f51e6ee23",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n",
     "# Chroma is a popular open source Vector Database based on SQLLite\n",
     "\n",
-    "# embeddings = OpenAIEmbeddings()\n",
+    "embeddings = OpenAIEmbeddings()\n",
     "\n",
     "# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers\n",
     "# Then replace embeddings = OpenAIEmbeddings()\n",
@@ -222,7 +223,8 @@
     "# from langchain.embeddings import HuggingFaceEmbeddings\n",
     "# embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\",show_progress=False # you can set this False to hide the download bar)\n",
     "\n",
-    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
+    "# embeddings = HuggingFaceEmbeddings(\n",
+    "# model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
     " \n",
     "# Delete if already exists\n",
     "\n",
@@ -239,7 +241,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "057868f6-51a6-4087-94d1-380145821550",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "# Get one vector and find how many dimensions it has\n",
@@ -262,12 +266,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Prework\n",
+    "import numpy as np\n",
     "\n",
     "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
     "vectors = np.array(result['embeddings'])\n",
@@ -287,6 +292,9 @@
     "# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
     "# (t-distributed stochastic neighbor embedding)\n",
     "\n",
+    "from sklearn.manifold import TSNE\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
     "tsne = TSNE(n_components=2, random_state=42)\n",
     "reduced_vectors = tsne.fit_transform(vectors)\n",
     "\n",
@@ -315,7 +323,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "# Let's try 3D!\n",
@@ -396,7 +406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 35,
    "id": "129c7d1e-0094-4479-9459-f9360b95f244",
    "metadata": {},
    "outputs": [],
@@ -418,7 +428,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "968e7bf2-e862-4679-a11f-6c1efb6ec8ca",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "query = \"Can you describe Insurellm in a few sentences\"\n",
@@ -428,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "id": "e6eb99fb-33ec-4025-ab92-b634ede03647",
    "metadata": {},
    "outputs": [],
@@ -452,7 +464,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 38,
    "id": "c3536590-85c7-4155-bd87-ae78a1467670",
    "metadata": {},
    "outputs": [],
@@ -468,7 +480,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "b252d8c1-61a8-406d-b57a-8f708a62b014",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "# And in Gradio:\n",