Updated Week 5 with November version

Edward Donner
2025-11-04 07:26:42 -05:00
parent 9132764523
commit e5c3fcab46
81 changed files with 9263 additions and 2725 deletions


@@ -2,349 +2,189 @@
"cells": [
{
"cell_type": "markdown",
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
"metadata": {},
"source": [
"## RAG Day 3\n",
"\n",
"### Expert Question Answerer for Insurellm\n",
"\n",
"LangChain 1.0 implementation of a RAG pipeline.\n",
"\n",
"Using the VectorStore we created last time (with HuggingFace `all-MiniLM-L6-v2`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"from langchain_chroma import Chroma\n",
"from langchain_core.messages import SystemMessage, HumanMessage\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {},
"outputs": [],
"source": [
"MODEL = \"gpt-4.1-nano\"\n",
"DB_NAME = \"vector_db\"\n",
"load_dotenv(override=True)"
]
},
{
"cell_type": "markdown",
"id": "f065d4b1-80b7-4e15-abd4-60a83e752ea8",
"metadata": {},
"source": [
"### Connect to Chroma; use Hugging Face all-MiniLM-L6-v2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
"metadata": {},
"outputs": [],
"source": [
"embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
"vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)"
]
},
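{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check - a sketch, not part of the original flow: count the chunks already persisted in the store. `_collection` is Chroma's underlying collection; a private attribute, but handy here, and the same handle the previous notebook used when building the store."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: confirm the persisted vector store is populated before we query it\n",
"print(f\"Vectorstore contains {vectorstore._collection.count():,} chunks\")"
]
},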
{
"cell_type": "markdown",
"id": "77f7d2a6-ccfa-425b-a1c3-5e55b23bd013",
"metadata": {},
"source": [
"### Set up the 2 key LangChain objects: retriever and llm\n",
"\n",
"#### A sidebar on \"temperature\":\n",
"- Controls how diverse the output is\n",
"- A temperature of 0 means that the output should be predictable\n",
"- Higher temperature for more variety in answers\n",
"\n",
"Some people describe temperature as being like 'creativity', but that's not quite right:\n",
"- It actually controls which tokens get selected during inference\n",
"- temperature=0 means: always select the token with the highest probability\n",
"- temperature=1 usually means: a token with 10% probability should be picked 10% of the time\n",
"\n",
"A tiny numeric illustration of this follows just below.\n",
"\n",
"Note: a temperature of 0 doesn't mean outputs will always be reproducible; you also need to set a random seed, which we will do in weeks 6-8. (Even then, it's not always reproducible.)\n",
"\n",
"Note 2: if you want creativity, use the System Prompt!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78998399-ac17-4e28-b15f-0b5f51e6ee23",
"metadata": {},
"outputs": [],
"source": [
"retriever = vectorstore.as_retriever()\n",
"llm = ChatOpenAI(temperature=0, model_name=MODEL)"
]
},
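{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the temperature sidebar concrete, here is a minimal numeric sketch - illustrative only, with made-up logits, and not how the API implements sampling server-side: softmax over temperature-scaled logits. Low temperature concentrates probability on the top token; temperature=0 degenerates to always picking the argmax."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: how temperature reshapes token probabilities\n",
"import numpy as np\n",
"\n",
"logits = np.array([2.0, 1.0, 0.5, -1.0])  # hypothetical scores for 4 candidate tokens\n",
"\n",
"def token_probabilities(logits, temperature):\n",
"    # Softmax over temperature-scaled logits; subtracting the max keeps it numerically stable\n",
"    scaled = logits / temperature\n",
"    exps = np.exp(scaled - scaled.max())\n",
"    return exps / exps.sum()\n",
"\n",
"# Low temperature concentrates probability on the top token; high temperature spreads it out\n",
"for t in [0.2, 1.0, 2.0]:\n",
"    print(f'temperature={t}: {token_probabilities(logits, t).round(3)}')"
]
},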
{
"cell_type": "markdown",
"id": "b0d45462-a818-441c-b010-b85b32bcf618",
"metadata": {},
"source": [
"### These LangChain objects implement the method `invoke()`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b98adf5e-d464-4bd2-9bdf-bc5b6770263b",
"metadata": {},
"outputs": [],
"source": [
"retriever.invoke(\"Who is Avery?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "427149d5-e5d8-4abd-bb6f-7ef0333cca21",
"metadata": {},
"outputs": [],
"source": [
"llm.invoke(\"Who is Avery?\")"
]
},
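{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's worth peeking at what `retriever.invoke()` returns: a list of LangChain `Document` objects (typically 4 by default), each with `page_content` and `metadata`. A quick sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: peek at the chunks the retriever hands back\n",
"docs = retriever.invoke(\"Who is Avery?\")\n",
"print(f\"Retrieved {len(docs)} chunks\")\n",
"for doc in docs:\n",
"    print(doc.metadata, \"->\", doc.page_content[:80])"
]
},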
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Time to put this together!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1418e88-acd5-460a-bf2b-4e6efc88e3dd",
"metadata": {},
"outputs": [],
"source": [
"SYSTEM_PROMPT_TEMPLATE = \"\"\"\n",
"You are a knowledgeable, friendly assistant representing the company Insurellm.\n",
"You are chatting with a user about Insurellm.\n",
"If relevant, use the given context to answer any question.\n",
"If you don't know the answer, say so.\n",
"Context:\n",
"{context}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b3ada26-b4b7-42fc-b943-933c14adf89b",
"metadata": {},
"outputs": [],
"source": [
"def answer_question(question: str, history):\n",
" docs = retriever.invoke(question)\n",
" context = \"\\n\\n\".join(doc.page_content for doc in docs)\n",
" system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)\n",
" response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=question)])\n",
" return response.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answer_question(\"Who is Avery Lancaster?\", [])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What could possibly come next? 😂"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gr.ChatInterface(answer_question).launch()"
]
},
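{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional extra - a sketch of a streaming variant, assuming the same `retriever`, `llm` and `SYSTEM_PROMPT_TEMPLATE` as above. `gr.ChatInterface` also accepts a generator function, so we can yield the partial answer as chunks arrive from `llm.stream()`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: stream tokens into the chat UI as the model generates them\n",
"def stream_answer(question: str, history):\n",
"    docs = retriever.invoke(question)\n",
"    context = \"\\n\\n\".join(doc.page_content for doc in docs)\n",
"    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)\n",
"    reply = \"\"\n",
"    for chunk in llm.stream([SystemMessage(content=system_prompt), HumanMessage(content=question)]):\n",
"        reply += chunk.content or \"\"  # content can be empty on some chunks\n",
"        yield reply\n",
"\n",
"# Uncomment to try the streaming version:\n",
"# gr.ChatInterface(stream_answer).launch()"
]
},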
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Admit it - you thought RAG would be more complicated than that!!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
@@ -358,9 +198,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
"nbformat_minor": 4
}