Merge branch 'main' of github.com:ed-donner/llm_engineering

2025-03-01 15:03:23 -05:00
parent 8338dfc248 54d61c10a8
commit 3a2eb97cf2
25 changed files with 10314 additions and 0 deletions
--- a/week5/community-contributions/day3-gemini.ipynb
+++ b/week5/community-contributions/day3-gemini.ipynb
--- a/week5/community-contributions/day4-gemini.ipynb
+++ b/week5/community-contributions/day4-gemini.ipynb
@@ -0,0 +1,433 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import glob\n",
+    "from dotenv import load_dotenv\n",
+    "import gradio as gr\n",
+    "# import gemini\n",
+    "import google.generativeai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports for langchain\n",
+    "\n",
+    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.schema import Document\n",
+    "# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
+    "from langchain_chroma import Chroma\n",
+    "from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI\n",
+    "import numpy as np\n",
+    "from sklearn.manifold import TSNE\n",
+    "import plotly.graph_objects as go\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain.chains import ConversationalRetrievalChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# price is a factor for our company, so we're going to use a low cost model\n",
+    "\n",
+    "MODEL = \"gemini-1.5-flash\"\n",
+    "db_name = \"vector_db\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load environment variables in a file called .env\n",
+    "\n",
+    "load_dotenv()\n",
+    "os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "google.generativeai.configure()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read in documents using LangChain's loaders\n",
+    "# Take everything in all the sub-folders of our knowledgebase\n",
+    "\n",
+    "folders = glob.glob(\"knowledge-base/*\")\n",
+    "\n",
+    "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
+    "text_loader_kwargs = {'encoding': 'utf-8'}\n",
+    "# If that doesn't work, some Windows users might need to uncomment the next line instead\n",
+    "# text_loader_kwargs={'autodetect_encoding': True}\n",
+    "\n",
+    "documents = []\n",
+    "for folder in folders:\n",
+    "    doc_type = os.path.basename(folder)\n",
+    "    loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
+    "    folder_docs = loader.load()\n",
+    "    for doc in folder_docs:\n",
+    "        doc.metadata[\"doc_type\"] = doc_type\n",
+    "        documents.append(doc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Created a chunk of size 1088, which is longer than the specified 1000\n"
+     ]
+    }
+   ],
+   "source": [
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
+    "chunks = text_splitter.split_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "123"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(chunks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document types found: company, contracts, employees, products\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
+    "print(f\"Document types found: {', '.join(doc_types)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vectorstore created with 123 documents\n"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings = GoogleGenerativeAIEmbeddings(model=\"models/embedding-001\")\n",
+    "\n",
+    "# Check if a Chroma Datastore already exists - if so, delete the collection to start from scratch\n",
+    "\n",
+    "if os.path.exists(db_name):\n",
+    "    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
+    "\n",
+    "# Create our Chroma vectorstore!\n",
+    "\n",
+    "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
+    "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The vectors have 768 dimensions\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get one vector and find how many dimensions it has\n",
+    "\n",
+    "collection = vectorstore._collection\n",
+    "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
+    "dimensions = len(sample_embedding)\n",
+    "print(f\"The vectors have {dimensions:,} dimensions\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prework\n",
+    "\n",
+    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
+    "vectors = np.array(result['embeddings'])\n",
+    "documents = result['documents']\n",
+    "doc_types = [metadata['doc_type'] for metadata in result['metadatas']]\n",
+    "colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We humans find it easier to visualize things in 2D!\n",
+    "# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
+    "# (t-distributed stochastic neighbor embedding).\n",
+    "# random_state is fixed so the projection is the same on every run.\n",
+    "\n",
+    "tsne = TSNE(n_components=2, random_state=42)\n",
+    "reduced_vectors = tsne.fit_transform(vectors)\n",
+    "\n",
+    "# Create the 2D scatter plot; hovering a point shows its doc type and the first 100 characters\n",
+    "fig = go.Figure(data=[go.Scatter(\n",
+    "    x=reduced_vectors[:, 0],\n",
+    "    y=reduced_vectors[:, 1],\n",
+    "    mode='markers',\n",
+    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
+    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
+    "    hoverinfo='text'\n",
+    ")])\n",
+    "\n",
+    "# `scene` only applies to 3D traces; a 2D Scatter takes its axis titles from the top-level layout\n",
+    "fig.update_layout(\n",
+    "    title='2D Chroma Vector Store Visualization',\n",
+    "    xaxis_title='x',\n",
+    "    yaxis_title='y',\n",
+    "    width=800,\n",
+    "    height=600,\n",
+    "    margin=dict(r=20, b=10, l=10, t=40)\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's try 3D!\n",
+    "\n",
+    "tsne = TSNE(n_components=3, random_state=42)\n",
+    "reduced_vectors = tsne.fit_transform(vectors)\n",
+    "\n",
+    "# Create the 3D scatter plot\n",
+    "fig = go.Figure(data=[go.Scatter3d(\n",
+    "    x=reduced_vectors[:, 0],\n",
+    "    y=reduced_vectors[:, 1],\n",
+    "    z=reduced_vectors[:, 2],\n",
+    "    mode='markers',\n",
+    "    marker=dict(size=5, color=colors, opacity=0.8),\n",
+    "    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
+    "    hoverinfo='text'\n",
+    ")])\n",
+    "\n",
+    "fig.update_layout(\n",
+    "    title='3D Chroma Vector Store Visualization',\n",
+    "    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
+    "    width=900,\n",
+    "    height=700,\n",
+    "    margin=dict(r=20, b=10, l=10, t=40)\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "RAG pipeline using langchain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\GANESH\\AppData\\Local\\Temp\\ipykernel_524\\4130109764.py:5: LangChainDeprecationWarning:\n",
+      "\n",
+      "Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# create a new Chat with ChatGoogleGenerativeAI\n",
+    "llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.7)\n",
+    "\n",
+    "# set up the conversation memory for the chat\n",
+    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
+    "\n",
+    "# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
+    "retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n",
+    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Insurellm is an insurance technology company with 200 employees and over 300 clients worldwide.  They offer four software products, including Homellm, a portal for home insurance companies that integrates with existing platforms and offers a customer portal for policy management.  Their pricing model is based on provider size and customization needs.\n"
+     ]
+    }
+   ],
+   "source": [
+    "query = \"Can you describe Insurellm in a few sentences\"\n",
+    "result = conversation_chain.invoke({\"question\":query})\n",
+    "print(result[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set up a new conversation memory for the chat\n",
+    "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
+    "\n",
+    "# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n",
+    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gradio User Interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chat(message, history):\n",
+    "    \"\"\"Gradio chat callback: answer `message` through the conversation chain.\n",
+    "\n",
+    "    `history` is part of the callback signature but is not used here; the\n",
+    "    chain is invoked with the question only.\n",
+    "    \"\"\"\n",
+    "    return conversation_chain.invoke({\"question\": message})[\"answer\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Running on local URL:  http://127.0.0.1:7860\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llms",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/week5/community-contributions/verify-encodings.ipynb
+++ b/week5/community-contributions/verify-encodings.ipynb
@@ -0,0 +1,405 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
+   "metadata": {},
+   "source": [
+    "## This notebook compares the embeddings generated by OpenAIEmbeddings.\n",
+    "\n",
+    "It shows that OpenAIEmbeddings embeddings can differ slightly (typically at the 4th decimal place).\n",
+    "\n",
+    "### Results from OpenAIEmbeddings:\n",
+    "encodings are NOT identical on each run.\n",
+    "\n",
+    "### Repeating with sentence-transformers/all-MiniLM-L6-v2:\n",
+    "encodings ARE identical on each run.\n",
+    "\n",
+    "Tests verify simple numerical comparisons.\n",
+    "\n",
+    "### Advanced Comparison\n",
+    "A more advanced euclidean and cosine comparison is also included.\n",
+    "\n",
+    "## NOTES: Tests run on a local Jupyter Notebook / Anaconda setup for the course."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import glob\n",
+    "from dotenv import load_dotenv\n",
+    "import gradio as gr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "802137aa-8a74-45e0-a487-d1974927d7ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports for langchain\n",
+    "\n",
+    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.schema import Document\n",
+    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
+    "from langchain_chroma import Chroma\n",
+    "import numpy as np\n",
+    "from sklearn.manifold import TSNE\n",
+    "import plotly.graph_objects as go\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain.chains import ConversationalRetrievalChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58c85082-e417-4708-9efe-81a5d55d1424",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# price is a factor for our company, so we're going to use a low cost model\n",
+    "\n",
+    "MODEL = \"gpt-4o-mini\"\n",
+    "db_name = \"vector_db\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee78efcb-60fe-449e-a944-40bab26261af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load environment variables in a file called .env\n",
+    "\n",
+    "load_dotenv()\n",
+    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read in documents using LangChain's loaders\n",
+    "# Take everything in all the sub-folders of our knowledgebase\n",
+    "\n",
+    "folders = glob.glob(\"knowledge-base/*\")\n",
+    "\n",
+    "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
+    "text_loader_kwargs = {'encoding': 'utf-8'}\n",
+    "# If that doesn't work, some Windows users might need to uncomment the next line instead\n",
+    "# text_loader_kwargs={'autodetect_encoding': True}\n",
+    "\n",
+    "documents = []\n",
+    "for folder in folders:\n",
+    "    doc_type = os.path.basename(folder)\n",
+    "    loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
+    "    folder_docs = loader.load()\n",
+    "    for doc in folder_docs:\n",
+    "        doc.metadata[\"doc_type\"] = doc_type\n",
+    "        documents.append(doc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
+    "chunks = text_splitter.split_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(chunks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
+    "print(f\"Document types found: {', '.join(doc_types)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8b5ef27-70c2-4111-bce7-854bc1ebd02a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use a where filter to specify the metadata condition\n",
+    "# Get the 3 company vectors (corresponds to our 3 yellow dots)\n",
+    "\n",
+    "def get_company_vectors(collection):\n",
+    "    \"\"\"Fetch up to 10 entries whose `doc_type` metadata is \"company\".\n",
+    "\n",
+    "    Args:\n",
+    "        collection: object exposing a Chroma-style `get(where=..., limit=..., include=...)`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The result of `collection.get`, unchanged; it is read elsewhere in this\n",
+    "        notebook through its \"embeddings\", \"metadatas\" and \"documents\" keys.\n",
+    "    \"\"\"\n",
+    "    company_vectors = collection.get(\n",
+    "        where={\"doc_type\": \"company\"},  # keep only entries with doc_type == \"company\"\n",
+    "        limit=10,\n",
+    "        include=[\"embeddings\", \"metadatas\", \"documents\"]\n",
+    "    )\n",
+    "    # The result is a dict of parallel lists, so len(company_vectors) would count\n",
+    "    # its keys; count the returned documents instead.\n",
+    "    print(f\"Found {len(company_vectors['documents'])} company vectors\")\n",
+    "    return company_vectors\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d688b873-b52b-4d80-9df2-f70b389f5dc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def print_vectors_summary(vectors):\n",
+    "    \"\"\"Print a short per-chunk report: text preview, metadata and embedding head.\n",
+    "\n",
+    "    `vectors` is indexed by its \"documents\", \"metadatas\" and \"embeddings\" keys;\n",
+    "    one report is printed per entry of `vectors[\"documents\"]`.\n",
+    "    \"\"\"\n",
+    "    for position, text in enumerate(vectors[\"documents\"], start=1):\n",
+    "        print(f\"\\n--- Chunk {position} ---\")\n",
+    "\n",
+    "        # Preview only: the first 100 characters of the chunk text\n",
+    "        preview = text[:100]\n",
+    "        print(f\"Content: {preview}...\")\n",
+    "\n",
+    "        print(f\"Metadata: {vectors['metadatas'][position - 1]}\")\n",
+    "\n",
+    "        # The full vector is too long to print, so show its length and first 5 values\n",
+    "        vector = vectors[\"embeddings\"][position - 1]\n",
+    "        head = vector[:5]\n",
+    "        print(f\"Embedding: Vector of length {len(vector)}, first 5 values: {head}\")\n",
+    "\n",
+    "\n",
+    "def get_dimensions_for_vectors(vectors):\n",
+    "    \"\"\"Return the embedding of every chunk in `vectors` as a list.\n",
+    "\n",
+    "    Despite the name, each item is a full embedding vector, not a dimension count.\n",
+    "    One embedding is taken per entry of `vectors[\"documents\"]`, in order.\n",
+    "    \"\"\"\n",
+    "    chunk_count = len(vectors[\"documents\"])\n",
+    "    return [vectors[\"embeddings\"][index] for index in range(chunk_count)]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b195184-4920-404a-9bfa-0231f1dbe276",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick check if any single value is different\n",
+    "def quick_diff_check(emb1, emb2):\n",
+    "    \"\"\"Print both embeddings and report the first position at which they differ.\n",
+    "\n",
+    "    Args:\n",
+    "        emb1, emb2: sequences of numbers to compare element by element.\n",
+    "\n",
+    "    Returns:\n",
+    "        \"Embeddings are identical\" when every element matches exactly, otherwise a\n",
+    "        message naming the first differing index (or the two lengths when the\n",
+    "        sequences are not the same size). The message is also printed.\n",
+    "    \"\"\"\n",
+    "    print(\"\\n\\nComparing two embeddings:\\n\\n\")\n",
+    "    print(emb1)\n",
+    "    print(emb2)\n",
+    "    if len(emb1) != len(emb2):\n",
+    "        # zip() stops at the shorter input, so a length mismatch would otherwise\n",
+    "        # be reported as identical\n",
+    "        result = f\"Different lengths: {len(emb1)} vs {len(emb2)}\"\n",
+    "    else:\n",
+    "        result = \"Embeddings are identical\"\n",
+    "        for i, (v1, v2) in enumerate(zip(emb1, emb2)):\n",
+    "            if v1 != v2:\n",
+    "                result = f\"Different at dimension {i}: {v1} vs {v2}\"\n",
+    "                break\n",
+    "    print(result)\n",
+    "    return result\n",
+    "\n",
+    "#quick_diff_check(dimensions[0], dimensions[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06ba838d-d179-4e2d-b208-dd9cc1fd0097",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "embeddings = OpenAIEmbeddings()\n",
+    "\n",
+    "def create_vectorstores(embeddings):\n",
+    "    \"\"\"Build two Chroma stores from the same `chunks` with the same embedding function.\n",
+    "\n",
+    "    A collection already persisted under \"vectorstore1\" or \"vectorstore2\" is\n",
+    "    deleted before either store is created. Returns (vectorstore1, vectorstore2).\n",
+    "    \"\"\"\n",
+    "    store_dirs = (\"vectorstore1\", \"vectorstore2\")\n",
+    "\n",
+    "    # Clear both existing collections before anything is re-created\n",
+    "    for store_dir in store_dirs:\n",
+    "        if os.path.exists(store_dir):\n",
+    "            Chroma(persist_directory=store_dir, embedding_function=embeddings).delete_collection()\n",
+    "\n",
+    "    # Embed the same chunks once per store, reporting each store's size as it is built\n",
+    "    built = []\n",
+    "    for number, store_dir in enumerate(store_dirs, start=1):\n",
+    "        store = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=store_dir)\n",
+    "        print(f\"Vectorstore {number} created with {store._collection.count()} documents\")\n",
+    "        built.append(store)\n",
+    "\n",
+    "    return built[0], built[1]\n",
+    "\n",
+    "vectorstore1, vectorstore2 = create_vectorstores(embeddings)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e24242eb-613a-4edb-a081-6b8937f106a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Uncomment this and rerun cells below, \n",
+    "## to see that the HuggingFaceEmbeddings output is identical\n",
+    "\n",
+    "#from langchain.embeddings import HuggingFaceEmbeddings\n",
+    "#embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
+    "#vectorstore1, vectorstore2 = create_vectorstores(embeddings)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "000b9e70-2958-40db-bbed-56a00e4249ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the 3 company doc_type vectors\n",
+    "collection1 = vectorstore1._collection\n",
+    "collection2 = vectorstore2._collection\n",
+    "\n",
+    "company_vectors1=get_company_vectors(collection1)\n",
+    "company_vectors2=get_company_vectors(collection2)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63cd63e4-9d3e-405a-8ef9-dac16fe2570e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's print out summary info just to see we have the same chunks.\n",
+    "\n",
+    "def print_summary_info (vectors):\n",
+    "    \"\"\"Print the VECTORS SUMMARY heading, then call print_vectors_summary on `vectors`.\"\"\"\n",
+    "    print(\"VECTORS SUMMARY\\n\")\n",
+    "    print_vectors_summary(vectors)\n",
+    "\n",
+    "\n",
+    "print(\"\\n\\n\\n========= VECTORS 1 =========\\n\\n\")\n",
+    "print_summary_info(company_vectors1)\n",
+    "\n",
+    "print(\"\\n\\n\\n========= VECTORS 2 =========\\n\\n\")\n",
+    "print_summary_info(company_vectors2)\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc085a35-f0ec-4ddb-955c-244cb2d3eb2a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dimensions1 = get_dimensions_for_vectors(company_vectors1)\n",
+    "dimensions2 = get_dimensions_for_vectors(company_vectors2)\n",
+    "\n",
+    "# Compare every retrieved pair instead of assuming exactly three company chunks:\n",
+    "# fixed indices 0-2 raise IndexError with fewer chunks and skip any beyond the third.\n",
+    "results = [quick_diff_check(emb1, emb2) for emb1, emb2 in zip(dimensions1, dimensions2)]\n",
+    "\n",
+    "print(\"\\n\\nSUMMARY RESULTS:\")\n",
+    "print(\"================\\n\\n\")\n",
+    "for result in results:\n",
+    "    print(result)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "164cf94d-9d63-4bae-91f9-4b02da1537ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## ADVANCED COMPARISONS:\n",
+    "# More advanced comparisons (from Claude 3.7 Sonnet):\n",
+    "\n",
+    "\n",
+    "## !IMPORTANT *** Uncomment final line to execute ***\n",
+    "\n",
+    "\n",
+    "import numpy as np\n",
+    "from scipy.spatial.distance import cosine\n",
+    "\n",
+    "# Method 1: Euclidean distance (L2 norm)\n",
+    "def compare_embeddings_euclidean(emb1, emb2):\n",
+    "    \"\"\"Compare two embeddings by the L2 norm of their element-wise difference.\n",
+    "\n",
+    "    Returns a dict with:\n",
+    "        \"different\":  True when the distance is greater than zero\n",
+    "        \"distance\":   the Euclidean distance itself\n",
+    "        \"similarity\": 1 / (1 + distance), which is 1.0 when the distance is zero\n",
+    "    \"\"\"\n",
+    "    delta = np.array(emb1) - np.array(emb2)\n",
+    "    l2_distance = np.linalg.norm(delta)\n",
+    "    is_different = l2_distance > 0\n",
+    "    similarity_score = 1 / (1 + l2_distance)\n",
+    "    return {\n",
+    "        \"different\": is_different,\n",
+    "        \"distance\": l2_distance,\n",
+    "        \"similarity\": similarity_score\n",
+    "    }\n",
+    "\n",
+    "# Method 2: Cosine similarity (common for embeddings)\n",
+    "def compare_embeddings_cosine(emb1, emb2, threshold=0.9999):\n",
+    "    \"\"\"Compare two embeddings by cosine similarity.\n",
+    "\n",
+    "    Args:\n",
+    "        emb1, emb2: equal-length numeric sequences.\n",
+    "        threshold: similarity below this value is reported as different.\n",
+    "            Defaults to 0.9999, the previously hard-coded cut-off.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with \"different\" (similarity < threshold) and \"similarity\".\n",
+    "    \"\"\"\n",
+    "    emb1_array = np.array(emb1)\n",
+    "    emb2_array = np.array(emb2)\n",
+    "    # scipy's cosine() is a distance, so subtract it from 1 to get similarity\n",
+    "    similarity = 1 - cosine(emb1_array, emb2_array)\n",
+    "    return {\n",
+    "        \"different\": similarity < threshold,\n",
+    "        \"similarity\": similarity\n",
+    "    }\n",
+    "\n",
+    "# Method 3: Simple exact equality check\n",
+    "def are_embeddings_identical(emb1, emb2):\n",
+    "    \"\"\"Return True only when both embeddings have the same shape and equal elements.\"\"\"\n",
+    "    first = np.array(emb1)\n",
+    "    second = np.array(emb2)\n",
+    "    return np.array_equal(first, second)\n",
+    "\n",
+    "\n",
+    "def run_advanced_comparisons():\n",
+    "    \"\"\"Print exact, cosine and Euclidean comparisons for each pair of embeddings.\n",
+    "\n",
+    "    Reads the notebook-level `dimensions1` and `dimensions2` lists and compares\n",
+    "    them position by position, for as many pairs as both lists provide.\n",
+    "    \"\"\"\n",
+    "    # Iterate over the pairs that exist rather than a fixed range(0, 3), which\n",
+    "    # raises IndexError with fewer than three vectors and ignores any extras\n",
+    "    for i, (emb1, emb2) in enumerate(zip(dimensions1, dimensions2)):\n",
+    "        print(f\"\\n\\nComparing vector dimensions for dimension[{i}]....\\n\")\n",
+    "        print(\"Exactly identical?    ---> \", are_embeddings_identical(emb1, emb2))\n",
+    "        print(\"Cosine comparison:    ---> \", compare_embeddings_cosine(emb1, emb2))\n",
+    "        print(\"Euclidean comparison: ---> \", compare_embeddings_euclidean(emb1, emb2))\n",
+    "\n",
+    "\n",
+    "#run_advanced_comparisons()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}