Merge branch 'main' of github.com:ed-donner/llm_engineering

This commit is contained in:
Edward Donner
2025-03-01 15:03:23 -05:00
25 changed files with 10314 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,433 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"# import gemini\n",
"import google.generativeai"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# imports for langchain\n",
"\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.schema import Document\n",
"# from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain_chroma import Chroma\n",
"from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI\n",
"import numpy as np\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# price is a factor for our company, so we're going to use a low cost model\n",
"\n",
"MODEL = \"gemini-1.5-flash\"\n",
"db_name = \"vector_db\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv()\n",
"os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"google.generativeai.configure()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Read in documents using LangChain's loaders\n",
"# Take everything in all the sub-folders of our knowledgebase\n",
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
"text_loader_kwargs = {'encoding': 'utf-8'}\n",
"# If that doesn't work, some Windows users might need to uncomment the next line instead\n",
"# text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"# Tag every loaded document with the name of the sub-folder it came from,\n",
"# so later cells can group and colour chunks by doc_type\n",
"documents = []\n",
"for folder in folders:\n",
"    doc_type = os.path.basename(folder)\n",
"    loader = DirectoryLoader(\n",
"        folder,\n",
"        glob=\"**/*.md\",\n",
"        loader_cls=TextLoader,\n",
"        loader_kwargs=text_loader_kwargs,\n",
"    )\n",
"    folder_docs = loader.load()\n",
"    for doc in folder_docs:\n",
"        doc.metadata[\"doc_type\"] = doc_type\n",
"        documents.append(doc)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 1088, which is longer than the specified 1000\n"
]
}
],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"123"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(chunks)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document types found: company, contracts, employees, products\n"
]
}
],
"source": [
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
"print(f\"Document types found: {', '.join(doc_types)}\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vectorstore created with 123 documents\n"
]
}
],
"source": [
"embeddings = GoogleGenerativeAIEmbeddings(model=\"models/embedding-001\")\n",
"\n",
"# Check if a Chroma Datastore already exists - if so, delete the collection to start from scratch\n",
"\n",
"if os.path.exists(db_name):\n",
" Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
"\n",
"# Create our Chroma vectorstore!\n",
"\n",
"vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
"print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The vectors have 768 dimensions\n"
]
}
],
"source": [
"# Get one vector and find how many dimensions it has\n",
"\n",
"collection = vectorstore._collection\n",
"sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n",
"dimensions = len(sample_embedding)\n",
"print(f\"The vectors have {dimensions:,} dimensions\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Prework\n",
"\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"doc_types = [metadata['doc_type'] for metadata in result['metadatas']]\n",
"colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We humans find it easier to visualize things in 2D!\n",
"# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
"# (t-distributed stochastic neighbor embedding)\n",
"\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
"reduced_vectors = tsne.fit_transform(vectors)\n",
"\n",
"# Create the 2D scatter plot\n",
"fig = go.Figure(data=[go.Scatter(\n",
"    x=reduced_vectors[:, 0],\n",
"    y=reduced_vectors[:, 1],\n",
"    mode='markers',\n",
"    marker=dict(size=5, color=colors, opacity=0.8),\n",
"    text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
"    hoverinfo='text'\n",
")])\n",
"\n",
"# `scene` only configures 3D axes; for a 2D scatter the axis titles\n",
"# are set directly on the layout's xaxis / yaxis\n",
"fig.update_layout(\n",
"    title='2D Chroma Vector Store Visualization',\n",
"    xaxis_title='x',\n",
"    yaxis_title='y',\n",
"    width=800,\n",
"    height=600,\n",
"    margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's try 3D!\n",
"\n",
"tsne = TSNE(n_components=3, random_state=42)\n",
"reduced_vectors = tsne.fit_transform(vectors)\n",
"\n",
"# Create the 3D scatter plot\n",
"fig = go.Figure(data=[go.Scatter3d(\n",
" x=reduced_vectors[:, 0],\n",
" y=reduced_vectors[:, 1],\n",
" z=reduced_vectors[:, 2],\n",
" mode='markers',\n",
" marker=dict(size=5, color=colors, opacity=0.8),\n",
" text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n",
" hoverinfo='text'\n",
")])\n",
"\n",
"fig.update_layout(\n",
" title='3D Chroma Vector Store Visualization',\n",
" scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
" width=900,\n",
" height=700,\n",
" margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"RAG pipeline using langchain"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\GANESH\\AppData\\Local\\Temp\\ipykernel_524\\4130109764.py:5: LangChainDeprecationWarning:\n",
"\n",
"Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/\n",
"\n"
]
}
],
"source": [
"# create a new Chat with ChatGoogleGenerativeAI\n",
"llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.7)\n",
"\n",
"# set up the conversation memory for the chat\n",
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
"\n",
"# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
"retriever = vectorstore.as_retriever()\n",
"\n",
"# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Insurellm is an insurance technology company with 200 employees and over 300 clients worldwide. They offer four software products, including Homellm, a portal for home insurance companies that integrates with existing platforms and offers a customer portal for policy management. Their pricing model is based on provider size and customization needs.\n"
]
}
],
"source": [
"query = \"Can you describe Insurellm in a few sentences\"\n",
"result = conversation_chain.invoke({\"question\":query})\n",
"print(result[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# set up a new conversation memory for the chat\n",
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
"\n",
"# putting it together: set up the conversation chain with the Gemini LLM, the vector store and memory\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradio User Interface"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def chat(message, history):\n",
"    \"\"\"Answer `message` by invoking the conversation chain.\n",
"\n",
"    `history` is part of the signature but is not read here; only the new\n",
"    question is passed to the chain.\n",
"    \"\"\"\n",
"    chain_input = {\"question\": message}\n",
"    response = conversation_chain.invoke(chain_input)\n",
"    return response[\"answer\"]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7860\n",
"\n",
"To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "llms",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,405 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "dfe37963-1af6-44fc-a841-8e462443f5e6",
"metadata": {},
"source": [
"## This notebook compares the embeddings generated by OpenAIEmbeddings.\n",
"\n",
"It shows that OpenAIEmbeddings embeddings can differ slightly between runs (typically at the 4th decimal place).\n",
"\n",
"### Results from OpenAIEmbeddings:\n",
"encodings are NOT identical on each run.\n",
"\n",
"### Repeating with sentence-transformers/all-MiniLM-L6-v2:\n",
"encodings ARE identical on each run.\n",
"\n",
"Tests verify simple numerical comparisons.\n",
"\n",
"### Advanced Comparison\n",
"A more advanced euclidean and cosine comparison is also included.\n",
"\n",
"## NOTES: Tests run on a local Jupyter Notebook / Anaconda setup for the course."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import glob\n",
"from dotenv import load_dotenv\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
"metadata": {},
"outputs": [],
"source": [
"# imports for langchain\n",
"\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.schema import Document\n",
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain_chroma import Chroma\n",
"import numpy as np\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
"metadata": {},
"outputs": [],
"source": [
"# price is a factor for our company, so we're going to use a low cost model\n",
"\n",
"MODEL = \"gpt-4o-mini\"\n",
"db_name = \"vector_db\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv()\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {},
"outputs": [],
"source": [
"# Read in documents using LangChain's loaders\n",
"# Take everything in all the sub-folders of our knowledgebase\n",
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n",
"text_loader_kwargs = {'encoding': 'utf-8'}\n",
"# If that doesn't work, some Windows users might need to uncomment the next line instead\n",
"# text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"# Tag every loaded document with the name of the sub-folder it came from,\n",
"# so later cells can filter chunks by doc_type\n",
"documents = []\n",
"for folder in folders:\n",
"    doc_type = os.path.basename(folder)\n",
"    loader = DirectoryLoader(\n",
"        folder,\n",
"        glob=\"**/*.md\",\n",
"        loader_cls=TextLoader,\n",
"        loader_kwargs=text_loader_kwargs,\n",
"    )\n",
"    folder_docs = loader.load()\n",
"    for doc in folder_docs:\n",
"        doc.metadata[\"doc_type\"] = doc_type\n",
"        documents.append(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
"metadata": {},
"outputs": [],
"source": [
"len(chunks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
"metadata": {},
"outputs": [],
"source": [
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
"print(f\"Document types found: {', '.join(doc_types)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8b5ef27-70c2-4111-bce7-854bc1ebd02a",
"metadata": {},
"outputs": [],
"source": [
"# Use a where filter to specify the metadata condition\n",
"# Get the 3 company vectors (corresponds to our 3 yellow dots)\n",
"\n",
"def get_company_vectors(collection):\n",
"    \"\"\"Fetch up to 10 chunks whose `doc_type` metadata equals \"company\".\n",
"\n",
"    Returns the result of `collection.get`, requested with embeddings,\n",
"    metadatas and documents included, and prints how many chunks came back.\n",
"    \"\"\"\n",
"    company_vectors = collection.get(\n",
"        where={\"doc_type\": \"company\"},  # keep only chunks tagged doc_type == \"company\"\n",
"        limit=10,\n",
"        include=[\"embeddings\", \"metadatas\", \"documents\"]\n",
"    )\n",
"    # The result is keyed by field name, so len(company_vectors) would count\n",
"    # the keys; count the returned ids to report the number of vectors.\n",
"    print(f\"Found {len(company_vectors['ids'])} company vectors\")\n",
"    return company_vectors\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d688b873-b52b-4d80-9df2-f70b389f5dc7",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def print_vectors_summary(vectors):\n",
"    \"\"\"Print a short summary of every chunk in `vectors`.\n",
"\n",
"    `vectors` is read through the keys \"documents\", \"metadatas\" and\n",
"    \"embeddings\", each indexed by chunk position. For each chunk this prints\n",
"    the first 100 characters of its text, its metadata, and the length plus\n",
"    first five values of its embedding.\n",
"    \"\"\"\n",
"    for i in range(len(vectors[\"documents\"])):\n",
"        print(f\"\\n--- Chunk {i+1} ---\")\n",
"\n",
"        # Print document content (first 100 chars)\n",
"        print(f\"Content: {vectors['documents'][i][:100]}...\")\n",
"\n",
"        # Print metadata\n",
"        print(f\"Metadata: {vectors['metadatas'][i]}\")\n",
"\n",
"        # Print embedding info (not the full vector as it would be too long)\n",
"        embedding = vectors[\"embeddings\"][i]\n",
"        print(f\"Embedding: Vector of length {len(embedding)}, first 5 values: {embedding[:5]}\")\n",
"\n",
"\n",
"def get_dimensions_for_vectors(vectors):\n",
"    \"\"\"Return the embedding of every chunk in `vectors` as a list.\n",
"\n",
"    Despite the name, each element is a full embedding vector (one per entry\n",
"    in vectors[\"documents\"]), not a dimension count.\n",
"    \"\"\"\n",
"    chunk_count = len(vectors[\"documents\"])\n",
"    return [vectors[\"embeddings\"][i] for i in range(chunk_count)]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b195184-4920-404a-9bfa-0231f1dbe276",
"metadata": {},
"outputs": [],
"source": [
"# Quick check if any single value is different\n",
"def quick_diff_check(emb1, emb2):\n",
"    \"\"\"Compare two embeddings value by value and report the first difference.\n",
"\n",
"    Prints both embeddings and the verdict, and returns the verdict string:\n",
"    \"Embeddings are identical\", \"Different at dimension i: a vs b\", or a\n",
"    length-mismatch message when the two vectors are not the same size.\n",
"    \"\"\"\n",
"    print(\"\\n\\nComparing two embeddings:\\n\\n\")\n",
"    print(emb1)\n",
"    print(emb2)\n",
"    if len(emb1) != len(emb2):\n",
"        # zip() stops at the shorter vector, so without this check a truncated\n",
"        # embedding could be reported as identical\n",
"        result = f\"Different lengths: {len(emb1)} vs {len(emb2)}\"\n",
"    else:\n",
"        result = \"Embeddings are identical\"\n",
"        for i, (v1, v2) in enumerate(zip(emb1, emb2)):\n",
"            if v1 != v2:\n",
"                result = f\"Different at dimension {i}: {v1} vs {v2}\"\n",
"                break\n",
"    print(result)\n",
"    return result\n",
"\n",
"#quick_diff_check(dimensions[0], dimensions[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06ba838d-d179-4e2d-b208-dd9cc1fd0097",
"metadata": {},
"outputs": [],
"source": [
"\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"def create_vectorstores(embeddings):\n",
"    \"\"\"Embed the notebook's `chunks` twice into two separate Chroma stores.\n",
"\n",
"    Any collection already persisted under \"vectorstore1\" / \"vectorstore2\" is\n",
"    deleted first, then both stores are rebuilt with the given `embeddings`.\n",
"    Returns the pair (vectorstore1, vectorstore2).\n",
"    \"\"\"\n",
"    # Start from scratch: drop whatever an earlier run persisted\n",
"    if os.path.exists(\"vectorstore1\"):\n",
"        Chroma(persist_directory=\"vectorstore1\", embedding_function=embeddings).delete_collection()\n",
"    if os.path.exists(\"vectorstore2\"):\n",
"        Chroma(persist_directory=\"vectorstore2\", embedding_function=embeddings).delete_collection()\n",
"\n",
"    # Create vectorstore 1\n",
"    vectorstore1 = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=\"vectorstore1\")\n",
"    print(f\"Vectorstore 1 created with {vectorstore1._collection.count()} documents\")\n",
"\n",
"    # Create vectorstore 2\n",
"    vectorstore2 = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=\"vectorstore2\")\n",
"    print(f\"Vectorstore 2 created with {vectorstore2._collection.count()} documents\")\n",
"\n",
"    return vectorstore1, vectorstore2\n",
"\n",
"vectorstore1, vectorstore2 = create_vectorstores(embeddings)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e24242eb-613a-4edb-a081-6b8937f106a7",
"metadata": {},
"outputs": [],
"source": [
"## Uncomment this and rerun cells below, \n",
"## to see that the HuggingFaceEmbeddings embeddings are identical\n",
"\n",
"#from langchain.embeddings import HuggingFaceEmbeddings\n",
"#embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"#vectorstore1, vectorstore2 = create_vectorstores(embeddings)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "000b9e70-2958-40db-bbed-56a00e4249ce",
"metadata": {},
"outputs": [],
"source": [
"# Get the 3 company doc_type vectors\n",
"collection1 = vectorstore1._collection\n",
"collection2 = vectorstore2._collection\n",
"\n",
"company_vectors1=get_company_vectors(collection1)\n",
"company_vectors2=get_company_vectors(collection2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63cd63e4-9d3e-405a-8ef9-dac16fe2570e",
"metadata": {},
"outputs": [],
"source": [
"# Let's print out summary info just to see we have the same chunks.\n",
"\n",
"def print_summary_info (vectors):\n",
"    \"\"\"Print a heading, then the per-chunk summary for `vectors`.\"\"\"\n",
"    heading = \"VECTORS SUMMARY\\n\"\n",
"    print(heading)\n",
"    print_vectors_summary(vectors)\n",
"\n",
"\n",
"print(\"\\n\\n\\n========= VECTORS 1 =========\\n\\n\")\n",
"print_summary_info(company_vectors1)\n",
"\n",
"print(\"\\n\\n\\n========= VECTORS 2 =========\\n\\n\")\n",
"print_summary_info(company_vectors2)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc085a35-f0ec-4ddb-955c-244cb2d3eb2a",
"metadata": {},
"outputs": [],
"source": [
"dimensions1 = get_dimensions_for_vectors(company_vectors1)\n",
"dimensions2 = get_dimensions_for_vectors(company_vectors2)\n",
"\n",
"result1 = quick_diff_check(dimensions1[0], dimensions2[0]) \n",
"result2 = quick_diff_check(dimensions1[1], dimensions2[1]) \n",
"result3 = quick_diff_check(dimensions1[2], dimensions2[2]) \n",
"\n",
"print(\"\\n\\nSUMMARY RESULTS:\")\n",
"print(\"================\\n\\n\")\n",
"print(result1) \n",
"print(result2)\n",
"print(result3)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "164cf94d-9d63-4bae-91f9-4b02da1537ae",
"metadata": {},
"outputs": [],
"source": [
"## ADVANCED COMPARISONS:\n",
"# More advanced comparisons (from Claude 3.7 Sonnet):\n",
"\n",
"\n",
"## !IMPORTANT *** Uncomment final line to execute ***\n",
"\n",
"\n",
"import numpy as np\n",
"from scipy.spatial.distance import cosine\n",
"\n",
"# Method 1: Euclidean distance (L2 norm)\n",
"def compare_embeddings_euclidean(emb1, emb2):\n",
"    \"\"\"Compare two embeddings by the L2 norm of their difference.\n",
"\n",
"    Returns a dict with:\n",
"      - \"different\": whether the distance is greater than zero\n",
"      - \"distance\": the Euclidean distance\n",
"      - \"similarity\": 1 / (1 + distance), so identical vectors score 1\n",
"    \"\"\"\n",
"    delta = np.array(emb1) - np.array(emb2)\n",
"    l2_distance = np.linalg.norm(delta)\n",
"    comparison = {}\n",
"    comparison[\"different\"] = l2_distance > 0\n",
"    comparison[\"distance\"] = l2_distance\n",
"    comparison[\"similarity\"] = 1/(1+l2_distance)  # Converts distance to similarity score\n",
"    return comparison\n",
"\n",
"# Method 2: Cosine similarity (common for embeddings)\n",
"def compare_embeddings_cosine(emb1, emb2):\n",
"    \"\"\"Compare two embeddings by cosine similarity.\n",
"\n",
"    Returns a dict with:\n",
"      - \"different\": True when the similarity is below 0.9999\n",
"      - \"similarity\": 1 minus the cosine distance of the two vectors\n",
"    \"\"\"\n",
"    # scipy's cosine() is a distance, so flip it into a similarity\n",
"    cosine_distance = cosine(np.array(emb1), np.array(emb2))\n",
"    cosine_similarity = 1 - cosine_distance\n",
"    is_different = cosine_similarity < 0.9999  # Almost identical if > 0.9999\n",
"    return {\"different\": is_different, \"similarity\": cosine_similarity}\n",
"\n",
"# Method 3: Simple exact equality check\n",
"def are_embeddings_identical(emb1, emb2):\n",
"    \"\"\"Return True only when both embeddings have the same shape and equal values.\"\"\"\n",
"    first = np.array(emb1)\n",
"    second = np.array(emb2)\n",
"    return np.array_equal(first, second)\n",
"\n",
"\n",
"def run_advanced_comparisons():\n",
"    \"\"\"Print the exact, cosine and Euclidean comparison for each embedding pair.\n",
"\n",
"    Reads the notebook-level lists `dimensions1` and `dimensions2` and compares\n",
"    them index by index, for as many pairs as both lists provide.\n",
"    \"\"\"\n",
"    # Derive the pair count from the data rather than assuming exactly three\n",
"    # chunks, which raised IndexError when fewer were returned\n",
"    pair_count = min(len(dimensions1), len(dimensions2))\n",
"    for i in range(pair_count):\n",
"        print(f\"\\n\\nComparing vector dimensions for dimension[{i}]....\\n\")\n",
"        print(\"Exactly identical? ---> \", are_embeddings_identical(dimensions1[i], dimensions2[i]))\n",
"        print(\"Cosine comparison: ---> \", compare_embeddings_cosine(dimensions1[i], dimensions2[i]))\n",
"        print(\"Euclidean comparison: ---> \", compare_embeddings_euclidean(dimensions1[i], dimensions2[i]))\n",
"\n",
"\n",
"#run_advanced_comparisons()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}