Merge pull request #265 from zoya-hammad/community-contributions
Added my contributions to community-contributions
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fad6ee3f-45b8-4ac3-aa39-4a44dac91994",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creating Text Embeddings From a Text File\n",
|
||||
"- Loading data using TextLoader\n",
|
||||
"- Splitting into chunks using CharacterTextSplitter\n",
|
||||
"- Converting chunks into vector embeddings and creating a vectorstore\n",
|
||||
"- Retreiving, reducing dimensions to 2D and displaying text embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "33b79f0d-7bd5-4e82-9295-2cc5cfa9495b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "391d12b3-ea25-4c66-93ba-71ef7c590be3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.schema import Document\n",
|
||||
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
|
||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||
"from langchain_chroma import Chroma\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"import plotly.graph_objects as go"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "365d4346-bcf7-48b3-be13-b492f1877fab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"gpt-4o-mini\"\n",
|
||||
"db_name = \"my_vector_db\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "93887c1e-fb5e-4f9a-95f6-91a284e49695",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "86289eb8-25d8-405f-b1bb-3d9d9fed8671",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TextLoader(\"data.txt\", encoding=\"utf-8\")\n",
|
||||
"data = loader.load()\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for text in data:\n",
|
||||
" documents.append(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "32320fff-2321-40ea-9b7d-294dc2dfba3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=5)\n",
|
||||
"chunks = text_splitter.split_documents(documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fce762a5-4c78-4102-ab55-f95ee0c97286",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(chunks)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ddb5bc12-af30-476d-bbbb-f91a3ae8af2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = OpenAIEmbeddings()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75ba81ec-9178-4ce4-83e2-82f937c85902",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if os.path.exists(db_name):\n",
|
||||
" Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c3ca2632-a8b3-4e7e-8370-d91579d31c23",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
|
||||
"print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0de67066-73f5-446f-9033-a00d45b0cdc1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get one vector and find how many dimensions it has\n",
|
||||
"\n",
|
||||
"collection = vectorstore._collection\n",
|
||||
"sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0] # represents a single vector\n",
|
||||
"dimensions = len(sample_embedding)\n",
|
||||
"print(f\"The vectors have {dimensions:,} dimensions\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e50d972c-d740-4f0a-8bc2-e55ebe462a41",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_embedding"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa96105d-b882-48d9-b088-6aab5db7b1e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"result = collection.get(include=['embeddings','documents'])\n",
|
||||
"vectors = np.array(result['embeddings']) \n",
|
||||
"documents = result['documents']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "213b4cf2-db0a-4610-8d8f-97607996ed17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reduce dimensionality to 2D using t-SNE\n",
|
||||
"tsne = TSNE(n_components=2,perplexity=5, random_state=42)\n",
|
||||
"reduced_vectors = tsne.fit_transform(vectors)\n",
|
||||
"\n",
|
||||
"# Create the 2D scatter plot\n",
|
||||
"fig = go.Figure(data=[go.Scatter(\n",
|
||||
" x=reduced_vectors[:, 0],\n",
|
||||
" y=reduced_vectors[:, 1],\n",
|
||||
" mode='markers',\n",
|
||||
" marker=dict(size=5, opacity=0.8),\n",
|
||||
" text=[f\"Text: {d[:200]}...\" for d in documents],\n",
|
||||
" hoverinfo='text'\n",
|
||||
")])\n",
|
||||
"\n",
|
||||
"fig.update_layout(\n",
|
||||
" title='2D Chroma Vector Store Visualization',\n",
|
||||
" scene=dict(xaxis_title='x',yaxis_title='y'),\n",
|
||||
" width=800,\n",
|
||||
" height=600,\n",
|
||||
" margin=dict(r=20, b=10, l=10, t=40)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig.show()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7d13aa60-da3e-4c61-af69-1ba9087e0181",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user