vectorstore with openAI
283 week5/community-contributions/day5_vectorstore_openai.ipynb (Normal file)
@@ -0,0 +1,283 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import documents exported from Evernote to a vectorstore\n",
    "### Use OpenAI file search with the Responses API\n",
    "#### Prerequisite steps\n",
    "* Exported notes from an Evernote notebook as HTML.\n",
    "* Converted the notes to Markdown files and removed broken image links (using Python/AI); a rough sketch of this conversion is shown in the next cell.\n",
    "* Named the files after the note titles.\n",
    "\n",
    "All files are in one folder.\n",
    "\n",
    "##### Query a ChromaDB vectorstore\n",
    "I first tried to accomplish this with RAG, following the example by https://github.com/ed-donner/llm_engineering/commits?author=dinorrusso.\n",
    "\n",
    "I thought this would be a trivial task, but it was not 😃 That example uses Ollama running locally,\n",
    "and even though the retriever had the required information, it was dropped from the answer.\n",
    "\n",
    "I then tried Chroma + OpenAI, and after several attempts succeeded in creating a vectorstore and querying it. That's it for this time.\n",
    "\n",
    "##### OpenAI vector store (see the bottom of the notebook)\n",
    "Another attempt was to use OpenAI's file search tool, which seemed pretty straightforward.\n",
    "The downside: file uploads did not always succeed. The code is kept as a reference."
   ]
  },
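  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Optional: HTML to Markdown conversion sketch\n",
    "The prerequisite conversion was done outside this notebook. The cell below is only a rough sketch of how it could look, assuming the `html2text` package is installed, that `EVERNOTE_EXPORT` points at the folder with the exported `.html` files, and that a 'broken image link' means an image reference whose target file is missing. It is not the exact script that was used."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: convert exported Evernote HTML notes to Markdown and drop broken image links.\n",
    "# Assumes the html2text package (pip install html2text); adjust paths to your own export.\n",
    "import os\n",
    "import re\n",
    "import glob\n",
    "import html2text\n",
    "\n",
    "export_folder = os.getenv('EVERNOTE_EXPORT')\n",
    "\n",
    "converter = html2text.HTML2Text()\n",
    "converter.body_width = 0  # do not hard-wrap lines\n",
    "\n",
    "def drop_broken_images(markdown, base_folder):\n",
    "    # Remove ![alt](target) references whose target file does not exist on disk\n",
    "    def keep_or_drop(match):\n",
    "        target = match.group(1)\n",
    "        return match.group(0) if os.path.exists(os.path.join(base_folder, target)) else ''\n",
    "    return re.sub(r'!\\[[^\\]]*\\]\\(([^)]+)\\)', keep_or_drop, markdown)\n",
    "\n",
    "for html_path in glob.glob(os.path.join(export_folder, '*.html')):\n",
    "    with open(html_path, 'r', encoding='utf-8') as f:\n",
    "        markdown = converter.handle(f.read())\n",
    "    markdown = drop_broken_images(markdown, export_folder)\n",
    "    md_path = os.path.splitext(html_path)[0] + '.md'  # keep the note title as the file name\n",
    "    with open(md_path, 'w', encoding='utf-8') as f:\n",
    "        f.write(markdown)"
   ]
  },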
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr\n",
    "import openai\n",
    "import chromadb\n",
    "from chromadb.config import Settings\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load files to vectorstore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "def chunk_text(text, max_chars=2000):\n",
    "    # Rough character-based chunking (characters are used as a simple proxy for tokens)\n",
    "    words = text.split()\n",
    "    chunks = []\n",
    "    current_chunk = []\n",
    "    current_length = 0\n",
    "\n",
    "    for word in words:\n",
    "        current_length += len(word) + 1  # +1 for the space\n",
    "        if current_length > max_chars:\n",
    "            chunks.append(\" \".join(current_chunk))\n",
    "            current_chunk = [word]\n",
    "            current_length = len(word) + 1\n",
    "        else:\n",
    "            current_chunk.append(word)\n",
    "\n",
    "    if current_chunk:\n",
    "        chunks.append(\" \".join(current_chunk))\n",
    "\n",
    "    return chunks\n",
    "\n",
    "\n",
    "chroma_client = chromadb.Client()\n",
    "\n",
    "# Create or get the existing collection; drop it first if it already has content\n",
    "collection_name = \"EverNotes\"\n",
    "\n",
    "try:\n",
    "    existing_collection = chroma_client.get_collection(name=collection_name)\n",
    "    if existing_collection.count() > 0:\n",
    "        chroma_client.delete_collection(name=collection_name)\n",
    "except Exception:\n",
    "    print(f\"Collection {collection_name} does not exist. Creating a new one.\")\n",
    "\n",
    "# Create a collection in ChromaDB\n",
    "collection = chroma_client.get_or_create_collection(name=collection_name)\n",
    "\n",
    "# The data should end up looking like this:\n",
    "# documents = [\"OpenAI is revolutionizing AI.\", \"ChromaDB makes embedding storage easy.\"]\n",
    "# metadata = [{\"id\": 1}, {\"id\": 2}]\n",
    "\n",
    "folder_path = os.getenv('EVERNOTE_EXPORT')\n",
    "documents = []\n",
    "\n",
    "for root, dirs, files in os.walk(folder_path):\n",
    "    for file in files:\n",
    "        if file.endswith('.md'):  # Change this to the file extension you need\n",
    "            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:\n",
    "                documents.append(f.read())\n",
    "\n",
    "metadata = [{\"id\": i + 1} for i in range(len(documents))]\n",
    "\n",
    "# Generate embeddings using OpenAI\n",
    "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
    "    response = openai.embeddings.create(input=text, model=model)\n",
    "    return response.data[0].embedding\n",
    "\n",
    "# Add documents and embeddings to ChromaDB in chunks\n",
    "for doc, meta in zip(documents, metadata):\n",
    "    chunks = chunk_text(doc)\n",
    "    for chunk_index, chunk in enumerate(chunks):\n",
    "        embedding = get_embedding(chunk)\n",
    "        collection.add(\n",
    "            documents=[chunk],\n",
    "            embeddings=[embedding],\n",
    "            metadatas=[meta],\n",
    "            ids=[f\"{meta['id']}-{chunk_index}\"]  # unique id per chunk so chunks do not overwrite each other\n",
    "        )\n"
   ]
  },
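  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (not part of the original flow): confirm how many chunks ended up in the collection and peek at one stored entry, assuming the standard `count()` and `peek()` methods of a Chroma collection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (my addition): how many chunks are in the collection, and what does one look like?\n",
    "print(f\"Chunks stored in '{collection_name}': {collection.count()}\")\n",
    "\n",
    "sample = collection.peek(limit=1)  # dict with ids, embeddings, documents, metadatas\n",
    "if sample['documents']:\n",
    "    print('Sample id:', sample['ids'][0])\n",
    "    print('Sample text:', sample['documents'][0][:200], '...')"
   ]
  },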
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Query ChromaDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_text = \"Is there a video for Fitting the Shimano speed hub 7\"\n",
    "query_embedding = get_embedding(query_text)\n",
    "\n",
    "results = collection.query(\n",
    "    query_embeddings=[query_embedding],\n",
    "    n_results=2\n",
    ")\n",
    "\n",
    "print(\"Query Results:\", results)"
   ]
  },
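  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The raw `results` dict is hard to read. The cell below is a small sketch of how to pull out just the matched documents and their metadata, assuming the usual Chroma result layout (parallel lists of lists, one inner list per query)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the Chroma query results (my addition): one inner list per query, so index [0]\n",
    "for doc, meta, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):\n",
    "    print(f\"id={meta['id']}  distance={distance:.4f}\")\n",
    "    print(doc[:300], '...')\n",
    "    print('-' * 40)"
   ]
  },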
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Gradio interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to query ChromaDB\n",
    "def query_chromadb(query_text):\n",
    "    query_embedding = get_embedding(query_text)\n",
    "    results = collection.query(\n",
    "        query_embeddings=[query_embedding],\n",
    "        n_results=2\n",
    "    )\n",
    "    return results\n",
    "\n",
    "# Gradio interface\n",
    "def gradio_interface(query_text):\n",
    "    results = query_chromadb(query_text)\n",
    "    return results\n",
    "\n",
    "# Create Gradio app\n",
    "iface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    inputs=\"text\",\n",
    "    outputs=\"text\",\n",
    "    title=\"ChromaDB Query Interface\",\n",
    "    description=\"Enter your query to search the ChromaDB collection.\"\n",
    ")\n",
    "\n",
    "iface.launch()"
   ]
  },
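  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Returning the raw dict to Gradio prints it as one long string. Below is an optional variant (my addition, not required) that formats the matches into readable text before handing them to the same kind of `gr.Interface`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional variant (my addition): format the matches into readable text for the Gradio output box\n",
    "def gradio_interface_pretty(query_text):\n",
    "    results = query_chromadb(query_text)\n",
    "    lines = []\n",
    "    for doc, meta, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):\n",
    "        lines.append(f\"Note id {meta['id']} (distance {distance:.4f}):\")\n",
    "        lines.append(doc[:500])\n",
    "        lines.append('-' * 40)\n",
    "    return '\\n'.join(lines)\n",
    "\n",
    "iface_pretty = gr.Interface(\n",
    "    fn=gradio_interface_pretty,\n",
    "    inputs='text',\n",
    "    outputs='text',\n",
    "    title='ChromaDB Query Interface (formatted)',\n",
    "    description='Enter your query to search the ChromaDB collection.'\n",
    ")\n",
    "\n",
    "iface_pretty.launch()"
   ]
  },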
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Below: the OpenAI file search variant, which had occasional failures in file uploads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "folder_path = os.environ['EVERNOTE_EXPORT']\n",
    "# Keep only the .md files\n",
    "md_files = glob.glob(os.path.join(folder_path, '*.md'))  # glob already returns the full paths\n",
    "file_streams = [open(path, 'rb') for path in md_files]\n",
    "\n",
    "# Create vector store\n",
    "vector_store = openai.vector_stores.create(\n",
    "    name=\"Evernote notes\",\n",
    ")\n",
    "\n",
    "# Batch upload limit: you can upload up to 100 files in a single batch\n",
    "# https://community.openai.com/t/max-100-files-in-vector-store/729876/4\n",
    "batch_size = 90\n",
    "for i in range(0, len(file_streams), batch_size):\n",
    "    batch = file_streams[i:i + batch_size]\n",
    "    file_batch = openai.vector_stores.file_batches.upload_and_poll(\n",
    "        vector_store_id=vector_store.id,\n",
    "        files=batch\n",
    "    )\n",
    "    print(file_batch.status)\n",
    "    print(file_batch.file_counts)\n",
    "\n",
    "# Close the file handles once the upload is done\n",
    "for stream in file_streams:\n",
    "    stream.close()\n",
    "\n",
    "# Some uploads can fail, e.g.:\n",
    "# FileCounts(cancelled=0, completed=89, failed=1, in_progress=0, total=90)\n",
    "# Usually about 1 % fail; I did not find a fix for that yet (a retry sketch follows in the next cell)."
   ]
  },
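  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A possible (untested here) way to deal with the occasional failed uploads: list the vector store files, find the ones whose status is `failed`, look up their original filenames via the Files API, and upload just those again. This assumes the SDK's `openai.vector_stores.files.list` and `openai.files.retrieve` calls; treat it as a sketch rather than a verified fix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch (my addition): retry the files that failed to attach to the vector store\n",
    "failed_names = []\n",
    "for vs_file in openai.vector_stores.files.list(vector_store_id=vector_store.id):\n",
    "    if vs_file.status == 'failed':\n",
    "        # The vector store file id is the underlying file id; fetch its original filename\n",
    "        failed_names.append(openai.files.retrieve(vs_file.id).filename)\n",
    "\n",
    "print('Failed files:', failed_names)\n",
    "\n",
    "if failed_names:\n",
    "    retry_streams = [open(os.path.join(folder_path, name), 'rb') for name in failed_names]\n",
    "    retry_batch = openai.vector_stores.file_batches.upload_and_poll(\n",
    "        vector_store_id=vector_store.id,\n",
    "        files=retry_streams\n",
    "    )\n",
    "    print(retry_batch.status)\n",
    "    print(retry_batch.file_counts)\n",
    "    for stream in retry_streams:\n",
    "        stream.close()"
   ]
  },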
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = openai.responses.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    input=\"Is there a video for Fitting the Shimano speed hub 7?\",\n",
    "    tools=[{\n",
    "        \"type\": \"file_search\",\n",
    "        \"vector_store_ids\": [vector_store.id]\n",
    "    }],\n",
    "    include=None\n",
    ")\n",
    "print(response)"
   ]
  },
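  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Printing the whole `response` object is noisy. The cell below is a small sketch that prints only the generated answer via the SDK's `output_text` convenience property, plus any file citations it can find; the annotation fields are read defensively since their exact shape may vary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print just the answer text from the Responses API result (my addition)\n",
    "print(response.output_text)\n",
    "\n",
    "# List any file citations attached to the answer (fields read defensively)\n",
    "for item in response.output:\n",
    "    for part in getattr(item, 'content', []) or []:\n",
    "        for annotation in getattr(part, 'annotations', []) or []:\n",
    "            if getattr(annotation, 'type', '') == 'file_citation':\n",
    "                print('Cited file:', getattr(annotation, 'filename', None) or annotation.file_id)"
   ]
  }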
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}