vectorstore with openAI
283 week5/community-contributions/day5_vectorstore_openai.ipynb (Normal file)
@@ -0,0 +1,283 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import documents exported from Evernote to a vectorstore\n",
    "### Use OpenAI file search with the Responses API\n",
    "#### Prerequisite steps\n",
    "* Exported notes from an Evernote notebook as HTML.\n",
    "* Converted the notes to Markdown files and removed broken image links (using Python/AI); a rough sketch of this conversion is shown in the next cell.\n",
    "* Named the files after the note titles.\n",
    "\n",
    "All files are in one folder.\n",
    "\n",
    "##### Query a ChromaDB vectorstore\n",
    "I first tried to accomplish this with RAG, following the example by https://github.com/ed-donner/llm_engineering/commits?author=dinorrusso.\n",
    "\n",
    "I thought this would be a trivial task, but it was not 😃 That example uses Ollama running locally,\n",
    "and even though the retriever had the required information, it was dropped from the answer.\n",
    "\n",
    "I then tried Chroma + OpenAI, and after several attempts succeeded in creating a vectorstore and querying it. That's it for this time.\n",
    "\n",
    "##### OpenAI vector store (see the bottom of the notebook)\n",
    "Another attempt was to use OpenAI's file search tool, which seemed pretty straightforward.\n",
    "The downside: file uploads did not always succeed. The code is kept as a reference."
   ]
  },
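  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Optional: HTML to Markdown conversion sketch\n",
    "The prerequisite conversion was done outside this notebook. The cell below is only a rough sketch of how it could look, assuming the `html2text` package is installed, that `EVERNOTE_EXPORT` points at the folder with the exported `.html` files, and that a 'broken image link' means an image reference whose target file is missing. It is not the exact script that was used."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: convert exported Evernote HTML notes to Markdown and drop broken image links.\n",
    "# Assumes the html2text package (pip install html2text); adjust paths to your own export.\n",
    "import os\n",
    "import re\n",
    "import glob\n",
    "import html2text\n",
    "\n",
    "export_folder = os.getenv('EVERNOTE_EXPORT')\n",
    "\n",
    "converter = html2text.HTML2Text()\n",
    "converter.body_width = 0  # do not hard-wrap lines\n",
    "\n",
    "def drop_broken_images(markdown, base_folder):\n",
    "    # Remove ![alt](target) references whose target file does not exist on disk\n",
    "    def keep_or_drop(match):\n",
    "        target = match.group(1)\n",
    "        return match.group(0) if os.path.exists(os.path.join(base_folder, target)) else ''\n",
    "    return re.sub(r'!\\[[^\\]]*\\]\\(([^)]+)\\)', keep_or_drop, markdown)\n",
    "\n",
    "for html_path in glob.glob(os.path.join(export_folder, '*.html')):\n",
    "    with open(html_path, 'r', encoding='utf-8') as f:\n",
    "        markdown = converter.handle(f.read())\n",
    "    markdown = drop_broken_images(markdown, export_folder)\n",
    "    md_path = os.path.splitext(html_path)[0] + '.md'  # keep the note title as the file name\n",
    "    with open(md_path, 'w', encoding='utf-8') as f:\n",
    "        f.write(markdown)"
   ]
  },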
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr\n",
    "import openai\n",
    "import chromadb\n",
    "from chromadb.config import Settings\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load files to vectorstore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "def chunk_text(text, max_chars=2000):\n",
    "    # Rough character-based chunking (characters are used as a simple proxy for tokens)\n",
    "    words = text.split()\n",
    "    chunks = []\n",
    "    current_chunk = []\n",
    "    current_length = 0\n",
    "\n",
    "    for word in words:\n",
    "        current_length += len(word) + 1  # +1 for the space\n",
    "        if current_length > max_chars:\n",
    "            chunks.append(\" \".join(current_chunk))\n",
    "            current_chunk = [word]\n",
    "            current_length = len(word) + 1\n",
    "        else:\n",
    "            current_chunk.append(word)\n",
    "\n",
    "    if current_chunk:\n",
    "        chunks.append(\" \".join(current_chunk))\n",
    "\n",
    "    return chunks\n",
    "\n",
    "\n",
    "chroma_client = chromadb.Client()\n",
    "\n",
    "# Create or get the existing collection; drop it first if it already has content\n",
    "collection_name = \"EverNotes\"\n",
    "\n",
    "try:\n",
    "    existing_collection = chroma_client.get_collection(name=collection_name)\n",
    "    if existing_collection.count() > 0:\n",
    "        chroma_client.delete_collection(name=collection_name)\n",
    "except Exception:\n",
    "    print(f\"Collection {collection_name} does not exist. Creating a new one.\")\n",
    "\n",
    "# Create a collection in ChromaDB\n",
    "collection = chroma_client.get_or_create_collection(name=collection_name)\n",
    "\n",
    "# The data should end up looking like this:\n",
    "# documents = [\"OpenAI is revolutionizing AI.\", \"ChromaDB makes embedding storage easy.\"]\n",
    "# metadata = [{\"id\": 1}, {\"id\": 2}]\n",
    "\n",
    "folder_path = os.getenv('EVERNOTE_EXPORT')\n",
    "documents = []\n",
    "\n",
    "for root, dirs, files in os.walk(folder_path):\n",
    "    for file in files:\n",
    "        if file.endswith('.md'):  # Change this to the file extension you need\n",
    "            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:\n",
    "                documents.append(f.read())\n",
    "\n",
    "metadata = [{\"id\": i + 1} for i in range(len(documents))]\n",
    "\n",
    "# Generate embeddings using OpenAI\n",
    "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
    "    response = openai.embeddings.create(input=text, model=model)\n",
    "    return response.data[0].embedding\n",
    "\n",
    "# Add documents and embeddings to ChromaDB in chunks\n",
    "for doc, meta in zip(documents, metadata):\n",
    "    chunks = chunk_text(doc)\n",
    "    for chunk_index, chunk in enumerate(chunks):\n",
    "        embedding = get_embedding(chunk)\n",
    "        collection.add(\n",
    "            documents=[chunk],\n",
    "            embeddings=[embedding],\n",
    "            metadatas=[meta],\n",
    "            ids=[f\"{meta['id']}-{chunk_index}\"]  # unique id per chunk so chunks do not overwrite each other\n",
    "        )\n"
   ]
  },
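  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (not part of the original flow): confirm how many chunks ended up in the collection and peek at one stored entry, assuming the standard `count()` and `peek()` methods of a Chroma collection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (my addition): how many chunks are in the collection, and what does one look like?\n",
    "print(f\"Chunks stored in '{collection_name}': {collection.count()}\")\n",
    "\n",
    "sample = collection.peek(limit=1)  # dict with ids, embeddings, documents, metadatas\n",
    "if sample['documents']:\n",
    "    print('Sample id:', sample['ids'][0])\n",
    "    print('Sample text:', sample['documents'][0][:200], '...')"
   ]
  },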
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Query ChromaDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_text = \"Is there a video for Fitting the Shimano speed hub 7\"\n",
    "query_embedding = get_embedding(query_text)\n",
    "\n",
    "results = collection.query(\n",
    "    query_embeddings=[query_embedding],\n",
    "    n_results=2\n",
    ")\n",
    "\n",
    "print(\"Query Results:\", results)"
   ]
  },
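  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The raw `results` dict is hard to read. The cell below is a small sketch of how to pull out just the matched documents and their metadata, assuming the usual Chroma result layout (parallel lists of lists, one inner list per query)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the Chroma query results (my addition): one inner list per query, so index [0]\n",
    "for doc, meta, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):\n",
    "    print(f\"id={meta['id']}  distance={distance:.4f}\")\n",
    "    print(doc[:300], '...')\n",
    "    print('-' * 40)"
   ]
  },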
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Gradio interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to query ChromaDB\n",
    "def query_chromadb(query_text):\n",
    "    query_embedding = get_embedding(query_text)\n",
    "    results = collection.query(\n",
    "        query_embeddings=[query_embedding],\n",
    "        n_results=2\n",
    "    )\n",
    "    return results\n",
    "\n",
    "# Gradio interface\n",
    "def gradio_interface(query_text):\n",
    "    results = query_chromadb(query_text)\n",
    "    return results\n",
    "\n",
    "# Create Gradio app\n",
    "iface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    inputs=\"text\",\n",
    "    outputs=\"text\",\n",
    "    title=\"ChromaDB Query Interface\",\n",
    "    description=\"Enter your query to search the ChromaDB collection.\"\n",
    ")\n",
    "\n",
    "iface.launch()"
   ]
  },
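  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Returning the raw dict to Gradio prints it as one long string. Below is an optional variant (my addition, not required) that formats the matches into readable text before handing them to the same kind of `gr.Interface`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional variant (my addition): format the matches into readable text for the Gradio output box\n",
    "def gradio_interface_pretty(query_text):\n",
    "    results = query_chromadb(query_text)\n",
    "    lines = []\n",
    "    for doc, meta, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):\n",
    "        lines.append(f\"Note id {meta['id']} (distance {distance:.4f}):\")\n",
    "        lines.append(doc[:500])\n",
    "        lines.append('-' * 40)\n",
    "    return '\\n'.join(lines)\n",
    "\n",
    "iface_pretty = gr.Interface(\n",
    "    fn=gradio_interface_pretty,\n",
    "    inputs='text',\n",
    "    outputs='text',\n",
    "    title='ChromaDB Query Interface (formatted)',\n",
    "    description='Enter your query to search the ChromaDB collection.'\n",
    ")\n",
    "\n",
    "iface_pretty.launch()"
   ]
  },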
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Below: the OpenAI file search variant, which had occasional failures in file uploads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "folder_path = os.environ['EVERNOTE_EXPORT']\n",
    "# Keep only the .md files\n",
    "md_files = glob.glob(os.path.join(folder_path, '*.md'))  # glob already returns the full paths\n",
    "file_streams = [open(path, 'rb') for path in md_files]\n",
    "\n",
    "# Create vector store\n",
    "vector_store = openai.vector_stores.create(\n",
    "    name=\"Evernote notes\",\n",
    ")\n",
    "\n",
    "# Batch upload limit: you can upload up to 100 files in a single batch\n",
    "# https://community.openai.com/t/max-100-files-in-vector-store/729876/4\n",
    "batch_size = 90\n",
    "for i in range(0, len(file_streams), batch_size):\n",
    "    batch = file_streams[i:i + batch_size]\n",
    "    file_batch = openai.vector_stores.file_batches.upload_and_poll(\n",
    "        vector_store_id=vector_store.id,\n",
    "        files=batch\n",
    "    )\n",
    "    print(file_batch.status)\n",
    "    print(file_batch.file_counts)\n",
    "\n",
    "# Close the file handles once the upload is done\n",
    "for stream in file_streams:\n",
    "    stream.close()\n",
    "\n",
    "# Some uploads can fail, e.g.:\n",
    "# FileCounts(cancelled=0, completed=89, failed=1, in_progress=0, total=90)\n",
    "# Usually about 1 % fail; I did not find a fix for that yet (a retry sketch follows in the next cell)."
   ]
  },
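  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A possible (untested here) way to deal with the occasional failed uploads: list the vector store files, find the ones whose status is `failed`, look up their original filenames via the Files API, and upload just those again. This assumes the SDK's `openai.vector_stores.files.list` and `openai.files.retrieve` calls; treat it as a sketch rather than a verified fix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch (my addition): retry the files that failed to attach to the vector store\n",
    "failed_names = []\n",
    "for vs_file in openai.vector_stores.files.list(vector_store_id=vector_store.id):\n",
    "    if vs_file.status == 'failed':\n",
    "        # The vector store file id is the underlying file id; fetch its original filename\n",
    "        failed_names.append(openai.files.retrieve(vs_file.id).filename)\n",
    "\n",
    "print('Failed files:', failed_names)\n",
    "\n",
    "if failed_names:\n",
    "    retry_streams = [open(os.path.join(folder_path, name), 'rb') for name in failed_names]\n",
    "    retry_batch = openai.vector_stores.file_batches.upload_and_poll(\n",
    "        vector_store_id=vector_store.id,\n",
    "        files=retry_streams\n",
    "    )\n",
    "    print(retry_batch.status)\n",
    "    print(retry_batch.file_counts)\n",
    "    for stream in retry_streams:\n",
    "        stream.close()"
   ]
  },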
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = openai.responses.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    input=\"Is there a video for Fitting the Shimano speed hub 7?\",\n",
    "    tools=[{\n",
    "        \"type\": \"file_search\",\n",
    "        \"vector_store_ids\": [vector_store.id]\n",
    "    }],\n",
    "    include=None\n",
    ")\n",
    "print(response)"
   ]
  },
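  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Printing the whole `response` object is noisy. The cell below is a small sketch that prints only the generated answer via the SDK's `output_text` convenience property, plus any file citations it can find; the annotation fields are read defensively since their exact shape may vary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print just the answer text from the Responses API result (my addition)\n",
    "print(response.output_text)\n",
    "\n",
    "# List any file citations attached to the answer (fields read defensively)\n",
    "for item in response.output:\n",
    "    for part in getattr(item, 'content', []) or []:\n",
    "        for annotation in getattr(part, 'annotations', []) or []:\n",
    "            if getattr(annotation, 'type', '') == 'file_citation':\n",
    "                print('Cited file:', getattr(annotation, 'filename', None) or annotation.file_id)"
   ]
  }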
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}