Add notebook for creating a RAG database using OCR, by Muhammad (output cleared); only the file has been added

This commit is contained in:
Muhammad Ijaz
2025-07-22 17:10:46 +05:00
parent 96b796dfc9
commit 1faa95ca15

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Just before starting \n",
"\n",
"## Issues and Considerations\n",
"\n",
"This notebook requires a few installation to run. First is tesseract:\n",
"\n",
"For windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n",
"\n",
"For Linux: run on the cli \"sudo apt-get install tesseract-ocr\"\n",
"\n",
"For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n",
"\n",
"\n",
"Next install pytesseract in your environment\n",
"\n",
"For uv: uv pip install pytesseract\n",
"\n",
"For pip install: pip install pytesseract\n",
"\n",
"\n",
"You would require an OpenAI API key and Pinecone API key in your .env file\n"
]
},
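{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a minimal sketch, assuming Tesseract and pytesseract are already installed), the cell below asks pytesseract for the Tesseract version; a TesseractNotFoundError means the binary is not on your PATH.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: raises TesseractNotFoundError if the Tesseract binary is\n",
"# missing, in which case revisit the installation links above\n",
"import pytesseract\n",
"print(pytesseract.get_tesseract_version())"
]
},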
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"# Install LangChain Unstructured (which requires unstructured under the hood)\n",
"\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import Pinecone\n",
"import getpass\n",
"from pinecone import Pinecone\n",
"import os\n",
"from pinecone import ServerlessSpec\n",
"from langchain_pinecone import PineconeVectorStore\n",
"from unstructured.partition.pdf import partition_pdf\n",
"import glob"
]
},
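{
"cell_type": "markdown",
"metadata": {},
"source": [
"The intro mentions keeping both API keys in a .env file, but nothing below reads that file; a minimal sketch (assuming the python-dotenv package is installed) that loads it into the process environment:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load OPENAI_API_KEY and PINECONE_API_KEY from a .env file in the working directory\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},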
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Initialize embeddings and Pinecone vector store\n",
"embeddings = OpenAIEmbeddings() #"
]
},
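{
"cell_type": "markdown",
"metadata": {},
"source": [
"A one-line check (a sketch) that embedding calls work and, with the default model, produce 1536-dimensional vectors, matching the index dimension used below:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Should print 1536, the dimension the Pinecone index is created with below\n",
"print(len(embeddings.embed_query(\"test\")))"
]
},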
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Pinecone client\n",
"if not os.getenv(\"PINECONE_API_KEY\"):\n",
" os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n",
"pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
"pc = Pinecone(api_key=pinecone_api_key)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Index Creation and its testing \n",
"index_name = \"lahore-cases\" # Replace the name with anything you like \n",
"if not pc.has_index(index_name):\n",
" pc.create_index(\n",
" name=index_name,\n",
" dimension=1536,\n",
" metric=\"cosine\",\n",
" spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
" )\n"
]
},
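{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, you can confirm the index was created with the expected settings; a sketch using the Pinecone client's describe_index:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: confirm the dimension (1536) and metric (cosine) of the new index\n",
"print(pc.describe_index(index_name))"
]
},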
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# vector store\n",
"index = pc.Index(index_name)\n",
"vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = glob.glob(pathname='./**/*.pdf',recursive=True) # I have set recursive = True so that we can check subdirectories too.\n",
"print(len(files)) # confirm that you have all of the pdfs here with the correct path"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"chunks = [] # The array to store the sections in \n",
"section_content = \"\"\n",
"index = 1\n",
"for file_path in files:\n",
" print(f\"File Number {index} completed:\",file_path) # To keep track of files\n",
" index+=1\n",
" elements = partition_pdf(file_path, languages=[\"eng\"],strategy=\"fast\")\n",
" for element in elements:\n",
" if element.category == 'NarrativeText': # meaning that it is simmple text \n",
" section_content+=element.text # Then append it to the already going section content\n",
" elif element.category==\"ListItem\":\n",
" chunks.append({\"page_content\":section_content,\"metadata\":element.metadata})\n",
" section_content=\"\" # Because a new sectionn has started\n",
" section_content += element.text # The string should start with the title of the text\n"
]
},
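{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before embedding, it is worth eyeballing the chunk sizes (a small sketch; very long chunks risk exceeding the embedding model's context window):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick distribution of chunk lengths in characters\n",
"lengths = [len(chunk[\"page_content\"]) for chunk in chunks]\n",
"print(f\"{len(chunks)} chunks; min={min(lengths)}, max={max(lengths)}, avg={sum(lengths) // len(lengths)}\")"
]
},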
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(set([chunk['metadata'].filename for chunk in chunks])) # Check if all of the completed files are here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chunks # How each chunk looks like"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"# How pinecone expects each chunk to be\n",
"docs = [Document(page_content=chunk['page_content'],metadata={\"source\":chunk['metadata'].filename}) for chunk in chunks]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in docs:\n",
" print(doc.page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now add all of the docs in the pinceone namespace\n",
"from uuid import uuid4\n",
"uuids = [str(uuid4()) for _ in range(len(docs))]\n",
"batch_size = 200\n",
"for i in range(0, len(docs), batch_size):\n",
" print(\"Current Batch Index is:\",i)\n",
" batch = docs[i:i+batch_size]\n",
" batch_ids = uuids[i:i+batch_size]\n",
" vector_store.add_documents(batch,ids=batch_ids)"
]
},
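{
"cell_type": "markdown",
"metadata": {},
"source": [
"To verify the upload landed, a sketch that checks the vector count via describe_index_stats (the count may lag for a few seconds after upserting):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: total_vector_count should roughly match len(docs)\n",
"print(index.describe_index_stats())"
]
},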
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",k=10)"
]
},
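{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you also want to see how relevant each hit is, a sketch using similarity_search_with_score, which returns (Document, score) pairs:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same query, but with cosine similarity scores attached to each hit\n",
"scored = vector_store.similarity_search_with_score(\n",
"    query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",\n",
"    k=3,\n",
")\n",
"for doc, score in scored:\n",
"    print(f\"{score:.3f}  {doc.metadata['source']}\")"
]
},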
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}