From 1faa95ca1504545797fd0933c8bd6c47c8ddc4ff Mon Sep 17 00:00:00 2001
From: Muhammad Ijaz <muhammad.bese23seecs@seecs.edu.pk>
Date: Tue, 22 Jul 2025 17:10:46 +0500
Subject: [PATCH 1/2] Add creating a RAG database using OCR by Muhammad
 (cleared output), only the file has been added

---
 .../RAG-using-OCR.ipynb                       | 229 ++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 week5/community-contributions/RAG-using-OCR.ipynb

diff --git a/week5/community-contributions/RAG-using-OCR.ipynb b/week5/community-contributions/RAG-using-OCR.ipynb
new file mode 100644
index 0000000..5f6e4d6
--- /dev/null
+++ b/week5/community-contributions/RAG-using-OCR.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Just before starting \n",
+    "\n",
+    "## Issues and Considerations\n",
+    "\n",
+    "This notebook requires a few installation to run. First is tesseract:\n",
+    "\n",
+    "For windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n",
+    "\n",
+    "For Linux: run `sudo apt-get install tesseract-ocr` in a terminal\n",
+    "\n",
+    "For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n",
+    "\n",
+    "\n",
+    "Next, install pytesseract in your environment:\n",
+    "\n",
+    "For uv: `uv pip install pytesseract`\n",
+    "\n",
+    "For pip: `pip install pytesseract`\n",
+    "\n",
+    "\n",
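+    "You will need an OpenAI API key and a Pinecone API key set in your environment, e.g. through your .env file (the notebook prompts for the Pinecone key if `PINECONE_API_KEY` is not set)\n"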
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports (install these packages first; PDF parsing uses the `unstructured` package)\n",
+    "\n",
+    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+    "from langchain.vectorstores import Pinecone\n",
+    "import getpass\n",
+    "from pinecone import Pinecone\n",
+    "import os\n",
+    "from pinecone import ServerlessSpec\n",
+    "from langchain_pinecone import PineconeVectorStore\n",
+    "from unstructured.partition.pdf import partition_pdf\n",
+    "import glob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 135,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the OpenAI embedding model (the Pinecone vector store is created in a later cell)\n",
+    "embeddings = OpenAIEmbeddings()  #"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 136,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Pinecone client\n",
+    "if not os.getenv(\"PINECONE_API_KEY\"):\n",
+    "    os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n",
+    "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
+    "pc = Pinecone(api_key=pinecone_api_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the Pinecone index if it does not exist yet\n",
+    "index_name = \"lahore-cases\" # Replace the name with anything you like \n",
+    "if not pc.has_index(index_name):\n",
+    "    pc.create_index(\n",
+    "        name=index_name,\n",
+    "        dimension=1536,\n",
+    "        metric=\"cosine\",\n",
+    "        spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# vector store\n",
+    "index = pc.Index(index_name)\n",
+    "vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = glob.glob(pathname='./**/*.pdf',recursive=True) # I have set recursive = True so that we can check subdirectories too.\n",
+    "print(len(files)) # confirm that you have all of the pdfs here with the correct path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunks = []  # One dict per section: page_content (text) + metadata (of the element that opened the section)\n",
+    "for file_number, file_path in enumerate(files, start=1):  # not named `index`: that is the Pinecone index handle\n",
+    "    section_content, section_metadata = \"\", None  # reset per file so text never leaks into the next PDF\n",
+    "    elements = partition_pdf(file_path, languages=[\"eng\"], strategy=\"fast\")\n",
+    "    for element in [*elements, None]:  # trailing None is a sentinel that flushes the file's last section\n",
+    "        if element is not None and element.category == \"NarrativeText\":  # plain text: extend the current section\n",
+    "            section_content += (\" \" if section_content else \"\") + element.text  # keep words from fusing\n",
+    "            section_metadata = element.metadata if section_metadata is None else section_metadata\n",
+    "        elif element is None or element.category == \"ListItem\":  # end of file, or a new section begins\n",
+    "            if section_content.strip():  # never store empty sections; there is nothing to embed\n",
+    "                chunks.append({\"page_content\": section_content, \"metadata\": section_metadata})\n",
+    "            if element is not None:  # the new section starts with the list item's own text\n",
+    "                section_content, section_metadata = element.text, element.metadata\n",
+    "    print(f\"File Number {file_number} completed:\", file_path)  # To keep track of files\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(set([chunk['metadata'].filename for chunk in chunks])) # Check if all of the completed files are here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunks # How each chunk looks like"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.documents import Document\n",
+    "# How pinecone expects each chunk to be\n",
+    "docs = [Document(page_content=chunk['page_content'],metadata={\"source\":chunk['metadata'].filename}) for chunk in chunks]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for doc in docs:\n",
+    "    print(doc.page_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now add all of the docs to the Pinecone index\n",
+    "from uuid import NAMESPACE_URL, uuid5  # deterministic ids: re-running this cell overwrites vectors instead of duplicating them\n",
+    "uuids = [str(uuid5(NAMESPACE_URL, f\"{doc.metadata['source']}:{doc.page_content}\")) for doc in docs]\n",
+    "batch_size = 200\n",
+    "for i in range(0, len(docs), batch_size):\n",
+    "    print(\"Current Batch Index is:\",i)\n",
+    "    batch = docs[i:i+batch_size]\n",
+    "    batch_ids = uuids[i:i+batch_size]\n",
+    "    vector_store.add_documents(batch,ids=batch_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",k=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From fc5f8a31c7f718cd32e1756a7d0b6eae9c507067 Mon Sep 17 00:00:00 2001
From: Muhammad Ijaz <muhammad.bese23seecs@seecs.edu.pk>
Date: Tue, 22 Jul 2025 17:17:35 +0500
Subject: [PATCH 2/2] Add RAG implementation using OCR, (cleared outputs)

---
 .../{RAG-using-OCR.ipynb => Pinecone-RAG-using-OCR.ipynb}     | 4 ++++
 1 file changed, 4 insertions(+)
 rename week5/community-contributions/{RAG-using-OCR.ipynb => Pinecone-RAG-using-OCR.ipynb} (96%)

diff --git a/week5/community-contributions/RAG-using-OCR.ipynb b/week5/community-contributions/Pinecone-RAG-using-OCR.ipynb
similarity index 96%
rename from week5/community-contributions/RAG-using-OCR.ipynb
rename to week5/community-contributions/Pinecone-RAG-using-OCR.ipynb
index 5f6e4d6..e5362ec 100644
--- a/week5/community-contributions/RAG-using-OCR.ipynb
+++ b/week5/community-contributions/Pinecone-RAG-using-OCR.ipynb
@@ -4,8 +4,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "\n",
+    "\n",
     "# Just before starting \n",
     "\n",
+    "A complete, step-by-step explanation of this notebook is available here: https://medium.com/@muhammad.bese23seecs/building-a-rag-powered-pinecone-database-using-ocr-a-practical-guide-with-pakistani-law-d83e869e1458\n",
+    "\n",
     "## Issues and Considerations\n",
     "\n",
     "This notebook requires a few installation to run. First is tesseract:\n",