From 1faa95ca1504545797fd0933c8bd6c47c8ddc4ff Mon Sep 17 00:00:00 2001 From: Muhammad Ijaz Date: Tue, 22 Jul 2025 17:10:46 +0500 Subject: [PATCH 1/2] Add creating a RAG database using OCR by Muhammad (cleared output), only the file has been added --- .../RAG-using-OCR.ipynb | 229 ++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 week5/community-contributions/RAG-using-OCR.ipynb diff --git a/week5/community-contributions/RAG-using-OCR.ipynb b/week5/community-contributions/RAG-using-OCR.ipynb new file mode 100644 index 0000000..5f6e4d6 --- /dev/null +++ b/week5/community-contributions/RAG-using-OCR.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Just before starting \n", + "\n", + "## Issues and Considerations\n", + "\n", + "This notebook requires a few installation to run. First is tesseract:\n", + "\n", + "For windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n", + "\n", + "For Linux: run on the cli \"sudo apt-get install tesseract-ocr\"\n", + "\n", + "For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n", + "\n", + "\n", + "Next install pytesseract in your environment\n", + "\n", + "For uv: uv pip install pytesseract\n", + "\n", + "For pip install: pip install pytesseract\n", + "\n", + "\n", + "You would require an OpenAI API key and Pinecone API key in your .env file\n" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "# Install LangChain Unstructured (which requires unstructured under the hood)\n", + "\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import Pinecone\n", + "import getpass\n", + "from pinecone import Pinecone\n", + "import os\n", + "from pinecone import ServerlessSpec\n", + "from langchain_pinecone import PineconeVectorStore\n", + "from unstructured.partition.pdf import partition_pdf\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize embeddings and Pinecone vector store\n", + "embeddings = OpenAIEmbeddings() #" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Pinecone client\n", + "if not os.getenv(\"PINECONE_API_KEY\"):\n", + " os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n", + "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n", + "pc = Pinecone(api_key=pinecone_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "# Index Creation and its testing \n", + "index_name = \"lahore-cases\" # Replace the name with anything you like \n", + "if not pc.has_index(index_name):\n", + " pc.create_index(\n", + " name=index_name,\n", + " dimension=1536,\n", + " metric=\"cosine\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "# vector store\n", + "index = pc.Index(index_name)\n", + "vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = glob.glob(pathname='./**/*.pdf',recursive=True) # I have set recursive = True so that we can check subdirectories too.\n", + "print(len(files)) # confirm that you have all of the pdfs here with the correct path" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "chunks = [] # The array to store the sections in \n", + "section_content = \"\"\n", + "index = 1\n", + "for file_path in files:\n", + " print(f\"File Number {index} completed:\",file_path) # To keep track of files\n", + " index+=1\n", + " elements = partition_pdf(file_path, languages=[\"eng\"],strategy=\"fast\")\n", + " for element in elements:\n", + " if element.category == 'NarrativeText': # meaning that it is simmple text \n", + " section_content+=element.text # Then append it to the already going section content\n", + " elif element.category==\"ListItem\":\n", + " chunks.append({\"page_content\":section_content,\"metadata\":element.metadata})\n", + " section_content=\"\" # Because a new sectionn has started\n", + " section_content += element.text # The string should start with the title of the text\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(set([chunk['metadata'].filename for chunk in chunks])) # Check if all of the completed files are here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chunks # How each chunk looks like" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.documents import Document\n", + "# How pinecone expects each chunk to be\n", + "docs = [Document(page_content=chunk['page_content'],metadata={\"source\":chunk['metadata'].filename}) for chunk in chunks]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for doc in docs:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now add all of the docs in the pinceone namespace\n", + "from uuid import uuid4\n", + "uuids = [str(uuid4()) for _ in range(len(docs))]\n", + "batch_size = 200\n", + "for i in range(0, len(docs), batch_size):\n", + " print(\"Current Batch Index is:\",i)\n", + " batch = docs[i:i+batch_size]\n", + " batch_ids = uuids[i:i+batch_size]\n", + " vector_store.add_documents(batch,ids=batch_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",k=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From fc5f8a31c7f718cd32e1756a7d0b6eae9c507067 Mon Sep 17 00:00:00 2001 From: Muhammad Ijaz Date: Tue, 22 Jul 2025 17:17:35 +0500 Subject: [PATCH 2/2] Add RAG implementation using OCR, (cleared outputs) --- .../{RAG-using-OCR.ipynb => Pinecone-RAG-using-OCR.ipynb} | 4 ++++ 1 file changed, 4 insertions(+) rename week5/community-contributions/{RAG-using-OCR.ipynb => Pinecone-RAG-using-OCR.ipynb} (96%) diff --git a/week5/community-contributions/RAG-using-OCR.ipynb b/week5/community-contributions/Pinecone-RAG-using-OCR.ipynb similarity index 96% rename from week5/community-contributions/RAG-using-OCR.ipynb rename to week5/community-contributions/Pinecone-RAG-using-OCR.ipynb index 5f6e4d6..e5362ec 100644 --- a/week5/community-contributions/RAG-using-OCR.ipynb +++ b/week5/community-contributions/Pinecone-RAG-using-OCR.ipynb @@ -4,8 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "\n", + "\n", "# Just before starting \n", "\n", + "You can check completely explained working here: https://medium.com/@muhammad.bese23seecs/building-a-rag-powered-pinecone-database-using-ocr-a-practical-guide-with-pakistani-law-d83e869e1458\n", + "\n", "## Issues and Considerations\n", "\n", "This notebook requires a few installation to run. First is tesseract:\n",