Add notebook for creating a RAG database using OCR, by Muhammad (output cleared); only the file has been added

This commit is contained in:
Muhammad Ijaz
2025-07-22 17:10:46 +05:00
parent 96b796dfc9
commit 1faa95ca15

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Just before starting \n",
"\n",
"## Issues and Considerations\n",
"\n",
"This notebook requires a few installation to run. First is tesseract:\n",
"\n",
"For windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n",
"\n",
"For Linux: run on the cli \"sudo apt-get install tesseract-ocr\"\n",
"\n",
"For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n",
"\n",
"\n",
"Next install pytesseract in your environment\n",
"\n",
"For uv: uv pip install pytesseract\n",
"\n",
"For pip install: pip install pytesseract\n",
"\n",
"\n",
"You would require an OpenAI API key and Pinecone API key in your .env file\n"
]
},
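{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a minimal sketch, assuming Tesseract and pytesseract are already installed), the cell below asks pytesseract for the Tesseract version; a TesseractNotFoundError means the binary is not on your PATH.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: raises TesseractNotFoundError if the Tesseract binary is\n",
"# missing, in which case revisit the installation links above\n",
"import pytesseract\n",
"print(pytesseract.get_tesseract_version())"
]
},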
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"# Install LangChain Unstructured (which requires unstructured under the hood)\n",
"\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import Pinecone\n",
"import getpass\n",
"from pinecone import Pinecone\n",
"import os\n",
"from pinecone import ServerlessSpec\n",
"from langchain_pinecone import PineconeVectorStore\n",
"from unstructured.partition.pdf import partition_pdf\n",
"import glob"
]
},
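{
"cell_type": "markdown",
"metadata": {},
"source": [
"The intro mentions keeping both API keys in a .env file, but nothing below reads that file; a minimal sketch (assuming the python-dotenv package is installed) that loads it into the process environment:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load OPENAI_API_KEY and PINECONE_API_KEY from a .env file in the working directory\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},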
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Initialize embeddings and Pinecone vector store\n",
"embeddings = OpenAIEmbeddings() #"
]
},
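{
"cell_type": "markdown",
"metadata": {},
"source": [
"A one-line check (a sketch) that embedding calls work and, with the default model, produce 1536-dimensional vectors, matching the index dimension used below:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Should print 1536, the dimension the Pinecone index is created with below\n",
"print(len(embeddings.embed_query(\"test\")))"
]
},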
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Pinecone client\n",
"if not os.getenv(\"PINECONE_API_KEY\"):\n",
" os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n",
"pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
"pc = Pinecone(api_key=pinecone_api_key)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Index Creation and its testing \n",
"index_name = \"lahore-cases\" # Replace the name with anything you like \n",
"if not pc.has_index(index_name):\n",
" pc.create_index(\n",
" name=index_name,\n",
" dimension=1536,\n",
" metric=\"cosine\",\n",
" spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
" )\n"
]
},
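{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, you can confirm the index was created with the expected settings; a sketch using the Pinecone client's describe_index:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: confirm the dimension (1536) and metric (cosine) of the new index\n",
"print(pc.describe_index(index_name))"
]
},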
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# vector store\n",
"index = pc.Index(index_name)\n",
"vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = glob.glob(pathname='./**/*.pdf',recursive=True) # I have set recursive = True so that we can check subdirectories too.\n",
"print(len(files)) # confirm that you have all of the pdfs here with the correct path"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"chunks = [] # The array to store the sections in \n",
"section_content = \"\"\n",
"index = 1\n",
"for file_path in files:\n",
" print(f\"File Number {index} completed:\",file_path) # To keep track of files\n",
" index+=1\n",
" elements = partition_pdf(file_path, languages=[\"eng\"],strategy=\"fast\")\n",
" for element in elements:\n",
" if element.category == 'NarrativeText': # meaning that it is simmple text \n",
" section_content+=element.text # Then append it to the already going section content\n",
" elif element.category==\"ListItem\":\n",
" chunks.append({\"page_content\":section_content,\"metadata\":element.metadata})\n",
" section_content=\"\" # Because a new sectionn has started\n",
" section_content += element.text # The string should start with the title of the text\n"
]
},
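{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before embedding, it is worth eyeballing the chunk sizes (a small sketch; very long chunks risk exceeding the embedding model's context window):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick distribution of chunk lengths in characters\n",
"lengths = [len(chunk[\"page_content\"]) for chunk in chunks]\n",
"print(f\"{len(chunks)} chunks; min={min(lengths)}, max={max(lengths)}, avg={sum(lengths) // len(lengths)}\")"
]
},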
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(set([chunk['metadata'].filename for chunk in chunks])) # Check if all of the completed files are here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chunks # How each chunk looks like"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"# How pinecone expects each chunk to be\n",
"docs = [Document(page_content=chunk['page_content'],metadata={\"source\":chunk['metadata'].filename}) for chunk in chunks]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in docs:\n",
" print(doc.page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now add all of the docs in the pinceone namespace\n",
"from uuid import uuid4\n",
"uuids = [str(uuid4()) for _ in range(len(docs))]\n",
"batch_size = 200\n",
"for i in range(0, len(docs), batch_size):\n",
" print(\"Current Batch Index is:\",i)\n",
" batch = docs[i:i+batch_size]\n",
" batch_ids = uuids[i:i+batch_size]\n",
" vector_store.add_documents(batch,ids=batch_ids)"
]
},
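{
"cell_type": "markdown",
"metadata": {},
"source": [
"To verify the upload landed, a sketch that checks the vector count via describe_index_stats (the count may lag for a few seconds after upserting):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: total_vector_count should roughly match len(docs)\n",
"print(index.describe_index_stats())"
]
},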
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",k=10)"
]
},
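{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you also want to see how relevant each hit is, a sketch using similarity_search_with_score, which returns (Document, score) pairs:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same query, but with cosine similarity scores attached to each hit\n",
"scored = vector_store.similarity_search_with_score(\n",
"    query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\",\n",
"    k=3,\n",
")\n",
"for doc, score in scored:\n",
"    print(f\"{score:.3f}  {doc.metadata['source']}\")"
]
},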
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}