Add a notebook that creates a RAG database using OCR, by Muhammad (output cleared); only the file has been added
229 week5/community-contributions/RAG-using-OCR.ipynb Normal file
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Just before starting\n",
"\n",
"## Issues and Considerations\n",
"\n",
"This notebook requires a few installations to run. The first is Tesseract:\n",
"\n",
"For Windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n",
"\n",
"For Linux: run \"sudo apt-get install tesseract-ocr\" on the CLI\n",
"\n",
"For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n",
"\n",
"\n",
"Next, install pytesseract in your environment:\n",
"\n",
"For uv: uv pip install pytesseract\n",
"\n",
"For pip: pip install pytesseract\n",
"\n",
"\n",
"You will also need an OpenAI API key and a Pinecone API key in your .env file\n"
]
},
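{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, a minimal sketch: confirm the Tesseract binary is\n",
"# reachable on PATH before partitioning any PDFs below.\n",
"import shutil\n",
"\n",
"if shutil.which(\"tesseract\") is None:\n",
"    print(\"Warning: tesseract was not found on PATH - see the install links above\")\n",
"else:\n",
"    print(\"tesseract found at:\", shutil.which(\"tesseract\"))"
]
},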
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"# Imports - LangChain's Unstructured integration requires the unstructured package under the hood\n",
"\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"import getpass\n",
"from pinecone import Pinecone\n",
"import os\n",
"from pinecone import ServerlessSpec\n",
"from langchain_pinecone import PineconeVectorStore\n",
"from unstructured.partition.pdf import partition_pdf\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the embeddings (the Pinecone vector store is set up below)\n",
"embeddings = OpenAIEmbeddings()"
]
},
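{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: the default OpenAIEmbeddings model returns 1536-dimensional\n",
"# vectors, which should match the dimension used when creating the index below.\n",
"print(len(embeddings.embed_query(\"dimension check\")))  # expect 1536"
]
},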
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"# Pinecone client\n",
"if not os.getenv(\"PINECONE_API_KEY\"):\n",
"    os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n",
"pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
"pc = Pinecone(api_key=pinecone_api_key)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"# Index creation and testing\n",
"index_name = \"lahore-cases\"  # Replace the name with anything you like\n",
"if not pc.has_index(index_name):\n",
"    pc.create_index(\n",
"        name=index_name,\n",
"        dimension=1536,\n",
"        metric=\"cosine\",\n",
"        spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
"    )\n"
]
},
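{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional test of the index just created: describe it to confirm the expected\n",
"# dimension and metric before writing any vectors.\n",
"pc.describe_index(index_name)"
]
},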
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# Vector store\n",
"index = pc.Index(index_name)\n",
"vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = glob.glob(pathname='./**/*.pdf', recursive=True)  # recursive=True so that subdirectories are checked too\n",
"print(len(files))  # Confirm that all of the PDFs are here with the correct path"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"chunks = []  # The list to store the sections in\n",
"section_content = \"\"\n",
"file_number = 1  # Renamed from 'index' so it does not shadow the Pinecone index above\n",
"for file_path in files:\n",
"    print(f\"Processing file number {file_number}:\", file_path)  # To keep track of files\n",
"    file_number += 1\n",
"    elements = partition_pdf(file_path, languages=[\"eng\"], strategy=\"fast\")\n",
"    for element in elements:\n",
"        if element.category == 'NarrativeText':  # Meaning that it is simple text\n",
"            section_content += element.text  # Then append it to the section content so far\n",
"        elif element.category == \"ListItem\":\n",
"            chunks.append({\"page_content\": section_content, \"metadata\": element.metadata})\n",
"            section_content = \"\"  # Because a new section has started\n",
"            section_content += element.text  # The string should start with the title of the text\n",
"# Flush any text left over after the last ListItem so the final section is not lost\n",
"if section_content and elements:\n",
"    chunks.append({\"page_content\": section_content, \"metadata\": elements[-1].metadata})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(set([chunk['metadata'].filename for chunk in chunks]))  # Check that all of the completed files are here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chunks  # What each chunk looks like"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"# How Pinecone expects each chunk to be shaped\n",
"docs = [Document(page_content=chunk['page_content'], metadata={\"source\": chunk['metadata'].filename}) for chunk in chunks]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in docs:\n",
"    print(doc.page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now add all of the docs to the Pinecone namespace in batches\n",
"from uuid import uuid4\n",
"uuids = [str(uuid4()) for _ in range(len(docs))]\n",
"batch_size = 200\n",
"for i in range(0, len(docs), batch_size):\n",
"    print(\"Current Batch Index is:\", i)\n",
"    batch = docs[i:i+batch_size]\n",
"    batch_ids = uuids[i:i+batch_size]\n",
"    vector_store.add_documents(batch, ids=batch_ids)"
]
},
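{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional follow-up: the stats should report roughly len(docs) vectors once the\n",
"# batches above are ingested (serverless counts can lag for a few seconds).\n",
"index.describe_index_stats()"
]
},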
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\", k=10)"
]
},
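{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One way to read the results: similarity_search returns LangChain Documents, so\n",
"# page_content holds the chunk text and metadata[\"source\"] the PDF it came from.\n",
"for doc in res:\n",
"    print(doc.metadata[\"source\"], \"->\", doc.page_content[:200])"
]
},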
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}