{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Just before starting\n",
    "\n",
    "A fully explained walkthrough of this notebook is available here: https://medium.com/@muhammad.bese23seecs/building-a-rag-powered-pinecone-database-using-ocr-a-practical-guide-with-pakistani-law-d83e869e1458\n",
    "\n",
    "## Issues and Considerations\n",
    "\n",
    "This notebook requires a few installations to run. The first is Tesseract:\n",
    "\n",
    "For Windows: https://stackoverflow.com/questions/46140485/tesseract-installation-in-windows\n",
    "\n",
    "For Linux: run \"sudo apt-get install tesseract-ocr\" on the CLI\n",
    "\n",
    "For Mac: https://www.oreilly.com/library/view/building-computer-vision/9781838644673/95de5b35-436b-4668-8ca2-44970a6e2924.xhtml\n",
    "\n",
    "Next, install pytesseract in your environment:\n",
    "\n",
    "For uv: uv pip install pytesseract\n",
    "\n",
    "For pip: pip install pytesseract\n",
    "\n",
    "You will also need an OpenAI API key and a Pinecone API key in your .env file.\n"
   ]
  },
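  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check for the installation notes above. This is a minimal sketch, not part of\n",
    "# the original walkthrough; it assumes pytesseract is already installed in this environment and\n",
    "# simply asks it for the Tesseract version, which fails if the binary is not on the PATH.\n",
    "import pytesseract\n",
    "\n",
    "try:\n",
    "    print(\"Tesseract version:\", pytesseract.get_tesseract_version())\n",
    "except pytesseract.TesseractNotFoundError:\n",
    "    print(\"Tesseract binary not found; revisit the install links above.\")"
   ]
  },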
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports: LangChain embeddings, the Pinecone client and vector store, and Unstructured's\n",
    "# PDF partitioner (install langchain-unstructured, which pulls in unstructured under the hood)\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "import getpass\n",
    "import os\n",
    "import glob\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "from langchain_pinecone import PineconeVectorStore\n",
    "from unstructured.partition.pdf import partition_pdf"
   ]
  },
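  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the API keys mentioned in the intro. A minimal sketch, assuming the keys live in a local\n",
    "# .env file and that the python-dotenv package is installed; adapt if you export the variables\n",
    "# some other way. OpenAIEmbeddings reads OPENAI_API_KEY from the environment.\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()  # picks up OPENAI_API_KEY and PINECONE_API_KEY from .env, if present\n",
    "if not os.getenv(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")"
   ]
  },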
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the OpenAI embeddings used for both indexing and querying\n",
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinecone client\n",
    "if not os.getenv(\"PINECONE_API_KEY\"):\n",
    "    os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n",
    "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
    "pc = Pinecone(api_key=pinecone_api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the index if it does not already exist\n",
    "index_name = \"lahore-cases\"  # Replace the name with anything you like\n",
    "if not pc.has_index(index_name):\n",
    "    pc.create_index(\n",
    "        name=index_name,\n",
    "        dimension=1536,  # matches the dimension of the default OpenAIEmbeddings model\n",
    "        metric=\"cosine\",\n",
    "        spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n",
    "    )\n"
   ]
  },
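  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: wait for the serverless index to report ready before using it. A small sketch\n",
    "# following the usual Pinecone quickstart pattern, not part of the original walkthrough.\n",
    "import time\n",
    "\n",
    "while not pc.describe_index(index_name).status[\"ready\"]:\n",
    "    time.sleep(1)\n",
    "print(pc.Index(index_name).describe_index_stats())"
   ]
  },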
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vector store\n",
    "index = pc.Index(index_name)\n",
    "vector_store = PineconeVectorStore(index=index, embedding=embeddings)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = glob.glob(pathname='./**/*.pdf', recursive=True)  # recursive=True so that subdirectories are searched too\n",
    "print(len(files))  # confirm that you have all of the PDFs here with the correct path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = []  # The array to store the sections in\n",
    "section_content = \"\"\n",
    "for file_number, file_path in enumerate(files, start=1):\n",
    "    print(f\"Processing file {file_number}:\", file_path)  # To keep track of files\n",
    "    elements = partition_pdf(file_path, languages=[\"eng\"], strategy=\"fast\")\n",
    "    for element in elements:\n",
    "        if element.category == 'NarrativeText':  # meaning that it is simple text\n",
    "            section_content += element.text  # Then append it to the section content built so far\n",
    "        elif element.category == \"ListItem\":\n",
    "            chunks.append({\"page_content\": section_content, \"metadata\": element.metadata})\n",
    "            section_content = \"\"  # Because a new section has started\n",
    "            section_content += element.text  # The string should start with the title of the text\n",
    "\n",
    "# Flush whatever remains after the last file so the final section is not lost\n",
    "if section_content and files and elements:\n",
    "    chunks.append({\"page_content\": section_content, \"metadata\": elements[-1].metadata})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(set([chunk['metadata'].filename for chunk in chunks]))  # Check that all of the processed files are represented here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks  # What each chunk looks like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_core.documents import Document\n",
    "# Wrap each chunk as a LangChain Document, the format the Pinecone vector store expects\n",
    "docs = [Document(page_content=chunk['page_content'], metadata={\"source\": chunk['metadata'].filename}) for chunk in chunks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for doc in docs:\n",
    "    print(doc.page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now add all of the docs to the Pinecone namespace in batches\n",
    "from uuid import uuid4\n",
    "\n",
    "uuids = [str(uuid4()) for _ in range(len(docs))]\n",
    "batch_size = 200\n",
    "for i in range(0, len(docs), batch_size):\n",
    "    print(\"Current Batch Index is:\", i)\n",
    "    batch = docs[i:i+batch_size]\n",
    "    batch_ids = uuids[i:i+batch_size]\n",
    "    vector_store.add_documents(batch, ids=batch_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = vector_store.similarity_search(query=\"Which act contains the words 'nothing from this act can be removed from the railways ..\", k=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}