Merge pull request #479 from bluebells1/sm-branch-wk5
Week5 Final Project: RAG solution with multi doctype KB and batch embeddings
This commit is contained in:
552
week5/community-contributions/Wk5-final-multi-doc-type-KB.ipynb
Normal file
552
week5/community-contributions/Wk5-final-multi-doc-type-KB.ipynb
Normal file
@@ -0,0 +1,552 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "61777022-631c-4db0-afeb-70d8d22bc07b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Summary:\n",
|
||||||
|
"This is the project from week 5. The intention was to create a vector db of my own files (from an external drive) which can be used in a RAG solution.\n",
|
||||||
|
"This includes a number of file types (docx, pdf, txt, epub...) and includes the ability to exclude folders.\n",
|
||||||
|
"With the OpenAI embeddings API limit of 300k tokens, it was also necessary to create a batch embeddings process so that there were multiple requests.\n",
|
||||||
|
"This was based on estimating the tokens with a text to token rate of 1:4, however it wasn't perfect and one of the batches still exceeded the 300k limit when running.\n",
|
||||||
|
"I found that the responses from the llm were terrible in the end! I tried playing about with chunk sizes and the minimum # of chunks by llangchain and it did improve but was not fantastic. I also ensured the metadata was sent with each chunk to help.\n",
|
||||||
|
"This really highlighted the real world challenges of implementing RAG!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d78ef79d-e564-4c56-82f3-0485e4bf6986",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install docx2txt\n",
|
||||||
|
"!pip install ebooklib\n",
|
||||||
|
"!pip install python-pptx\n",
|
||||||
|
"!pip install pypdf"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9ec98119-456f-450c-a9a2-f375d74f5ce5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# imports\n",
|
||||||
|
"\n",
|
||||||
|
"import os\n",
|
||||||
|
"import requests\n",
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"import glob\n",
|
||||||
|
"import gradio as gr\n",
|
||||||
|
"import time\n",
|
||||||
|
"from typing import List"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ac14410b-8c3c-4cf5-900e-fd4c33cdf2b2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# imports for langchain, plotly and Chroma\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.document_loaders import (\n",
|
||||||
|
" DirectoryLoader,\n",
|
||||||
|
" Docx2txtLoader,\n",
|
||||||
|
" TextLoader,\n",
|
||||||
|
" PyPDFLoader,\n",
|
||||||
|
" UnstructuredExcelLoader,\n",
|
||||||
|
" BSHTMLLoader\n",
|
||||||
|
")\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
|
||||||
|
"from langchain.schema import Document\n",
|
||||||
|
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
|
||||||
|
"from langchain_chroma import Chroma\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from sklearn.manifold import TSNE\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import plotly.graph_objects as go\n",
|
||||||
|
"from langchain.memory import ConversationBufferMemory\n",
|
||||||
|
"from langchain.chains import ConversationalRetrievalChain\n",
|
||||||
|
"from langchain.embeddings import HuggingFaceEmbeddings"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3be698e7-71e1-4c75-9696-e1651e4bf357",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"MODEL = \"gpt-4o-mini\"\n",
|
||||||
|
"db_name = \"vector_db\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6f850068-c05b-4526-9494-034b0077347e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load environment variables in a file called .env\n",
|
||||||
|
"\n",
|
||||||
|
"load_dotenv(override=True)\n",
|
||||||
|
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0c5baad2-2033-40a6-8ebd-5861b5cf4350",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# handling epubs\n",
|
||||||
|
"\n",
|
||||||
|
"from ebooklib import epub\n",
|
||||||
|
"from bs4 import BeautifulSoup\n",
|
||||||
|
"from langchain.document_loaders.base import BaseLoader\n",
|
||||||
|
"\n",
|
||||||
|
"class EpubLoader(BaseLoader):\n",
|
||||||
|
" def __init__(self, file_path: str):\n",
|
||||||
|
" self.file_path = file_path\n",
|
||||||
|
"\n",
|
||||||
|
" def load(self) -> list[Document]:\n",
|
||||||
|
" book = epub.read_epub(self.file_path)\n",
|
||||||
|
" text = ''\n",
|
||||||
|
" for item in book.get_items():\n",
|
||||||
|
" if item.get_type() == epub.EpubHtml:\n",
|
||||||
|
" soup = BeautifulSoup(item.get_content(), 'html.parser')\n",
|
||||||
|
" text += soup.get_text() + '\\n'\n",
|
||||||
|
"\n",
|
||||||
|
" return [Document(page_content=text, metadata={\"source\": self.file_path})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bd8b0e4e-d698-4484-bc94-d8b753f386cc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# handling pptx\n",
|
||||||
|
"\n",
|
||||||
|
"from pptx import Presentation\n",
|
||||||
|
"\n",
|
||||||
|
"class PptxLoader(BaseLoader):\n",
|
||||||
|
" def __init__(self, file_path: str):\n",
|
||||||
|
" self.file_path = file_path\n",
|
||||||
|
"\n",
|
||||||
|
" def load(self) -> list[Document]:\n",
|
||||||
|
" prs = Presentation(self.file_path)\n",
|
||||||
|
" text = ''\n",
|
||||||
|
" for slide in prs.slides:\n",
|
||||||
|
" for shape in slide.shapes:\n",
|
||||||
|
" if hasattr(shape, \"text\") and shape.text:\n",
|
||||||
|
" text += shape.text + '\\n'\n",
|
||||||
|
"\n",
|
||||||
|
" return [Document(page_content=text, metadata={\"source\": self.file_path})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b222b01d-6040-4ff3-a0e3-290819cfe94b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Class based version of document loader which can be expanded more easily for other document types. (Currently includes file types: docx, txt (windows encoding), xlsx, pdfs, epubs, pptx)\n",
|
||||||
|
"\n",
|
||||||
|
"class DocumentLoader:\n",
|
||||||
|
" \"\"\"A clean, extensible document loader for multiple file types.\"\"\"\n",
|
||||||
|
" \n",
|
||||||
|
" def __init__(self, base_path=\"D:/*\", exclude_folders=None):\n",
|
||||||
|
" self.base_path = base_path\n",
|
||||||
|
" self.documents = []\n",
|
||||||
|
" self.exclude_folders = exclude_folders or []\n",
|
||||||
|
" \n",
|
||||||
|
" # Configuration for different file types\n",
|
||||||
|
" self.loader_config = {\n",
|
||||||
|
" 'docx': {\n",
|
||||||
|
" 'loader_cls': Docx2txtLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.docx\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" },\n",
|
||||||
|
" 'txt': {\n",
|
||||||
|
" 'loader_cls': TextLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.txt\",\n",
|
||||||
|
" 'loader_kwargs': {\"encoding\": \"cp1252\"},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" },\n",
|
||||||
|
" 'pdf': {\n",
|
||||||
|
" 'loader_cls': PyPDFLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.pdf\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" },\n",
|
||||||
|
" 'xlsx': {\n",
|
||||||
|
" 'loader_cls': UnstructuredExcelLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.xlsx\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" },\n",
|
||||||
|
" 'html': {\n",
|
||||||
|
" 'loader_cls': BSHTMLLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.html\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" },\n",
|
||||||
|
" 'epub': {\n",
|
||||||
|
" 'loader_cls': EpubLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.epub\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': self._process_epub_metadata\n",
|
||||||
|
" },\n",
|
||||||
|
" 'pptx': {\n",
|
||||||
|
" 'loader_cls': PptxLoader,\n",
|
||||||
|
" 'glob_pattern': \"**/*.pptx\",\n",
|
||||||
|
" 'loader_kwargs': {},\n",
|
||||||
|
" 'post_process': None\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
" \n",
|
||||||
|
" def _get_epub_metadata(self, file_path):\n",
|
||||||
|
" \"\"\"Extract metadata from EPUB files.\"\"\"\n",
|
||||||
|
" try:\n",
|
||||||
|
" book = epub.read_epub(file_path)\n",
|
||||||
|
" title = book.get_metadata('DC', 'title')[0][0] if book.get_metadata('DC', 'title') else None\n",
|
||||||
|
" author = book.get_metadata('DC', 'creator')[0][0] if book.get_metadata('DC', 'creator') else None\n",
|
||||||
|
" return title, author\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\"Error extracting EPUB metadata: {e}\")\n",
|
||||||
|
" return None, None\n",
|
||||||
|
" \n",
|
||||||
|
" def _process_epub_metadata(self, doc) -> None:\n",
|
||||||
|
" \"\"\"Post-process EPUB documents to add metadata.\"\"\"\n",
|
||||||
|
" title, author = self._get_epub_metadata(doc.metadata['source'])\n",
|
||||||
|
" doc.metadata[\"author\"] = author\n",
|
||||||
|
" doc.metadata[\"title\"] = title\n",
|
||||||
|
" \n",
|
||||||
|
" def _load_file_type(self, folder, file_type, config):\n",
|
||||||
|
" \"\"\"Load documents of a specific file type from a folder.\"\"\"\n",
|
||||||
|
" try:\n",
|
||||||
|
" loader = DirectoryLoader(\n",
|
||||||
|
" folder, \n",
|
||||||
|
" glob=config['glob_pattern'], \n",
|
||||||
|
" loader_cls=config['loader_cls'],\n",
|
||||||
|
" loader_kwargs=config['loader_kwargs']\n",
|
||||||
|
" )\n",
|
||||||
|
" docs = loader.load()\n",
|
||||||
|
" print(f\" Found {len(docs)} .{file_type} files\")\n",
|
||||||
|
" \n",
|
||||||
|
" # Apply post-processing if defined\n",
|
||||||
|
" if config['post_process']:\n",
|
||||||
|
" for doc in docs:\n",
|
||||||
|
" config['post_process'](doc)\n",
|
||||||
|
" \n",
|
||||||
|
" return docs\n",
|
||||||
|
" \n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\" Error loading .{file_type} files: {e}\")\n",
|
||||||
|
" return []\n",
|
||||||
|
" \n",
|
||||||
|
" def load_all(self):\n",
|
||||||
|
" \"\"\"Load all documents from configured folders.\"\"\"\n",
|
||||||
|
" all_folders = [f for f in glob.glob(self.base_path) if os.path.isdir(f)]\n",
|
||||||
|
"\n",
|
||||||
|
" #filter out excluded folders\n",
|
||||||
|
" folders = []\n",
|
||||||
|
" for folder in all_folders:\n",
|
||||||
|
" folder_name = os.path.basename(folder)\n",
|
||||||
|
" if folder_name not in self.exclude_folders:\n",
|
||||||
|
" folders.append(folder)\n",
|
||||||
|
" else:\n",
|
||||||
|
" print(f\"Excluded folder: {folder_name}\")\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"Scanning folders (directories only):\", folders)\n",
|
||||||
|
" \n",
|
||||||
|
" self.documents = []\n",
|
||||||
|
" \n",
|
||||||
|
" for folder in folders:\n",
|
||||||
|
" doc_type = os.path.basename(folder)\n",
|
||||||
|
" print(f\"\\nProcessing folder: {doc_type}\")\n",
|
||||||
|
" \n",
|
||||||
|
" for file_type, config in self.loader_config.items():\n",
|
||||||
|
" docs = self._load_file_type(folder, file_type, config)\n",
|
||||||
|
" \n",
|
||||||
|
" # Add doc_type metadata to all documents\n",
|
||||||
|
" for doc in docs:\n",
|
||||||
|
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||||
|
" self.documents.append(doc)\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"\\nTotal documents loaded: {len(self.documents)}\")\n",
|
||||||
|
" return self.documents\n",
|
||||||
|
" \n",
|
||||||
|
" def add_file_type(self, extension, loader_cls, glob_pattern=None, \n",
|
||||||
|
" loader_kwargs=None, post_process=None):\n",
|
||||||
|
" \"\"\"Add support for a new file type.\"\"\"\n",
|
||||||
|
" self.loader_config[extension] = {\n",
|
||||||
|
" 'loader_cls': loader_cls,\n",
|
||||||
|
" 'glob_pattern': glob_pattern or f\"**/*.{extension}\",\n",
|
||||||
|
" 'loader_kwargs': loader_kwargs or {},\n",
|
||||||
|
" 'post_process': post_process\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
"# load\n",
|
||||||
|
"loader = DocumentLoader(\"D:/*\", exclude_folders=[\"Music\", \"Online Courses\", \"Fitness\"])\n",
|
||||||
|
"documents = loader.load_all()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3fd43a4f-b623-4b08-89eb-27d3b3ba0f62",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# create batches (this was required as the # of tokens was exceed the openai request limit)\n",
|
||||||
|
"\n",
|
||||||
|
"def estimate_tokens(text, chars_per_token=4):\n",
|
||||||
|
" \"\"\"Rough estimate of tokens from character count.\"\"\"\n",
|
||||||
|
" return len(text) // chars_per_token\n",
|
||||||
|
"\n",
|
||||||
|
"def create_batches(chunks, max_tokens_per_batch=250000):\n",
|
||||||
|
" batches = []\n",
|
||||||
|
" current_batch = []\n",
|
||||||
|
" current_tokens = 0\n",
|
||||||
|
" \n",
|
||||||
|
" for chunk in chunks:\n",
|
||||||
|
" chunk_tokens = estimate_tokens(chunk.page_content)\n",
|
||||||
|
" \n",
|
||||||
|
" # If adding this chunk would exceed the limit, start a new batch\n",
|
||||||
|
" if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch:\n",
|
||||||
|
" batches.append(current_batch)\n",
|
||||||
|
" current_batch = [chunk]\n",
|
||||||
|
" current_tokens = chunk_tokens\n",
|
||||||
|
" else:\n",
|
||||||
|
" current_batch.append(chunk)\n",
|
||||||
|
" current_tokens += chunk_tokens\n",
|
||||||
|
" \n",
|
||||||
|
" # Add the last batch if it has content\n",
|
||||||
|
" if current_batch:\n",
|
||||||
|
" batches.append(current_batch)\n",
|
||||||
|
" \n",
|
||||||
|
" return batches\n",
|
||||||
|
"\n",
|
||||||
|
"def create_vectorstore_with_progress(chunks, embeddings, db_name, batch_size_tokens=250000):\n",
|
||||||
|
" \n",
|
||||||
|
" # Delete existing database if it exists\n",
|
||||||
|
" if os.path.exists(db_name):\n",
|
||||||
|
" print(f\"Deleting existing database: {db_name}\")\n",
|
||||||
|
" Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
|
||||||
|
" \n",
|
||||||
|
" # Create batches\n",
|
||||||
|
" batches = create_batches(chunks, batch_size_tokens)\n",
|
||||||
|
" print(f\"Created {len(batches)} batches from {len(chunks)} chunks\")\n",
|
||||||
|
" \n",
|
||||||
|
" # Show batch sizes\n",
|
||||||
|
" for i, batch in enumerate(batches):\n",
|
||||||
|
" total_chars = sum(len(chunk.page_content) for chunk in batch)\n",
|
||||||
|
" estimated_tokens = estimate_tokens(''.join(chunk.page_content for chunk in batch))\n",
|
||||||
|
" print(f\" Batch {i+1}: {len(batch)} chunks, ~{estimated_tokens:,} tokens\")\n",
|
||||||
|
" \n",
|
||||||
|
" vectorstore = None\n",
|
||||||
|
" successful_batches = 0\n",
|
||||||
|
" failed_batches = 0\n",
|
||||||
|
" \n",
|
||||||
|
" for i, batch in enumerate(batches):\n",
|
||||||
|
" print(f\"\\n{'='*50}\")\n",
|
||||||
|
" print(f\"Processing batch {i+1}/{len(batches)}\")\n",
|
||||||
|
" print(f\"{'='*50}\")\n",
|
||||||
|
" \n",
|
||||||
|
" try:\n",
|
||||||
|
" start_time = time.time()\n",
|
||||||
|
" \n",
|
||||||
|
" if vectorstore is None:\n",
|
||||||
|
" # Create the initial vectorstore\n",
|
||||||
|
" vectorstore = Chroma.from_documents(\n",
|
||||||
|
" documents=batch,\n",
|
||||||
|
" embedding=embeddings,\n",
|
||||||
|
" persist_directory=db_name\n",
|
||||||
|
" )\n",
|
||||||
|
" print(f\"Created initial vectorstore with {len(batch)} documents\")\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Add to existing vectorstore\n",
|
||||||
|
" vectorstore.add_documents(batch)\n",
|
||||||
|
" print(f\"Added {len(batch)} documents to vectorstore\")\n",
|
||||||
|
" \n",
|
||||||
|
" successful_batches += 1\n",
|
||||||
|
" elapsed = time.time() - start_time\n",
|
||||||
|
" print(f\"Processed in {elapsed:.1f} seconds\")\n",
|
||||||
|
" print(f\"Total documents in vectorstore: {vectorstore._collection.count()}\")\n",
|
||||||
|
" \n",
|
||||||
|
" # Rate limiting delay\n",
|
||||||
|
" time.sleep(2)\n",
|
||||||
|
" \n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" failed_batches += 1\n",
|
||||||
|
" print(f\"Error processing batch {i+1}: {e}\")\n",
|
||||||
|
" print(f\"Continuing with next batch...\")\n",
|
||||||
|
" continue\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"\\n{'='*50}\")\n",
|
||||||
|
" print(f\"SUMMARY\")\n",
|
||||||
|
" print(f\"{'='*50}\")\n",
|
||||||
|
" print(f\"Successful batches: {successful_batches}/{len(batches)}\")\n",
|
||||||
|
" print(f\"Failed batches: {failed_batches}/{len(batches)}\")\n",
|
||||||
|
" \n",
|
||||||
|
" if vectorstore:\n",
|
||||||
|
" final_count = vectorstore._collection.count()\n",
|
||||||
|
" print(f\"Final vectorstore contains: {final_count} documents\")\n",
|
||||||
|
" return vectorstore\n",
|
||||||
|
" else:\n",
|
||||||
|
" print(\"Failed to create vectorstore\")\n",
|
||||||
|
" return None\n",
|
||||||
|
"\n",
|
||||||
|
"# include metadata\n",
|
||||||
|
"def add_metadata_to_content(doc: Document) -> Document:\n",
|
||||||
|
" metadata_lines = []\n",
|
||||||
|
" if \"doc_type\" in doc.metadata:\n",
|
||||||
|
" metadata_lines.append(f\"Document Type: {doc.metadata['doc_type']}\")\n",
|
||||||
|
" if \"title\" in doc.metadata:\n",
|
||||||
|
" metadata_lines.append(f\"Title: {doc.metadata['title']}\")\n",
|
||||||
|
" if \"author\" in doc.metadata:\n",
|
||||||
|
" metadata_lines.append(f\"Author: {doc.metadata['author']}\")\n",
|
||||||
|
" metadata_text = \"\\n\".join(metadata_lines)\n",
|
||||||
|
"\n",
|
||||||
|
" new_content = f\"{metadata_text}\\n\\n{doc.page_content}\"\n",
|
||||||
|
" return Document(page_content=new_content, metadata=doc.metadata)\n",
|
||||||
|
"\n",
|
||||||
|
"# Apply to all documents before chunking\n",
|
||||||
|
"documents_with_metadata = [add_metadata_to_content(doc) for doc in documents]\n",
|
||||||
|
"\n",
|
||||||
|
"# Chunking\n",
|
||||||
|
"text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)\n",
|
||||||
|
"chunks = text_splitter.split_documents(documents_with_metadata)\n",
|
||||||
|
"\n",
|
||||||
|
"# Embedding\n",
|
||||||
|
"embeddings = OpenAIEmbeddings()\n",
|
||||||
|
"\n",
|
||||||
|
"# Store in vector DB\n",
|
||||||
|
"print(\"Creating vectorstore in batches...\")\n",
|
||||||
|
"vectorstore = create_vectorstore_with_progress(\n",
|
||||||
|
" chunks=chunks,\n",
|
||||||
|
" embeddings=embeddings, \n",
|
||||||
|
" db_name=db_name,\n",
|
||||||
|
" batch_size_tokens=250000\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"if vectorstore:\n",
|
||||||
|
" print(f\"Successfully created vectorstore with {vectorstore._collection.count()} documents\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"Failed to create vectorstore\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "46c29b11-2ae3-4f6b-901d-5de67a09fd49",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# create a new Chat with OpenAI\n",
|
||||||
|
"llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
|
||||||
|
"\n",
|
||||||
|
"# set up the conversation memory for the chat\n",
|
||||||
|
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
|
||||||
|
"retriever = vectorstore.as_retriever(search_kwargs={\"k\": 200})\n",
|
||||||
|
"\n",
|
||||||
|
"# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory\n",
|
||||||
|
"conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "be163251-0dfa-4f50-ab05-43c6c0833405",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Wrapping that in a function\n",
|
||||||
|
"\n",
|
||||||
|
"def chat(question, history):\n",
|
||||||
|
" result = conversation_chain.invoke({\"question\": question})\n",
|
||||||
|
" return result[\"answer\"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a6320402-8213-47ec-8b05-dda234052274",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# And in Gradio:\n",
|
||||||
|
"\n",
|
||||||
|
"view = gr.ChatInterface(chat, type=\"messages\").launch(inbrowser=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "717e010b-8d7e-4a43-8cb1-9688ffdd76b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Let's investigate what gets sent behind the scenes\n",
|
||||||
|
"\n",
|
||||||
|
"# from langchain_core.callbacks import StdOutCallbackHandler\n",
|
||||||
|
"\n",
|
||||||
|
"# llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
|
||||||
|
"\n",
|
||||||
|
"# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# retriever = vectorstore.as_retriever(search_kwargs={\"k\": 200})\n",
|
||||||
|
"\n",
|
||||||
|
"# conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])\n",
|
||||||
|
"\n",
|
||||||
|
"# query = \"Can you name some authors?\"\n",
|
||||||
|
"# result = conversation_chain.invoke({\"question\": query})\n",
|
||||||
|
"# answer = result[\"answer\"]\n",
|
||||||
|
"# print(\"\\nAnswer:\", answer)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2333a77e-8d32-4cc2-8ae9-f8e7a979b3ae",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user