p-chat
This commit is contained in:
353
week5/community-contributions/ranskills-week5-p-chat.ipynb
Normal file
353
week5/community-contributions/ranskills-week5-p-chat.ipynb
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1f2969c8",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# P-Chat 🔒💬\n",
|
||||||
|
"\n",
|
||||||
|
"A privacy-focused bring-your-own-document (BYOD) solution that empowers you to leverage the power of LLMs to interact with your documents. Nothing is persisted, and it exists entirely in ephemeral memory.\n",
|
||||||
|
"\n",
|
||||||
|
"## Features\n",
|
||||||
|
"- Parent-child chunking used to enrich the context\n",
|
||||||
|
"- Chunk augmentation with some parent data for structured documents\n",
|
||||||
|
"- Streamed responses for better user experience\n",
|
||||||
|
"- Secure by design; no data is stored permanently\n",
|
||||||
|
"- Uses locally-running Ollama for total privacy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "df7609cf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain_ollama langchain_chroma langchain_community"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "144bdf7c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
import logging
import sys
import uuid
from enum import StrEnum
from pathlib import Path

import gradio as gr
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "dfdb143d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
# Configure a dedicated 'rag' logger that emits DEBUG-and-above records to
# stdout so they appear inline in the notebook output.
logger = logging.getLogger('rag')
logger.setLevel(logging.DEBUG)

# Guard against re-running this cell: attach the handler only once,
# otherwise every record would be printed once per attached handler.
if not logger.handlers:
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(stdout_handler)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0e2f176b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## RAG Pipeline"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "78f2a554",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
def pretty_print(l: list[Document | tuple[Document, float]]):
    """Log each retrieved item for inspection.

    Accepts either plain documents or ``(document, score)`` tuples (as
    returned by similarity search with scores); the score is logged when
    present.
    """
    for position, item in enumerate(l, start=1):
        # Visual separator between items in the debug output.
        logger.debug('-' * 80 + '\n')

        if isinstance(item, tuple):
            document, score = item
            logger.debug(f'{position}. characters: {len(document.page_content)}\n')
            logger.debug(f'Score: {score}\nMetadata: {document.metadata}\nContent: {document.page_content}')
        else:
            logger.debug(f'{position}. characters: {len(item.page_content)}\n')
            logger.debug(f'Metadata: {item.metadata}\nContent: {item.page_content}')
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "42893f0b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Indexing\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "20ad0e80",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model_id = 'qwen3:0.6b'\n",
|
||||||
|
"embedding_model = 'nomic-embed-text:latest'\n",
|
||||||
|
"\n",
|
||||||
|
"embeddings = OllamaEmbeddings(model=embedding_model)\n",
|
||||||
|
"model = ChatOllama(model=model_id, temperature=0.1)\n",
|
||||||
|
"\n",
|
||||||
|
"vectorstore = Chroma(\n",
|
||||||
|
" collection_name='p-chat',\n",
|
||||||
|
" embedding_function=embeddings,\n",
|
||||||
|
")\n",
|
||||||
|
"docstore = InMemoryStore()\n",
|
||||||
|
"\n",
|
||||||
|
"class Metadata(StrEnum):\n",
|
||||||
|
" ID = 'id'\n",
|
||||||
|
" PARENT_ID = 'parent_id'\n",
|
||||||
|
" SOURCE = 'source'\n",
|
||||||
|
" FILE_TYPE = 'file_type'\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
# Maps a file extension to the loader class able to read it; extend this
# dict to support additional document types.
LOADER_MAPPING = {
    '.md': TextLoader,
    '.txt': TextLoader,
}
|
||||||
|
"\n",
|
||||||
|
def load_documents(file_path: Path) -> list[Document]:
    """Load *file_path* into LangChain documents.

    Returns an empty list (after logging a warning) when no loader is
    configured for the file's extension in ``LOADER_MAPPING``.
    """
    # Lower-case the suffix so '.MD' and '.md' resolve to the same loader.
    extension = file_path.suffix.lower()
    logger.info(f'Loading loader for {extension}')
    loader_cls = LOADER_MAPPING.get(extension)

    if loader_cls is None:
        logger.warning(f'No loader configured for {extension}')
        return []

    loader = loader_cls(file_path)
    documents = loader.load()
    logger.info(f'{len(documents)} loaded for {file_path.name}')

    return documents
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
def preprocess(documents: list[Document]) -> list[Document]:
    """Clean and augment document metadata in place; returns the same list.

    Each document receives a fresh unique id, its source is shortened to
    the bare file name, and its file type is recorded as the extension.
    """
    for doc in documents:
        metadata = doc.metadata
        # Path(...).name handles both '/' and '\\' separators, unlike
        # splitting on '/' which breaks on Windows paths.
        source_path = Path(metadata.get(Metadata.SOURCE))

        metadata[Metadata.ID] = str(uuid.uuid4())
        metadata[Metadata.SOURCE] = source_path.name
        # Keep the leading dot ('.md', not 'md') so downstream checks such
        # as the markdown branch in chunk_documents can match against the
        # same form used by LOADER_MAPPING keys.
        metadata[Metadata.FILE_TYPE] = source_path.suffix

    return documents
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
def index_document(file_path):
    """Load, preprocess, chunk, and index one uploaded file."""
    preprocessed_docs = preprocess(load_documents(Path(file_path)))
    logger.debug([doc.metadata for doc in preprocessed_docs])

    for parent in preprocessed_docs:
        # Child chunks go to the vector store for similarity search; the
        # full parent is kept in the docstore so it can be fetched later
        # to enrich the answer context.
        vectorstore.add_documents(chunk_documents(parent))
        docstore.mset([(parent.metadata.get(Metadata.ID), parent)])
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
def chunk_documents(parent: Document) -> list[Document]:
    """Split *parent* into child chunks linked back to it via metadata.

    Markdown documents are split on headers (so header values such as the
    employee name propagate onto each chunk's metadata); everything else
    falls back to a recursive character splitter.
    """
    # preprocess() records the extension with its leading dot ('.md'), but
    # earlier versions stored it without one — accept both forms so the
    # markdown branch is actually reachable.
    if parent.metadata.get(Metadata.FILE_TYPE) in ('.md', 'md'):
        headers_to_split_on = [
            ('#', 'employee_name'),
            ('##', 'section'),
            ('###', 'Header 3'),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = markdown_splitter.split_text(parent.page_content)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=80,
            separators=['\n\n', '\n', ' ', '']
        )
        chunks = text_splitter.split_text(parent.page_content)

    children = []
    parent_id = parent.metadata.get(Metadata.ID)
    for i, chunk in enumerate(chunks, start=1):
        if isinstance(chunk, Document):
            # The markdown splitter returns Documents carrying header
            # metadata; merge it over the parent's metadata.
            metadata = {**parent.metadata, **chunk.metadata}
            # Augment the chunk text with the employee name so embeddings
            # carry that context; skip the prefix when no name was captured
            # (avoids a misleading '[Employee: None]' prefix).
            # NOTE: double-quoted f-string keeps this valid before Python
            # 3.12, where nested same-type quotes are a SyntaxError.
            employee = metadata.get('employee_name')
            prefix = f'[Employee: {employee}] ' if employee else ''
            content = prefix + chunk.page_content
        else:
            # The character splitter returns plain strings.
            metadata = parent.metadata.copy()
            content = chunk

        metadata.update({
            Metadata.ID: f'{parent_id}-{i}',
            Metadata.PARENT_ID: parent_id,
        })
        children.append(Document(page_content=content, metadata=metadata))

    logger.debug(f'Number chunks: {len(children)}, Parent ID: {parent_id}')

    return children
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a90db6ee",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### LLM Interaction"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e2e15e99",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
def retrieve_context(query) -> str:
    """Return the concatenated text of parent documents relevant to *query*.

    Similarity search runs over child chunks; each matching chunk's full
    parent document is fetched (deduplicated) from the docstore so the LLM
    sees enriched context rather than isolated fragments.
    """
    results = vectorstore.similarity_search(query)
    logger.info(f'Matching records: {len(results)}')

    selected_parents = {}
    for result in results:
        # Use the shared Metadata enum rather than a bare string literal,
        # consistent with how the key is written in chunk_documents.
        parent_id = result.metadata.get(Metadata.PARENT_ID)
        if parent_id in selected_parents:
            continue

        parents = docstore.mget([parent_id])
        selected_parents[parent_id] = parents[0]

    logger.info(f'Selected documents for query: {query} ids:{selected_parents.keys()}')
    # Iterate the values directly — the keys are not needed here.
    context = '\n\n'.join(doc.page_content for doc in selected_parents.values() if doc is not None)

    return context
|
||||||
|
"\n",
|
||||||
|
" \n",
|
||||||
|
def ask(message, history):
    """Stream an answer to *message*, grounded in retrieved document context.

    Yields the cumulative response text so the UI renders a growing answer.
    *history* is accepted to satisfy the chat-interface signature but is
    not used in the prompt.
    """
    context = retrieve_context(message)
    prompt = f'''
    You are helpful assistant that answers a question based on the provided context.
    If the context is not helpful to you in answering the question, say so.
    Be concise with your responses.

    Context:
    {context}
    '''

    response_text = ''
    for chunk in model.stream([('system', prompt), ('user', message)]):
        response_text += chunk.content or ''
        # Only start yielding once there is actual content to show.
        if response_text:
            yield response_text
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c3e632dc-9e87-4510-9fcd-aa699c27e82b",
|
||||||
|
"metadata": {
|
||||||
|
"jp-MarkdownHeadingCollapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## Gradio UI"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a3d68a74",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
def chat(message, history):
    """Gradio handler: index any uploaded file, then answer the text query.

    *message* is a MultimodalTextbox payload with 'text' and 'files' keys.
    """
    if message is None:
        return ''

    text_input = message.get('text', '')
    files_uploaded = message.get('files', [])

    # Only the most recently uploaded file in this message is indexed.
    latest_file_path = files_uploaded[-1] if files_uploaded else None
    if latest_file_path:
        index_document(latest_file_path)

    # Upload without a question: acknowledge the indexing and stop.
    if not text_input:
        yield '✅ Indexed document'
        return

    yield from ask(text_input, history)
|
||||||
|
"\n",
|
||||||
|
# --- Gradio UI --------------------------------------------------------------
title = 'P-Chat 🔒💬'

with gr.Blocks(title=title, fill_height=True) as ui:
    gr.Markdown(f'# {title}')
    gr.Markdown('## Privacy-focused bring-your-own-document (BYOD) solution 🤫.')

    # A multimodal textbox lets users attach .txt/.md documents alongside
    # their question in a single message.
    gr.ChatInterface(
        fn=chat,
        type='messages',
        textbox=gr.MultimodalTextbox(file_types=['text', '.txt', '.md'], autofocus=True),
    )

# debug=True blocks the notebook cell and surfaces server-side errors inline.
ui.launch(debug=True)
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user