From 4436456699a704625b81da1ee931abb16ea20d7b Mon Sep 17 00:00:00 2001 From: Ransford Okpoti Date: Mon, 27 Oct 2025 00:58:11 +0000 Subject: [PATCH] p-chat --- .../ranskills-week5-p-chat.ipynb | 353 ++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 week5/community-contributions/ranskills-week5-p-chat.ipynb diff --git a/week5/community-contributions/ranskills-week5-p-chat.ipynb b/week5/community-contributions/ranskills-week5-p-chat.ipynb new file mode 100644 index 0000000..d07adcc --- /dev/null +++ b/week5/community-contributions/ranskills-week5-p-chat.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f2969c8", + "metadata": {}, + "source": [ + "# P-Chat 🔒💬\n", + "\n", + "A privacy-focused bring-your-own-document (BYOD) solution that empowers you to leverage the power of LLMs to interact with your documents. Nothing is persisted, and it exists entirely in ephemeral memory.\n", + "\n", + "## Features\n", + "- Parent-child chunking used to enrich the context\n", + "- Chunk augmentation with some parent data for structured documents\n", + "- Streamed responses for better user experience\n", + "- Secure by design; no data is stored permanently\n", + "- Uses locally-running Ollama for total privacy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df7609cf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain_ollama langchain_chroma langchain_community" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "144bdf7c", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from pathlib import Path\n", + "from enum import StrEnum\n", + "\n", + "import gradio as gr\n", + "from langchain_core.documents import Document\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter\n", + "from langchain_ollama import OllamaEmbeddings, ChatOllama\n", + "from langchain.storage import 
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader

import uuid

# --- Logging -----------------------------------------------------------------
# Module-level logger writing DEBUG output to stdout. The handler guard keeps
# re-running this notebook cell from stacking duplicate handlers.
logger = logging.getLogger('rag')
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def pretty_print(l: list[Document | tuple[Document, float]]):
    """Log each Document (or ``(Document, score)`` tuple) at DEBUG level.

    Debug helper for eyeballing retrieval results; separates entries with a
    dashed rule and prints length, score (when present), metadata, and content.
    """
    for i, item in enumerate(l, start=1):
        logger.debug('-' * 80 + '\n')

        if isinstance(item, tuple):
            doc, score = item
            logger.debug(f'{i}. characters: {len(doc.page_content)}\n')
            logger.debug(f'Score: {score}\nMetadata: {doc.metadata}\nContent: {doc.page_content}')
        else:
            logger.debug(f'{i}. characters: {len(item.page_content)}\n')
            logger.debug(f'Metadata: {item.metadata}\nContent: {item.page_content}')


# --- Models & stores ---------------------------------------------------------
model_id = 'qwen3:0.6b'
embedding_model = 'nomic-embed-text:latest'

embeddings = OllamaEmbeddings(model=embedding_model)
model = ChatOllama(model=model_id, temperature=0.1)

# Parent-child retrieval: child chunks are embedded into Chroma, while the
# full parent documents live in an in-memory docstore so retrieval can hand
# the LLM the richer parent context. Nothing is persisted to disk.
vectorstore = Chroma(
    collection_name='p-chat',
    embedding_function=embeddings,
)
docstore = InMemoryStore()


class Metadata(StrEnum):
    """Canonical metadata keys stamped onto every indexed Document."""
    ID = 'id'
    PARENT_ID = 'parent_id'
    SOURCE = 'source'
    FILE_TYPE = 'file_type'


# File extension -> loader class; extend this to support more formats.
LOADER_MAPPING = {
    '.md': TextLoader,
    '.txt': TextLoader,
}


def load_documents(file_path: Path) -> list[Document]:
    """Load *file_path* using the loader registered for its extension.

    Returns an empty list (after logging a warning) when no loader is
    configured for the extension, so callers never crash on odd uploads.
    """
    extension = file_path.suffix
    logger.info(f'Loading loader for {extension}')
    loader_cls = LOADER_MAPPING.get(extension)

    if loader_cls is None:
        logger.warning(f'No loader configured for {extension}')
        return []

    loader = loader_cls(file_path)
    documents = loader.load()
    logger.info(f'{len(documents)} loaded for {file_path.name}')

    return documents


def preprocess(documents: list[Document]) -> list[Document]:
    """Attach ID / SOURCE / FILE_TYPE metadata to each document, in place.

    Returns the same list for call-chaining convenience.
    """
    for doc in documents:
        metadata = doc.metadata
        shortened_source = metadata.get('source').split('/')[-1]

        metadata[Metadata.ID] = str(uuid.uuid4())
        metadata[Metadata.SOURCE] = shortened_source
        # BUG FIX: store the extension WITH its leading dot ('.md', '.txt') so
        # the `== '.md'` comparison in chunk_documents() actually matches. The
        # previous `shortened_source.split('.')[-1]` produced 'md' (no dot),
        # so the markdown header splitter was unreachable and every markdown
        # file fell through to plain character splitting.
        metadata[Metadata.FILE_TYPE] = Path(shortened_source).suffix

    return documents


def index_document(file_path):
    """Load, preprocess, chunk, and index a single uploaded file."""
    documents = load_documents(Path(file_path))
    preprocessed_docs = preprocess(documents)
    logger.debug([doc.metadata for doc in preprocessed_docs])

    for doc in preprocessed_docs:
        chunks = chunk_documents(doc)

        # Children are embedded for search; the whole parent is kept aside so
        # retrieval can return full-document context.
        vectorstore.add_documents(chunks)
        docstore.mset([(doc.metadata.get(Metadata.ID), doc)])


def chunk_documents(parent: Document) -> list[Document]:
    """Split *parent* into child chunks, each linked back via PARENT_ID.

    Markdown files are split on headers (structure-aware, and each chunk is
    prefixed with the employee name pulled from the '#' header); everything
    else uses recursive character splitting.
    """
    if parent.metadata.get(Metadata.FILE_TYPE) == '.md':
        headers_to_split_on = [
            ('#', 'employee_name'),
            ('##', 'section'),
            ('###', 'Header 3'),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = markdown_splitter.split_text(parent.page_content)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=80,
            separators=['\n\n', '\n', ' ', '']
        )
        chunks = text_splitter.split_text(parent.page_content)

    children = []
    parent_id = parent.metadata.get(Metadata.ID)
    for i, chunk in enumerate(chunks, start=1):
        if isinstance(chunk, Document):
            # Markdown chunk: merge parent + header metadata, and prepend the
            # employee name so each embedding carries document identity.
            # (Double-quoted f-string: nested same-quotes only parse on 3.12+.)
            metadata = {**parent.metadata, **chunk.metadata}
            augmented_text = f"[Employee: {metadata.get('employee_name')}] "
            content = augmented_text + chunk.page_content
        else:
            # Plain-text split: chunk is a bare string.
            metadata = parent.metadata.copy()
            content = chunk

        metadata.update({
            Metadata.ID: f'{parent_id}-{i}',
            Metadata.PARENT_ID: parent_id,
        })
        children.append(Document(page_content=content, metadata=metadata))

    logger.debug(f'Number chunks: {len(children)}, Parent ID: {parent_id}')

    return children
"source": [ + "### LLM Interaction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2e15e99", + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve_context(query) -> str:\n", + " results = vectorstore.similarity_search(query)\n", + " logger.info(f'Matching records: {len(results)}')\n", + " selected_parents = {}\n", + " for result in results:\n", + " parent_id = result.metadata.get('parent_id')\n", + " if parent_id in selected_parents:\n", + " continue\n", + "\n", + " parents = docstore.mget([parent_id])\n", + " selected_parents[parent_id] = parents[0]\n", + "\n", + " logger.info(f'Selected documents for query: {query} ids:{selected_parents.keys()}')\n", + " context = '\\n\\n'.join([doc.page_content for _,doc in selected_parents.items() if doc is not None])\n", + "\n", + " return context\n", + "\n", + " \n", + "def ask(message, history):\n", + " context = retrieve_context(message)\n", + " prompt = f'''\n", + " You are helpful assistant that answers a question based on the provided context.\n", + " If the context is not helpful to you in answering the question, say so.\n", + " Be concise with your responses.\n", + "\n", + " Context:\n", + " {context}\n", + " '''\n", + "\n", + " messages = [\n", + " ('system', prompt),\n", + " ('user', message)\n", + " ]\n", + "\n", + " stream = model.stream(messages)\n", + " response_text = ''\n", + "\n", + " for chunk in stream:\n", + " response_text += chunk.content or ''\n", + " if not response_text:\n", + " continue\n", + "\n", + " yield response_text" + ] + }, + { + "cell_type": "markdown", + "id": "c3e632dc-9e87-4510-9fcd-aa699c27e82b", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Gradio UI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3d68a74", + "metadata": {}, + "outputs": [], + "source": [ + "def chat(message, history):\n", + " if message is None:\n", + " return ''\n", + "\n", + " text_input = message.get('text', '')\n", + " 
files_uploaded = message.get('files', [])\n", + " \n", + " latest_file_path = files_uploaded[-1] if files_uploaded else None\n", + " if latest_file_path:\n", + " index_document(latest_file_path)\n", + "\n", + "\n", + " if not text_input:\n", + " yield '✅ Indexed document'\n", + " return\n", + "\n", + " for chunk in ask(text_input, history):\n", + " yield chunk\n", + "\n", + "title = 'P-Chat 🔒💬'\n", + "with gr.Blocks(title=title, fill_height=True) as ui:\n", + " gr.Markdown(f'# {title}')\n", + " gr.Markdown('## Privacy-focused bring-your-own-document (BYOD) solution 🤫.')\n", + "\n", + " gr.ChatInterface(\n", + " fn=chat,\n", + " type='messages',\n", + " textbox=gr.MultimodalTextbox(file_types=['text', '.txt', '.md'], autofocus=True),\n", + " )\n", + "\n", + "ui.launch(debug=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}