From 4436456699a704625b81da1ee931abb16ea20d7b Mon Sep 17 00:00:00 2001 From: Ransford Okpoti Date: Mon, 27 Oct 2025 00:58:11 +0000 Subject: [PATCH] p-chat --- .../ranskills-week5-p-chat.ipynb | 353 ++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 week5/community-contributions/ranskills-week5-p-chat.ipynb diff --git a/week5/community-contributions/ranskills-week5-p-chat.ipynb b/week5/community-contributions/ranskills-week5-p-chat.ipynb new file mode 100644 index 0000000..d07adcc --- /dev/null +++ b/week5/community-contributions/ranskills-week5-p-chat.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f2969c8", + "metadata": {}, + "source": [ + "# P-Chat 🔒💬\n", + "\n", + "A privacy-focused bring-your-own-document (BYOD) solution that empowers you to leverage the power of LLMs to interact with your documents. Nothing is persisted, and it exists entirely in ephemeral memory.\n", + "\n", + "## Features\n", + "- Parent-child chunking used to enrich the context\n", + "- Chunk augmentation with some parent data for structured documents\n", + "- Streamed responses for better user experience\n", + "- Secure by design; no data is stored permanently\n", + "- Uses locally-running Ollama for total privacy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df7609cf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain_ollama langchain_chroma langchain_community" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "144bdf7c", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from pathlib import Path\n", + "from enum import StrEnum\n", + "\n", + "import gradio as gr\n", + "from langchain_core.documents import Document\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter\n", + "from langchain_ollama import OllamaEmbeddings, ChatOllama\n", + "from langchain.storage import 
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader

import uuid

# --- Logging -----------------------------------------------------------------
# Module-level logger writing DEBUG output to stdout. The handler guard keeps
# re-running this notebook cell from stacking duplicate handlers.
logger = logging.getLogger('rag')
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def pretty_print(l: list[Document | tuple[Document, float]]):
    """Log each Document (or ``(Document, score)`` tuple) at DEBUG level.

    Debug helper for eyeballing retrieval results; separates entries with a
    dashed rule and prints length, score (when present), metadata, and content.
    """
    for i, item in enumerate(l, start=1):
        logger.debug('-' * 80 + '\n')

        if isinstance(item, tuple):
            doc, score = item
            logger.debug(f'{i}. characters: {len(doc.page_content)}\n')
            logger.debug(f'Score: {score}\nMetadata: {doc.metadata}\nContent: {doc.page_content}')
        else:
            logger.debug(f'{i}. characters: {len(item.page_content)}\n')
            logger.debug(f'Metadata: {item.metadata}\nContent: {item.page_content}')


# --- Models & stores ---------------------------------------------------------
model_id = 'qwen3:0.6b'
embedding_model = 'nomic-embed-text:latest'

embeddings = OllamaEmbeddings(model=embedding_model)
model = ChatOllama(model=model_id, temperature=0.1)

# Parent-child retrieval: child chunks are embedded into Chroma, while the
# full parent documents live in an in-memory docstore so retrieval can hand
# the LLM the richer parent context. Nothing is persisted to disk.
vectorstore = Chroma(
    collection_name='p-chat',
    embedding_function=embeddings,
)
docstore = InMemoryStore()


class Metadata(StrEnum):
    """Canonical metadata keys stamped onto every indexed Document."""
    ID = 'id'
    PARENT_ID = 'parent_id'
    SOURCE = 'source'
    FILE_TYPE = 'file_type'


# File extension -> loader class; extend this to support more formats.
LOADER_MAPPING = {
    '.md': TextLoader,
    '.txt': TextLoader,
}


def load_documents(file_path: Path) -> list[Document]:
    """Load *file_path* using the loader registered for its extension.

    Returns an empty list (after logging a warning) when no loader is
    configured for the extension, so callers never crash on odd uploads.
    """
    extension = file_path.suffix
    logger.info(f'Loading loader for {extension}')
    loader_cls = LOADER_MAPPING.get(extension)

    if loader_cls is None:
        logger.warning(f'No loader configured for {extension}')
        return []

    loader = loader_cls(file_path)
    documents = loader.load()
    logger.info(f'{len(documents)} loaded for {file_path.name}')

    return documents


def preprocess(documents: list[Document]) -> list[Document]:
    """Attach ID / SOURCE / FILE_TYPE metadata to each document, in place.

    Returns the same list for call-chaining convenience.
    """
    for doc in documents:
        metadata = doc.metadata
        shortened_source = metadata.get('source').split('/')[-1]

        metadata[Metadata.ID] = str(uuid.uuid4())
        metadata[Metadata.SOURCE] = shortened_source
        # BUG FIX: store the extension WITH its leading dot ('.md', '.txt') so
        # the `== '.md'` comparison in chunk_documents() actually matches. The
        # previous `shortened_source.split('.')[-1]` produced 'md' (no dot),
        # so the markdown header splitter was unreachable and every markdown
        # file fell through to plain character splitting.
        metadata[Metadata.FILE_TYPE] = Path(shortened_source).suffix

    return documents


def index_document(file_path):
    """Load, preprocess, chunk, and index a single uploaded file."""
    documents = load_documents(Path(file_path))
    preprocessed_docs = preprocess(documents)
    logger.debug([doc.metadata for doc in preprocessed_docs])

    for doc in preprocessed_docs:
        chunks = chunk_documents(doc)

        # Children are embedded for search; the whole parent is kept aside so
        # retrieval can return full-document context.
        vectorstore.add_documents(chunks)
        docstore.mset([(doc.metadata.get(Metadata.ID), doc)])


def chunk_documents(parent: Document) -> list[Document]:
    """Split *parent* into child chunks, each linked back via PARENT_ID.

    Markdown files are split on headers (structure-aware, and each chunk is
    prefixed with the employee name pulled from the '#' header); everything
    else uses recursive character splitting.
    """
    if parent.metadata.get(Metadata.FILE_TYPE) == '.md':
        headers_to_split_on = [
            ('#', 'employee_name'),
            ('##', 'section'),
            ('###', 'Header 3'),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = markdown_splitter.split_text(parent.page_content)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=80,
            separators=['\n\n', '\n', ' ', '']
        )
        chunks = text_splitter.split_text(parent.page_content)

    children = []
    parent_id = parent.metadata.get(Metadata.ID)
    for i, chunk in enumerate(chunks, start=1):
        if isinstance(chunk, Document):
            # Markdown chunk: merge parent + header metadata, and prepend the
            # employee name so each embedding carries document identity.
            # (Double-quoted f-string: nested same-quotes only parse on 3.12+.)
            metadata = {**parent.metadata, **chunk.metadata}
            augmented_text = f"[Employee: {metadata.get('employee_name')}] "
            content = augmented_text + chunk.page_content
        else:
            # Plain-text split: chunk is a bare string.
            metadata = parent.metadata.copy()
            content = chunk

        metadata.update({
            Metadata.ID: f'{parent_id}-{i}',
            Metadata.PARENT_ID: parent_id,
        })
        children.append(Document(page_content=content, metadata=metadata))

    logger.debug(f'Number chunks: {len(children)}, Parent ID: {parent_id}')

    return children
"source": [ + "### LLM Interaction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2e15e99", + "metadata": {}, + "outputs": [], + "source": [ + "def retrieve_context(query) -> str:\n", + " results = vectorstore.similarity_search(query)\n", + " logger.info(f'Matching records: {len(results)}')\n", + " selected_parents = {}\n", + " for result in results:\n", + " parent_id = result.metadata.get('parent_id')\n", + " if parent_id in selected_parents:\n", + " continue\n", + "\n", + " parents = docstore.mget([parent_id])\n", + " selected_parents[parent_id] = parents[0]\n", + "\n", + " logger.info(f'Selected documents for query: {query} ids:{selected_parents.keys()}')\n", + " context = '\\n\\n'.join([doc.page_content for _,doc in selected_parents.items() if doc is not None])\n", + "\n", + " return context\n", + "\n", + " \n", + "def ask(message, history):\n", + " context = retrieve_context(message)\n", + " prompt = f'''\n", + " You are helpful assistant that answers a question based on the provided context.\n", + " If the context is not helpful to you in answering the question, say so.\n", + " Be concise with your responses.\n", + "\n", + " Context:\n", + " {context}\n", + " '''\n", + "\n", + " messages = [\n", + " ('system', prompt),\n", + " ('user', message)\n", + " ]\n", + "\n", + " stream = model.stream(messages)\n", + " response_text = ''\n", + "\n", + " for chunk in stream:\n", + " response_text += chunk.content or ''\n", + " if not response_text:\n", + " continue\n", + "\n", + " yield response_text" + ] + }, + { + "cell_type": "markdown", + "id": "c3e632dc-9e87-4510-9fcd-aa699c27e82b", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Gradio UI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3d68a74", + "metadata": {}, + "outputs": [], + "source": [ + "def chat(message, history):\n", + " if message is None:\n", + " return ''\n", + "\n", + " text_input = message.get('text', '')\n", + " 
files_uploaded = message.get('files', [])\n", + " \n", + " latest_file_path = files_uploaded[-1] if files_uploaded else None\n", + " if latest_file_path:\n", + " index_document(latest_file_path)\n", + "\n", + "\n", + " if not text_input:\n", + " yield '✅ Indexed document'\n", + " return\n", + "\n", + " for chunk in ask(text_input, history):\n", + " yield chunk\n", + "\n", + "title = 'P-Chat 🔒💬'\n", + "with gr.Blocks(title=title, fill_height=True) as ui:\n", + " gr.Markdown(f'# {title}')\n", + " gr.Markdown('## Privacy-focused bring-your-own-document (BYOD) solution 🤫.')\n", + "\n", + " gr.ChatInterface(\n", + " fn=chat,\n", + " type='messages',\n", + " textbox=gr.MultimodalTextbox(file_types=['text', '.txt', '.md'], autofocus=True),\n", + " )\n", + "\n", + "ui.launch(debug=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}