{ "cells": [ { "cell_type": "markdown", "id": "dfe37963-1af6-44fc-a841-8e462443f5e6", "metadata": {}, "source": [ "## Expert Knowledge Worker\n", "\n", "### A question answering agent that is an expert knowledge worker\n", "### To be used by employees of Insurellm, an Insurance Tech company\n", "### The agent needs to be accurate and the solution should be low cost.\n", "\n", "This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.\n", "\n", "## TODAY:\n", "\n", "- Part A: We will divide our documents into CHUNKS\n", "- Part B: We will encode our CHUNKS into VECTORS and put in Chroma\n", "- Part C: We will visualize our vectors" ] }, { "cell_type": "markdown", "id": "0769edb3", "metadata": {}, "source": [ "### PART A: Divide our documents into chunks" ] }, { "cell_type": "code", "execution_count": 1, "id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77", "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob\n", "import tiktoken\n", "import numpy as np\n", "from dotenv import load_dotenv\n", "from langchain_openai import OpenAIEmbeddings\n", "from langchain_chroma import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from sklearn.manifold import TSNE\n", "import plotly.graph_objects as go" ] }, { "cell_type": "code", "execution_count": 2, "id": "802137aa-8a74-45e0-a487-d1974927d7ca", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OpenAI API Key exists and begins sk-proj-\n" ] } ], "source": [ "# price is a factor for our company, so we're going to use a low cost model\n", "\n", "MODEL = \"gpt-4.1-nano\"\n", "db_name = \"vector_db\"\n", "load_dotenv(override=True)\n", "openai_api_key = os.getenv('OPENAI_API_KEY')\n", "if openai_api_key:\n", " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", "else:\n", " print(\"OpenAI API Key not set\")\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "58c85082-e417-4708-9efe-81a5d55d1424", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 76 files in the knowledge base\n", "Total characters in knowledge base: 304,434\n" ] } ], "source": [ "# How many characters in all the documents?\n", "\n", "knowledge_base_path = \"knowledge-base/**/*.md\"\n", "files = glob.glob(knowledge_base_path, recursive=True)\n", "print(f\"Found {len(files)} files in the knowledge base\")\n", "\n", "entire_knowledge_base = \"\"\n", "\n", "for file_path in files:\n", " with open(file_path, 'r', encoding='utf-8') as f:\n", " entire_knowledge_base += f.read()\n", " entire_knowledge_base += \"\\n\\n\"\n", "\n", "print(f\"Total characters in knowledge base: {len(entire_knowledge_base):,}\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "0b53a099", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total tokens for gpt-4.1-nano: 63,555\n" ] } ], "source": [ "# How many tokens in all the documents?\n", "\n", "encoding = tiktoken.encoding_for_model(MODEL)\n", "tokens = encoding.encode(entire_knowledge_base)\n", "token_count = len(tokens)\n", "print(f\"Total tokens for {MODEL}: {token_count:,}\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "ee78efcb-60fe-449e-a944-40bab26261af", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 76 
documents\n" ] } ], "source": [ "# Load in everything in the knowledgebase using LangChain's loaders\n", "\n", "folders = glob.glob(\"knowledge-base/*\")\n", "\n", "documents = []\n", "for folder in folders:\n", " doc_type = os.path.basename(folder)\n", " loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})\n", " folder_docs = loader.load()\n", " for doc in folder_docs:\n", " doc.metadata[\"doc_type\"] = doc_type\n", " documents.append(doc)\n", "\n", "print(f\"Loaded {len(documents)} documents\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "68dab1ab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Document(metadata={'source': 'knowledge-base/products/Claimllm.md', 'doc_type': 'products'}, page_content=\"# Product Summary\\n\\n# Claimllm\\n\\n## Summary\\n\\nClaimllm is Insurellm's revolutionary claims processing platform that transforms the claims experience for insurers, adjusters, and policyholders. Powered by advanced AI, machine learning, and computer vision, Claimllm automates claims handling across all insurance lines—from first notice of loss through final settlement. By dramatically reducing processing time, improving accuracy, and enhancing fraud detection, Claimllm enables insurers to deliver exceptional claims service while significantly reducing operational costs. The platform seamlessly integrates with existing policy administration and core systems to create a unified insurance ecosystem.\\n\\n## Features\\n\\n### 1. Intelligent FNOL Processing\\nClaimllm's AI-powered first notice of loss intake captures claim details through multiple channels including mobile apps, web portals, phone integrations, and chatbots. Natural language processing extracts key information automatically, creating structured claim files without manual data entry.\\n\\n### 2. Automated Triage and Routing\\nMachine learning algorithms instantly assess claim severity, complexity, and fraud potential to route claims to appropriate handlers. Simple claims enter straight-through processing while complex cases receive immediate adjuster assignment with pre-populated investigation guidance.\\n\\n### 3. Computer Vision Damage Assessment\\nAdvanced image recognition technology analyzes photos and videos of damaged property or vehicles to estimate repair costs accurately. The system identifies damage types, measures extent, and generates preliminary estimates in minutes rather than days.\\n\\n### 4. Predictive Fraud Detection\\nSophisticated fraud analytics evaluate claims against historical patterns, network analysis, and anomaly detection to flag suspicious claims for investigation. The system learns continuously from confirmed fraud cases to improve detection accuracy.\\n\\n### 5. Smart Document Processing\\nOptical character recognition and natural language processing automatically extract information from medical records, police reports, repair estimates, and supporting documentation. The system validates information against policy terms and flags discrepancies for review.\\n\\n### 6. Dynamic Reserve Setting\\nAI-powered predictive modeling analyzes claim characteristics, historical outcomes, and current case details to recommend accurate reserves. Automated reserve adjustments occur as new information becomes available, ensuring financial accuracy.\\n\\n### 7. Vendor Management Platform\\nIntegrated tools coordinate with repair shops, medical providers, lawyers, and other service providers. 
Automated assignment based on location, capacity, and performance metrics ensures efficient vendor utilization and quality outcomes.\\n\\n### 8. Payment Automation\\nStraight-through payment processing for approved claims includes direct deposit, digital payments, and traditional check issuance. Multi-party payment splitting handles complex scenarios involving mortgagees, lessors, and medical providers.\\n\\n### 9. Claimant Communication Hub\\nOmnichannel communication tools keep claimants informed via their preferred method—text, email, app notifications, or phone. Automated status updates reduce incoming inquiries while improving customer satisfaction.\\n\\n### 10. Analytics and Reporting\\nComprehensive dashboards track key metrics including cycle time, loss ratios, settlement patterns, and customer satisfaction. Predictive analytics identify process bottlenecks and opportunities for improvement.\\n\\n## Pricing\\n\\nClaimllm offers flexible pricing models to accommodate insurers of all sizes:\\n\\n- **Core Tier:** $4,500/month for smaller insurers processing up to 5,000 claims annually, including essential automation features and standard integrations.\\n- **Advanced Tier:** $9,500/month for mid-sized insurers handling up to 25,000 claims annually, adding computer vision, predictive analytics, and expanded vendor management.\\n- **Enterprise Tier:** Custom pricing for high-volume insurers requiring unlimited claim capacity, full API access, dedicated infrastructure, advanced fraud analytics, and white-label capabilities.\\n\\nAll tiers include comprehensive implementation, claims staff training, and continuous platform enhancements.\\n\\n## Roadmap\\n\\nClaimllm's ambitious development roadmap includes:\\n\\n- **Q1 2025:** Launch of Claimllm version 1.0 with core claims processing automation, FNOL capture, and basic fraud detection.\\n- **Q3 2025:** Introduction of advanced computer vision capabilities for property and auto damage assessment with repair cost estimation.\\n- **Q1 2026:** Release of predictive litigation analytics identifying claims likely to involve attorneys and recommending settlement strategies.\\n- **Q3 2026:** Launch of blockchain-based subrogation network enabling automated inter-carrier claim recovery and settlement.\\n- **Q1 2027:** Introduction of real-time settlement negotiation AI that optimizes settlement amounts based on claim details and historical outcomes.\\n- **Q3 2027:** Expansion into catastrophe claims management with disaster response coordination and large-loss handling capabilities.\\n\\nClaimllm represents the future of insurance claims—faster, smarter, and more customer-centric. Transform your claims operation and deliver the service your policyholders deserve!\\n\")" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "documents[1]" ] }, { "cell_type": "code", "execution_count": 7, "id": "25987306", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Divided into 413 chunks\n", "First chunk:\n", "\n", "page_content='# Product Summary\n", "\n", "# Rellm: AI-Powered Enterprise Reinsurance Solution\n", "\n", "## Summary\n", "\n", "Rellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. 
Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n", "\n", "## Features\n", "\n", "### AI-Driven Analytics\n", "Rellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intelligence.' metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}\n" ] } ], "source": [ "# Divide into chunks using the RecursiveCharacterTextSplitter\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)\n", "\n", "print(f\"Divided into {len(chunks)} chunks\")\n", "print(f\"First chunk:\\n\\n{chunks[0]}\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "9eb209db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Document(metadata={'source': 'knowledge-base/contracts/Contract with National Claims Network for Claimllm.md', 'doc_type': 'contracts'}, page_content=\"7. **Business Continuity:** Insurellm provides disaster recovery with 4-hour RTO (Recovery Time Objective) and 1-hour RPO (Recovery Point Objective).\\n\\n---\\n\\n## Renewal\\n\\nThis agreement includes a mutual 120-day renewal notice period. National Claims Network receives guaranteed enterprise pricing for renewal equal to or better than new enterprise customers at renewal time. Contract may be extended in 12-month increments with mutual written agreement.\\n\\n---\\n\\n## Features\\n\\nNational Claims Network will receive the complete Claimllm Enterprise suite:\\n\\n1. **Unlimited Claims Processing:** No volume restrictions, supporting National's processing of 100,000+ claims annually with scalability to 500,000+ claims as business grows.\\n\\n2. **White-Label Platform:** Complete branding customization including:\\n - Custom domain names (claims.nationalclaimsnetwork.com)\\n - Branded mobile apps (iOS and Android)\\n - Customized email templates and communications\\n - Co-branded claimant portals\")" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunks[100]" ] }, { "cell_type": "markdown", "id": "ecee2169", "metadata": {}, "source": [ "### PART B: Make vectors and store in Chroma\n", "\n", "In Week 3, you set up a Hugging Face account and got an HF_TOKEN\n", "\n", "At this point, you might want to add it to your `.env` file and run `load_dotenv(override=True)`\n", "\n", "(This actually shouldn't be required)." 
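] }, { "cell_type": "markdown", "id": "1a2b3c4d", "metadata": {}, "source": [ "Before we make the vectors, a quick sanity check on the chunks themselves. This is a minimal sketch: the exact numbers depend on your knowledge base." ] }, { "cell_type": "code", "execution_count": null, "id": "2b3c4d5e", "metadata": {}, "outputs": [], "source": [ "# Sanity check the chunks before embedding (illustrative; numbers depend on your knowledge base)\n", "\n", "sizes = [len(chunk.page_content) for chunk in chunks]\n", "print(f\"{len(sizes)} chunks; min {min(sizes)}, max {max(sizes)}, mean {sum(sizes)/len(sizes):.0f} characters\")\n", "\n", "# Confirm every doc_type made it through the loaders\n", "chunk_doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n", "print(f\"Doc types found: {', '.join(sorted(chunk_doc_types))}\")"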
] }, { "cell_type": "code", "execution_count": 9, "id": "730711a9-6ffe-4eee-8f48-d6cfb7314905", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Vectorstore created with 413 documents\n" ] } ], "source": [ "# Pick an embedding model\n", "\n", "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")\n", "\n", "if os.path.exists(db_name):\n", " Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n", "\n", "vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n", "print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "252f17e9-3529-4e81-996c-cfa9f08e75a8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There are 413 vectors with 384 dimensions in the vector store\n" ] } ], "source": [ "# Let's investigate the vectors\n", "\n", "collection = vectorstore._collection\n", "count = collection.count()\n", "\n", "sample_embedding = collection.get(limit=1, include=[\"embeddings\"])[\"embeddings\"][0]\n", "dimensions = len(sample_embedding)\n", "print(f\"There are {count:,} vectors with {dimensions:,} dimensions in the vector store\")" ] }, { "cell_type": "markdown", "id": "c30096a7", "metadata": {}, "source": [ "### Part C: Visualize!" ] }, { "cell_type": "code", "execution_count": null, "id": "9d48dcb6", "metadata": {}, "outputs": [], "source": [ "# Prework\n", "\n", "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", "vectors = np.array(result['embeddings'])\n", "documents = result['documents']\n", "metadatas = result['metadatas']\n", "doc_types = [metadata['doc_type'] for metadata in metadatas]\n", "colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]" ] }, { "cell_type": "code", "execution_count": null, "id": "7e8decb0-d9b0-4d51-8402-7a6174d22159", "metadata": {}, "outputs": [], "source": [ "# We humans find it easier to visalize things in 2D!\n", "# Reduce the dimensionality of the vectors to 2D using t-SNE\n", "# (t-distributed stochastic neighbor embedding)\n", "\n", "tsne = TSNE(n_components=2, random_state=42)\n", "reduced_vectors = tsne.fit_transform(vectors)\n", "\n", "# Create the 2D scatter plot\n", "fig = go.Figure(data=[go.Scatter(\n", " x=reduced_vectors[:, 0],\n", " y=reduced_vectors[:, 1],\n", " mode='markers',\n", " marker=dict(size=5, color=colors, opacity=0.8),\n", " text=[f\"Type: {t}
, { "cell_type": "markdown", "id": "c30096a7", "metadata": {}, "source": [ "### Part C: Visualize!" ] }, { "cell_type": "code", "execution_count": null, "id": "9d48dcb6", "metadata": {}, "outputs": [], "source": [ "# Prework\n", "\n", "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", "vectors = np.array(result['embeddings'])\n", "documents = result['documents']\n", "metadatas = result['metadatas']\n", "doc_types = [metadata['doc_type'] for metadata in metadatas]\n", "\n", "# Map each doc_type to a colour for the scatter plots\n", "colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]" ] }, { "cell_type": "code", "execution_count": null, "id": "7e8decb0-d9b0-4d51-8402-7a6174d22159", "metadata": {}, "outputs": [], "source": [ "# We humans find it easier to visualize things in 2D!\n", "# Reduce the dimensionality of the vectors to 2D using t-SNE\n", "# (t-distributed stochastic neighbor embedding)\n", "\n", "tsne = TSNE(n_components=2, random_state=42)\n", "reduced_vectors = tsne.fit_transform(vectors)\n", "\n", "# Create the 2D scatter plot\n", "fig = go.Figure(data=[go.Scatter(\n", " x=reduced_vectors[:, 0],\n", " y=reduced_vectors[:, 1],\n", " mode='markers',\n", " marker=dict(size=5, color=colors, opacity=0.8),\n", " text=[f\"Type: {t}<br>Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", " hoverinfo='text'\n", ")])\n", "\n", "fig.update_layout(title='2D Chroma Vector Store Visualization',\n", " xaxis_title='x',\n", " yaxis_title='y',\n", " width=800,\n", " height=600,\n", " margin=dict(r=20, b=10, l=10, t=40)\n", ")\n", "\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a", "metadata": {}, "outputs": [], "source": [ "# Let's try 3D!\n", "\n", "tsne = TSNE(n_components=3, random_state=42)\n", "reduced_vectors = tsne.fit_transform(vectors)\n", "\n", "# Create the 3D scatter plot\n", "fig = go.Figure(data=[go.Scatter3d(\n", " x=reduced_vectors[:, 0],\n", " y=reduced_vectors[:, 1],\n", " z=reduced_vectors[:, 2],\n", " mode='markers',\n", " marker=dict(size=5, color=colors, opacity=0.8),\n", " text=[f\"Type: {t}<br>
Text: {d[:100]}...\" for t, d in zip(doc_types, documents)],\n", " hoverinfo='text'\n", ")])\n", "\n", "fig.update_layout(\n", " title='3D Chroma Vector Store Visualization',\n", " scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n", " width=900,\n", " height=700,\n", " margin=dict(r=10, b=10, l=10, t=40)\n", ")\n", "\n", "fig.show()" ] }
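, { "cell_type": "markdown", "id": "5b6c7d8e", "metadata": {}, "source": [ "If the plots show the chunks clustering by doc_type, that's a good sign the embeddings are capturing meaning. We can check the same thing numerically: vectors of the same doc_type should, on average, be more similar to each other than to vectors of other types. Here is a minimal sketch using cosine similarity; the exact values depend on your documents and embedding model." ] }, { "cell_type": "code", "execution_count": null, "id": "6c7d8e9f", "metadata": {}, "outputs": [], "source": [ "# Compare average cosine similarity within vs across doc types\n", "# (a rough sketch; exact values depend on your documents and embedding model)\n", "\n", "unit = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)\n", "sims = unit @ unit.T\n", "\n", "types = np.array(doc_types)\n", "same = types[:, None] == types[None, :]\n", "off_diagonal = ~np.eye(len(types), dtype=bool)\n", "\n", "print(f\"Mean cosine similarity, same doc_type: {sims[same & off_diagonal].mean():.3f}\")\n", "print(f\"Mean cosine similarity, different doc_type: {sims[~same].mean():.3f}\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }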