diff --git a/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb b/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb new file mode 100644 index 0000000..8881804 --- /dev/null +++ b/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6f0f38e7", + "metadata": {}, + "source": [ + "# Email Mindmap Demo (Week 5 Community Contribution)\n", + "\n", + "Welcome to the **Email Mindmap Demo** notebook! This demo walks you through a workflow for exploring and visualizing email relationships using embeddings and mindmaps.\n", + "\n", + "---\n", + "\n", + "## 📋 Workflow Overview\n", + "\n", + "1. **Load/Create Synthetic Email Data** \n", + " Generate or load varied types of emails: work, personal, family, subscriptions, etc.\n", + "\n", + "2. **Generate Embeddings** \n", + " Use an open-source model to create vector embeddings for email content.\n", + "\n", + "3. **Build & Visualize a Mindmap** \n", + " Construct a mindmap of email relationships and visualize it interactively using `networkx` and `matplotlib`.\n", + "\n", + "4. 
**Question-Answering Interface** \n", + " Query the email content and the mindmap using a simple Q&A interface powered by Gradio.\n", + "\n", + "---\n", + "\n", + "## ⚙️ Requirements\n", + "\n", + "> **Tip:** \n", + "> I'm including an example of the synthetic emails in case you don't want to run that part.\n", + "> Might need to install other libraries like pyvis, nbformat and faiss-cpu\n", + "\n", + "\n", + "## ✨ Features\n", + "\n", + "- Synthetic generation of varied emails (work, personal, family, subscriptions)\n", + "- Embedding generation with open-source models (hugging face sentence-transformer)\n", + "- Interactive mindmap visualization (`networkx`, `pyvis`)\n", + "- Simple chatbot interface (Gradio) and visualization of mindmap created\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a9aeb363", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI API Key exists and begins sk-proj-\n", + "Anthropic API Key exists and begins sk-ant-\n", + "Google API Key exists and begins AI\n", + "OLLAMA API Key exists and begins 36\n" + ] + } + ], + "source": [ + "# imports\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "import gradio as gr\n", + "\n", + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')\n", + "ollama_api_key = os.getenv('OLLAMA_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + " \n", + "if anthropic_api_key:\n", + " print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n", + "else:\n", + " print(\"Anthropic API Key not set (and this is optional)\")\n", + "\n", + "if google_api_key:\n", + " print(f\"Google API Key 
exists and begins {google_api_key[:2]}\")\n", + "else:\n", + " print(\"Google API Key not set (and this is optional)\")\n", + "\n", + "if ollama_api_key:\n", + " print(f\"OLLAMA API Key exists and begins {ollama_api_key[:2]}\")\n", + "else:\n", + " print(\"OLLAMA API Key not set (and this is optional)\")\n", + "\n", + "# Connect to client libraries\n", + "\n", + "openai = OpenAI()\n", + "\n", + "anthropic_url = \"https://api.anthropic.com/v1/\"\n", + "gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n", + "ollama_url = \"http://localhost:11434/v1\"\n", + "\n", + "anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)\n", + "gemini = OpenAI(api_key=google_api_key, base_url=gemini_url)\n", + "ollama = OpenAI(api_key=ollama_api_key, base_url=ollama_url)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "b8ddce62", + "metadata": {}, + "source": [ + "## Preparation of synthetic data (could have been week2 work)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2e250912", + "metadata": {}, + "outputs": [], + "source": [ + "#using ollama gpt oss 120b cloud i'm going to create synthetic emails using a persona.\n", + "#they are going to be saved in a json file with different keys\n", + "from pydantic import BaseModel, Field\n", + "from typing import List, Optional\n", + "\n", + "\n", + "class Email(BaseModel):\n", + " sender: str = Field(description=\"Email address of the sender\")\n", + " subject: str = Field(description=\"Email subject line\")\n", + " body: str = Field(description=\"Email body content\")\n", + " timestamp: str = Field(description=\"ISO 8601 timestamp when email was received\")\n", + " category: str = Field(description=\"Category of the email\")\n", + "\n", + "class EmailBatch(BaseModel):\n", + " emails: List[Email] = Field(description=\"List of generated emails\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1f67fdb3", + "metadata": {}, + "outputs": [], + 
"source": [ + "def create_persona(name: str, age: int, occupation: str, \n", + " interests: List[str], family_status: str) -> str:\n", + " persona = f\"\"\"\n", + " You are generating synthetic emails for a realistic inbox simulation.\n", + "\n", + " **Person Profile:**\n", + " - Name: {name}\n", + " - Age: {age}\n", + " - Occupation: {occupation}\n", + " - Interests: {', '.join(interests)}\n", + " - Family Status: {family_status}\n", + "\n", + " **Email Categories to Include:**\n", + " 1. **Work Emails**: Project updates, meeting invitations, colleague communications, \n", + " performance reviews, company announcements\n", + " 2. **Purchases**: Order confirmations, shipping notifications, delivery updates, \n", + " receipts from various retailers (Amazon, local shops, etc.)\n", + " 3. **Subscriptions**: Newsletter updates, streaming services (Netflix, Spotify), \n", + " software subscriptions (Adobe, Microsoft 365), magazine subscriptions\n", + " 4. **Family**: Communications with parents, siblings, children, extended family members,\n", + " family event planning, photo sharing\n", + " 5. **Friends**: Social plans, birthday wishes, casual conversations, group hangouts,\n", + " catching up messages\n", + " 6. **Finance**: Bank statements, credit card bills, investment updates, tax documents,\n", + " payment reminders\n", + " 7. **Social Media**: Facebook notifications, LinkedIn updates, Instagram activity,\n", + " Twitter mentions\n", + " 8. 
**Personal**: Doctor appointments, gym memberships, utility bills, insurance updates\n", + "\n", + " **Instructions:**\n", + " - Generate realistic email content that reflects the person's life over time\n", + " - Include temporal patterns (more work emails on weekdays, more personal on weekends)\n", + " - Create realistic sender names and email addresses\n", + " - Vary email length and formality based on context\n", + " - Include realistic subject lines\n", + " - Make emails interconnected when appropriate (e.g., follow-up emails, conversation threads)\n", + " - Include seasonal events (holidays, birthdays, annual renewals)\n", + " \"\"\"\n", + " return persona\n", + "\n", + "persona_description = create_persona(\n", + " name=\"John Doe\",\n", + " age=30,\n", + " occupation=\"Software Engineer\",\n", + " interests=[\"technology\", \"reading\", \"traveling\"],\n", + " family_status=\"single\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cec185e3", + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "from datetime import datetime, timedelta\n", + "import random\n", + "from typing import List\n", + "\n", + "def generate_synthetic_emails(\n", + " persona_description: str,\n", + " num_emails: int,\n", + " start_date: str,\n", + " end_date: str,\n", + " model: str = \"gpt-4o-2024-08-06\"\n", + ") -> List[Email]:\n", + " \"\"\"\n", + " NEEDS TO WORK WITH OPENAI MODELS BECAUSE OF PARSED (STRUC OUTPUT) MODELS\n", + " Generates synthetic emails using OpenAI's structured output feature.\n", + " \n", + " Args:\n", + " persona_description: Detailed persona description\n", + " num_emails: Number of emails to generate per batch\n", + " start_date: Start date for email timestamps\n", + " end_date: End date for email timestamps\n", + " model: OpenAI model to use (must support structured outputs)\n", + " \n", + " Returns:\n", + " List of Email objects\n", + " \"\"\"\n", + " \n", + " # Calculate date range for 
context\n", + "    date_range_context = f\"\"\"\n", + "    Generate emails with timestamps between {start_date} and {end_date}.\n", + "    Distribute emails naturally across this time period, with realistic patterns:\n", + "    - More emails during business hours on weekdays\n", + "    - Fewer emails late at night\n", + "    - Occasional weekend emails\n", + "    - Bursts of activity around events or busy periods\n", + "    \"\"\"\n", + "    \n", + "    # System message combining persona and structure instructions\n", + "    system_message = f\"\"\"\n", + "    {persona_description}\n", + "\n", + "    {date_range_context}\n", + "\n", + "    Generate {num_emails} realistic emails that fit this person's life. \n", + "    Ensure variety in categories, senders, and content while maintaining realism.\n", + "    \"\"\"\n", + "    \n", + "    try:\n", + "        client = OpenAI()\n", + "\n", + "        response = client.chat.completions.parse(\n", + "            model=model,\n", + "            messages=[\n", + "                {\n", + "                    \"role\": \"system\",\n", + "                    \"content\": system_message\n", + "                },\n", + "                {\n", + "                    \"role\": \"user\",\n", + "                    \"content\": f\"Generate {num_emails} diverse, realistic emails for this person's inbox.\"\n", + "                }\n", + "            ],\n", + "            response_format=EmailBatch,\n", + "        )\n", + "        return response.choices[0].message.parsed.emails\n", + "        \n", + "    except Exception as e:\n", + "        print(f\"Error generating emails: {e}\")\n", + "        return []\n", + "\n", + "\n", + "def save_emails_to_json(emails: List[Email], filename: str):\n", + "    \"\"\"\n", + "    Saves emails to a JSON file.\n", + "    \"\"\"\n", + "    import json\n", + "    \n", + "    emails_dict = [email.model_dump() for email in emails]\n", + "    \n", + "    with open(filename, 'w', encoding='utf-8') as f:\n", + "        json.dump(emails_dict, f, indent=2, ensure_ascii=False)\n", + "    \n", + "    print(f\"Saved {len(emails)} emails to {filename}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "be31f352", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "now\n" + ] + } + ], + "source": [ + "mails_2 = generate_synthetic_emails(\n", + " persona_description = persona_description,\n", + " num_emails = 100,\n", + " start_date = '2024-06-01',\n", + " end_date = '2025-01-01',\n", + " model = \"gpt-4o\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "24d844f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved 101 emails to emails2.json\n" + ] + } + ], + "source": [ + "save_emails_to_json(mails_2, 'emails2.json')" + ] + }, + { + "cell_type": "markdown", + "id": "2b9c704e", + "metadata": {}, + "source": [ + "## Create embeddings for the mails\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "777012f8", + "metadata": {}, + "outputs": [], + "source": [ + "# imports for langchain, plotly and Chroma\n", + "\n", + "from langchain.document_loaders import DirectoryLoader, TextLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_chroma import Chroma\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "import numpy as np\n", + "import plotly.graph_objects as go\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "import json\n", + "from langchain.vectorstores import FAISS\n", + "\n", + "#MODEL = \"gpt-4o-mini\"\n", + "db_name = \"vector_db\"" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "ce95d9c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of chunks: 206\n", + "Sample metadata fields: ['sender', 'timestamp', 'category']\n" + ] + } + ], + "source": [ + "# Read in emails from the emails.json file and 
construct LangChain documents\n", + "\n", + "\n", + "with open(\"emails.json\", \"r\", encoding=\"utf-8\") as f:\n", + " emails = json.load(f)\n", + "\n", + "documents = []\n", + "for email in emails:\n", + " # Extract metadata (all fields except 'content')\n", + " metadata = {k: v for k, v in email.items() if k in ['sender','category','timestamp']}\n", + " body = email.get(\"body\", \"\")\n", + " documents.append(Document(page_content=body, metadata=metadata))\n", + "\n", + "text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "chunks = text_splitter.split_documents(documents)\n", + "\n", + "print(f\"Total number of chunks: {len(chunks)}\")\n", + "print(f\"Sample metadata fields: {list(documents[0].metadata.keys()) if documents else []}\")\n", + "\n", + "embeddings_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "\n", + "if os.path.exists(db_name):\n", + " Chroma(persist_directory=db_name, embedding_function=embeddings_model).delete_collection()\n", + "\n", + "vectorstore = FAISS.from_documents(chunks, embedding=embeddings_model)\n", + "\n", + "all_embeddings = [vectorstore.index.reconstruct(i) for i in range(vectorstore.index.ntotal)]\n", + "\n", + "total_vectors = vectorstore.index.ntotal\n", + "dimensions = vectorstore.index.d\n" + ] + }, + { + "cell_type": "markdown", + "id": "78ca65bb", + "metadata": {}, + "source": [ + "## Visualizing mindmap" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a99dd2d6", + "metadata": {}, + "outputs": [], + "source": [ + "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import plotly.graph_objects as go\n", + "import numpy as np\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.manifold import TSNE # Or use UMAP\n", + "from pyvis.network import Network\n", + "\n", + "# Here, emails is your list of email objects, with .subject or .body\n", + "\n", 
+ "# Build similarity graph\n", + "def build_mindmap_html(emails, all_embeddings, threshold=0.6):\n", + "    similarity = cosine_similarity(all_embeddings)\n", + "\n", + "    G = nx.Graph()\n", + "    for i, email in enumerate(emails):\n", + "        G.add_node(i, label=email['subject'][:80], title=email['body'][:50])  # Custom hover text\n", + "\n", + "    for i in range(len(emails)):\n", + "        for j in range(i+1, len(emails)):\n", + "            if similarity[i][j] > threshold:\n", + "                G.add_edge(i, j, weight=float(similarity[i][j]))\n", + "\n", + "    # Convert to pyvis network\n", + "    nt = Network(notebook=True, height='700px', width='100%', bgcolor='#222222', font_color='white')\n", + "    nt.from_nx(G)\n", + "    html = nt.generate_html().replace(\"'\", \"\\\"\")\n", + "    return html\n" + ] + }, + { + "cell_type": "markdown", + "id": "53a2fbaf", + "metadata": {}, + "source": [ + "## Putting it all together in a gradio.\n", + "It needs to have an interface to make questions, and the visual to see the mindmap.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "161144ac", + "metadata": {}, + "outputs": [], + "source": [ + "# create a new Chat with OpenAI\n", + "MODEL=\"gpt-4o-mini\"\n", + "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n", + "\n", + "# set up the conversation memory for the chat\n", + "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", + "\n", + "# the retriever is an abstraction over the VectorStore that will be used during RAG\n", + "retriever = vectorstore.as_retriever()\n", + "from langchain_core.callbacks import StdOutCallbackHandler\n", + "\n", + "# putting it together: set up the conversation chain with the gpt-4o-mini LLM, the vector store and memory\n", + "conversation_chain_debug = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])\n", + "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n", 
+ "# Wrapping that in a function\n", + "\n", + "def chat(question, history):\n", + " result = conversation_chain.invoke({\"question\": question})\n", + " return result[\"answer\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "16a4d8d1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n", + "\n", + "The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n", + "* Running on local URL: http://127.0.0.1:7878\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "