Merge remote-tracking branch 'upstream/main'

Bharat Puri
2025-10-24 12:14:59 +05:30
19 changed files with 8933 additions and 0 deletions


@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
Knowledge Worker with Document Upload
This script creates a knowledge worker that:
1. Allows users to upload documents through a Gradio UI
2. Uses a Chroma vector database for efficient document retrieval
3. Implements RAG (Retrieval Augmented Generation) for accurate responses
(Google Drive integration has been removed from this version.)
The system updates its context dynamically when new documents are uploaded.
"""
import os
import glob
import tempfile
from pathlib import Path
from dotenv import load_dotenv
import gradio as gr
# LangChain imports
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
# Visualization imports
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
# Removed Google Drive API imports
# Additional document loaders
try:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredExcelLoader
except ImportError:
print("Warning: Some document loaders not available. PDF and text files will still work.")
Docx2txtLoader = None
UnstructuredExcelLoader = None
# Configuration
MODEL = "gpt-4o-mini" # Using a cost-effective model
DB_NAME = "knowledge_worker_db"
UPLOAD_FOLDER = "uploaded_documents"
# Create upload folder if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# Load environment variables
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
# Removed Google Drive credentials configuration
# Use a simple text splitter approach
class SimpleTextSplitter:
def __init__(self, chunk_size=1000, chunk_overlap=200):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
    def split_documents(self, documents):
        chunks = []
        for doc in documents:
            text = doc.page_content
            start = 0
            while start < len(text):
                end = start + self.chunk_size
                chunk_text = text[start:end]
                chunk_doc = Document(page_content=chunk_text, metadata=doc.metadata.copy())
                chunks.append(chunk_doc)
                if end >= len(text):
                    break  # final chunk reached; avoid emitting a redundant overlapping tail
                start = end - self.chunk_overlap
        return chunks
CharacterTextSplitter = SimpleTextSplitter
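# Worked example (illustrative): with chunk_size=1000 and chunk_overlap=200, an
# 1,800-character document splits into text[0:1000] and text[800:1800], so
# consecutive chunks share 200 characters of context across the boundary.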
# Try different import paths for memory and chains
try:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
except ImportError:
try:
from langchain_core.memory import ConversationBufferMemory
from langchain_core.chains import ConversationalRetrievalChain
except ImportError:
try:
from langchain_community.memory import ConversationBufferMemory
from langchain_community.chains import ConversationalRetrievalChain
except ImportError:
print("Warning: Memory and chains modules not found. Creating simple alternatives.")
# Create simple alternatives
class ConversationBufferMemory:
def __init__(self, memory_key='chat_history', return_messages=True):
self.memory_key = memory_key
self.return_messages = return_messages
self.chat_memory = []
def save_context(self, inputs, outputs):
self.chat_memory.append((inputs, outputs))
def load_memory_variables(self, inputs):
return {self.memory_key: self.chat_memory}
class ConversationalRetrievalChain:
def __init__(self, llm, retriever, memory):
self.llm = llm
self.retriever = retriever
self.memory = memory
def invoke(self, inputs):
question = inputs.get("question", "")
# Simple implementation - just return a basic response
return {"answer": f"I received your question: {question}. This is a simplified response."}
# Removed Google Drive Integration Functions
# Document Processing Functions
def get_loader_for_file(file_path):
"""
Get the appropriate document loader based on file extension
"""
file_extension = os.path.splitext(file_path)[1].lower()
if file_extension == '.pdf':
return PyPDFLoader(file_path)
elif file_extension in ['.docx', '.doc'] and Docx2txtLoader:
return Docx2txtLoader(file_path)
elif file_extension in ['.xlsx', '.xls'] and UnstructuredExcelLoader:
return UnstructuredExcelLoader(file_path)
elif file_extension in ['.txt', '.md']:
return TextLoader(file_path, encoding='utf-8')
    else:
        # Default to the text loader for unknown types
        try:
            return TextLoader(file_path, encoding='utf-8')
        except Exception:
            return None
def load_document(file_path):
    """
    Load a document using the appropriate loader
    """
    loader = get_loader_for_file(file_path)
    if loader is None:
        return []
    try:
        return loader.load()
    except Exception as e:
        print(f"Error loading document {file_path}: {e}")
        return []
def process_documents(documents):
"""
Split documents into chunks for embedding
"""
text_splitter = CharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)
return chunks
# Knowledge Base Class
class KnowledgeBase:
def __init__(self, db_name=DB_NAME):
self.db_name = db_name
self.embeddings = OpenAIEmbeddings()
self.vectorstore = None
self.initialize_vectorstore()
def initialize_vectorstore(self):
"""
Initialize the vector store, loading from disk if it exists
"""
if os.path.exists(self.db_name):
self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
print(f"Loaded existing vector store with {self.vectorstore._collection.count()} documents")
else:
# Create empty vectorstore
self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
print("Created new vector store")
def add_documents(self, documents):
"""
Process and add documents to the vector store
"""
if not documents:
return False
chunks = process_documents(documents)
if not chunks:
return False
# Add to existing vectorstore
self.vectorstore.add_documents(chunks)
print(f"Added {len(chunks)} chunks to vector store")
return True
def get_retriever(self, k=4):
"""
Get a retriever for the vector store
"""
return self.vectorstore.as_retriever(search_kwargs={"k": k})
def visualize_vectors(self):
"""
Create a 3D visualization of the vector store
"""
try:
collection = self.vectorstore._collection
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
if result['embeddings'] is None or len(result['embeddings']) == 0:
print("No embeddings found in vector store")
return None
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
if len(vectors) < 2:
print("Not enough vectors for visualization (need at least 2)")
return None
# Get source info for coloring
sources = [metadata.get('source', 'unknown') for metadata in metadatas]
unique_sources = list(set(sources))
colors = [['blue', 'green', 'red', 'orange', 'purple', 'cyan'][unique_sources.index(s) % 6] for s in sources]
# Reduce dimensions for visualization
# Adjust perplexity based on number of samples
n_samples = len(vectors)
perplexity = min(30, max(1, n_samples - 1))
tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors)
# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
x=reduced_vectors[:, 0],
y=reduced_vectors[:, 1],
z=reduced_vectors[:, 2],
mode='markers',
marker=dict(size=5, color=colors, opacity=0.8),
text=[f"Source: {s}<br>Text: {d[:100]}..." for s, d in zip(sources, documents)],
hoverinfo='text'
)])
fig.update_layout(
title='3D Vector Store Visualization',
scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
width=900,
height=700,
margin=dict(r=20, b=10, l=10, t=40)
)
return fig
except Exception as e:
print(f"Error creating visualization: {e}")
return None
# Simple fallback chain implementation
class SimpleConversationalChain:
def __init__(self, llm, retriever, memory):
self.llm = llm
self.retriever = retriever
self.memory = memory
def invoke(self, inputs):
question = inputs.get("question", "")
        # Get relevant documents - the retriever API differs across LangChain versions
        try:
            docs = self.retriever.get_relevant_documents(question)
        except AttributeError:
            try:
                docs = self.retriever.invoke(question)
            except Exception:
                docs = []
context = "\n".join([doc.page_content for doc in docs[:3]]) if docs else "No relevant context found."
# Create a simple prompt
prompt = f"""Based on the following context, answer the question:
Context: {context}
Question: {question}
Answer:"""
# Get response from LLM
response = self.llm.invoke(prompt)
return {"answer": response.content if hasattr(response, 'content') else str(response)}
# Chat System Class
class ChatSystem:
def __init__(self, knowledge_base, model_name=MODEL):
self.knowledge_base = knowledge_base
self.model_name = model_name
self.llm = ChatOpenAI(temperature=0.7, model_name=self.model_name)
self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
self.conversation_chain = self._create_conversation_chain()
def _create_conversation_chain(self):
"""
Create a new conversation chain with the current retriever
"""
retriever = self.knowledge_base.get_retriever()
# Skip the problematic ConversationalRetrievalChain and use simple implementation
print("Using simple conversational chain implementation")
return SimpleConversationalChain(self.llm, retriever, self.memory)
def reset_conversation(self):
"""
Reset the conversation memory and chain
"""
self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
self.conversation_chain = self._create_conversation_chain()
return "Conversation has been reset."
def chat(self, question, history):
"""
Process a question and return the answer
"""
if not question.strip():
return "Please ask a question."
result = self.conversation_chain.invoke({"question": question})
return result["answer"]
def update_knowledge_base(self):
"""
Update the conversation chain with the latest knowledge base
"""
self.conversation_chain = self._create_conversation_chain()
# UI Functions
def handle_file_upload(files):
"""
Process uploaded files and add them to the knowledge base
"""
if not files:
return "No files uploaded."
documents = []
for file in files:
try:
docs = load_document(file.name)
if docs:
# Add upload source metadata
for doc in docs:
doc.metadata['source'] = 'upload'
doc.metadata['filename'] = os.path.basename(file.name)
documents.extend(docs)
except Exception as e:
print(f"Error processing file {file.name}: {e}")
if documents:
success = kb.add_documents(documents)
if success:
# Update the chat system with new knowledge
chat_system.update_knowledge_base()
return f"Successfully processed {len(documents)} documents."
return "No documents could be processed. Please check file formats."
def create_ui():
"""
Create the Gradio UI
"""
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("""
# Knowledge Worker
Upload documents or ask questions about your knowledge base.
""")
with gr.Tabs():
with gr.TabItem("Chat"):
chatbot = gr.ChatInterface(
chat_system.chat,
chatbot=gr.Chatbot(height=500, type="messages"),
textbox=gr.Textbox(placeholder="Ask a question about your documents...", container=False),
title="Knowledge Worker Chat",
type="messages"
)
reset_btn = gr.Button("Reset Conversation")
reset_btn.click(chat_system.reset_conversation, inputs=None, outputs=gr.Textbox())
with gr.TabItem("Upload Documents"):
with gr.Column():
file_output = gr.Textbox(label="Upload Status")
upload_button = gr.UploadButton(
"Click to Upload Files",
file_types=[".pdf", ".docx", ".txt", ".md", ".xlsx"],
file_count="multiple"
)
upload_button.upload(handle_file_upload, upload_button, file_output)
with gr.TabItem("Visualize Knowledge"):
visualize_btn = gr.Button("Generate Vector Visualization")
plot_output = gr.Plot(label="Vector Space Visualization")
visualize_btn.click(kb.visualize_vectors, inputs=None, outputs=plot_output)
return app
def main():
"""
Main function to initialize and run the knowledge worker
"""
global kb, chat_system
print("=" * 60)
print("Initializing Knowledge Worker...")
print("=" * 60)
try:
# Initialize the knowledge base
print("Setting up vector database...")
kb = KnowledgeBase(DB_NAME)
print("Vector database initialized successfully")
# Google Drive integration removed
# Initialize the chat system
print("\nSetting up chat system...")
chat_system = ChatSystem(kb)
print("Chat system initialized successfully")
# Launch the Gradio app
print("\nLaunching Gradio interface...")
print("=" * 60)
print("The web interface will open in your browser")
print("You can also access it at the URL shown below")
print("=" * 60)
app = create_ui()
app.launch(inbrowser=True)
except Exception as e:
print(f"Error initializing Knowledge Worker: {e}")
print("Please check your configuration and try again.")
return
if __name__ == "__main__":
main()


@@ -0,0 +1,623 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6f0f38e7",
"metadata": {},
"source": [
"# Email Mindmap Demo (Week 5 Community Contribution)\n",
"\n",
"Welcome to the **Email Mindmap Demo** notebook! This demo walks you through a workflow for exploring and visualizing email relationships using embeddings and mindmaps.\n",
"\n",
"---\n",
"\n",
"## 📋 Workflow Overview\n",
"\n",
"1. **Load/Create Synthetic Email Data** \n",
" Generate or load varied types of emails: work, personal, family, subscriptions, etc.\n",
"\n",
"2. **Generate Embeddings** \n",
" Use an open-source model to create vector embeddings for email content.\n",
"\n",
"3. **Build & Visualize a Mindmap** \n",
" Construct a mindmap of email relationships and visualize it interactively using `networkx` and `matplotlib`.\n",
"\n",
"4. **Question-Answering Interface** \n",
" Query the email content and the mindmap using a simple Q&A interface powered by Gradio.\n",
"\n",
"---\n",
"\n",
"## ⚙️ Requirements\n",
"\n",
"> **Tip:** \n",
"> I'm including an example of the synthetic emails in case you don't want to run that part.\n",
"> Might need to install other libraries like pyvis, nbformat and faiss-cpu\n",
"\n",
"\n",
"## ✨ Features\n",
"\n",
"- Synthetic generation of varied emails (work, personal, family, subscriptions)\n",
"- Embedding generation with open-source models (hugging face sentence-transformer)\n",
"- Interactive mindmap visualization (`networkx`, `pyvis`)\n",
"- Simple chatbot interface (Gradio) and visualization of mindmap created\n",
"\n",
"---\n",
"\n"
]
},
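{
"cell_type": "markdown",
"id": "b1e2a3d0",
"metadata": {},
"source": [
"At a glance, the pipeline boils down to four calls, all defined in the cells below (shown here as a commented sketch rather than runnable code at this point):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1e2a3d1",
"metadata": {},
"outputs": [],
"source": [
"# Sketch of the full workflow - every name here is defined later in this notebook:\n",
"# emails = generate_synthetic_emails(persona_description, 100, '2024-06-01', '2025-01-01')  # step 1\n",
"# vectorstore = FAISS.from_documents(chunks, embedding=embeddings_model)                    # step 2\n",
"# html = build_mindmap_html(emails, all_embeddings, threshold=0.6)                          # step 3\n",
"# demo.launch(inbrowser=True)                                                               # step 4"
]
},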
{
"cell_type": "code",
"execution_count": 1,
"id": "a9aeb363",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key exists and begins sk-ant-\n",
"Google API Key exists and begins AI\n",
"OLLAMA API Key exists and begins 36\n"
]
}
],
"source": [
"# imports\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"\n",
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"ollama_api_key = os.getenv('OLLAMA_API_KEY')\n",
"\n",
"if openai_api_key:\n",
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
"else:\n",
" print(\"OpenAI API Key not set\")\n",
" \n",
"if anthropic_api_key:\n",
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
"else:\n",
" print(\"Anthropic API Key not set (and this is optional)\")\n",
"\n",
"if google_api_key:\n",
" print(f\"Google API Key exists and begins {google_api_key[:2]}\")\n",
"else:\n",
" print(\"Google API Key not set (and this is optional)\")\n",
"\n",
"if ollama_api_key:\n",
" print(f\"OLLAMA API Key exists and begins {ollama_api_key[:2]}\")\n",
"else:\n",
" print(\"OLLAMA API Key not set (and this is optional)\")\n",
"\n",
"# Connect to client libraries\n",
"\n",
"openai = OpenAI()\n",
"\n",
"anthropic_url = \"https://api.anthropic.com/v1/\"\n",
"gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
"ollama_url = \"http://localhost:11434/v1\"\n",
"\n",
"anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)\n",
"gemini = OpenAI(api_key=google_api_key, base_url=gemini_url)\n",
"ollama = OpenAI(api_key=ollama_api_key, base_url=ollama_url)\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "b8ddce62",
"metadata": {},
"source": [
"## Preparation of synthetic data (could have been week2 work)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2e250912",
"metadata": {},
"outputs": [],
"source": [
"#using ollama gpt oss 120b cloud i'm going to create synthetic emails using a persona.\n",
"#they are going to be saved in a json file with different keys\n",
"from pydantic import BaseModel, Field\n",
"from typing import List, Optional\n",
"\n",
"\n",
"class Email(BaseModel):\n",
" sender: str = Field(description=\"Email address of the sender\")\n",
" subject: str = Field(description=\"Email subject line\")\n",
" body: str = Field(description=\"Email body content\")\n",
" timestamp: str = Field(description=\"ISO 8601 timestamp when email was received\")\n",
" category: str = Field(description=\"Category of the email\")\n",
"\n",
"class EmailBatch(BaseModel):\n",
" emails: List[Email] = Field(description=\"List of generated emails\")\n"
]
},
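{
"cell_type": "markdown",
"id": "b1e2a3d2",
"metadata": {},
"source": [
"For reference, one generated `Email` serialized with `model_dump()` looks roughly like this (a hand-written illustration; the values are made up, not real model output):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1e2a3d3",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative record only - field names match the Email model above, values are invented.\n",
"example_email = Email(\n",
"    sender=\"hr@acme-corp.example\",\n",
"    subject=\"Q3 performance review scheduled\",\n",
"    body=\"Hi John, your review is booked for Friday at 10am. Please bring your self-assessment.\",\n",
"    timestamp=\"2024-07-12T09:30:00\",\n",
"    category=\"Work\"\n",
")\n",
"example_email.model_dump()"
]
},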
{
"cell_type": "code",
"execution_count": 3,
"id": "1f67fdb3",
"metadata": {},
"outputs": [],
"source": [
"def create_persona(name: str, age: int, occupation: str, \n",
" interests: List[str], family_status: str) -> str:\n",
" persona = f\"\"\"\n",
" You are generating synthetic emails for a realistic inbox simulation.\n",
"\n",
" **Person Profile:**\n",
" - Name: {name}\n",
" - Age: {age}\n",
" - Occupation: {occupation}\n",
" - Interests: {', '.join(interests)}\n",
" - Family Status: {family_status}\n",
"\n",
" **Email Categories to Include:**\n",
" 1. **Work Emails**: Project updates, meeting invitations, colleague communications, \n",
" performance reviews, company announcements\n",
" 2. **Purchases**: Order confirmations, shipping notifications, delivery updates, \n",
" receipts from various retailers (Amazon, local shops, etc.)\n",
" 3. **Subscriptions**: Newsletter updates, streaming services (Netflix, Spotify), \n",
" software subscriptions (Adobe, Microsoft 365), magazine subscriptions\n",
" 4. **Family**: Communications with parents, siblings, children, extended family members,\n",
" family event planning, photo sharing\n",
" 5. **Friends**: Social plans, birthday wishes, casual conversations, group hangouts,\n",
" catching up messages\n",
" 6. **Finance**: Bank statements, credit card bills, investment updates, tax documents,\n",
" payment reminders\n",
" 7. **Social Media**: Facebook notifications, LinkedIn updates, Instagram activity,\n",
" Twitter mentions\n",
" 8. **Personal**: Doctor appointments, gym memberships, utility bills, insurance updates\n",
"\n",
" **Instructions:**\n",
" - Generate realistic email content that reflects the person's life over time\n",
" - Include temporal patterns (more work emails on weekdays, more personal on weekends)\n",
" - Create realistic sender names and email addresses\n",
" - Vary email length and formality based on context\n",
" - Include realistic subject lines\n",
" - Make emails interconnected when appropriate (e.g., follow-up emails, conversation threads)\n",
" - Include seasonal events (holidays, birthdays, annual renewals)\n",
" \"\"\"\n",
" return persona\n",
"\n",
"persona_description = create_persona(\n",
" name=\"John Doe\",\n",
" age=30,\n",
" occupation=\"Software Engineer\",\n",
" interests=[\"technology\", \"reading\", \"traveling\"],\n",
" family_status=\"single\"\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cec185e3",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"from datetime import datetime, timedelta\n",
"import random\n",
"from typing import List\n",
"\n",
"def generate_synthetic_emails(\n",
" persona_description: str,\n",
" num_emails: int,\n",
" start_date: str,\n",
" end_date: str,\n",
" model: str = \"gpt-4o-2024-08-06\"\n",
") -> List[Email]:\n",
" \"\"\"\n",
" NEEDS TO WORK WITH OPENAI MODELS BECAUSE OF PARSED (STRUC OUTPUT) MODELS\n",
" Generates synthetic emails using OpenAI's structured output feature.\n",
" \n",
" Args:\n",
" persona_description: Detailed persona description\n",
" num_emails: Number of emails to generate per batch\n",
" start_date: Start date for email timestamps\n",
" end_date: End date for email timestamps\n",
" model: OpenAI model to use (must support structured outputs)\n",
" \n",
" Returns:\n",
" List of Email objects\n",
" \"\"\"\n",
" \n",
" # Calculate date range for context\n",
" date_range_context = f\"\"\"\n",
" Generate emails with timestamps between {start_date} and {end_date}.\n",
" Distribute emails naturally across this time period, with realistic patterns:\n",
" - More emails during business hours on weekdays\n",
" - Fewer emails late at night\n",
" - Occasional weekend emails\n",
" - Bursts of activity around events or busy periods\n",
" \"\"\"\n",
" \n",
" # System message combining persona and structure instructions\n",
" system_message = f\"\"\"\n",
" {persona_description}\n",
"\n",
" {date_range_context}\n",
"\n",
" Generate {num_emails} realistic emails that fit this person's life. \n",
" Ensure variety in categories, senders, and content while maintaining realism.\n",
" \"\"\"\n",
" \n",
" try:\n",
" client = OpenAI()\n",
"\n",
" response = client.chat.completions.parse(\n",
" model=model,\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": system_message\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"Generate {num_emails} diverse, realistic emails for this person's inbox.\"\n",
" }\n",
" ],\n",
" response_format=EmailBatch,\n",
" )\n",
" return response.choices[0].message.parsed.emails\n",
" \n",
" except Exception as e:\n",
" print(f\"Error generating emails: {e}\")\n",
" return []\n",
"\n",
"\n",
"def save_emails_to_json(emails: List[Email], filename: str):\n",
" \"\"\"\n",
" Saves emails to a JSON file.\n",
" \"\"\"\n",
" import json\n",
" \n",
" emails_dict = [email.model_dump() for email in emails]\n",
" \n",
" with open(filename, 'w', encoding='utf-8') as f:\n",
" json.dump(emails_dict, f, indent=2, ensure_ascii=False)\n",
" \n",
" print(f\"Saved {len(emails)} emails to {filename}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "be31f352",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"now\n"
]
}
],
"source": [
"mails_2 = generate_synthetic_emails(\n",
" persona_description = persona_description,\n",
" num_emails = 100,\n",
" start_date = '2024-06-01',\n",
" end_date = '2025-01-01',\n",
" model = \"gpt-4o\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "24d844f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 101 emails to emails2.json\n"
]
}
],
"source": [
"save_emails_to_json(mails_2, 'emails2.json')"
]
},
{
"cell_type": "markdown",
"id": "2b9c704e",
"metadata": {},
"source": [
"## Create embeddings for the mails\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "777012f8",
"metadata": {},
"outputs": [],
"source": [
"# imports for langchain, plotly and Chroma\n",
"\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.schema import Document\n",
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
"from langchain_chroma import Chroma\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.manifold import TSNE\n",
"import numpy as np\n",
"import plotly.graph_objects as go\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"import json\n",
"from langchain.vectorstores import FAISS\n",
"\n",
"#MODEL = \"gpt-4o-mini\"\n",
"db_name = \"vector_db\""
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "ce95d9c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total number of chunks: 206\n",
"Sample metadata fields: ['sender', 'timestamp', 'category']\n"
]
}
],
"source": [
"# Read in emails from the emails.json file and construct LangChain documents\n",
"\n",
"\n",
"with open(\"emails.json\", \"r\", encoding=\"utf-8\") as f:\n",
" emails = json.load(f)\n",
"\n",
"documents = []\n",
"for email in emails:\n",
" # Extract metadata (all fields except 'content')\n",
" metadata = {k: v for k, v in email.items() if k in ['sender','category','timestamp']}\n",
" body = email.get(\"body\", \"\")\n",
" documents.append(Document(page_content=body, metadata=metadata))\n",
"\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"chunks = text_splitter.split_documents(documents)\n",
"\n",
"print(f\"Total number of chunks: {len(chunks)}\")\n",
"print(f\"Sample metadata fields: {list(documents[0].metadata.keys()) if documents else []}\")\n",
"\n",
"embeddings_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"\n",
"if os.path.exists(db_name):\n",
" Chroma(persist_directory=db_name, embedding_function=embeddings_model).delete_collection()\n",
"\n",
"vectorstore = FAISS.from_documents(chunks, embedding=embeddings_model)\n",
"\n",
"all_embeddings = [vectorstore.index.reconstruct(i) for i in range(vectorstore.index.ntotal)]\n",
"\n",
"total_vectors = vectorstore.index.ntotal\n",
"dimensions = vectorstore.index.d\n"
]
},
{
"cell_type": "markdown",
"id": "78ca65bb",
"metadata": {},
"source": [
"## Visualizing mindmap"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "a99dd2d6",
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import plotly.graph_objects as go\n",
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.manifold import TSNE # Or use UMAP\n",
"from pyvis.network import Network\n",
"\n",
"# Here, emails is your list of email objects, with .subject or .body\n",
"\n",
"# Build similarity graph\n",
"def build_mindmap_html(emails, all_embeddings, threshold=0.6):\n",
" similarity = cosine_similarity(all_embeddings)\n",
"\n",
" G = nx.Graph()\n",
" for i, email in enumerate(emails):\n",
" G.add_node(i, label=email['subject'][:80], title=email['body'][:50]) # Custom hover text\n",
"\n",
" for i in range(len(emails)):\n",
" for j in range(i+1, len(emails)):\n",
" if similarity[i][j] > threshold:\n",
" G.add_edge(i, j, weight=float(similarity[i][j]))\n",
"\n",
" # Convert to pyvis network\n",
" nt = Network(notebook=True, height='700px', width='100%', bgcolor='#222222', font_color='white')\n",
" nt.from_nx(G)\n",
" html = nt.generate_html().replace(\"'\", \"\\\"\")\n",
" return html\n"
]
},
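{
"cell_type": "markdown",
"id": "b1e2a3d4",
"metadata": {},
"source": [
"A quick sanity check before wiring the graph into Gradio: write the pyvis HTML to disk and open it in a browser (a minimal sketch; `emails` and `all_embeddings` come from the cells above, and `mindmap_preview.html` is just an illustrative filename):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1e2a3d5",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative preview only: render the mindmap outside Gradio.\n",
"preview_html = build_mindmap_html(emails, all_embeddings, threshold=0.6)\n",
"with open('mindmap_preview.html', 'w', encoding='utf-8') as f:\n",
"    f.write(preview_html)\n",
"print('Wrote mindmap_preview.html - open it in a browser to inspect the graph')"
]
},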
{
"cell_type": "markdown",
"id": "53a2fbaf",
"metadata": {},
"source": [
"## Putting it all together in a gradio.\n",
"It needs to have an interface to make questions, and the visual to see the mindmap.\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "161144ac",
"metadata": {},
"outputs": [],
"source": [
"# create a new Chat with OpenAI\n",
"MODEL=\"gpt-4o-mini\"\n",
"llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
"\n",
"# set up the conversation memory for the chat\n",
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
"\n",
"# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
"retriever = vectorstore.as_retriever()\n",
"from langchain_core.callbacks import StdOutCallbackHandler\n",
"\n",
"# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory\n",
"conversation_chain_debug = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
"\n",
"# Wrapping that in a function\n",
"\n",
"def chat(question, history):\n",
" result = conversation_chain.invoke({\"question\": question})\n",
" return result[\"answer\"]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "16a4d8d1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n",
"\n",
"The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"* Running on local URL: http://127.0.0.1:7878\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7878/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n"
]
}
],
"source": [
"\n",
"import gradio as gr\n",
"\n",
"def show_mindmap():\n",
" # Call build_mindmap_html to generate the HTML\n",
" html = build_mindmap_html(emails, all_embeddings)\n",
" return f\"\"\"<iframe style=\"width: 100%; height: 600px;margin:0 auto\" name=\"result\" allow=\"midi; geolocation; microphone; camera; \n",
" display-capture; encrypted-media;\" sandbox=\"allow-modals allow-forms \n",
" allow-scripts allow-same-origin allow-popups \n",
" allow-top-navigation-by-user-activation allow-downloads\" allowfullscreen=\"\" \n",
" allowpaymentrequest=\"\" frameborder=\"0\" srcdoc='{html}'></iframe>\"\"\"\n",
"\n",
"\n",
"with gr.Blocks(title=\"Mindmap & Email Chatbot\") as demo:\n",
" gr.Markdown(\"# 📧 Mindmap Visualization & Email QA Chatbot\")\n",
" with gr.Row():\n",
" chatbot = gr.ChatInterface(fn=chat, title=\"Ask about your emails\",\n",
" examples=[\n",
" \"What is my most important message?\",\n",
" \"Who have I been communicating with?\",\n",
" \"Summarize recent emails\"\n",
" ],\n",
")\n",
" mindmap_html = gr.HTML(\n",
" show_mindmap,\n",
" label=\"🧠 Mindmap of Your Emails\",\n",
" )\n",
" # Reduce height: update show_mindmap (elsewhere) to ~400px, or do inline replace for the demo here:\n",
" # mindmap_html = gr.HTML(lambda: show_mindmap().replace(\"height: 600px\", \"height: 400px\"))\n",
" \n",
"demo.launch(inbrowser=True)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "221a9d98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}