Merge branch 'ed-donner:main' into main
This commit is contained in:
445
week5/community-contributions/w5d5_worker.py
Normal file
445
week5/community-contributions/w5d5_worker.py
Normal file
@@ -0,0 +1,445 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Knowledge Worker with Document Upload and Google Drive Integration
|
||||
|
||||
This script creates a knowledge worker that:
|
||||
1. Allows users to upload documents through a Gradio UI
|
||||
2. Integrates with Google Drive to access documents
|
||||
3. Uses Chroma vector database for efficient document retrieval
|
||||
4. Implements RAG (Retrieval Augmented Generation) for accurate responses
|
||||
|
||||
The system updates its context dynamically when new documents are uploaded.
|
||||
"""
|
||||
|
||||
import os
|
||||
import glob
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import gradio as gr
|
||||
|
||||
# LangChain imports
|
||||
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
||||
from langchain_chroma import Chroma
|
||||
|
||||
# Visualization imports
|
||||
import numpy as np
|
||||
from sklearn.manifold import TSNE
|
||||
import plotly.graph_objects as go
|
||||
|
||||
# Removed Google Drive API imports
|
||||
|
||||
# Additional document loaders
|
||||
try:
|
||||
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredExcelLoader
|
||||
except ImportError:
|
||||
print("Warning: Some document loaders not available. PDF and text files will still work.")
|
||||
Docx2txtLoader = None
|
||||
UnstructuredExcelLoader = None
|
||||
|
||||
# Configuration
MODEL = "gpt-4o-mini"  # Using a cost-effective model
DB_NAME = "knowledge_worker_db"  # doubles as the Chroma persistence directory (relative path)
UPLOAD_FOLDER = "uploaded_documents"  # destination folder for user uploads

# Create upload folder if it doesn't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Load environment variables
load_dotenv(override=True)
# Re-export the key with a placeholder fallback so downstream clients fail with
# a clear authentication error instead of a missing-variable KeyError.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
|
||||
|
||||
# Removed Google Drive credentials configuration
|
||||
|
||||
# Use a simple text splitter approach
class SimpleTextSplitter:
    """
    Minimal fixed-size character splitter with overlap.

    Splits each document's page_content into chunks of at most
    ``chunk_size`` characters, with consecutive chunks sharing
    ``chunk_overlap`` characters. Each chunk receives a copy of the
    source document's metadata.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        # The overlap must be strictly smaller than the chunk size; otherwise
        # `start = end - chunk_overlap` in split_documents never advances and
        # the loop runs forever. Fail fast instead of hanging.
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_documents(self, documents):
        """Return a flat list of chunk Documents for all input documents."""
        chunks = []
        for doc in documents:
            text = doc.page_content
            start = 0
            while start < len(text):
                end = start + self.chunk_size
                chunks.append(Document(page_content=text[start:end], metadata=doc.metadata.copy()))
                if end >= len(text):
                    # Done with this document. Stepping back by the overlap here
                    # (as the original code did) would emit one extra trailing
                    # chunk that is entirely contained in the chunk just added.
                    break
                start = end - self.chunk_overlap
        return chunks


# The rest of the file expects the LangChain name; alias the local splitter.
CharacterTextSplitter = SimpleTextSplitter
|
||||
|
||||
# Try different import paths for memory and chains -- these classes moved
# between LangChain releases, so probe each known location in turn and fall
# back to minimal local stand-ins when none is importable.
try:
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain
except ImportError:
    try:
        from langchain_core.memory import ConversationBufferMemory
        from langchain_core.chains import ConversationalRetrievalChain
    except ImportError:
        try:
            from langchain_community.memory import ConversationBufferMemory
            from langchain_community.chains import ConversationalRetrievalChain
        except ImportError:
            print("Warning: Memory and chains modules not found. Creating simple alternatives.")

            # Create simple alternatives exposing the same call surface used below.
            class ConversationBufferMemory:
                """Minimal stand-in: stores (inputs, outputs) tuples in a list."""

                def __init__(self, memory_key='chat_history', return_messages=True):
                    self.memory_key = memory_key
                    self.return_messages = return_messages
                    self.chat_memory = []

                def save_context(self, inputs, outputs):
                    # Append one exchange; no trimming or token accounting.
                    self.chat_memory.append((inputs, outputs))

                def load_memory_variables(self, inputs):
                    # `inputs` is accepted for interface parity but ignored.
                    return {self.memory_key: self.chat_memory}

            class ConversationalRetrievalChain:
                """Placeholder chain: echoes the question without doing retrieval."""

                def __init__(self, llm, retriever, memory):
                    self.llm = llm
                    self.retriever = retriever
                    self.memory = memory

                def invoke(self, inputs):
                    question = inputs.get("question", "")
                    # Simple implementation - just return a basic response
                    return {"answer": f"I received your question: {question}. This is a simplified response."}
|
||||
|
||||
# Removed Google Drive Integration Functions
|
||||
|
||||
# Document Processing Functions
def get_loader_for_file(file_path):
    """
    Return a document loader appropriate for the file's extension.

    Known extensions map to their dedicated loaders; .docx/.xlsx loaders are
    used only when the optional packages imported at module top are present.
    Unknown extensions fall back to a UTF-8 text loader. Returns None when no
    loader can be constructed.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        return PyPDFLoader(file_path)
    elif file_extension in ['.docx', '.doc'] and Docx2txtLoader:
        return Docx2txtLoader(file_path)
    elif file_extension in ['.xlsx', '.xls'] and UnstructuredExcelLoader:
        return UnstructuredExcelLoader(file_path)
    elif file_extension in ['.txt', '.md']:
        return TextLoader(file_path, encoding='utf-8')
    else:
        # Default to text loader for unknown types
        try:
            return TextLoader(file_path, encoding='utf-8')
        except Exception:
            # Was a bare `except:`, which would also swallow SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            return None
|
||||
|
||||
def load_document(file_path):
    """
    Load one file into a list of LangChain documents.

    Returns an empty list when no suitable loader exists or loading fails
    (the error is printed, never raised to the caller).
    """
    doc_loader = get_loader_for_file(file_path)
    if doc_loader is None:
        return []
    try:
        return doc_loader.load()
    except Exception as exc:
        print(f"Error loading document {file_path}: {exc}")
        return []
|
||||
|
||||
def process_documents(documents):
    """
    Split documents into 1000-character chunks (200-character overlap)
    ready for embedding.
    """
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
|
||||
|
||||
# Knowledge Base Class
class KnowledgeBase:
    """
    Persistent Chroma-backed document store.

    Handles loading/creating the on-disk vector store, adding chunked
    documents, exposing a retriever, and rendering a 3D t-SNE visualization
    of the stored embeddings.
    """

    def __init__(self, db_name=DB_NAME):
        # db_name is also the Chroma persistence directory path on disk.
        self.db_name = db_name
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = None
        self.initialize_vectorstore()

    def initialize_vectorstore(self):
        """
        Initialize the vector store, loading from disk if it exists
        """
        if os.path.exists(self.db_name):
            self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
            # NOTE(review): _collection is a private Chroma attribute; used here
            # only to report how many documents the existing store holds.
            print(f"Loaded existing vector store with {self.vectorstore._collection.count()} documents")
        else:
            # Create empty vectorstore
            self.vectorstore = Chroma(persist_directory=self.db_name, embedding_function=self.embeddings)
            print("Created new vector store")

    def add_documents(self, documents):
        """
        Process and add documents to the vector store.

        Returns True when at least one chunk was embedded and stored,
        False for empty input or when chunking produced nothing.
        """
        if not documents:
            return False

        chunks = process_documents(documents)
        if not chunks:
            return False

        # Add to existing vectorstore
        self.vectorstore.add_documents(chunks)
        print(f"Added {len(chunks)} chunks to vector store")
        return True

    def get_retriever(self, k=4):
        """
        Get a retriever for the vector store.

        k: number of nearest-neighbour chunks returned per query.
        """
        return self.vectorstore.as_retriever(search_kwargs={"k": k})

    def visualize_vectors(self):
        """
        Create a 3D visualization of the vector store.

        Returns a plotly Figure, or None when there are no/too few vectors
        or anything fails (errors are printed, never raised).
        """
        try:
            # NOTE(review): reaches into Chroma's private _collection API to
            # pull raw embeddings -- may break across chromadb versions.
            collection = self.vectorstore._collection
            result = collection.get(include=['embeddings', 'documents', 'metadatas'])

            if result['embeddings'] is None or len(result['embeddings']) == 0:
                print("No embeddings found in vector store")
                return None

            vectors = np.array(result['embeddings'])
            documents = result['documents']
            metadatas = result['metadatas']

            if len(vectors) < 2:
                print("Not enough vectors for visualization (need at least 2)")
                return None

            # Get source info for coloring (six colors, cycled per unique source)
            sources = [metadata.get('source', 'unknown') for metadata in metadatas]
            unique_sources = list(set(sources))
            colors = [['blue', 'green', 'red', 'orange', 'purple', 'cyan'][unique_sources.index(s) % 6] for s in sources]

            # Reduce dimensions for visualization
            # Adjust perplexity based on number of samples: scikit-learn's TSNE
            # requires perplexity < n_samples.
            n_samples = len(vectors)
            perplexity = min(30, max(1, n_samples - 1))

            tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
            reduced_vectors = tsne.fit_transform(vectors)

            # Create the 3D scatter plot; hover text shows source + text preview
            fig = go.Figure(data=[go.Scatter3d(
                x=reduced_vectors[:, 0],
                y=reduced_vectors[:, 1],
                z=reduced_vectors[:, 2],
                mode='markers',
                marker=dict(size=5, color=colors, opacity=0.8),
                text=[f"Source: {s}<br>Text: {d[:100]}..." for s, d in zip(sources, documents)],
                hoverinfo='text'
            )])

            fig.update_layout(
                title='3D Vector Store Visualization',
                scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
                width=900,
                height=700,
                margin=dict(r=20, b=10, l=10, t=40)
            )

            return fig

        except Exception as e:
            print(f"Error creating visualization: {e}")
            return None
|
||||
|
||||
# Simple fallback chain implementation
class SimpleConversationalChain:
    """
    Minimal RAG chain: retrieve top documents, stuff them into a prompt,
    and ask the LLM. Drop-in replacement for ConversationalRetrievalChain's
    invoke() interface.
    """

    def __init__(self, llm, retriever, memory):
        self.llm = llm
        self.retriever = retriever
        # Kept for interface parity with ConversationalRetrievalChain;
        # NOTE(review): invoke() never reads or updates it, so this chain
        # has no conversational memory.
        self.memory = memory

    def invoke(self, inputs):
        """Answer inputs['question'] using retrieved context; returns {'answer': str}."""
        question = inputs.get("question", "")
        # Get relevant documents - the retriever API differs across LangChain versions
        try:
            docs = self.retriever.get_relevant_documents(question)
        except AttributeError:
            try:
                docs = self.retriever.invoke(question)
            except Exception:
                # Was a bare `except:`, which would also swallow SystemExit and
                # KeyboardInterrupt; narrowed to Exception.
                docs = []

        # Use at most the top 3 chunks as context.
        context = "\n".join([doc.page_content for doc in docs[:3]]) if docs else "No relevant context found."

        # Create a simple prompt
        prompt = f"""Based on the following context, answer the question:

Context: {context}

Question: {question}

Answer:"""

        # Get response from LLM
        response = self.llm.invoke(prompt)
        # Chat models return a message object with .content; fall back to str().
        return {"answer": response.content if hasattr(response, 'content') else str(response)}
|
||||
|
||||
# Chat System Class
class ChatSystem:
    """
    Conversational front-end over a KnowledgeBase.

    Owns the LLM, the conversation memory, and the retrieval chain; the
    chain is rebuilt whenever the knowledge base changes or the
    conversation is reset.
    """

    def __init__(self, knowledge_base, model_name=MODEL):
        self.knowledge_base = knowledge_base
        self.model_name = model_name
        # temperature=0.7 keeps answers conversational rather than deterministic.
        self.llm = ChatOpenAI(temperature=0.7, model_name=self.model_name)
        self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        self.conversation_chain = self._create_conversation_chain()

    def _create_conversation_chain(self):
        """
        Create a new conversation chain with the current retriever
        """
        retriever = self.knowledge_base.get_retriever()
        # Skip the problematic ConversationalRetrievalChain and use simple implementation
        print("Using simple conversational chain implementation")
        return SimpleConversationalChain(self.llm, retriever, self.memory)

    def reset_conversation(self):
        """
        Reset the conversation memory and chain.

        Returns a status string for display in the UI.
        """
        self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        self.conversation_chain = self._create_conversation_chain()
        return "Conversation has been reset."

    def chat(self, question, history):
        """
        Process a question and return the answer.

        `history` is supplied by gr.ChatInterface but is unused here; any
        conversational state lives in the underlying chain.
        """
        if not question.strip():
            return "Please ask a question."

        result = self.conversation_chain.invoke({"question": question})
        return result["answer"]

    def update_knowledge_base(self):
        """
        Update the conversation chain with the latest knowledge base
        """
        self.conversation_chain = self._create_conversation_chain()
|
||||
|
||||
# UI Functions
def handle_file_upload(files):
    """
    Add user-uploaded files to the knowledge base.

    Loads every file, tags it with upload metadata, stores the resulting
    documents, refreshes the chat system, and returns a status message
    for the UI.
    """
    if not files:
        return "No files uploaded."

    collected = []
    for uploaded in files:
        try:
            loaded = load_document(uploaded.name)
            if loaded:
                # Tag each document so the visualization can color by origin.
                for doc in loaded:
                    doc.metadata['source'] = 'upload'
                    doc.metadata['filename'] = os.path.basename(uploaded.name)
                collected.extend(loaded)
        except Exception as e:
            print(f"Error processing file {uploaded.name}: {e}")

    # Short-circuit: only refresh the chat system when something was stored.
    if collected and kb.add_documents(collected):
        chat_system.update_knowledge_base()
        return f"Successfully processed {len(collected)} documents."

    return "No documents could be processed. Please check file formats."
|
||||
|
||||
def create_ui():
    """
    Create the Gradio UI.

    Three tabs: a chat interface backed by the global `chat_system`, a file
    upload tab feeding handle_file_upload, and a t-SNE visualization of the
    global knowledge base `kb`.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # Knowledge Worker
        Upload documents or ask questions about your knowledge base.
        """)

        with gr.Tabs():
            with gr.TabItem("Chat"):
                chatbot = gr.ChatInterface(
                    chat_system.chat,
                    chatbot=gr.Chatbot(height=500, type="messages"),
                    textbox=gr.Textbox(placeholder="Ask a question about your documents...", container=False),
                    title="Knowledge Worker Chat",
                    type="messages"
                )
                reset_btn = gr.Button("Reset Conversation")
                # reset_conversation returns a status string shown in a fresh textbox.
                reset_btn.click(chat_system.reset_conversation, inputs=None, outputs=gr.Textbox())

            with gr.TabItem("Upload Documents"):
                with gr.Column():
                    file_output = gr.Textbox(label="Upload Status")
                    upload_button = gr.UploadButton(
                        "Click to Upload Files",
                        file_types=[".pdf", ".docx", ".txt", ".md", ".xlsx"],
                        file_count="multiple"
                    )
                    upload_button.upload(handle_file_upload, upload_button, file_output)

            with gr.TabItem("Visualize Knowledge"):
                visualize_btn = gr.Button("Generate Vector Visualization")
                plot_output = gr.Plot(label="Vector Space Visualization")
                # visualize_vectors returns a plotly Figure (or None on failure).
                visualize_btn.click(kb.visualize_vectors, inputs=None, outputs=plot_output)

    return app
|
||||
|
||||
def main():
    """
    Initialize the knowledge base and chat system, then launch the Gradio UI.

    Populates the module-level globals `kb` and `chat_system` that the UI
    callbacks rely on; any initialization failure is reported and the
    function returns without launching.
    """
    global kb, chat_system

    banner = "=" * 60
    print(banner)
    print("Initializing Knowledge Worker...")
    print(banner)

    try:
        # Initialize the knowledge base
        print("Setting up vector database...")
        kb = KnowledgeBase(DB_NAME)
        print("Vector database initialized successfully")

        # Initialize the chat system
        print("\nSetting up chat system...")
        chat_system = ChatSystem(kb)
        print("Chat system initialized successfully")

        # Launch the Gradio app
        print("\nLaunching Gradio interface...")
        print(banner)
        print("The web interface will open in your browser")
        print("You can also access it at the URL shown below")
        print(banner)

        create_ui().launch(inbrowser=True)

    except Exception as e:
        print(f"Error initializing Knowledge Worker: {e}")
        print("Please check your configuration and try again.")
        return
|
||||
|
||||
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
623
week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb
Normal file
623
week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb
Normal file
@@ -0,0 +1,623 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f0f38e7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Email Mindmap Demo (Week 5 Community Contribution)\n",
|
||||
"\n",
|
||||
"Welcome to the **Email Mindmap Demo** notebook! This demo walks you through a workflow for exploring and visualizing email relationships using embeddings and mindmaps.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 📋 Workflow Overview\n",
|
||||
"\n",
|
||||
"1. **Load/Create Synthetic Email Data** \n",
|
||||
" Generate or load varied types of emails: work, personal, family, subscriptions, etc.\n",
|
||||
"\n",
|
||||
"2. **Generate Embeddings** \n",
|
||||
" Use an open-source model to create vector embeddings for email content.\n",
|
||||
"\n",
|
||||
"3. **Build & Visualize a Mindmap** \n",
|
||||
" Construct a mindmap of email relationships and visualize it interactively using `networkx` and `matplotlib`.\n",
|
||||
"\n",
|
||||
"4. **Question-Answering Interface** \n",
|
||||
" Query the email content and the mindmap using a simple Q&A interface powered by Gradio.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## ⚙️ Requirements\n",
|
||||
"\n",
|
||||
"> **Tip:** \n",
|
||||
"> I'm including an example of the synthetic emails in case you don't want to run that part.\n",
|
||||
"> Might need to install other libraries like pyvis, nbformat and faiss-cpu\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## ✨ Features\n",
|
||||
"\n",
|
||||
"- Synthetic generation of varied emails (work, personal, family, subscriptions)\n",
|
||||
"- Embedding generation with open-source models (hugging face sentence-transformer)\n",
|
||||
"- Interactive mindmap visualization (`networkx`, `pyvis`)\n",
|
||||
"- Simple chatbot interface (Gradio) and visualization of mindmap created\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a9aeb363",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"OpenAI API Key exists and begins sk-proj-\n",
|
||||
"Anthropic API Key exists and begins sk-ant-\n",
|
||||
"Google API Key exists and begins AI\n",
|
||||
"OLLAMA API Key exists and begins 36\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import gradio as gr\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
|
||||
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
|
||||
"ollama_api_key = os.getenv('OLLAMA_API_KEY')\n",
|
||||
"\n",
|
||||
"if openai_api_key:\n",
|
||||
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"OpenAI API Key not set\")\n",
|
||||
" \n",
|
||||
"if anthropic_api_key:\n",
|
||||
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Anthropic API Key not set (and this is optional)\")\n",
|
||||
"\n",
|
||||
"if google_api_key:\n",
|
||||
" print(f\"Google API Key exists and begins {google_api_key[:2]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Google API Key not set (and this is optional)\")\n",
|
||||
"\n",
|
||||
"if ollama_api_key:\n",
|
||||
" print(f\"OLLAMA API Key exists and begins {ollama_api_key[:2]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"OLLAMA API Key not set (and this is optional)\")\n",
|
||||
"\n",
|
||||
"# Connect to client libraries\n",
|
||||
"\n",
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"anthropic_url = \"https://api.anthropic.com/v1/\"\n",
|
||||
"gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
|
||||
"ollama_url = \"http://localhost:11434/v1\"\n",
|
||||
"\n",
|
||||
"anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)\n",
|
||||
"gemini = OpenAI(api_key=google_api_key, base_url=gemini_url)\n",
|
||||
"ollama = OpenAI(api_key=ollama_api_key, base_url=ollama_url)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b8ddce62",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Preparation of synthetic data (could have been week2 work)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2e250912",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#using ollama gpt oss 120b cloud i'm going to create synthetic emails using a persona.\n",
|
||||
"#they are going to be saved in a json file with different keys\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"from typing import List, Optional\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Email(BaseModel):\n",
|
||||
" sender: str = Field(description=\"Email address of the sender\")\n",
|
||||
" subject: str = Field(description=\"Email subject line\")\n",
|
||||
" body: str = Field(description=\"Email body content\")\n",
|
||||
" timestamp: str = Field(description=\"ISO 8601 timestamp when email was received\")\n",
|
||||
" category: str = Field(description=\"Category of the email\")\n",
|
||||
"\n",
|
||||
"class EmailBatch(BaseModel):\n",
|
||||
" emails: List[Email] = Field(description=\"List of generated emails\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "1f67fdb3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_persona(name: str, age: int, occupation: str, \n",
|
||||
" interests: List[str], family_status: str) -> str:\n",
|
||||
" persona = f\"\"\"\n",
|
||||
" You are generating synthetic emails for a realistic inbox simulation.\n",
|
||||
"\n",
|
||||
" **Person Profile:**\n",
|
||||
" - Name: {name}\n",
|
||||
" - Age: {age}\n",
|
||||
" - Occupation: {occupation}\n",
|
||||
" - Interests: {', '.join(interests)}\n",
|
||||
" - Family Status: {family_status}\n",
|
||||
"\n",
|
||||
" **Email Categories to Include:**\n",
|
||||
" 1. **Work Emails**: Project updates, meeting invitations, colleague communications, \n",
|
||||
" performance reviews, company announcements\n",
|
||||
" 2. **Purchases**: Order confirmations, shipping notifications, delivery updates, \n",
|
||||
" receipts from various retailers (Amazon, local shops, etc.)\n",
|
||||
" 3. **Subscriptions**: Newsletter updates, streaming services (Netflix, Spotify), \n",
|
||||
" software subscriptions (Adobe, Microsoft 365), magazine subscriptions\n",
|
||||
" 4. **Family**: Communications with parents, siblings, children, extended family members,\n",
|
||||
" family event planning, photo sharing\n",
|
||||
" 5. **Friends**: Social plans, birthday wishes, casual conversations, group hangouts,\n",
|
||||
" catching up messages\n",
|
||||
" 6. **Finance**: Bank statements, credit card bills, investment updates, tax documents,\n",
|
||||
" payment reminders\n",
|
||||
" 7. **Social Media**: Facebook notifications, LinkedIn updates, Instagram activity,\n",
|
||||
" Twitter mentions\n",
|
||||
" 8. **Personal**: Doctor appointments, gym memberships, utility bills, insurance updates\n",
|
||||
"\n",
|
||||
" **Instructions:**\n",
|
||||
" - Generate realistic email content that reflects the person's life over time\n",
|
||||
" - Include temporal patterns (more work emails on weekdays, more personal on weekends)\n",
|
||||
" - Create realistic sender names and email addresses\n",
|
||||
" - Vary email length and formality based on context\n",
|
||||
" - Include realistic subject lines\n",
|
||||
" - Make emails interconnected when appropriate (e.g., follow-up emails, conversation threads)\n",
|
||||
" - Include seasonal events (holidays, birthdays, annual renewals)\n",
|
||||
" \"\"\"\n",
|
||||
" return persona\n",
|
||||
"\n",
|
||||
"persona_description = create_persona(\n",
|
||||
" name=\"John Doe\",\n",
|
||||
" age=30,\n",
|
||||
" occupation=\"Software Engineer\",\n",
|
||||
" interests=[\"technology\", \"reading\", \"traveling\"],\n",
|
||||
" family_status=\"single\"\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "cec185e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"from datetime import datetime, timedelta\n",
|
||||
"import random\n",
|
||||
"from typing import List\n",
|
||||
"\n",
|
||||
"def generate_synthetic_emails(\n",
|
||||
" persona_description: str,\n",
|
||||
" num_emails: int,\n",
|
||||
" start_date: str,\n",
|
||||
" end_date: str,\n",
|
||||
" model: str = \"gpt-4o-2024-08-06\"\n",
|
||||
") -> List[Email]:\n",
|
||||
" \"\"\"\n",
|
||||
"    Note: requires an OpenAI model because this relies on the structured-output (parse) API.\n",
|
||||
" Generates synthetic emails using OpenAI's structured output feature.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" persona_description: Detailed persona description\n",
|
||||
" num_emails: Number of emails to generate per batch\n",
|
||||
" start_date: Start date for email timestamps\n",
|
||||
" end_date: End date for email timestamps\n",
|
||||
" model: OpenAI model to use (must support structured outputs)\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" List of Email objects\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" # Calculate date range for context\n",
|
||||
" date_range_context = f\"\"\"\n",
|
||||
" Generate emails with timestamps between {start_date} and {end_date}.\n",
|
||||
" Distribute emails naturally across this time period, with realistic patterns:\n",
|
||||
" - More emails during business hours on weekdays\n",
|
||||
" - Fewer emails late at night\n",
|
||||
" - Occasional weekend emails\n",
|
||||
" - Bursts of activity around events or busy periods\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" # System message combining persona and structure instructions\n",
|
||||
" system_message = f\"\"\"\n",
|
||||
" {persona_description}\n",
|
||||
"\n",
|
||||
" {date_range_context}\n",
|
||||
"\n",
|
||||
" Generate {num_emails} realistic emails that fit this person's life. \n",
|
||||
" Ensure variety in categories, senders, and content while maintaining realism.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" client = OpenAI()\n",
|
||||
"\n",
|
||||
" response = client.chat.completions.parse(\n",
|
||||
" model=model,\n",
|
||||
" messages=[\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": system_message\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": f\"Generate {num_emails} diverse, realistic emails for this person's inbox.\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" response_format=EmailBatch,\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.parsed.emails\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error generating emails: {e}\")\n",
|
||||
" return []\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def save_emails_to_json(emails: List[Email], filename: str):\n",
|
||||
" \"\"\"\n",
|
||||
" Saves emails to a JSON file.\n",
|
||||
" \"\"\"\n",
|
||||
" import json\n",
|
||||
" \n",
|
||||
" emails_dict = [email.model_dump() for email in emails]\n",
|
||||
" \n",
|
||||
" with open(filename, 'w', encoding='utf-8') as f:\n",
|
||||
" json.dump(emails_dict, f, indent=2, ensure_ascii=False)\n",
|
||||
" \n",
|
||||
"    print(f\"Saved {len(emails)} emails to {filename}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "be31f352",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"now\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"mails_2 = generate_synthetic_emails(\n",
|
||||
" persona_description = persona_description,\n",
|
||||
" num_emails = 100,\n",
|
||||
" start_date = '2024-06-01',\n",
|
||||
" end_date = '2025-01-01',\n",
|
||||
" model = \"gpt-4o\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"id": "24d844f2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Saved 101 emails to emails2.json\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"save_emails_to_json(mails_2, 'emails2.json')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b9c704e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create embeddings for the mails\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "777012f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports for langchain, plotly and Chroma\n",
|
||||
"\n",
|
||||
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.schema import Document\n",
|
||||
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
|
||||
"from langchain_chroma import Chroma\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"import numpy as np\n",
|
||||
"import plotly.graph_objects as go\n",
|
||||
"from langchain.memory import ConversationBufferMemory\n",
|
||||
"from langchain.chains import ConversationalRetrievalChain\n",
|
||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||
"import json\n",
|
||||
"from langchain.vectorstores import FAISS\n",
|
||||
"\n",
|
||||
"#MODEL = \"gpt-4o-mini\"\n",
|
||||
"db_name = \"vector_db\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "ce95d9c7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total number of chunks: 206\n",
|
||||
"Sample metadata fields: ['sender', 'timestamp', 'category']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Read in emails from the emails.json file and construct LangChain documents\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with open(\"emails.json\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" emails = json.load(f)\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for email in emails:\n",
|
||||
" # Extract metadata (all fields except 'content')\n",
|
||||
" metadata = {k: v for k, v in email.items() if k in ['sender','category','timestamp']}\n",
|
||||
" body = email.get(\"body\", \"\")\n",
|
||||
" documents.append(Document(page_content=body, metadata=metadata))\n",
|
||||
"\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
|
||||
"chunks = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"print(f\"Total number of chunks: {len(chunks)}\")\n",
|
||||
"print(f\"Sample metadata fields: {list(documents[0].metadata.keys()) if documents else []}\")\n",
|
||||
"\n",
|
||||
"embeddings_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
|
||||
"\n",
|
||||
"if os.path.exists(db_name):\n",
|
||||
" Chroma(persist_directory=db_name, embedding_function=embeddings_model).delete_collection()\n",
|
||||
"\n",
|
||||
"vectorstore = FAISS.from_documents(chunks, embedding=embeddings_model)\n",
|
||||
"\n",
|
||||
"all_embeddings = [vectorstore.index.reconstruct(i) for i in range(vectorstore.index.ntotal)]\n",
|
||||
"\n",
|
||||
"total_vectors = vectorstore.index.ntotal\n",
|
||||
"dimensions = vectorstore.index.d\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "78ca65bb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualizing mindmap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "a99dd2d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import networkx as nx\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
"import plotly.graph_objects as go\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.cluster import KMeans\n",
|
||||
"from sklearn.manifold import TSNE # Or use UMAP\n",
|
||||
"from pyvis.network import Network\n",
|
||||
"\n",
|
||||
"# Here, emails is a list of email dicts with 'subject' and 'body' keys\n",
|
||||
"\n",
|
||||
"# Build similarity graph\n",
|
||||
"def build_mindmap_html(emails, all_embeddings, threshold=0.6):\n",
|
||||
" similarity = cosine_similarity(all_embeddings)\n",
|
||||
"\n",
|
||||
" G = nx.Graph()\n",
|
||||
" for i, email in enumerate(emails):\n",
|
||||
" G.add_node(i, label=email['subject'][:80], title=email['body'][:50]) # Custom hover text\n",
|
||||
"\n",
|
||||
" for i in range(len(emails)):\n",
|
||||
" for j in range(i+1, len(emails)):\n",
|
||||
" if similarity[i][j] > threshold:\n",
|
||||
" G.add_edge(i, j, weight=float(similarity[i][j]))\n",
|
||||
"\n",
|
||||
" # Convert to pyvis network\n",
|
||||
" nt = Network(notebook=True, height='700px', width='100%', bgcolor='#222222', font_color='white')\n",
|
||||
" nt.from_nx(G)\n",
|
||||
" html = nt.generate_html().replace(\"'\", \"\\\"\")\n",
|
||||
" return html\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "53a2fbaf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Putting it all together in a Gradio app\n",
|
||||
"It needs an interface for asking questions, plus the visualization for viewing the mindmap.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "161144ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create a new Chat with OpenAI\n",
|
||||
"MODEL=\"gpt-4o-mini\"\n",
|
||||
"llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
|
||||
"\n",
|
||||
"# set up the conversation memory for the chat\n",
|
||||
"memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
|
||||
"\n",
|
||||
"# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
|
||||
"retriever = vectorstore.as_retriever()\n",
|
||||
"from langchain_core.callbacks import StdOutCallbackHandler\n",
|
||||
"\n",
|
||||
"# putting it together: set up the conversation chain with the gpt-4o-mini LLM, the vector store and memory\n",
|
||||
"conversation_chain_debug = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])\n",
|
||||
"conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
|
||||
"\n",
|
||||
"# Wrapping that in a function\n",
|
||||
"\n",
|
||||
"def chat(question, history):\n",
|
||||
" result = conversation_chain.invoke({\"question\": question})\n",
|
||||
" return result[\"answer\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 60,
|
||||
"id": "16a4d8d1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n",
|
||||
"\n",
|
||||
"The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
|
||||
"* Running on local URL: http://127.0.0.1:7878\n",
|
||||
"* To create a public link, set `share=True` in `launch()`.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div><iframe src=\"http://127.0.0.1:7878/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": []
|
||||
},
|
||||
"execution_count": 60,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
|
||||
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import gradio as gr\n",
|
||||
"\n",
|
||||
"def show_mindmap():\n",
|
||||
" # Call build_mindmap_html to generate the HTML\n",
|
||||
" html = build_mindmap_html(emails, all_embeddings)\n",
|
||||
" return f\"\"\"<iframe style=\"width: 100%; height: 600px;margin:0 auto\" name=\"result\" allow=\"midi; geolocation; microphone; camera; \n",
|
||||
" display-capture; encrypted-media;\" sandbox=\"allow-modals allow-forms \n",
|
||||
" allow-scripts allow-same-origin allow-popups \n",
|
||||
" allow-top-navigation-by-user-activation allow-downloads\" allowfullscreen=\"\" \n",
|
||||
" allowpaymentrequest=\"\" frameborder=\"0\" srcdoc='{html}'></iframe>\"\"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with gr.Blocks(title=\"Mindmap & Email Chatbot\") as demo:\n",
|
||||
" gr.Markdown(\"# 📧 Mindmap Visualization & Email QA Chatbot\")\n",
|
||||
" with gr.Row():\n",
|
||||
" chatbot = gr.ChatInterface(fn=chat, title=\"Ask about your emails\",\n",
|
||||
" examples=[\n",
|
||||
" \"What is my most important message?\",\n",
|
||||
" \"Who have I been communicating with?\",\n",
|
||||
" \"Summarize recent emails\"\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
" mindmap_html = gr.HTML(\n",
|
||||
" show_mindmap,\n",
|
||||
" label=\"🧠 Mindmap of Your Emails\",\n",
|
||||
" )\n",
|
||||
" # Reduce height: update show_mindmap (elsewhere) to ~400px, or do inline replace for the demo here:\n",
|
||||
" # mindmap_html = gr.HTML(lambda: show_mindmap().replace(\"height: 600px\", \"height: 400px\"))\n",
|
||||
" \n",
|
||||
"demo.launch(inbrowser=True)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "221a9d98",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user