Implement Gmail RAG assistant
week5/community-contributions/emmy/gmail_rag/.gitignore (vendored, new file, 22 lines)
@@ -0,0 +1,22 @@
# Secrets - NEVER commit these!
.env
credentials.json
token.json

# Vector Database (contains your emails)
chroma/

# Python
__pycache__/
*.py[cod]
*.egg-info/
.venv/
venv/

# IDE
.vscode/
.idea/
.DS_Store

# Logs
*.log
week5/community-contributions/emmy/gmail_rag/README.md (new file, 88 lines)
@@ -0,0 +1,88 @@
# Gmail RAG Assistant 📧

Search and ask questions about your Gmail emails using AI.

## Setup

### 1. Install Dependencies

```bash
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```

### 2. Google Cloud Setup

1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a project and enable the **Gmail API**
3. Create **OAuth 2.0 Desktop Client** credentials
4. Download the file and save it as `~/.config/gcp/langchain/credentials.json` (you can sanity-check it with the snippet below)
5. Add your email as a test user on the OAuth consent screen
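A quick, optional way to confirm you downloaded the right kind of credential: Desktop-app OAuth clients are JSON files whose top-level key is `"installed"`. The snippet below is just an illustrative check (it is not part of this repo) and assumes the default path used by this project:

```python
# Hypothetical sanity check for the downloaded OAuth client file.
import json
import os

path = os.path.expanduser("~/.config/gcp/langchain/credentials.json")
with open(path) as f:
    data = json.load(f)

if "installed" in data:
    print("Desktop (installed app) OAuth client detected - good.")
else:
    print("Unexpected format - re-download the OAuth 2.0 Desktop Client credentials.")
```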
### 3. Configure Environment

Create a `.env` file:

```env
GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json
GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json
OPENAI_API_KEY=your_openai_api_key_here
```

Get an OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys).
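Both scripts call `load_dotenv()` and read these values with `os.getenv`, expanding `~` via `os.path.expanduser`; simplified from `ingest_gmail_drive.py`:

```python
# How the scripts pick up the .env values (simplified sketch).
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory

creds_path = os.path.expanduser(
    os.getenv("GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json")
)
token_path = os.path.expanduser(
    os.getenv("GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json")
)
# OPENAI_API_KEY is read from the environment by the OpenAI/LangChain clients.
```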
## Usage

### Index your emails:
```bash
python ingest_gmail_drive.py
```
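On the first run a browser window opens for the Google OAuth consent flow; after that, the console output looks roughly like this (the counts are illustrative and depend on your mailbox and settings):

```
Starting Gmail RAG ingestion...

Fetching 50 messages...
  Processed 10/50 messages...
✓ Gmail: loaded 50 documents
✓ Created 112 chunks
✓ Creating embeddings with text-embedding-3-small...
✓ Successfully persisted ChromaDB at: chroma

Ingestion complete! You can now query your Gmail data.
```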
### Launch UI:
```bash
python app.py
```

Open `http://localhost:7860` in your browser.

## File Structure

```
gmail_rag/
├── ingest_gmail_drive.py   # Fetch and index emails
├── app.py                  # Gradio UI
├── requirements.txt        # Dependencies
├── .env                    # API keys (create this)
└── chroma/                 # Vector database (auto-created)
```

## Configuration

**Change the number of emails** in `ingest_gmail_drive.py`:
```python
gmail_docs = load_gmail(n=100)  # Adjust this number
```
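**Limit which emails are indexed** in `ingest_gmail_drive.py`: `load_gmail()` also accepts an optional Gmail search query that is passed straight to the Gmail API, so standard Gmail search operators work (the query string below is just an example):
```python
gmail_docs = load_gmail(n=100, query="newer_than:90d")  # only mail from the last 90 days
```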
**Change the AI model** in `app.py`:
```python
LLM_MODEL = "gpt-4o-mini"  # or "gpt-4", "gpt-3.5-turbo"
```

## Troubleshooting

- **"Access Blocked"**: add your email as a test user on the Google Cloud OAuth consent screen
- **"ChromaDB not found"**: run `ingest_gmail_drive.py` first
- **Token expired**: delete `~/.config/gcp/langchain/token.json` and re-run (see the command below)
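Resetting authentication just means deleting the cached token; the next run of either script re-opens the browser consent flow:

```bash
rm ~/.config/gcp/langchain/token.json
python ingest_gmail_drive.py   # re-authenticates in the browser
```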
## Cost

- Embeddings: ~$0.01-0.05 per 100 emails
- Queries: ~$0.01 per 100 questions (using gpt-4o-mini)
- Gmail API: Free

## Security

Never commit: `.env`, `credentials.json`, `token.json`, `chroma/`

The `.gitignore` file protects these automatically; you can verify with the command below.
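To double-check that the sensitive paths are covered, run this from inside `gmail_rag/` (it prints the matching `.gitignore` rule for each ignored path):

```bash
git check-ignore -v .env credentials.json token.json chroma/
```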
week5/community-contributions/emmy/gmail_rag/app.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import os
import gradio as gr
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

load_dotenv()

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"

# Initialize components
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
vectorstore = None
qa_chain = None


def initialize_qa_chain():
    """Initialize the QA chain with the vector store."""
    global vectorstore, qa_chain

    if not os.path.exists(CHROMA_DIR):
        return "❌ ChromaDB not found. Please run ingest_gmail_drive.py first to index your emails."

    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=embeddings
        )

        # Create custom prompt
        prompt_template = """Use the following pieces of context from Gmail emails to answer the question.
If you don't know the answer based on the context, just say you don't have that information in the emails.

Context:
{context}

Question: {question}

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        llm = ChatOpenAI(model=LLM_MODEL, temperature=0)

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        return "✓ Ready to answer questions about your emails!"
    except Exception as e:
        return f"❌ Error initializing: {str(e)}"


def query_emails(question, num_results=5):
    """Query the email database."""
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""

    if not question.strip():
        return "Please enter a question.", ""

    try:
        # Get answer
        result = qa_chain.invoke({"query": question})
        answer = result['result']

        # Format sources
        sources_text = "\n\n---\n\n**📧 Source Emails:**\n\n"
        for i, doc in enumerate(result['source_documents'][:num_results], 1):
            sources_text += f"**Email {i}:**\n"
            sources_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            sources_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            sources_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            sources_text += f"- **Preview:** {doc.page_content[:200]}...\n\n"

        return answer, sources_text
    except Exception as e:
        return f"❌ Error: {str(e)}", ""


def search_emails(query_text, num_results=5):
    """Direct vector similarity search."""
    if vectorstore is None:
        return "Please click 'Initialize System' first!"

    if not query_text.strip():
        return "Please enter a search query."

    try:
        docs = vectorstore.similarity_search(query_text, k=num_results)

        results_text = f"**Found {len(docs)} relevant emails:**\n\n"
        for i, doc in enumerate(docs, 1):
            results_text += f"**Email {i}:**\n"
            results_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            results_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            results_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            results_text += f"- **Content Preview:**\n{doc.page_content[:300]}...\n\n"
            results_text += "---\n\n"

        return results_text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Create Gradio Interface
with gr.Blocks(title="Gmail RAG Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📧 Gmail RAG Assistant
        Ask questions about your emails or search for specific content.
        """
    )

    with gr.Row():
        init_btn = gr.Button("🚀 Initialize System", variant="primary")
        status_text = gr.Textbox(label="Status", interactive=False)

    init_btn.click(fn=initialize_qa_chain, outputs=status_text)

    gr.Markdown("---")

    with gr.Tab("💬 Ask Questions"):
        gr.Markdown("Ask natural language questions about your emails.")

        with gr.Row():
            with gr.Column(scale=4):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the latest message from Andela?",
                    lines=2
                )
            with gr.Column(scale=1):
                qa_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Sources to Show"
                )

        qa_btn = gr.Button("Ask Question", variant="primary")

        answer_output = gr.Markdown(label="Answer")
        sources_output = gr.Markdown(label="Sources")

        qa_btn.click(
            fn=query_emails,
            inputs=[question_input, qa_num_results],
            outputs=[answer_output, sources_output]
        )

        # Example questions
        gr.Examples(
            examples=[
                ["What is the latest message from Andela?"],
                ["Summarize emails about project updates"],
                ["What meetings do I have scheduled?"],
                ["Find emails about invoices or payments"],
            ],
            inputs=question_input
        )

    with gr.Tab("🔍 Search Emails"):
        gr.Markdown("Search for emails using semantic similarity.")

        with gr.Row():
            with gr.Column(scale=4):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="project deadline",
                    lines=2
                )
            with gr.Column(scale=1):
                search_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Results"
                )

        search_btn = gr.Button("Search", variant="primary")
        search_output = gr.Markdown(label="Search Results")

        search_btn.click(
            fn=search_emails,
            inputs=[search_input, search_num_results],
            outputs=search_output
        )

        gr.Examples(
            examples=[
                ["Andela"],
                ["meeting schedule"],
                ["invoice payment"],
                ["project status update"],
            ],
            inputs=search_input
        )

    gr.Markdown(
        """
        ---
        **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails.
        """
    )


if __name__ == "__main__":
    demo.launch()
week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from email.utils import parsedate_to_datetime

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
GOOGLE_CREDENTIALS_PATH = os.getenv(
    "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json"
)
GOOGLE_TOKEN_PATH = os.getenv(
    "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json"
)

# ---- LangChain imports ----
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"


def get_gmail_service():
    """Authenticate and return Gmail API service."""
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)

        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def get_header_value(headers, name):
    """Extract header value from Gmail headers list."""
    for header in headers:
        if header['name'].lower() == name.lower():
            return header['value']
    return ''


def decode_body(payload):
    """Decode email body from Gmail payload."""
    body = ""

    if 'body' in payload and 'data' in payload['body']:
        body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore')

    # Handle multipart messages
    if 'parts' in payload:
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                if 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
            elif 'parts' in part:
                # Recursively handle nested parts
                body += decode_body(part)

    return body


def load_gmail(n=50, query=None):
    """Load Gmail messages directly using Gmail API."""
    service = get_gmail_service()

    # Fetch message list
    results = service.users().messages().list(
        userId='me',
        maxResults=n,
        q=query if query else ''
    ).execute()

    messages = results.get('messages', [])

    if not messages:
        print("No messages found.")
        return []

    print(f"Fetching {len(messages)} messages...")

    docs = []
    for i, msg_ref in enumerate(messages, 1):
        # Fetch full message
        msg = service.users().messages().get(
            userId='me',
            id=msg_ref['id'],
            format='full'
        ).execute()

        # Extract headers
        headers = msg['payload']['headers']
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')

        # Extract body
        body = decode_body(msg['payload'])

        # Create metadata
        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Format content
        content = f"Subject: {subject}\n"
        content += f"From: {sender}\n"
        content += f"To: {to}\n"
        content += f"Date: {date}\n\n"
        content += body

        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f"  Processed {i}/{len(messages)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs


def main():
    print("Starting Gmail RAG ingestion...\n")

    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)

    if not gmail_docs:
        print("No documents to process. Exiting.")
        return

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")

    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    vs.persist()  # persistence is automatic with chromadb >= 0.4; kept for older setups

    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")


if __name__ == "__main__":
    main()
week5/community-contributions/emmy/gmail_rag/requirements.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
python-dotenv
langchain
langchain-core
langchain-community
langchain-openai
langchain-chroma
langchain-google-community
chromadb
openai
gradio
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client