Implement Gmail RAG assistant
22
week5/community-contributions/emmy/gmail_rag/.gitignore
vendored
Normal file
@@ -0,0 +1,22 @@
# Secrets - NEVER commit these!
.env
credentials.json
token.json

# Vector Database (contains your emails)
chroma/

# Python
__pycache__/
*.py[cod]
*.egg-info/
.venv/
venv/

# IDE
.vscode/
.idea/
.DS_Store

# Logs
*.log
88
week5/community-contributions/emmy/gmail_rag/README.md
Normal file
@@ -0,0 +1,88 @@
# Gmail RAG Assistant 📧

Search and ask questions about your Gmail emails using AI.

## Setup

### 1. Install Dependencies

```bash
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```

### 2. Google Cloud Setup

1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a project and enable the **Gmail API**
3. Create **OAuth 2.0 Desktop Client** credentials
4. Download the credentials file and save it as `~/.config/gcp/langchain/credentials.json`
5. Add your email address as a test user on the OAuth consent screen
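
Optionally, you can complete the one-time browser consent step before indexing. The sketch below uses the same calls and default paths as `get_gmail_service()` in `ingest_gmail_drive.py`; adjust the paths if yours differ:

```python
import os
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]  # read-only, same scope as the ingest script
creds_path = os.path.expanduser("~/.config/gcp/langchain/credentials.json")

# Opens a browser window for consent and returns authorized credentials
flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
creds = flow.run_local_server(port=0)

# Cache the token so later runs skip the browser step
with open(os.path.expanduser("~/.config/gcp/langchain/token.json"), "w") as token:
    token.write(creds.to_json())
print("Token saved.")
```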

### 3. Configure Environment

Create a `.env` file:

```env
GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json
GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json
OPENAI_API_KEY=your_openai_api_key_here
```

Get an OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys).
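
Both scripts call `load_dotenv()` at start-up; `ingest_gmail_drive.py` also falls back to the default paths above when the variables are unset, and `~` is expanded with `os.path.expanduser`. For reference, a short sketch of how it resolves the paths:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory

creds_path = os.path.expanduser(
    os.getenv("GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json")
)
token_path = os.path.expanduser(
    os.getenv("GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json")
)
print(creds_path, token_path)
```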

## Usage

### Index your emails:

```bash
python ingest_gmail_drive.py
```

### Launch UI:

```bash
python app.py
```

Open `http://localhost:7860` in your browser.
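
You can also query the persisted index from Python without the Gradio UI. A minimal sketch using the same settings as `app.py` (the `chroma/` directory and `text-embedding-3-small` embeddings); it assumes `OPENAI_API_KEY` is available via `.env`:

```python
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

load_dotenv()  # provides OPENAI_API_KEY for the embeddings client

# Reopen the vector store created by ingest_gmail_drive.py
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(persist_directory="chroma", embedding_function=embeddings)

# Print the 3 most similar email chunks for a query
for doc in vectorstore.similarity_search("invoice payment", k=3):
    print(doc.metadata.get("subject", "N/A"), "-", doc.metadata.get("from", "N/A"))
```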

## File Structure

```
gmail_rag/
├── ingest_gmail_drive.py   # Fetch and index emails
├── app.py                  # Gradio UI
├── requirements.txt        # Dependencies
├── .env                    # API keys (create this)
└── chroma/                 # Vector database (auto-created)
```

## Configuration

**Change number of emails** in `ingest_gmail_drive.py`:

```python
gmail_docs = load_gmail(n=100)  # Adjust this number
```

**Change AI model** in `app.py`:

```python
LLM_MODEL = "gpt-4o-mini"  # or "gpt-4", "gpt-3.5-turbo"
```
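
**Limit which emails are indexed** by passing a Gmail search query: `load_gmail()` accepts an optional `query` argument that is forwarded to the Gmail API's `q` parameter, so standard Gmail search operators should work. For example, in `main()`:

```python
gmail_docs = load_gmail(n=100, query="newer_than:90d")  # only index the last 90 days
```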

## Troubleshooting

- **"Access Blocked"**: Add your email as a test user on the Google Cloud OAuth consent screen
- **"ChromaDB not found"**: Run `ingest_gmail_drive.py` first
- **Token expired**: Delete `~/.config/gcp/langchain/token.json` and re-run

## Cost

- Embeddings: ~$0.01-0.05 per 100 emails
- Queries: ~$0.01 per 100 questions (using gpt-4o-mini)
- Gmail API: Free

## Security

Never commit `.env`, `credentials.json`, `token.json`, or `chroma/`.

The `.gitignore` file excludes these automatically.
223
week5/community-contributions/emmy/gmail_rag/app.py
Normal file
@@ -0,0 +1,223 @@
import os
import gradio as gr
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

load_dotenv()

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"

# Initialize components
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
vectorstore = None
qa_chain = None


def initialize_qa_chain():
    """Initialize the QA chain with the vector store."""
    global vectorstore, qa_chain

    if not os.path.exists(CHROMA_DIR):
        return "❌ ChromaDB not found. Please run ingest_gmail_drive.py first to index your emails."

    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=embeddings
        )

        # Create custom prompt
        prompt_template = """Use the following pieces of context from Gmail emails to answer the question.
If you don't know the answer based on the context, just say you don't have that information in the emails.

Context:
{context}

Question: {question}

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        llm = ChatOpenAI(model=LLM_MODEL, temperature=0)
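
        # "stuff" concatenates the retrieved chunks directly into the prompt;
        # the retriever below supplies the 5 most similar chunks per question.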
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        return "✓ Ready to answer questions about your emails!"
    except Exception as e:
        return f"❌ Error initializing: {str(e)}"


def query_emails(question, num_results=5):
    """Query the email database."""
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""

    if not question.strip():
        return "Please enter a question.", ""

    try:
        # Get answer
        result = qa_chain({"query": question})
        answer = result['result']

        # Format sources
        sources_text = "\n\n---\n\n**📧 Source Emails:**\n\n"
        for i, doc in enumerate(result['source_documents'][:num_results], 1):
            sources_text += f"**Email {i}:**\n"
            sources_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            sources_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            sources_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            sources_text += f"- **Preview:** {doc.page_content[:200]}...\n\n"

        return answer, sources_text
    except Exception as e:
        return f"❌ Error: {str(e)}", ""


def search_emails(query_text, num_results=5):
    """Direct vector similarity search."""
    if vectorstore is None:
        return "Please click 'Initialize System' first!"

    if not query_text.strip():
        return "Please enter a search query."

    try:
        docs = vectorstore.similarity_search(query_text, k=num_results)

        results_text = f"**Found {len(docs)} relevant emails:**\n\n"
        for i, doc in enumerate(docs, 1):
            results_text += f"**Email {i}:**\n"
            results_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            results_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            results_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            results_text += f"- **Content Preview:**\n{doc.page_content[:300]}...\n\n"
            results_text += "---\n\n"

        return results_text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Create Gradio Interface
with gr.Blocks(title="Gmail RAG Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📧 Gmail RAG Assistant
        Ask questions about your emails or search for specific content.
        """
    )

    with gr.Row():
        init_btn = gr.Button("🚀 Initialize System", variant="primary")
        status_text = gr.Textbox(label="Status", interactive=False)

    init_btn.click(fn=initialize_qa_chain, outputs=status_text)

    gr.Markdown("---")

    with gr.Tab("💬 Ask Questions"):
        gr.Markdown("Ask natural language questions about your emails.")

        with gr.Row():
            with gr.Column(scale=4):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the latest message from Andela?",
                    lines=2
                )
            with gr.Column(scale=1):
                qa_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Sources to Show"
                )

        qa_btn = gr.Button("Ask Question", variant="primary")

        answer_output = gr.Markdown(label="Answer")
        sources_output = gr.Markdown(label="Sources")

        qa_btn.click(
            fn=query_emails,
            inputs=[question_input, qa_num_results],
            outputs=[answer_output, sources_output]
        )

        # Example questions
        gr.Examples(
            examples=[
                ["What is the latest message from Andela?"],
                ["Summarize emails about project updates"],
                ["What meetings do I have scheduled?"],
                ["Find emails about invoices or payments"],
            ],
            inputs=question_input
        )

    with gr.Tab("🔍 Search Emails"):
        gr.Markdown("Search for emails using semantic similarity.")

        with gr.Row():
            with gr.Column(scale=4):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="project deadline",
                    lines=2
                )
            with gr.Column(scale=1):
                search_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Results"
                )

        search_btn = gr.Button("Search", variant="primary")
        search_output = gr.Markdown(label="Search Results")

        search_btn.click(
            fn=search_emails,
            inputs=[search_input, search_num_results],
            outputs=search_output
        )

        gr.Examples(
            examples=[
                ["Andela"],
                ["meeting schedule"],
                ["invoice payment"],
                ["project status update"],
            ],
            inputs=search_input
        )

    gr.Markdown(
        """
        ---
        **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails.
        """
    )


if __name__ == "__main__":
    demo.launch()
189
week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py
Normal file
@@ -0,0 +1,189 @@
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from email.utils import parsedate_to_datetime

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
GOOGLE_CREDENTIALS_PATH = os.getenv(
    "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json"
)
GOOGLE_TOKEN_PATH = os.getenv(
    "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json"
)

# ---- LangChain imports ----
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"


def get_gmail_service():
    """Authenticate and return Gmail API service."""
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
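
    # Refresh an expired token when a refresh token is available; otherwise
    # run the one-time browser consent flow and cache the new token on disk.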
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)

        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def get_header_value(headers, name):
    """Extract header value from Gmail headers list."""
    for header in headers:
        if header['name'].lower() == name.lower():
            return header['value']
    return ''


def decode_body(payload):
    """Decode email body from Gmail payload."""
    body = ""

    if 'body' in payload and 'data' in payload['body']:
        body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore')

    # Handle multipart messages
    if 'parts' in payload:
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                if 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
            elif 'parts' in part:
                # Recursively handle nested parts
                body += decode_body(part)

    return body


def load_gmail(n=50, query=None):
    """Load Gmail messages directly using Gmail API."""
    service = get_gmail_service()

    # Fetch message list
    results = service.users().messages().list(
        userId='me',
        maxResults=n,
        q=query if query else ''
    ).execute()

    messages = results.get('messages', [])

    if not messages:
        print("No messages found.")
        return []

    print(f"Fetching {len(messages)} messages...")

    docs = []
    for i, msg_ref in enumerate(messages, 1):
        # Fetch full message
        msg = service.users().messages().get(
            userId='me',
            id=msg_ref['id'],
            format='full'
        ).execute()

        # Extract headers
        headers = msg['payload']['headers']
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')

        # Extract body
        body = decode_body(msg['payload'])

        # Create metadata
        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Format content
        content = f"Subject: {subject}\n"
        content += f"From: {sender}\n"
        content += f"To: {to}\n"
        content += f"Date: {date}\n\n"
        content += body

        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f"  Processed {i}/{len(messages)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs


def main():
    print("Starting Gmail RAG ingestion...\n")

    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)

    if not gmail_docs:
        print("No documents to process. Exiting.")
        return

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")

    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    vs.persist()

    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")


if __name__ == "__main__":
    main()
14
week5/community-contributions/emmy/gmail_rag/requirements.txt
Normal file
@@ -0,0 +1,14 @@
python-dotenv
langchain
langchain-core
langchain-community
langchain-openai
langchain-chroma
langchain-google-community
chromadb
openai
gradio
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client