Merge pull request #907 from Mogbeyi/emmy/week5-solution

Implement GMAIL RAG assistant
This commit is contained in:
Ed Donner
2025-10-30 22:03:51 -04:00
committed by GitHub
5 changed files with 536 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# Secrets - NEVER commit these!
.env
credentials.json
token.json
# Vector Database (contains your emails)
chroma/
# Python
__pycache__/
*.py[cod]
*.egg-info/
.venv/
venv/
# IDE
.vscode/
.idea/
.DS_Store
# Logs
*.log

View File

@@ -0,0 +1,88 @@
# Gmail RAG Assistant 📧
Search and ask questions about your Gmail emails using AI.
## Setup
### 1. Install Dependencies
```bash
python -m venv .venv
source .venv/bin/activate # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```
### 2. Google Cloud Setup
1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a project and enable **Gmail API**
3. Create **OAuth 2.0 Desktop Client** credentials
4. Download and save as `~/.config/gcp/langchain/credentials.json`
5. Add your email as a test user in OAuth consent screen
### 3. Configure Environment
Create `.env` file:
```env
GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json
GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json
OPENAI_API_KEY=your_openai_api_key_here
```
Get OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys)
## Usage
### Index your emails:
```bash
python ingest_gmail_drive.py
```
### Launch UI:
```bash
python app.py
```
Open `http://localhost:7860` in your browser.
## File Structure
```
gmail_rag/
├── ingest_gmail_drive.py # Fetch and index emails
├── app.py # Gradio UI
├── requirements.txt # Dependencies
├── .env # API keys (create this)
└── chroma/ # Vector database (auto-created)
```
## Configuration
**Change number of emails** in `ingest_gmail_drive.py`:
```python
gmail_docs = load_gmail(n=100) # Adjust this number
```
**Change AI model** in `app.py`:
```python
LLM_MODEL = "gpt-4o-mini" # or "gpt-4", "gpt-3.5-turbo"
```
## Troubleshooting
- **"Access Blocked"**: Add your email as test user in Google Cloud
- **"ChromaDB not found"**: Run `ingest_gmail_drive.py` first
- **Token expired**: Delete `~/.config/gcp/langchain/token.json` and re-run
## Cost
- Embeddings: ~$0.01-0.05 per 100 emails
- Queries: ~$0.01 per 100 questions (using gpt-4o-mini)
- Gmail API: Free
## Security
Never commit: `.env`, `credentials.json`, `token.json`, `chroma/`
The `.gitignore` file protects these automatically.

View File

@@ -0,0 +1,223 @@
import os
import gradio as gr
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
load_dotenv()  # pull OPENAI_API_KEY (and any overrides) from a local .env

# ---- Settings ----
CHROMA_DIR = "chroma"  # must match the persist directory used by ingest_gmail_drive.py
EMBED_MODEL = "text-embedding-3-small"  # must match the embedding model used at ingestion time
LLM_MODEL = "gpt-4o-mini"

# Initialize components
# Module-level state shared by the Gradio callbacks below; vectorstore and
# qa_chain remain None until initialize_qa_chain() runs successfully.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
vectorstore = None
qa_chain = None
def initialize_qa_chain():
    """Load the persisted Chroma index and build the RetrievalQA chain.

    Mutates the module-level ``vectorstore`` and ``qa_chain`` globals and
    returns a human-readable status string for the Gradio status box.
    """
    global vectorstore, qa_chain

    # The vector store is created by ingest_gmail_drive.py; bail out early
    # with instructions if it has never been built.
    if not os.path.exists(CHROMA_DIR):
        return "❌ ChromaDB not found. Please run ingest_gmail_drive.py first to index your emails."

    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=embeddings,
        )

        # Prompt that grounds the model in the retrieved email snippets only.
        template = """Use the following pieces of context from Gmail emails to answer the question.
If you don't know the answer based on the context, just say you don't have that information in the emails.
Context:
{context}
Question: {question}
Answer:"""
        grounded_prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        )

        chat_model = ChatOpenAI(model=LLM_MODEL, temperature=0)
        qa_chain = RetrievalQA.from_chain_type(
            llm=chat_model,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={"prompt": grounded_prompt},
            return_source_documents=True,
        )
        return "✓ Ready to answer questions about your emails!"
    except Exception as e:
        return f"❌ Error initializing: {str(e)}"
def query_emails(question, num_results=5):
    """Answer *question* via the RAG chain.

    Returns a (answer, sources_markdown) pair; the second element lists up
    to *num_results* of the retrieved source emails.
    """
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""
    if not question.strip():
        return "Please enter a question.", ""
    try:
        result = qa_chain({"query": question})

        # Render the top retrieved emails as markdown.
        pieces = ["\n\n---\n\n**📧 Source Emails:**\n\n"]
        for idx, doc in enumerate(result['source_documents'][:num_results], 1):
            meta = doc.metadata
            pieces.append(f"**Email {idx}:**\n")
            pieces.append(f"- **Subject:** {meta.get('subject', 'N/A')}\n")
            pieces.append(f"- **From:** {meta.get('from', 'N/A')}\n")
            pieces.append(f"- **Date:** {meta.get('date', 'N/A')}\n")
            pieces.append(f"- **Preview:** {doc.page_content[:200]}...\n\n")
        return result['result'], "".join(pieces)
    except Exception as e:
        return f"❌ Error: {str(e)}", ""
def search_emails(query_text, num_results=5):
    """Run a raw vector similarity search and render the hits as markdown."""
    if vectorstore is None:
        return "Please click 'Initialize System' first!"
    if not query_text.strip():
        return "Please enter a search query."
    try:
        hits = vectorstore.similarity_search(query_text, k=num_results)

        lines = [f"**Found {len(hits)} relevant emails:**\n\n"]
        for idx, doc in enumerate(hits, 1):
            meta = doc.metadata
            lines.append(f"**Email {idx}:**\n")
            lines.append(f"- **Subject:** {meta.get('subject', 'N/A')}\n")
            lines.append(f"- **From:** {meta.get('from', 'N/A')}\n")
            lines.append(f"- **Date:** {meta.get('date', 'N/A')}\n")
            lines.append(f"- **Content Preview:**\n{doc.page_content[:300]}...\n\n")
            lines.append("---\n\n")
        return "".join(lines)
    except Exception as e:
        return f"❌ Error: {str(e)}"
# Create Gradio Interface
# Layout: an "Initialize System" button that builds the chain, then two tabs —
# "Ask Questions" (RAG answers via query_emails) and "Search Emails" (raw
# similarity search via search_emails).
with gr.Blocks(title="Gmail RAG Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📧 Gmail RAG Assistant
        Ask questions about your emails or search for specific content.
        """
    )
    # Initialization row: populates the module-level qa_chain / vectorstore.
    with gr.Row():
        init_btn = gr.Button("🚀 Initialize System", variant="primary")
        status_text = gr.Textbox(label="Status", interactive=False)
    init_btn.click(fn=initialize_qa_chain, outputs=status_text)
    gr.Markdown("---")
    with gr.Tab("💬 Ask Questions"):
        gr.Markdown("Ask natural language questions about your emails.")
        with gr.Row():
            with gr.Column(scale=4):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the latest message from Andela?",
                    lines=2
                )
            with gr.Column(scale=1):
                # NOTE(review): this slider only limits how many sources are
                # *displayed*; retrieval itself is fixed at k=5 in
                # initialize_qa_chain, so values above 5 show at most 5.
                qa_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Sources to Show"
                )
        qa_btn = gr.Button("Ask Question", variant="primary")
        answer_output = gr.Markdown(label="Answer")
        sources_output = gr.Markdown(label="Sources")
        qa_btn.click(
            fn=query_emails,
            inputs=[question_input, qa_num_results],
            outputs=[answer_output, sources_output]
        )
        # Example questions
        gr.Examples(
            examples=[
                ["What is the latest message from Andela?"],
                ["Summarize emails about project updates"],
                ["What meetings do I have scheduled?"],
                ["Find emails about invoices or payments"],
            ],
            inputs=question_input
        )
    with gr.Tab("🔍 Search Emails"):
        gr.Markdown("Search for emails using semantic similarity.")
        with gr.Row():
            with gr.Column(scale=4):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="project deadline",
                    lines=2
                )
            with gr.Column(scale=1):
                search_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Results"
                )
        search_btn = gr.Button("Search", variant="primary")
        search_output = gr.Markdown(label="Search Results")
        search_btn.click(
            fn=search_emails,
            inputs=[search_input, search_num_results],
            outputs=search_output
        )
        gr.Examples(
            examples=[
                ["Andela"],
                ["meeting schedule"],
                ["invoice payment"],
                ["project status update"],
            ],
            inputs=search_input
        )
    gr.Markdown(
        """
        ---
        **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails.
        """
    )
# Launch the UI locally (Gradio's default is http://localhost:7860).
if __name__ == "__main__":
    demo.launch()

View File

@@ -0,0 +1,189 @@
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from email.utils import parsedate_to_datetime
# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
# Paths may start with "~"; they are expanded via os.path.expanduser at the
# point of use in get_gmail_service().
GOOGLE_CREDENTIALS_PATH = os.getenv(
    "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json"
)
GOOGLE_TOKEN_PATH = os.getenv(
    "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json"
)

# ---- LangChain imports ----
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# ---- Settings ----
CHROMA_DIR = "chroma"  # must match CHROMA_DIR in app.py
EMBED_MODEL = "text-embedding-3-small"  # must match EMBED_MODEL in app.py
def get_gmail_service():
    """Authenticate with OAuth 2.0 and return a Gmail API service client.

    Reads the OAuth client secrets from GOOGLE_CREDENTIALS_PATH and caches the
    user token at GOOGLE_TOKEN_PATH. If a cached token can no longer be
    refreshed (revoked or expired refresh token), this falls back to a fresh
    browser-based OAuth flow instead of crashing.

    Returns:
        A googleapiclient service object for the Gmail v1 API.

    Raises:
        FileNotFoundError: if the client-secrets file is missing.
    """
    from google.auth.exceptions import RefreshError
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
            except RefreshError:
                # Refresh token revoked/expired: force a full re-auth below.
                creds = None
        if not creds or not creds.valid:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)
        # Ensure the cache directory exists before writing the token; the
        # default ~/.config/gcp/langchain/ may not exist on a first run with
        # a custom GOOGLE_TOKEN_PATH.
        Path(token_path).parent.mkdir(parents=True, exist_ok=True)
        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)
def get_header_value(headers, name):
    """Return the value of the first header matching *name* (case-insensitive).

    Args:
        headers: Gmail API header list of {'name': ..., 'value': ...} dicts.
        name: header name to look up, compared case-insensitively.

    Returns:
        The matching header's value, or '' if no header matches.
    """
    wanted = name.lower()
    return next(
        (h['value'] for h in headers if h['name'].lower() == wanted),
        '',
    )
def decode_body(payload):
    """Decode the plain-text body from a Gmail API message payload.

    Concatenates the top-level body (if present) with every text/plain part,
    recursing into nested multipart containers. Undecodable bytes are dropped
    (errors='ignore') rather than raising.

    Args:
        payload: the 'payload' dict of a Gmail message resource.

    Returns:
        The decoded body text, '' when nothing decodable is found.
    """
    body = ""
    data = payload.get('body', {}).get('data')
    if data:
        body = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
    # Handle multipart messages
    for part in payload.get('parts', []):
        if part['mimeType'] == 'text/plain':
            # Defensive .get(): some parts carry no 'body' or 'data' key at
            # all, which previously raised KeyError.
            part_data = part.get('body', {}).get('data')
            if part_data:
                body += base64.urlsafe_b64decode(part_data).decode('utf-8', errors='ignore')
        elif 'parts' in part:
            # Recursively handle nested parts (e.g. multipart/alternative).
            body += decode_body(part)
    return body
def load_gmail(n=50, query=None):
    """Fetch up to *n* Gmail messages and convert them to LangChain Documents.

    Args:
        n: maximum number of messages to fetch.
        query: optional Gmail search query (same syntax as the Gmail search box).

    Returns:
        A list of Document objects, one per message, carrying the decoded body
        plus subject/from/to/date metadata.
    """
    service = get_gmail_service()

    # Fetch message list
    listing = service.users().messages().list(
        userId='me',
        maxResults=n,
        q=query if query else ''
    ).execute()
    refs = listing.get('messages', [])
    if not refs:
        print("No messages found.")
        return []

    print(f"Fetching {len(refs)} messages...")
    docs = []
    for i, ref in enumerate(refs, 1):
        # Fetch the full message payload for this reference.
        msg = service.users().messages().get(
            userId='me',
            id=ref['id'],
            format='full'
        ).execute()

        # Extract the headers we index on.
        headers = msg['payload']['headers']
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')
        body = decode_body(msg['payload'])

        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Prepend the headers to the body so they are embedded/searchable too.
        content = (
            f"Subject: {subject}\n"
            f"From: {sender}\n"
            f"To: {to}\n"
            f"Date: {date}\n\n"
            f"{body}"
        )
        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f" Processed {i}/{len(refs)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs
def main():
    """End-to-end ingestion: fetch Gmail, chunk, embed, and persist to Chroma."""
    print("Starting Gmail RAG ingestion...\n")
    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)
    if not gmail_docs:
        print("No documents to process. Exiting.")
        return
    # 2) Split into chunks
    # Overlap keeps context across chunk boundaries for retrieval.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")
    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    # NOTE(review): persist() is deprecated/auto-handled on newer langchain
    # Chroma versions — confirm against the pinned langchain-community version.
    vs.persist()
    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,14 @@
python-dotenv
langchain
langchain-core
langchain-community
langchain-openai
langchain-chroma
langchain-google-community
chromadb
openai
gradio
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client