Implement Gmail RAG assistant
week5/community-contributions/emmy/gmail_rag/.gitignore (vendored, new file, 22 lines)
@@ -0,0 +1,22 @@
# Secrets - NEVER commit these!
.env
credentials.json
token.json

# Vector Database (contains your emails)
chroma/

# Python
__pycache__/
*.py[cod]
*.egg-info/
.venv/
venv/

# IDE
.vscode/
.idea/
.DS_Store

# Logs
*.log
week5/community-contributions/emmy/gmail_rag/README.md (new file, 88 lines)
@@ -0,0 +1,88 @@
# Gmail RAG Assistant 📧

Search and ask questions about your Gmail emails using AI.

## Setup

### 1. Install Dependencies

```bash
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```

### 2. Google Cloud Setup

1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a project and enable the **Gmail API**
3. Create **OAuth 2.0 Desktop Client** credentials
4. Download the file and save it as `~/.config/gcp/langchain/credentials.json` (you can sanity-check it with the snippet below)
5. Add your email as a test user on the OAuth consent screen
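A quick, optional way to confirm you downloaded the right kind of credential: Desktop-app OAuth clients are JSON files whose top-level key is `"installed"`. The snippet below is just an illustrative check (it is not part of this repo) and assumes the default path used by this project:

```python
# Hypothetical sanity check for the downloaded OAuth client file.
import json
import os

path = os.path.expanduser("~/.config/gcp/langchain/credentials.json")
with open(path) as f:
    data = json.load(f)

if "installed" in data:
    print("Desktop (installed app) OAuth client detected - good.")
else:
    print("Unexpected format - re-download the OAuth 2.0 Desktop Client credentials.")
```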
### 3. Configure Environment

Create a `.env` file:

```env
GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json
GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json
OPENAI_API_KEY=your_openai_api_key_here
```

Get an OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys).
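Both scripts call `load_dotenv()` and read these values with `os.getenv`, expanding `~` via `os.path.expanduser`; simplified from `ingest_gmail_drive.py`:

```python
# How the scripts pick up the .env values (simplified sketch).
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory

creds_path = os.path.expanduser(
    os.getenv("GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json")
)
token_path = os.path.expanduser(
    os.getenv("GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json")
)
# OPENAI_API_KEY is read from the environment by the OpenAI/LangChain clients.
```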
## Usage

### Index your emails:
```bash
python ingest_gmail_drive.py
```
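On the first run a browser window opens for the Google OAuth consent flow; after that, the console output looks roughly like this (the counts are illustrative and depend on your mailbox and settings):

```
Starting Gmail RAG ingestion...

Fetching 50 messages...
  Processed 10/50 messages...
✓ Gmail: loaded 50 documents
✓ Created 112 chunks
✓ Creating embeddings with text-embedding-3-small...
✓ Successfully persisted ChromaDB at: chroma

Ingestion complete! You can now query your Gmail data.
```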
### Launch UI:
```bash
python app.py
```

Open `http://localhost:7860` in your browser.

## File Structure

```
gmail_rag/
├── ingest_gmail_drive.py   # Fetch and index emails
├── app.py                  # Gradio UI
├── requirements.txt        # Dependencies
├── .env                    # API keys (create this)
└── chroma/                 # Vector database (auto-created)
```

## Configuration

**Change the number of emails** in `ingest_gmail_drive.py`:
```python
gmail_docs = load_gmail(n=100)  # Adjust this number
```
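**Limit which emails are indexed** in `ingest_gmail_drive.py`: `load_gmail()` also accepts an optional Gmail search query that is passed straight to the Gmail API, so standard Gmail search operators work (the query string below is just an example):
```python
gmail_docs = load_gmail(n=100, query="newer_than:90d")  # only mail from the last 90 days
```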
**Change the AI model** in `app.py`:
```python
LLM_MODEL = "gpt-4o-mini"  # or "gpt-4", "gpt-3.5-turbo"
```

## Troubleshooting

- **"Access Blocked"**: add your email as a test user on the Google Cloud OAuth consent screen
- **"ChromaDB not found"**: run `ingest_gmail_drive.py` first
- **Token expired**: delete `~/.config/gcp/langchain/token.json` and re-run (see the command below)
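Resetting authentication just means deleting the cached token; the next run of either script re-opens the browser consent flow:

```bash
rm ~/.config/gcp/langchain/token.json
python ingest_gmail_drive.py   # re-authenticates in the browser
```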
## Cost

- Embeddings: ~$0.01-0.05 per 100 emails
- Queries: ~$0.01 per 100 questions (using gpt-4o-mini)
- Gmail API: Free

## Security

Never commit: `.env`, `credentials.json`, `token.json`, `chroma/`

The `.gitignore` file protects these automatically; you can verify with the command below.
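To double-check that the sensitive paths are covered, run this from inside `gmail_rag/` (it prints the matching `.gitignore` rule for each ignored path):

```bash
git check-ignore -v .env credentials.json token.json chroma/
```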
week5/community-contributions/emmy/gmail_rag/app.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import os
import gradio as gr
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

load_dotenv()

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"

# Initialize components
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
vectorstore = None
qa_chain = None


def initialize_qa_chain():
    """Initialize the QA chain with the vector store."""
    global vectorstore, qa_chain

    if not os.path.exists(CHROMA_DIR):
        return "❌ ChromaDB not found. Please run ingest_gmail_drive.py first to index your emails."

    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=embeddings
        )

        # Create custom prompt
        prompt_template = """Use the following pieces of context from Gmail emails to answer the question.
If you don't know the answer based on the context, just say you don't have that information in the emails.

Context:
{context}

Question: {question}

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        llm = ChatOpenAI(model=LLM_MODEL, temperature=0)

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        return "✓ Ready to answer questions about your emails!"
    except Exception as e:
        return f"❌ Error initializing: {str(e)}"


def query_emails(question, num_results=5):
    """Query the email database."""
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""

    if not question.strip():
        return "Please enter a question.", ""

    try:
        # Get answer
        result = qa_chain.invoke({"query": question})
        answer = result['result']

        # Format sources
        sources_text = "\n\n---\n\n**📧 Source Emails:**\n\n"
        for i, doc in enumerate(result['source_documents'][:num_results], 1):
            sources_text += f"**Email {i}:**\n"
            sources_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            sources_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            sources_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            sources_text += f"- **Preview:** {doc.page_content[:200]}...\n\n"

        return answer, sources_text
    except Exception as e:
        return f"❌ Error: {str(e)}", ""


def search_emails(query_text, num_results=5):
    """Direct vector similarity search."""
    if vectorstore is None:
        return "Please click 'Initialize System' first!"

    if not query_text.strip():
        return "Please enter a search query."

    try:
        docs = vectorstore.similarity_search(query_text, k=num_results)

        results_text = f"**Found {len(docs)} relevant emails:**\n\n"
        for i, doc in enumerate(docs, 1):
            results_text += f"**Email {i}:**\n"
            results_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            results_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            results_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            results_text += f"- **Content Preview:**\n{doc.page_content[:300]}...\n\n"
            results_text += "---\n\n"

        return results_text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Create Gradio Interface
with gr.Blocks(title="Gmail RAG Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📧 Gmail RAG Assistant
        Ask questions about your emails or search for specific content.
        """
    )

    with gr.Row():
        init_btn = gr.Button("🚀 Initialize System", variant="primary")
        status_text = gr.Textbox(label="Status", interactive=False)

    init_btn.click(fn=initialize_qa_chain, outputs=status_text)

    gr.Markdown("---")

    with gr.Tab("💬 Ask Questions"):
        gr.Markdown("Ask natural language questions about your emails.")

        with gr.Row():
            with gr.Column(scale=4):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the latest message from Andela?",
                    lines=2
                )
            with gr.Column(scale=1):
                qa_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Sources to Show"
                )

        qa_btn = gr.Button("Ask Question", variant="primary")

        answer_output = gr.Markdown(label="Answer")
        sources_output = gr.Markdown(label="Sources")

        qa_btn.click(
            fn=query_emails,
            inputs=[question_input, qa_num_results],
            outputs=[answer_output, sources_output]
        )

        # Example questions
        gr.Examples(
            examples=[
                ["What is the latest message from Andela?"],
                ["Summarize emails about project updates"],
                ["What meetings do I have scheduled?"],
                ["Find emails about invoices or payments"],
            ],
            inputs=question_input
        )

    with gr.Tab("🔍 Search Emails"):
        gr.Markdown("Search for emails using semantic similarity.")

        with gr.Row():
            with gr.Column(scale=4):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="project deadline",
                    lines=2
                )
            with gr.Column(scale=1):
                search_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Results"
                )

        search_btn = gr.Button("Search", variant="primary")
        search_output = gr.Markdown(label="Search Results")

        search_btn.click(
            fn=search_emails,
            inputs=[search_input, search_num_results],
            outputs=search_output
        )

        gr.Examples(
            examples=[
                ["Andela"],
                ["meeting schedule"],
                ["invoice payment"],
                ["project status update"],
            ],
            inputs=search_input
        )

    gr.Markdown(
        """
        ---
        **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails.
        """
    )


if __name__ == "__main__":
    demo.launch()
week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from email.utils import parsedate_to_datetime

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
GOOGLE_CREDENTIALS_PATH = os.getenv(
    "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json"
)
GOOGLE_TOKEN_PATH = os.getenv(
    "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json"
)

# ---- LangChain imports ----
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"


def get_gmail_service():
    """Authenticate and return Gmail API service."""
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)

        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def get_header_value(headers, name):
    """Extract header value from Gmail headers list."""
    for header in headers:
        if header['name'].lower() == name.lower():
            return header['value']
    return ''


def decode_body(payload):
    """Decode email body from Gmail payload."""
    body = ""

    if 'body' in payload and 'data' in payload['body']:
        body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore')

    # Handle multipart messages
    if 'parts' in payload:
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                if 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
            elif 'parts' in part:
                # Recursively handle nested parts
                body += decode_body(part)

    return body


def load_gmail(n=50, query=None):
    """Load Gmail messages directly using Gmail API."""
    service = get_gmail_service()

    # Fetch message list
    results = service.users().messages().list(
        userId='me',
        maxResults=n,
        q=query if query else ''
    ).execute()

    messages = results.get('messages', [])

    if not messages:
        print("No messages found.")
        return []

    print(f"Fetching {len(messages)} messages...")

    docs = []
    for i, msg_ref in enumerate(messages, 1):
        # Fetch full message
        msg = service.users().messages().get(
            userId='me',
            id=msg_ref['id'],
            format='full'
        ).execute()

        # Extract headers
        headers = msg['payload']['headers']
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')

        # Extract body
        body = decode_body(msg['payload'])

        # Create metadata
        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Format content
        content = f"Subject: {subject}\n"
        content += f"From: {sender}\n"
        content += f"To: {to}\n"
        content += f"Date: {date}\n\n"
        content += body

        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f"  Processed {i}/{len(messages)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs


def main():
    print("Starting Gmail RAG ingestion...\n")

    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)

    if not gmail_docs:
        print("No documents to process. Exiting.")
        return

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")

    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    vs.persist()  # persistence is automatic with chromadb >= 0.4; kept for older setups

    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")


if __name__ == "__main__":
    main()
week5/community-contributions/emmy/gmail_rag/requirements.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
python-dotenv
langchain
langchain-core
langchain-community
langchain-openai
langchain-chroma
langchain-google-community
chromadb
openai
gradio
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client