Implement Gmail RAG assistant
22
week5/community-contributions/emmy/gmail_rag/.gitignore
vendored
Normal file
@@ -0,0 +1,22 @@
# Secrets - NEVER commit these!
.env
credentials.json
token.json

# Vector Database (contains your emails)
chroma/

# Python
__pycache__/
*.py[cod]
*.egg-info/
.venv/
venv/

# IDE
.vscode/
.idea/
.DS_Store

# Logs
*.log
88
week5/community-contributions/emmy/gmail_rag/README.md
Normal file
@@ -0,0 +1,88 @@
# Gmail RAG Assistant 📧

Search and ask questions about your Gmail emails using AI.

## Setup

### 1. Install Dependencies

```bash
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```

### 2. Google Cloud Setup

1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a project and enable the **Gmail API**
3. Create **OAuth 2.0 Desktop Client** credentials
4. Download the credentials file and save it as `~/.config/gcp/langchain/credentials.json`
5. Add your email address as a test user on the OAuth consent screen
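
Optionally, you can complete the one-time browser consent step before indexing. The sketch below uses the same calls and default paths as `get_gmail_service()` in `ingest_gmail_drive.py`; adjust the paths if yours differ:

```python
import os
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]  # read-only, same scope as the ingest script
creds_path = os.path.expanduser("~/.config/gcp/langchain/credentials.json")

# Opens a browser window for consent and returns authorized credentials
flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
creds = flow.run_local_server(port=0)

# Cache the token so later runs skip the browser step
with open(os.path.expanduser("~/.config/gcp/langchain/token.json"), "w") as token:
    token.write(creds.to_json())
print("Token saved.")
```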

### 3. Configure Environment

Create a `.env` file:

```env
GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json
GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json
OPENAI_API_KEY=your_openai_api_key_here
```

Get an OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys).
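
Both scripts call `load_dotenv()` at start-up; `ingest_gmail_drive.py` also falls back to the default paths above when the variables are unset, and `~` is expanded with `os.path.expanduser`. For reference, a short sketch of how it resolves the paths:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory

creds_path = os.path.expanduser(
    os.getenv("GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json")
)
token_path = os.path.expanduser(
    os.getenv("GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json")
)
print(creds_path, token_path)
```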

## Usage

### Index your emails:

```bash
python ingest_gmail_drive.py
```

### Launch UI:

```bash
python app.py
```

Open `http://localhost:7860` in your browser.
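
You can also query the persisted index from Python without the Gradio UI. A minimal sketch using the same settings as `app.py` (the `chroma/` directory and `text-embedding-3-small` embeddings); it assumes `OPENAI_API_KEY` is available via `.env`:

```python
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

load_dotenv()  # provides OPENAI_API_KEY for the embeddings client

# Reopen the vector store created by ingest_gmail_drive.py
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(persist_directory="chroma", embedding_function=embeddings)

# Print the 3 most similar email chunks for a query
for doc in vectorstore.similarity_search("invoice payment", k=3):
    print(doc.metadata.get("subject", "N/A"), "-", doc.metadata.get("from", "N/A"))
```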

## File Structure

```
gmail_rag/
├── ingest_gmail_drive.py   # Fetch and index emails
├── app.py                  # Gradio UI
├── requirements.txt        # Dependencies
├── .env                    # API keys (create this)
└── chroma/                 # Vector database (auto-created)
```

## Configuration

**Change number of emails** in `ingest_gmail_drive.py`:

```python
gmail_docs = load_gmail(n=100)  # Adjust this number
```

**Change AI model** in `app.py`:

```python
LLM_MODEL = "gpt-4o-mini"  # or "gpt-4", "gpt-3.5-turbo"
```
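
**Limit which emails are indexed** by passing a Gmail search query: `load_gmail()` accepts an optional `query` argument that is forwarded to the Gmail API's `q` parameter, so standard Gmail search operators should work. For example, in `main()`:

```python
gmail_docs = load_gmail(n=100, query="newer_than:90d")  # only index the last 90 days
```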

## Troubleshooting

- **"Access Blocked"**: Add your email as a test user on the Google Cloud OAuth consent screen
- **"ChromaDB not found"**: Run `ingest_gmail_drive.py` first
- **Token expired**: Delete `~/.config/gcp/langchain/token.json` and re-run

## Cost

- Embeddings: ~$0.01-0.05 per 100 emails
- Queries: ~$0.01 per 100 questions (using gpt-4o-mini)
- Gmail API: Free

## Security

Never commit `.env`, `credentials.json`, `token.json`, or `chroma/`.

The `.gitignore` file excludes these automatically.
223
week5/community-contributions/emmy/gmail_rag/app.py
Normal file
@@ -0,0 +1,223 @@
import os
import gradio as gr
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

load_dotenv()

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"

# Initialize components
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
vectorstore = None
qa_chain = None


def initialize_qa_chain():
    """Initialize the QA chain with the vector store."""
    global vectorstore, qa_chain

    if not os.path.exists(CHROMA_DIR):
        return "❌ ChromaDB not found. Please run ingest_gmail_drive.py first to index your emails."

    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DIR,
            embedding_function=embeddings
        )

        # Create custom prompt
        prompt_template = """Use the following pieces of context from Gmail emails to answer the question.
If you don't know the answer based on the context, just say you don't have that information in the emails.

Context:
{context}

Question: {question}

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        llm = ChatOpenAI(model=LLM_MODEL, temperature=0)
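
        # "stuff" concatenates the retrieved chunks directly into the prompt;
        # the retriever below supplies the 5 most similar chunks per question.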
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
            chain_type_kwargs={"prompt": PROMPT},
            return_source_documents=True
        )

        return "✓ Ready to answer questions about your emails!"
    except Exception as e:
        return f"❌ Error initializing: {str(e)}"


def query_emails(question, num_results=5):
    """Query the email database."""
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""

    if not question.strip():
        return "Please enter a question.", ""

    try:
        # Get answer
        result = qa_chain({"query": question})
        answer = result['result']

        # Format sources
        sources_text = "\n\n---\n\n**📧 Source Emails:**\n\n"
        for i, doc in enumerate(result['source_documents'][:num_results], 1):
            sources_text += f"**Email {i}:**\n"
            sources_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            sources_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            sources_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            sources_text += f"- **Preview:** {doc.page_content[:200]}...\n\n"

        return answer, sources_text
    except Exception as e:
        return f"❌ Error: {str(e)}", ""


def search_emails(query_text, num_results=5):
    """Direct vector similarity search."""
    if vectorstore is None:
        return "Please click 'Initialize System' first!"

    if not query_text.strip():
        return "Please enter a search query."

    try:
        docs = vectorstore.similarity_search(query_text, k=num_results)

        results_text = f"**Found {len(docs)} relevant emails:**\n\n"
        for i, doc in enumerate(docs, 1):
            results_text += f"**Email {i}:**\n"
            results_text += f"- **Subject:** {doc.metadata.get('subject', 'N/A')}\n"
            results_text += f"- **From:** {doc.metadata.get('from', 'N/A')}\n"
            results_text += f"- **Date:** {doc.metadata.get('date', 'N/A')}\n"
            results_text += f"- **Content Preview:**\n{doc.page_content[:300]}...\n\n"
            results_text += "---\n\n"

        return results_text
    except Exception as e:
        return f"❌ Error: {str(e)}"


# Create Gradio Interface
with gr.Blocks(title="Gmail RAG Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 📧 Gmail RAG Assistant
        Ask questions about your emails or search for specific content.
        """
    )

    with gr.Row():
        init_btn = gr.Button("🚀 Initialize System", variant="primary")
        status_text = gr.Textbox(label="Status", interactive=False)

    init_btn.click(fn=initialize_qa_chain, outputs=status_text)

    gr.Markdown("---")

    with gr.Tab("💬 Ask Questions"):
        gr.Markdown("Ask natural language questions about your emails.")

        with gr.Row():
            with gr.Column(scale=4):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the latest message from Andela?",
                    lines=2
                )
            with gr.Column(scale=1):
                qa_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Sources to Show"
                )

        qa_btn = gr.Button("Ask Question", variant="primary")

        answer_output = gr.Markdown(label="Answer")
        sources_output = gr.Markdown(label="Sources")

        qa_btn.click(
            fn=query_emails,
            inputs=[question_input, qa_num_results],
            outputs=[answer_output, sources_output]
        )

        # Example questions
        gr.Examples(
            examples=[
                ["What is the latest message from Andela?"],
                ["Summarize emails about project updates"],
                ["What meetings do I have scheduled?"],
                ["Find emails about invoices or payments"],
            ],
            inputs=question_input
        )

    with gr.Tab("🔍 Search Emails"):
        gr.Markdown("Search for emails using semantic similarity.")

        with gr.Row():
            with gr.Column(scale=4):
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="project deadline",
                    lines=2
                )
            with gr.Column(scale=1):
                search_num_results = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Results"
                )

        search_btn = gr.Button("Search", variant="primary")
        search_output = gr.Markdown(label="Search Results")

        search_btn.click(
            fn=search_emails,
            inputs=[search_input, search_num_results],
            outputs=search_output
        )

        gr.Examples(
            examples=[
                ["Andela"],
                ["meeting schedule"],
                ["invoice payment"],
                ["project status update"],
            ],
            inputs=search_input
        )

    gr.Markdown(
        """
        ---
        **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails.
        """
    )


if __name__ == "__main__":
    demo.launch()
189
week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py
Normal file
@@ -0,0 +1,189 @@
import os
import base64
from pathlib import Path
from dotenv import load_dotenv
from email.utils import parsedate_to_datetime

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
GOOGLE_CREDENTIALS_PATH = os.getenv(
    "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json"
)
GOOGLE_TOKEN_PATH = os.getenv(
    "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json"
)

# ---- LangChain imports ----
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"


def get_gmail_service():
    """Authenticate and return Gmail API service."""
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
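
    # Refresh an expired token when a refresh token is available; otherwise
    # run the one-time browser consent flow and cache the new token on disk.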
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)

        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def get_header_value(headers, name):
    """Extract header value from Gmail headers list."""
    for header in headers:
        if header['name'].lower() == name.lower():
            return header['value']
    return ''


def decode_body(payload):
    """Decode email body from Gmail payload."""
    body = ""

    if 'body' in payload and 'data' in payload['body']:
        body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore')

    # Handle multipart messages
    if 'parts' in payload:
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                if 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
            elif 'parts' in part:
                # Recursively handle nested parts
                body += decode_body(part)

    return body


def load_gmail(n=50, query=None):
    """Load Gmail messages directly using Gmail API."""
    service = get_gmail_service()

    # Fetch message list
    results = service.users().messages().list(
        userId='me',
        maxResults=n,
        q=query if query else ''
    ).execute()

    messages = results.get('messages', [])

    if not messages:
        print("No messages found.")
        return []

    print(f"Fetching {len(messages)} messages...")

    docs = []
    for i, msg_ref in enumerate(messages, 1):
        # Fetch full message
        msg = service.users().messages().get(
            userId='me',
            id=msg_ref['id'],
            format='full'
        ).execute()

        # Extract headers
        headers = msg['payload']['headers']
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')

        # Extract body
        body = decode_body(msg['payload'])

        # Create metadata
        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Format content
        content = f"Subject: {subject}\n"
        content += f"From: {sender}\n"
        content += f"To: {to}\n"
        content += f"Date: {date}\n\n"
        content += body

        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f"  Processed {i}/{len(messages)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs


def main():
    print("Starting Gmail RAG ingestion...\n")

    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)

    if not gmail_docs:
        print("No documents to process. Exiting.")
        return

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")

    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    vs.persist()

    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")


if __name__ == "__main__":
    main()
14
week5/community-contributions/emmy/gmail_rag/requirements.txt
Normal file
@@ -0,0 +1,14 @@
python-dotenv
langchain
langchain-core
langchain-community
langchain-openai
langchain-chroma
langchain-google-community
chromadb
openai
gradio
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client