diff --git a/week5/community-contributions/emmy/gmail_rag/.gitignore b/week5/community-contributions/emmy/gmail_rag/.gitignore new file mode 100644 index 0000000..d1bf9ba --- /dev/null +++ b/week5/community-contributions/emmy/gmail_rag/.gitignore @@ -0,0 +1,22 @@ +# Secrets - NEVER commit these! +.env +credentials.json +token.json + +# Vector Database (contains your emails) +chroma/ + +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +venv/ + +# IDE +.vscode/ +.idea/ +.DS_Store + +# Logs +*.log \ No newline at end of file diff --git a/week5/community-contributions/emmy/gmail_rag/README.md b/week5/community-contributions/emmy/gmail_rag/README.md new file mode 100644 index 0000000..7ea8264 --- /dev/null +++ b/week5/community-contributions/emmy/gmail_rag/README.md @@ -0,0 +1,88 @@ +# Gmail RAG Assistant 📧 + +Search and ask questions about your Gmail emails using AI. + +## Setup + +### 1. Install Dependencies + +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +``` + +### 2. Google Cloud Setup + +1. Go to [Google Cloud Console](https://console.cloud.google.com) +2. Create a project and enable **Gmail API** +3. Create **OAuth 2.0 Desktop Client** credentials +4. Download and save as `~/.config/gcp/langchain/credentials.json` +5. Add your email as a test user in OAuth consent screen + +### 3. Configure Environment + +Create `.env` file: + +```env +GOOGLE_CREDENTIALS_PATH=~/.config/gcp/langchain/credentials.json +GOOGLE_TOKEN_PATH=~/.config/gcp/langchain/token.json +OPENAI_API_KEY=your_openai_api_key_here +``` + +Get OpenAI API key from [platform.openai.com](https://platform.openai.com/api-keys) + +## Usage + +### Index your emails: +```bash +python ingest_gmail_drive.py +``` + +### Launch UI: +```bash +python app.py +``` + +Open `http://localhost:7860` in your browser. 
+ +## File Structure + +``` +gmail_rag/ +├── ingest_gmail_drive.py # Fetch and index emails +├── app.py # Gradio UI +├── requirements.txt # Dependencies +├── .env # API keys (create this) +└── chroma/ # Vector database (auto-created) +``` + +## Configuration + +**Change number of emails** in `ingest_gmail_drive.py`: +```python +gmail_docs = load_gmail(n=100) # Adjust this number +``` + +**Change AI model** in `app.py`: +```python +LLM_MODEL = "gpt-4o-mini" # or "gpt-4", "gpt-3.5-turbo" +``` + +## Troubleshooting + +- **"Access Blocked"**: Add your email as test user in Google Cloud +- **"ChromaDB not found"**: Run `ingest_gmail_drive.py` first +- **Token expired**: Delete `~/.config/gcp/langchain/token.json` and re-run + +## Cost + +- Embeddings: ~$0.01-0.05 per 100 emails +- Queries: ~$0.01 per 100 questions (using gpt-4o-mini) +- Gmail API: Free + +## Security + +Never commit: `.env`, `credentials.json`, `token.json`, `chroma/` + +The `.gitignore` file protects these automatically. \ No newline at end of file diff --git a/week5/community-contributions/emmy/gmail_rag/app.py b/week5/community-contributions/emmy/gmail_rag/app.py new file mode 100644 index 0000000..e3eb42c --- /dev/null +++ b/week5/community-contributions/emmy/gmail_rag/app.py @@ -0,0 +1,223 @@ +import os +import gradio as gr +from dotenv import load_dotenv +from langchain_community.vectorstores import Chroma +from langchain_openai import OpenAIEmbeddings, ChatOpenAI +from langchain.chains import RetrievalQA +from langchain.prompts import PromptTemplate + +load_dotenv() + +# ---- Settings ---- +CHROMA_DIR = "chroma" +EMBED_MODEL = "text-embedding-3-small" +LLM_MODEL = "gpt-4o-mini" + +# Initialize components +embeddings = OpenAIEmbeddings(model=EMBED_MODEL) +vectorstore = None +qa_chain = None + + +def initialize_qa_chain(): + """Initialize the QA chain with the vector store.""" + global vectorstore, qa_chain + + if not os.path.exists(CHROMA_DIR): + return "❌ ChromaDB not found. 
def _preview(text, limit):
    """Return at most *limit* characters of *text*, adding '...' only if cut."""
    text = text or ""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _email_header_markdown(index, metadata):
    """Render the subject/from/date bullet list shared by both result views."""
    return (
        f"**Email {index}:**\n"
        f"- **Subject:** {metadata.get('subject', 'N/A')}\n"
        f"- **From:** {metadata.get('from', 'N/A')}\n"
        f"- **Date:** {metadata.get('date', 'N/A')}\n"
    )


def query_emails(question, num_results=5):
    """Answer a natural-language question using the indexed emails.

    Args:
        question: The user's question. Empty, whitespace-only or ``None``
            input is rejected with a friendly message.
        num_results: Maximum number of source emails to list under the
            answer. Coerced to ``int`` because UI sliders may deliver floats.
            NOTE: the retriever is built with ``k=5`` in
            ``initialize_qa_chain``, so no more than five sources are ever
            available regardless of this value.

    Returns:
        A ``(answer_markdown, sources_markdown)`` tuple. On any failure the
        first element carries the error text and the second is ``""``.
    """
    if qa_chain is None:
        return "Please click 'Initialize System' first!", ""

    if not question or not question.strip():
        return "Please enter a question.", ""

    try:
        # ``Chain.__call__`` is deprecated; ``invoke`` is the supported entry
        # point and returns the same ``result`` / ``source_documents`` dict.
        result = qa_chain.invoke({"query": question})
        answer = result['result']

        # A float slice bound raises TypeError, so normalise first.
        limit = max(int(num_results), 0)
        source_docs = result.get('source_documents', [])[:limit]

        sections = ["\n\n---\n\n**📧 Source Emails:**\n\n"]
        for i, doc in enumerate(source_docs, 1):
            sections.append(_email_header_markdown(i, doc.metadata))
            sections.append(
                f"- **Preview:** {_preview(doc.page_content, 200)}\n\n"
            )

        return answer, "".join(sections)
    except Exception as e:
        # UI boundary: surface the failure in the page instead of crashing
        # the Gradio callback.
        return f"❌ Error: {str(e)}", ""


def search_emails(query_text, num_results=5):
    """Run a direct vector-similarity search over the indexed emails.

    Args:
        query_text: Free-text search query. Empty, whitespace-only or
            ``None`` input is rejected with a friendly message.
        num_results: Number of nearest chunks to fetch. Coerced to ``int``
            (minimum 1) before being passed to the vector store as ``k``.

    Returns:
        A Markdown string listing the matching emails, or an error/help
        message.
    """
    if vectorstore is None:
        return "Please click 'Initialize System' first!"

    if not query_text or not query_text.strip():
        return "Please enter a search query."

    try:
        k = max(int(num_results), 1)
        docs = vectorstore.similarity_search(query_text, k=k)

        sections = [f"**Found {len(docs)} relevant emails:**\n\n"]
        for i, doc in enumerate(docs, 1):
            sections.append(_email_header_markdown(i, doc.metadata))
            sections.append(
                f"- **Content Preview:**\n{_preview(doc.page_content, 300)}\n\n"
            )
            sections.append("---\n\n")

        return "".join(sections)
    except Exception as e:
        # UI boundary: report the problem to the user rather than raising.
        return f"❌ Error: {str(e)}"
meetings do I have scheduled?"], + ["Find emails about invoices or payments"], + ], + inputs=question_input + ) + + with gr.Tab("🔍 Search Emails"): + gr.Markdown("Search for emails using semantic similarity.") + + with gr.Row(): + with gr.Column(scale=4): + search_input = gr.Textbox( + label="Search Query", + placeholder="project deadline", + lines=2 + ) + with gr.Column(scale=1): + search_num_results = gr.Slider( + minimum=1, + maximum=10, + value=5, + step=1, + label="Results" + ) + + search_btn = gr.Button("Search", variant="primary") + search_output = gr.Markdown(label="Search Results") + + search_btn.click( + fn=search_emails, + inputs=[search_input, search_num_results], + outputs=search_output + ) + + gr.Examples( + examples=[ + ["Andela"], + ["meeting schedule"], + ["invoice payment"], + ["project status update"], + ], + inputs=search_input + ) + + gr.Markdown( + """ + --- + **Note:** Make sure you've run `ingest_gmail_drive.py` first to index your emails. + """ + ) + + +if __name__ == "__main__": + demo.launch() \ No newline at end of file diff --git a/week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py b/week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py new file mode 100644 index 0000000..4cb3ecb --- /dev/null +++ b/week5/community-contributions/emmy/gmail_rag/ingest_gmail_drive.py @@ -0,0 +1,189 @@ +import os +import base64 +from pathlib import Path +from dotenv import load_dotenv +from email.utils import parsedate_to_datetime + +# Load environment variables from .env file +load_dotenv() + +# --- Configuration --- +GOOGLE_CREDENTIALS_PATH = os.getenv( + "GOOGLE_CREDENTIALS_PATH", "~/.config/gcp/langchain/credentials.json" +) +GOOGLE_TOKEN_PATH = os.getenv( + "GOOGLE_TOKEN_PATH", "~/.config/gcp/langchain/token.json" +) + +# ---- LangChain imports ---- +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.vectorstores import Chroma +from langchain_openai import OpenAIEmbeddings +from 
# ---- Settings ----
CHROMA_DIR = "chroma"
EMBED_MODEL = "text-embedding-3-small"

# Upper bound on ``maxResults`` per Gmail ``messages.list`` call; larger
# requests must be paged.
_GMAIL_LIST_PAGE_LIMIT = 500


def get_gmail_service():
    """Authenticate with Google and return a Gmail API service client.

    Reuses the cached token at ``GOOGLE_TOKEN_PATH`` when it is valid,
    refreshes it when it has expired and a refresh token is present, and
    otherwise runs the local-server OAuth flow. The resulting token is
    written back to ``GOOGLE_TOKEN_PATH``.

    Returns:
        The object produced by ``googleapiclient.discovery.build('gmail', 'v1')``.

    Raises:
        FileNotFoundError: If the OAuth client credentials file is missing.
    """
    # Imported lazily so the module can be imported without the Google
    # client libraries installed.
    from google.auth.transport.requests import Request
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from googleapiclient.discovery import build

    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

    token_path = os.path.expanduser(GOOGLE_TOKEN_PATH)
    creds_path = os.path.expanduser(GOOGLE_CREDENTIALS_PATH)

    if not os.path.exists(creds_path):
        raise FileNotFoundError(
            f"Credentials file not found at: {creds_path}\n"
            f"Please download OAuth 2.0 Client ID credentials from Google Cloud Console."
        )

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
            creds = flow.run_local_server(port=0)

        # The token may be configured to live in a directory that does not
        # exist yet; create it so the write below cannot fail after the user
        # has already completed the browser consent flow.
        token_dir = os.path.dirname(token_path)
        if token_dir:
            os.makedirs(token_dir, exist_ok=True)
        with open(token_path, "w", encoding="utf-8") as token:
            token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)


def get_header_value(headers, name):
    """Return the value of header *name* (case-insensitive), or ``''``.

    Args:
        headers: Iterable of ``{'name': ..., 'value': ...}`` dicts as found in
            a Gmail message payload; ``None`` or an empty list is accepted.
        name: Header name to look up, e.g. ``'Subject'``.

    Returns:
        The first matching header's value, or an empty string when the header
        is absent or the entry is malformed.
    """
    wanted = name.lower()
    for header in headers or ():
        if header.get('name', '').lower() == wanted:
            return header.get('value', '')
    return ''


def _decode_b64url(data):
    """Decode a base64url string to text, tolerating missing '=' padding."""
    padded = data + '=' * (-len(data) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='ignore')


def _html_to_text(markup):
    """Reduce an HTML fragment to whitespace-normalised plain text."""
    import html
    import re

    # Drop script/style blocks entirely: their contents are not prose.
    stripped = re.sub(r'(?is)<(script|style)\b.*?</\1\s*>', ' ', markup)
    stripped = re.sub(r'(?s)<[^>]+>', ' ', stripped)
    # Unescape after tag removal so "&lt;b&gt;" is kept as literal text.
    return ' '.join(html.unescape(stripped).split())


def _collect_bodies(payload, plain_chunks, html_chunks, is_root):
    """Walk a payload tree depth-first, gathering plain-text and HTML bodies."""
    mime_type = payload.get('mimeType', '')
    data = (payload.get('body') or {}).get('data')
    if data:
        if mime_type == 'text/html':
            html_chunks.append(_decode_b64url(data))
        elif is_root or mime_type == 'text/plain' or 'parts' in payload:
            # The root and container nodes are decoded whatever their type;
            # leaf parts only when they are text/plain (skips attachments).
            plain_chunks.append(_decode_b64url(data))
    for part in payload.get('parts') or ():
        _collect_bodies(part, plain_chunks, html_chunks, is_root=False)


def decode_body(payload):
    """Extract the readable text of an email from a Gmail message payload.

    Plain-text bodies are concatenated in document order. When the message
    carries no plain-text body at all, its ``text/html`` bodies are converted
    to text instead, so HTML-only mail is not indexed as an empty document.

    Args:
        payload: The ``payload`` dict of a Gmail message (``format='full'``),
            possibly containing nested ``parts``.

    Returns:
        The decoded body text; ``''`` when nothing readable is found.
    """
    if not payload:
        return ""

    plain_chunks = []
    html_chunks = []
    _collect_bodies(payload, plain_chunks, html_chunks, is_root=True)

    if plain_chunks:
        return "".join(plain_chunks)
    return "\n".join(_html_to_text(chunk) for chunk in html_chunks)


def load_gmail(n=50, query=None):
    """Load up to *n* Gmail messages as LangChain ``Document`` objects.

    Args:
        n: Maximum number of messages to fetch. Values above the per-request
            page limit are satisfied by following ``nextPageToken``.
        query: Optional Gmail search query (same syntax as the Gmail search
            box). ``None`` or ``''`` means no filter.

    Returns:
        A list of ``Document`` objects whose content is a small header block
        (subject/from/to/date) followed by the decoded body, with the same
        fields plus message and thread ids in ``metadata``.
    """
    service = get_gmail_service()
    messages_api = service.users().messages()

    # A single list() call returns one page only, so keep requesting pages
    # until n references are collected or the mailbox is exhausted.
    messages = []
    page_token = None
    while len(messages) < n:
        results = messages_api.list(
            userId='me',
            maxResults=min(n - len(messages), _GMAIL_LIST_PAGE_LIMIT),
            q=query if query else '',
            pageToken=page_token,  # None is dropped by the client library
        ).execute()
        messages.extend(results.get('messages', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    messages = messages[:n]

    if not messages:
        print("No messages found.")
        return []

    print(f"Fetching {len(messages)} messages...")

    docs = []
    for i, msg_ref in enumerate(messages, 1):
        # Fetch full message
        msg = messages_api.get(
            userId='me',
            id=msg_ref['id'],
            format='full'
        ).execute()

        payload = msg.get('payload') or {}
        headers = payload.get('headers', [])
        subject = get_header_value(headers, 'Subject')
        sender = get_header_value(headers, 'From')
        date = get_header_value(headers, 'Date')
        to = get_header_value(headers, 'To')

        body = decode_body(payload)

        metadata = {
            'source': 'gmail',
            'id': msg['id'],
            'subject': subject,
            'from': sender,
            'to': to,
            'date': date,
            'thread_id': msg.get('threadId', ''),
        }

        # Header lines are repeated in the content so they are embedded and
        # searchable together with the body.
        content = (
            f"Subject: {subject}\n"
            f"From: {sender}\n"
            f"To: {to}\n"
            f"Date: {date}\n\n"
            f"{body}"
        )

        docs.append(Document(page_content=content, metadata=metadata))

        if i % 10 == 0:
            print(f"  Processed {i}/{len(messages)} messages...")

    print(f"✓ Gmail: loaded {len(docs)} documents")
    return docs


def main():
    """Fetch recent Gmail messages, chunk them and store embeddings in Chroma."""
    print("Starting Gmail RAG ingestion...\n")

    # 1) Load Gmail documents
    gmail_docs = load_gmail(n=50)

    if not gmail_docs:
        print("No documents to process. Exiting.")
        return

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
    chunks = splitter.split_documents(gmail_docs)
    print(f"✓ Created {len(chunks)} chunks")

    # Deterministic ids ("<message id>-<chunk index>") make re-running the
    # ingestion overwrite existing chunks instead of duplicating them.
    next_index = {}
    chunk_ids = []
    for chunk in chunks:
        message_id = chunk.metadata.get('id', '')
        index = next_index.get(message_id, 0)
        next_index[message_id] = index + 1
        chunk_ids.append(f"{message_id}-{index}")

    # 3) Create embeddings and store in ChromaDB
    print(f"✓ Creating embeddings with {EMBED_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

    Path(CHROMA_DIR).mkdir(parents=True, exist_ok=True)
    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=CHROMA_DIR,
    )

    # persist() is deprecated and scheduled for removal, so only call it
    # when the wrapper still provides it.
    persist = getattr(vs, "persist", None)
    if callable(persist):
        persist()

    print(f"✓ Successfully persisted ChromaDB at: {CHROMA_DIR}\n")
    print("Ingestion complete! You can now query your Gmail data.")


if __name__ == "__main__":
    main()