diff --git a/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Gmail_API_Credential_Guide.ipynb b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Gmail_API_Credential_Guide.ipynb new file mode 100644 index 0000000..1f5e1c6 --- /dev/null +++ b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Gmail_API_Credential_Guide.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "35177005-196a-48b3-bf92-fa37d84693f4", + "metadata": {}, + "source": [ + "# Gmail API Credential Guide" + ] + }, + { + "cell_type": "markdown", + "id": "7bcad9ee-cd11-4b12-834d-9f1ddcefb190", + "metadata": {}, + "source": [ + "Use Gmail API to Read Your Emails\n", + "1. Set up a Google Cloud Project\n", + "\n", + " Go to Google Cloud Platform(GCP) Console\n", + "\n", + " Create a new project\n", + "\n", + "2. Enable the Gmail API for that project\n", + "\n", + " Select the created project and go to \"APIs & services\" page\n", + "\n", + " Click \"+ Enable APIs and services\" button, search \"Gmail API\" and enable it\n", + "\n", + "3. Go to \"OAuth Consent Screen\" and configure:\n", + "\n", + " Choose External and Fill in app name, dedveloper email, etc.\n", + "\n", + "4. Create OAuth Credentials\n", + "\n", + " Go to APIs & Services > Credentials\n", + "\n", + " Click \"+ Create Credentials\" > \"OAuth client ID\"\n", + "\n", + " Choose Desktop App\n", + "\n", + " Download the generated credentials.json\n", + "\n", + " Sometimes, GCP will navigate you to \"Google Auth Platform\" > \"Clients\", and you can click \"+ Create client\" here to create the OAuth Credentials\n", + "\n", + " \n", + "5. Add Test Users for Gmail API OAuth Access\n", + " \n", + " Go to \"APIs & Services\" > \"OAuth consent screen\" > \"Audience\" > \"Test Users\"\n", + "\n", + " Add the email account from which you want to extract email content.\n", + "\n", + "\n", + "6. 
Create 'credentials' folders to store gmail credential and user tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc86bec0-bda8-4e9e-9c85-423179a99981", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4270e52e-378c-4127-bd52-1d082e9834e0", + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "import os.path\n", + "import base64\n", + "import re\n", + "from email import message_from_bytes\n", + "from google.oauth2.credentials import Credentials\n", + "from google_auth_oauthlib.flow import InstalledAppFlow\n", + "from googleapiclient.discovery import build\n", + "\n", + "# If modifying these SCOPES, delete the token.json\n", + "SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']\n", + "PORT = 18000\n", + "\n", + "def main():\n", + " creds = None\n", + " # token.json stores the user's access and refresh tokens\n", + " if os.path.exists('token.json'):\n", + " creds = Credentials.from_authorized_user_file('token.json', SCOPES)\n", + " else:\n", + " flow = InstalledAppFlow.from_client_secrets_file('credentials/gmail_credentials.json', SCOPES)\n", + " creds = flow.run_local_server(port=PORT)\n", + " with open('token.json', 'w') as token:\n", + " token.write(creds.to_json())\n", + "\n", + " service = build('gmail', 'v1', credentials=creds)\n", + "\n", + " # Get the latest message\n", + " results = service.users().messages().list(userId='me', maxResults=1).execute()\n", + " messages = results.get('messages', [])\n", + "\n", + " if not messages:\n", + " print(\"No messages found.\")\n", + " return\n", + "\n", + " msg = service.users().messages().get(userId='me', id=messages[0]['id'], format='raw').execute()\n", + " raw_msg = base64.urlsafe_b64decode(msg['raw'].encode('ASCII'))\n", + " email_message = 
message_from_bytes(raw_msg)\n", + "\n", + " subject = email_message['Subject']\n", + " print(\"Subject:\", subject)\n", + "\n", + " # Extract text/plain body\n", + " for part in email_message.walk():\n", + " if part.get_content_type() == 'text/plain':\n", + " print(\"Body:\")\n", + " print(part.get_payload(decode=True).decode('utf-8'))\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff68e06-3cfb-48ae-9dad-fa431d0d548a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Google_Workspace_API_Credential_Guide.ipynb b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Google_Workspace_API_Credential_Guide.ipynb new file mode 100644 index 0000000..c300ec4 --- /dev/null +++ b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Google_Workspace_API_Credential_Guide.ipynb @@ -0,0 +1,294 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "35177005-196a-48b3-bf92-fa37d84693f4", + "metadata": {}, + "source": [ + "# Google Workspace API Credential Guide" + ] + }, + { + "cell_type": "markdown", + "id": "7bcad9ee-cd11-4b12-834d-9f1ddcefb190", + "metadata": {}, + "source": [ + "Use Google Drive API to Read files in Google Workspace \n", + "1. Set up a Google Cloud Project\n", + "\n", + " Go to Google Cloud Platform(GCP) Console\n", + "\n", + " Create a new project\n", + "\n", + "2. 
Enable the Gmail API for that project\n", + "\n", + " Select the created project and go to \"APIs & services\" page\n", + "\n", + " Click \"+ Enable APIs and services\" button, enable these APIs: Google Drive API, Google Docs API, Google Sheets API, and Google Slides API \n", + "\n", + "3. Go to \"OAuth Consent Screen\" and configure:\n", + "\n", + " Choose External and Fill in app name, dedveloper email, etc.\n", + "\n", + "4. Create OAuth Credentials\n", + "\n", + " Go to APIs & Services > Credentials\n", + "\n", + " Click \"+ Create Credentials\" > \"OAuth client ID\"\n", + "\n", + " Choose Desktop App\n", + "\n", + " Download the generated credentials.json\n", + "\n", + " Sometimes, GCP will navigate you to \"Google Auth Platform\" > \"Clients\", and you can click \"+ Create client\" here to create the OAuth Credentials\n", + "\n", + " \n", + "5. Add Test Users for Gmail API OAuth Access\n", + " \n", + " Go to \"APIs & Services\" > \"OAuth consent screen\" > \"Audience\" > \"Test Users\"\n", + "\n", + " Add the email account from which you want to extract email content.\n", + "\n", + "\n", + "6. 
Create 'credentials' folders to store google workspace credential and user tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc86bec0-bda8-4e9e-9c85-423179a99981", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install PyPDF2\n", + "# !pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4270e52e-378c-4127-bd52-1d082e9834e0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff68e06-3cfb-48ae-9dad-fa431d0d548a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69c20f2d-2f49-408c-8700-f12d6745efd3", + "metadata": {}, + "outputs": [], + "source": [ + "from google_auth_oauthlib.flow import InstalledAppFlow\n", + "from googleapiclient.discovery import build\n", + "from google.oauth2.credentials import Credentials\n", + "from googleapiclient.http import MediaIoBaseDownload\n", + "import os\n", + "\n", + "import io\n", + "from PyPDF2 import PdfReader\n", + "from langchain.vectorstores import Chroma\n", + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.schema import Document\n", + "\n", + "GOOGLE_WORKSPACE_SCOPES = [\"https://www.googleapis.com/auth/drive.readonly\",\n", + " 'https://www.googleapis.com/auth/documents.readonly',\n", + " 'https://www.googleapis.com/auth/spreadsheets.readonly',\n", + " 'https://www.googleapis.com/auth/presentations.readonly'\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7164903b-be81-46b2-8c04-886397599c27", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_google_doc(docs_service, file_id):\n", + " doc = docs_service.documents().get(documentId=file_id).execute()\n", + " content = \"\"\n", + " for elem in doc.get(\"body\", {}).get(\"content\", []):\n", + " if \"paragraph\" in 
elem:\n", + " for run in elem[\"paragraph\"][\"elements\"]:\n", + " content += run.get(\"textRun\", {}).get(\"content\", \"\")\n", + " return content.strip()\n", + "\n", + "def extract_google_sheet(service, file_id):\n", + " # Get spreadsheet metadata\n", + " spreadsheet = service.spreadsheets().get(spreadsheetId=file_id).execute()\n", + " all_text = \"\"\n", + "\n", + " # Loop through each sheet\n", + " for sheet in spreadsheet.get(\"sheets\", []):\n", + " title = sheet[\"properties\"][\"title\"]\n", + " result = service.spreadsheets().values().get(\n", + " spreadsheetId=file_id,\n", + " range=title\n", + " ).execute()\n", + "\n", + " values = result.get(\"values\", [])\n", + " sheet_text = f\"### Sheet: {title} ###\\n\"\n", + " sheet_text += \"\\n\".join([\", \".join(row) for row in values])\n", + " all_text += sheet_text + \"\\n\\n\"\n", + "\n", + " return all_text.strip()\n", + "\n", + "\n", + "def extract_google_slide(slides_service, file_id):\n", + " pres = slides_service.presentations().get(presentationId=file_id).execute()\n", + " text = \"\"\n", + " for slide in pres.get(\"slides\", []):\n", + " for element in slide.get(\"pageElements\", []):\n", + " shape = element.get(\"shape\")\n", + " if shape:\n", + " for p in shape.get(\"text\", {}).get(\"textElements\", []):\n", + " if \"textRun\" in p:\n", + " text += p[\"textRun\"][\"content\"]\n", + " return text.strip()\n", + "\n", + "def extract_pdf_from_drive(drive_service, file_id, filename='downloaded.pdf'):\n", + " request = drive_service.files().get_media(fileId=file_id)\n", + " fh = io.BytesIO()\n", + " downloader = MediaIoBaseDownload(fh, request)\n", + " done = False\n", + " while not done:\n", + " _, done = downloader.next_chunk()\n", + " fh.seek(0)\n", + " reader = PdfReader(fh)\n", + " return \"\\n\".join([page.extract_text() for page in reader.pages if page.extract_text()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f2edc68-f9f8-4cba-810e-159bea4fe4ac", + "metadata": 
{}, + "outputs": [], + "source": [ + "def get_creds():\n", + " if os.path.exists(\"token.json\"):\n", + " creds = Credentials.from_authorized_user_file(\"token.json\", SCOPES)\n", + " else:\n", + " flow = InstalledAppFlow.from_client_secrets_file(\"credentials/google_drive_workspace_credentials.json\", SCOPES)\n", + " creds = flow.run_local_server(port=0)\n", + " with open(\"token.json\", \"w\") as token:\n", + " token.write(creds.to_json())\n", + " return creds\n", + " \n", + "\n", + "def get_folder_id_by_name(drive_service, folder_name):\n", + " query = f\"mimeType='application/vnd.google-apps.folder' and name='{folder_name}' and trashed=false\"\n", + " results = drive_service.files().list(\n", + " q=query,\n", + " fields=\"files(id, name)\",\n", + " pageSize=1\n", + " ).execute()\n", + "\n", + " folders = results.get(\"files\", [])\n", + " if not folders:\n", + " raise ValueError(f\"❌ Folder named '{folder_name}' not found.\")\n", + " return folders[0]['id']\n", + "\n", + "\n", + "def extract_docs_from_google_workspace(folder_name):\n", + " info = \"\"\n", + " \n", + " creds = get_creds()\n", + "\n", + " file_types = {\n", + " 'application/vnd.google-apps.document': lambda fid: extract_google_doc(docs_service, fid),\n", + " 'application/vnd.google-apps.spreadsheet': lambda fid: extract_google_sheet(sheets_service, fid),\n", + " 'application/vnd.google-apps.presentation': lambda fid: extract_google_slide(slides_service, fid),\n", + " 'application/pdf': lambda fid: extract_pdf_from_drive(drive_service, fid),\n", + " }\n", + " \n", + " drive_service = build(\"drive\", \"v3\", credentials=creds)\n", + " docs_service = build('docs', 'v1', credentials=creds)\n", + " sheets_service = build('sheets', 'v4', credentials=creds)\n", + " slides_service = build('slides', 'v1', credentials=creds)\n", + "\n", + " folder_id = get_folder_id_by_name(drive_service, folder_name)\n", + " info += f\"Collection files from folder: {folder_name}\\n\"\n", + " \n", + " query = (\n", + " 
f\"'{folder_id}' in parents and (\"\n", + " 'mimeType=\"application/vnd.google-apps.document\" or '\n", + " 'mimeType=\"application/vnd.google-apps.spreadsheet\" or '\n", + " 'mimeType=\"application/vnd.google-apps.presentation\" or '\n", + " 'mimeType=\"application/pdf\")'\n", + " )\n", + " \n", + " results = drive_service.files().list(\n", + " q=query,\n", + " fields=\"files(id, name, mimeType)\",\n", + " pageSize=20\n", + " ).execute()\n", + "\n", + " docs = []\n", + " summary_info = {\n", + " 'application/vnd.google-apps.document': {'file_type': 'Google Doc', 'count': 0},\n", + " 'application/vnd.google-apps.spreadsheet': {'file_type': 'Google Sheet', 'count': 0},\n", + " 'application/vnd.google-apps.presentation': {'file_type': 'Google Silde', 'count': 0},\n", + " 'application/pdf': {'file_type': 'PDF', 'count': 0}\n", + " }\n", + " for file in results.get(\"files\", []):\n", + " extractor = file_types.get(file['mimeType'])\n", + " if extractor:\n", + " try:\n", + " content = extractor(file[\"id\"])\n", + " if content:\n", + " docs.append(Document(page_content=content, metadata={\"source\": file[\"name\"]}))\n", + " summary_info[file['mimeType']]['count'] += 1\n", + " except Exception as e:\n", + " print(f\"❌ Error processing {file['name']}: {e}\")\n", + " \n", + " total = 0;\n", + " for file_type, element in summary_info.items():\n", + " total += element['count']\n", + " info += f\"Found {element['count']} {element['file_type']} files\\n\"\n", + " info += f\"Total documents loaded: {total}\"\n", + " return docs, info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a9da5c9-415c-4856-973a-627a1790f38d", + "metadata": {}, + "outputs": [], + "source": [ + "docs, info = extract_docs_from_google_workspace(\"google_workspace_knowledge_base\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Outlook_API_Credential_Guide.ipynb b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Outlook_API_Credential_Guide.ipynb new file mode 100644 index 0000000..785d5dd --- /dev/null +++ b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Outlook_API_Credential_Guide.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "35177005-196a-48b3-bf92-fa37d84693f4", + "metadata": {}, + "source": [ + "# Outlook API Credential Guide" + ] + }, + { + "cell_type": "markdown", + "id": "7bcad9ee-cd11-4b12-834d-9f1ddcefb190", + "metadata": {}, + "source": [ + "Extract Outlook Emails via Microsoft Graph API\n", + "\n", + "1. Register Your App on Azure Portal\n", + "\n", + " Go to Azure Portal > Azure Active Directory > App registrations\n", + "\n", + " Click “New registration”\n", + "\n", + " Choose Mobole/Desktop app\n", + " \n", + " After creation, note the Application (client) ID\n", + "\n", + "2. API Permissions\n", + "\n", + " Go to API permissions tab\n", + "\n", + " Click Add permission > Microsoft Graph > Delegated\n", + "\n", + " Choose: Mail.Read\n", + "\n", + " Click Grant admin consent\n", + "\n", + "3. 
Allow public client flows\n", + "\n", + " Navigate to: Azure Active Directory > App registrations > Your App\n", + "\n", + " Go to Authentication tab\n", + "\n", + " Under \"Advanced settings\" → \"Allow public client flows\", set to \"Yes\"\n", + "\n", + " Save changes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc86bec0-bda8-4e9e-9c85-423179a99981", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install msal requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4270e52e-378c-4127-bd52-1d082e9834e0", + "metadata": {}, + "outputs": [], + "source": [ + "from msal import PublicClientApplication\n", + "import os\n", + "from dotenv import load_dotenv\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff68e06-3cfb-48ae-9dad-fa431d0d548a", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv()\n", + "\n", + "CLIENT_ID = os.getenv(\"AZURE_CLIENT_ID\")\n", + "AUTHORITY = \"https://login.microsoftonline.com/common\" \n", + "SCOPES = [\"Mail.Read\"]\n", + "\n", + "app = PublicClientApplication(CLIENT_ID, authority=AUTHORITY)\n", + "\n", + "flow = app.initiate_device_flow(scopes=SCOPES)\n", + "print(\"Go to:\", flow[\"verification_uri\"])\n", + "print(\"Enter code:\", flow[\"user_code\"])\n", + "\n", + "result = app.acquire_token_by_device_flow(flow)\n", + "\n", + "if \"access_token\" not in result:\n", + " raise Exception(\"Failed to authenticate:\", result)\n", + "\n", + "access_token = result[\"access_token\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c7f97da-68cc-4923-b280-1ddf7e5b7aa3", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Granted scopes:\", result.get(\"scope\"))\n", + "\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {access_token}\",\n", + " \"Prefer\": \"outlook.body-content-type='text'\"\n", + "}\n", + "\n", + "query = (\n", + " \"https://graph.microsoft.com/v1.0/me/messages\"\n", + 
" \"?$top=1\"\n", + " \"&$select=id,subject,receivedDateTime,body\"\n", + ")\n", + "\n", + "all_emails = []\n", + "\n", + "while query:\n", + " response = requests.get(query, headers=headers)\n", + "\n", + " if not response.ok:\n", + " print(response.text)\n", + " print(f\"❌ HTTP {response.status_code}: {response.text}\")\n", + " break\n", + "\n", + " try:\n", + " res = response.json()\n", + " except ValueError:\n", + " print(\"❌ Invalid JSON:\", response.text)\n", + " break\n", + "\n", + " for msg in res.get(\"value\", []):\n", + " all_emails.append({\n", + " \"id\": msg.get(\"id\"),\n", + " \"subject\": msg.get(\"subject\", \"\"),\n", + " \"body\": msg.get(\"body\", {}).get(\"content\", \"\"),\n", + " \"date\": msg.get(\"receivedDateTime\", \"\")\n", + " })\n", + "\n", + " query = res.get(\"@odata.nextLink\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e29493b6-0a9e-4106-93c9-e58ff6aa0f97", + "metadata": {}, + "outputs": [], + "source": [ + "all_emails" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Week5_Exercise_Personal_Knowledge_Assistant.ipynb b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Week5_Exercise_Personal_Knowledge_Assistant.ipynb new file mode 100644 index 0000000..9bab26f --- /dev/null +++ b/week5/community-contributions/Week5_Exercise_Personal_Knowledge/Week5_Exercise_Personal_Knowledge_Assistant.ipynb @@ -0,0 +1,1862 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e096ce5d-71a1-4fde-b171-8b9fed16cd7b", + "metadata": {}, + 
"source": [ + "# Personal Knowledge Assistant" + ] + }, + { + "cell_type": "markdown", + "id": "7bcad9ee-cd11-4b12-834d-9f1ddcefb190", + "metadata": {}, + "source": [ + "## Week 5 exercise\n", + "\n", + "\n", + "### Features:\n", + "1. Chat powered of uploaded knowlege\n", + "\n", + " The system prompt is designed to make the chatbot simulate a person based on the provided documents.\n", + "\n", + "2. Load files from local system\n", + "\n", + " Reuse code from bluebells1 [Wk5-final-multi-doc-type-KB.ipynb](../Wk5-final-multi-doc-type-KB.ipynb). Really appreciate it!\n", + "\n", + " Choose a folder located in the same directory as this script to extract content from. You can also specify subfolders to exclude from the extraction.\n", + "\n", + "3. Load emails from Gmail\n", + "\n", + " Enter an alias first, and a Google popup will guide you to grant permissions and log in, then extract emails for your specified time range\n", + "\n", + "4. Load emails from Outlook\n", + "\n", + " First, enter an alias. After clicking the 'Get Verification Code' button, a URI and code will appear in the 'Verification Instructions' textbox. Visit the Outlook website using the code, and follow the guide to grant permissions and complete the login.\n", + " Then, extract emails for your specified time range\n", + " \n", + "5. Load files from Google Workspace\n", + "\n", + " Enter with an alias first, and Google popup will guide you to grant permissions and log in, then extract emails for your specified folder in your Google Drive\n", + "\n", + "\n", + "### TO-DO Features:\n", + "1. Load messages from Slack\n", + "2. Use local inference/embedding models (llama) instead of relying on OpenAI-hosted models \n", + "3. Optimize Gmail/Outlook/Google Workspace login logic\n", + "4. Label different files. For example, extract prrivate and work emails respectively and store them into different vector stores\n", + "5. Add vector visualization\n", + "\n", + "### Requirements:\n", + "1. 
Store gmail credential json file under the 'credentials' folder\n", + "\n", + " The setup and configuration steps for Gmail API are in this guide: [Gmail_API_Credential_Guide](./Gmail_API_Credential_Guide.ipynb)\n", + "\n", + "2. Set AZURE_CLIENT_ID in .env file\n", + "\n", + " The setup and configuration steps for Outlook API are in this guide: [Outlook_API_Credential_Guide](./Outlook_API_Credential_Guide.ipynb)\n", + "\n", + "\n", + "3. Store google workspace credential json file under the 'credentials' folder\n", + "\n", + " The setup and configuration steps for Gmail API are in this guide: [Google_Workspace_API_Credential_Guide](./Google_Workspace_API_Credential_Guide.ipynb)\n", + "\n", + "The directories should be structured before launch as follows:\n", + "\n", + " ```text\n", + " The project/\n", + " │\n", + " ├── credentials/ <-- Need to create and store manually before launch; download from Google Cloud Plafotm(GCP)\n", + " │ ├── gmail_credentials.json\n", + " │ └── google_workspace_credentials.json\n", + " ├── tokens/ <-- Automatically created and saved\n", + " │ ├── gmail_tokens \n", + " │ │ └── gmail_token_{alias}.json\n", + " │ ├── google_workspace_tokens\n", + " │ └── outlook_tokens\n", + " ├── vector_index/ <-- Need to create manually before launch\n", + " │ ├── local_vector_index\n", + " │ ├── google_workspace_vector_index\n", + " │ ├── gmail_vector_index\n", + " │ └── output_vector_index\n", + " └── ***.ipynb <-- Script" + ] + }, + { + "cell_type": "markdown", + "id": "99c271af-9054-4066-9583-65a9253cb70a", + "metadata": {}, + "source": [ + "Feel free to contact me via zhufqiu@gmail.com or via [Linkedin](https://www.linkedin.com/in/zhufeng-zephyr-qiu/) if you have any questions about this project. If you have better idea about system prompt, chunk config or search_kwargs, I will be happy to talk with you!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc86bec0-bda8-4e9e-9c85-423179a99981", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install pymupdf\n", + "# !pip install openpyxl\n", + "# !pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4270e52e-378c-4127-bd52-1d082e9834e0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import base64\n", + "from datetime import datetime\n", + "from email import message_from_bytes\n", + "from email.utils import parsedate_to_datetime\n", + "\n", + "from google.auth.transport.requests import Request\n", + "from google.oauth2.credentials import Credentials\n", + "from google_auth_oauthlib.flow import InstalledAppFlow\n", + "from googleapiclient.discovery import build\n", + "\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain.vectorstores import FAISS\n", + "from langchain.schema import Document\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain_chroma import Chroma\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.chains import ConversationChain\n", + "from langchain.retrievers import MergerRetriever\n", + "from collections import defaultdict\n", + "from langchain.document_loaders import (\n", + " DirectoryLoader, TextLoader, \n", + " Docx2txtLoader,\n", + " TextLoader,\n", + " PyPDFLoader,\n", + " UnstructuredExcelLoader,\n", + " BSHTMLLoader\n", + ")\n", + "import glob\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import tiktoken\n", + "\n", + "from msal import PublicClientApplication\n", + "import requests\n", + "from datetime import datetime, timezone\n", + "import json\n", + "import shutil\n", + "\n", + "from PIL import Image\n", + "import pytesseract\n", + "import 
fitz\n", + "import ebooklib\n", + "from ebooklib import epub\n", + "import io\n", + "\n", + "from langchain.prompts.chat import (\n", + " ChatPromptTemplate,\n", + " SystemMessagePromptTemplate,\n", + " HumanMessagePromptTemplate\n", + ")\n", + "from langchain.prompts import PromptTemplate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3478cbe-2854-4011-b1b4-70be3f1623fd", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL = \"gpt-4o-mini\"\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')" + ] + }, + { + "cell_type": "markdown", + "id": "a5195792-f6e1-43a1-9c5f-d6f8c84a253f", + "metadata": {}, + "source": [ + "### If it is your first time to create VECTOR_DIR and its sub-folder, you should create them, close this script and re-open it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ca9623f-fb8c-45d1-a968-370c92762924", + "metadata": {}, + "outputs": [], + "source": [ + "LOCAL_VECTOR_DIR = 'vector_index/local_vector_index'\n", + "GMAIL_VECTOR_DIR = 'vector_index/gmail_vector_index'\n", + "OUTLOOK_VECTOR_DIR = \"vector_index/outlook_vector_index\"\n", + "GOOGLE_WORKSPACE_VECTOR_DIR = 'vector_index/google_workspace_vector_index'\n", + "SLACK_VECTOR_DIR = 'vector_index/slack_vector_index'\n", + "\n", + "os.makedirs(LOCAL_VECTOR_DIR, exist_ok=True)\n", + "os.makedirs(GMAIL_VECTOR_DIR, exist_ok=True)\n", + "os.makedirs(OUTLOOK_VECTOR_DIR, exist_ok=True)\n", + "os.makedirs(GOOGLE_WORKSPACE_VECTOR_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "b0f2a2ee-c9fb-49ad-8e09-919a7a7130ea", + "metadata": {}, + "source": [ + "#### Utilize functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f185451f-2e2a-4ebb-a570-8b7349f3df48", + "metadata": {}, + "outputs": [], + "source": [ + "def get_num_tokens(text, model=\"text-embedding-3-large\"):\n", + " enc = tiktoken.encoding_for_model(model)\n", + 
" return len(enc.encode(text))\n", + "\n", + "def batch_chunks(chunks, max_tokens=250000, model=\"text-embedding-3-large\"):\n", + " batches = []\n", + " current_batch = []\n", + " current_tokens = 0\n", + "\n", + " for doc in chunks:\n", + " doc_tokens = get_num_tokens(doc.page_content, model)\n", + " if current_tokens + doc_tokens > max_tokens:\n", + " batches.append(current_batch)\n", + " current_batch = [doc]\n", + " current_tokens = doc_tokens\n", + " else:\n", + " current_batch.append(doc)\n", + " current_tokens += doc_tokens\n", + "\n", + " if current_batch:\n", + " batches.append(current_batch)\n", + " \n", + " return batches" + ] + }, + { + "cell_type": "markdown", + "id": "a5546fd7-46bf-4a36-8eef-7b4192f247e9", + "metadata": {}, + "source": [ + "### 1. Local" + ] + }, + { + "cell_type": "markdown", + "id": "937c4f19-5e5b-46b8-b15d-f7ceddd81384", + "metadata": {}, + "source": [ + "Reuse code from bluebells1 [Wk5-final-multi-doc-type-KB.ipynb](../Wk5-final-multi-doc-type-KB.ipynb). Really appreciate it!\n", + "\n", + "Advanced features:\n", + "1. ImgLoader added to load image file (png, jpg, jpeg)\n", + "2. 
Add logic to use DocumentLoader, extract files and show summary in Gradio textbox" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74b85882-c2d6-42af-9079-9f2a61d9eb72", + "metadata": {}, + "outputs": [], + "source": [ + "from ebooklib import epub\n", + "from bs4 import BeautifulSoup\n", + "from langchain.document_loaders.base import BaseLoader\n", + "\n", + "class EpubLoader(BaseLoader):\n", + " def __init__(self, file_path: str):\n", + " self.file_path = file_path\n", + "\n", + " def load(self) -> list[Document]:\n", + " book = epub.read_epub(self.file_path)\n", + " text = ''\n", + " for item in book.get_items():\n", + " if item.get_type() == ebooklib.ITEM_DOCUMENT:\n", + " soup = BeautifulSoup(item.get_content().decode(\"utf-8\"), 'html.parser')\n", + " extracted = soup.get_text().strip()\n", + " if extracted:\n", + " text += extracted + '\\n\\n'\n", + "\n", + " return [Document(page_content=text.strip(), metadata={\"source\": self.file_path})]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85f94e96-83e1-4b5a-ad63-373a37474d25", + "metadata": {}, + "outputs": [], + "source": [ + "from pptx import Presentation\n", + "\n", + "class PptxLoader(BaseLoader):\n", + " def __init__(self, file_path: str):\n", + " self.file_path = file_path\n", + "\n", + " def load(self) -> list[Document]:\n", + " prs = Presentation(self.file_path)\n", + " text = ''\n", + " for slide in prs.slides:\n", + " for shape in slide.shapes:\n", + " if hasattr(shape, \"text\") and shape.text:\n", + " text += shape.text + '\\n'\n", + "\n", + " return [Document(page_content=text, metadata={\"source\": self.file_path})]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd3932ce-5179-4e83-9a2c-bdefc37028aa", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import pytesseract\n", + "\n", + "class ImgLoader(BaseLoader):\n", + " def __init__(self, file_path: str):\n", + " self.file_path = 
file_path\n", + "\n", + " def load(self) -> list[Document]:\n", + " text = ''\n", + " try:\n", + " text = pytesseract.image_to_string(Image.open(self.file_path))\n", + " except Exception as e:\n", + " print(f\"OCR failed for {path}: {e}\")\n", + " return [Document(page_content=text, metadata={\"source\": self.file_path})]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "427e758a-77ab-4de1-ae14-8f2f233ea6db", + "metadata": {}, + "outputs": [], + "source": [ + "# Class based version of document loader which can be expanded more easily for other document types. (Currently includes file types: docx, txt (windows encoding), xlsx, pdfs, epubs, pptx)\n", + "\n", + "class DocumentLoader:\n", + " \"\"\"A clean, extensible document loader for multiple file types.\"\"\"\n", + " \n", + " def __init__(self, base_path, exclude_folders=None):\n", + " self.base_path = base_path\n", + " self.documents = []\n", + " self.exclude_folders = exclude_folders or []\n", + " self.print_info = \"\"\n", + " \n", + " # Configuration for different file types\n", + " self.loader_config = {\n", + " 'docx': {\n", + " 'loader_cls': Docx2txtLoader,\n", + " 'glob_pattern': \"**/*.docx\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'txt': {\n", + " 'loader_cls': TextLoader,\n", + " 'glob_pattern': \"**/*.txt\",\n", + " 'loader_kwargs': {\"encoding\": 'utf-8'},\n", + " 'post_process': None\n", + " },\n", + " 'md': {\n", + " 'loader_cls': TextLoader,\n", + " 'glob_pattern': \"**/*.md\",\n", + " 'loader_kwargs': {\"encoding\": 'utf-8'},\n", + " 'post_process': None\n", + " },\n", + " 'pdf': {\n", + " 'loader_cls': PyPDFLoader,\n", + " 'glob_pattern': \"**/*.pdf\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'xlsx': {\n", + " 'loader_cls': UnstructuredExcelLoader,\n", + " 'glob_pattern': \"**/*.xlsx\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'html': {\n", + " 'loader_cls': 
BSHTMLLoader,\n", + " 'glob_pattern': \"**/*.html\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'epub': {\n", + " 'loader_cls': EpubLoader,\n", + " 'glob_pattern': \"**/*.epub\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': self._process_epub_metadata\n", + " },\n", + " 'pptx': {\n", + " 'loader_cls': PptxLoader,\n", + " 'glob_pattern': \"**/*.pptx\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'png': {\n", + " 'loader_cls': ImgLoader,\n", + " 'glob_pattern': \"**/*.png\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'jpeg': {\n", + " 'loader_cls': ImgLoader,\n", + " 'glob_pattern': \"**/*.jpeg\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " },\n", + " 'jpg': {\n", + " 'loader_cls': ImgLoader,\n", + " 'glob_pattern': \"**/*.jpg\",\n", + " 'loader_kwargs': {},\n", + " 'post_process': None\n", + " }\n", + " }\n", + " \n", + " def _get_epub_metadata(self, file_path):\n", + " \"\"\"Extract metadata from EPUB files.\"\"\"\n", + " try:\n", + " book = epub.read_epub(file_path)\n", + " title = book.get_metadata('DC', 'title')[0][0] if book.get_metadata('DC', 'title') else None\n", + " author = book.get_metadata('DC', 'creator')[0][0] if book.get_metadata('DC', 'creator') else None\n", + " return title, author\n", + " except Exception as e:\n", + " self.print_info += f\"Error extracting EPUB metadata: {e}\\n\"\n", + " return None, None\n", + " \n", + " def _process_epub_metadata(self, doc) -> None:\n", + " \"\"\"Post-process EPUB documents to add metadata.\"\"\"\n", + " title, author = self._get_epub_metadata(doc.metadata['source'])\n", + " doc.metadata[\"author\"] = author\n", + " doc.metadata[\"title\"] = title\n", + " \n", + " def _load_file_type(self, folder, file_type, config):\n", + " \"\"\"Load documents of a specific file type from a folder.\"\"\"\n", + " try:\n", + " loader = DirectoryLoader(\n", + " folder, \n", + " 
glob=config['glob_pattern'], \n", + " loader_cls=config['loader_cls'],\n", + " loader_kwargs=config['loader_kwargs']\n", + " )\n", + " docs = loader.load()\n", + " self.print_info += f\"Found {len(docs)} .{file_type} files\\n\"\n", + " \n", + " # Apply post-processing if defined\n", + " if config['post_process']:\n", + " for doc in docs:\n", + " config['post_process'](doc)\n", + " \n", + " return docs\n", + " \n", + " except Exception as e:\n", + " self.print_info += f\"Error loading .{file_type} files: {e}\\n\"\n", + " return []\n", + " \n", + " def load_all(self):\n", + " \"\"\"Load all documents from configured folders.\"\"\"\n", + " all_folders = [f for f in glob.glob(self.base_path) if os.path.isdir(f)]\n", + "\n", + " #filter out excluded folders\n", + " folders = []\n", + " for folder in all_folders:\n", + " folder_name = os.path.basename(folder)\n", + " if folder_name not in self.exclude_folders:\n", + " folders.append(folder)\n", + " else:\n", + " self.print_info += f\"Excluded folder: {folder_name}\\n\"\n", + " \n", + " self.print_info += f\"Scanning folders (directories only):{folders}\\n\" \n", + " \n", + " self.documents = []\n", + " \n", + " for folder in folders:\n", + " doc_type = os.path.basename(folder)\n", + " self.print_info += f\"\\nProcessing folder: {doc_type}\\n\"\n", + " \n", + " for file_type, config in self.loader_config.items():\n", + " docs = self._load_file_type(folder, file_type, config)\n", + " \n", + " # Add doc_type metadata to all documents\n", + " for doc in docs:\n", + " doc.metadata[\"doc_type\"] = doc_type\n", + " self.documents.append(doc)\n", + " \n", + " self.print_info += f\"\\nTotal documents loaded: {len(self.documents)}\\n\"\n", + " return self.documents\n", + " \n", + " def add_file_type(self, extension, loader_cls, glob_pattern=None, \n", + " loader_kwargs=None, post_process=None):\n", + " \"\"\"Add support for a new file type.\"\"\"\n", + " self.loader_config[extension] = {\n", + " 'loader_cls': loader_cls,\n", + " 
'glob_pattern': glob_pattern or f\"**/*.{extension}\",\n", + " 'loader_kwargs': loader_kwargs or {},\n", + " 'post_process': post_process\n", + " }\n", + "\n", + "# load\n", + "# loader = DocumentLoader(\"local-knowledge-base/**\", exclude_folders=[\"Music\", \"Online Courses\", \"Fitness\"])\n", + "# documents = loader.load_all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53e65a63-29fd-4db3-91f0-246cc2b61941", + "metadata": {}, + "outputs": [], + "source": [ + "def local_embed_and_store(docs):\n", + " text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + " chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]\n", + "\n", + " if not chunks:\n", + " return \"⚠️ No non-empty chunks to embed. Skipping vectorstore update.\"\n", + "\n", + " embeddings = OpenAIEmbeddings()\n", + "\n", + " vectorstore = None\n", + " if os.path.exists(LOCAL_VECTOR_DIR):\n", + " vectorstore = Chroma(persist_directory=LOCAL_VECTOR_DIR, embedding_function=embeddings)\n", + " else:\n", + " if chunks:\n", + " vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=LOCAL_VECTOR_DIR)\n", + " chunks = chunks[1:]\n", + " else:\n", + " return \"⚠️ No chunks to create new vectorstore.\"\n", + " \n", + " batches = batch_chunks(chunks)\n", + " total = 1 if not os.path.exists(LOCAL_VECTOR_DIR) else 0\n", + " \n", + " for batch in batches:\n", + " vectorstore.add_documents(batch)\n", + " total += len(batch)\n", + "\n", + " info = \"\"\n", + " info += f\"Vectorstore updated with {total} new chunks.\\n\"\n", + " num_docs = vectorstore._collection.count()\n", + " info += f\"Vectorstore contains {num_docs} chunks.\\n\"\n", + " return info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0a70a0e-08cd-4827-b42b-9a5394ff6dec", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_local_folder(folder_path=\"local-knowledge-base\", 
exclude=\"\"):\n", + "\n", + " # try:\n", + " info = f\"Process files under: {folder_path}\\n\"\n", + " loader = DocumentLoader(os.path.join(folder_path, \"**\"), exclude_folders=[folder.strip() for folder in exclude.split(',')])\n", + " docs = loader.load_all()\n", + " info += loader.print_info\n", + " if not docs:\n", + " return info + \"No valid files found in the given range.\"\n", + " info += f\"Fetched {len(docs)} files.\\n\"\n", + " info += local_embed_and_store(docs)\n", + " return info\n", + "\n", + " # except Exception as e:\n", + " # return f\"❌ Extraction failed: {str(e)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "0e47d670-8c50-4744-8fbd-78112fa941dd", + "metadata": {}, + "source": [ + "### 2. Gmail" + ] + }, + { + "cell_type": "markdown", + "id": "4d52fe40-65e3-4d82-9999-1ed3e4cbae0a", + "metadata": {}, + "source": [ + "#### Store gmail credential json file under the credentials folder\n", + "\n", + "To avoid complicated steps and focus on LLMs stuff, I chose to utilize the Gmail API in test mode.\n", + "\n", + "I have included the setup and configuration steps in this guide:\n", + "[Gmail_API_Credential_Guide](./Gmail_API_Credential_Guide.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f32c4e4-fa7a-42a1-9ef8-b981af02f585", + "metadata": {}, + "outputs": [], + "source": [ + "GMAIL_SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']\n", + "GMAIL_CREDENTIALS_FILE = 'credentials/gmail_credentials.json'\n", + "GMAIL_TOKEN_DIR = 'tokens/gmail_tokens'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db344254-8c92-4e82-8414-40b3bef56db5", + "metadata": {}, + "outputs": [], + "source": [ + "def gmail_get_credentials(account_alias):\n", + " token_path = os.path.join(GMAIL_TOKEN_DIR, f'gmail_token_{account_alias}.json')\n", + " creds = None\n", + " if os.path.exists(token_path):\n", + " creds = Credentials.from_authorized_user_file(token_path, GMAIL_SCOPES)\n", + " if not creds or not 
creds.valid:\n", + " if creds and creds.expired and creds.refresh_token:\n", + " creds.refresh(Request())\n", + " else:\n", + " flow = InstalledAppFlow.from_client_secrets_file(GMAIL_CREDENTIALS_FILE, GMAIL_SCOPES)\n", + " creds = flow.run_local_server(port=0)\n", + " with open(token_path, 'w') as token_file:\n", + " token_file.write(creds.to_json())\n", + " return creds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "119558f0-4d35-4737-ad8a-eef516b540d2", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_message(service, msg_id):\n", + " msg = service.users().messages().get(userId='me', id=msg_id, format='raw').execute()\n", + " raw_msg = base64.urlsafe_b64decode(msg['raw'].encode('ASCII'))\n", + " email_message = message_from_bytes(raw_msg)\n", + " subject = email_message['Subject'] or \"(No Subject)\"\n", + " date = parsedate_to_datetime(email_message['Date'])\n", + " sender = email_message['From'] or \"\"\n", + " to = email_message['To'] or \"\"\n", + " cc = email_message['Cc'] or \"\"\n", + " body = \"\"\n", + " \n", + " for part in email_message.walk():\n", + " if part.get_content_type() == 'text/plain' and not part.get('Content-Disposition'):\n", + " body = part.get_payload(decode=True).decode('utf-8', errors='ignore')\n", + " break\n", + "\n", + " content = f\"\"\"Subject: {subject}\n", + " From: {sender}\n", + " To: {to}\n", + " Cc: {cc}\n", + " {body}\n", + " \"\"\"\n", + " return {\n", + " \"id\": msg_id,\n", + " \"subject\": subject,\n", + " \"date\": date,\n", + " \"body\": content\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "481d0500-6270-47ec-bc30-44400c86dff2", + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_emails(service, start_date, end_date):\n", + " query = (\n", + " f\"(category:primary OR is:important OR is:starred OR is:snoozed OR is:sent OR in:chats OR label:SCHEDULED) \"\n", + " f\"after:{start_date} before:{end_date} -in:spam -in:trash 
-category:promotions -category:forums\"\n", + " ) \n", + " \n", + " all_messages = []\n", + " page_token = None\n", + "\n", + " while True:\n", + " response = service.users().messages().list(userId='me', q=query, pageToken=page_token).execute()\n", + " messages = response.get('messages', [])\n", + " print(f\"Found {len(messages)} sub-messages.\")\n", + " all_messages.extend(messages)\n", + " page_token = response.get('nextPageToken')\n", + " if not page_token:\n", + " break\n", + " print(f\"Total messages fetched: {len(all_messages)}\")\n", + " parsed_emails = []\n", + " for msg in all_messages:\n", + " parsed = parse_message(service, msg['id'])\n", + " if parsed:\n", + " parsed_emails.append(parsed)\n", + " \n", + " return parsed_emails\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aebb1598-e95d-4a7b-9d40-44afb62f587d", + "metadata": {}, + "outputs": [], + "source": [ + "def gmail_embed_and_store(emails, account):\n", + " docs = []\n", + " for email in emails:\n", + " content = f\"Subject: {email['subject']}\\n\\n{email['body']}\"\n", + " doc = Document(\n", + " page_content=content.strip(),\n", + " metadata={\n", + " \"date\": str(email['date']),\n", + " \"gmail_id\": email['id'],\n", + " \"account\": account\n", + " }\n", + " )\n", + " docs.append(doc)\n", + "\n", + " text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + " chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]\n", + "\n", + " if not chunks:\n", + " return \"⚠️ No non-empty chunks to embed. 
Skipping vectorstore update.\"\n", + "\n", + " embeddings = OpenAIEmbeddings()\n", + "\n", + " vectorstore = None\n", + " if os.path.exists(GMAIL_VECTOR_DIR):\n", + " vectorstore = Chroma(persist_directory=GMAIL_VECTOR_DIR, embedding_function=embeddings)\n", + " else:\n", + " if chunks:\n", + " vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=GMAIL_VECTOR_DIR)\n", + " chunks = chunks[1:]\n", + " else:\n", + " return \"⚠️ No chunks to create new vectorstore.\"\n", + " \n", + " batches = batch_chunks(chunks)\n", + " total = 1 if not os.path.exists(GMAIL_VECTOR_DIR) else 0\n", + " \n", + " for batch in batches:\n", + " vectorstore.add_documents(batch)\n", + " total += len(batch)\n", + "\n", + " info = \"\"\n", + " info += f\"Vectorstore updated with {total} new chunks from {account}.\\n\"\n", + " num_docs = vectorstore._collection.count()\n", + " info += f\"Vectorstore contains {num_docs} chunks.\\n\"\n", + " return info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3d67265-ef62-4104-ae79-783e6d20d31c", + "metadata": {}, + "outputs": [], + "source": [ + "def login_gmail(alias):\n", + " try:\n", + " creds = gmail_get_credentials(alias)\n", + " service = build('gmail', 'v1', credentials=creds)\n", + " profile = service.users().getProfile(userId='me').execute()\n", + " email = profile.get(\"emailAddress\")\n", + "\n", + " # Store in session\n", + " SESSION_STATE[\"gmail_service\"] = service\n", + " SESSION_STATE[\"gmail_email\"] = email\n", + " SESSION_STATE[\"gmail_alias\"] = alias\n", + "\n", + " return f\"✅ Logged in as: {email}\"\n", + " except Exception as e:\n", + " return f\"❌ Login failed: {str(e)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69cb6320-7ef0-49bb-8893-d51d6d2cd87c", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_gmail(start_date, end_date):\n", + " service = SESSION_STATE.get(\"gmail_service\")\n", + " email_address = 
SESSION_STATE.get(\"gmail_email\")\n", + "\n", + " if not service:\n", + " return \"❌ Please login first.\"\n", + "\n", + " # try:\n", + " info = f\"Connected to: {email_address}\\n\"\n", + " emails = fetch_emails(service, start_date, end_date)\n", + "\n", + " if not emails:\n", + " return info + \"No emails found in the given range.\"\n", + " info += f\"Fetched {len(emails)} emails.\\n\"\n", + " info += gmail_embed_and_store(emails, account=email_address)\n", + " return info\n", + "\n", + " # except Exception as e:\n", + " # return f\"❌ Extraction failed: {str(e)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "b049fee6-5b51-4458-b089-6a11c6050492", + "metadata": {}, + "source": [ + "### 3. Outlook" + ] + }, + { + "cell_type": "markdown", + "id": "7660ec50-23ca-476f-97f7-42b764de46fa", + "metadata": {}, + "source": [ + "#### Set AZURE_CLIENT_ID in .env file\n", + "\n", + "I have included the setup and configuration steps in this guide:\n", + "[Outlook_API_Credential_Guide](./Outlook_API_Credential_Guide.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1f2b0d2-d2c0-414f-be53-c3bc74ceb6a6", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv()\n", + "\n", + "OUTLOOK_TOKEN_DIR = \"tokens/outlook_tokens\"\n", + "OUTLOOK_CLIENT_ID = os.getenv(\"AZURE_CLIENT_ID\")\n", + "OUTLOOK_AUTHORITY = \"https://login.microsoftonline.com/common\" \n", + "OUTLOOK_SCOPES = [\"Mail.Read\", \"User.Read\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2197700b-1103-4ba0-b929-28fea4af6881", + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_outlook_emails(access_token, start_date, end_date):\n", + " headers = {\n", + " \"Authorization\": f\"Bearer {access_token}\",\n", + " \"Prefer\": \"outlook.body-content-type='text'\"\n", + " }\n", + "\n", + " # Filter format: yyyy-mm-ddTHH:MM:SSZ\n", + " query = (\n", + " \"https://graph.microsoft.com/v1.0/me/messages\"\n", + " f\"?$top=100\"\n", + " 
\"&$select=id,subject,receivedDateTime,body,sender,toRecipients,ccRecipients\"\n", + " )\n", + "\n", + " all_emails = []\n", + "\n", + " while query:\n", + " response = requests.get(query, headers=headers)\n", + " if not response.ok:\n", + " print(f\"❌ HTTP {response.status_code}: {response.text}\")\n", + " break\n", + "\n", + " res = response.json()\n", + " for msg in res.get(\"value\", []):\n", + " received = msg.get(\"receivedDateTime\", \"\")\n", + " try:\n", + " received_dt = datetime.fromisoformat(received.replace(\"Z\", \"+00:00\"))\n", + " except Exception:\n", + " continue\n", + "\n", + " if not (start_date <= received_dt <= end_date):\n", + " continue\n", + "\n", + " email_data = {\n", + " \"id\": msg.get(\"id\"),\n", + " \"subject\": msg.get(\"subject\", \"\"),\n", + " \"body\": msg.get(\"body\", {}).get(\"content\", \"\"),\n", + " \"sender\": msg.get(\"sender\", {}).get(\"emailAddress\", {}).get(\"address\", \"\"),\n", + " \"to\": [r[\"emailAddress\"][\"address\"] for r in msg.get(\"toRecipients\", [])],\n", + " \"cc\": [r[\"emailAddress\"][\"address\"] for r in msg.get(\"ccRecipients\", [])],\n", + " \"date\": received_dt.isoformat()\n", + " }\n", + "\n", + " all_emails.append(email_data)\n", + "\n", + " query = res.get(\"@odata.nextLink\")\n", + "\n", + " print(f\"✅ Total emails extracted: {len(all_emails)}\")\n", + " return all_emails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d9759ad-47fa-4f3a-8e67-59b59ccccfd9", + "metadata": {}, + "outputs": [], + "source": [ + "def outlook_embed_and_store(emails):\n", + " if not emails:\n", + " return \"No emails to embed.\\n\"\n", + "\n", + " docs = []\n", + " for email in emails:\n", + " content = (\n", + " f\"Subject: {email['subject']}\\n\"\n", + " f\"From: {email['sender']}\\n\"\n", + " f\"To: {', '.join(email['to'])}\\n\"\n", + " f\"CC: {', '.join(email['cc'])}\\n\\n\"\n", + " f\"{email['body']}\"\n", + " )\n", + " doc = Document(\n", + " page_content=content,\n", + " 
metadata={\"date\": email[\"date\"], \"outlook_id\": email[\"id\"]}\n", + " )\n", + " docs.append(doc)\n", + "\n", + " text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + " chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]\n", + "\n", + " if not chunks:\n", + " return \"⚠️ No non-empty chunks to embed. Skipping vectorstore update.\"\n", + "\n", + " embeddings = OpenAIEmbeddings()\n", + "\n", + " vectorstore = None\n", + " if os.path.exists(OUTLOOK_VECTOR_DIR):\n", + " vectorstore = Chroma(persist_directory=OUTLOOK_VECTOR_DIR, embedding_function=embeddings)\n", + " else:\n", + " if chunks:\n", + " vectorstore = Chroma.from_documents(documents=chunks[:1], embedding=embeddings, persist_directory=OUTLOOK_VECTOR_DIR)\n", + " chunks = chunks[1:]\n", + " else:\n", + " return \"⚠️ No chunks to create new vectorstore.\\n\"\n", + " \n", + " batches = batch_chunks(chunks)\n", + " total = 1 if not os.path.exists(OUTLOOK_VECTOR_DIR) else 0\n", + " \n", + " for batch in batches:\n", + " vectorstore.add_documents(batch)\n", + " total += len(batch)\n", + "\n", + " info = \"\"\n", + " info += f\"✅ Vectorstore updated with {total} chunks.\\n\"\n", + " num_docs = vectorstore._collection.count()\n", + " info += f\"Vectorstore contains {num_docs} chunks.\\n\"\n", + " return info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1feb49b5-9df0-4232-a233-c6ceb97361a7", + "metadata": {}, + "outputs": [], + "source": [ + "def login_outlook(alias):\n", + " # try:\n", + " token_path = os.path.join(OUTLOOK_TOKEN_DIR, f\"outlook_token_{alias}.json\")\n", + " SESSION_STATE[\"outlook_alias\"] = alias\n", + " access_token = None\n", + "\n", + " # Load existing token\n", + " if os.path.exists(token_path):\n", + " with open(token_path, \"r\") as f:\n", + " result = json.load(f)\n", + " access_token = result.get(\"access_token\")\n", + "\n", + " # If no token, run device flow\n", + " if not access_token:\n", + 
" app = PublicClientApplication(OUTLOOK_CLIENT_ID, authority=OUTLOOK_AUTHORITY)\n", + " flow = app.initiate_device_flow(scopes=OUTLOOK_SCOPES)\n", + "\n", + " if \"user_code\" not in flow:\n", + " return \"❌ Failed to initiate device login.\"\n", + "\n", + " print(\"🔗 Visit:\", flow[\"verification_uri\"])\n", + " print(\"🔐 Enter code:\", flow[\"user_code\"])\n", + "\n", + " result = app.acquire_token_by_device_flow(flow)\n", + "\n", + " if \"access_token\" not in result:\n", + " return f\"❌ Login failed: {result.get('error_description', 'Unknown error')}\"\n", + "\n", + " access_token = result[\"access_token\"]\n", + "\n", + " with open(token_path, \"w\") as f:\n", + " json.dump(result, f)\n", + "\n", + " # Get user's email via Microsoft Graph\n", + " headers = {\"Authorization\": f\"Bearer {access_token}\"}\n", + " user_info = requests.get(\"https://graph.microsoft.com/v1.0/me\", headers=headers).json()\n", + " email = user_info.get(\"mail\") or user_info.get(\"userPrincipalName\")\n", + "\n", + " # Store in session\n", + " SESSION_STATE[\"outlook_token\"] = access_token\n", + " SESSION_STATE[\"outlook_email\"] = email\n", + "\n", + " return f\"✅ Logged in to Outlook as: {email}\"\n", + "\n", + " # except Exception as e:\n", + " # return f\"❌ Login failed: {str(e)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e11523b-a757-459c-8c4c-1ceef586439f", + "metadata": {}, + "outputs": [], + "source": [ + "def start_outlook_login(alias):\n", + " token_path = os.path.join(OUTLOOK_TOKEN_DIR, f\"outlook_token_{alias}.json\")\n", + " access_token = None\n", + " SESSION_STATE[\"outlook_token_path\"] = token_path\n", + " \n", + " # Load existing token\n", + " if os.path.exists(token_path):\n", + " return True, \"This alias already verified\"\n", + "\n", + " # If no token, run device flow\n", + " if not access_token:\n", + " app = PublicClientApplication(OUTLOOK_CLIENT_ID, authority=OUTLOOK_AUTHORITY)\n", + " flow = 
app.initiate_device_flow(scopes=OUTLOOK_SCOPES)\n", + "\n", + " if \"user_code\" not in flow:\n", + " return False, \"❌ Failed to initiate device login.\"\n", + "\n", + " # Store the flow for next step\n", + " SESSION_STATE[\"outlook_alias\"] = alias\n", + " SESSION_STATE[\"outlook_app\"] = app\n", + " SESSION_STATE[\"outlook_flow\"] = flow\n", + " \n", + " msg = f\"🔗 Visit: {flow['verification_uri']}\\n🔐 Enter code: {flow['user_code']}\"\n", + " return False, \"🔄 Waiting for verification...\\n\" + msg\n", + "\n", + "def finish_outlook_login():\n", + " flag = SESSION_STATE.get(\"outlook_login_flag\")\n", + " token_path = SESSION_STATE.get(\"outlook_token_path\")\n", + " if flag:\n", + " with open(token_path, \"r\") as f:\n", + " result = json.load(f)\n", + " access_token = result.get(\"access_token\")\n", + " else: \n", + " app = SESSION_STATE.get(\"outlook_app\")\n", + " flow = SESSION_STATE.get(\"outlook_flow\")\n", + " \n", + " result = app.acquire_token_by_device_flow(flow)\n", + " \n", + " if \"access_token\" not in result:\n", + " return f\"❌ Login failed: {result.get('error_description', 'Unknown error')}\"\n", + " \n", + " access_token = result[\"access_token\"]\n", + " \n", + " with open(token_path, \"w\") as f:\n", + " json.dump(result, f)\n", + " \n", + "\n", + " # Get user's email via Microsoft Graph\n", + " headers = {\"Authorization\": f\"Bearer {access_token}\"}\n", + " user_info = requests.get(\"https://graph.microsoft.com/v1.0/me\", headers=headers).json()\n", + " email = user_info.get(\"mail\") or user_info.get(\"userPrincipalName\")\n", + "\n", + " # Store in session\n", + " SESSION_STATE[\"outlook_token\"] = access_token\n", + " SESSION_STATE[\"outlook_email\"] = email\n", + "\n", + " return f\"✅ Logged in to Outlook as: {email}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3faea0d-723d-41e3-9683-db92dd918aba", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_outlook_emails(start, end, alias):\n", + " 
try:\n", + " start_date = datetime.strptime(start.strip(), \"%Y/%m/%d\").replace(tzinfo=timezone.utc)\n", + " end_date = datetime.strptime(end.strip(), \"%Y/%m/%d\").replace(tzinfo=timezone.utc)\n", + " except ValueError:\n", + " return \"❌ Invalid date format. Use YYYY/MM/DD.\"\n", + "\n", + " access_token = SESSION_STATE[\"outlook_token\"]\n", + "\n", + " if not access_token:\n", + " return f\"❌ No access token found for '{alias}'. Please login first.\"\n", + "\n", + " info = \"\"\n", + " try:\n", + " emails = fetch_outlook_emails(access_token, start_date, end_date)\n", + " if not emails:\n", + " return f\"❌ No email found.\"\n", + " info += f\"✅ Extracted and embedded {len(emails)} Outlook emails.\\n\"\n", + " info += outlook_embed_and_store(emails)\n", + " return info\n", + " except Exception as e:\n", + " return f\"❌ Error: {str(e)}\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "0c030701-8f16-4101-a501-f310ce61871c", + "metadata": {}, + "source": [ + "### 4. Google Workspace" + ] + }, + { + "cell_type": "markdown", + "id": "4b04baa3-0dfe-491a-974e-c1b97c978031", + "metadata": {}, + "source": [ + "#### Store google workspace credential json file under the credentials folder\n", + "\n", + "To avoid complicated steps and focus on LLMs stuff, I chose to utilize the Google Drive/Workspace API in test mode.\n", + "\n", + "I have included the setup and configuration steps in this guide:\n", + "[Google_Workspace_API_Credential_Guide](./Google_Workspace_API_Credential_Guide.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aeb8a99-d039-4550-8cac-f9370e7d7401", + "metadata": {}, + "outputs": [], + "source": [ + "GOOGLE_WORKSPACE_SCOPES = [\n", + " 'https://www.googleapis.com/auth/gmail.readonly',\n", + " 'https://www.googleapis.com/auth/drive.readonly',\n", + " 'https://www.googleapis.com/auth/documents.readonly',\n", + " 'https://www.googleapis.com/auth/spreadsheets.readonly',\n", + " 
'https://www.googleapis.com/auth/presentations.readonly'\n", + "]\n", + "GOOGLE_WORKSPACE_CREDENTIALS_FILE = 'credentials/google_drive_workspace_credentials.json'\n", + "GOOGLE_WORKSPACE_TOKEN_DIR = 'tokens/google_workspace_tokens'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d7c1ad3-d288-42a7-bc7b-4ddae0f3aaa3", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_google_doc(docs_service, file_id):\n", + " doc = docs_service.documents().get(documentId=file_id).execute()\n", + " content = \"\"\n", + " for elem in doc.get(\"body\", {}).get(\"content\", []):\n", + " if \"paragraph\" in elem:\n", + " for run in elem[\"paragraph\"][\"elements\"]:\n", + " content += run.get(\"textRun\", {}).get(\"content\", \"\")\n", + " return content.strip()\n", + "\n", + "def extract_google_sheet(service, file_id):\n", + " # Get spreadsheet metadata\n", + " spreadsheet = service.spreadsheets().get(spreadsheetId=file_id).execute()\n", + " all_text = \"\"\n", + "\n", + " # Loop through each sheet\n", + " for sheet in spreadsheet.get(\"sheets\", []):\n", + " title = sheet[\"properties\"][\"title\"]\n", + " result = service.spreadsheets().values().get(\n", + " spreadsheetId=file_id,\n", + " range=title\n", + " ).execute()\n", + "\n", + " values = result.get(\"values\", [])\n", + " sheet_text = f\"### Sheet: {title} ###\\n\"\n", + " sheet_text += \"\\n\".join([\", \".join(row) for row in values])\n", + " all_text += sheet_text + \"\\n\\n\"\n", + "\n", + " return all_text.strip()\n", + "\n", + "\n", + "def extract_google_slide(slides_service, file_id):\n", + " pres = slides_service.presentations().get(presentationId=file_id).execute()\n", + " text = \"\"\n", + " for slide in pres.get(\"slides\", []):\n", + " for element in slide.get(\"pageElements\", []):\n", + " shape = element.get(\"shape\")\n", + " if shape:\n", + " for p in shape.get(\"text\", {}).get(\"textElements\", []):\n", + " if \"textRun\" in p:\n", + " text += 
p[\"textRun\"][\"content\"]\n", + " return text.strip()\n", + "\n", + "def extract_pdf_from_drive(drive_service, file_id):\n", + " request = drive_service.files().get_media(fileId=file_id)\n", + " fh = io.BytesIO()\n", + " downloader = MediaIoBaseDownload(fh, request)\n", + " done = False\n", + " while not done:\n", + " _, done = downloader.next_chunk()\n", + " fh.seek(0)\n", + " reader = PdfReader(fh)\n", + " return \"\\n\".join([page.extract_text() for page in reader.pages if page.extract_text()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "005640a5-b7b3-4397-be1c-d86dccafdc61", + "metadata": {}, + "outputs": [], + "source": [ + "def login_google_workspace(alias):\n", + " try:\n", + " creds = google_workspace_get_creds(alias)\n", + " service = build('gmail', 'v1', credentials=creds)\n", + " profile = service.users().getProfile(userId='me').execute()\n", + " email = profile.get(\"emailAddress\")\n", + "\n", + " drive_service = build(\"drive\", \"v3\", credentials=creds)\n", + " docs_service = build('docs', 'v1', credentials=creds)\n", + " sheets_service = build('sheets', 'v4', credentials=creds)\n", + " slides_service = build('slides', 'v1', credentials=creds)\n", + "\n", + " # Store in session\n", + " SESSION_STATE[\"google_workspace_drive_service\"] = drive_service\n", + " SESSION_STATE[\"google_workspace_docs_service\"] = docs_service\n", + " SESSION_STATE[\"google_workspace_sheets_service\"] = sheets_service\n", + " SESSION_STATE[\"google_workspace_slides_service\"] = slides_service\n", + " SESSION_STATE[\"google_workspace_email\"] = email\n", + " SESSION_STATE[\"google_workspace_alias\"] = alias\n", + "\n", + " return f\"✅ Logged in as: {email}\"\n", + " except Exception as e:\n", + " return f\"❌ Login failed: {str(e)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2677d0aa-d61d-45e2-994a-b4707d839b48", + "metadata": {}, + "outputs": [], + "source": [ + "def 
google_workspace_get_creds(account_alias):\n", + " token_path = os.path.join(GOOGLE_WORKSPACE_TOKEN_DIR, f'google_workspace_token_{account_alias}.json')\n", + " \n", + " if os.path.exists(token_path):\n", + " creds = Credentials.from_authorized_user_file(token_path, GOOGLE_WORKSPACE_SCOPES)\n", + " else:\n", + " flow = InstalledAppFlow.from_client_secrets_file(GOOGLE_WORKSPACE_CREDENTIALS_FILE, GOOGLE_WORKSPACE_SCOPES)\n", + " creds = flow.run_local_server(port=0)\n", + " with open(\"token.json\", \"w\") as token:\n", + " token.write(creds.to_json())\n", + " return creds\n", + " \n", + "\n", + "def get_folder_id_by_name(drive_service, folder_name):\n", + " query = f\"mimeType='application/vnd.google-apps.folder' and name='{folder_name}' and trashed=false\"\n", + " results = drive_service.files().list(\n", + " q=query,\n", + " fields=\"files(id, name)\",\n", + " pageSize=1\n", + " ).execute()\n", + "\n", + " folders = results.get(\"files\", [])\n", + " if not folders:\n", + " raise ValueError(f\"❌ Folder named '{folder_name}' not found.\")\n", + " return folders[0]['id']\n", + "\n", + "\n", + "def extract_docs_from_google_workspace(folder_name):\n", + " info = \"\"\n", + "\n", + " file_types = {\n", + " 'application/vnd.google-apps.document': lambda fid: extract_google_doc(docs_service, fid),\n", + " 'application/vnd.google-apps.spreadsheet': lambda fid: extract_google_sheet(sheets_service, fid),\n", + " 'application/vnd.google-apps.presentation': lambda fid: extract_google_slide(slides_service, fid),\n", + " 'application/pdf': lambda fid: extract_pdf_from_drive(drive_service, fid),\n", + " }\n", + "\n", + " drive_service = SESSION_STATE.get(\"google_workspace_drive_service\")\n", + " docs_service = SESSION_STATE.get(\"google_workspace_docs_service\")\n", + " sheets_service = SESSION_STATE.get(\"google_workspace_sheets_service\")\n", + " slides_service = SESSION_STATE.get(\"google_workspace_slides_service\")\n", + " \n", + " if not drive_service or not docs_service 
or not sheets_service or not slides_service: \n", + " return None, \"Please login first.\\n\"\n", + " \n", + "\n", + " folder_id = get_folder_id_by_name(drive_service, folder_name)\n", + " print(\"folder_id\")\n", + " print(folder_id)\n", + " info += f\"Collection files from folder: {folder_name}\\n\"\n", + " \n", + " query = (\n", + " f\"'{folder_id}' in parents and (\"\n", + " 'mimeType=\"application/vnd.google-apps.document\" or '\n", + " 'mimeType=\"application/vnd.google-apps.spreadsheet\" or '\n", + " 'mimeType=\"application/vnd.google-apps.presentation\" or '\n", + " 'mimeType=\"application/pdf\")'\n", + " )\n", + " \n", + " results = drive_service.files().list(\n", + " q=query,\n", + " fields=\"files(id, name, mimeType)\",\n", + " pageSize=20\n", + " ).execute()\n", + "\n", + " docs = []\n", + " summary_info = {\n", + " 'application/vnd.google-apps.document': {'file_type': 'Google Doc', 'count': 0},\n", + " 'application/vnd.google-apps.spreadsheet': {'file_type': 'Google Sheet', 'count': 0},\n", + " 'application/vnd.google-apps.presentation': {'file_type': 'Google Silde', 'count': 0},\n", + " 'application/pdf': {'file_type': 'PDF', 'count': 0}\n", + " }\n", + " for file in results.get(\"files\", []):\n", + " print(file['mimeType'])\n", + " extractor = file_types.get(file['mimeType'])\n", + " if extractor:\n", + " try:\n", + " content = extractor(file[\"id\"])\n", + " if content:\n", + " docs.append(Document(page_content=content, metadata={\"source\": file[\"name\"]}))\n", + " summary_info[file['mimeType']]['count'] += 1\n", + " print(file['mimeType'])\n", + " print(summary_info[file['mimeType']]['count'])\n", + " except Exception as e:\n", + " print(f\"❌ Error processing {file['name']}: {e}\")\n", + " \n", + " total = 0;\n", + " for file_type, element in summary_info.items():\n", + " total += element['count']\n", + " info += f\"Found {element['count']} {element['file_type']} files\\n\"\n", + " info += f\"Total documents loaded: {total}\\n\"\n", + " return 
def google_workspace_embed_and_store(docs):
    """Split `docs` into chunks, embed them, and persist them to the Google
    Workspace Chroma vectorstore.

    Args:
        docs: list of Document objects to embed.

    Returns:
        A human-readable status string describing how many chunks were added.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [doc for doc in text_splitter.split_documents(docs) if doc.page_content.strip()]

    if not chunks:
        return "⚠️ No non-empty chunks to embed. Skipping vectorstore update."

    embeddings = OpenAIEmbeddings()

    # Check existence BEFORE Chroma.from_documents creates the directory.
    # The original checked afterwards, so the seed chunk was never counted
    # in `total` (off-by-one in the reported chunk count).
    store_exists = os.path.exists(GOOGLE_WORKSPACE_VECTOR_DIR)
    if store_exists:
        vectorstore = Chroma(persist_directory=GOOGLE_WORKSPACE_VECTOR_DIR, embedding_function=embeddings)
        total = 0
    else:
        # Seed a brand-new store with the first chunk so add_documents below
        # has an existing collection to append to.
        vectorstore = Chroma.from_documents(
            documents=chunks[:1],
            embedding=embeddings,
            persist_directory=GOOGLE_WORKSPACE_VECTOR_DIR,
        )
        chunks = chunks[1:]
        total = 1

    for batch in batch_chunks(chunks):
        vectorstore.add_documents(batch)
        total += len(batch)

    info = f"Vectorstore updated with {total} new chunks.\n"
    num_docs = vectorstore._collection.count()
    info += f"Vectorstore contains {num_docs} chunks.\n"
    return info


def extract_google_workspace_folder(folder_path):
    """Extract supported files from a Drive folder and embed them into the vectorstore.

    Args:
        folder_path: name of the Drive folder to process.

    Returns:
        A human-readable status string. Never raises: failures (e.g. folder not
        found) are reported in the returned string so the Gradio UI does not crash.
    """
    try:
        info = f"Process files under: {folder_path}\n"
        # extract_info summarizes what was collected (counts per file type).
        docs, extract_info = extract_docs_from_google_workspace(folder_path)
        info += extract_info
        if not docs:
            return info + "No valid files found in the given range."
        info += f"Fetched {len(docs)} files.\n"
        info += google_workspace_embed_and_store(docs)
        return info
    except Exception as e:
        # Surface the error to the UI instead of propagating (the original left
        # this handler commented out, so a missing folder crashed the callback).
        return f"❌ Extraction failed: {str(e)}"


# One persisted Chroma directory per knowledge source.
VECTOR_DIR = [LOCAL_VECTOR_DIR, GMAIL_VECTOR_DIR, OUTLOOK_VECTOR_DIR, GOOGLE_WORKSPACE_VECTOR_DIR, SLACK_VECTOR_DIR]
# Persona prompt: the assistant answers *as the user*, grounded only in the
# retrieved private documents and the running chat history.
prompt_template = PromptTemplate(
    input_variables=["question", "context", "chat_history"],
    template="""
You are a personal assistant trained on the user's private documents, emails, and notes.
Your role is to answer questions as if you are the user themself — based on their experiences, thoughts, habits, personality, and preferences reflected in the uploaded materials.
Also, you are having a conversation with the user. Use the chat history to understand the context of the conversation.
At the beginning of each conversation, ask the user what name they would like to assign to you. If the user later requests a name change, update your name accordingly without delay.

Use the retrieved documents to:
- Summarize the user's background, actions, and communication patterns
- Simulate how the user would respond to questions
- Infer personality traits, professional history, and personal interests

Always cite the type of source (e.g., email, resume, journal) when appropriate. If no relevant information is available, say so honestly.

You must never make assumptions beyond what the user's data reveals.

Chat History:
{chat_history}

Retrieved Context:
{context}

User Question:
{question}
"""
)

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
embeddings = OpenAIEmbeddings()

# Build one retriever per vectorstore that has actually been persisted to disk.
retrievers = [
    Chroma(persist_directory=vec_dir, embedding_function=embeddings).as_retriever(search_kwargs={"k": 10})
    for vec_dir in VECTOR_DIR
    if os.path.exists(vec_dir)
]

merged_retriever = MergerRetriever(retrievers=retrievers)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=merged_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt_template}
)


def chat_with_rag(user_input, chat_history):
    """Answer one user turn through the RAG chain and record both turns in the UI history."""
    answer = conversation_chain.invoke({"question": user_input})["answer"]
    # Mutate the Gradio history list in place (same effect as two appends).
    chat_history.extend([
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": answer},
    ])
    return "", chat_history
def delete_knowledge(delete_type):
    """Delete the vectorstore backing one knowledge source and rebuild the RAG chain.

    Args:
        delete_type: UI label of the source ("Local Folder", "Gmail", "Outlook",
            "Google Workspace", or "Slack").

    Returns:
        A status string for the UI.
    """
    global conversation_chain, retrievers

    # Map UI label -> persisted vectorstore directory. The original if/elif chain
    # left `vector_dir` unbound on an unrecognized label (UnboundLocalError).
    dir_by_type = {
        "Local Folder": LOCAL_VECTOR_DIR,
        "Gmail": GMAIL_VECTOR_DIR,
        "Outlook": OUTLOOK_VECTOR_DIR,
        "Google Workspace": GOOGLE_WORKSPACE_VECTOR_DIR,
        "Slack": SLACK_VECTOR_DIR,
    }
    vector_dir = dir_by_type.get(delete_type)
    if vector_dir is None:
        return f"Unknown knowledge source: {delete_type}"

    if not os.path.exists(vector_dir):
        return "Vector store does not exist."

    Chroma(persist_directory=vector_dir, embedding_function=embeddings).delete_collection()

    # Rebuild retrievers from the stores that remain on disk.
    retrievers = []
    for vec_dir in VECTOR_DIR:
        if os.path.exists(vec_dir):
            vectorstore = Chroma(persist_directory=vec_dir, embedding_function=embeddings)
            retrievers.append(vectorstore.as_retriever(search_kwargs={"k": 10}))

    merged_retriever = MergerRetriever(retrievers=retrievers)
    # Keep the persona prompt when rebuilding: the original omitted
    # combine_docs_chain_kwargs here, silently dropping the custom system
    # prompt after any deletion (inconsistent with the initial chain setup).
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=merged_retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt_template},
    )
    return "Deleted successfully."
0.4);\n", + " color: white;\n", + "}\n", + ".gr-button-stop {\n", + " background-color: #cf142b !important;\n", + " color: white !important;\n", + " box-shadow: 0 4px 12px rgba(128, 128, 128, 0.4);\n", + "}\n", + "\"\"\") as ui:\n", + " SESSION_STATE = {\n", + " \"gmail_service\": None, \"gmail_email\": None, \"gmail_alias\": None,\n", + " \"outlook_email\": None, \"outlook_alias\": None,\n", + " \"outlook_login_app\": None, \"outlook_login_flow\": None,\n", + " \"outlook_token_path\": None,\n", + " \"google_workspace_email\": None, \"google_workspace_alias\": None, \n", + " \"google_workspace_drive_service\": None, \"google_workspace_docs_service\": None,\n", + " \"google_workspace_sheets_service\": None, \"google_workspace_slides_service\": None\n", + " }\n", + " outlook_login_flag = gr.State(False)\n", + " current_selected = gr.State(\"\")\n", + " section_names = [\"Local Folder\", \"Gmail\", \"Outlook\", \"Google Workspace\", \"Slack\"]\n", + "\n", + " def show_section(current_selected, current_section):\n", + " updates = []\n", + " if current_selected == current_section:\n", + "\n", + " for sec in section_names:\n", + " updates.append(gr.update(visible=False))\n", + " for sec in section_names:\n", + " updates.append(gr.update(elem_classes=[\"unselected\"]))\n", + " updates.append(\"\")\n", + " else:\n", + " updates = []\n", + " for sec in section_names:\n", + " if sec == current_selected:\n", + " updates.append(gr.update(visible=True))\n", + " else:\n", + " updates.append(gr.update(visible=False))\n", + " for sec in section_names:\n", + " if sec == current_selected:\n", + " updates.append(gr.update(elem_classes=[\"selected\"]))\n", + " else:\n", + " updates.append(gr.update(elem_classes=[\"unselected\"]))\n", + " updates.append(current_selected)\n", + " return tuple(updates)\n", + "\n", + " \n", + " \n", + " gr.Markdown(\"## Personal Knowledge Assistant\")\n", + "\n", + " chatbot = gr.Chatbot(label=\"Chat\", show_copy_button=True, type=\"messages\")\n", + " 
user_input = gr.Textbox(\n", + " placeholder=\"Talk with your personal knowledge assistant...\",\n", + " label=\"Enter Message\",\n", + " lines=1\n", + " )\n", + " user_input.submit(\n", + " fn=chat_with_rag,\n", + " inputs=[user_input, chatbot],\n", + " outputs=[user_input, chatbot]\n", + " )\n", + " \n", + " gr.HTML(\"