From ea4957e85d2a960a204a9933e2e7adebee27ea7c Mon Sep 17 00:00:00 2001 From: Ajay Singh Date: Fri, 22 Aug 2025 10:43:15 -0700 Subject: [PATCH] Removing the output from my testdata --- .../day-1-bank-account-summarization.ipynb | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 week1/community-contributions/day-1-bank-account-summarization.ipynb diff --git a/week1/community-contributions/day-1-bank-account-summarization.ipynb b/week1/community-contributions/day-1-bank-account-summarization.ipynb new file mode 100644 index 0000000..bae0cfe --- /dev/null +++ b/week1/community-contributions/day-1-bank-account-summarization.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f60dab2a-a377-4761-8be3-69a3b8124ca6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pdfplumber\n", + "import re\n", + "import json\n", + "\n", + "def parse_transaction_line(line):\n", + " # More specific pattern that captures each component'\n", + " pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n", + " match = re.match(pattern, line.strip())\n", + " \n", + " if match:\n", + " date, description, amount, balance = match.groups()\n", + " return {\n", + " 'date': date,\n", + " 'description': description.strip(),\n", + " 'amount': amount,\n", + " 'balance': balance\n", + " }\n", + " return None\n", + "\n", + "def parse_Credit_Card_transaction_line(line):\n", + " # More specific pattern that captures each component'\n", + " pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n", + " match = re.match(pattern, line.strip())\n", + " \n", + " if match:\n", + " date, description, amount = match.groups()\n", + " return {\n", + " 'date': date,\n", + " 'description': description.strip(),\n", + " 'amount': amount\n", + " }\n", + " return None\n", + "\n", + "# \n", + "def extract_transactions_CA_from_pdf(pdf_path):\n", + " transactions = []\n", + " \n", + " with pdfplumber.open(pdf_path) as pdf:\n", + " for page in pdf.pages:\n", + " text = page.extract_text()\n", + " for line in text.split(\"\\n\"):\n", + " parsed = parse_transaction_line(line)\n", + " if parsed:\n", + " transactions.append(parsed)\n", + " return transactions\n", + "\n", + "def extract_transactions_CreditCard_from_pdf(pdf_path):\n", + " transactions = []\n", + " \n", + " with pdfplumber.open(pdf_path) as pdf:\n", + " for page in pdf.pages:\n", + " text = page.extract_text()\n", + " for line in text.split(\"\\n\"):\n", + " parsed = parse_Credit_Card_transaction_line(line)\n", + " if parsed:\n", + " transactions.append(parsed)\n", + " return transactions\n", + "# print(transactions, len(transactions)) # check first 10 extracted lines\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82c34eac-fc30-41d6-8325-77efc48d0dd8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load environment variables in a file called .env\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "import os\n", + "\n", + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "769ee512-75f5-480a-9407-f9c4cd46b679", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# ---------- STEP 3: Build prompts ----------\n", + "\n", + "def build_prompts(transactions):\n", + " system_prompt = \"\"\"\n", + "You are a personal financial assistant.\n", + "Your job is to analyze bank transactions, categorize each expense into categories such as:\n", + "Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n", + "\n", + "Your responsibilities:\n", + "\n", + "Categorize all transactions and compute total spending per category.\n", + "\n", + "Identify the top 5 categories by total spending.\n", + "\n", + "Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n", + "\n", + "For these, group transactions by merchant/description and count frequency.\n", + "\n", + "Highlight the top 5 frequent purchases, with both frequency and total spend.\n", + "\n", + "Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n", + "\n", + "Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n", + "\n", + "Big categories (e.g., Rent, Travel, Entertainment).\n", + "\n", + "Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n", + "\n", + "The output should be a valid JSON object with this structure:\n", + "{\n", + " \"summary\": {\n", + " \"Food\": ,\n", + " \"Clothing\": ,\n", + " \"Rent\": ,\n", + " \"Utilities\": ,\n", + " \"Entertainment\": ,\n", + " \"Travel\": ,\n", + " \"Health\": ,\n", + " \"Miscellaneous\": ,\n", + " \"Others\": \n", + " },\n", + " \"total_expenses\": ,\n", + " \"top_5_categories\": [ {\"category\": , \"amount\": } ],\n", + " \"top_5_frequent_purchases\": [ {\"item\": , \"count\": , \"total\": } ],\n", + " \"insights\": \"\",\n", + " \"recommendations\": [ \"\", \"\", \"\" ]\n", + "}\n", + "\n", + "\"\"\"\n", + "\n", + " user_prompt = \"Here are my bank account transactions for the past few months:\\n\\n\"\n", + " for txn in transactions:\n", + " user_prompt += f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n", + "\n", + " user_prompt += \"\"\"\n", + "Please analyze these transactions according to the instructions in the system prompt.\n", + "\"\"\"\n", + "\n", + " return system_prompt, user_prompt\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307ca02b-2df6-4996-85e7-d073f74592f5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# ---------- STEP 4: Call OpenAI ----------\n", + "def analyze_transactions(pdf_path):\n", + " transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n", + " system_prompt, user_prompt = build_prompts(transactions)\n", + "\n", + " client = OpenAI() # assumes OPENAI_API_KEY is set in env\n", + "\n", + " response = client.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ],\n", + " response_format={\"type\": \"json_object\"} # ensures valid JSON\n", + " )\n", + "\n", + " result = response.choices[0].message.content\n", + " return json.loads(result)\n", + "\n", + "# ---------- MAIN ----------\n", + "if __name__ == \"__main__\":\n", + " cc_pdf_file = \"cc_statement.pdf\"\n", + " # To Debug in case of failures\n", + " # transactions = extract_transactions_from_pdf(pdf_file)\n", + " # print(cc_transactions,len(cc_transactions))\n", + " # system_prompt, user_prompt = build_prompts(cc_transactions)\n", + " # print(system_prompt, user_prompt)\n", + "\n", + " # Analyse the function to create a smart alert\n", + " cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n", + " analysis = analyze_transactions(cc_pdf_file)\n", + " print(\"=========================================\")\n", + " print(\"=== Top 5 Spending Habits & Insights ====\")\n", + " print(\"=========================================\")\n", + " print(json.dumps(analysis, indent=2))\n", + " print(\"=========================================\")\n", + " print(\"=========================================\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831922f4-5efd-4cba-9975-54767b65f6d6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}