Merge pull request #615 from ajaycloud9/community-contributions-branch

Automated Spending Analysis from Bank Statements
2025-08-23 10:15:36 +01:00
parent 9e6f240422 ea4957e85d
commit 4376efe9a5
1 changed files with 270 additions and 0 deletions
--- a/week1/community-contributions/day-1-bank-account-summarization.ipynb
+++ b/week1/community-contributions/day-1-bank-account-summarization.ipynb
@@ -0,0 +1,270 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pdfplumber\n",
+    "import re\n",
+    "import json\n",
+    "\n",
+    "def parse_transaction_line(line):\n",
+    "    # More specific pattern that captures each component'\n",
+    "    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
+    "    match = re.match(pattern, line.strip())\n",
+    "    \n",
+    "    if match:\n",
+    "        date, description, amount, balance = match.groups()\n",
+    "        return {\n",
+    "            'date': date,\n",
+    "            'description': description.strip(),\n",
+    "            'amount': amount,\n",
+    "            'balance': balance\n",
+    "        }\n",
+    "    return None\n",
+    "\n",
+    "def parse_Credit_Card_transaction_line(line):\n",
+    "    # More specific pattern that captures each component'\n",
+    "    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
+    "    match = re.match(pattern, line.strip())\n",
+    "    \n",
+    "    if match:\n",
+    "        date, description, amount = match.groups()\n",
+    "        return {\n",
+    "            'date': date,\n",
+    "            'description': description.strip(),\n",
+    "            'amount': amount\n",
+    "        }\n",
+    "    return None\n",
+    "\n",
+    "# ---------- PDF extraction helpers ----------\n",
+    "def extract_transactions_CA_from_pdf(pdf_path):\n",
+    "    \"\"\"Return the rows parse_transaction_line recognises in the PDF, in page/line order.\"\"\"\n",
+    "    transactions = []\n",
+    "    with pdfplumber.open(pdf_path) as pdf:\n",
+    "        for page in pdf.pages:\n",
+    "            # a page with no extractable text must not crash the scan (may be None)\n",
+    "            for line in (page.extract_text() or \"\").split(\"\\n\"):\n",
+    "                parsed = parse_transaction_line(line)\n",
+    "                if parsed:\n",
+    "                    transactions.append(parsed)\n",
+    "    return transactions\n",
+    "\n",
+    "def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
+    "    \"\"\"Return the rows parse_Credit_Card_transaction_line recognises, in page/line order.\"\"\"\n",
+    "    transactions = []\n",
+    "    with pdfplumber.open(pdf_path) as pdf:\n",
+    "        for page in pdf.pages:\n",
+    "            # extract_text() may give None for a page without text; treat it as empty\n",
+    "            for line in (page.extract_text() or \"\").split(\"\\n\"):\n",
+    "                parsed = parse_Credit_Card_transaction_line(line)\n",
+    "                if parsed:\n",
+    "                    transactions.append(parsed)\n",
+    "    return transactions\n",
+    "# Debug tip: print(transactions, len(transactions)) just before an extractor returns to inspect the parsed rows\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Load environment variables from a file called .env\n",
+    "from dotenv import load_dotenv\n",
+    "from openai import OpenAI\n",
+    "import os\n",
+    "\n",
+    "load_dotenv(override=True)\n",
+    "api_key = os.getenv('OPENAI_API_KEY')\n",
+    "\n",
+    "# Check the key. Stray whitespace is tested before the prefix, otherwise a key with\n",
+    "# leading blanks would be reported as having the wrong prefix.\n",
+    "if not api_key:\n",
+    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
+    "elif api_key.strip() != api_key:\n",
+    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
+    "elif not api_key.startswith(\"sk-proj-\"):\n",
+    "    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
+    "else:\n",
+    "    print(\"API key found and looks good so far!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "769ee512-75f5-480a-9407-f9c4cd46b679",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# ---------- STEP 3: Build prompts ----------\n",
+    "\n",
+    "def build_prompts(transactions):\n",
+    "    system_prompt = \"\"\"\n",
+    "You are a personal financial assistant.\n",
+    "Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
+    "Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
+    "\n",
+    "Your responsibilities:\n",
+    "\n",
+    "Categorize all transactions and compute total spending per category.\n",
+    "\n",
+    "Identify the top 5 categories by total spending.\n",
+    "\n",
+    "Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
+    "\n",
+    "For these, group transactions by merchant/description and count frequency.\n",
+    "\n",
+    "Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
+    "\n",
+    "Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
+    "\n",
+    "Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n",
+    "\n",
+    "Big categories (e.g., Rent, Travel, Entertainment).\n",
+    "\n",
+    "Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
+    "\n",
+    "The output should be a valid JSON object with this structure:\n",
+    "{\n",
+    "  \"summary\": {\n",
+    "      \"Food\": <amount>,\n",
+    "      \"Clothing\": <amount>,\n",
+    "      \"Rent\": <amount>,\n",
+    "      \"Utilities\": <amount>,\n",
+    "      \"Entertainment\": <amount>,\n",
+    "      \"Travel\": <amount>,\n",
+    "      \"Health\": <amount>,\n",
+    "      \"Miscellaneous\": <amount>,\n",
+    "      \"Others\": <amount>\n",
+    "  },\n",
+    "  \"total_expenses\": <total>,\n",
+    "  \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
+    "  \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
+    "  \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
+    "  \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
+    "}\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "    user_prompt = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
+    "    for txn in transactions:\n",
+    "        user_prompt += f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
+    "\n",
+    "    user_prompt += \"\"\"\n",
+    "Please analyze these transactions according to the instructions in the system prompt.\n",
+    "\"\"\"\n",
+    "\n",
+    "    return system_prompt, user_prompt\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "307ca02b-2df6-4996-85e7-d073f74592f5",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# ---------- STEP 4: Call OpenAI ----------\n",
+    "def analyze_transactions(pdf_path, model=\"gpt-4o-mini\"):\n",
+    "    \"\"\"Parse the credit-card statement at pdf_path and return the model's JSON reply as a dict.\n",
+    "\n",
+    "    model: chat model name; it has to accept response_format={\"type\": \"json_object\"}.\n",
+    "    \"\"\"\n",
+    "    transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n",
+    "    system_prompt, user_prompt = build_prompts(transactions)\n",
+    "    client = OpenAI()  # assumes OPENAI_API_KEY is set in env\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": user_prompt},\n",
+    "        ],\n",
+    "        response_format={\"type\": \"json_object\"},  # request a single JSON object back\n",
+    "    )\n",
+    "    return json.loads(response.choices[0].message.content)\n",
+    "\n",
+    "# ---------- MAIN ----------\n",
+    "if __name__ == \"__main__\":\n",
+    "    cc_pdf_file = \"cc_statement.pdf\"\n",
+    "\n",
+    "    # Debugging aid: cc_transactions holds the parsed rows; print it, or the result of\n",
+    "    # build_prompts(cc_transactions), to see exactly what is sent to the model.\n",
+    "    # NOTE: analyze_transactions() parses the PDF again on its own.\n",
+    "    cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
+    "    analysis = analyze_transactions(cc_pdf_file)\n",
+    "    print(\n",
+    "        \"=========================================\",\n",
+    "        \"=== Top 5 Spending Habits & Insights ====\",\n",
+    "        \"=========================================\",\n",
+    "        json.dumps(analysis, indent=2),\n",
+    "        \"=========================================\",\n",
+    "        \"=========================================\",\n",
+    "        sep=\"\\n\",\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "831922f4-5efd-4cba-9975-54767b65f6d6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}