Merge pull request #615 from ajaycloud9/community-contributions-branch

Automated Spending Analysis from Bank Statements
This commit is contained in:
Ed Donner
2025-08-23 10:15:36 +01:00
committed by GitHub

View File

@@ -0,0 +1,270 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"import pdfplumber\n",
"import re\n",
"import json\n",
"\n",
"def parse_transaction_line(line):\n",
" # More specific pattern that captures each component'\n",
" pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
" match = re.match(pattern, line.strip())\n",
" \n",
" if match:\n",
" date, description, amount, balance = match.groups()\n",
" return {\n",
" 'date': date,\n",
" 'description': description.strip(),\n",
" 'amount': amount,\n",
" 'balance': balance\n",
" }\n",
" return None\n",
"\n",
"def parse_Credit_Card_transaction_line(line):\n",
" # More specific pattern that captures each component'\n",
" pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
" match = re.match(pattern, line.strip())\n",
" \n",
" if match:\n",
" date, description, amount = match.groups()\n",
" return {\n",
" 'date': date,\n",
" 'description': description.strip(),\n",
" 'amount': amount\n",
" }\n",
" return None\n",
"\n",
"# \n",
"def extract_transactions_CA_from_pdf(pdf_path):\n",
" transactions = []\n",
" \n",
" with pdfplumber.open(pdf_path) as pdf:\n",
" for page in pdf.pages:\n",
" text = page.extract_text()\n",
" for line in text.split(\"\\n\"):\n",
" parsed = parse_transaction_line(line)\n",
" if parsed:\n",
" transactions.append(parsed)\n",
" return transactions\n",
"\n",
"def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
" transactions = []\n",
" \n",
" with pdfplumber.open(pdf_path) as pdf:\n",
" for page in pdf.pages:\n",
" text = page.extract_text()\n",
" for line in text.split(\"\\n\"):\n",
" parsed = parse_Credit_Card_transaction_line(line)\n",
" if parsed:\n",
" transactions.append(parsed)\n",
" return transactions\n",
"# print(transactions, len(transactions)) # check first 10 extracted lines\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import os\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "769ee512-75f5-480a-9407-f9c4cd46b679",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# ---------- STEP 3: Build prompts ----------\n",
"\n",
"def build_prompts(transactions):\n",
" system_prompt = \"\"\"\n",
"You are a personal financial assistant.\n",
"Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
"Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
"\n",
"Your responsibilities:\n",
"\n",
"Categorize all transactions and compute total spending per category.\n",
"\n",
"Identify the top 5 categories by total spending.\n",
"\n",
"Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
"\n",
"For these, group transactions by merchant/description and count frequency.\n",
"\n",
"Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
"\n",
"Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
"\n",
"Suggest 23 actionable recommendations to reduce spending, targeting both:\n",
"\n",
"Big categories (e.g., Rent, Travel, Entertainment).\n",
"\n",
"Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
"\n",
"The output should be a valid JSON object with this structure:\n",
"{\n",
" \"summary\": {\n",
" \"Food\": <amount>,\n",
" \"Clothing\": <amount>,\n",
" \"Rent\": <amount>,\n",
" \"Utilities\": <amount>,\n",
" \"Entertainment\": <amount>,\n",
" \"Travel\": <amount>,\n",
" \"Health\": <amount>,\n",
" \"Miscellaneous\": <amount>,\n",
" \"Others\": <amount>\n",
" },\n",
" \"total_expenses\": <total>,\n",
" \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
" \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
" \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
" \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
"}\n",
"\n",
"\"\"\"\n",
"\n",
" user_prompt = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
" for txn in transactions:\n",
" user_prompt += f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
"\n",
" user_prompt += \"\"\"\n",
"Please analyze these transactions according to the instructions in the system prompt.\n",
"\"\"\"\n",
"\n",
" return system_prompt, user_prompt\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "307ca02b-2df6-4996-85e7-d073f74592f5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# ---------- STEP 4: Call OpenAI ----------\n",
"def analyze_transactions(pdf_path):\n",
" transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n",
" system_prompt, user_prompt = build_prompts(transactions)\n",
"\n",
" client = OpenAI() # assumes OPENAI_API_KEY is set in env\n",
"\n",
" response = client.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" response_format={\"type\": \"json_object\"} # ensures valid JSON\n",
" )\n",
"\n",
" result = response.choices[0].message.content\n",
" return json.loads(result)\n",
"\n",
"# ---------- MAIN ----------\n",
"if __name__ == \"__main__\":\n",
" cc_pdf_file = \"cc_statement.pdf\"\n",
" # To Debug in case of failures\n",
" # transactions = extract_transactions_from_pdf(pdf_file)\n",
" # print(cc_transactions,len(cc_transactions))\n",
" # system_prompt, user_prompt = build_prompts(cc_transactions)\n",
" # print(system_prompt, user_prompt)\n",
"\n",
" # Analyse the function to create a smart alert\n",
" cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
" analysis = analyze_transactions(cc_pdf_file)\n",
" print(\"=========================================\")\n",
" print(\"=== Top 5 Spending Habits & Insights ====\")\n",
" print(\"=========================================\")\n",
" print(json.dumps(analysis, indent=2))\n",
" print(\"=========================================\")\n",
" print(\"=========================================\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "831922f4-5efd-4cba-9975-54767b65f6d6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}