Merge pull request #615 from ajaycloud9/community-contributions-branch
Automated Spending Analysis from Bank Statements
This commit is contained in:
@@ -0,0 +1,270 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pdfplumber\n",
|
||||
"import re\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"def parse_transaction_line(line):\n",
|
||||
"    \"\"\"Parse one statement line that carries an amount and a balance.\n",
|
||||
"\n",
|
||||
"    The line is stripped, then must consist of a two-digit/two-digit date,\n",
|
||||
"    a free-text description and two trailing decimal figures.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        line: A single line of text taken from the statement.\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        dict with string values under 'date', 'description', 'amount' and\n",
|
||||
"        'balance', or None when the line does not fit the layout.\n",
|
||||
"    \"\"\"\n",
|
||||
"    layout = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
|
||||
"    found = re.match(layout, line.strip())\n",
|
||||
"    if found is None:\n",
|
||||
"        return None\n",
|
||||
"\n",
|
||||
"    # Pair each captured group with its field name, in pattern order.\n",
|
||||
"    fields = dict(zip(('date', 'description', 'amount', 'balance'), found.groups()))\n",
|
||||
"    fields['description'] = fields['description'].strip()\n",
|
||||
"    return fields\n",
|
||||
"\n",
|
||||
"def parse_Credit_Card_transaction_line(line):\n",
|
||||
"    \"\"\"Parse one credit-card statement line that ends in a single amount.\n",
|
||||
"\n",
|
||||
"    The line is stripped, then must consist of a two-digit/two-digit date,\n",
|
||||
"    a free-text description and one trailing decimal figure.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        line: A single line of text taken from the statement.\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        dict with string values under 'date', 'description' and 'amount',\n",
|
||||
"        or None when the line does not fit the layout.\n",
|
||||
"    \"\"\"\n",
|
||||
"    layout = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
|
||||
"    found = re.match(layout, line.strip())\n",
|
||||
"    if found is None:\n",
|
||||
"        return None\n",
|
||||
"\n",
|
||||
"    # Pair each captured group with its field name, in pattern order.\n",
|
||||
"    fields = dict(zip(('date', 'description', 'amount'), found.groups()))\n",
|
||||
"    fields['description'] = fields['description'].strip()\n",
|
||||
"    return fields\n",
|
||||
"\n",
|
||||
"# ---------- PDF extraction helpers ----------\n",
|
||||
"def extract_transactions_CA_from_pdf(pdf_path):\n",
|
||||
"    \"\"\"Collect the transactions found in a statement PDF.\n",
|
||||
"\n",
|
||||
"    Every line of every page is offered to parse_transaction_line(); lines\n",
|
||||
"    it returns None for are ignored.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        pdf_path: Location of the PDF, passed straight to pdfplumber.open().\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        list of transaction dicts in page and line order; empty when no\n",
|
||||
"        line matched.\n",
|
||||
"    \"\"\"\n",
|
||||
"    transactions = []\n",
|
||||
"\n",
|
||||
"    with pdfplumber.open(pdf_path) as pdf:\n",
|
||||
"        for page in pdf.pages:\n",
|
||||
"            # extract_text() can come back empty - None in some pdfplumber\n",
|
||||
"            # releases - when a page has no extractable text; fall back to \"\"\n",
|
||||
"            # so .split() is never called on None.\n",
|
||||
"            text = page.extract_text() or \"\"\n",
|
||||
"            for line in text.split(\"\\n\"):\n",
|
||||
"                parsed = parse_transaction_line(line)\n",
|
||||
"                if parsed:\n",
|
||||
"                    transactions.append(parsed)\n",
|
||||
"    return transactions\n",
|
||||
"\n",
|
||||
"def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
|
||||
"    \"\"\"Collect the transactions found in a credit-card statement PDF.\n",
|
||||
"\n",
|
||||
"    Every line of every page is offered to\n",
|
||||
"    parse_Credit_Card_transaction_line(); lines it returns None for are\n",
|
||||
"    ignored.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        pdf_path: Location of the PDF, passed straight to pdfplumber.open().\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        list of transaction dicts in page and line order; empty when no\n",
|
||||
"        line matched.\n",
|
||||
"    \"\"\"\n",
|
||||
"    transactions = []\n",
|
||||
"\n",
|
||||
"    with pdfplumber.open(pdf_path) as pdf:\n",
|
||||
"        for page in pdf.pages:\n",
|
||||
"            # extract_text() can come back empty - None in some pdfplumber\n",
|
||||
"            # releases - when a page has no extractable text; fall back to \"\"\n",
|
||||
"            # so .split() is never called on None.\n",
|
||||
"            text = page.extract_text() or \"\"\n",
|
||||
"            for line in text.split(\"\\n\"):\n",
|
||||
"                parsed = parse_Credit_Card_transaction_line(line)\n",
|
||||
"                if parsed:\n",
|
||||
"                    transactions.append(parsed)\n",
|
||||
"    return transactions\n",
|
||||
"# print(transactions, len(transactions)) # check first 10 extracted lines\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables from a file called .env\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key and print a hint for the first problem found.\n",
|
||||
"# Surrounding whitespace is tested before the prefix: a key with leading spaces\n",
|
||||
"# or tabs also fails startswith() and would otherwise get the wrong hint.\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
"    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
"    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
"    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
"    print(\"API key found and looks good so far!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "769ee512-75f5-480a-9407-f9c4cd46b679",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# ---------- STEP 3: Build prompts ----------\n",
|
||||
"\n",
|
||||
"def build_prompts(transactions):\n",
|
||||
"    \"\"\"Build the (system_prompt, user_prompt) pair for the spending analysis.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        transactions: Iterable of dicts providing 'date', 'description' and\n",
|
||||
"            'amount' keys.\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        Tuple of two strings: the fixed instruction prompt, and a user prompt\n",
|
||||
"        listing one bullet per transaction.\n",
|
||||
"    \"\"\"\n",
|
||||
"    system_prompt = \"\"\"\n",
|
||||
"You are a personal financial assistant.\n",
|
||||
"Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
|
||||
"Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
|
||||
"\n",
|
||||
"Your responsibilities:\n",
|
||||
"\n",
|
||||
"Categorize all transactions and compute total spending per category.\n",
|
||||
"\n",
|
||||
"Identify the top 5 categories by total spending.\n",
|
||||
"\n",
|
||||
"Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
|
||||
"\n",
|
||||
"For these, group transactions by merchant/description and count frequency.\n",
|
||||
"\n",
|
||||
"Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
|
||||
"\n",
|
||||
"Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
|
||||
"\n",
|
||||
"Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n",
|
||||
"\n",
|
||||
"Big categories (e.g., Rent, Travel, Entertainment).\n",
|
||||
"\n",
|
||||
"Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
|
||||
"\n",
|
||||
"The output should be a valid JSON object with this structure:\n",
|
||||
"{\n",
|
||||
"  \"summary\": {\n",
|
||||
"    \"Food\": <amount>,\n",
|
||||
"    \"Clothing\": <amount>,\n",
|
||||
"    \"Rent\": <amount>,\n",
|
||||
"    \"Utilities\": <amount>,\n",
|
||||
"    \"Entertainment\": <amount>,\n",
|
||||
"    \"Travel\": <amount>,\n",
|
||||
"    \"Health\": <amount>,\n",
|
||||
"    \"Miscellaneous\": <amount>,\n",
|
||||
"    \"Others\": <amount>\n",
|
||||
"  },\n",
|
||||
"  \"total_expenses\": <total>,\n",
|
||||
"  \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
|
||||
"  \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
|
||||
"  \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
|
||||
"  \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"    # One bullet per transaction, each terminated by a newline.\n",
|
||||
"    bullet_lines = [\n",
|
||||
"        f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
|
||||
"        for txn in transactions\n",
|
||||
"    ]\n",
|
||||
"    intro = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
|
||||
"    closing = \"\"\"\n",
|
||||
"Please analyze these transactions according to the instructions in the system prompt.\n",
|
||||
"\"\"\"\n",
|
||||
"    user_prompt = intro + \"\".join(bullet_lines) + closing\n",
|
||||
"\n",
|
||||
"    return system_prompt, user_prompt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "307ca02b-2df6-4996-85e7-d073f74592f5",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ---------- STEP 4: Call OpenAI ----------\n",
|
||||
"def analyze_transactions(pdf_path):\n",
|
||||
"    \"\"\"Run the credit-card statement at pdf_path through the chat model.\n",
|
||||
"\n",
|
||||
"    Transactions are extracted with extract_transactions_CreditCard_from_pdf(),\n",
|
||||
"    turned into prompts by build_prompts() and sent to the chat completions\n",
|
||||
"    API with a JSON-object response format.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        pdf_path: Path of the statement PDF to analyze.\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        The model's reply decoded with json.loads().\n",
|
||||
"    \"\"\"\n",
|
||||
"    prompts = build_prompts(extract_transactions_CreditCard_from_pdf(pdf_path))\n",
|
||||
"    system_prompt, user_prompt = prompts\n",
|
||||
"    chat_messages = [\n",
|
||||
"        {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
"        {\"role\": \"user\", \"content\": user_prompt},\n",
|
||||
"    ]\n",
|
||||
"\n",
|
||||
"    # OpenAI() is created without arguments, so the credentials are expected\n",
|
||||
"    # to come from the environment (OPENAI_API_KEY).\n",
|
||||
"    client = OpenAI()\n",
|
||||
"    completion = client.chat.completions.create(\n",
|
||||
"        model=\"gpt-4o-mini\",\n",
|
||||
"        messages=chat_messages,\n",
|
||||
"        response_format={\"type\": \"json_object\"},  # request a JSON object reply\n",
|
||||
"    )\n",
|
||||
"\n",
|
||||
"    return json.loads(completion.choices[0].message.content)\n",
|
||||
"\n",
|
||||
"# ---------- MAIN ----------\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
"    cc_pdf_file = \"cc_statement.pdf\"\n",
|
||||
"\n",
|
||||
"    # Parsed separately so the raw transactions can be inspected; note that\n",
|
||||
"    # analyze_transactions() extracts them from the PDF again on its own.\n",
|
||||
"    cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
|
||||
"\n",
|
||||
"    # To debug in case of failures, uncomment:\n",
|
||||
"    # print(cc_transactions, len(cc_transactions))\n",
|
||||
"    # system_prompt, user_prompt = build_prompts(cc_transactions)\n",
|
||||
"    # print(system_prompt, user_prompt)\n",
|
||||
"\n",
|
||||
"    # Ask the model for the spending analysis and print it between banners.\n",
|
||||
"    analysis = analyze_transactions(cc_pdf_file)\n",
|
||||
"    print(\n",
|
||||
"        \"=========================================\",\n",
|
||||
"        \"=== Top 5 Spending Habits & Insights ====\",\n",
|
||||
"        \"=========================================\",\n",
|
||||
"        json.dumps(analysis, indent=2),\n",
|
||||
"        \"=========================================\",\n",
|
||||
"        \"=========================================\",\n",
|
||||
"        sep=\"\\n\",\n",
|
||||
"    )\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "831922f4-5efd-4cba-9975-54767b65f6d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user