Files
LLM_Engineering_OLD/week1/community-contributions/day-1-bank-account-summarization.ipynb
2025-08-22 11:06:01 -07:00

271 lines
9.4 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"import pdfplumber\n",
"import re\n",
"import json\n",
"\n",
"def parse_transaction_line(line):\n",
"    \"\"\"Split one statement line that carries a running balance into its fields.\n",
"\n",
"    The stripped line must consist of a two-digit/two-digit date token, a free-text\n",
"    description, an amount and a balance. Both numbers need exactly two decimals\n",
"    and may contain commas and a leading minus sign.\n",
"\n",
"    Args:\n",
"        line: A single line of text taken from the statement.\n",
"\n",
"    Returns:\n",
"        A dict with the string values 'date', 'description', 'amount' and\n",
"        'balance', or None when the line does not have the expected shape.\n",
"    \"\"\"\n",
"    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
"    found = re.match(pattern, line.strip())\n",
"    if found is None:\n",
"        return None\n",
"\n",
"    # Pair the capture groups with their field names in pattern order.\n",
"    record = dict(zip(('date', 'description', 'amount', 'balance'), found.groups()))\n",
"    record['description'] = record['description'].strip()\n",
"    return record\n",
"\n",
"def parse_Credit_Card_transaction_line(line):\n",
"    \"\"\"Split one statement line without a balance column into its fields.\n",
"\n",
"    The stripped line must consist of a two-digit/two-digit date token, a free-text\n",
"    description and a trailing amount with exactly two decimals (commas and a\n",
"    leading minus sign are allowed).\n",
"\n",
"    Args:\n",
"        line: A single line of text taken from the statement.\n",
"\n",
"    Returns:\n",
"        A dict with the string values 'date', 'description' and 'amount', or\n",
"        None when the line does not have the expected shape.\n",
"    \"\"\"\n",
"    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
"    hit = re.match(pattern, line.strip())\n",
"    if not hit:\n",
"        return None\n",
"\n",
"    return {\n",
"        'date': hit.group(1),\n",
"        'description': hit.group(2).strip(),\n",
"        'amount': hit.group(3),\n",
"    }\n",
"\n",
"# PDF extraction: walk every page and keep only the lines that parse as transactions.\n",
"def extract_transactions_CA_from_pdf(pdf_path):\n",
"    \"\"\"Collect every line of a PDF that parse_transaction_line() recognises.\n",
"\n",
"    Args:\n",
"        pdf_path: Location of the PDF statement, passed to pdfplumber.open().\n",
"\n",
"    Returns:\n",
"        A list of the dicts returned by parse_transaction_line(), in page and\n",
"        line order. The list is empty when no line matches.\n",
"    \"\"\"\n",
"    transactions = []\n",
"\n",
"    with pdfplumber.open(pdf_path) as pdf:\n",
"        for page in pdf.pages:\n",
"            # A page without a text layer (e.g. a scanned image) may yield None\n",
"            # instead of a string; treat it as a page with no lines.\n",
"            text = page.extract_text() or \"\"\n",
"            for line in text.split(\"\\n\"):\n",
"                parsed = parse_transaction_line(line)\n",
"                if parsed:\n",
"                    transactions.append(parsed)\n",
"    return transactions\n",
"\n",
"def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
"    \"\"\"Collect every line of a PDF that parse_Credit_Card_transaction_line() recognises.\n",
"\n",
"    Args:\n",
"        pdf_path: Location of the PDF statement, passed to pdfplumber.open().\n",
"\n",
"    Returns:\n",
"        A list of the dicts returned by parse_Credit_Card_transaction_line(), in\n",
"        page and line order. The list is empty when no line matches.\n",
"    \"\"\"\n",
"    transactions = []\n",
"\n",
"    with pdfplumber.open(pdf_path) as pdf:\n",
"        for page in pdf.pages:\n",
"            # A page without a text layer (e.g. a scanned image) may yield None\n",
"            # instead of a string; treat it as a page with no lines.\n",
"            text = page.extract_text() or \"\"\n",
"            for line in text.split(\"\\n\"):\n",
"                parsed = parse_Credit_Card_transaction_line(line)\n",
"                if parsed:\n",
"                    transactions.append(parsed)\n",
"    return transactions\n",
"# print(transactions, len(transactions)) # check first 10 extracted lines\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import os\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key. Whitespace is tested before the prefix so that a key with leading\n",
"# spaces or tabs is reported as a whitespace problem instead of a wrong prefix.\n",
"\n",
"if not api_key:\n",
"    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif api_key.strip() != api_key:\n",
"    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
"    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"else:\n",
"    print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "769ee512-75f5-480a-9407-f9c4cd46b679",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# ---------- STEP 3: Build prompts ----------\n",
"\n",
"def build_prompts(transactions):\n",
"    \"\"\"Build the system and user prompts for the spending analysis.\n",
"\n",
"    Args:\n",
"        transactions: Iterable of dicts that each provide the keys 'date',\n",
"            'description' and 'amount' (any extra keys are ignored).\n",
"\n",
"    Returns:\n",
"        A (system_prompt, user_prompt) tuple of strings. The system prompt holds\n",
"        the fixed instructions and the JSON output template; the user prompt\n",
"        lists one bullet line per transaction.\n",
"    \"\"\"\n",
"    system_prompt = \"\"\"\n",
"You are a personal financial assistant.\n",
"Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
"Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
"\n",
"Your responsibilities:\n",
"\n",
"Categorize all transactions and compute total spending per category.\n",
"\n",
"Identify the top 5 categories by total spending.\n",
"\n",
"Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
"\n",
"For these, group transactions by merchant/description and count frequency.\n",
"\n",
"Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
"\n",
"Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
"\n",
"Suggest 2-3 actionable recommendations to reduce spending, targeting both:\n",
"\n",
"Big categories (e.g., Rent, Travel, Entertainment).\n",
"\n",
"Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
"\n",
"The output should be a valid JSON object with this structure:\n",
"{\n",
" \"summary\": {\n",
" \"Food\": <amount>,\n",
" \"Clothing\": <amount>,\n",
" \"Rent\": <amount>,\n",
" \"Utilities\": <amount>,\n",
" \"Entertainment\": <amount>,\n",
" \"Travel\": <amount>,\n",
" \"Health\": <amount>,\n",
" \"Miscellaneous\": <amount>,\n",
" \"Others\": <amount>\n",
" },\n",
" \"total_expenses\": <total>,\n",
" \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
" \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
" \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
" \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
"}\n",
"\n",
"\"\"\"\n",
"\n",
"    header = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
"    # Only date, description and amount are forwarded to the model.\n",
"    txn_lines = [\n",
"        f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
"        for txn in transactions\n",
"    ]\n",
"    footer = \"\\nPlease analyze these transactions according to the instructions in the system prompt.\\n\"\n",
"\n",
"    # Join once instead of growing the string with += for every transaction.\n",
"    user_prompt = header + \"\".join(txn_lines) + footer\n",
"\n",
"    return system_prompt, user_prompt\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "307ca02b-2df6-4996-85e7-d073f74592f5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# ---------- STEP 4: Call OpenAI ----------\n",
"def analyze_transactions(pdf_path, model=\"gpt-4o-mini\"):\n",
"    \"\"\"Extract transactions from a PDF statement and ask the model to analyse them.\n",
"\n",
"    Args:\n",
"        pdf_path: Location of the statement PDF, handed to\n",
"            extract_transactions_CreditCard_from_pdf().\n",
"        model: Name of the chat model to call. Defaults to \"gpt-4o-mini\".\n",
"\n",
"    Returns:\n",
"        The model's reply decoded with json.loads().\n",
"\n",
"    Raises:\n",
"        ValueError: If no transaction could be parsed from the PDF, if the model\n",
"            returns an empty reply, or if the reply is not valid JSON\n",
"            (json.JSONDecodeError is a ValueError subclass).\n",
"    \"\"\"\n",
"    transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n",
"    if not transactions:\n",
"        # Without any transactions the model would have nothing real to analyse.\n",
"        raise ValueError(f\"No transactions could be parsed from {pdf_path!r}\")\n",
"\n",
"    system_prompt, user_prompt = build_prompts(transactions)\n",
"\n",
"    client = OpenAI()  # assumes OPENAI_API_KEY is set in env\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=model,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system_prompt},\n",
"            {\"role\": \"user\", \"content\": user_prompt},\n",
"        ],\n",
"        response_format={\"type\": \"json_object\"},  # ask for a JSON object reply\n",
"    )\n",
"\n",
"    result = response.choices[0].message.content\n",
"    if not result:\n",
"        # The content can be missing; fail with a clear message instead of a\n",
"        # TypeError from json.loads(None).\n",
"        raise ValueError(\"The model returned an empty response\")\n",
"    return json.loads(result)\n",
"\n",
"# ---------- MAIN ----------\n",
"if __name__ == \"__main__\":\n",
"    cc_pdf_file = \"cc_statement.pdf\"\n",
"\n",
"    # To debug in case of failures, uncomment:\n",
"    # cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
"    # print(cc_transactions, len(cc_transactions))\n",
"    # system_prompt, user_prompt = build_prompts(cc_transactions)\n",
"    # print(system_prompt, user_prompt)\n",
"\n",
"    # analyze_transactions() extracts the transactions from the PDF itself, so the\n",
"    # statement is parsed only once here.\n",
"    analysis = analyze_transactions(cc_pdf_file)\n",
"\n",
"    banner = \"=========================================\"\n",
"    print(banner)\n",
"    print(\"=== Top 5 Spending Habits & Insights ====\")\n",
"    print(banner)\n",
"    print(json.dumps(analysis, indent=2))\n",
"    print(banner)\n",
"    print(banner)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "831922f4-5efd-4cba-9975-54767b65f6d6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}