{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "f60dab2a-a377-4761-8be3-69a3b8124ca6", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "import pdfplumber\n", "import re\n", "import json\n", "\n", "def parse_transaction_line(line):\n", " # More specific pattern that captures each component'\n", " pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n", " match = re.match(pattern, line.strip())\n", " \n", " if match:\n", " date, description, amount, balance = match.groups()\n", " return {\n", " 'date': date,\n", " 'description': description.strip(),\n", " 'amount': amount,\n", " 'balance': balance\n", " }\n", " return None\n", "\n", "def parse_Credit_Card_transaction_line(line):\n", " # More specific pattern that captures each component'\n", " pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n", " match = re.match(pattern, line.strip())\n", " \n", " if match:\n", " date, description, amount = match.groups()\n", " return {\n", " 'date': date,\n", " 'description': description.strip(),\n", " 'amount': amount\n", " }\n", " return None\n", "\n", "# \n", "def extract_transactions_CA_from_pdf(pdf_path):\n", " transactions = []\n", " \n", " with pdfplumber.open(pdf_path) as pdf:\n", " for page in pdf.pages:\n", " text = page.extract_text()\n", " for line in text.split(\"\\n\"):\n", " parsed = parse_transaction_line(line)\n", " if parsed:\n", " transactions.append(parsed)\n", " return transactions\n", "\n", "def extract_transactions_CreditCard_from_pdf(pdf_path):\n", " transactions = []\n", " \n", " with pdfplumber.open(pdf_path) as pdf:\n", " for page in pdf.pages:\n", " text = page.extract_text()\n", " for line in text.split(\"\\n\"):\n", " parsed = parse_Credit_Card_transaction_line(line)\n", " if parsed:\n", " transactions.append(parsed)\n", " return transactions\n", "# print(transactions, len(transactions)) # check first 10 extracted lines\n" ] }, { "cell_type": "code", "execution_count": null, "id": "82c34eac-fc30-41d6-8325-77efc48d0dd8", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# Load environment variables in a file called .env\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "import os\n", "\n", "load_dotenv(override=True)\n", "api_key = os.getenv('OPENAI_API_KEY')\n", "\n", "# Check the key\n", "\n", "if not api_key:\n", " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", "elif not api_key.startswith(\"sk-proj-\"):\n", " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", "elif api_key.strip() != api_key:\n", " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", "else:\n", " print(\"API key found and looks good so far!\")" ] }, { "cell_type": "code", "execution_count": 49, "id": "769ee512-75f5-480a-9407-f9c4cd46b679", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# ---------- STEP 3: Build prompts ----------\n", "\n", "def build_prompts(transactions):\n", " system_prompt = \"\"\"\n", "You are a personal financial assistant.\n", "Your job is to analyze bank transactions, categorize each expense into categories such as:\n", "Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n", "\n", "Your responsibilities:\n", "\n", "Categorize all transactions and compute total spending per category.\n", "\n", "Identify the top 5 categories by total spending.\n", "\n", "Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n", "\n", "For these, group transactions by merchant/description and count frequency.\n", "\n", "Highlight the top 5 frequent purchases, with both frequency and total spend.\n", "\n", "Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n", "\n", "Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n", "\n", "Big categories (e.g., Rent, Travel, Entertainment).\n", "\n", "Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n", "\n", "The output should be a valid JSON object with this structure:\n", "{\n", " \"summary\": {\n", " \"Food\": ,\n", " \"Clothing\": ,\n", " \"Rent\": ,\n", " \"Utilities\": ,\n", " \"Entertainment\": ,\n", " \"Travel\": ,\n", " \"Health\": ,\n", " \"Miscellaneous\": ,\n", " \"Others\": \n", " },\n", " \"total_expenses\": ,\n", " \"top_5_categories\": [ {\"category\": , \"amount\": } ],\n", " \"top_5_frequent_purchases\": [ {\"item\": , \"count\": , \"total\": } ],\n", " \"insights\": \"\",\n", " \"recommendations\": [ \"\", \"\", \"\" ]\n", "}\n", "\n", "\"\"\"\n", "\n", " user_prompt = \"Here are my bank account transactions for the past few months:\\n\\n\"\n", " for txn in transactions:\n", " user_prompt += f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n", "\n", " user_prompt += \"\"\"\n", "Please analyze these transactions according to the instructions in the system prompt.\n", "\"\"\"\n", "\n", " return system_prompt, user_prompt\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "307ca02b-2df6-4996-85e7-d073f74592f5", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# ---------- STEP 4: Call OpenAI ----------\n", "def analyze_transactions(pdf_path):\n", " transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n", " system_prompt, user_prompt = build_prompts(transactions)\n", "\n", " client = OpenAI() # assumes OPENAI_API_KEY is set in env\n", "\n", " response = client.chat.completions.create(\n", " model = \"gpt-4o-mini\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": system_prompt},\n", " {\"role\": \"user\", \"content\": user_prompt}\n", " ],\n", " response_format={\"type\": \"json_object\"} # ensures valid JSON\n", " )\n", "\n", " result = response.choices[0].message.content\n", " return json.loads(result)\n", "\n", "# ---------- MAIN ----------\n", "if __name__ == \"__main__\":\n", " cc_pdf_file = \"cc_statement.pdf\"\n", " # To Debug in case of failures\n", " # transactions = extract_transactions_from_pdf(pdf_file)\n", " # print(cc_transactions,len(cc_transactions))\n", " # system_prompt, user_prompt = build_prompts(cc_transactions)\n", " # print(system_prompt, user_prompt)\n", "\n", " # Analyse the function to create a smart alert\n", " cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n", " analysis = analyze_transactions(cc_pdf_file)\n", " print(\"=========================================\")\n", " print(\"=== Top 5 Spending Habits & Insights ====\")\n", " print(\"=========================================\")\n", " print(json.dumps(analysis, indent=2))\n", " print(\"=========================================\")\n", " print(\"=========================================\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "831922f4-5efd-4cba-9975-54767b65f6d6", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }