LLM_Engineering_OLD/week1/community-contributions/day-1-bank-account-summarization.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pdfplumber\n",
    "import re\n",
    "import json\n",
    "\n",
    "def parse_transaction_line(line):\n",
    "    \"\"\"Parse one statement line that carries both an amount and a running balance.\n",
    "\n",
    "    The stripped line must consist of a two-digit/two-digit date (e.g. 01/15),\n",
    "    a free-text description, an amount and a balance, in that order. Each money\n",
    "    field is an optional minus sign, digits and commas, then exactly two decimals.\n",
    "\n",
    "    Args:\n",
    "        line: One line of text taken from the statement.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the string fields 'date', 'description', 'amount' and\n",
    "        'balance', or None when the line does not have that shape.\n",
    "    \"\"\"\n",
    "    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
    "    found = re.match(pattern, line.strip())\n",
    "    if found is None:\n",
    "        return None\n",
    "\n",
    "    # Pair the capture groups with their field names in pattern order.\n",
    "    field_names = ('date', 'description', 'amount', 'balance')\n",
    "    record = dict(zip(field_names, found.groups()))\n",
    "    record['description'] = record['description'].strip()\n",
    "    return record\n",
    "\n",
    "def parse_Credit_Card_transaction_line(line):\n",
    "    \"\"\"Parse one statement line that carries a single amount and no balance.\n",
    "\n",
    "    The stripped line must consist of a two-digit/two-digit date (e.g. 03/07),\n",
    "    a free-text description and one trailing amount (optional minus sign,\n",
    "    digits and commas, exactly two decimals).\n",
    "\n",
    "    Args:\n",
    "        line: One line of text taken from the statement.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the string fields 'date', 'description' and 'amount',\n",
    "        or None when the line does not have that shape.\n",
    "    \"\"\"\n",
    "    pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
    "    found = re.match(pattern, line.strip())\n",
    "    if found is None:\n",
    "        return None\n",
    "\n",
    "    record = {}\n",
    "    record['date'] = found.group(1)\n",
    "    record['description'] = found.group(2).strip()\n",
    "    record['amount'] = found.group(3)\n",
    "    return record\n",
    "\n",
    "# \n",
    "def _extract_transactions_from_pdf(pdf_path, parse_line):\n",
    "    \"\"\"Collect every text line of a PDF that `parse_line` accepts.\n",
    "\n",
    "    Args:\n",
    "        pdf_path: Path of the statement PDF, handed to pdfplumber.open.\n",
    "        parse_line: Callable that takes one text line and returns a dict for a\n",
    "            transaction line, or a falsy value (None) for any other line.\n",
    "\n",
    "    Returns:\n",
    "        List of the parsed dicts, in page order and then line order.\n",
    "    \"\"\"\n",
    "    transactions = []\n",
    "\n",
    "    with pdfplumber.open(pdf_path) as pdf:\n",
    "        for page in pdf.pages:\n",
    "            # Depending on the pdfplumber version, extract_text() can return None\n",
    "            # for a page without extractable text; treat that as an empty page\n",
    "            # instead of failing on None.split.\n",
    "            text = page.extract_text() or \"\"\n",
    "            for line in text.split(\"\\n\"):\n",
    "                parsed = parse_line(line)\n",
    "                if parsed:\n",
    "                    transactions.append(parsed)\n",
    "    return transactions\n",
    "\n",
    "def extract_transactions_CA_from_pdf(pdf_path):\n",
    "    \"\"\"Extract date/description/amount/balance transactions from a statement PDF.\n",
    "\n",
    "    Lines are recognised with parse_transaction_line. (\"CA\" presumably stands\n",
    "    for a checking/current account - TODO confirm.)\n",
    "\n",
    "    Args:\n",
    "        pdf_path: Path of the statement PDF.\n",
    "\n",
    "    Returns:\n",
    "        List of dicts with 'date', 'description', 'amount' and 'balance'.\n",
    "    \"\"\"\n",
    "    return _extract_transactions_from_pdf(pdf_path, parse_transaction_line)\n",
    "\n",
    "def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
    "    \"\"\"Extract date/description/amount transactions from a credit-card statement PDF.\n",
    "\n",
    "    Lines are recognised with parse_Credit_Card_transaction_line.\n",
    "\n",
    "    Args:\n",
    "        pdf_path: Path of the statement PDF.\n",
    "\n",
    "    Returns:\n",
    "        List of dicts with 'date', 'description' and 'amount'.\n",
    "    \"\"\"\n",
    "    return _extract_transactions_from_pdf(pdf_path, parse_Credit_Card_transaction_line)\n",
    "# print(transactions, len(transactions))  # check first 10 extracted lines\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load environment variables in a file called .env\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import os\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "# Check the key before any API call is made.\n",
    "# The whitespace check runs before the prefix check: a key with leading spaces or\n",
    "# tabs would otherwise be reported as \"doesn't start sk-proj-\", hiding the real problem.\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "elif not api_key.startswith(\"sk-proj-\"):\n",
    "    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "769ee512-75f5-480a-9407-f9c4cd46b679",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# ---------- STEP 3: Build prompts ----------\n",
    "\n",
    "def build_prompts(transactions):\n",
    "    \"\"\"Build the system and user prompts for the spending analysis.\n",
    "\n",
    "    Args:\n",
    "        transactions: Iterable of dicts that each provide the keys 'date',\n",
    "            'description' and 'amount'; any other keys are ignored.\n",
    "\n",
    "    Returns:\n",
    "        A (system_prompt, user_prompt) tuple of strings. The system prompt is\n",
    "        fixed text; the user prompt lists one bullet line per transaction.\n",
    "    \"\"\"\n",
    "    system_prompt = \"\"\"\n",
    "You are a personal financial assistant.\n",
    "Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
    "Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
    "\n",
    "Your responsibilities:\n",
    "\n",
    "Categorize all transactions and compute total spending per category.\n",
    "\n",
    "Identify the top 5 categories by total spending.\n",
    "\n",
    "Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
    "\n",
    "For these, group transactions by merchant/description and count frequency.\n",
    "\n",
    "Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
    "\n",
    "Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
    "\n",
    "Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n",
    "\n",
    "Big categories (e.g., Rent, Travel, Entertainment).\n",
    "\n",
    "Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
    "\n",
    "The output should be a valid JSON object with this structure:\n",
    "{\n",
    "  \"summary\": {\n",
    "      \"Food\": <amount>,\n",
    "      \"Clothing\": <amount>,\n",
    "      \"Rent\": <amount>,\n",
    "      \"Utilities\": <amount>,\n",
    "      \"Entertainment\": <amount>,\n",
    "      \"Travel\": <amount>,\n",
    "      \"Health\": <amount>,\n",
    "      \"Miscellaneous\": <amount>,\n",
    "      \"Others\": <amount>\n",
    "  },\n",
    "  \"total_expenses\": <total>,\n",
    "  \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
    "  \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
    "  \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
    "  \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
    "}\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "    # The user prompt is a header, one bullet line per transaction, then a closing request.\n",
    "    header = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
    "    bullet_lines = [\n",
    "        f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
    "        for txn in transactions\n",
    "    ]\n",
    "    closing = \"\"\"\n",
    "Please analyze these transactions according to the instructions in the system prompt.\n",
    "\"\"\"\n",
    "    user_prompt = header + \"\".join(bullet_lines) + closing\n",
    "\n",
    "    return system_prompt, user_prompt\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "307ca02b-2df6-4996-85e7-d073f74592f5",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# ---------- STEP 4: Call OpenAI ----------\n",
    "def analyze_transactions(pdf_path, model=\"gpt-4o-mini\"):\n",
    "    \"\"\"Extract credit-card transactions from a PDF and have an OpenAI model analyse them.\n",
    "\n",
    "    Args:\n",
    "        pdf_path: Path of the credit-card statement PDF.\n",
    "        model: Name of the chat-completions model to call. Defaults to\n",
    "            \"gpt-4o-mini\", the model this function always used before.\n",
    "\n",
    "    Returns:\n",
    "        The model's reply decoded from JSON into Python objects.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the reply carries no text content, or (as\n",
    "            json.JSONDecodeError, a ValueError subclass) if it is not valid JSON.\n",
    "    \"\"\"\n",
    "    transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n",
    "    system_prompt, user_prompt = build_prompts(transactions)\n",
    "\n",
    "    client = OpenAI()  # assumes OPENAI_API_KEY is set in env\n",
    "\n",
    "    response = client.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": system_prompt},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ],\n",
    "        response_format={\"type\": \"json_object\"}  # ensures valid JSON\n",
    "    )\n",
    "\n",
    "    result = response.choices[0].message.content\n",
    "    if not result:\n",
    "        # Fail with a clear message rather than an opaque TypeError from json.loads(None).\n",
    "        raise ValueError(\"The model returned no content to parse as JSON\")\n",
    "    return json.loads(result)\n",
    "\n",
    "# ---------- MAIN ----------\n",
    "if __name__ == \"__main__\":\n",
    "    # Relative path of the credit-card statement PDF to analyse.\n",
    "    cc_pdf_file = \"cc_statement.pdf\"\n",
    "    # To debug a failure, run these lines once cc_transactions has been assigned below:\n",
    "    # print(cc_transactions, len(cc_transactions))\n",
    "    # system_prompt, user_prompt = build_prompts(cc_transactions)\n",
    "    # print(system_prompt, user_prompt)\n",
    "\n",
    "    # cc_transactions is not used by the analysis call; analyze_transactions()\n",
    "    # extracts the transactions from the PDF again on its own.\n",
    "    cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
    "    analysis = analyze_transactions(cc_pdf_file)\n",
    "    # Pretty-print the decoded JSON analysis between banner lines.\n",
    "    print(\"=========================================\")\n",
    "    print(\"=== Top 5 Spending Habits & Insights ====\")\n",
    "    print(\"=========================================\")\n",
    "    print(json.dumps(analysis, indent=2))\n",
    "    print(\"=========================================\")\n",
    "    print(\"=========================================\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "831922f4-5efd-4cba-9975-54767b65f6d6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}