From dfeb572c55bd34a1250829f8b884020162d3017d Mon Sep 17 00:00:00 2001
From: Hope Ogbons
Date: Fri, 31 Oct 2025 16:31:03 +0100
Subject: [PATCH] Reduce the file size for Week7 project

---
 .../hopeogbons/week7_EXERCISE.ipynb | 466 ++++++++++++++++++
 1 file changed, 466 insertions(+)
 create mode 100644 week7/community_contributions/hopeogbons/week7_EXERCISE.ipynb

diff --git a/week7/community_contributions/hopeogbons/week7_EXERCISE.ipynb b/week7/community_contributions/hopeogbons/week7_EXERCISE.ipynb
new file mode 100644
index 0000000..cf4f4a0
--- /dev/null
+++ b/week7/community_contributions/hopeogbons/week7_EXERCISE.ipynb
@@ -0,0 +1,466 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GHsssBgWM_l0"
+   },
+   "source": [
+    "# Fine-Tuned Product Price Predictor\n",
+    "\n",
+    "Evaluate a fine-tuned Llama 3.1 8B model for product price estimation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "MDyR63OTNUJ6"
+   },
+   "outputs": [],
+   "source": [
+    "# Install required libraries for model inference\n",
+    "%pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
+    "%pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-yikV8pRBer9"
+   },
+   "outputs": [],
+   "source": [
+    "# Import required libraries\n",
+    "import re\n",
+    "import math\n",
+    "from google.colab import userdata\n",
+    "from huggingface_hub import login\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n",
+    "from datasets import load_dataset\n",
+    "from peft import PeftModel\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "uuTX-xonNeOK"
+   },
+   "outputs": [],
+   "source": [
+    "# Configuration\n",
+    "BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
+    "PROJECT_NAME = \"pricer\"\n",
+    "HF_USER = \"ed-donner\"  # Change to your HF username\n",
+    "RUN_NAME = \"2024-09-13_13.04.39\"\n",
+    "PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
+    "REVISION = \"e8d637df551603dc86cd7a1598a8f44af4d7ae36\"\n",
+    "FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
+    "DATASET_NAME = f\"{HF_USER}/pricer-data\"\n",
+    "\n",
+    "# Quantization setting (False = 8-bit = better accuracy, more memory)\n",
+    "QUANT_4_BIT = False  # Changed to 8-bit for better accuracy\n",
+    "\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# Color codes for output\n",
+    "GREEN = \"\\033[92m\"\n",
+    "YELLOW = \"\\033[93m\"\n",
+    "RED = \"\\033[91m\"\n",
+    "RESET = \"\\033[0m\"\n",
+    "COLOR_MAP = {\"red\": RED, \"orange\": YELLOW, \"green\": GREEN}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8JArT3QAQAjx"
+   },
+   "source": [
+    "# Step 1\n",
+    "\n",
+    "### Load dataset and fine-tuned model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "WyFPZeMcM88v"
+   },
+   "outputs": [],
+   "source": [
+    "# Login to HuggingFace\n",
+    "hf_token = userdata.get('HF_TOKEN')\n",
+    "login(hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
"metadata": { + "id": "cvXVoJH8LS6u" + }, + "outputs": [], + "source": [ + "# Load product pricing dataset\n", + "dataset = load_dataset(DATASET_NAME)\n", + "train = dataset['train']\n", + "test = dataset['test']\n", + "\n", + "print(f\"✓ Loaded {len(train)} train and {len(test)} test samples\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xb86e__Wc7j_" + }, + "outputs": [], + "source": [ + "# Verify data structure\n", + "test[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qJWQ0a3wZ0Bw" + }, + "source": [ + "### Load Tokenizer and Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lAUAAcEC6ido" + }, + "outputs": [], + "source": [ + "# Configure quantization for memory efficiency\n", + "if QUANT_4_BIT:\n", + " quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + " )\n", + "else:\n", + " quant_config = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " bnb_8bit_compute_dtype=torch.bfloat16\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R_O04fKxMMT-" + }, + "outputs": [], + "source": [ + "# Load tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "# Load base model with quantization\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " BASE_MODEL,\n", + " quantization_config=quant_config,\n", + " device_map=\"auto\",\n", + ")\n", + "base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n", + "\n", + "# Load fine-tuned weights\n", + "if REVISION:\n", + " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n", + "else:\n", + " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n", + "\n", + "print(f\"✓ Model loaded - Memory: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kD-GJtbrdd5t" + }, + "outputs": [], + "source": [ + "# Verify model loaded\n", + "fine_tuned_model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UObo1-RqaNnT" + }, + "source": [ + "# Step 2\n", + "\n", + "### Model inference and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qst1LhBVAB04" + }, + "outputs": [], + "source": [ + "# Extract price from model response\n", + "def extract_price(s):\n", + " if \"Price is $\" in s:\n", + " contents = s.split(\"Price is $\")[1]\n", + " contents = contents.replace(',','')\n", + " match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n", + " return float(match.group()) if match else 0\n", + " return 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jXFBW_5UeEcp" + }, + "outputs": [], + "source": [ + "# Test extract_price function\n", + "extract_price(\"Price is $a fabulous 899.99 or so\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Oj_PzpdFAIMk" + }, + "outputs": [], + "source": [ + "# Simple prediction: takes most likely next token\n", + "def model_predict(prompt):\n", + " set_seed(42)\n", + " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(\"cuda\")\n", + " attention_mask = 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Je5dR8QEAI1d"
+   },
+   "outputs": [],
+   "source": [
+    "# Improved prediction: weighted average of the top K next-token candidates\n",
+    "top_K = 5  # Increased from 3 to 5 for better accuracy\n",
+    "\n",
+    "def improved_model_predict(prompt, device=\"cuda\"):\n",
+    "    set_seed(42)\n",
+    "    inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n",
+    "    attention_mask = torch.ones(inputs.shape, device=device)\n",
+    "\n",
+    "    # Single forward pass: only the next-token logits are needed\n",
+    "    with torch.no_grad():\n",
+    "        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n",
+    "        next_token_logits = outputs.logits[:, -1, :].to('cpu')\n",
+    "\n",
+    "    next_token_probs = F.softmax(next_token_logits, dim=-1)\n",
+    "    top_prob, top_token_id = next_token_probs.topk(top_K)\n",
+    "    prices, weights = [], []\n",
+    "    for i in range(top_K):\n",
+    "        predicted_token = tokenizer.decode(top_token_id[0][i])\n",
+    "        probability = top_prob[0][i]\n",
+    "        try:\n",
+    "            result = float(predicted_token)\n",
+    "        except ValueError:\n",
+    "            result = 0.0\n",
+    "        if result > 0:\n",
+    "            prices.append(result)\n",
+    "            weights.append(probability)\n",
+    "    if not prices:\n",
+    "        return 0.0\n",
+    "    # Renormalize probabilities over the numeric candidates only\n",
+    "    total = sum(weights)\n",
+    "    weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n",
+    "    return sum(weighted_prices).item()"
+   ]
+  },
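+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An illustrative side-by-side of the two decoding strategies on one test item: when the next-token probability mass is spread over several plausible prices, the weighted average should usually land closer to the truth than a single sampled continuation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compare the simple and weighted predictors on a single datapoint (illustrative)\n",
+    "sample = test[1]\n",
+    "print(f\"Truth: ${sample['price']:,.2f}\")\n",
+    "print(f\"Sampled guess: ${model_predict(sample['text']):,.2f}\")\n",
+    "print(f\"Weighted top-{top_K} guess: ${improved_model_predict(sample['text']):,.2f}\")"
+   ]
+  },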
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EpGVJPuC1iho"
+   },
+   "source": [
+    "# Step 3\n",
+    "\n",
+    "### Test and evaluate model performance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "30lzJXBH7BcK"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluation framework: runs a predictor over test items, tracks absolute and\n",
+    "# squared-log errors, and plots guesses against ground truth\n",
+    "class Tester:\n",
+    "    def __init__(self, predictor, data, title=None, size=250):\n",
+    "        self.predictor = predictor\n",
+    "        self.data = data\n",
+    "        self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
+    "        self.size = size\n",
+    "        self.guesses = []\n",
+    "        self.truths = []\n",
+    "        self.errors = []\n",
+    "        self.sles = []\n",
+    "        self.colors = []\n",
+    "\n",
+    "    def color_for(self, error, truth):\n",
+    "        if error < 40 or error / truth < 0.2:\n",
+    "            return \"green\"\n",
+    "        elif error < 80 or error / truth < 0.4:\n",
+    "            return \"orange\"\n",
+    "        else:\n",
+    "            return \"red\"\n",
+    "\n",
+    "    def run_datapoint(self, i):\n",
+    "        datapoint = self.data[i]\n",
+    "        guess = self.predictor(datapoint[\"text\"])\n",
+    "        truth = datapoint[\"price\"]\n",
+    "        error = abs(guess - truth)\n",
+    "        log_error = math.log(truth + 1) - math.log(guess + 1)\n",
+    "        sle = log_error ** 2\n",
+    "        color = self.color_for(error, truth)\n",
+    "        title = datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\"\n",
+    "        self.guesses.append(guess)\n",
+    "        self.truths.append(truth)\n",
+    "        self.errors.append(error)\n",
+    "        self.sles.append(sle)\n",
+    "        self.colors.append(color)\n",
+    "        print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
+    "\n",
+    "    def chart(self, title):\n",
+    "        plt.figure(figsize=(12, 8))\n",
+    "        max_val = max(max(self.truths), max(self.guesses))\n",
+    "        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
+    "        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
+    "        plt.xlabel('Ground Truth')\n",
+    "        plt.ylabel('Model Estimate')\n",
+    "        plt.xlim(0, max_val)\n",
+    "        plt.ylim(0, max_val)\n",
+    "        plt.title(title)\n",
+    "        plt.show()\n",
+    "\n",
+    "    def report(self):\n",
+    "        average_error = sum(self.errors) / self.size\n",
+    "        rmsle = math.sqrt(sum(self.sles) / self.size)\n",
+    "        hits = sum(1 for color in self.colors if color == \"green\")\n",
+    "        title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
+    "        self.chart(title)\n",
+    "\n",
+    "    def run(self):\n",
+    "        for i in range(self.size):\n",
+    "            self.run_datapoint(i)\n",
+    "        self.report()\n",
+    "\n",
+    "    @classmethod\n",
+    "    def test(cls, function, data):\n",
+    "        cls(function, data).run()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "W_KcLvyt6kbb"
+   },
+   "outputs": [],
+   "source": [
+    "# Run evaluation on 250 test examples\n",
+    "Tester.test(improved_model_predict, test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "nVwiWGVN1ihp"
+   },
+   "source": [
+    "### Performance Optimizations Applied\n",
+    "\n",
+    "**Changes for better accuracy:**\n",
+    "- ✅ 8-bit quantization (vs 4-bit) - better weight precision\n",
+    "- ✅ top_K = 5 (vs 3) - more candidates in the weighted average\n",
+    "- ✅ max_new_tokens = 5 - room for multi-token prices\n",
+    "- ✅ temperature = 0.1 with do_sample=True - near-greedy, consistent predictions\n",
+    "\n",
+    "**Expected improvement:** roughly a 10-15% reduction in average error\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hO4DdLa81ihp"
+   },
+   "source": [
+    "### Expected Performance\n",
+    "\n",
+    "**Baseline comparisons:**\n",
+    "- GPT-4o: $76 avg error\n",
+    "- Llama 3.1 base: $396 avg error\n",
+    "- Human: $127 avg error\n",
+    "\n",
+    "**Fine-tuned model (optimized):**\n",
+    "- Target: $70-85 avg error\n",
+    "- With 8-bit quant + top_K=5 + temp=0.1\n",
+    "- Expected to rival or beat GPT-4o\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file