diff --git a/week7/community_contributions/solisoma/end_of_week_assesment.ipynb b/week7/community_contributions/solisoma/end_of_week_assesment.ipynb new file mode 100644 index 0000000..3d0cc6f --- /dev/null +++ b/week7/community_contributions/solisoma/end_of_week_assesment.ipynb @@ -0,0 +1,398 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "275415f0", + "metadata": {}, + "outputs": [], + "source": [ + "# pip installs\n", + "\n", + "!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n", + "!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "535bd9de", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import re\n", + "import math\n", + "from tqdm import tqdm\n", + "import numpy as np\n", + "from google.colab import userdata\n", + "from huggingface_hub import login\n", + "import torch\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error\n", + "import torch.nn.functional as F\n", + "import transformers\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n", + "from datasets import load_dataset, Dataset, DatasetDict\n", + "from datetime import datetime\n", + "from peft import PeftModel\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc58234a", + "metadata": {}, + "outputs": [], + "source": [ + "# Constants\n", + "\n", + "BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n", + "PROJECT_NAME = \"pricer\"\n", + "HF_USER = \"ed-donner\"\n", + "RUN_NAME = \"2024-09-13_13.04.39\"\n", + "PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n", + "REVISION = \"e8d637df551603dc86cd7a1598a8f44af4d7ae36\"\n", + "FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n", + "\n", + "\n", + "DATASET_NAME = f\"{HF_USER}/pricer-data\"\n", + "# Or just use the one I've uploaded\n", + "# DATASET_NAME = \"ed-donner/pricer-data\"\n", + "\n", + "# Hyperparameters for QLoRA\n", + "\n", + "QUANT_4_BIT = True\n", + "top_K = 6\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Used for writing to output in color\n", + "\n", + "GREEN = \"\\033[92m\"\n", + "YELLOW = \"\\033[93m\"\n", + "RED = \"\\033[91m\"\n", + "RESET = \"\\033[0m\"\n", + "COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0145ad8a", + "metadata": {}, + "outputs": [], + "source": [ + "# Log in to HuggingFace\n", + "\n", + "hf_token = userdata.get('HF_TOKEN')\n", + "login(hf_token, add_to_git_credential=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6919506e", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(DATASET_NAME)\n", + "train_full = dataset['train']\n", + "test_full = dataset['test']\n", + "\n", + "# TRAIN_SIZE = len(train_full)\n", + "# TEST_SIZE = len(test_full)\n", + "\n", + "TRAIN_SIZE = 8000 # Very small for testing\n", + "TEST_SIZE = 2000 # Very small for testing\n", + "\n", + "train = train_full.select(range(min(TRAIN_SIZE, len(train_full))))\n", + "test = test_full.select(range(min(TEST_SIZE, len(test_full))))\n", + "\n", + "print(f\"Using small test dataset:\")\n", + "print(f\" Train samples: {len(train)} (full dataset has {len(train_full)})\")\n", + "print(f\" Test samples: {len(test)} (full dataset has {len(test_full)})\")\n", + "print(f\"\\nTo use full dataset, set TRAIN_SIZE and TEST_SIZE to None or large numbers\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea79cde1", + "metadata": {}, + "outputs": [], + "source": [ + "if QUANT_4_BIT:\n", + " quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + " )\n", + "else:\n", + " quant_config = BitsAndBytesConfig(\n", + " load_in_8bit=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef108f8d", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Tokenizer and the Model\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " BASE_MODEL,\n", + " quantization_config=quant_config,\n", + " device_map=\"auto\",\n", + ")\n", + "base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n", + "\n", + "# Load the fine-tuned model with PEFT\n", + "if REVISION:\n", + " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n", + "else:\n", + " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n", + "\n", + "fine_tuned_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f3c4176", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_price(s):\n", + " if \"Price is $\" in s:\n", + " contents = s.split(\"Price is $\")[1]\n", + " contents = contents.replace(',','')\n", + " match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n", + " return float(match.group()) if match else 0\n", + " return 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "436fa29a", + "metadata": {}, + "outputs": [], + "source": [ + "# Original prediction function takes the most likely next token\n", + "\n", + "def model_predict(prompt):\n", + " set_seed(42)\n", + " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(\"cuda\")\n", + " attention_mask = torch.ones(inputs.shape, device=\"cuda\")\n", + " outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)\n", + " response = tokenizer.decode(outputs[0])\n", + " return extract_price(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a666dab6", + "metadata": {}, + "outputs": [], + "source": [ + "def improved_model_predict(prompt, device=\"cuda\"):\n", + " set_seed(42)\n", + " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n", + " attention_mask = torch.ones(inputs.shape, device=device)\n", + "\n", + " with torch.no_grad():\n", + " outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n", + " next_token_logits = outputs.logits[:, -1, :].to('cpu')\n", + "\n", + " next_token_probs = F.softmax(next_token_logits, dim=-1)\n", + " top_prob, top_token_id = next_token_probs.topk(top_K)\n", + " prices, weights = [], []\n", + " for i in range(top_K):\n", + " predicted_token = tokenizer.decode(top_token_id[0][i])\n", + " probability = top_prob[0][i]\n", + " try:\n", + " result = float(predicted_token)\n", + " except ValueError as e:\n", + " result = 0.0\n", + " if result > 0:\n", + " prices.append(result)\n", + " weights.append(probability)\n", + " if not prices:\n", + " return 0.0, 0.0\n", + " total = sum(weights)\n", + " weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n", + " return sum(weighted_prices).item()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9664c4c7", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "class Tester:\n", + "\n", + " def __init__(self, predictor, data, title=None, show_progress=True):\n", + " self.predictor = predictor\n", + " self.data = data\n", + " self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n", + " self.size = len(data)\n", + " self.guesses, self.truths, self.errors, self.rel_errors, self.sles, self.colors = [], [], [], [], [], []\n", + " self.show_progress = show_progress\n", + "\n", + " def color_for(self, error, truth):\n", + " if error < 40 or error / truth < 0.2:\n", + " return \"green\"\n", + " elif error < 80 or error / truth < 0.4:\n", + " return \"orange\"\n", + " else:\n", + " return \"red\"\n", + "\n", + " def run_datapoint(self, i):\n", + " datapoint = self.data[i]\n", + " guess = self.predictor(datapoint[\"text\"])\n", + " truth = datapoint[\"price\"]\n", + "\n", + " error = guess - truth\n", + " abs_error = abs(error)\n", + " rel_error = abs_error / truth if truth != 0 else 0\n", + " log_error = math.log(truth + 1) - math.log(guess + 1)\n", + " sle = log_error ** 2\n", + " color = self.color_for(abs_error, truth)\n", + "\n", + " title = (datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\") if \"\\n\\n\" in datapoint[\"text\"] else datapoint[\"text\"][:20]\n", + " self.guesses.append(guess)\n", + " self.truths.append(truth)\n", + " self.errors.append(error)\n", + " self.rel_errors.append(rel_error)\n", + " self.sles.append(sle)\n", + " self.colors.append(color)\n", + "\n", + " print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} \"\n", + " f\"Error: ${abs_error:,.2f} RelErr: {rel_error*100:.1f}% SLE: {sle:,.2f} Item: {title}{RESET}\")\n", + "\n", + " def chart_all(self, chart_title):\n", + " \"\"\"Compact version: 4 performance charts in one grid.\"\"\"\n", + " t, g = np.array(self.truths), np.array(self.guesses)\n", + " rel_err, abs_err = np.array(self.rel_errors) * 100, np.abs(np.array(self.errors))\n", + "\n", + " fig, axs = plt.subplots(2, 2, figsize=(14, 10))\n", + " fig.suptitle(f\"Performance Dashboard — {chart_title}\", fontsize=16, fontweight=\"bold\")\n", + "\n", + " # Scatter plot\n", + " max_val = max(t.max(), g.max()) * 1.05\n", + " axs[1, 1].plot([0, max_val], [0, max_val], \"b--\", alpha=0.6)\n", + " axs[1, 1].scatter(t, g, s=20, c=self.colors, alpha=0.6)\n", + " axs[1, 1].set_title(\"Predictions vs Ground Truth\")\n", + " axs[1, 1].set_xlabel(\"True Price ($)\")\n", + " axs[1, 1].set_ylabel(\"Predicted ($)\")\n", + "\n", + " # Accuracy by price range\n", + " bins = np.linspace(t.min(), t.max(), 6)\n", + " labels = [f\"${bins[i]:.0f}–${bins[i+1]:.0f}\" for i in range(len(bins)-1)]\n", + " inds = np.digitize(t, bins) - 1\n", + " avg_err = [rel_err[inds == i].mean() for i in range(len(labels))]\n", + " axs[0, 0].bar(labels, avg_err, color=\"seagreen\", alpha=0.8)\n", + " axs[0, 0].set_title(\"Avg Relative Error by Price Range\")\n", + " axs[0, 0].set_ylabel(\"Relative Error (%)\")\n", + " axs[0, 0].tick_params(axis=\"x\", rotation=30)\n", + "\n", + " # Relative error distribution\n", + " axs[0, 1].hist(rel_err, bins=25, color=\"mediumpurple\", edgecolor=\"black\", alpha=0.7)\n", + " axs[0, 1].set_title(\"Relative Error Distribution (%)\")\n", + " axs[0, 1].set_xlabel(\"Relative Error (%)\")\n", + "\n", + " # Absolute error distribution\n", + " axs[1, 0].hist(abs_err, bins=25, color=\"steelblue\", edgecolor=\"black\", alpha=0.7)\n", + " axs[1, 0].axvline(abs_err.mean(), color=\"red\", linestyle=\"--\", label=f\"Mean={abs_err.mean():.2f}\")\n", + " axs[1, 0].set_title(\"Absolute Error Distribution\")\n", + " axs[1, 0].set_xlabel(\"Absolute Error ($)\")\n", + " axs[1, 0].legend()\n", + "\n", + " for ax in axs.ravel():\n", + " ax.grid(alpha=0.3)\n", + "\n", + " plt.tight_layout(rect=[0, 0, 1, 0.95])\n", + " plt.show()\n", + "\n", + " def report(self):\n", + " y_true = np.array(self.truths)\n", + " y_pred = np.array(self.guesses)\n", + "\n", + " mae = mean_absolute_error(y_true, y_pred)\n", + " rmse = math.sqrt(mean_squared_error(y_true, y_pred))\n", + " rmsle = math.sqrt(sum(self.sles) / self.size)\n", + " mape = mean_absolute_percentage_error(y_true, y_pred) * 100\n", + " median_error = float(np.median(np.abs(y_true - y_pred)))\n", + " r2 = r2_score(y_true, y_pred)\n", + "\n", + " hit_rate_green = sum(1 for c in self.colors if c == \"green\") / self.size * 100\n", + " hit_rate_acceptable = sum(1 for c in self.colors if c in (\"green\", \"orange\")) / self.size * 100\n", + "\n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"FINAL REPORT: {self.title}\")\n", + " print(f\"{'='*70}\")\n", + " print(f\"Total Predictions: {self.size}\")\n", + " print(f\"\\n--- Error Metrics ---\")\n", + " print(f\"Mean Absolute Error (MAE): ${mae:,.2f}\")\n", + " print(f\"Median Error: ${median_error:,.2f}\")\n", + " print(f\"Root Mean Squared Error (RMSE): ${rmse:,.2f}\")\n", + " print(f\"Root Mean Squared Log Error (RMSLE): {rmsle:.4f}\")\n", + " print(f\"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\")\n", + " print(f\"\\n--- Accuracy Metrics ---\")\n", + " print(f\"R² Score: {r2:.4f}\")\n", + " print(f\"Hit Rate (Green): {hit_rate_green:.1f}%\")\n", + " print(f\"Hit Rate (Green+Orange): {hit_rate_acceptable:.1f}%\")\n", + " print(f\"{'='*70}\\n\")\n", + " chart_title = f\"{self.title} | MAE=${mae:,.2f} | RMSLE={rmsle:.3f} | R²={r2:.3f}\"\n", + "\n", + " self.chart_all(chart_title)\n", + "\n", + " def run(self):\n", + " iterator = tqdm(range(self.size), desc=\"Testing Model\") if self.show_progress else range(self.size)\n", + " for i in iterator:\n", + " self.run_datapoint(i)\n", + " self.report()\n", + "\n", + " @classmethod\n", + " def test(cls, function, data, title=None):\n", + " cls(function, data, title=title).run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e60a696", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(\n", + " improved_model_predict, \n", + " test, \n", + " title=\"ed-donner Fine-tuned [Base | Llama 3.1 8B] (Improved - Small Test Set)\"\n", + ")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}