{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "GHsssBgWM_l0"
   },
   "source": [
    "# QLoRA Fine-Tuning: LLaMA 3.1 8B\n",
    "\n",
    "Loads the 4-bit quantized base model, attaches the fine-tuned QLoRA adapters, and evaluates next-token price predictions on the `ed-donner/pricer-data` test split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MDyR63OTNUJ6"
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "print(f\"Python: {sys.version}\")\n",
    "\n",
    "import torch\n",
    "print(f\"PyTorch: {torch.__version__}\")\n",
    "print(f\"CUDA Available: {torch.cuda.is_available()}\")\n",
    "print(f\"CUDA Version: {torch.version.cuda}\")\n",
    "# get_device_name() raises when no GPU is attached, so only query it when CUDA is usable.\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "else:\n",
    "    print(\"GPU: none detected - switch to a GPU runtime before loading the model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3_8F5m3xxMtz"
   },
   "outputs": [],
   "source": [
    "# %pip targets the running kernel's environment. Every ML dependency is pinned once;\n",
    "# a trailing unpinned upgrade of bitsandbytes would silently override its pin.\n",
    "%pip install -q --upgrade transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0\n",
    "%pip install -q --upgrade peft==0.14.0 trl==0.14.0 bitsandbytes==0.46.0\n",
    "%pip install -q --upgrade matplotlib scipy scikit-learn\n",
    "%pip install -q --upgrade \"huggingface_hub<1.0,>=0.24.0\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "CpVdMBUVxMtz"
   },
   "source": [
    "## Environment Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-yikV8pRBer9"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import math\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "from huggingface_hub import login\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    set_seed\n",
    ")\n",
    "from datasets import load_dataset\n",
    "from peft import PeftModel\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "uuTX-xonNeOK"
   },
   "outputs": [],
   "source": [
    "BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
    "DATASET_NAME = \"ed-donner/pricer-data\"\n",
    "FINETUNED_MODEL = \"ed-donner/pricer-2024-09-13_13.04.39\"\n",
    "REVISION = \"e8d637df551603dc86cd7a1598a8f44af4d7ae36\"\n",
    "\n",
    "SEED = 42\n",
    "TOP_K = 3\n",
    "TEST_SIZE = 250\n",
    "\n",
    "GREEN = \"\\033[92m\"\n",
    "YELLOW = \"\\033[93m\"\n",
    "RED = \"\\033[91m\"\n",
    "RESET = \"\\033[0m\"\n",
    "COLOR_MAP = {\"red\": RED, \"orange\": YELLOW, \"green\": GREEN}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "WyFPZeMcM88v"
   },
   "outputs": [],
   "source": [
    "# Prefer the Colab secret store; outside Colab fall back to the HF_TOKEN environment variable.\n",
    "try:\n",
    "    from google.colab import userdata\n",
    "    hf_token = userdata.get('HF_TOKEN')\n",
    "except ImportError:\n",
    "    hf_token = os.environ.get('HF_TOKEN')\n",
    "\n",
    "if not hf_token:\n",
    "    raise RuntimeError(\"HF_TOKEN not found: add it to Colab secrets or set the HF_TOKEN environment variable\")\n",
    "\n",
    "login(hf_token, add_to_git_credential=True)\n",
    "print(\"Successfully authenticated with HuggingFace\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qDqXth7MxMt0"
   },
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cvXVoJH8LS6u"
   },
   "outputs": [],
   "source": [
    "print(f\"Loading dataset: {DATASET_NAME}\")\n",
    "dataset = load_dataset(DATASET_NAME)\n",
    "train = dataset['train']\n",
    "test = dataset['test']\n",
    "\n",
    "print(\"\\nDataset loaded successfully:\")\n",
    "print(f\"  Training examples: {len(train):,}\")\n",
    "print(f\"  Test examples: {len(test):,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xb86e__Wc7j_"
   },
   "outputs": [],
   "source": [
    "print(\"Sample test example:\\n\")\n",
    "sample = test[0]\n",
    "print(f\"Text: {sample['text'][:200]}...\")\n",
    "print(f\"\\nGround truth price: ${sample['price']:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qJWQ0a3wZ0Bw"
   },
   "source": [
    "## Quantization & Model Loading\n",
    "\n",
    "4-bit quantization reduces LLaMA 3.1 8B from ~32GB in fp32 (~16GB in bf16) to ~5-6GB of VRAM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lAUAAcEC6ido"
   },
   "outputs": [],
   "source": [
    "quant_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "    bnb_4bit_quant_type=\"nf4\"\n",
    ")\n",
    "\n",
    "print(\"Using 4-bit NF4 quantization\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "R_O04fKxMMT-"
   },
   "outputs": [],
   "source": [
    "print(f\"Loading base model: {BASE_MODEL}\")\n",
    "\n",
    "# The stock Llama tokenizer ships with transformers, so trust_remote_code is not needed.\n",
    "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "tokenizer.padding_side = \"right\"\n",
    "\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    BASE_MODEL,\n",
    "    quantization_config=quant_config,\n",
    "    device_map=\"auto\",\n",
    ")\n",
    "base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
    "\n",
    "print(f\"Base model loaded - Memory: {base_model.get_memory_footprint() / 1e9:.2f} GB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "m7PHUKDVxMt1"
   },
   "source": [
    "## Load PEFT Adapters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6_Q1fqluxMt1"
   },
   "outputs": [],
   "source": [
    "print(f\"Loading fine-tuned adapters: {FINETUNED_MODEL}\")\n",
    "print(f\"Revision: {REVISION}\")\n",
    "\n",
    "fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n",
    "\n",
    "print(f\"Fine-tuned model ready - Total memory: {fine_tuned_model.get_memory_footprint() / 1e9:.2f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Qst1LhBVAB04"
   },
   "outputs": [],
   "source": [
    "def extract_price(text):\n",
    "    \"\"\"Return the first number that follows \"Price is $\" in `text`.\n",
    "\n",
    "    Commas and extra dollar signs after the marker are stripped before the\n",
    "    search. Returns 0.0 when the marker is absent or no number follows it.\n",
    "    \"\"\"\n",
    "    if \"Price is $\" in text:\n",
    "        content = text.split(\"Price is $\")[1]\n",
    "        content = content.replace(',', '').replace('$', '')\n",
    "        match = re.search(r\"[-+]?\\d*\\.?\\d+\", content)\n",
    "        return float(match.group()) if match else 0.0\n",
    "    return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jXFBW_5UeEcp"
   },
   "outputs": [],
   "source": [
    "test_cases = [\n",
    "    (\"Price is $24.99\", 24.99),\n",
    "    (\"Price is $1,234.50\", 1234.50),\n",
    "    (\"Price is $a fabulous 899.99 or so\", 899.99),\n",
    "]\n",
    "\n",
    "# The loop variable must not be named `test`: that would overwrite the test split loaded above.\n",
    "for case, expected in test_cases:\n",
    "    result = extract_price(case)\n",
    "    assert math.isclose(result, expected), f\"{case!r}: expected {expected}, got {result}\"\n",
    "    print(f\"{case} -> ${result:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TTy_WAGexMt2"
   },
   "source": [
    "## Prediction Function\n",
    "\n",
    "Top-K weighted averaging computes the probability-weighted average of the top K next tokens that decode to a number."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Je5dR8QEAI1d"
   },
   "outputs": [],
   "source": [
    "def advanced_predict(prompt, top_k=TOP_K):\n",
    "    \"\"\"Predict a price as the probability-weighted mean of the top-k next tokens.\n",
    "\n",
    "    Runs one forward pass of `fine_tuned_model` over `prompt`, takes the `top_k`\n",
    "    most likely next tokens, keeps those that decode to a positive finite number,\n",
    "    and averages them weighted by probability (renormalised over the kept tokens).\n",
    "\n",
    "    Returns:\n",
    "        float: the weighted price, or 0.0 when no top-k token is a usable number.\n",
    "    \"\"\"\n",
    "    set_seed(SEED)\n",
    "    # Follow the model's own device instead of hard-coding \"cuda\"; the tokenizer\n",
    "    # call also supplies a correctly typed attention mask.\n",
    "    encoded = tokenizer(prompt, return_tensors=\"pt\").to(fine_tuned_model.device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        outputs = fine_tuned_model(**encoded)\n",
    "        # Softmax in float32 so half-precision rounding does not distort the weights.\n",
    "        next_token_logits = outputs.logits[:, -1, :].float().cpu()\n",
    "\n",
    "    next_token_probs = F.softmax(next_token_logits, dim=-1)\n",
    "    top_probs, top_token_ids = next_token_probs.topk(top_k)\n",
    "\n",
    "    prices, weights = [], []\n",
    "    for token_id, probability in zip(top_token_ids[0], top_probs[0]):\n",
    "        predicted_token = tokenizer.decode(token_id)\n",
    "        try:\n",
    "            price = float(predicted_token)\n",
    "        except ValueError:\n",
    "            continue\n",
    "        # float() also accepts tokens such as \"inf\" and \"nan\"; those are not prices.\n",
    "        if math.isfinite(price) and price > 0:\n",
    "            prices.append(price)\n",
    "            weights.append(probability.item())\n",
    "\n",
    "    total_weight = sum(weights)\n",
    "    if not prices or total_weight <= 0:\n",
    "        return 0.0\n",
    "\n",
    "    return sum(p * w for p, w in zip(prices, weights)) / total_weight"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "7nI3Ec7exMt2"
   },
   "source": [
    "## Evaluation Framework\n",
    "\n",
    "Metrics:\n",
    "- Dollar Error: |prediction - truth|\n",
    "- RMSLE: Root Mean Squared Log Error (penalizes relative errors)\n",
    "- Hit Rate: Percentage in green zone (error < $40 OR < 20% of true price)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "30lzJXBH7BcK"
   },
   "outputs": [],
   "source": [
    "class Tester:\n",
    "    \"\"\"Runs a predictor over the first `size` datapoints and reports error metrics.\"\"\"\n",
    "\n",
    "    def __init__(self, predictor, data, title=None, size=TEST_SIZE):\n",
    "        \"\"\"Store the predictor and data; `title` defaults to a prettified predictor name.\"\"\"\n",
    "        self.predictor = predictor\n",
    "        self.data = data\n",
    "        self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
    "        self.size = min(size, len(data))\n",
    "        self.guesses = []\n",
    "        self.truths = []\n",
    "        self.errors = []\n",
    "        self.sles = []\n",
    "        self.colors = []\n",
    "\n",
    "    def color_for(self, error, truth):\n",
    "        \"\"\"Bucket an absolute error into \"green\", \"orange\" or \"red\".\"\"\"\n",
    "        # A zero truth has no meaningful relative error: only the dollar thresholds apply.\n",
    "        relative_error = error / truth if truth else math.inf\n",
    "        if error < 40 or relative_error < 0.2:\n",
    "            return \"green\"\n",
    "        elif error < 80 or relative_error < 0.4:\n",
    "            return \"orange\"\n",
    "        else:\n",
    "            return \"red\"\n",
    "\n",
    "    def run_datapoint(self, i):\n",
    "        \"\"\"Predict datapoint `i`, record its metrics and print one coloured result line.\"\"\"\n",
    "        datapoint = self.data[i]\n",
    "        guess = self.predictor(datapoint[\"text\"])\n",
    "        truth = datapoint[\"price\"]\n",
    "        error = abs(guess - truth)\n",
    "\n",
    "        # Clamp at 0 so a negative value cannot push log1p outside its domain.\n",
    "        log_error = math.log1p(max(truth, 0.0)) - math.log1p(max(guess, 0.0))\n",
    "        sle = log_error ** 2\n",
    "\n",
    "        color = self.color_for(error, truth)\n",
    "        # The second blank-line-separated block is shown as the item title;\n",
    "        # fall back to the start of the text when there is no such block.\n",
    "        blocks = datapoint[\"text\"].split(\"\\n\\n\")\n",
    "        title = (blocks[1] if len(blocks) > 1 else blocks[0])[:30] + \"...\"\n",
    "\n",
    "        self.guesses.append(guess)\n",
    "        self.truths.append(truth)\n",
    "        self.errors.append(error)\n",
    "        self.sles.append(sle)\n",
    "        self.colors.append(color)\n",
    "\n",
    "        print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} | Truth: ${truth:,.2f} | Error: ${error:,.2f} | SLE: {sle:,.3f} | {title}{RESET}\")\n",
    "\n",
    "    def chart(self, title):\n",
    "        \"\"\"Scatter predictions against ground truth with a perfect-prediction diagonal.\"\"\"\n",
    "        plt.figure(figsize=(14, 10))\n",
    "        max_val = max(max(self.truths), max(self.guesses))\n",
    "\n",
    "        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=3, alpha=0.7, label='Perfect prediction')\n",
    "        plt.scatter(self.truths, self.guesses, s=20, c=self.colors, alpha=0.6)\n",
    "\n",
    "        plt.xlabel('Ground Truth Price ($)', fontsize=12)\n",
    "        plt.ylabel('Model Prediction ($)', fontsize=12)\n",
    "        plt.xlim(0, max_val)\n",
    "        plt.ylim(0, max_val)\n",
    "        plt.title(title, fontsize=14, fontweight='bold')\n",
    "        plt.grid(alpha=0.3)\n",
    "        plt.legend()\n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "\n",
    "    def report(self):\n",
    "        \"\"\"Print summary metrics for the datapoints evaluated so far, then draw the chart.\"\"\"\n",
    "        evaluated = len(self.errors)\n",
    "        if evaluated == 0:\n",
    "            # Nothing was run: avoid dividing by zero and calling max() on empty lists.\n",
    "            print(\"No datapoints evaluated - nothing to report.\")\n",
    "            return\n",
    "\n",
    "        average_error = sum(self.errors) / evaluated\n",
    "        rmsle = math.sqrt(sum(self.sles) / evaluated)\n",
    "        hits = sum(1 for color in self.colors if color == \"green\")\n",
    "        hit_rate = hits / evaluated * 100\n",
    "\n",
    "        title = f\"{self.title} | Avg Error: ${average_error:,.2f} | RMSLE: {rmsle:.3f} | Hit Rate: {hit_rate:.1f}%\"\n",
    "\n",
    "        print(f\"\\n{'='*80}\")\n",
    "        print(\"EVALUATION SUMMARY\")\n",
    "        print(f\"{'='*80}\")\n",
    "        print(f\"Model: {self.title}\")\n",
    "        print(f\"Test Size: {evaluated}\")\n",
    "        print(f\"Average Dollar Error: ${average_error:,.2f}\")\n",
    "        print(f\"RMSLE: {rmsle:.4f}\")\n",
    "        print(f\"Hit Rate (Green): {hit_rate:.2f}% ({hits}/{evaluated})\")\n",
    "        print(f\"{'='*80}\\n\")\n",
    "\n",
    "        self.chart(title)\n",
    "\n",
    "    def run(self):\n",
    "        \"\"\"Evaluate the first `self.size` datapoints in order, then report.\"\"\"\n",
    "        print(f\"Running evaluation on {self.size} examples...\\n\")\n",
    "        for i in tqdm(range(self.size), desc=\"Evaluating\"):\n",
    "            self.run_datapoint(i)\n",
    "        self.report()\n",
    "\n",
    "    @classmethod\n",
    "    def test(cls, function, data, **kwargs):\n",
    "        \"\"\"Convenience entry point: build a Tester for `function` and run it.\"\"\"\n",
    "        cls(function, data, **kwargs).run()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "5JjNwXMDxMt2"
   },
   "source": [
    "## Run Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "W_KcLvyt6kbb"
   },
   "outputs": [],
   "source": [
    "Tester.test(advanced_predict, test, title=\"LLaMA 3.1 8B QLoRA (400K)\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}