Merge pull request #913 from kbaah/kwabena_bootcamp_week7

Kwabena Bootcamp Week7
This commit is contained in:
Ed Donner
2025-10-31 09:17:38 -04:00
committed by GitHub

View File

@@ -0,0 +1,459 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"## Predict Product Prices\n",
"\n",
"### And now, to evaluate our fine-tuned open source model\n",
"\n"
],
"metadata": {
"id": "GHsssBgWM_l0"
}
},
{
"cell_type": "code",
"source": [
"# pip installs\n",
"\n",
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb"
],
"metadata": {
"id": "MDyR63OTNUJ6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"from tqdm import tqdm\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"import torch\n",
"import torch.nn.functional as F\n",
"import transformers\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from datetime import datetime\n",
"from peft import PeftModel\n",
"import matplotlib.pyplot as plt"
],
"metadata": {
"id": "-yikV8pRBer9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Constants\n",
"\n",
"BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
"PROJECT_NAME = \"pricer\"\n",
"HF_USER = \"ampelox\" # your HF name here! Or use mine if you just want to reproduce my results.\n",
"\n",
"# The run itself\n",
"\n",
"RUN_NAME = \"2025-10-30_09.40.59\"\n",
"PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
"REVISION = \"dd79bbfe3922ac56eeba2b2473ca35b08beedaa4\" # or REVISION = None\n",
"FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Uncomment this line if you wish to use my model\n",
"# FINETUNED_MODEL = f\"ed-donner/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Data\n",
"\n",
"DATASET_NAME = f\"{HF_USER}/pricer-data\"\n",
"# Or just use the one I've uploaded\n",
"# DATASET_NAME = \"ed-donner/pricer-data\"\n",
"\n",
"# Hyperparameters for QLoRA\n",
"\n",
"QUANT_4_BIT = True\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Used for writing to output in color\n",
"\n",
"GREEN = \"\\033[92m\"\n",
"YELLOW = \"\\033[93m\"\n",
"RED = \"\\033[91m\"\n",
"RESET = \"\\033[0m\"\n",
"COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}"
],
"metadata": {
"id": "uuTX-xonNeOK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Log in to HuggingFace\n",
"\n",
"If you don't already have a HuggingFace account, visit https://huggingface.co to sign up and create a token.\n",
"\n",
"Then select the Secrets for this Notebook by clicking on the key icon in the left, and add a new secret called `HF_TOKEN` with the value as your token.\n"
],
"metadata": {
"id": "8JArT3QAQAjx"
}
},
{
"cell_type": "code",
"source": [
"# Log in to HuggingFace\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
],
"metadata": {
"id": "WyFPZeMcM88v"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dataset = load_dataset(DATASET_NAME)\n",
"train = dataset['train']\n",
"test = dataset['test']"
],
"metadata": {
"id": "cvXVoJH8LS6u"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test[0]"
],
"metadata": {
"id": "xb86e__Wc7j_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Now load the Tokenizer and Model"
],
"metadata": {
"id": "qJWQ0a3wZ0Bw"
}
},
{
"cell_type": "code",
"source": [
"# pick the right quantization (thank you Robert M. for spotting the bug with the 8 bit version!)\n",
"\n",
"if QUANT_4_BIT:\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"else:\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_8bit=True,\n",
" bnb_8bit_compute_dtype=torch.bfloat16\n",
" )"
],
"metadata": {
"id": "lAUAAcEC6ido"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load the Tokenizer and the Model\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"right\"\n",
"\n",
"base_model = AutoModelForCausalLM.from_pretrained(\n",
" BASE_MODEL,\n",
" quantization_config=quant_config,\n",
" device_map=\"auto\",\n",
")\n",
"base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
"\n",
"# Load the fine-tuned model with PEFT\n",
"if REVISION:\n",
" fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n",
"else:\n",
" fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n",
"\n",
"\n",
"print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")"
],
"metadata": {
"id": "R_O04fKxMMT-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"fine_tuned_model"
],
"metadata": {
"id": "kD-GJtbrdd5t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# THE MOMENT OF TRUTH!\n",
"\n",
"## Use the model in inference mode\n",
"\n",
"Remember, GPT-4o had an average error of \\$76. \n",
"Llama 3.1 base model had an average error of \\$395.72. \n",
"This human had an error of \\$127. \n",
"\n",
"## Caveat\n",
"\n",
"Keep in mind that prices of goods vary considerably; the model can't predict things like sale prices that it doesn't have any information about."
],
"metadata": {
"id": "UObo1-RqaNnT"
}
},
{
"cell_type": "code",
"source": [
"def extract_price(s):\n",
" if \"Price is $\" in s:\n",
" contents = s.split(\"Price is $\")[1]\n",
" contents = contents.replace(',','')\n",
" match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n",
" return float(match.group()) if match else 0\n",
" return 0"
],
"metadata": {
"id": "Qst1LhBVAB04"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"extract_price(\"Price is $a fabulous 899.99 or so\")"
],
"metadata": {
"id": "jXFBW_5UeEcp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Original prediction function takes the most likely next token\n",
"\n",
"def model_predict(prompt):\n",
" set_seed(42)\n",
" inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
" attention_mask = torch.ones(inputs.shape, device=\"cuda\")\n",
" outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)\n",
" response = tokenizer.decode(outputs[0])\n",
" return extract_price(response)"
],
"metadata": {
"id": "Oj_PzpdFAIMk"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# An improved prediction function takes a weighted average of the top 3 choices\n",
"# This code would be more complex if we couldn't take advantage of the fact\n",
"# That Llama generates 1 token for any 3 digit number\n",
"\n",
"top_K = 3\n",
"\n",
"def improved_model_predict(prompt, device=\"cuda\"):\n",
" set_seed(42)\n",
" inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n",
" attention_mask = torch.ones(inputs.shape, device=device)\n",
"\n",
" with torch.no_grad():\n",
" outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n",
" next_token_logits = outputs.logits[:, -1, :].to('cpu')\n",
"\n",
" next_token_probs = F.softmax(next_token_logits, dim=-1)\n",
" top_prob, top_token_id = next_token_probs.topk(top_K)\n",
" prices, weights = [], []\n",
" for i in range(top_K):\n",
" predicted_token = tokenizer.decode(top_token_id[0][i])\n",
" probability = top_prob[0][i]\n",
" try:\n",
" result = float(predicted_token)\n",
" except ValueError as e:\n",
" result = 0.0\n",
" if result > 0:\n",
" prices.append(result)\n",
" weights.append(probability)\n",
" if not prices:\n",
" return 0.0, 0.0\n",
" total = sum(weights)\n",
" weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n",
" return sum(weighted_prices).item()"
],
"metadata": {
"id": "Je5dR8QEAI1d"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "lQk7jNlm1oV9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"class Tester:\n",
"\n",
" def __init__(self, predictor, data, title=None, size=250):\n",
" self.predictor = predictor\n",
" self.data = data\n",
" self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
" self.size = size\n",
" self.guesses = []\n",
" self.truths = []\n",
" self.errors = []\n",
" self.sles = []\n",
" self.colors = []\n",
"\n",
" def color_for(self, error, truth):\n",
" if error<40 or error/truth < 0.2:\n",
" return \"green\"\n",
" elif error<80 or error/truth < 0.4:\n",
" return \"orange\"\n",
" else:\n",
" return \"red\"\n",
"\n",
" def run_datapoint(self, i):\n",
" datapoint = self.data[i]\n",
" guess = self.predictor(datapoint[\"text\"])\n",
" truth = datapoint[\"price\"]\n",
" error = abs(guess - truth)\n",
" log_error = math.log(truth+1) - math.log(guess+1)\n",
" sle = log_error ** 2\n",
" color = self.color_for(error, truth)\n",
" title = datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\"\n",
" self.guesses.append(guess)\n",
" self.truths.append(truth)\n",
" self.errors.append(error)\n",
" self.sles.append(sle)\n",
" self.colors.append(color)\n",
" print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
"\n",
" def chart(self, title):\n",
" max_error = max(self.errors)\n",
" plt.figure(figsize=(12, 8))\n",
" max_val = max(max(self.truths), max(self.guesses))\n",
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
" plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
" plt.xlabel('Ground Truth')\n",
" plt.ylabel('Model Estimate')\n",
" plt.xlim(0, max_val)\n",
" plt.ylim(0, max_val)\n",
" plt.title(title)\n",
" plt.show()\n",
"\n",
" def report(self):\n",
" average_error = sum(self.errors) / self.size\n",
" rmsle = math.sqrt(sum(self.sles) / self.size)\n",
" hits = sum(1 for color in self.colors if color==\"green\")\n",
" title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
" self.chart(title)\n",
"\n",
" def run(self):\n",
" self.error = 0\n",
" for i in range(self.size):\n",
" self.run_datapoint(i)\n",
" self.report()\n",
"\n",
" @classmethod\n",
" def test(cls, function, data):\n",
" cls(function, data).run()"
],
"metadata": {
"id": "30lzJXBH7BcK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"Tester.test(improved_model_predict, test)"
],
"metadata": {
"id": "W_KcLvyt6kbb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "M4NSMcKl3Bhw"
},
"execution_count": null,
"outputs": []
}
]
}