{ "cells": [ { "cell_type": "markdown", "id": "cbf08d83", "metadata": {}, "source": [ "# Training" ] }, { "cell_type": "code", "execution_count": null, "id": "f22db0ae", "metadata": {}, "outputs": [], "source": [ "!pip install unsloth" ] }, { "cell_type": "code", "execution_count": null, "id": "e5e1ac78", "metadata": {}, "outputs": [], "source": [ "import unsloth\n", "\n", "import os\n", "import re\n", "import math\n", "from tqdm import tqdm\n", "from google.colab import userdata\n", "from huggingface_hub import login\n", "# import torch\n", "# import transformers\n", "# from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig\n", "from datasets import load_dataset, Dataset, DatasetDict\n", "import wandb\n", "#from peft import LoraConfig\n", "#from trl import SFTTrainer, SFTConfig\n", "from datetime import datetime\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "id": "75bee643", "metadata": {}, "outputs": [], "source": [ "# Constants\n", "BASE_MODEL = \"unsloth/phi-4-unsloth-bnb-4bit\"\n", "\n", "PROJECT_NAME = \"pricer\"\n", "HF_USER = \"javiomotero\" # your HF name here!\n", "\n", "DATASET_NAME = f\"{HF_USER}/lite-data\"\n", "\n", "dataset = load_dataset(DATASET_NAME)\n", "train = dataset['train']\n", "test = dataset['test']\n", "\n", "# Split your dataset into train and eval\n", "split_dataset = train.train_test_split(test_size=0.1, seed=42)\n", "\n", "train_dataset = split_dataset[\"train\"]\n", "eval_dataset = split_dataset[\"test\"]\n", "\n", "\n", "\n", "RUN_NAME = f\"{datetime.now():%Y-%m-%d_%H.%M.%S}\"\n", "PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n", "HUB_MODEL_NAME = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n", "\n", "\n", "\n", "LOGGING_STEPS = 50\n", "SAVE_STEPS = 500\n", "LOG_TO_WANDB = True\n", "\n", "# Log in to HuggingFace\n", "\n", "hf_token = userdata.get('HF_TOKEN')\n", "login(hf_token, add_to_git_credential=True)\n", "\n", "# Log in to Weights & Biases\n", "wandb_api_key = userdata.get('WANDB_API_KEY')\n", "os.environ[\"WANDB_API_KEY\"] = wandb_api_key\n", "wandb.login()\n", "\n", "# Configure Weights & Biases to record against our project\n", "os.environ[\"WANDB_PROJECT\"] = PROJECT_NAME\n", "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" if LOG_TO_WANDB else \"end\"\n", "os.environ[\"WANDB_WATCH\"] = \"gradients\"\n", "\n", "if LOG_TO_WANDB:\n", " run = wandb.init(project=PROJECT_NAME, name=RUN_NAME)" ] }, { "cell_type": "code", "execution_count": null, "id": "260975b0", "metadata": {}, "outputs": [], "source": [ "from unsloth import FastLanguageModel, is_bfloat16_supported\n", "from trl import SFTTrainer, SFTConfig\n", "from peft import LoraConfig\n", "import torch\n", "\n", "\n", "# Your hyperparameters\n", "LORA_R = 8\n", "LORA_ALPHA = 2 * LORA_R\n", "LORA_DROPOUT = 0.2\n", "TARGET_MODULES = [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\"] # keep small for T4\n", "\n", "\n", "EPOCHS = 1\n", "BATCH_SIZE = 1\n", "GRADIENT_ACCUMULATION_STEPS = 1\n", "LEARNING_RATE = 1e-4\n", "LR_SCHEDULER_TYPE = \"cosine\"\n", "WARMUP_RATIO = 0.03\n", "OPTIMIZER = \"paged_adamw_32bit\" # consider adamw_8bit if you hit NaNs or OOM\n", "MAX_SEQUENCE_LENGTH = 182\n", "\n", "# 1) Load model via Unsloth in 4-bit\n", "dtype = \"bfloat16\" if is_bfloat16_supported() else \"float16\"\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = BASE_MODEL,\n", " max_seq_length = MAX_SEQUENCE_LENGTH,\n", " load_in_4bit = True,\n", " dtype = dtype,\n", ")\n", "\n", 
"tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = \"right\"\n", "\n", "# 2) Apply LoRA using Unsloth helper (uses gradient checkpointing under the hood if set)\n", "peft_config = LoraConfig(\n", " r = LORA_R,\n", " lora_alpha = LORA_ALPHA,\n", " lora_dropout = LORA_DROPOUT,\n", " bias = \"none\",\n", " task_type = \"CAUSAL_LM\",\n", " target_modules = TARGET_MODULES,\n", ")\n", "\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " r = peft_config.r,\n", " lora_alpha = peft_config.lora_alpha,\n", " lora_dropout = peft_config.lora_dropout,\n", " target_modules = peft_config.target_modules,\n", " bias = peft_config.bias,\n", " use_gradient_checkpointing = \"unsloth\",\n", ")\n", "\n", "# 3) Your SFTConfig (same API, Unsloth integrates with TRL’s SFTTrainer)\n", "train_parameters = SFTConfig(\n", " output_dir = PROJECT_RUN_NAME,\n", " num_train_epochs = EPOCHS,\n", " per_device_train_batch_size = BATCH_SIZE,\n", " per_device_eval_batch_size = 1,\n", " eval_strategy = \"steps\",\n", " eval_steps = SAVE_STEPS,\n", " gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,\n", " optim = OPTIMIZER,\n", " save_steps = SAVE_STEPS,\n", " save_total_limit = 10,\n", " logging_steps = LOGGING_STEPS,\n", " learning_rate = LEARNING_RATE,\n", " weight_decay = 0.001,\n", " fp16 = (dtype == \"float16\"),\n", " bf16 = (dtype == \"bfloat16\"),\n", " max_grad_norm = 0.3,\n", " max_steps = -1,\n", " warmup_ratio = WARMUP_RATIO,\n", " group_by_length = True,\n", " lr_scheduler_type = LR_SCHEDULER_TYPE,\n", " report_to = \"wandb\" if LOG_TO_WANDB else None,\n", " run_name = RUN_NAME,\n", " max_seq_length = MAX_SEQUENCE_LENGTH,\n", " dataset_text_field = \"text\",\n", " save_strategy = \"steps\",\n", " hub_strategy = \"every_save\",\n", " push_to_hub = True,\n", " hub_model_id = HUB_MODEL_NAME,\n", " hub_private_repo = True,\n", ")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f1b324fb", "metadata": {}, "outputs": [], "source": [ "\n", "#Checkpointing from wandb (run) - I guess we can also do from HF\n", "#checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-25_15.39.13:v3\" #This was for first retrain\n", "checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-26_09.54.35:v1\"\n", "\n", "artifact = run.use_artifact(checkpoint_url, type='model')\n", "artifact_dir = artifact.download()\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " args = train_parameters,\n", " train_dataset = train_dataset,\n", " eval_dataset = eval_dataset,\n", " packing = False, # safer for stability; can turn on after it fits\n", " completion_only_loss=True\n", ")\n", "trainer.train(resume_from_checkpoint=artifact_dir)\n" ] }, { "cell_type": "markdown", "id": "affe0724", "metadata": {}, "source": [ "# Inference" ] }, { "cell_type": "code", "execution_count": null, "id": "b855e0a6", "metadata": {}, "outputs": [], "source": [ "# imports\n", "\n", "import os\n", "import re\n", "import math\n", "from tqdm import tqdm\n", "from google.colab import userdata\n", "from huggingface_hub import login\n", "import torch\n", "import torch.nn.functional as F\n", "import transformers\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n", "from datasets import load_dataset, Dataset, DatasetDict\n", "from datetime import datetime\n", "from peft import PeftModel\n", "import matplotlib.pyplot as plt\n", "# Constants\n", "\n", "BASE_MODEL = 
\"unsloth/phi-4-unsloth-bnb-4bit\"\n", "PROJECT_NAME = \"pricer\"\n", "HF_USER = \"javiomotero\" # your HF name here! Or use mine if you just want to reproduce my results.\n", "\n", "# The run itselfjaviomotero/pricer-\n", "RUN_NAME = \"2025-10-26_09.54.35\"\n", "PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n", "REVISION = \"53c8d992140e5b184e9388418d711d3e38f7bd9d\" # or REVISION = None\n", "FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n", "\n", "# Uncomment this line if you wish to use my model\n", "# FINETUNED_MODEL = f\"ed-donner/{PROJECT_RUN_NAME}\"\n", "\n", "# Data\n", "\n", "DATASET_NAME = f\"{HF_USER}/lite-data\"\n", "# Or just use the one I've uploaded\n", "# DATASET_NAME = \"ed-donner/pricer-data\"\n", "\n", "# Hyperparameters for QLoRA\n", "\n", "QUANT_4_BIT = True\n", "\n", "%matplotlib inline\n", "\n", "# Used for writing to output in color\n", "\n", "GREEN = \"\\033[92m\"\n", "YELLOW = \"\\033[93m\"\n", "RED = \"\\033[91m\"\n", "RESET = \"\\033[0m\"\n", "COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}\n", "# Log in to HuggingFace\n", "\n", "hf_token = userdata.get('HF_TOKEN')\n", "login(hf_token, add_to_git_credential=True)\n", "dataset = load_dataset(DATASET_NAME)\n", "train = dataset['train']\n", "test = dataset['test']\n", "# pick the right quantization (thank you Robert M. for spotting the bug with the 8 bit version!)\n", "\n", "if QUANT_4_BIT:\n", " quant_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_use_double_quant=True,\n", " bnb_4bit_compute_dtype=torch.bfloat16,\n", " bnb_4bit_quant_type=\"nf4\"\n", " )\n", "else:\n", " quant_config = BitsAndBytesConfig(\n", " load_in_8bit=True,\n", " bnb_8bit_compute_dtype=torch.bfloat16\n", " )\n", "# Load the Tokenizer and the Model\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = \"right\"\n", "\n", "base_model = AutoModelForCausalLM.from_pretrained(\n", " BASE_MODEL,\n", " quantization_config=quant_config,\n", " device_map=\"auto\",\n", ")\n", "base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n", "\n", "# Load the fine-tuned model with PEFT\n", "if REVISION:\n", " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n", "else:\n", " fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n", "\n", "\n", "print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d4e6e25c", "metadata": {}, "outputs": [], "source": [ "def extract_price(s):\n", " if \"Price is $\" in s:\n", " contents = s.split(\"Price is $\")[1]\n", " contents = contents.replace(',','')\n", " match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n", " return float(match.group()) if match else 0\n", " return 0\n", "top_K = 3\n", "\n", "def improved_model_predict(prompt, device=\"cuda\"):\n", " set_seed(42)\n", " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n", " attention_mask = torch.ones(inputs.shape, device=device)\n", "\n", " with torch.no_grad():\n", " outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n", " next_token_logits = outputs.logits[:, -1, :].to('cpu')\n", "\n", " next_token_probs = F.softmax(next_token_logits, dim=-1)\n", " top_prob, top_token_id = next_token_probs.topk(top_K)\n", " prices, weights = [], []\n", " for i in range(top_K):\n", " predicted_token = 
  "    for i in range(top_K):\n",
  "        predicted_token = tokenizer.decode(top_token_id[0][i])\n",
  "        probability = top_prob[0][i]\n",
  "        try:\n",
  "            result = float(predicted_token)\n",
  "        except ValueError:\n",
  "            result = 0.0\n",
  "        if result > 0:\n",
  "            prices.append(result)\n",
  "            weights.append(probability)\n",
  "    if not prices:\n",
  "        return 0.0  # no numeric candidate among the top-K tokens\n",
  "    total = sum(weights)\n",
  "    weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n",
  "    return sum(weighted_prices).item()\n",
  "\n",
  "class Tester:\n",
  "\n",
  "    def __init__(self, predictor, data, title=None, size=250):\n",
  "        self.predictor = predictor\n",
  "        self.data = data\n",
  "        self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
  "        self.size = size\n",
  "        self.guesses = []\n",
  "        self.truths = []\n",
  "        self.errors = []\n",
  "        self.sles = []\n",
  "        self.colors = []\n",
  "\n",
  "    def color_for(self, error, truth):\n",
  "        if error < 40 or error / truth < 0.2:\n",
  "            return \"green\"\n",
  "        elif error < 80 or error / truth < 0.4:\n",
  "            return \"orange\"\n",
  "        else:\n",
  "            return \"red\"\n",
  "\n",
  "    def run_datapoint(self, i):\n",
  "        datapoint = self.data[i]\n",
  "        guess = self.predictor(datapoint[\"text\"])\n",
  "        truth = datapoint[\"price\"]\n",
  "        error = abs(guess - truth)\n",
  "        log_error = math.log(truth + 1) - math.log(guess + 1)\n",
  "        sle = log_error ** 2\n",
  "        color = self.color_for(error, truth)\n",
  "        title = datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\"\n",
  "        self.guesses.append(guess)\n",
  "        self.truths.append(truth)\n",
  "        self.errors.append(error)\n",
  "        self.sles.append(sle)\n",
  "        self.colors.append(color)\n",
  "        print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
  "\n",
  "    def chart(self, title):\n",
  "        max_error = max(self.errors)\n",
  "        plt.figure(figsize=(12, 8))\n",
  "        max_val = max(max(self.truths), max(self.guesses))\n",
  "        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
  "        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
  "        plt.xlabel('Ground Truth')\n",
  "        plt.ylabel('Model Estimate')\n",
  "        plt.xlim(0, max_val)\n",
  "        plt.ylim(0, max_val)\n",
  "        plt.title(title)\n",
  "        plt.show()\n",
  "\n",
  "    def report(self):\n",
  "        average_error = sum(self.errors) / self.size\n",
  "        rmsle = math.sqrt(sum(self.sles) / self.size)\n",
  "        hits = sum(1 for color in self.colors if color == \"green\")\n",
  "        title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
  "        self.chart(title)\n",
  "\n",
  "    def run(self):\n",
  "        self.error = 0\n",
  "        for i in range(self.size):\n",
  "            self.run_datapoint(i)\n",
  "        self.report()\n",
  "\n",
  "    @classmethod\n",
  "    def test(cls, function, data):\n",
  "        cls(function, data).run()\n",
  "\n",
  "# Evaluate on the test set (step 6000 checkpoint)\n",
  "Tester.test(improved_model_predict, test)" ] } ],
 "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 5 }