LLM_Engineering_OLD/week7/community_contributions/Exercise_Week_7_jom.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "cbf08d83",
"metadata": {},
"source": [
"# Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f22db0ae",
"metadata": {},
"outputs": [],
"source": [
"!pip install unsloth"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e1ac78",
"metadata": {},
"outputs": [],
"source": [
"import unsloth\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"from tqdm import tqdm\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"# import torch\n",
"# import transformers\n",
"# from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"import wandb\n",
"#from peft import LoraConfig\n",
"#from trl import SFTTrainer, SFTConfig\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75bee643",
"metadata": {},
"outputs": [],
"source": [
"# Constants\n",
"BASE_MODEL = \"unsloth/phi-4-unsloth-bnb-4bit\"\n",
"\n",
"PROJECT_NAME = \"pricer\"\n",
"HF_USER = \"javiomotero\" # your HF name here!\n",
"\n",
"DATASET_NAME = f\"{HF_USER}/lite-data\"\n",
"\n",
"dataset = load_dataset(DATASET_NAME)\n",
"train = dataset['train']\n",
"test = dataset['test']\n",
"\n",
"# Split your dataset into train and eval\n",
"split_dataset = train.train_test_split(test_size=0.1, seed=42)\n",
"\n",
"train_dataset = split_dataset[\"train\"]\n",
"eval_dataset = split_dataset[\"test\"]\n",
"\n",
"\n",
"\n",
"RUN_NAME = f\"{datetime.now():%Y-%m-%d_%H.%M.%S}\"\n",
"PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
"HUB_MODEL_NAME = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
"\n",
"\n",
"\n",
"LOGGING_STEPS = 50\n",
"SAVE_STEPS = 500\n",
"LOG_TO_WANDB = True\n",
"\n",
"# Log in to HuggingFace\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"\n",
"# Log in to Weights & Biases\n",
"wandb_api_key = userdata.get('WANDB_API_KEY')\n",
"os.environ[\"WANDB_API_KEY\"] = wandb_api_key\n",
"wandb.login()\n",
"\n",
"# Configure Weights & Biases to record against our project\n",
"os.environ[\"WANDB_PROJECT\"] = PROJECT_NAME\n",
"os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" if LOG_TO_WANDB else \"end\"\n",
"os.environ[\"WANDB_WATCH\"] = \"gradients\"\n",
"\n",
"if LOG_TO_WANDB:\n",
" run = wandb.init(project=PROJECT_NAME, name=RUN_NAME)"
]
},
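{
"cell_type": "markdown",
"id": "a3d91f20",
"metadata": {},
"source": [
"Optional sanity check before training: the trainer below reads a `text` field (`dataset_text_field=\"text\"`) and the evaluation harness later reads a numeric `price` field, so it is worth eyeballing one record first. A minimal sketch, assuming the `lite-data` records expose those fields:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7c44e19",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: peek at one training record and the split sizes.\n",
"# Assumes each record carries the \"text\" (and \"price\") fields used later in this notebook.\n",
"sample = train_dataset[0]\n",
"print(sample.keys())\n",
"print(sample[\"text\"][:300])\n",
"print(\"train size:\", len(train_dataset), \"| eval size:\", len(eval_dataset))"
]
},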
{
"cell_type": "code",
"execution_count": null,
"id": "260975b0",
"metadata": {},
"outputs": [],
"source": [
"from unsloth import FastLanguageModel, is_bfloat16_supported\n",
"from trl import SFTTrainer, SFTConfig\n",
"from peft import LoraConfig\n",
"import torch\n",
"\n",
"\n",
"# Your hyperparameters\n",
"LORA_R = 8\n",
"LORA_ALPHA = 2 * LORA_R\n",
"LORA_DROPOUT = 0.2\n",
"TARGET_MODULES = [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\"] # keep small for T4\n",
"\n",
"\n",
"EPOCHS = 1\n",
"BATCH_SIZE = 1\n",
"GRADIENT_ACCUMULATION_STEPS = 1\n",
"LEARNING_RATE = 1e-4\n",
"LR_SCHEDULER_TYPE = \"cosine\"\n",
"WARMUP_RATIO = 0.03\n",
"OPTIMIZER = \"paged_adamw_32bit\" # consider adamw_8bit if you hit NaNs or OOM\n",
"MAX_SEQUENCE_LENGTH = 182\n",
"\n",
"# 1) Load model via Unsloth in 4-bit\n",
"dtype = \"bfloat16\" if is_bfloat16_supported() else \"float16\"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
" model_name = BASE_MODEL,\n",
" max_seq_length = MAX_SEQUENCE_LENGTH,\n",
" load_in_4bit = True,\n",
" dtype = dtype,\n",
")\n",
"\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"right\"\n",
"\n",
"# 2) Apply LoRA using Unsloth helper (uses gradient checkpointing under the hood if set)\n",
"peft_config = LoraConfig(\n",
" r = LORA_R,\n",
" lora_alpha = LORA_ALPHA,\n",
" lora_dropout = LORA_DROPOUT,\n",
" bias = \"none\",\n",
" task_type = \"CAUSAL_LM\",\n",
" target_modules = TARGET_MODULES,\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
" model,\n",
" r = peft_config.r,\n",
" lora_alpha = peft_config.lora_alpha,\n",
" lora_dropout = peft_config.lora_dropout,\n",
" target_modules = peft_config.target_modules,\n",
" bias = peft_config.bias,\n",
" use_gradient_checkpointing = \"unsloth\",\n",
")\n",
"\n",
"# 3) Your SFTConfig (same API, Unsloth integrates with TRLs SFTTrainer)\n",
"train_parameters = SFTConfig(\n",
" output_dir = PROJECT_RUN_NAME,\n",
" num_train_epochs = EPOCHS,\n",
" per_device_train_batch_size = BATCH_SIZE,\n",
" per_device_eval_batch_size = 1,\n",
" eval_strategy = \"steps\",\n",
" eval_steps = SAVE_STEPS,\n",
" gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,\n",
" optim = OPTIMIZER,\n",
" save_steps = SAVE_STEPS,\n",
" save_total_limit = 10,\n",
" logging_steps = LOGGING_STEPS,\n",
" learning_rate = LEARNING_RATE,\n",
" weight_decay = 0.001,\n",
" fp16 = (dtype == \"float16\"),\n",
" bf16 = (dtype == \"bfloat16\"),\n",
" max_grad_norm = 0.3,\n",
" max_steps = -1,\n",
" warmup_ratio = WARMUP_RATIO,\n",
" group_by_length = True,\n",
" lr_scheduler_type = LR_SCHEDULER_TYPE,\n",
" report_to = \"wandb\" if LOG_TO_WANDB else None,\n",
" run_name = RUN_NAME,\n",
" max_seq_length = MAX_SEQUENCE_LENGTH,\n",
" dataset_text_field = \"text\",\n",
" save_strategy = \"steps\",\n",
" hub_strategy = \"every_save\",\n",
" push_to_hub = True,\n",
" hub_model_id = HUB_MODEL_NAME,\n",
" hub_private_repo = True,\n",
")\n",
"\n"
]
},
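{
"cell_type": "markdown",
"id": "c5e80a42",
"metadata": {},
"source": [
"Optional check: with `r=8` adapters on the four attention projections, only a small fraction of the weights should require gradients. A minimal sketch using plain PyTorch to count them (exact numbers depend on the Phi-4 architecture):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9f12b37",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: count trainable (LoRA) vs. total parameters after get_peft_model.\n",
"trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"total = sum(p.numel() for p in model.parameters())\n",
"print(f\"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.3f}%)\")"
]
},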
{
"cell_type": "code",
"execution_count": null,
"id": "f1b324fb",
"metadata": {},
"outputs": [],
"source": [
"\n",
"#Checkpointing from wandb (run) - I guess we can also do from HF\n",
"#checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-25_15.39.13:v3\" #This was for first retrain\n",
"checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-26_09.54.35:v1\"\n",
"\n",
"artifact = run.use_artifact(checkpoint_url, type='model')\n",
"artifact_dir = artifact.download()\n",
"\n",
"trainer = SFTTrainer(\n",
" model = model,\n",
" tokenizer = tokenizer,\n",
" args = train_parameters,\n",
" train_dataset = train_dataset,\n",
" eval_dataset = eval_dataset,\n",
" packing = False, # safer for stability; can turn on after it fits\n",
" completion_only_loss=True\n",
")\n",
"trainer.train(resume_from_checkpoint=artifact_dir)\n"
]
},
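{
"cell_type": "markdown",
"id": "e2a67c81",
"metadata": {},
"source": [
"Resuming above depends on a W&B artifact. If checkpoint folders are still present locally under `output_dir` (i.e. the Colab session was not reset), the standard `Trainer` API can resume from the most recent one instead; a sketch of that alternative, assuming such a checkpoint exists:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4b09d63",
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch (run instead of the cell above, not in addition to it):\n",
"# resume_from_checkpoint=True makes the trainer pick the latest checkpoint-* folder\n",
"# inside output_dir (PROJECT_RUN_NAME); it assumes at least one such folder exists.\n",
"# trainer.train(resume_from_checkpoint=True)"
]
},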
{
"cell_type": "markdown",
"id": "affe0724",
"metadata": {},
"source": [
"# Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b855e0a6",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"from tqdm import tqdm\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"import torch\n",
"import torch.nn.functional as F\n",
"import transformers\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from datetime import datetime\n",
"from peft import PeftModel\n",
"import matplotlib.pyplot as plt\n",
"# Constants\n",
"\n",
"BASE_MODEL = \"unsloth/phi-4-unsloth-bnb-4bit\"\n",
"PROJECT_NAME = \"pricer\"\n",
"HF_USER = \"javiomotero\" # your HF name here! Or use mine if you just want to reproduce my results.\n",
"\n",
"# The run itselfjaviomotero/pricer-\n",
"RUN_NAME = \"2025-10-26_09.54.35\"\n",
"PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
"REVISION = \"53c8d992140e5b184e9388418d711d3e38f7bd9d\" # or REVISION = None\n",
"FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Uncomment this line if you wish to use my model\n",
"# FINETUNED_MODEL = f\"ed-donner/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Data\n",
"\n",
"DATASET_NAME = f\"{HF_USER}/lite-data\"\n",
"# Or just use the one I've uploaded\n",
"# DATASET_NAME = \"ed-donner/pricer-data\"\n",
"\n",
"# Hyperparameters for QLoRA\n",
"\n",
"QUANT_4_BIT = True\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Used for writing to output in color\n",
"\n",
"GREEN = \"\\033[92m\"\n",
"YELLOW = \"\\033[93m\"\n",
"RED = \"\\033[91m\"\n",
"RESET = \"\\033[0m\"\n",
"COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}\n",
"# Log in to HuggingFace\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"dataset = load_dataset(DATASET_NAME)\n",
"train = dataset['train']\n",
"test = dataset['test']\n",
"# pick the right quantization (thank you Robert M. for spotting the bug with the 8 bit version!)\n",
"\n",
"if QUANT_4_BIT:\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"else:\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_8bit=True,\n",
" bnb_8bit_compute_dtype=torch.bfloat16\n",
" )\n",
"# Load the Tokenizer and the Model\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"right\"\n",
"\n",
"base_model = AutoModelForCausalLM.from_pretrained(\n",
" BASE_MODEL,\n",
" quantization_config=quant_config,\n",
" device_map=\"auto\",\n",
")\n",
"base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
"\n",
"# Load the fine-tuned model with PEFT\n",
"if REVISION:\n",
" fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n",
"else:\n",
" fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n",
"\n",
"\n",
"print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")"
]
},
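{
"cell_type": "markdown",
"id": "a8c31d57",
"metadata": {},
"source": [
"Optional smoke test before the full evaluation: generate a few tokens for one test prompt to confirm the adapter loaded and the model continues the \"Price is $\" pattern. A minimal sketch, assuming the test `text` field ends where the price should be predicted (as the logits-based predictor below also assumes):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1d95e24",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: greedy-generate a handful of tokens for a single test prompt.\n",
"prompt = test[0][\"text\"]\n",
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
"with torch.no_grad():\n",
"    generated = fine_tuned_model.generate(**inputs, max_new_tokens=6, do_sample=False)\n",
"print(tokenizer.decode(generated[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))\n",
"print(\"truth:\", test[0][\"price\"])"
]
},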
{
"cell_type": "code",
"execution_count": null,
"id": "d4e6e25c",
"metadata": {},
"outputs": [],
"source": [
"def extract_price(s):\n",
" if \"Price is $\" in s:\n",
" contents = s.split(\"Price is $\")[1]\n",
" contents = contents.replace(',','')\n",
" match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n",
" return float(match.group()) if match else 0\n",
" return 0\n",
"top_K = 3\n",
"\n",
"def improved_model_predict(prompt, device=\"cuda\"):\n",
" set_seed(42)\n",
" inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n",
" attention_mask = torch.ones(inputs.shape, device=device)\n",
"\n",
" with torch.no_grad():\n",
" outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n",
" next_token_logits = outputs.logits[:, -1, :].to('cpu')\n",
"\n",
" next_token_probs = F.softmax(next_token_logits, dim=-1)\n",
" top_prob, top_token_id = next_token_probs.topk(top_K)\n",
" prices, weights = [], []\n",
" for i in range(top_K):\n",
" predicted_token = tokenizer.decode(top_token_id[0][i])\n",
" probability = top_prob[0][i]\n",
" try:\n",
" result = float(predicted_token)\n",
" except ValueError as e:\n",
" result = 0.0\n",
" if result > 0:\n",
" prices.append(result)\n",
" weights.append(probability)\n",
" if not prices:\n",
" return 0.0, 0.0\n",
" total = sum(weights)\n",
" weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n",
" return sum(weighted_prices).item()\n",
"\n",
"class Tester:\n",
"\n",
" def __init__(self, predictor, data, title=None, size=250):\n",
" self.predictor = predictor\n",
" self.data = data\n",
" self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
" self.size = size\n",
" self.guesses = []\n",
" self.truths = []\n",
" self.errors = []\n",
" self.sles = []\n",
" self.colors = []\n",
"\n",
" def color_for(self, error, truth):\n",
" if error<40 or error/truth < 0.2:\n",
" return \"green\"\n",
" elif error<80 or error/truth < 0.4:\n",
" return \"orange\"\n",
" else:\n",
" return \"red\"\n",
"\n",
" def run_datapoint(self, i):\n",
" datapoint = self.data[i]\n",
" guess = self.predictor(datapoint[\"text\"])\n",
" truth = datapoint[\"price\"]\n",
" error = abs(guess - truth)\n",
" log_error = math.log(truth+1) - math.log(guess+1)\n",
" sle = log_error ** 2\n",
" color = self.color_for(error, truth)\n",
" title = datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\"\n",
" self.guesses.append(guess)\n",
" self.truths.append(truth)\n",
" self.errors.append(error)\n",
" self.sles.append(sle)\n",
" self.colors.append(color)\n",
" print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
"\n",
" def chart(self, title):\n",
" max_error = max(self.errors)\n",
" plt.figure(figsize=(12, 8))\n",
" max_val = max(max(self.truths), max(self.guesses))\n",
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
" plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
" plt.xlabel('Ground Truth')\n",
" plt.ylabel('Model Estimate')\n",
" plt.xlim(0, max_val)\n",
" plt.ylim(0, max_val)\n",
" plt.title(title)\n",
" plt.show()\n",
"\n",
" def report(self):\n",
" average_error = sum(self.errors) / self.size\n",
" rmsle = math.sqrt(sum(self.sles) / self.size)\n",
" hits = sum(1 for color in self.colors if color==\"green\")\n",
" title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
" self.chart(title)\n",
"\n",
" def run(self):\n",
" self.error = 0\n",
" for i in range(self.size):\n",
" self.run_datapoint(i)\n",
" self.report()\n",
"\n",
" @classmethod\n",
" def test(cls, function, data):\n",
" cls(function, data).run()\n",
"#Step 6000\n",
"Tester.test(improved_model_predict, test)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}