Exercise Week 7 using Phi4 unsloth
457
week7/community_contributions/Exercise_Week_7_jom.ipynb
Normal file
@@ -0,0 +1,457 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cbf08d83",
"metadata": {},
"source": [
"# Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f22db0ae",
"metadata": {},
"outputs": [],
"source": [
"!pip install unsloth"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e1ac78",
"metadata": {},
"outputs": [],
"source": [
"import unsloth\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"from tqdm import tqdm\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"# import torch\n",
"# import transformers\n",
"# from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"import wandb\n",
"#from peft import LoraConfig\n",
"#from trl import SFTTrainer, SFTConfig\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75bee643",
"metadata": {},
"outputs": [],
"source": [
"# Constants\n",
"BASE_MODEL = \"unsloth/phi-4-unsloth-bnb-4bit\"\n",
"\n",
"PROJECT_NAME = \"pricer\"\n",
"HF_USER = \"javiomotero\" # your HF name here!\n",
"\n",
"DATASET_NAME = f\"{HF_USER}/lite-data\"\n",
"\n",
"dataset = load_dataset(DATASET_NAME)\n",
"train = dataset['train']\n",
"test = dataset['test']\n",
"\n",
"# Split your dataset into train and eval\n",
"split_dataset = train.train_test_split(test_size=0.1, seed=42)\n",
"\n",
"train_dataset = split_dataset[\"train\"]\n",
"eval_dataset = split_dataset[\"test\"]\n",
"\n",
"\n",
"\n",
"RUN_NAME = f\"{datetime.now():%Y-%m-%d_%H.%M.%S}\"\n",
"PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
"HUB_MODEL_NAME = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
"\n",
"\n",
"\n",
"LOGGING_STEPS = 50\n",
"SAVE_STEPS = 500\n",
"LOG_TO_WANDB = True\n",
"\n",
"# Log in to HuggingFace\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"\n",
"# Log in to Weights & Biases\n",
"wandb_api_key = userdata.get('WANDB_API_KEY')\n",
"os.environ[\"WANDB_API_KEY\"] = wandb_api_key\n",
"wandb.login()\n",
"\n",
"# Configure Weights & Biases to record against our project\n",
"os.environ[\"WANDB_PROJECT\"] = PROJECT_NAME\n",
"os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" if LOG_TO_WANDB else \"end\"\n",
"os.environ[\"WANDB_WATCH\"] = \"gradients\"\n",
"\n",
"if LOG_TO_WANDB:\n",
"    run = wandb.init(project=PROJECT_NAME, name=RUN_NAME)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "260975b0",
"metadata": {},
"outputs": [],
"source": [
"from unsloth import FastLanguageModel, is_bfloat16_supported\n",
"from trl import SFTTrainer, SFTConfig\n",
"from peft import LoraConfig\n",
"import torch\n",
"\n",
"\n",
"# Your hyperparameters\n",
"LORA_R = 8\n",
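"# A common heuristic (not a requirement): alpha = 2 * r keeps the LoRA update scale roughly constant if the rank changes.\n",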
"LORA_ALPHA = 2 * LORA_R\n",
"LORA_DROPOUT = 0.2\n",
"TARGET_MODULES = [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\"] # keep small for T4\n",
"\n",
"\n",
"EPOCHS = 1\n",
"BATCH_SIZE = 1\n",
"GRADIENT_ACCUMULATION_STEPS = 1\n",
"LEARNING_RATE = 1e-4\n",
"LR_SCHEDULER_TYPE = \"cosine\"\n",
"WARMUP_RATIO = 0.03\n",
"OPTIMIZER = \"paged_adamw_32bit\" # consider adamw_8bit if you hit NaNs or OOM\n",
"MAX_SEQUENCE_LENGTH = 182\n",
"\n",
"# 1) Load model via Unsloth in 4-bit\n",
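"# bfloat16 where the GPU supports it (Ampere or newer); float16 fallback on older GPUs such as the T4\n",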
"dtype = \"bfloat16\" if is_bfloat16_supported() else \"float16\"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
"    model_name = BASE_MODEL,\n",
"    max_seq_length = MAX_SEQUENCE_LENGTH,\n",
"    load_in_4bit = True,\n",
"    dtype = dtype,\n",
")\n",
"\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"right\"\n",
"\n",
"# 2) Apply LoRA using Unsloth helper (uses gradient checkpointing under the hood if set)\n",
"peft_config = LoraConfig(\n",
"    r = LORA_R,\n",
"    lora_alpha = LORA_ALPHA,\n",
"    lora_dropout = LORA_DROPOUT,\n",
"    bias = \"none\",\n",
"    task_type = \"CAUSAL_LM\",\n",
"    target_modules = TARGET_MODULES,\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
"    model,\n",
"    r = peft_config.r,\n",
"    lora_alpha = peft_config.lora_alpha,\n",
"    lora_dropout = peft_config.lora_dropout,\n",
"    target_modules = peft_config.target_modules,\n",
"    bias = peft_config.bias,\n",
"    use_gradient_checkpointing = \"unsloth\",\n",
")\n",
"\n",
"# 3) Your SFTConfig (same API, Unsloth integrates with TRL’s SFTTrainer)\n",
"train_parameters = SFTConfig(\n",
"    output_dir = PROJECT_RUN_NAME,\n",
"    num_train_epochs = EPOCHS,\n",
"    per_device_train_batch_size = BATCH_SIZE,\n",
"    per_device_eval_batch_size = 1,\n",
"    eval_strategy = \"steps\",\n",
"    eval_steps = SAVE_STEPS,\n",
"    gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,\n",
"    optim = OPTIMIZER,\n",
"    save_steps = SAVE_STEPS,\n",
"    save_total_limit = 10,\n",
"    logging_steps = LOGGING_STEPS,\n",
"    learning_rate = LEARNING_RATE,\n",
"    weight_decay = 0.001,\n",
"    fp16 = (dtype == \"float16\"),\n",
"    bf16 = (dtype == \"bfloat16\"),\n",
"    max_grad_norm = 0.3,\n",
"    max_steps = -1,\n",
"    warmup_ratio = WARMUP_RATIO,\n",
"    group_by_length = True,\n",
"    lr_scheduler_type = LR_SCHEDULER_TYPE,\n",
"    report_to = \"wandb\" if LOG_TO_WANDB else None,\n",
"    run_name = RUN_NAME,\n",
"    max_seq_length = MAX_SEQUENCE_LENGTH,\n",
"    dataset_text_field = \"text\",\n",
"    save_strategy = \"steps\",\n",
"    hub_strategy = \"every_save\",\n",
"    push_to_hub = True,\n",
"    hub_model_id = HUB_MODEL_NAME,\n",
"    hub_private_repo = True,\n",
")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1b324fb",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Checkpointing from the wandb run; this could also be done from HF\n",
"#checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-25_15.39.13:v3\" #This was for first retrain\n",
"checkpoint_url = \"javier-otero-marquez-personal-education/pricer/model-2025-10-26_09.54.35:v1\"\n",
"\n",
"artifact = run.use_artifact(checkpoint_url, type='model')\n",
"artifact_dir = artifact.download()\n",
"\n",
"trainer = SFTTrainer(\n",
"    model = model,\n",
"    tokenizer = tokenizer,\n",
"    args = train_parameters,\n",
"    train_dataset = train_dataset,\n",
"    eval_dataset = eval_dataset,\n",
"    packing = False, # safer for stability; can turn on after it fits\n",
"    completion_only_loss=True\n",
")\n",
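"# resume_from_checkpoint points the trainer at the downloaded W&B artifact directory;\n",
"# it restores weights plus optimizer/scheduler state, so training continues from the saved step.\n",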
"trainer.train(resume_from_checkpoint=artifact_dir)\n"
"trainer.train(resume_from_checkpoint=artifact_dir)\n"
]
},
{
"cell_type": "markdown",
"id": "affe0724",
"metadata": {},
"source": [
"# Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b855e0a6",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"from tqdm import tqdm\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"import torch\n",
"import torch.nn.functional as F\n",
"import transformers\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from datetime import datetime\n",
"from peft import PeftModel\n",
"import matplotlib.pyplot as plt\n",
"# Constants\n",
"\n",
"BASE_MODEL = \"unsloth/phi-4-unsloth-bnb-4bit\"\n",
"PROJECT_NAME = \"pricer\"\n",
"HF_USER = \"javiomotero\" # your HF name here! Or use mine if you just want to reproduce my results.\n",
"\n",
"# The run itself: javiomotero/pricer-<RUN_NAME>\n",
"RUN_NAME = \"2025-10-26_09.54.35\"\n",
"PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n",
"REVISION = \"53c8d992140e5b184e9388418d711d3e38f7bd9d\" # or REVISION = None\n",
"FINETUNED_MODEL = f\"{HF_USER}/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Uncomment this line if you wish to use my model\n",
"# FINETUNED_MODEL = f\"ed-donner/{PROJECT_RUN_NAME}\"\n",
"\n",
"# Data\n",
"\n",
"DATASET_NAME = f\"{HF_USER}/lite-data\"\n",
"# Or just use the one I've uploaded\n",
"# DATASET_NAME = \"ed-donner/pricer-data\"\n",
"\n",
"# Hyperparameters for QLoRA\n",
"\n",
"QUANT_4_BIT = True\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Used for writing to output in color\n",
"\n",
"GREEN = \"\\033[92m\"\n",
"YELLOW = \"\\033[93m\"\n",
"RED = \"\\033[91m\"\n",
"RESET = \"\\033[0m\"\n",
"COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}\n",
"# Log in to HuggingFace\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"dataset = load_dataset(DATASET_NAME)\n",
"train = dataset['train']\n",
"test = dataset['test']\n",
"# pick the right quantization (thank you Robert M. for spotting the bug with the 8 bit version!)\n",
"\n",
"if QUANT_4_BIT:\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_compute_dtype=torch.bfloat16,\n",
"        bnb_4bit_quant_type=\"nf4\"\n",
"    )\n",
"else:\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_8bit=True,\n",
"        bnb_8bit_compute_dtype=torch.bfloat16\n",
"    )\n",
"# Load the Tokenizer and the Model\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"right\"\n",
"\n",
"base_model = AutoModelForCausalLM.from_pretrained(\n",
"    BASE_MODEL,\n",
"    quantization_config=quant_config,\n",
"    device_map=\"auto\",\n",
")\n",
"base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
"\n",
"# Load the fine-tuned model with PEFT\n",
"if REVISION:\n",
"    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)\n",
"else:\n",
"    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n",
"\n",
"\n",
"print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e6e25c",
"metadata": {},
"outputs": [],
"source": [
"def extract_price(s):\n",
"    # e.g. \"Price is $1,234.56\" -> 1234.56; returns 0 if no price marker or number is found\n",
"    if \"Price is $\" in s:\n",
"        contents = s.split(\"Price is $\")[1]\n",
"        contents = contents.replace(',','')\n",
"        match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n",
"        return float(match.group()) if match else 0\n",
"    return 0\n",
"top_K = 3\n",
"\n",
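"# Weighted-average decoding: rather than keeping only the single most likely next token,\n",
"# inspect the top_K candidates for the first predicted token, keep those that parse as\n",
"# positive numbers, and average them weighted by their softmax probabilities.\n",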
"def improved_model_predict(prompt, device=\"cuda\"):\n",
"    set_seed(42)\n",
"    inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n",
"    attention_mask = torch.ones(inputs.shape, device=device)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n",
"        next_token_logits = outputs.logits[:, -1, :].to('cpu')\n",
"\n",
"    next_token_probs = F.softmax(next_token_logits, dim=-1)\n",
"    top_prob, top_token_id = next_token_probs.topk(top_K)\n",
"    prices, weights = [], []\n",
"    for i in range(top_K):\n",
"        predicted_token = tokenizer.decode(top_token_id[0][i])\n",
"        probability = top_prob[0][i]\n",
"        try:\n",
"            result = float(predicted_token)\n",
"        except ValueError as e:\n",
"            result = 0.0\n",
"        if result > 0:\n",
"            prices.append(result)\n",
"            weights.append(probability)\n",
"    if not prices:\n",
"        return 0.0\n",
"    total = sum(weights)\n",
"    weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n",
"    return sum(weighted_prices).item()\n",
"\n",
"class Tester:\n",
"\n",
"    def __init__(self, predictor, data, title=None, size=250):\n",
"        self.predictor = predictor\n",
"        self.data = data\n",
"        self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
"        self.size = size\n",
"        self.guesses = []\n",
"        self.truths = []\n",
"        self.errors = []\n",
"        self.sles = []\n",
"        self.colors = []\n",
"\n",
"    def color_for(self, error, truth):\n",
"        if error<40 or error/truth < 0.2:\n",
"            return \"green\"\n",
"        elif error<80 or error/truth < 0.4:\n",
"            return \"orange\"\n",
"        else:\n",
"            return \"red\"\n",
"\n",
"    def run_datapoint(self, i):\n",
"        datapoint = self.data[i]\n",
"        guess = self.predictor(datapoint[\"text\"])\n",
"        truth = datapoint[\"price\"]\n",
"        error = abs(guess - truth)\n",
"        log_error = math.log(truth+1) - math.log(guess+1)\n",
"        sle = log_error ** 2\n",
"        color = self.color_for(error, truth)\n",
"        title = datapoint[\"text\"].split(\"\\n\\n\")[1][:20] + \"...\"\n",
"        self.guesses.append(guess)\n",
"        self.truths.append(truth)\n",
"        self.errors.append(error)\n",
"        self.sles.append(sle)\n",
"        self.colors.append(color)\n",
"        print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
"\n",
"    def chart(self, title):\n",
"        max_error = max(self.errors)\n",
"        plt.figure(figsize=(12, 8))\n",
"        max_val = max(max(self.truths), max(self.guesses))\n",
"        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
"        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
"        plt.xlabel('Ground Truth')\n",
"        plt.ylabel('Model Estimate')\n",
"        plt.xlim(0, max_val)\n",
"        plt.ylim(0, max_val)\n",
"        plt.title(title)\n",
"        plt.show()\n",
"\n",
"    def report(self):\n",
"        average_error = sum(self.errors) / self.size\n",
"        rmsle = math.sqrt(sum(self.sles) / self.size)\n",
"        hits = sum(1 for color in self.colors if color==\"green\")\n",
"        title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
"        self.chart(title)\n",
"\n",
"    def run(self):\n",
"        self.error = 0\n",
"        for i in range(self.size):\n",
"            self.run_datapoint(i)\n",
"        self.report()\n",
"\n",
"    @classmethod\n",
"    def test(cls, function, data):\n",
"        cls(function, data).run()\n",
"# Step 6000\n",
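"# Runs the predictor over the first 250 test items, printing each guess against the truth,\n",
"# then charts the results and reports average error, RMSLE and hit rate.\n",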
"Tester.test(improved_model_predict, test)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}