Add Week 7 submission for muhammad_qasim_sheikh
This commit is contained in:
@@ -0,0 +1,463 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "f0c31d90",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import wandb\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"from datasets import load_dataset\n",
|
||||||
|
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, EarlyStoppingCallback, set_seed\n",
|
||||||
|
"from peft import LoraConfig, PeftModel\n",
|
||||||
|
"from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM\n",
|
||||||
|
"import math\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import torch.nn.functional as F"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "768666b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Load the lite pricer dataset and carve a validation split out of the
# head of the (truncated) training split.
DATASET_NAME = "qshaikh/lite-pricer-data"
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
split_ratio = 0.10  # 10% for validation

TRAIN_SIZE = 15000
# Clamp to the rows actually available so a smaller dataset does not make
# select() fail on out-of-range indices.
train = train.select(range(min(TRAIN_SIZE, len(train))))

total_size = len(train)
val_size = int(total_size * split_ratio)

# The first val_size rows are held out for validation; the remainder is used for training.
val_data = train.select(range(val_size))
train_data = train.select(range(val_size, total_size))
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "12e2b634",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Report how many examples ended up in each split (one line per split).
print(
    f"Train data size : {len(train_data)}",
    f"Validation data size: {len(val_data)}",
    f"Test data size : {len(test)}",
    sep="\n",
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "1a75961f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Identifiers for this run: a timestamped run name, the combined
# project/run name, and the Hub repo id built from it.
PROJECT_NAME = "llama3-new-pricer"
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + f"-size{total_size}"
PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
HUB_MODEL_NAME = "qshaikh/" + PROJECT_RUN_NAME
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bf9bca38",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
LOG_TO_WANDB = True

# Weights & Biases settings are passed through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": PROJECT_NAME,
        "WANDB_LOG_MODEL": "checkpoint" if LOG_TO_WANDB else "end",
        "WANDB_WATCH": "gradients",
    }
)

# Only open a W&B run when logging is switched on.
if LOG_TO_WANDB:
    wandb.init(name=RUN_NAME, project=PROJECT_NAME)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "77e0ee8c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
BASE_MODEL = "meta-llama/Llama-3.2-1B"

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# trust_remote_code is intentionally left at its default (off): the tokenizer
# for this checkpoint is implemented inside transformers, so there is no reason
# to permit execution of code hosted in the model repository.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep generation padding consistent with the tokenizer's pad token.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "bf4a0187",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Collator keyed on the "Price is $" marker; presumably this restricts the
# training loss to the tokens after the marker - confirm against the TRL docs.
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "25ce4f40",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# LoRA adapter hyperparameters consumed by the LoraConfig cell below.
LORA_R, LORA_ALPHA = 32, 64
LORA_DROPOUT = 0.1
# Attention projection layers that receive adapters.
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "e1e6c237",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Assemble the LoRA configuration from the constants defined above.
lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f68d0fdc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# --- Training schedule ---
EPOCHS = 1
# 4 was tried first as the safer choice but was too slow, so it was raised to 8.
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 1
MAX_SEQUENCE_LENGTH = 182

# --- Optimisation ---
LEARNING_RATE = 0.0001
LR_SCHEDULER_TYPE = "cosine"
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

# --- Logging / evaluation / checkpoint cadence (in optimiser steps) ---
STEPS = 20
SAVE_STEPS = 200
save_total_limit = 10
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "37c3ab80",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Trainer configuration: data handling, schedule, evaluation, optimisation,
# precision, checkpointing/logging and Hub upload.
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    run_name=RUN_NAME,
    dataset_text_field="text",
    max_seq_length=MAX_SEQUENCE_LENGTH,

    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_steps=-1,
    group_by_length=True,

    eval_strategy="steps",
    eval_steps=STEPS,
    per_device_eval_batch_size=1,

    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    optim=OPTIMIZER,
    weight_decay=0.001,
    max_grad_norm=0.3,

    fp16=False,
    bf16=True,

    logging_steps=STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=save_total_limit,
    # Use the string "none" to disable reporting: passing None makes
    # transformers (before v5) fall back to every installed integration.
    report_to="wandb" if LOG_TO_WANDB else "none",

    push_to_hub=True,
    # Push to the repo id that is announced after training, rather than
    # relying on the name derived from output_dir.
    hub_model_id=HUB_MODEL_NAME,
    hub_strategy="every_save",
    # SAVE_STEPS must stay a multiple of STEPS for best-model tracking to work.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "56abe69d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Wire the quantised base model, LoRA config, datasets and collator into the
# trainer; training stops early after 5 evaluations without improvement.
fine_tuning = SFTTrainer(
    model=base_model,
    args=train_parameters,
    peft_config=lora_parameters,
    data_collator=collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "17ed25ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
fine_tuning.train()
# load_best_model_at_end reloads the best checkpoint into memory, but the
# periodic "every_save" uploads do not necessarily contain it. Push the
# in-memory (best) model explicitly so the message below is accurate.
fine_tuning.push_to_hub()
print(f"Best model pushed to HF Hub: {HUB_MODEL_NAME}")
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7230239e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Evaluating Model Performance"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "61c2955c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# ANSI escape sequences for coloured terminal output.
RESET = "\033[0m"
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
# Maps a colour bucket name to its escape sequence ("orange" renders as yellow).
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)
|
||||||
|
"\n",
|
||||||
class Tester:
    """Score a price predictor on the first ``size`` datapoints and chart the result.

    ``predictor`` is called with each datapoint's ``"text"`` and must return a
    number; it is compared with the datapoint's ``"price"``. Per-datapoint
    guesses, truths, absolute errors, squared log errors and colour buckets are
    accumulated on the instance.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the predictor and data and reset all accumulators.

        Args:
            predictor: callable taking the datapoint text and returning a price.
            data: indexable collection of mappings with "text" and "price" keys.
            title: chart title prefix; defaults to a prettified predictor name.
            size: maximum number of datapoints to evaluate.
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        # Never request more datapoints than exist, otherwise run() would
        # index past the end of the data.
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Bucket an error as "green", "orange" or "red".

        A guess is green when it is within 40 absolute or 20% relative of the
        truth, orange within 80 absolute or 40% relative, red otherwise.
        """
        # A zero truth has no finite relative error, so only the absolute
        # thresholds can apply (the original divided by zero here).
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Predict datapoint ``i`` and append its metrics to the accumulators."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        # Squared log error; the +1 keeps log defined for a value of 0.
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)

    def chart(self, title):
        """Scatter guesses against truths, coloured by bucket, with a y=x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)

        # Imported locally so the class stays usable when only metrics are needed.
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)
        ]
        plt.legend(handles=legend_elements, loc='upper right')

        plt.show()

    def report(self):
        """Summarise mean absolute error, RMSLE and hit rate in the chart title.

        Raises:
            ValueError: if no datapoint has been evaluated yet.
        """
        # Average over what was actually evaluated rather than the requested
        # size, so the figures stay correct if run() is called more than once.
        count = len(self.errors)
        if count == 0:
            raise ValueError("No datapoints have been evaluated; call run() first.")
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``self.size`` datapoints, then report."""
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with defaults and run it."""
        cls(function, data).run()
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e1ff9de0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Peek at the first test example to see the structure the predictor will receive.
test[0]
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5b88ec79",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Attach the LoRA adapter published on the Hub to the quantised base model.
FINETUNED_MODEL = "qshaikh/llama3-new-pricer-2025-10-29_00.32.54-size15000"
fine_tuned_model = PeftModel.from_pretrained(
    model=base_model,
    model_id=FINETUNED_MODEL,
)
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
# Last expression: render the wrapped model's module tree as the cell output.
fine_tuned_model
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "eb2d3a22",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
top_K = 3

def improved_model_predict(prompt, device="cuda"):
    """Estimate a price as the probability-weighted mean of the top next-token candidates.

    Runs a single forward pass of ``fine_tuned_model`` on ``prompt``, takes the
    ``top_K`` most likely next tokens, keeps those that decode to a positive
    number, and returns their average weighted by renormalised probability.
    Presumably the prompt ends right before the price (e.g. "Price is $") -
    confirm against the dataset.

    Args:
        prompt: text to feed to the model.
        device: torch device the input tensors are moved to.

    Returns:
        float: the weighted price, or 0.0 when none of the top candidates is a
        positive number.
    """
    set_seed(42)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(inputs.shape, device=device)

    with torch.no_grad():
        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
        # Only the distribution over the token following the prompt is needed.
        next_token_logits = outputs.logits[:, -1, :].to('cpu')

    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_prob, top_token_id = next_token_probs.topk(top_K)

    prices, weights = [], []
    for token_id, probability in zip(top_token_id[0].tolist(), top_prob[0].tolist()):
        try:
            price = float(tokenizer.decode(token_id))
        except ValueError:
            # Candidate is not a number; ignore it.
            continue
        if price > 0:
            prices.append(price)
            weights.append(probability)

    if not prices:
        # Must be a scalar like the normal path: callers compute abs(guess - truth),
        # which fails on the tuple the original returned here.
        return 0.0

    # Renormalise over the numeric candidates that survived filtering.
    total = sum(weights)
    return sum(price * weight / total for price, weight in zip(prices, weights))
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "dc47ff1e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Smoke-test the predictor on the first test example before the full evaluation.
improved_model_predict(prompt=test[0]["text"], device="cuda")
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3dbecfff",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Evaluate the predictor on the test split and draw the summary chart.
Tester.test(function=improved_model_predict, data=test)
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.19"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user