diff --git a/community-contributions/muhammad_qasim_sheikh/Week 7/fine_tuned_open_source.ipynb b/community-contributions/muhammad_qasim_sheikh/Week 7/fine_tuned_open_source.ipynb new file mode 100644 index 0000000..dfe57e2 --- /dev/null +++ b/community-contributions/muhammad_qasim_sheikh/Week 7/fine_tuned_open_source.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "f0c31d90", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import wandb\n", + "from datetime import datetime\n", + "from datasets import load_dataset\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, EarlyStoppingCallback, set_seed\n", + "from peft import LoraConfig, PeftModel\n", + "from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM\n", + "import math\n", + "import matplotlib.pyplot as plt\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768666b4", + "metadata": {}, + "outputs": [], + "source": [ + "# Testing it on the lite pricer data that I have created\n", + "DATASET_NAME = f\"qshaikh/lite-pricer-data\"\n", + "dataset = load_dataset(DATASET_NAME)\n", + "train = dataset['train']\n", + "test = dataset['test']\n", + "split_ratio = 0.10 # 10% for validation\n", + "\n", + "TRAIN_SIZE = 15000\n", + "train = train.select(range(TRAIN_SIZE))\n", + "\n", + "total_size = len(train)\n", + "val_size = int(total_size * split_ratio)\n", + "\n", + "val_data = train.select(range(val_size))\n", + "train_data = train.select(range(val_size, total_size))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e2b634", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Train data size : {len(train_data)}\")\n", + "print(f\"Validation data size: {len(val_data)}\")\n", + "print(f\"Test data size : {len(test)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1a75961f", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_NAME = \"llama3-new-pricer\"\n", + "RUN_NAME = f\"{datetime.now():%Y-%m-%d_%H.%M.%S}-size{total_size}\"\n", + "PROJECT_RUN_NAME = f\"{PROJECT_NAME}-{RUN_NAME}\"\n", + "HUB_MODEL_NAME = f\"qshaikh/{PROJECT_RUN_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf9bca38", + "metadata": {}, + "outputs": [], + "source": [ + "LOG_TO_WANDB = True\n", + "os.environ[\"WANDB_PROJECT\"] = PROJECT_NAME\n", + "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" if LOG_TO_WANDB else \"end\"\n", + "os.environ[\"WANDB_WATCH\"] = \"gradients\"\n", + "\n", + "if LOG_TO_WANDB:\n", + " wandb.init(project=PROJECT_NAME, name=RUN_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77e0ee8c", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_MODEL = \"meta-llama/Llama-3.2-1B\"\n", + "\n", + "quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, \n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + ")\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " BASE_MODEL,\n", + " quantization_config=quant_config,\n", + " device_map=\"auto\",\n", + ")\n", + "base_model.generation_config.pad_token_id = tokenizer.pad_token_id\n", + "\n", + 
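"# Added sanity-check note: with nf4 double quantization, the 1B base model should\n",
+    "# come in well under 1 GB here; in bf16 it would be roughly 2.5 GB.\n",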
"print(f\"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bf4a0187", + "metadata": {}, + "outputs": [], + "source": [ + "response_template = \"Price is $\"\n", + "collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "25ce4f40", + "metadata": {}, + "outputs": [], + "source": [ + "LORA_R = 32\n", + "LORA_ALPHA = 64\n", + "TARGET_MODULES = [\"q_proj\", \"v_proj\", \"k_proj\", \"o_proj\"]\n", + "LORA_DROPOUT = 0.1" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e1e6c237", + "metadata": {}, + "outputs": [], + "source": [ + "lora_parameters = LoraConfig(\n", + " r=LORA_R,\n", + " lora_alpha=LORA_ALPHA,\n", + " lora_dropout=LORA_DROPOUT,\n", + " target_modules=TARGET_MODULES,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f68d0fdc", + "metadata": {}, + "outputs": [], + "source": [ + "EPOCHS = 1\n", + "BATCH_SIZE = 8 # Tested it with 4 first to be on the safer side, however was taking too long. Increased it upto 8 then.\n", + "GRADIENT_ACCUMULATION_STEPS = 1\n", + "MAX_SEQUENCE_LENGTH = 182\n", + "LEARNING_RATE = 1e-4\n", + "LR_SCHEDULER_TYPE = 'cosine'\n", + "WARMUP_RATIO = 0.03\n", + "OPTIMIZER = \"paged_adamw_32bit\"\n", + "\n", + "SAVE_STEPS = 200\n", + "STEPS = 20\n", + "save_total_limit = 10\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37c3ab80", + "metadata": {}, + "outputs": [], + "source": [ + "train_parameters = SFTConfig(\n", + " output_dir=PROJECT_RUN_NAME,\n", + " run_name=RUN_NAME,\n", + " dataset_text_field=\"text\",\n", + " max_seq_length=MAX_SEQUENCE_LENGTH,\n", + "\n", + " num_train_epochs=EPOCHS,\n", + " per_device_train_batch_size=BATCH_SIZE,\n", + " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n", + " max_steps=-1,\n", + " group_by_length=True,\n", + "\n", + " eval_strategy=\"steps\",\n", + " eval_steps=STEPS,\n", + " per_device_eval_batch_size=1,\n", + "\n", + " learning_rate=LEARNING_RATE,\n", + " lr_scheduler_type=LR_SCHEDULER_TYPE,\n", + " warmup_ratio=WARMUP_RATIO,\n", + " optim=OPTIMIZER,\n", + " weight_decay=0.001,\n", + " max_grad_norm=0.3,\n", + "\n", + " fp16=False,\n", + " bf16=True,\n", + "\n", + " logging_steps=STEPS,\n", + " save_strategy=\"steps\",\n", + " save_steps=SAVE_STEPS,\n", + " save_total_limit=save_total_limit,\n", + " report_to=\"wandb\" if LOG_TO_WANDB else None,\n", + "\n", + " push_to_hub=True,\n", + " hub_strategy=\"every_save\",\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"eval_loss\",\n", + " greater_is_better=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56abe69d", + "metadata": {}, + "outputs": [], + "source": [ + "fine_tuning = SFTTrainer(\n", + " model=base_model,\n", + " train_dataset=train_data,\n", + " eval_dataset=val_data,\n", + " peft_config=lora_parameters, \n", + " args=train_parameters, \n", + " data_collator=collator,\n", + " callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17ed25ec", + "metadata": {}, + "outputs": [], + "source": [ + "fine_tuning.train()\n", + "print(f\"Best model pushed to HF Hub: {HUB_MODEL_NAME}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7230239e", + "metadata": {}, + "source": [ + "## Evaluating Model 
Performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61c2955c", + "metadata": {}, + "outputs": [], + "source": [ + "GREEN = \"\\033[92m\"\n", + "YELLOW = \"\\033[93m\"\n", + "RED = \"\\033[91m\"\n", + "RESET = \"\\033[0m\"\n", + "COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}\n", + "\n", + "class Tester:\n", + "\n", + " def __init__(self, predictor, data, title=None, size=250):\n", + " self.predictor = predictor\n", + " self.data = data\n", + " self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n", + " self.size = size\n", + " self.guesses = []\n", + " self.truths = []\n", + " self.errors = []\n", + " self.sles = []\n", + " self.colors = []\n", + "\n", + " def color_for(self, error, truth):\n", + " if error<40 or error/truth < 0.2:\n", + " return \"green\"\n", + " elif error<80 or error/truth < 0.4:\n", + " return \"orange\"\n", + " else:\n", + " return \"red\"\n", + "\n", + " def run_datapoint(self, i):\n", + " datapoint = self.data[i]\n", + " guess = self.predictor(datapoint[\"text\"])\n", + " truth = datapoint[\"price\"]\n", + " error = abs(guess - truth)\n", + " log_error = math.log(truth+1) - math.log(guess+1)\n", + " sle = log_error ** 2\n", + " color = self.color_for(error, truth)\n", + " self.guesses.append(guess)\n", + " self.truths.append(truth)\n", + " self.errors.append(error)\n", + " self.sles.append(sle)\n", + " self.colors.append(color)\n", + "\n", + " def chart(self, title):\n", + " plt.figure(figsize=(12, 8))\n", + " max_val = max(max(self.truths), max(self.guesses))\n", + " plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n", + " plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n", + " plt.xlabel('Ground Truth')\n", + " plt.ylabel('Model Estimate')\n", + " plt.xlim(0, max_val)\n", + " plt.ylim(0, max_val)\n", + " plt.title(title)\n", + "\n", + " from matplotlib.lines import Line2D\n", + " legend_elements = [\n", + " Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),\n", + " Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),\n", + " Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)\n", + " ]\n", + " plt.legend(handles=legend_elements, loc='upper right')\n", + "\n", + " plt.show()\n", + "\n", + "\n", + " def report(self):\n", + " average_error = sum(self.errors) / self.size\n", + " rmsle = math.sqrt(sum(self.sles) / self.size)\n", + " hits = sum(1 for color in self.colors if color==\"green\")\n", + " title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n", + " self.chart(title)\n", + "\n", + " def run(self):\n", + " self.error = 0\n", + " for i in range(self.size):\n", + " self.run_datapoint(i)\n", + " self.report()\n", + "\n", + " @classmethod\n", + " def test(cls, function, data):\n", + " cls(function, data).run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1ff9de0", + "metadata": {}, + "outputs": [], + "source": [ + "test[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b88ec79", + "metadata": {}, + "outputs": [], + "source": [ + "FINETUNED_MODEL = \"qshaikh/llama3-new-pricer-2025-10-29_00.32.54-size15000\"\n", + "fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)\n", + "print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")\n", + 
"fine_tuned_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb2d3a22", + "metadata": {}, + "outputs": [], + "source": [ + "top_K = 3\n", + "\n", + "def improved_model_predict(prompt, device=\"cuda\"):\n", + " set_seed(42) \n", + " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(device)\n", + " attention_mask = torch.ones(inputs.shape, device=device)\n", + "\n", + " with torch.no_grad(): \n", + " outputs = fine_tuned_model(inputs, attention_mask=attention_mask)\n", + " next_token_logits = outputs.logits[:, -1, :].to('cpu')\n", + "\n", + " next_token_probs = F.softmax(next_token_logits, dim=-1)\n", + " top_prob, top_token_id = next_token_probs.topk(top_K)\n", + "\n", + " prices, weights = [], [] \n", + "\n", + " for i in range(top_K):\n", + " predicted_token = tokenizer.decode(top_token_id[0][i])\n", + " probability = top_prob[0][i]\n", + "\n", + " try:\n", + " result = float(predicted_token)\n", + " except ValueError as e:\n", + " result = 0.0\n", + "\n", + " if result > 0:\n", + " prices.append(result)\n", + " weights.append(probability)\n", + "\n", + " if not prices:\n", + " return 0.0, 0.0\n", + "\n", + " total = sum(weights)\n", + "\n", + " weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]\n", + "\n", + " return sum(weighted_prices).item()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc47ff1e", + "metadata": {}, + "outputs": [], + "source": [ + "improved_model_predict(test[0][\"text\"], device=\"cuda\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dbecfff", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(improved_model_predict, test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}