# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""

# My pip installations. Some of them were not used in the project, but I initially planned
# to use them and left them here so that I can revisit the notebook after the project and
# update it when I have time and more sources.
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn

# All imports
import os, random, json, re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datasets import load_dataset, Dataset, DatasetDict
from IPython.display import Markdown as md
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# I tried a lot of models and every time my Colab ran out of RAM, so I did a lot of tweaking,
# including replacing models with smaller ones. I also used trial and error to find a sample
# size, eval size, etc. that would fit in my limited T4 RAM (I started from a sample size of
# 15k and went down to 200).
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42

# Setting LoRA hyperparameters
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
# Target modules to apply LoRA to. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]

# To make sure the experiment is reproducible
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# HF data
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"

# Loading the data
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()

# Some cleaning on prices
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")

# Bringing the text fields together
def combine_text(row):
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"

df["text"] = df.apply(combine_text, axis=1)
df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)

# Example: sample the cleaned data for faster experimentation
df_sample = df_clean.sample(n=5000, random_state=42)  # adjust n as needed
train_df, eval_df = train_test_split(df_sample, test_size=0.2, random_state=42)

print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}")
train_df.head(2)

hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_eval = Dataset.from_pandas(eval_df.reset_index(drop=True))
dataset = DatasetDict({"train": hf_train, "eval": hf_eval})
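# Quick sanity check of the log1p/expm1 round-trip used for the target encoding below
# (a minimal sketch with a made-up example price, just to confirm the decode step at
# evaluation time recovers the original value).
_example_price = 49.99
_encoded = np.log1p(_example_price)   # value the model is trained to produce
_decoded = np.expm1(_encoded)         # value we recover at evaluation time
print(f"log1p round-trip check: {_example_price} -> {_encoded:.6f} -> {_decoded:.2f}")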
# Add instruction + numeric target (log-transformed)
def add_instruction_and_target(ex):
    ex["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number."
    price = float(ex["price_clean"])
    ex["target_log"] = np.log1p(price)            # log1p makes training easier
    ex["target_str"] = f"{ex['target_log']:.6f}"  # as string for the LM
    return ex

dataset = dataset.map(add_instruction_and_target)
print(dataset)
print(dataset["train"][0])

print(df_clean[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}")

# Trying to downsample for RAM purposes, hoping the penalty to the results wasn't too big
if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE:
    df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)
else:
    df_sample = df_clean.copy()

train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED)

# How to format the examples
def make_example(row):
    instruction = "Estimate the fair market price of this product in USD. Return only a single number."
    input_text = row["text"]
    output = f"{float(row['price_clean']):.2f}"
    return {"instruction": instruction, "input": input_text, "output": output}

train_examples = [make_example(r) for _, r in train_df.iterrows()]
eval_examples = [make_example(r) for _, r in eval_df.iterrows()]

# Saving into JSONL
with open("pricing_train.jsonl", "w") as f:
    for ex in train_examples:
        f.write(json.dumps(ex) + "\n")

with open("pricing_eval.jsonl", "w") as f:
    for ex in eval_examples:
        f.write(json.dumps(ex) + "\n")

# A good formatting for the LLM
def format_for_model(ex):
    return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"

# Seeing the examples
print("Example formatted prompts (3):")
# Iterating over the examples
for ex in train_examples[:3]:
    print(format_for_model(ex))
    print("-" * 80)

# Tokenization now
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
# Check if the model loaded successfully
print(f"{MODEL_NAME} succeeded")

# ===== Tokenization & dataset preprocessing =====
MAX_LENGTH = 128

def preprocess_for_training(examples):
    input_ids, attention_masks, labels = [], [], []
    for instr, inp, tgt in zip(examples["instruction"], examples["text"], examples["target_str"]):
        prompt = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n"
        full = prompt + tgt
        tok_full = tokenizer(full, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        tok_prompt = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        inp_ids = tok_full["input_ids"]
        attn_mask = tok_full["attention_mask"]
        # Mask the prompt tokens with -100 so the loss is only computed on the response (the price string)
        prompt_len = sum(1 for t in tok_prompt["input_ids"] if t != tokenizer.pad_token_id)
        label_ids = [-100] * prompt_len + inp_ids[prompt_len:]
        label_ids = label_ids[:MAX_LENGTH]
        input_ids.append(inp_ids)
        attention_masks.append(attn_mask)
        labels.append(label_ids)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }

# Map this to the dataset
tokenized_datasets = dataset.map(
    preprocess_for_training,
    batched=True,
    remove_columns=dataset["train"].column_names
)
print("Tokenization complete:", tokenized_datasets)
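# Sanity check on the preprocessing above (a minimal sketch using the first training
# example): decode the tokens whose labels are not -100 and confirm they correspond to
# the response (the log-price string), not the prompt. If the product text was long
# enough that the prompt was truncated at MAX_LENGTH, this will come back empty,
# meaning that example contributes no learning signal.
_sample = tokenized_datasets["train"][0]
_response_ids = [t for t, lab in zip(_sample["input_ids"], _sample["labels"]) if lab != -100]
print("Unmasked (response) tokens decode to:",
      repr(tokenizer.decode(_response_ids, skip_special_tokens=True)))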
" f"Return only a number — no text, no currency symbols.\n\n" f"### Input:\n{ex['input']}\n\n### Response:" ) inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=20, temperature=0.0, do_sample=False ) generated_tokens = output[0][inputs["input_ids"].shape[-1]:] text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip() # Extract number from output match = re.search(r"[-+]?\d*\.?\d+", text_output) pred_price = float(match.group()) if match else None # Apply inverse log if model outputs logs if pred_price is not None: if pred_price < 20: # log range, assuming trained on log(price) pred_price = np.expm1(pred_price) else: print(f" Skipping log transform, raw pred {pred_price}") # Only keep realistic prices if 0 < pred_price < 1e4: baseline_preds.append(pred_price) baseline_truths.append(true_price) else: print(f" Skipping unreasonable pred {pred_price}") else: print(f" No number extracted from: {text_output}") print(f" Predicted: {pred_price}, True: {true_price}, Raw: {text_output}") print(f"\nNumber of baseline predictions: {len(baseline_preds)}") print(f"Number of baseline truths: {len(baseline_truths)}") # Manual computation of metrics if baseline_preds: mae = mean_absolute_error(baseline_truths, baseline_preds) mse = mean_squared_error(baseline_truths, baseline_preds) rmse = mse ** 0.5 # take square root manually print(f"\nBaseline MAE: ${mae:.2f}") print(f"Baseline RMSE: ${rmse:.2f}") print(f"Number of baseline predictions: {len(baseline_preds)}") print(f"Number of baseline truths: {len(baseline_truths)}") #inspectthe data a little print(dataset) print(dataset["train"].column_names) print(dataset["train"][0]) # show one sample # create TrainingArguments and Trainer, compatible with older transformers === import transformers print("transformers version:", transformers.__version__) # decide whether evaluation_strategy is supported supports_eval_strategy = False try: import inspect sig = inspect.signature(transformers.TrainingArguments.__init__) if 'evaluation_strategy' in sig.parameters: supports_eval_strategy = True except Exception: # fallback: assume older version supports_eval_strategy = False from transformers import TrainingArguments, Trainer # common args common_args = dict( output_dir="./price-predictor-checkpoints", num_train_epochs=3, per_device_train_batch_size=2, gradient_accumulation_steps=4, learning_rate=2e-5, fp16=True, # if you see fp16 errors, set this to False save_total_limit=1, logging_steps=10, report_to="none", ) if supports_eval_strategy: print("Using evaluation_strategy in TrainingArguments (newer transformers).") training_args = TrainingArguments( **common_args, evaluation_strategy="steps", eval_steps=100, save_strategy="steps", save_steps=200, ) else: print("evaluation_strategy not supported in this transformers version. 
# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer
)

# Train
train_result = trainer.train()

# If evaluation_strategy wasn't available, run a manual evaluation here
if not supports_eval_strategy:
    print("Running manual evaluation because evaluation_strategy was not available...")
    eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["eval"])
    print("Trainer.evaluate metrics:", eval_metrics)

# Save model and tokenizer
trainer.save_model("./price-predictor-finetuned")
try:
    tokenizer.save_pretrained("./price-predictor-finetuned")
except Exception as e:
    print("Could not save tokenizer:", e)

# Outcomes
# train_result = trainer.train()
trainer.save_model("./price-predictor-finetuned")

# Loading the fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned")

# Small evaluation subset
eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"]))))

pred_prices, true_prices = [], []

for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    # Check which column exists
    if "target_str" in ex:
        true_price_log = float(ex["target_str"])
        true_price = np.expm1(true_price_log)  # convert back from log1p
    elif "output" in ex:
        true_price = float(ex["output"])
    elif "price_clean" in ex:
        true_price = float(ex["price_clean"])
    else:
        raise KeyError("No valid price column found in eval example.")

    # Skip invalid prices
    if np.isnan(true_price):
        continue

    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex.get('text', ex.get('input', ''))}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20, temperature=0.2, do_sample=False)
    text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract numeric prediction
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    if not numbers:
        continue
    pred = float(numbers[-1])

    # If you trained on log prices, exponentiate to get back to USD
    if "target_str" in ex:
        pred = np.expm1(pred)

    pred_prices.append(pred)
    true_prices.append(true_price)

# -- Compute metrics --
pred_prices = np.array(pred_prices)
true_prices = np.array(true_prices)

mae = mean_absolute_error(true_prices, pred_prices)
rmse = np.sqrt(mean_squared_error(true_prices, pred_prices))
r2 = r2_score(true_prices, pred_prices)

print(f"\nFine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")
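# A small helper for trying the fine-tuned model on a single product description
# (a minimal sketch; the example description below is made up, and the expm1 decode
# assumes the model was trained on log1p prices as in preprocess_for_training above).
def predict_price(description):
    prompt = (
        "### Instruction:\nEstimate the fair market price of this product in USD. "
        "Return only a single number.\n\n"
        f"### Input:\n{description}\n\n### Response:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    text = tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    match = re.search(r"[-+]?\d*\.?\d+", text)
    return float(np.expm1(float(match.group()))) if match else None

print("Example prediction:",
      predict_price("TITLE: Compact stainless steel toaster\n\nFEATURES: 2 slots\n\nDESCRIPTION: 800W kitchen toaster"))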
# Earlier version of the evaluation loop, kept for reference
# for ex in tqdm(eval_dataset_small, desc="Evaluating"):
#     prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         output = model.generate(**inputs, max_new_tokens=20)
#     text = tokenizer.decode(output[0], skip_special_tokens=True)
#     # Extract numeric prediction
#     numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
#     pred = float(numbers[-1]) if numbers else np.nan
#     pred_prices.append(pred)
#     true_prices.append(float(ex["output"]))

# Filter out invalid predictions
mask = ~np.isnan(pred_prices)
pred_prices = np.array(pred_prices)[mask]
true_prices = np.array(true_prices)[mask]

# Compute metrics manually again
mae = mean_absolute_error(true_prices, pred_prices)
mse = mean_squared_error(true_prices, pred_prices)
rmse = np.sqrt(mse)
r2 = r2_score(true_prices, pred_prices)

print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")

# See what was predicted
plt.figure(figsize=(6, 6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices")
plt.legend()
plt.grid(True)
plt.show()
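# Product prices tend to span a wide range, so the linear scatter above can bunch most
# points near the origin. An optional variant (a small sketch over the same arrays) that
# shows the comparison on log-log axes; zero or negative predictions are simply dropped
# from the log-scale view by matplotlib.
plt.figure(figsize=(6, 6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
plt.plot([min(true_prices), max(true_prices)], [min(true_prices), max(true_prices)],
         'r--', label="Perfect Prediction")
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Actual Price (USD, log scale)")
plt.ylabel("Predicted Price (USD, log scale)")
plt.title("Predicted vs Actual Prices (log-log)")
plt.legend()
plt.grid(True, which="both")
plt.show()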