# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""

# My pip installations. Some of them were not used in the project, but I
# initially planned to use them and left them here so that after the project
# I can revisit the notebook and update it when I have time and more sources.
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn

# All imports
import os, random, json, re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from IPython.display import Markdown as md
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)

# I tried a lot of models and every time my Colab ran out of RAM. I did a lot
# of tweaking, including replacing models with smaller ones, and used trial
# and error to find a sample size, eval size, etc. that would fit in my
# limited T4 memory (I started from a sample size of 15k and went down to 200).
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42

# Setting LoRA hyperparameters
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
# Target modules to apply LoRA to. I kept these to just "q_proj" and "v_proj"
# to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]

# To make sure the experiment is reproducible
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# HF data
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"

# Loading the data
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()

from datasets import Dataset, DatasetDict

# This took forever to run, which made me update it.
# Split into train/eval
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# Reduce dataset sizes for quick experimentation
MAX_TRAIN_SAMPLES = 2000  # lower this if you want it even faster
MAX_EVAL_SAMPLES = 500

train_dataset = train_dataset.shuffle(seed=42).select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset))))
eval_dataset = eval_dataset.shuffle(seed=42).select(range(min(MAX_EVAL_SAMPLES, len(eval_dataset))))

# Wrap into a DatasetDict for Trainer compatibility
dataset = DatasetDict({"train": train_dataset, "eval": eval_dataset})

# Prepare columns for preprocessing:
# rename the relevant columns to match what preprocess_function expects.
dataset = dataset.rename_columns({
    "title": "input",
    "price": "output"
})

# Add a fixed instruction since the dataset doesn't have one
def add_instruction(example):
    example["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number."
    return example

dataset = dataset.map(add_instruction)
print(dataset)
print(dataset["train"][0])
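# Note that the renamed "output" column still holds the raw price field, which
# is often missing in this metadata subset; such rows end up with the literal
# string "None" as a training target once the prompt is formatted. A hedged
# sketch (my addition, with a hypothetical helper name) of how they could be
# dropped:
def has_valid_price(example):
    try:
        return example["output"] is not None and float(example["output"]) > 0
    except (TypeError, ValueError):
        return False

# dataset = dataset.filter(has_valid_price)  # left commented out to keep the original behavior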
# Some cleaning on prices
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")

# Bringing the text fields together
def combine_text(row):
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"

df["text"] = df.apply(combine_text, axis=1)
df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)

# Print the data showing price and price_clean to confirm the cleaned values
# are real and not all 0 (this has to come after df_clean is defined above)
print(df_clean[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}")

# Downsampling for RAM purposes, hoping the penalty to results isn't too big
if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE:
    df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)
else:
    df_sample = df_clean.copy()

train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED)

# How to format the examples
def make_example(row):
    instruction = "Estimate the fair market price of this product in USD. Return only a single number."
    input_text = row["text"]
    output = f"{float(row['price_clean']):.2f}"
    return {"instruction": instruction, "input": input_text, "output": output}

train_examples = [make_example(r) for _, r in train_df.iterrows()]
eval_examples = [make_example(r) for _, r in eval_df.iterrows()]

# Saving into JSONL
with open("pricing_train.jsonl", "w") as f:
    for ex in train_examples:
        f.write(json.dumps(ex) + "\n")

with open("pricing_eval.jsonl", "w") as f:
    for ex in eval_examples:
        f.write(json.dumps(ex) + "\n")

# Check the prices exist in the JSONL saved above
with open("pricing_train.jsonl") as f:
    lines = [json.loads(line) for line in f]

print("Sample outputs from training data:")
for ex in lines[:5]:
    print(ex["output"])

# A good prompt format for the LLM
def format_for_model(ex):
    return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"

# Seeing the examples
print("Example formatted prompts (3):")
# Iterating over the examples
for ex in train_examples[:3]:
    print(format_for_model(ex))
    print("-" * 80)

# Tokenization now
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

# Check that the model loaded successfully
print(f"{MODEL_NAME} loaded successfully")

# Sample a random subset for the baseline evaluation
sample_eval = random.sample(eval_examples, 10)
baseline_preds, baseline_truths = [], []

# Iterating over the eval samples
for ex in sample_eval:
    prompt = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # temperature is ignored under greedy decoding, so do_sample=False is enough
    output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    text_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract a numeric prediction from the model output
    match = re.search(r"\$?(\d+(\.\d+)?)", text_output)
    pred_price = float(match.group(1)) if match else None
    true_price = float(ex["output"])

    if pred_price is not None:
        baseline_preds.append(pred_price)
        baseline_truths.append(true_price)
        print(f"Predicted: {pred_price}, True: {true_price}")

# Manual computation of metrics
if baseline_preds:
    mae = mean_absolute_error(baseline_truths, baseline_preds)
    mse = mean_squared_error(baseline_truths, baseline_preds)
    rmse = mse ** 0.5  # take the square root manually
    print(f"\nBaseline MAE: ${mae:.2f}")
    print(f"Baseline RMSE: ${rmse:.2f}")
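# The regex above searches the full decoded text, which still contains the
# prompt, so it can match a number from the product description rather than
# the model's answer. A hedged helper (my addition, not part of the original
# notebook) that parses only what comes after the final "### Response:" marker:
def extract_price(decoded_text):
    response = decoded_text.split("### Response:")[-1]
    match = re.search(r"\$?(\d+(?:\.\d+)?)", response)
    return float(match.group(1)) if match else None

# e.g. extract_price(text_output) could replace the re.search call in the
# baseline loop above; the fine-tuned evaluation below has the same caveat.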
# Inspect the data a little
print(dataset)
print(dataset["train"].column_names)
print(dataset["train"][0])  # show one sample

def preprocess_function(examples):
    prompts = [
        f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}"
        for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"])
    ]
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=MAX_LENGTH)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Updated for faster experiments
training_args = TrainingArguments(
    output_dir="./price-predictor-checkpoints",
    num_train_epochs=1,  # changed from 2 to 1
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)

# Our trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Outcomes
train_result = trainer.train()
trainer.save_model("./price-predictor-finetuned")
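# Note: the LoRA hyperparameters defined at the top (LORA_R, LORA_ALPHA,
# LORA_DROPOUT, TARGET_MODULES) were never actually applied, so the Trainer
# above performed full fine-tuning of OPT-125M. A minimal sketch of how they
# could be wired in with peft, assuming the peft install above; this is my
# illustration of the intended setup, not what the run above did:
from peft import LoraConfig, get_peft_model

def wrap_with_lora(base_model):
    """Hedged sketch: apply the LoRA settings defined at the top of the notebook."""
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base_model, lora_config)
    peft_model.print_trainable_parameters()  # shows the small trainable fraction
    return peft_model

# To use it, one would call `model = wrap_with_lora(model)` before constructing
# the Trainer; left uncalled here so the notebook's actual run is unchanged.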
# Loading the fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned")

# Inspect one example from the fine-tuning eval dataset before using it
print("Inspecting evaluation examples (should have instruction, input, output):")
with open("pricing_eval.jsonl") as f:
    sample_eval = [json.loads(line) for line in f][:3]  # just a few samples
for ex in sample_eval:
    print(json.dumps(ex, indent=2))

eval_dataset_small = load_dataset("json", data_files="pricing_eval.jsonl")["train"]
eval_dataset_small = eval_dataset_small.shuffle(seed=42).select(range(min(50, len(eval_dataset_small))))

for ex in eval_dataset_small.select(range(5)):
    print("Output price:", ex["output"])

# Iterate over the eval set with a tqdm progress bar
pred_prices, true_prices = [], []  # initialise the prediction/truth lists
for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20)
    text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract a numeric prediction
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    pred = float(numbers[-1]) if numbers else np.nan
    pred_prices.append(pred)
    true_prices.append(float(ex["output"]))

# --- Fix length mismatch and mask NaNs ---
pred_prices = np.array(pred_prices, dtype=float)
true_prices = np.array(true_prices, dtype=float)

# Ensure both arrays are the same length
min_len = min(len(pred_prices), len(true_prices))
pred_prices = pred_prices[:min_len]
true_prices = true_prices[:min_len]

# Filter out NaNs and nonsensically large predictions (anything above $10k)
mask = (~np.isnan(pred_prices)) & (pred_prices < 10000)
pred_prices = pred_prices[mask]
true_prices = true_prices[mask]

print("Arrays aligned:")
print("Preds:", len(pred_prices), "Truths:", len(true_prices))

# Compute metrics manually again
mae = mean_absolute_error(true_prices, pred_prices)
mse = mean_squared_error(true_prices, pred_prices)
rmse = np.sqrt(mse)
r2 = r2_score(true_prices, pred_prices)
print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")

# See what was predicted
plt.figure(figsize=(6, 6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices")
plt.legend()
plt.grid(True)
plt.show()

# Zoom in on the lower price range
plt.figure(figsize=(6, 6))
plt.scatter(true_prices, pred_prices, alpha=0.6)
plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices (Zoomed In)")
plt.ylim(0, 600)  # zoom the y-axis
plt.legend()
plt.grid(True)
plt.show()

# Check the error distribution
errors = np.abs(pred_prices - true_prices)
plt.figure(figsize=(8, 4))
plt.hist(errors, bins=30, edgecolor='k', alpha=0.7)
plt.title("Distribution of Absolute Errors")
plt.xlabel("Absolute Error ($)")
plt.ylabel("Frequency")
plt.show()

print(f"Average Error: ${np.mean(errors):.2f}, Median Error: ${np.median(errors):.2f}")

# Load the base model again to compare against the fine-tuned outputs
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Sample 5 examples from eval_df
examples = eval_df.sample(5, random_state=42)

for i, row in examples.iterrows():
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{row['text']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20)
    prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\n--- Example {i} ---")
    print("Prompt:\n", prompt[:200], "...")
    print("Model output:\n", prediction_text)
    print("Actual price:", row["price_clean"])
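# Hedged follow-up (my addition): the comparison above only prints raw
# generations. To put a number on it, the same parsing can score the base
# model on these five examples, reusing the extract_price helper sketched
# after the baseline loop:
base_preds, base_truths = [], []
for _, row in examples.iterrows():
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{row['text']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=20)
    pred = extract_price(tokenizer.decode(out[0], skip_special_tokens=True))
    if pred is not None:
        base_preds.append(pred)
        base_truths.append(float(row["price_clean"]))

if base_preds:
    print(f"Base model MAE on this tiny sample: ${mean_absolute_error(base_truths, base_preds):.2f}")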