Week7 exercise

This commit is contained in:
Cosmus Mutuku
2025-10-29 06:23:12 +03:00
parent a8619a1caa
commit d0d7bf22bf
2 changed files with 445 additions and 375 deletions

View File

@@ -0,0 +1,445 @@
# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""
# My pip installations (some of them were not used in the project, but I initially planned to use them; they are left here so that I can revisit the notebook after the project and update it when I have time and more sources).
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn
#All imports
import json
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from IPython.display import Markdown as md
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# I tried many models and Colab kept running out of RAM, so I made a lot of tweaks, including switching to smaller models.
# The sample size, eval size, etc. were found by trial and error so that they fit the limited T4 RAM (started at a sample size of 15k and went down to 200).
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42
# LoRA hyperparameters.
# NOTE(review): the LORA_* values and TARGET_MODULES are not referenced anywhere else in the visible code.
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
# Target modules to apply LoRA to. Kept to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]
# Seed the Python, NumPy and PyTorch RNGs so the experiment is reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Hugging Face dataset name and subset.
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"
# Load the "full" split of the dataset and convert it to a pandas DataFrame.
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()
# Price cleaning: parse prices as numbers; values that cannot be parsed become NaN (errors="coerce").
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")
# Bring the text fields together.
def combine_text(row):
    """Build the model input text from one product row.

    Args:
        row: Mapping-like record (for example a pandas Series) that has
            "title", "features" and "description" entries.

    Returns:
        A string with TITLE, FEATURES and DESCRIPTION sections separated by
        blank lines. Missing or non-sequence values become empty sections.
    """

    def _join(value):
        # DataFrames converted from Arrow-backed datasets typically hold list
        # columns as numpy arrays, so checking for `list` alone would drop
        # every features/description value.
        if isinstance(value, (list, tuple, np.ndarray)):
            return " ".join(str(item) for item in value)
        return ""

    title = row["title"]
    # A missing title can arrive as None or as a float NaN; NaN is truthy, so
    # a plain `or ""` would render it as the text "nan".
    if not title or (isinstance(title, float) and np.isnan(title)):
        title = ""
    features = _join(row["features"])
    description = _join(row["description"])
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"
df["text"] = df.apply(combine_text, axis=1)
# Keep only the rows whose price parsed to a number.
df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)
# Sample the cleaned data for faster experimentation. DataFrame.sample raises
# when asked for more rows than exist, so cap the request at what is available.
n_sample_rows = min(5000, len(df_clean))  # adjust the cap as needed
df_sample = df_clean.sample(n=n_sample_rows, random_state=RANDOM_SEED)
train_df, eval_df = train_test_split(df_sample, test_size=0.2, random_state=RANDOM_SEED)
print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}")
# A bare `train_df.head(2)` expression shows nothing outside the last line of
# a notebook cell, so print it explicitly.
print(train_df.head(2))
hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_eval = Dataset.from_pandas(eval_df.reset_index(drop=True))
dataset = DatasetDict({"train": hf_train, "eval": hf_eval})
# Attach the instruction and the numeric (log-transformed) target.
import numpy as np


def add_instruction_and_target(ex):
    """Add the prompt instruction and log-price targets to one example.

    The example is updated in place and returned. It must carry a
    "price_clean" value that `float()` accepts.
    """
    ex["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number."
    log_price = np.log1p(float(ex["price_clean"]))  # log1p makes training easier
    ex["target_log"] = log_price
    ex["target_str"] = f"{log_price:.6f}"  # string form for the LM
    return ex
dataset = dataset.map(add_instruction_and_target)
print(dataset)
print(dataset["train"][0])
print(df_clean[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}")
# Downsample for RAM reasons, hoping the hit to the results is small.
wanted_rows = SAMPLE_SIZE + EVAL_SIZE
if len(df_clean) <= wanted_rows:
    # Not enough rows to downsample: use everything.
    df_sample = df_clean.copy()
else:
    df_sample = df_clean.sample(wanted_rows, random_state=RANDOM_SEED)
    df_sample = df_sample.reset_index(drop=True)
train_df, eval_df = train_test_split(
    df_sample,
    test_size=EVAL_SIZE,
    random_state=RANDOM_SEED,
)
# How to format the examples.
def make_example(row):
    """Turn one product row into an instruction/input/output record.

    The price is read from "price_clean" and rendered with two decimals.
    """
    return {
        "instruction": "Estimate the fair market price of this product in USD. Return only a single number.",
        "input": row["text"],
        "output": f"{float(row['price_clean']):.2f}",
    }
train_examples = [make_example(row) for _, row in train_df.iterrows()]
eval_examples = [make_example(row) for _, row in eval_df.iterrows()]
# Save both splits as JSONL: one JSON object per line.
for jsonl_path, records in (
    ("pricing_train.jsonl", train_examples),
    ("pricing_eval.jsonl", eval_examples),
):
    with open(jsonl_path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)
# Prompt layout used for the LLM.
def format_for_model(ex):
    """Render an example as the Instruction / Input / Response prompt text."""
    instruction, input_text, output = ex["instruction"], ex["input"], ex["output"]
    return f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
# Show the first three formatted training prompts, each followed by a rule.
print("Example formatted prompts (3):")
preview = train_examples[:3]
for ex in preview:
    formatted = format_for_model(ex)
    print(formatted)
    print("-"*80)
# Load the tokenizer and the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Only fall back to EOS as the padding token when the tokenizer ships without
# one. Overwriting an existing pad token makes padding indistinguishable from
# genuine EOS tokens for any code that filters on pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
# Reached only when both loads above succeeded.
print(f"{MODEL_NAME} succceeded")
# ===== Tokenization & dataset preprocessing =====
MAX_LENGTH = 128


def preprocess_for_training(examples):
    """Tokenize a batch into fixed-length causal-LM training features.

    The prompt is assembled from three separately tokenized pieces
    (instruction + product text, the response marker, the target) so that
    truncation only ever shortens the product text. Truncating the full
    string from the right, as before, could cut the target off entirely and
    leave an example with nothing to learn from.

    Args:
        examples: Batch mapping with "instruction", "text" and "target_str"
            columns (equal-length lists).

    Returns:
        Dict with "input_ids", "attention_mask" and "labels". Every entry is
        a list of MAX_LENGTH-long integer lists. Labels are -100 on prompt
        and padding positions, so only the response tokens count in the loss.
    """
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id
    # The response marker is identical for every example; tokenize it once.
    marker_ids = tokenizer("\n\n### Response:\n", add_special_tokens=False)["input_ids"]

    input_ids, attention_masks, labels = [], [], []
    for instr, inp, tgt in zip(examples["instruction"], examples["text"], examples["target_str"]):
        # Default special-token handling is kept for the head so that any
        # leading BOS the tokenizer adds stays in place.
        # NOTE(review): assumes the tokenizer does not append EOS here, as is
        # usual for causal-LM tokenizers - confirm when changing MODEL_NAME.
        head_ids = tokenizer(f"### Instruction:\n{instr}\n\n### Input:\n{inp}")["input_ids"]
        answer_ids = tokenizer(tgt, add_special_tokens=False)["input_ids"]
        if eos_id is not None:
            # Teach the model to stop right after the number.
            answer_ids = answer_ids + [eos_id]

        # Reserve room for the marker and the answer; trim only the head.
        head_budget = max(MAX_LENGTH - len(marker_ids) - len(answer_ids), 0)
        prompt_ids = head_ids[:head_budget] + marker_ids
        token_ids = (prompt_ids + answer_ids)[:MAX_LENGTH]
        prompt_len = min(len(prompt_ids), MAX_LENGTH)
        pad_len = MAX_LENGTH - len(token_ids)

        input_ids.append(token_ids + [pad_id] * pad_len)
        # The mask is built from real lengths, not from pad-id comparison, so
        # it stays correct even when pad and EOS/BOS share an id.
        attention_masks.append([1] * len(token_ids) + [0] * pad_len)
        labels.append([-100] * prompt_len + token_ids[prompt_len:] + [-100] * pad_len)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }
# Tokenize every split in batches and drop the raw columns afterwards.
raw_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess_for_training,
    batched=True,
    remove_columns=raw_columns,
)
print(" Tokenization complete:", tokenized_datasets)
# Baseline: score the model loaded above on a small random sample of eval
# examples, before any fine-tuning has happened.
sample_eval = random.sample(eval_examples, min(10, len(eval_examples)))
baseline_preds, baseline_truths = [], []
number_pattern = re.compile(r"[-+]?\d*\.?\d+")
for ex in tqdm(sample_eval, desc="Evaluating"):
    true_price = float(ex["output"])
    prompt = (
        f"### Instruction:\nEstimate the fair market price of this product in USD. "
        f"Return only a number — no text, no currency symbols.\n\n"
        f"### Input:\n{ex['input']}\n\n### Response:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding. `temperature` only matters when sampling, so it is
        # not passed alongside do_sample=False.
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
        )
    # Keep only the newly generated tokens so numbers inside the prompt
    # cannot be mistaken for the prediction.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    # Extract the first number from the output.
    match = number_pattern.search(text_output)
    pred_price = float(match.group()) if match else None
    # The model at this point has not been trained on log prices and the
    # prompt asks for USD, so the number is taken at face value. Running
    # expm1 on small outputs would turn e.g. 9.99 into ~21,800 and discard it.
    if pred_price is None:
        print(f" No number extracted from: {text_output}")
    elif 0 < pred_price < 1e4:
        # Only keep realistic prices.
        baseline_preds.append(pred_price)
        baseline_truths.append(true_price)
    else:
        print(f" Skipping unreasonable pred {pred_price}")
    print(f" Predicted: {pred_price}, True: {true_price}, Raw: {text_output}")
print(f"\nNumber of baseline predictions: {len(baseline_preds)}")
print(f"Number of baseline truths: {len(baseline_truths)}")
# Metrics are only defined when at least one prediction survived.
if baseline_preds:
    mae = mean_absolute_error(baseline_truths, baseline_preds)
    mse = mean_squared_error(baseline_truths, baseline_preds)
    rmse = mse ** 0.5  # take the square root manually
    print(f"\nBaseline MAE: ${mae:.2f}")
    print(f"Baseline RMSE: ${rmse:.2f}")
else:
    print("\nNo usable baseline predictions; skipping baseline metrics.")
# Inspect the data a little.
print(dataset)
print(dataset["train"].column_names)
print(dataset["train"][0])  # show one sample

# Create TrainingArguments in a way that works across transformers versions.
import inspect

import transformers
from transformers import TrainingArguments, Trainer

print("transformers version:", transformers.__version__)

# The periodic-evaluation argument is called `eval_strategy` in newer
# releases and `evaluation_strategy` in older ones. Probing only the old name
# would wrongly report new releases as lacking the feature.
try:
    _ta_params = inspect.signature(TrainingArguments.__init__).parameters
except (TypeError, ValueError):
    # Signature not introspectable: assume neither name is available.
    _ta_params = {}
eval_strategy_key = next(
    (name for name in ("eval_strategy", "evaluation_strategy") if name in _ta_params),
    None,
)
supports_eval_strategy = eval_strategy_key is not None

# Arguments shared by both code paths.
common_args = dict(
    output_dir="./price-predictor-checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # on CPU instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)
if supports_eval_strategy:
    print(f"Using {eval_strategy_key} in TrainingArguments.")
    training_args = TrainingArguments(
        **common_args,
        **{eval_strategy_key: "steps"},
        eval_steps=100,
        save_strategy="steps",
        save_steps=200,
    )
else:
    print("evaluation_strategy not supported in this transformers version. Using minimal TrainingArguments and running trainer.evaluate() after training.")
    # Very old versions may also reject fp16 or report_to; if errors appear,
    # remove those entries from common_args.
    training_args = TrainingArguments(**common_args)
# Build the Trainer. Newer transformers releases take the tokenizer through
# `processing_class`; older ones only know `tokenizer`. Pick whichever keyword
# the installed Trainer accepts.
import inspect

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    **{_tokenizer_key: tokenizer},
)
# Train.
train_result = trainer.train()
# Without periodic evaluation during training, evaluate once here.
if not supports_eval_strategy:
    print("Running manual evaluation because evaluation_strategy was not available...")
    eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["eval"])
    print("Trainer.evaluate metrics:", eval_metrics)
# Save the model once; a second save_model call to the same directory would
# only rewrite identical files.
trainer.save_model("./price-predictor-finetuned")
# Saving the tokenizer is best-effort: report a failure and carry on.
try:
    tokenizer.save_pretrained("./price-predictor-finetuned")
except Exception as e:
    print("Could not save tokenizer:", e)
# Load the fine-tuned model and tokenizer back from disk.
model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned")
# Small evaluation subset (at most 50 examples).
eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"]))))
pred_prices, true_prices = [], []
number_pattern = re.compile(r"[-+]?\d*\.\d+|\d+")
for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    # The true price may live under different columns depending on how the
    # example was built; "target_str" holds log1p(price).
    trained_on_log = "target_str" in ex
    if trained_on_log:
        true_price = np.expm1(float(ex["target_str"]))  # convert back from log1p
    elif "output" in ex:
        true_price = float(ex["output"])
    elif "price_clean" in ex:
        true_price = float(ex["price_clean"])
    else:
        raise KeyError("No valid price column found in eval example.")
    # Skip invalid prices.
    if np.isnan(true_price):
        continue
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex.get('text', ex.get('input', ''))}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding; `temperature` has no effect without sampling.
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    # Decode only the generated continuation. Decoding the whole sequence
    # would let a number from the product text stand in for the prediction
    # whenever the model emits none itself.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # The answer is the first number after the response marker; anything the
    # model rambles on with afterwards is ignored.
    match = number_pattern.search(text)
    if match is None:
        continue
    pred = float(match.group())
    # The model was trained on log prices, so map back to USD.
    if trained_on_log:
        with np.errstate(over="ignore"):
            pred = np.expm1(pred)
    # expm1 overflows to inf for large outputs, and inf would make the
    # metric functions fail; drop such predictions.
    if not np.isfinite(pred):
        continue
    pred_prices.append(pred)
    true_prices.append(true_price)
# -- Compute metrics --
pred_prices = np.asarray(pred_prices, dtype=float)
true_prices = np.asarray(true_prices, dtype=float)
# Filter invalid pairs before scoring: the metric functions reject NaN/inf,
# so the mask has to come first rather than after a first metrics pass.
mask = np.isfinite(pred_prices) & np.isfinite(true_prices)
pred_prices = pred_prices[mask]
true_prices = true_prices[mask]
if pred_prices.size == 0:
    # The metric functions raise on empty input; report instead of crashing.
    print("\nFine-Tuned Evaluation: no valid predictions to score.")
else:
    mae = mean_absolute_error(true_prices, pred_prices)
    mse = mean_squared_error(true_prices, pred_prices)
    rmse = np.sqrt(mse)
    r2 = r2_score(true_prices, pred_prices)
    print(f"\nFine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")
# Scatter the predictions against the true prices, with the diagonal that a
# perfect predictor would sit on.
plt.figure(figsize=(6,6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
diagonal_end = max(true_prices)
diagonal = [0, diagonal_end]
plt.plot(diagonal, diagonal, 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices")
plt.legend()
plt.grid(True)
plt.show()

View File

@@ -1,375 +0,0 @@
# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""
# My pip installations (some of them were not used in the project, but I initially planned to use them; they are left here so that I can revisit the notebook after the project and update it when I have time and more sources).
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn
#All imports
import os, random, json, re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from IPython.display import Markdown as md
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
# I tried many models and Colab kept running out of RAM, so I made a lot of tweaks, including switching to smaller models.
# The sample size, eval size, etc. were found by trial and error so that they fit the limited T4 RAM (started at a sample size of 15k and went down to 200).
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42
# LoRA hyperparameters.
# NOTE(review): the LORA_* values and TARGET_MODULES are not referenced anywhere else in the visible code.
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
# Target modules to apply LoRA to. Kept to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]
# Seed the Python, NumPy and PyTorch RNGs so the experiment is reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Hugging Face dataset name and subset.
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"
# Load the data and keep a pandas copy of the full split.
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()
from datasets import Dataset, DatasetDict

# Split into train/eval (working on the full data took too long).
split = dataset.train_test_split(test_size=0.2, seed=42)
# Caps on the split sizes for quick experimentation.
MAX_TRAIN_SAMPLES = 2000
MAX_EVAL_SAMPLES = 500
train_dataset = split["train"]
train_dataset = train_dataset.shuffle(seed=42)
train_dataset = train_dataset.select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset))))
eval_dataset = split["test"]
eval_dataset = eval_dataset.shuffle(seed=42)
eval_dataset = eval_dataset.select(range(min(MAX_EVAL_SAMPLES, len(eval_dataset))))
# Wrap into a DatasetDict for Trainer compatibility.
dataset = DatasetDict({"train": train_dataset, "eval": eval_dataset})
# Rename the relevant columns to what preprocess_function expects.
column_renames = {
    "title": "input",
    "price": "output",
}
dataset = dataset.rename_columns(column_renames)
# Attach a fixed instruction, since the dataset does not provide one.
def add_instruction(example):
    """Set the shared pricing instruction on the example and return it."""
    instruction_text = "Estimate the fair market price of this product in USD. Return only a single number."
    example["instruction"] = instruction_text
    return example
dataset = dataset.map(add_instruction)
print(dataset)
print(dataset["train"][0])
# Price cleaning: parse prices as numbers; unparseable values become NaN.
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")
# Show raw vs. cleaned prices to confirm they are real values and not all 0.
# `df_clean` is only built further down the script, so the priced rows are
# derived from `df` here instead of using a name that does not exist yet.
priced_rows = df.dropna(subset=["price_clean"])
print(priced_rows[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df['price_clean'].notna().sum()}")
# Bring the text fields together.
def combine_text(row):
    """Build the model input text from one product row.

    Args:
        row: Mapping-like record (for example a pandas Series) that has
            "title", "features" and "description" entries.

    Returns:
        A string with TITLE, FEATURES and DESCRIPTION sections separated by
        blank lines. Missing or non-sequence values become empty sections.
    """

    def _join(value):
        # DataFrames converted from Arrow-backed datasets typically hold list
        # columns as numpy arrays, so checking for `list` alone would drop
        # every features/description value.
        if isinstance(value, (list, tuple, np.ndarray)):
            return " ".join(str(item) for item in value)
        return ""

    title = row["title"]
    # A missing title can arrive as None or as a float NaN; NaN is truthy, so
    # a plain `or ""` would render it as the text "nan".
    if not title or (isinstance(title, float) and np.isnan(title)):
        title = ""
    features = _join(row["features"])
    description = _join(row["description"])
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"
df["text"] = df.apply(combine_text, axis=1)
# Keep only the rows whose price parsed to a number.
df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)
# Downsample for RAM reasons, hoping the hit to the results is small.
wanted_rows = SAMPLE_SIZE + EVAL_SIZE
if len(df_clean) <= wanted_rows:
    # Not enough rows to downsample: use everything.
    df_sample = df_clean.copy()
else:
    df_sample = df_clean.sample(wanted_rows, random_state=RANDOM_SEED)
    df_sample = df_sample.reset_index(drop=True)
train_df, eval_df = train_test_split(
    df_sample,
    test_size=EVAL_SIZE,
    random_state=RANDOM_SEED,
)
# How to format the examples.
def make_example(row):
    """Turn one product row into an instruction/input/output record.

    The price is read from "price_clean" and rendered with two decimals.
    """
    return {
        "instruction": "Estimate the fair market price of this product in USD. Return only a single number.",
        "input": row["text"],
        "output": f"{float(row['price_clean']):.2f}",
    }
train_examples = [make_example(row) for _, row in train_df.iterrows()]
eval_examples = [make_example(row) for _, row in eval_df.iterrows()]
# Save both splits as JSONL: one JSON object per line.
for jsonl_path, records in (
    ("pricing_train.jsonl", train_examples),
    ("pricing_eval.jsonl", eval_examples),
):
    with open(jsonl_path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)
# Read the training file back to check that the prices made it into the JSONL.
with open("pricing_train.jsonl") as f:
    lines = [json.loads(raw_line) for raw_line in f]
print("Sample outputs from training data:")
for ex in lines[:5]:
    print(ex["output"])
# Prompt layout used for the LLM.
def format_for_model(ex):
    """Render an example as the Instruction / Input / Response prompt text."""
    instruction, input_text, output = ex["instruction"], ex["input"], ex["output"]
    return f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
# Show the first three formatted training prompts, each followed by a rule.
print("Example formatted prompts (3):")
preview = train_examples[:3]
for ex in preview:
    formatted = format_for_model(ex)
    print(formatted)
    print("-"*80)
# Load the tokenizer and the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Only fall back to EOS as the padding token when the tokenizer ships without
# one. Overwriting an existing pad token makes padding indistinguishable from
# genuine EOS tokens for any code that filters on pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
# Reached only when both loads above succeeded.
print(f"{MODEL_NAME} succceeded")
# Baseline: score the model loaded above on a small random sample of eval
# examples.
sample_eval = random.sample(eval_examples, min(10, len(eval_examples)))
baseline_preds, baseline_truths = [], []
price_pattern = re.compile(r"\$?(\d+(\.\d+)?)")
for ex in sample_eval:
    prompt = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # No gradients are needed for generation.
    with torch.no_grad():
        # Greedy decoding; `temperature` has no effect without sampling.
        output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    # Decode only the generated continuation. Searching the whole sequence
    # would return the first number of the product text in the prompt
    # rather than the model's answer.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # Extract the numeric prediction from the model output.
    match = price_pattern.search(text_output)
    pred_price = float(match.group(1)) if match else None
    true_price = float(ex["output"])
    if pred_price is not None:
        baseline_preds.append(pred_price)
        baseline_truths.append(true_price)
    print(f"Predicted: {pred_price}, True: {true_price}")
# Metrics are only defined when at least one prediction was extracted.
if baseline_preds:
    mae = mean_absolute_error(baseline_truths, baseline_preds)
    mse = mean_squared_error(baseline_truths, baseline_preds)
    rmse = mse ** 0.5  # take the square root manually
    print(f"\nBaseline MAE: ${mae:.2f}")
    print(f"Baseline RMSE: ${rmse:.2f}")
else:
    print("\nNo usable baseline predictions; skipping baseline metrics.")
# Inspect the data a little: the dataset summary, the train columns and one sample.
print(dataset)
train_columns = dataset["train"].column_names
print(train_columns)
first_sample = dataset["train"][0]
print(first_sample)
def preprocess_function(examples):
    """Tokenize a batch of instruction/input/output rows as full prompts.

    Each row is rendered in the Instruction / Input / Response layout and the
    batch is tokenized with truncation and padding to MAX_LENGTH.
    """
    prompts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instr, inp, out in rows:
        prompts.append(f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}")
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=MAX_LENGTH)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Causal-LM collator (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Settings tuned for faster experimentation.
training_args = TrainingArguments(
    output_dir="./price-predictor-checkpoints",
    num_train_epochs=1,  # reduced from 2 to 1
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # on CPU instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)
# Build the Trainer. Newer transformers releases take the tokenizer through
# `processing_class`; older ones only know `tokenizer`. Pick whichever keyword
# the installed Trainer accepts.
import inspect

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)
# Train, save the result, then load the fine-tuned model and tokenizer back.
finetuned_dir = "./price-predictor-finetuned"
train_result = trainer.train()
trainer.save_model(finetuned_dir)
model = AutoModelForCausalLM.from_pretrained(finetuned_dir, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
# Look at a few records of the eval file before using it.
print("Inspecting one evaluation example (should have instruction, input, output):")
with open("pricing_eval.jsonl") as f:
    parsed_records = [json.loads(raw_line) for raw_line in f]
sample_eval = parsed_records[:3]  # just a few samples
for ex in sample_eval:
    print(json.dumps(ex, indent=2))
# Reload the same file as a dataset and keep at most 50 shuffled rows.
eval_dataset_small = load_dataset("json", data_files="pricing_eval.jsonl")["train"]
eval_dataset_small = eval_dataset_small.shuffle(seed=42)
eval_dataset_small = eval_dataset_small.select(range(min(50, len(eval_dataset_small))))
for ex in eval_dataset_small.select(range(5)):
    print("Output price:", ex["output"])
# Evaluate the fine-tuned model on the small eval subset.
# The accumulators must exist before the loop appends to them.
pred_prices, true_prices = [], []
number_pattern = re.compile(r"[-+]?\d*\.\d+|\d+")
for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20)
    # Decode only the generated continuation. Decoding the whole sequence
    # would let a number from the product text stand in for the prediction
    # whenever the model emits none itself.
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # The answer is the first number after the response marker; NaN marks
    # "no number found" and is filtered out before scoring.
    match = number_pattern.search(text)
    pred = float(match.group()) if match else np.nan
    pred_prices.append(pred)
    true_prices.append(float(ex["output"]))
# --- Align the arrays and mask out unusable predictions ---
pred_prices = np.array(pred_prices, dtype=float)
true_prices = np.array(true_prices, dtype=float)
# Ensure both arrays have the same length.
min_len = min(len(pred_prices), len(true_prices))
pred_prices = pred_prices[:min_len]
true_prices = true_prices[:min_len]
# Drop NaNs and nonsensically large predictions (above $10k). One mask is
# enough; a second NaN filter afterwards would never remove anything.
mask = (~np.isnan(pred_prices)) & (pred_prices < 10000)
pred_prices = pred_prices[mask]
true_prices = true_prices[mask]
print("Arrays aligned:")
print("Preds:", len(pred_prices), "Truths:", len(true_prices))
if pred_prices.size == 0:
    # The metric functions raise on empty input; report instead of crashing.
    print("Fine-Tuned Evaluation: no valid predictions to score.")
else:
    mae = mean_absolute_error(true_prices, pred_prices)
    mse = mean_squared_error(true_prices, pred_prices)
    rmse = np.sqrt(mse)
    r2 = r2_score(true_prices, pred_prices)
    print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")
# Predicted vs. actual scatter plots: the full view first, then a view with
# the y-axis zoomed to 0-600.
for point_alpha, plot_title, y_limits in (
    (0.5, "Predicted vs Actual Prices", None),
    (0.6, "Predicted vs Actual Prices (Zoomed In)", (0, 600)),
):
    plt.figure(figsize=(6,6))
    plt.scatter(true_prices, pred_prices, alpha=point_alpha)
    diagonal_end = max(true_prices)
    plt.plot([0, diagonal_end], [0, diagonal_end], 'r--', label="Perfect Prediction")
    plt.xlabel("Actual Price (USD)")
    plt.ylabel("Predicted Price (USD)")
    plt.title(plot_title)
    if y_limits is not None:
        plt.ylim(*y_limits)  # zoom the y-axis
    plt.legend()
    plt.grid(True)
    plt.show()
# Distribution of the absolute errors.
errors = np.abs(pred_prices - true_prices)
plt.figure(figsize=(8,4))
plt.hist(errors, bins=30, edgecolor='k', alpha=0.7)
plt.title("Distribution of Absolute Errors")
plt.xlabel("Absolute Error ($)")
plt.ylabel("Frequency")
plt.show()
mean_error = np.mean(errors)
median_error = np.median(errors)
print(f"Average Error: ${mean_error:.2f}, Median Error: ${median_error:.2f}")
# Reload the base model and print its raw output for five eval rows.
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# Five rows sampled from eval_df.
examples = eval_df.sample(5, random_state=42)
for i, row in examples.iterrows():
    product_text = row['text']
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{product_text}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20)
    # The decoded text covers the whole sequence, prompt included.
    prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\n--- Example {i} ---")
    print("Prompt:\n", prompt[:200], "...")
    print("Model output:\n", prediction_text)
    print("Actual price:", row["price_clean"])