From 72f80edcee7d441d773a7c1a2b1484b7f0731bb2 Mon Sep 17 00:00:00 2001
From: Cosmus Mutuku
Date: Mon, 27 Oct 2025 20:32:43 +0300
Subject: [PATCH 1/3] Week7 exercise

---
 .../Week_7_exercise_final.py | 339 ++++++++++++++++++
 1 file changed, 339 insertions(+)
 create mode 100644 week7/community_contributions/Week_7_exercise_final.py

diff --git a/week7/community_contributions/Week_7_exercise_final.py b/week7/community_contributions/Week_7_exercise_final.py
new file mode 100644
index 0000000..efd8207
--- /dev/null
+++ b/week7/community_contributions/Week_7_exercise_final.py
@@ -0,0 +1,339 @@
+# -*- coding: utf-8 -*-
+"""Week7_Exercise.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
+"""
+
+# My pip installations (some of them were not used in the project, but I initially planned to use them and left them here so I can revisit the notebook after the project and update it when I have time and more sources.)
+!pip install -q --upgrade pip
+!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
+!pip install -q wandb
+!pip install -q git+https://github.com/huggingface/peft.git@main
+!pip install datasets==3.0.1
+!pip install evaluate -q
+!pip install --upgrade scikit-learn
+
+# All imports
+import os, random, json, re
+import pandas as pd
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+from sklearn.model_selection import train_test_split
+from datasets import load_dataset
+from IPython.display import Markdown as md
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+
+# I tried a lot of models, and every time my Colab ran out of RAM I made more tweaks, including replacing models with smaller ones.
+# I also used trial and error to find a sample size, eval size, etc. that would fit in my limited T4 RAM (started from a sample size of 15k and came down to 200).
+MODEL_NAME = "facebook/opt-125m"
+SAMPLE_SIZE = 200
+EVAL_SIZE = 50
+MAX_LENGTH = 128
+RANDOM_SEED = 42
+
+# Setting LoRA hyperparameters
+LORA_R = 4
+LORA_ALPHA = 8
+LORA_DROPOUT = 0.05
+# Target modules to apply LoRA to. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
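+# NOTE: a minimal sketch of how these hyperparameters could be wired in with PEFT, assuming the
+# standard LoraConfig / get_peft_model API; the notebook defines them but never attaches LoRA,
+# so the Trainer below fine-tunes the full model. After loading the base model it would look
+# roughly like:
+#
+#   from peft import LoraConfig, get_peft_model
+#   lora_config = LoraConfig(r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
+#                            target_modules=TARGET_MODULES, bias="none", task_type="CAUSAL_LM")
+#   model = get_peft_model(model, lora_config)
+#   model.print_trainable_parameters()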
+TARGET_MODULES = ["q_proj", "v_proj"] + +#to make sure thes expriment is reproducible +random.seed(RANDOM_SEED) +np.random.seed(RANDOM_SEED) +torch.manual_seed(RANDOM_SEED) + +#Hf data +DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023" +SUBSET = "raw_meta_Appliances" + +#loading the data +dataset = load_dataset(DATASET_NAME, SUBSET, split="full") +df = dataset.to_pandas() + +from datasets import Dataset, DatasetDict + +# #this took forever to run making me update it + +# Split into train/eval +split = dataset.train_test_split(test_size=0.2, seed=42) +train_dataset = split["train"] +eval_dataset = split["test"] + +# Reduce dataset sizes for quick experimentation +MAX_TRAIN_SAMPLES = 2000 # or 2000 if you want it even faster +MAX_EVAL_SAMPLES = 500 + +train_dataset = train_dataset.shuffle(seed=42).select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset)))) +eval_dataset = eval_dataset.shuffle(seed=42).select(range(min(MAX_EVAL_SAMPLES, len(eval_dataset)))) + + + +# Wrap into a DatasetDict for Trainer compatibility +dataset = DatasetDict({"train": train_dataset, "eval": eval_dataset}) + +# Prepare columns for your preprocessing +# Rename relevant columns to match what preprocess_function expects +dataset = dataset.rename_columns({ + "title": "input", + "price": "output" +}) + +# Add a fixed instruction since your dataset doesn’t have one +def add_instruction(example): + example["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number." + return example + +dataset = dataset.map(add_instruction) + +print(dataset) +print(dataset["train"][0]) + +# somecleaning on prices +df["price_clean"] = pd.to_numeric(df["price"], errors="coerce") + +# Bringing the text fields togeter +def combine_text(row): + title = row["title"] or "" + features = " ".join(row["features"]) if isinstance(row["features"], list) else "" + description = " ".join(row["description"]) if isinstance(row["description"], list) else "" + return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}" + +df["text"] = df.apply(combine_text, axis=1) + +df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True) + +# trying to downsamble for RAM purposes-hoping the punshment to results wasn't much +if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE: + df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True) +else: + df_sample = df_clean.copy() + +train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED) + +# FHow to format the examples +def make_example(row): + instruction = "Estimate the fair market price of this product in USD. Return only a single number." 
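+    # The prompt input is the combined TITLE/FEATURES/DESCRIPTION text built by combine_text(),
+    # and the target is the cleaned price rendered as a plain two-decimal string for the LM.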
+ input_text = row["text"] + output = f"{float(row['price_clean']):.2f}" + return {"instruction": instruction, "input": input_text, "output": output} + +train_examples = [make_example(r) for _, r in train_df.iterrows()] +eval_examples = [make_example(r) for _, r in eval_df.iterrows()] + +# Saving into JSONL +with open("pricing_train.jsonl", "w") as f: + for ex in train_examples: + f.write(json.dumps(ex) + "\n") + +with open("pricing_eval.jsonl", "w") as f: + for ex in eval_examples: + f.write(json.dumps(ex) + "\n") + +#A good formating for the llm +def format_for_model(ex): + return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}" + +#seeing the examples +print("Example formatted prompts (3):") + +#iterating over the egs +for ex in train_examples[:3]: + print(format_for_model(ex)) + print("-"*80) + +#tokenization now +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +tokenizer.pad_token = tokenizer.eos_token + +model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto") + +#check ifthe model successful +print(f"{MODEL_NAME} succceeded") + +# Sample random evaluation +sample_eval = random.sample(eval_examples, 10) + +baseline_preds, baseline_truths = [], [] + +#iteration over the evals +for ex in sample_eval: + prompt = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:" + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + output = model.generate(**inputs, max_new_tokens=50, temperature=0.2, do_sample=False) + text_output = tokenizer.decode(output[0], skip_special_tokens=True) + + # Extract numeric prediction from model output + match = re.search(r"\$?(\d+(\.\d+)?)", text_output) + pred_price = float(match.group(1)) if match else None + true_price = float(ex["output"]) + + if pred_price is not None: + baseline_preds.append(pred_price) + baseline_truths.append(true_price) + + print(f"Predicted: {pred_price}, True: {true_price}") + +# Manual computation of metrics +if baseline_preds: + mae = mean_absolute_error(baseline_truths, baseline_preds) + mse = mean_squared_error(baseline_truths, baseline_preds) + rmse = mse ** 0.5 # take square root manually + print(f"\nBaseline MAE: ${mae:.2f}") + print(f"Baseline RMSE: ${rmse:.2f}") + +#inspectthe data a little +print(dataset) +print(dataset["train"].column_names) +print(dataset["train"][0]) # show one sample + +def preprocess_function(examples): + prompts = [ + f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}" + for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"]) + ] + return tokenizer(prompts, truncation=True, padding="max_length", max_length=MAX_LENGTH) + +tokenized_datasets = dataset.map(preprocess_function, batched=True) + +data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + +#updated for faster exp +training_args = TrainingArguments( + output_dir="./price-predictor-checkpoints", + num_train_epochs=1, # ⬅️ change from 2 to 1 + per_device_train_batch_size=1, + gradient_accumulation_steps=2, + learning_rate=2e-4, + fp16=True, + save_total_limit=1, + logging_steps=10, + report_to="none", +) + +#our trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["eval"], + tokenizer=tokenizer, + data_collator=data_collator +) + +#outcomes +train_result = trainer.train() +trainer.save_model("./price-predictor-finetuned") + +# Loading fine-tuned model +model = 
AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned") + +#small evaluation subset +eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"])))) +pred_prices, true_prices = [], [] + +# #iteration Over the tqdm +# for ex in tqdm(eval_dataset_small, desc="Evaluating"): +# prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:" +# inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +# with torch.no_grad(): +# output = model.generate(**inputs, max_new_tokens=20) +# text = tokenizer.decode(output[0], skip_special_tokens=True) + +# # Extract numeric prediction +# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) +# pred = float(numbers[-1]) if numbers else np.nan + +# pred_prices.append(pred) +# true_prices.append(float(ex["output"])) + + +# Safe evaluation loop +for ex in tqdm(eval_dataset_small, desc="Evaluating"): + # Skip if output is missing or invalid + try: + true_val = float(ex["output"]) + except (ValueError, TypeError): + continue # skip this example + + prompt = ( + "### Instruction:\nEstimate the fair market price of this product in USD. " + "Return only a single number.\n\n" + f"### Input:\n{ex['input']}\n\n### Response:" + ) + + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + with torch.no_grad(): + output = model.generate(**inputs, max_new_tokens=20) + + text = tokenizer.decode(output[0], skip_special_tokens=True) + numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) + pred = float(numbers[-1]) if numbers else np.nan + + pred_prices.append(pred) + true_prices.append(true_val) + +# Filter out invalid predictions +mask = ~np.isnan(pred_prices) + +pred_prices = np.array(pred_prices)[mask] + +# Convert to numpy arrays and align lengths +pred_prices = np.array(pred_prices, dtype=float) +true_prices = np.array(true_prices, dtype=float) + +# Ensure equal lengths just in case (zip trims to shortest) +min_len = min(len(pred_prices), len(true_prices)) +pred_prices = pred_prices[:min_len] +true_prices = true_prices[:min_len] + +# Drop NaNs safely +mask = ~np.isnan(pred_prices) +pred_prices = pred_prices[mask] +true_prices = true_prices[mask] + +# Compute metrics manually again +mae = mean_absolute_error(true_prices, pred_prices) + +mse = mean_squared_error(true_prices, pred_prices) + +rmse = np.sqrt(mse) + +r2 = r2_score(true_prices, pred_prices) + +print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}") + +#see what was predicted +plt.figure(figsize=(6,6)) +plt.scatter(true_prices, pred_prices, alpha=0.5) +plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction") +plt.xlabel("Actual Price (USD)") +plt.ylabel("Predicted Price (USD)") +plt.title("Predicted vs Actual Prices") +plt.legend() +plt.grid(True) +plt.show() + +#check the distribution +errors = np.abs(pred_prices - true_prices) +plt.figure(figsize=(8,4)) +plt.hist(errors, bins=30, edgecolor='k', alpha=0.7) +plt.title("Distribution of Absolute Errors") +plt.xlabel("Absolute Error ($)") +plt.ylabel("Frequency") +plt.show() + +print(f"Average Error: ${np.mean(errors):.2f}, Median Error: ${np.median(errors):.2f}") \ No newline at end of file From a8619a1caad89fa8dfb2e327b7a5025efefba07d Mon Sep 17 00:00:00 2001 From: Cosmus Mutuku Date: Tue, 28 Oct 2025 07:51:03 +0300 Subject: [PATCH 2/3] Week7 exercise --- 
...xercise_final.py => week7_exercise (7).py} | 130 +++++++++++------- 1 file changed, 83 insertions(+), 47 deletions(-) rename week7/community_contributions/{Week_7_exercise_final.py => week7_exercise (7).py} (76%) diff --git a/week7/community_contributions/Week_7_exercise_final.py b/week7/community_contributions/week7_exercise (7).py similarity index 76% rename from week7/community_contributions/Week_7_exercise_final.py rename to week7/community_contributions/week7_exercise (7).py index efd8207..bf3d5e2 100644 --- a/week7/community_contributions/Week_7_exercise_final.py +++ b/week7/community_contributions/week7_exercise (7).py @@ -104,6 +104,10 @@ print(dataset["train"][0]) # somecleaning on prices df["price_clean"] = pd.to_numeric(df["price"], errors="coerce") +#Print the data showing the price and price cleaned to see they are actual not all 0 +print(df_clean[["title", "price", "price_clean"]].head(10)) +print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}") + # Bringing the text fields togeter def combine_text(row): title = row["title"] or "" @@ -142,6 +146,14 @@ with open("pricing_eval.jsonl", "w") as f: for ex in eval_examples: f.write(json.dumps(ex) + "\n") +#Check the price exists in the Saved JSON aboved +with open("pricing_train.jsonl") as f: + lines = [json.loads(line) for line in f] + +print("Sample outputs from training data:") +for ex in lines[:5]: + print(ex["output"]) + #A good formating for the llm def format_for_model(ex): return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}" @@ -213,7 +225,7 @@ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) #updated for faster exp training_args = TrainingArguments( output_dir="./price-predictor-checkpoints", - num_train_epochs=1, # ⬅️ change from 2 to 1 + num_train_epochs=1, # ⬅change from 2 to 1 per_device_train_batch_size=1, gradient_accumulation_steps=2, learning_rate=2e-4, @@ -241,69 +253,59 @@ trainer.save_model("./price-predictor-finetuned") model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto") tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned") -#small evaluation subset -eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"])))) -pred_prices, true_prices = [], [] +# Inspect one example from your fine-tuning eval dataset before using it +print("Inspecting one evaluation example (should have instruction, input, output):") +with open("pricing_eval.jsonl") as f: + sample_eval = [json.loads(line) for line in f][:3] # just a few samples +for ex in sample_eval: + print(json.dumps(ex, indent=2)) -# #iteration Over the tqdm -# for ex in tqdm(eval_dataset_small, desc="Evaluating"): -# prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. 
Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:" -# inputs = tokenizer(prompt, return_tensors="pt").to(model.device) -# with torch.no_grad(): -# output = model.generate(**inputs, max_new_tokens=20) -# text = tokenizer.decode(output[0], skip_special_tokens=True) +eval_dataset_small = load_dataset("json", data_files="pricing_eval.jsonl")["train"] +eval_dataset_small = eval_dataset_small.shuffle(seed=42).select(range(min(50, len(eval_dataset_small)))) -# # Extract numeric prediction -# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) -# pred = float(numbers[-1]) if numbers else np.nan +for ex in eval_dataset_small.select(range(5)): + print("Output price:", ex["output"]) -# pred_prices.append(pred) -# true_prices.append(float(ex["output"])) - - -# Safe evaluation loop +#iteration Over the tqdm for ex in tqdm(eval_dataset_small, desc="Evaluating"): - # Skip if output is missing or invalid - try: - true_val = float(ex["output"]) - except (ValueError, TypeError): - continue # skip this example - - prompt = ( - "### Instruction:\nEstimate the fair market price of this product in USD. " - "Return only a single number.\n\n" - f"### Input:\n{ex['input']}\n\n### Response:" - ) - + prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:" inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate(**inputs, max_new_tokens=20) - text = tokenizer.decode(output[0], skip_special_tokens=True) + + # Extract numeric prediction numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) pred = float(numbers[-1]) if numbers else np.nan pred_prices.append(pred) - true_prices.append(true_val) + true_prices.append(float(ex["output"])) + +# --- Fix length mismatch and mask NaNs --- +import numpy as np + +pred_prices = np.array(pred_prices, dtype=float) +true_prices = np.array(true_prices, dtype=float) + +# Ensure both arrays are same length +min_len = min(len(pred_prices), len(true_prices)) +pred_prices = pred_prices[:min_len] +true_prices = true_prices[:min_len] + +# Filter out NaNs or nonsensical large predictions +mask = (~np.isnan(pred_prices)) & (pred_prices < 10000) # exclude any predictions above $10k +pred_prices = pred_prices[mask] +true_prices = true_prices[mask] + +print("Arrays aligned:") +print("Preds:", len(pred_prices), "Truths:", len(true_prices)) # Filter out invalid predictions mask = ~np.isnan(pred_prices) pred_prices = np.array(pred_prices)[mask] -# Convert to numpy arrays and align lengths -pred_prices = np.array(pred_prices, dtype=float) -true_prices = np.array(true_prices, dtype=float) - -# Ensure equal lengths just in case (zip trims to shortest) -min_len = min(len(pred_prices), len(true_prices)) -pred_prices = pred_prices[:min_len] -true_prices = true_prices[:min_len] - -# Drop NaNs safely -mask = ~np.isnan(pred_prices) -pred_prices = pred_prices[mask] -true_prices = true_prices[mask] +true_prices = np.array(true_prices)[mask] # Compute metrics manually again mae = mean_absolute_error(true_prices, pred_prices) @@ -327,6 +329,18 @@ plt.legend() plt.grid(True) plt.show() +#Zoom +plt.figure(figsize=(6,6)) +plt.scatter(true_prices, pred_prices, alpha=0.6) +plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction") +plt.xlabel("Actual Price (USD)") +plt.ylabel("Predicted Price (USD)") +plt.title("Predicted vs Actual Prices (Zoomed In)") +plt.ylim(0, 600) # Zoom y-axis +plt.legend() +plt.grid(True) 
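+# ylim only clips the view to the 0-600 USD band so the bulk of points is visible;
+# it does not filter the arrays used for the metrics computed above.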
+plt.show() + #check the distribution errors = np.abs(pred_prices - true_prices) plt.figure(figsize=(8,4)) @@ -336,4 +350,26 @@ plt.xlabel("Absolute Error ($)") plt.ylabel("Frequency") plt.show() -print(f"Average Error: ${np.mean(errors):.2f}, Median Error: ${np.median(errors):.2f}") \ No newline at end of file +print(f"Average Error: ${np.mean(errors):.2f}, Median Error: ${np.median(errors):.2f}") + +# Load the base model +model_name = "facebook/opt-125m" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") + +# Sample 5 examples from your eval_df +examples = eval_df.sample(5, random_state=42) + +for i, row in examples.iterrows(): + prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{row['text']}\n\n### Response:" + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + with torch.no_grad(): + outputs = model.generate(**inputs, max_new_tokens=20) + + prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + + print(f"\n--- Example {i} ---") + print("Prompt:\n", prompt[:200], "...") + print("Model output:\n", prediction_text) + print("Actual price:", row["price_clean"]) \ No newline at end of file From d0d7bf22bf50e442d3b3e45e043d58e34c4c2276 Mon Sep 17 00:00:00 2001 From: Cosmus Mutuku Date: Wed, 29 Oct 2025 06:23:12 +0300 Subject: [PATCH 3/3] Week7 exercise --- .../week7_exercise (10).py | 445 ++++++++++++++++++ .../week7_exercise (7).py | 375 --------------- 2 files changed, 445 insertions(+), 375 deletions(-) create mode 100644 week7/community_contributions/week7_exercise (10).py delete mode 100644 week7/community_contributions/week7_exercise (7).py diff --git a/week7/community_contributions/week7_exercise (10).py b/week7/community_contributions/week7_exercise (10).py new file mode 100644 index 0000000..985ad93 --- /dev/null +++ b/week7/community_contributions/week7_exercise (10).py @@ -0,0 +1,445 @@ +# -*- coding: utf-8 -*- +"""Week7_Exercise.ipynb + +Automatically generated by Colab. 
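+Fine-tunes facebook/opt-125m on Amazon Appliances product metadata to predict
+prices (log1p targets), then compares baseline vs fine-tuned MAE/RMSE/R².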
+ +Original file is located at + https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU +""" + +#my pip installtions (some of them were not used in the project but i initally planned to use them (and we're left here so that I after the project i revisit the notebook and update when i have time and more sources.)) +!pip install -q --upgrade pip +!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors +!pip install -q wandb +!pip install -q git+https://github.com/huggingface/peft.git@main +!pip install datasets==3.0.1 +!pip install evaluate -q +!pip install --upgrade scikit-learn + +#All imports +import os, random, json, re +import pandas as pd +import numpy as np +import torch +import matplotlib.pyplot as plt +from tqdm import tqdm +from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split +from datasets import Dataset, DatasetDict +from IPython.display import Markdown as md +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + TrainingArguments, + Trainer, + DataCollatorForLanguageModeling +) + +#I tried a lot of models and every time my colab ran out of RAM, i did a lot of tweakings including replacing models with smaller ones +#I also used try and error to come up with samples size, eval size etc that would fit in my limited T4 Ram (had started from sample size of 15k down to 200) +MODEL_NAME = "facebook/opt-125m" +SAMPLE_SIZE = 200 +EVAL_SIZE = 50 +MAX_LENGTH = 128 +RANDOM_SEED = 42 + +#Seeting LoRa hyper parameters +LORA_R = 4 +LORA_ALPHA = 8 +LORA_DROPOUT = 0.05 +#Target modules to apply LoRA. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU. +TARGET_MODULES = ["q_proj", "v_proj"] + +#to make sure thes expriment is reproducible +random.seed(RANDOM_SEED) +np.random.seed(RANDOM_SEED) +torch.manual_seed(RANDOM_SEED) + +#Hf data +DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023" +SUBSET = "raw_meta_Appliances" + +#loading the data +dataset = load_dataset(DATASET_NAME, SUBSET, split="full") +df = dataset.to_pandas() + +# somecleaning on prices +df["price_clean"] = pd.to_numeric(df["price"], errors="coerce") + +# Bringing the text fields togeter +def combine_text(row): + title = row["title"] or "" + features = " ".join(row["features"]) if isinstance(row["features"], list) else "" + description = " ".join(row["description"]) if isinstance(row["description"], list) else "" + return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}" + +df["text"] = df.apply(combine_text, axis=1) + +df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True) + +# Example: sample the cleaned data for faster experimentation +df_sample = df_clean.sample(n=5000, random_state=42) # adjust n as needed + +train_df, eval_df = train_test_split(df_sample, test_size=0.2, random_state=42) + +print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}") + +train_df.head(2) + +hf_train = Dataset.from_pandas(train_df.reset_index(drop=True)) +hf_eval = Dataset.from_pandas(eval_df.reset_index(drop=True)) +dataset = DatasetDict({"train": hf_train, "eval": hf_eval}) + +# Add instruction + numeric target (log-transformed) +import numpy as np + +def add_instruction_and_target(ex): + ex["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number." 
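+    # Price distributions are typically right-skewed, so the regression target is log1p(price);
+    # predictions made in this log space are mapped back to USD with np.expm1 at evaluation time.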
+ price = float(ex["price_clean"]) + ex["target_log"] = np.log1p(price) # log1p makes training easier + ex["target_str"] = f"{ex['target_log']:.6f}" # as string for LM + return ex + +dataset = dataset.map(add_instruction_and_target) + +print(dataset) +print(dataset["train"][0]) + +print(df_clean[["title", "price", "price_clean"]].head(10)) +print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}") + +# trying to downsamble for RAM purposes-hoping the punshment to results wasn't much +if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE: + df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True) +else: + df_sample = df_clean.copy() + +train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED) + +# FHow to format the examples +def make_example(row): + instruction = "Estimate the fair market price of this product in USD. Return only a single number." + input_text = row["text"] + output = f"{float(row['price_clean']):.2f}" + return {"instruction": instruction, "input": input_text, "output": output} + +train_examples = [make_example(r) for _, r in train_df.iterrows()] +eval_examples = [make_example(r) for _, r in eval_df.iterrows()] + +# Saving into JSONL +with open("pricing_train.jsonl", "w") as f: + for ex in train_examples: + f.write(json.dumps(ex) + "\n") + +with open("pricing_eval.jsonl", "w") as f: + for ex in eval_examples: + f.write(json.dumps(ex) + "\n") + +#A good formating for the llm +def format_for_model(ex): + return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}" + +#seeing the examples +print("Example formatted prompts (3):") + +#iterating over the egs +for ex in train_examples[:3]: + print(format_for_model(ex)) + print("-"*80) + +#tokenization now +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +tokenizer.pad_token = tokenizer.eos_token + +model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto") + +#check ifthe model successful +print(f"{MODEL_NAME} succceeded") + +# ===== Tokenization & dataset preprocessing ===== +MAX_LENGTH = 128 + +def preprocess_for_training(examples): + input_ids, attention_masks, labels = [], [], [] + + for instr, inp, tgt in zip(examples["instruction"], examples["text"], examples["target_str"]): + prompt = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n" + full = prompt + tgt + + tok_full = tokenizer(full, truncation=True, max_length=MAX_LENGTH, padding="max_length") + tok_prompt = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH, padding="max_length") + + inp_ids = tok_full["input_ids"] + attn_mask = tok_full["attention_mask"] + + prompt_len = sum(1 for t in tok_prompt["input_ids"] if t != tokenizer.pad_token_id) + label_ids = [-100] * prompt_len + inp_ids[prompt_len:] + label_ids = label_ids[:MAX_LENGTH] + + input_ids.append(inp_ids) + attention_masks.append(attn_mask) + labels.append(label_ids) + + return { + "input_ids": input_ids, + "attention_mask": attention_masks, + "labels": labels + } + +# Map this to dataset +tokenized_datasets = dataset.map( + preprocess_for_training, + batched=True, + remove_columns=dataset["train"].column_names +) + +print(" Tokenization complete:", tokenized_datasets) + +# Sample random evaluation +sample_eval = random.sample(eval_examples, 10) + +baseline_preds, baseline_truths = [], [] + +baseline_preds = [] +baseline_truths = [] + +for ex in tqdm(sample_eval, desc="Evaluating"): + true_price = float(ex["output"]) + + 
prompt = ( + f"### Instruction:\nEstimate the fair market price of this product in USD. " + f"Return only a number — no text, no currency symbols.\n\n" + f"### Input:\n{ex['input']}\n\n### Response:" + ) + + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + with torch.no_grad(): + output = model.generate( + **inputs, + max_new_tokens=20, + temperature=0.0, + do_sample=False + ) + + generated_tokens = output[0][inputs["input_ids"].shape[-1]:] + text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip() + + # Extract number from output + match = re.search(r"[-+]?\d*\.?\d+", text_output) + pred_price = float(match.group()) if match else None + + # Apply inverse log if model outputs logs + if pred_price is not None: + if pred_price < 20: # log range, assuming trained on log(price) + pred_price = np.expm1(pred_price) + else: + print(f" Skipping log transform, raw pred {pred_price}") + + # Only keep realistic prices + if 0 < pred_price < 1e4: + baseline_preds.append(pred_price) + baseline_truths.append(true_price) + else: + print(f" Skipping unreasonable pred {pred_price}") + else: + print(f" No number extracted from: {text_output}") + + print(f" Predicted: {pred_price}, True: {true_price}, Raw: {text_output}") + +print(f"\nNumber of baseline predictions: {len(baseline_preds)}") +print(f"Number of baseline truths: {len(baseline_truths)}") + +# Manual computation of metrics +if baseline_preds: + mae = mean_absolute_error(baseline_truths, baseline_preds) + mse = mean_squared_error(baseline_truths, baseline_preds) + rmse = mse ** 0.5 # take square root manually + print(f"\nBaseline MAE: ${mae:.2f}") + print(f"Baseline RMSE: ${rmse:.2f}") + +print(f"Number of baseline predictions: {len(baseline_preds)}") +print(f"Number of baseline truths: {len(baseline_truths)}") + +#inspectthe data a little +print(dataset) +print(dataset["train"].column_names) +print(dataset["train"][0]) # show one sample + +# create TrainingArguments and Trainer, compatible with older transformers === +import transformers +print("transformers version:", transformers.__version__) + +# decide whether evaluation_strategy is supported +supports_eval_strategy = False +try: + import inspect + sig = inspect.signature(transformers.TrainingArguments.__init__) + if 'evaluation_strategy' in sig.parameters: + supports_eval_strategy = True +except Exception: + # fallback: assume older version + supports_eval_strategy = False + +from transformers import TrainingArguments, Trainer + +# common args +common_args = dict( + output_dir="./price-predictor-checkpoints", + num_train_epochs=3, + per_device_train_batch_size=2, + gradient_accumulation_steps=4, + learning_rate=2e-5, + fp16=True, # if you see fp16 errors, set this to False + save_total_limit=1, + logging_steps=10, + report_to="none", +) + +if supports_eval_strategy: + print("Using evaluation_strategy in TrainingArguments (newer transformers).") + training_args = TrainingArguments( + **common_args, + evaluation_strategy="steps", + eval_steps=100, + save_strategy="steps", + save_steps=200, + ) +else: + print("evaluation_strategy not supported in this transformers version. 
Using minimal TrainingArguments and running trainer.evaluate() after training.") + # remove args unknown to old versions + # older versions may also not accept fp16 or report_to; if errors appear, set fp16=False and remove report_to + training_args = TrainingArguments(**common_args) + +# Build the Trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["eval"], + tokenizer=tokenizer +) + +# Train +train_result = trainer.train() + +# If evaluation_strategy wasn't available, run a manual evaluate here +if not supports_eval_strategy: + print("Running manual evaluation because evaluation_strategy was not available...") + eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["eval"]) + print("Trainer.evaluate metrics:", eval_metrics) + +# Save model and tokenizer +trainer.save_model("./price-predictor-finetuned") +try: + tokenizer.save_pretrained("./price-predictor-finetuned") +except Exception as e: + print("Could not save tokenizer:", e) + +#outcomes +#train_result = trainer.train() +trainer.save_model("./price-predictor-finetuned") + +# Loading fine-tuned model +model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned") + +#small evaluation subset +eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"])))) +pred_prices, true_prices = [], [] + +pred_prices = [] +true_prices = [] + +for ex in tqdm(eval_dataset_small, desc="Evaluating"): + # check which column exists + if "target_str" in ex: + true_price_log = float(ex["target_str"]) + true_price = np.expm1(true_price_log) # convert back from log1p + elif "output" in ex: + true_price = float(ex["output"]) + elif "price_clean" in ex: + true_price = float(ex["price_clean"]) + else: + raise KeyError("No valid price column found in eval example.") + + # Skip invalid prices + if np.isnan(true_price): + continue + + prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex.get('text', ex.get('input', ''))}\n\n### Response:" + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + with torch.no_grad(): + output = model.generate(**inputs, max_new_tokens=20, temperature=0.2, do_sample=False) + + text = tokenizer.decode(output[0], skip_special_tokens=True) + + # Extract numeric prediction + numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) + if not numbers: + continue + + pred = float(numbers[-1]) + + # If you trained on log prices, exponentiate to get back to USD + if "target_str" in ex: + pred = np.expm1(pred) + + pred_prices.append(pred) + true_prices.append(true_price) + +# -- Compute metrics -- +pred_prices = np.array(pred_prices) +true_prices = np.array(true_prices) + +mae = mean_absolute_error(true_prices, pred_prices) +rmse = np.sqrt(mean_squared_error(true_prices, pred_prices)) +r2 = r2_score(true_prices, pred_prices) + +print(f"\nFine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}") + +# #iteration Over the tqdm +# for ex in tqdm(eval_dataset_small, desc="Evaluating"): +# prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. 
Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:" +# inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +# with torch.no_grad(): +# output = model.generate(**inputs, max_new_tokens=20) +# text = tokenizer.decode(output[0], skip_special_tokens=True) + +# # Extract numeric prediction +# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) +# pred = float(numbers[-1]) if numbers else np.nan + +# pred_prices.append(pred) +# true_prices.append(float(ex["output"])) + +# Filter out invalid predictions +mask = ~np.isnan(pred_prices) + +pred_prices = np.array(pred_prices)[mask] + +true_prices = np.array(true_prices)[mask] + +# Compute metrics manually again +mae = mean_absolute_error(true_prices, pred_prices) + +mse = mean_squared_error(true_prices, pred_prices) + +rmse = np.sqrt(mse) + +r2 = r2_score(true_prices, pred_prices) + +print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}") + +#see what was predicted +plt.figure(figsize=(6,6)) +plt.scatter(true_prices, pred_prices, alpha=0.5) +plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction") +plt.xlabel("Actual Price (USD)") +plt.ylabel("Predicted Price (USD)") +plt.title("Predicted vs Actual Prices") +plt.legend() +plt.grid(True) +plt.show() \ No newline at end of file diff --git a/week7/community_contributions/week7_exercise (7).py b/week7/community_contributions/week7_exercise (7).py deleted file mode 100644 index bf3d5e2..0000000 --- a/week7/community_contributions/week7_exercise (7).py +++ /dev/null @@ -1,375 +0,0 @@ -# -*- coding: utf-8 -*- -"""Week7_Exercise.ipynb - -Automatically generated by Colab. - -Original file is located at - https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU -""" - -#my pip installtions (some of them were not used in the project but i initally planned to use them (and we're left here so that I after the project i revisit the notebook and update when i have time and more sources.)) -!pip install -q --upgrade pip -!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors -!pip install -q wandb -!pip install -q git+https://github.com/huggingface/peft.git@main -!pip install datasets==3.0.1 -!pip install evaluate -q -!pip install --upgrade scikit-learn - -#All imports -import os, random, json, re -import pandas as pd -import numpy as np -import torch -import matplotlib.pyplot as plt -from tqdm import tqdm -from sklearn.model_selection import train_test_split -from datasets import load_dataset -from IPython.display import Markdown as md -from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score -from transformers import ( - AutoTokenizer, - AutoModelForCausalLM, - TrainingArguments, - Trainer, - DataCollatorForLanguageModeling -) - -#I tried a lot of models and every time my colab ran out of RAM, i did a lot of tweakings including replacing models with smaller ones -#I also used try and error to come up with samples size, eval size etc that would fit in my limited T4 Ram (had started from sample size of 15k down to 200) -MODEL_NAME = "facebook/opt-125m" -SAMPLE_SIZE = 200 -EVAL_SIZE = 50 -MAX_LENGTH = 128 -RANDOM_SEED = 42 - -#Seeting LoRa hyper parameters -LORA_R = 4 -LORA_ALPHA = 8 -LORA_DROPOUT = 0.05 -#Target modules to apply LoRA. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU. 
-TARGET_MODULES = ["q_proj", "v_proj"] - -#to make sure thes expriment is reproducible -random.seed(RANDOM_SEED) -np.random.seed(RANDOM_SEED) -torch.manual_seed(RANDOM_SEED) - -#Hf data -DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023" -SUBSET = "raw_meta_Appliances" - -#loading the data -dataset = load_dataset(DATASET_NAME, SUBSET, split="full") -df = dataset.to_pandas() - -from datasets import Dataset, DatasetDict - -# #this took forever to run making me update it - -# Split into train/eval -split = dataset.train_test_split(test_size=0.2, seed=42) -train_dataset = split["train"] -eval_dataset = split["test"] - -# Reduce dataset sizes for quick experimentation -MAX_TRAIN_SAMPLES = 2000 # or 2000 if you want it even faster -MAX_EVAL_SAMPLES = 500 - -train_dataset = train_dataset.shuffle(seed=42).select(range(min(MAX_TRAIN_SAMPLES, len(train_dataset)))) -eval_dataset = eval_dataset.shuffle(seed=42).select(range(min(MAX_EVAL_SAMPLES, len(eval_dataset)))) - - - -# Wrap into a DatasetDict for Trainer compatibility -dataset = DatasetDict({"train": train_dataset, "eval": eval_dataset}) - -# Prepare columns for your preprocessing -# Rename relevant columns to match what preprocess_function expects -dataset = dataset.rename_columns({ - "title": "input", - "price": "output" -}) - -# Add a fixed instruction since your dataset doesn’t have one -def add_instruction(example): - example["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number." - return example - -dataset = dataset.map(add_instruction) - -print(dataset) -print(dataset["train"][0]) - -# somecleaning on prices -df["price_clean"] = pd.to_numeric(df["price"], errors="coerce") - -#Print the data showing the price and price cleaned to see they are actual not all 0 -print(df_clean[["title", "price", "price_clean"]].head(10)) -print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}") - -# Bringing the text fields togeter -def combine_text(row): - title = row["title"] or "" - features = " ".join(row["features"]) if isinstance(row["features"], list) else "" - description = " ".join(row["description"]) if isinstance(row["description"], list) else "" - return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}" - -df["text"] = df.apply(combine_text, axis=1) - -df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True) - -# trying to downsamble for RAM purposes-hoping the punshment to results wasn't much -if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE: - df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True) -else: - df_sample = df_clean.copy() - -train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED) - -# FHow to format the examples -def make_example(row): - instruction = "Estimate the fair market price of this product in USD. Return only a single number." 
- input_text = row["text"] - output = f"{float(row['price_clean']):.2f}" - return {"instruction": instruction, "input": input_text, "output": output} - -train_examples = [make_example(r) for _, r in train_df.iterrows()] -eval_examples = [make_example(r) for _, r in eval_df.iterrows()] - -# Saving into JSONL -with open("pricing_train.jsonl", "w") as f: - for ex in train_examples: - f.write(json.dumps(ex) + "\n") - -with open("pricing_eval.jsonl", "w") as f: - for ex in eval_examples: - f.write(json.dumps(ex) + "\n") - -#Check the price exists in the Saved JSON aboved -with open("pricing_train.jsonl") as f: - lines = [json.loads(line) for line in f] - -print("Sample outputs from training data:") -for ex in lines[:5]: - print(ex["output"]) - -#A good formating for the llm -def format_for_model(ex): - return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}" - -#seeing the examples -print("Example formatted prompts (3):") - -#iterating over the egs -for ex in train_examples[:3]: - print(format_for_model(ex)) - print("-"*80) - -#tokenization now -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) -tokenizer.pad_token = tokenizer.eos_token - -model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto") - -#check ifthe model successful -print(f"{MODEL_NAME} succceeded") - -# Sample random evaluation -sample_eval = random.sample(eval_examples, 10) - -baseline_preds, baseline_truths = [], [] - -#iteration over the evals -for ex in sample_eval: - prompt = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:" - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - output = model.generate(**inputs, max_new_tokens=50, temperature=0.2, do_sample=False) - text_output = tokenizer.decode(output[0], skip_special_tokens=True) - - # Extract numeric prediction from model output - match = re.search(r"\$?(\d+(\.\d+)?)", text_output) - pred_price = float(match.group(1)) if match else None - true_price = float(ex["output"]) - - if pred_price is not None: - baseline_preds.append(pred_price) - baseline_truths.append(true_price) - - print(f"Predicted: {pred_price}, True: {true_price}") - -# Manual computation of metrics -if baseline_preds: - mae = mean_absolute_error(baseline_truths, baseline_preds) - mse = mean_squared_error(baseline_truths, baseline_preds) - rmse = mse ** 0.5 # take square root manually - print(f"\nBaseline MAE: ${mae:.2f}") - print(f"Baseline RMSE: ${rmse:.2f}") - -#inspectthe data a little -print(dataset) -print(dataset["train"].column_names) -print(dataset["train"][0]) # show one sample - -def preprocess_function(examples): - prompts = [ - f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n{out}" - for instr, inp, out in zip(examples["instruction"], examples["input"], examples["output"]) - ] - return tokenizer(prompts, truncation=True, padding="max_length", max_length=MAX_LENGTH) - -tokenized_datasets = dataset.map(preprocess_function, batched=True) - -data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) - -#updated for faster exp -training_args = TrainingArguments( - output_dir="./price-predictor-checkpoints", - num_train_epochs=1, # ⬅change from 2 to 1 - per_device_train_batch_size=1, - gradient_accumulation_steps=2, - learning_rate=2e-4, - fp16=True, - save_total_limit=1, - logging_steps=10, - report_to="none", -) - -#our trainer -trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_datasets["train"], - 
eval_dataset=tokenized_datasets["eval"], - tokenizer=tokenizer, - data_collator=data_collator -) - -#outcomes -train_result = trainer.train() -trainer.save_model("./price-predictor-finetuned") - -# Loading fine-tuned model -model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto") -tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned") - -# Inspect one example from your fine-tuning eval dataset before using it -print("Inspecting one evaluation example (should have instruction, input, output):") -with open("pricing_eval.jsonl") as f: - sample_eval = [json.loads(line) for line in f][:3] # just a few samples -for ex in sample_eval: - print(json.dumps(ex, indent=2)) - -eval_dataset_small = load_dataset("json", data_files="pricing_eval.jsonl")["train"] -eval_dataset_small = eval_dataset_small.shuffle(seed=42).select(range(min(50, len(eval_dataset_small)))) - -for ex in eval_dataset_small.select(range(5)): - print("Output price:", ex["output"]) - -#iteration Over the tqdm -for ex in tqdm(eval_dataset_small, desc="Evaluating"): - prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:" - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - with torch.no_grad(): - output = model.generate(**inputs, max_new_tokens=20) - text = tokenizer.decode(output[0], skip_special_tokens=True) - - # Extract numeric prediction - numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text) - pred = float(numbers[-1]) if numbers else np.nan - - pred_prices.append(pred) - true_prices.append(float(ex["output"])) - -# --- Fix length mismatch and mask NaNs --- -import numpy as np - -pred_prices = np.array(pred_prices, dtype=float) -true_prices = np.array(true_prices, dtype=float) - -# Ensure both arrays are same length -min_len = min(len(pred_prices), len(true_prices)) -pred_prices = pred_prices[:min_len] -true_prices = true_prices[:min_len] - -# Filter out NaNs or nonsensical large predictions -mask = (~np.isnan(pred_prices)) & (pred_prices < 10000) # exclude any predictions above $10k -pred_prices = pred_prices[mask] -true_prices = true_prices[mask] - -print("Arrays aligned:") -print("Preds:", len(pred_prices), "Truths:", len(true_prices)) - -# Filter out invalid predictions -mask = ~np.isnan(pred_prices) - -pred_prices = np.array(pred_prices)[mask] - -true_prices = np.array(true_prices)[mask] - -# Compute metrics manually again -mae = mean_absolute_error(true_prices, pred_prices) - -mse = mean_squared_error(true_prices, pred_prices) - -rmse = np.sqrt(mse) - -r2 = r2_score(true_prices, pred_prices) - -print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}") - -#see what was predicted -plt.figure(figsize=(6,6)) -plt.scatter(true_prices, pred_prices, alpha=0.5) -plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction") -plt.xlabel("Actual Price (USD)") -plt.ylabel("Predicted Price (USD)") -plt.title("Predicted vs Actual Prices") -plt.legend() -plt.grid(True) -plt.show() - -#Zoom -plt.figure(figsize=(6,6)) -plt.scatter(true_prices, pred_prices, alpha=0.6) -plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction") -plt.xlabel("Actual Price (USD)") -plt.ylabel("Predicted Price (USD)") -plt.title("Predicted vs Actual Prices (Zoomed In)") -plt.ylim(0, 600) # Zoom y-axis -plt.legend() -plt.grid(True) -plt.show() - -#check the distribution -errors = np.abs(pred_prices - 
true_prices) -plt.figure(figsize=(8,4)) -plt.hist(errors, bins=30, edgecolor='k', alpha=0.7) -plt.title("Distribution of Absolute Errors") -plt.xlabel("Absolute Error ($)") -plt.ylabel("Frequency") -plt.show() - -print(f"Average Error: ${np.mean(errors):.2f}, Median Error: ${np.median(errors):.2f}") - -# Load the base model -model_name = "facebook/opt-125m" -tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") - -# Sample 5 examples from your eval_df -examples = eval_df.sample(5, random_state=42) - -for i, row in examples.iterrows(): - prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{row['text']}\n\n### Response:" - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - - with torch.no_grad(): - outputs = model.generate(**inputs, max_new_tokens=20) - - prediction_text = tokenizer.decode(outputs[0], skip_special_tokens=True) - - print(f"\n--- Example {i} ---") - print("Prompt:\n", prompt[:200], "...") - print("Model output:\n", prediction_text) - print("Actual price:", row["price_clean"]) \ No newline at end of file