Merge pull request #918 from bharat109puri/main
Week7 and 8 - Assignment submitted by Bharat Puri
447
week7/community_contributions/bharat_puri/fine_tuning.py
Normal file
@@ -0,0 +1,447 @@
# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb

submitted by Bharat Puri

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""

# My pip installations (some were not used in the project, but I initially planned to use them
# and left them here so that I can revisit the notebook and update it when I have time and more sources).
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn

# All imports
import os, random, json, re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datasets import load_dataset, Dataset, DatasetDict
from IPython.display import Markdown as md
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# I tried a lot of models and every time my Colab ran out of RAM, so I did a lot of tweaking, including replacing models with smaller ones.
# I also used trial and error to find a sample size, eval size, etc. that would fit in my limited T4 RAM (started from a sample size of 15k and went down to 200).
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42

# Setting LoRA hyperparameters
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
# Target modules to apply LoRA to. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]

# To make sure the experiment is reproducible
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# HF data
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"

# Loading the data
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()

# Some cleaning on prices
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")

# Bringing the text fields together
def combine_text(row):
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"

df["text"] = df.apply(combine_text, axis=1)

df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)

# Example: sample the cleaned data for faster experimentation
df_sample = df_clean.sample(n=5000, random_state=42)  # adjust n as needed

train_df, eval_df = train_test_split(df_sample, test_size=0.2, random_state=42)

print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}")

train_df.head(2)

hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_eval = Dataset.from_pandas(eval_df.reset_index(drop=True))
dataset = DatasetDict({"train": hf_train, "eval": hf_eval})

# Add instruction + numeric target (log-transformed)
def add_instruction_and_target(ex):
    ex["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number."
    price = float(ex["price_clean"])
    ex["target_log"] = np.log1p(price)            # log1p makes training easier
    ex["target_str"] = f"{ex['target_log']:.6f}"  # as a string for the LM
    return ex

dataset = dataset.map(add_instruction_and_target)

print(dataset)
print(dataset["train"][0])

print(df_clean[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}")

# Downsampling for RAM purposes - hoping the penalty to the results isn't too big
if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE:
    df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)
else:
    df_sample = df_clean.copy()

train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED)

# How to format the examples
def make_example(row):
    instruction = "Estimate the fair market price of this product in USD. Return only a single number."
    input_text = row["text"]
    output = f"{float(row['price_clean']):.2f}"
    return {"instruction": instruction, "input": input_text, "output": output}

train_examples = [make_example(r) for _, r in train_df.iterrows()]
eval_examples = [make_example(r) for _, r in eval_df.iterrows()]

# Saving into JSONL
with open("pricing_train.jsonl", "w") as f:
    for ex in train_examples:
        f.write(json.dumps(ex) + "\n")

with open("pricing_eval.jsonl", "w") as f:
    for ex in eval_examples:
        f.write(json.dumps(ex) + "\n")

# A good formatting for the LLM
def format_for_model(ex):
    return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"

# Seeing the examples
print("Example formatted prompts (3):")

# Iterating over the examples
for ex in train_examples[:3]:
    print(format_for_model(ex))
    print("-" * 80)

# Tokenization now
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

# Check if the model loaded successfully
print(f"{MODEL_NAME} succeeded")

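# Note (editor's sketch, not part of the original run): the LoRA hyperparameters defined above
# (LORA_R, LORA_ALPHA, LORA_DROPOUT, TARGET_MODULES) are never actually applied in this notebook.
# Wiring them in with peft would look roughly like this; the wrapping call is left commented out
# so the full fine-tuning flow below stays unchanged.
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    task_type="CAUSAL_LM",
)
# model = get_peft_model(model, lora_config)   # uncomment to train only the LoRA adapters
# model.print_trainable_parameters()
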
# ===== Tokenization & dataset preprocessing =====
MAX_LENGTH = 128

def preprocess_for_training(examples):
    input_ids, attention_masks, labels = [], [], []

    for instr, inp, tgt in zip(examples["instruction"], examples["text"], examples["target_str"]):
        prompt = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n"
        full = prompt + tgt

        tok_full = tokenizer(full, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        tok_prompt = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH, padding="max_length")

        inp_ids = tok_full["input_ids"]
        attn_mask = tok_full["attention_mask"]

        # Mask out the prompt tokens (-100) so only the response contributes to the loss
        prompt_len = sum(1 for t in tok_prompt["input_ids"] if t != tokenizer.pad_token_id)
        label_ids = [-100] * prompt_len + inp_ids[prompt_len:]
        label_ids = label_ids[:MAX_LENGTH]

        input_ids.append(inp_ids)
        attention_masks.append(attn_mask)
        labels.append(label_ids)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }

# Map this over the dataset
tokenized_datasets = dataset.map(
    preprocess_for_training,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Tokenization complete:", tokenized_datasets)

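# Optional sanity check (editor's sketch, not in the original notebook): decode the
# unmasked label tokens of one training example to see which tokens actually contribute
# to the loss -- ideally just the response (the price string), since prompt positions are -100.
sample = tokenized_datasets["train"][0]
response_ids = [t for t, l in zip(sample["input_ids"], sample["labels"]) if l != -100]
print("Tokens that contribute to the loss:", tokenizer.decode(response_ids, skip_special_tokens=True))
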
# Sample a random evaluation subset
sample_eval = random.sample(eval_examples, 10)

baseline_preds, baseline_truths = [], []

for ex in tqdm(sample_eval, desc="Evaluating"):
    true_price = float(ex["output"])

    prompt = (
        f"### Instruction:\nEstimate the fair market price of this product in USD. "
        f"Return only a number — no text, no currency symbols.\n\n"
        f"### Input:\n{ex['input']}\n\n### Response:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False
        )

    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Extract a number from the output
    match = re.search(r"[-+]?\d*\.?\d+", text_output)
    pred_price = float(match.group()) if match else None

    # Apply the inverse log if the model outputs log prices
    if pred_price is not None:
        if pred_price < 20:  # log range, assuming the model was trained on log(price)
            pred_price = np.expm1(pred_price)
        else:
            print(f"Skipping log transform, raw pred {pred_price}")

        # Only keep realistic prices
        if 0 < pred_price < 1e4:
            baseline_preds.append(pred_price)
            baseline_truths.append(true_price)
        else:
            print(f"Skipping unreasonable pred {pred_price}")
    else:
        print(f"No number extracted from: {text_output}")

    print(f"Predicted: {pred_price}, True: {true_price}, Raw: {text_output}")

print(f"\nNumber of baseline predictions: {len(baseline_preds)}")
print(f"Number of baseline truths: {len(baseline_truths)}")

# Manual computation of metrics
if baseline_preds:
    mae = mean_absolute_error(baseline_truths, baseline_preds)
    mse = mean_squared_error(baseline_truths, baseline_preds)
    rmse = mse ** 0.5  # take the square root manually
    print(f"\nBaseline MAE: ${mae:.2f}")
    print(f"Baseline RMSE: ${rmse:.2f}")

# Inspect the data a little
print(dataset)
print(dataset["train"].column_names)
print(dataset["train"][0])  # show one sample

# === Create TrainingArguments and Trainer, compatible with older transformers ===
import transformers
print("transformers version:", transformers.__version__)

# Decide whether evaluation_strategy is supported
supports_eval_strategy = False
try:
    import inspect
    sig = inspect.signature(transformers.TrainingArguments.__init__)
    if 'evaluation_strategy' in sig.parameters:
        supports_eval_strategy = True
except Exception:
    # fallback: assume an older version
    supports_eval_strategy = False

from transformers import TrainingArguments, Trainer

# Common args
common_args = dict(
    output_dir="./price-predictor-checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    fp16=True,  # if you see fp16 errors, set this to False
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)

if supports_eval_strategy:
    print("Using evaluation_strategy in TrainingArguments (newer transformers).")
    training_args = TrainingArguments(
        **common_args,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=200,
    )
else:
    print("evaluation_strategy not supported in this transformers version. Using minimal TrainingArguments and running trainer.evaluate() after training.")
    # Remove args unknown to old versions.
    # Older versions may also not accept fp16 or report_to; if errors appear, set fp16=False and remove report_to.
    training_args = TrainingArguments(**common_args)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer
)

# Train
train_result = trainer.train()

# If evaluation_strategy wasn't available, run a manual evaluation here
if not supports_eval_strategy:
    print("Running manual evaluation because evaluation_strategy was not available...")
    eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["eval"])
    print("Trainer.evaluate metrics:", eval_metrics)

# Save model and tokenizer
trainer.save_model("./price-predictor-finetuned")
try:
    tokenizer.save_pretrained("./price-predictor-finetuned")
except Exception as e:
    print("Could not save tokenizer:", e)

# Outcomes
#train_result = trainer.train()
trainer.save_model("./price-predictor-finetuned")

# Loading the fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned")

# Small evaluation subset
eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"]))))

pred_prices = []
true_prices = []

for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    # Check which column exists
    if "target_str" in ex:
        true_price_log = float(ex["target_str"])
        true_price = np.expm1(true_price_log)  # convert back from log1p
    elif "output" in ex:
        true_price = float(ex["output"])
    elif "price_clean" in ex:
        true_price = float(ex["price_clean"])
    else:
        raise KeyError("No valid price column found in eval example.")

    # Skip invalid prices
    if np.isnan(true_price):
        continue

    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex.get('text', ex.get('input', ''))}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)

    text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the numeric prediction
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    if not numbers:
        continue

    pred = float(numbers[-1])

    # If the model was trained on log prices, exponentiate to get back to USD
    if "target_str" in ex:
        pred = np.expm1(pred)

    pred_prices.append(pred)
    true_prices.append(true_price)

# -- Compute metrics --
pred_prices = np.array(pred_prices)
true_prices = np.array(true_prices)

mae = mean_absolute_error(true_prices, pred_prices)
rmse = np.sqrt(mean_squared_error(true_prices, pred_prices))
r2 = r2_score(true_prices, pred_prices)

print(f"\nFine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")

# # Iteration over the eval subset (earlier version, kept commented out for reference)
# for ex in tqdm(eval_dataset_small, desc="Evaluating"):
#     prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         output = model.generate(**inputs, max_new_tokens=20)
#     text = tokenizer.decode(output[0], skip_special_tokens=True)

#     # Extract numeric prediction
#     numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
#     pred = float(numbers[-1]) if numbers else np.nan

#     pred_prices.append(pred)
#     true_prices.append(float(ex["output"]))

# Filter out invalid predictions
mask = ~np.isnan(pred_prices)
pred_prices = np.array(pred_prices)[mask]
true_prices = np.array(true_prices)[mask]

# Compute the metrics manually again
mae = mean_absolute_error(true_prices, pred_prices)
mse = mean_squared_error(true_prices, pred_prices)
rmse = np.sqrt(mse)
r2 = r2_score(true_prices, pred_prices)

print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")

# See what was predicted
plt.figure(figsize=(6,6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices")
plt.legend()
plt.grid(True)
plt.show()

2
week7/community_contributions/bharat_puri/test.csv
Normal file
@@ -0,0 +1,2 @@
text,price
Smartwatch with GPS and heart-rate tracking,199.99
9
week7/community_contributions/bharat_puri/train.csv
Normal file
@@ -0,0 +1,9 @@
text,price
"Lightweight running shoes for men, size 9",79.99
Wireless bluetooth headphones with noise cancellation,129.99
Wooden study table with drawer and shelf,139.99
Gaming laptop with RTX 4070 GPU and 16GB RAM,1899.99
Microwave oven with grill and convection mode,229.99
4K Ultra HD 55-inch Smart TV with Dolby Vision,499.99
"Organic cotton t-shirt, medium size, white color",14.99
Professional DSLR camera with 24MP sensor,649.99
2
week7/community_contributions/bharat_puri/valid.csv
Normal file
@@ -0,0 +1,2 @@
text,price
Portable Bluetooth speaker with deep bass,59.99
349
week8/community_contributions/bharat_puri/exercise.py
Normal file
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
"""week8_exercie.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
"""

!pip install -q pandas datasets matplotlib seaborn
!pip install datasets==3.0.1
!pip install anthropic -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Check performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
import os
from anthropic import Anthropic
import re

pd.set_option("display.max_colwidth", 100)

# # Initialize client using environment variable
# client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# # Quick test
# print("Anthropic client initialized" if client else "Anthropic not detected.")

from google.colab import userdata

api_key = userdata.get('ANTHROPIC_API_KEY')
os.environ["ANTHROPIC_API_KEY"] = api_key

client = Anthropic(api_key=api_key)

# List models
models = client.models.list()

print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")

# Loading a sample from the full reviews data
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")

# Creating a DataFrame
df = pd.DataFrame(dataset)
df = df[["title", "text", "rating"]].dropna().reset_index(drop=True)

# Renaming the columns for clarity / easy reference
df.rename(columns={"text": "review_body"}, inplace=True)

print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()

# Inspect the data
# Basic info
print(df.info())
print(df.isnull().sum())

# Unique ratings distribution
print(df["rating"].value_counts().sort_index())

# Check random reviews
display(df.sample(5, random_state=42))

# Review length distribution
df["review_length"] = df["review_body"].apply(lambda x: len(str(x).split()))

# Summarize the review length
print(df["review_length"].describe())

# Plot the rating distribution
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1–5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()

# Review length
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()

# Cleaning
def clean_text(text):
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # remove punctuation/special chars
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # normalize whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text

df["clean_review"] = df["review_body"].apply(clean_text)

df.head(3)

"""# Sentiment analysis"""

# Label ratings as sentiment classes
def label_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"

df["sentiment"] = df["rating"].apply(label_sentiment)

df["sentiment"].value_counts()

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"], df["sentiment"], test_size=0.2, random_state=42, stratify=df["sentiment"]
)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# Train classifier

# Train a lightweight model
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

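# Optional (editor's sketch, not in the original notebook): with a linear model on TF-IDF
# features, the learned coefficients are directly interpretable. Assuming scikit-learn >= 1.0
# (for get_feature_names_out), this prints the ten n-grams that push hardest toward each
# sentiment class.
feature_names = np.array(vectorizer.get_feature_names_out())
for cls_idx, cls_name in enumerate(clf.classes_):
    top = np.argsort(clf.coef_[cls_idx])[-10:][::-1]
    print(f"{cls_name}: {', '.join(feature_names[top])}")
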
sample_texts = [
    "This blender broke after two days. Waste of money!",
    "Works exactly as described, very satisfied!",
    "It’s okay, does the job but nothing special."
]

sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)

for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")

"""# Improving Model Balance & Realism"""

# Separate by sentiment
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]

# Undersample positive to roughly match the others
pos_down = resample(pos, replace=False, n_samples=len(neg) + len(neu), random_state=42)

# Combine
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced["sentiment"].value_counts())

# Retrain the classifier
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["clean_review"], df_balanced["sentiment"],
    test_size=0.2, random_state=42, stratify=df_balanced["sentiment"]
)

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)

print("Balanced model trained successfully")

# Evaluate again
y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

"""# Agents"""

# Base class for all agents
class BaseAgent:
    """A simple base agent with a name and a run() method."""

    def __init__(self, name):
        self.name = name

    def run(self, *args, **kwargs):
        raise NotImplementedError("Subclasses must implement run() method.")

    def log(self, message):
        print(f"[{self.name}] {message}")

# DataAgent for loading/cleaning
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""

    def __init__(self, data):
        super().__init__("DataAgent")
        self.data = data

    def run(self):
        self.log("Preprocessing data...")
        df_clean = self.data.copy()
        df_clean["review_body"] = df_clean["review_body"].str.strip()
        df_clean.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(df_clean)} reviews.")
        return df_clean

# Analysis agent --> uses the trained sentiment model (TF-IDF + Logistic Regression) to classify reviews
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""

    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        self.vectorizer = vectorizer
        self.model = model

    def run(self, reviews):
        self.log(f"Analyzing {len(reviews)} reviews...")
        X = self.vectorizer.transform(reviews)
        predictions = self.model.predict(X)
        return predictions

# ReviewerAgent: serves as the summary agent, using the Anthropic API to generate LLM review insights
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""

    def __init__(self):
        super().__init__("ReviewerAgent")
        # Retrieve your key once — it’s already stored in Colab userdata
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")

        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")

        self.client = Anthropic(api_key=api_key)

    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")

        prompt = f"""
You are a product insights assistant.
Based on the following summarized customer reviews, write a concise 3–4 sentence sentiment analysis report.
Clearly describe the main themes and tone in user feedback on these home appliance products.

Reviews Summary:
{summary_text}
"""

        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}]
        )

        output = response.content[0].text.strip()
        self.log("Summary generated successfully")
        return output

# Instantiate agents
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()

# Clean data
df_ready = data_agent.run()

# Classify sentiments
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])

# Prepare summary text by sentiment group
summary_text = df_ready.groupby("predicted_sentiment")["review_body"].apply(lambda x: " ".join(x[:3])).to_string()

# Generate AI summary using Anthropic
insight_summary = reviewer_agent.run(summary_text)

print(insight_summary)

"""# Evaluation & Visualization"""

# === Evaluation & Visualization ===

# Count predicted sentiments
sentiment_counts = df_ready["predicted_sentiment"].value_counts()

print(sentiment_counts)

# Plot sentiment distribution
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()

# Compute average review length per sentiment
df_ready["review_length"] = df_ready["review_body"].apply(lambda x: len(x.split()))

avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()

print(avg_length)

# Visualize it
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()