Merge pull request #918 from bharat109puri/main

Week 7 and 8 - Assignment submitted by Bharat Puri
This commit is contained in:
Ed Donner
2025-10-30 22:29:19 -04:00
committed by GitHub
5 changed files with 809 additions and 0 deletions


@@ -0,0 +1,447 @@
# -*- coding: utf-8 -*-
"""Week7_Exercise.ipynb
submitted by Bharat Puri
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1wxcBNWbsDDC_SwXnQZP2dmo7ddOxkJmU
"""
# My pip installations (some of them were not used in the project, but I initially planned to use them; they are left here so that after the project I can revisit the notebook and update it when I have time and more sources).
!pip install -q --upgrade pip
!pip install -q transformers accelerate peft bitsandbytes trl sentencepiece safetensors
!pip install -q wandb
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install datasets==3.0.1
!pip install evaluate -q
!pip install --upgrade scikit-learn
# All imports
import os, random, json, re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from IPython.display import Markdown as md
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
#I tried a lot of models and every time my Colab ran out of RAM; I did a lot of tweaking, including replacing models with smaller ones
#I also used trial and error to come up with a sample size, eval size, etc. that would fit in my limited T4 RAM (started from a sample size of 15k and worked down to 200)
MODEL_NAME = "facebook/opt-125m"
SAMPLE_SIZE = 200
EVAL_SIZE = 50
MAX_LENGTH = 128
RANDOM_SEED = 42
#Setting LoRA hyperparameters
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05
#Target modules to apply LoRA. I kept these to just "q_proj" and "v_proj" to lower memory usage on a T4 GPU.
TARGET_MODULES = ["q_proj", "v_proj"]
#To make sure the experiment is reproducible
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
#Hf data
DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
SUBSET = "raw_meta_Appliances"
#loading the data
dataset = load_dataset(DATASET_NAME, SUBSET, split="full")
df = dataset.to_pandas()
# Some cleaning on prices
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")
# Bringing the text fields together
def combine_text(row):
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"TITLE: {title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"
df["text"] = df.apply(combine_text, axis=1)
df_clean = df.dropna(subset=["price_clean"]).reset_index(drop=True)
# Example: sample the cleaned data for faster experimentation
df_sample = df_clean.sample(n=5000, random_state=42) # adjust n as needed
train_df, eval_df = train_test_split(df_sample, test_size=0.2, random_state=42)
print(f"Train size: {len(train_df)}, Eval size: {len(eval_df)}")
train_df.head(2)
hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_eval = Dataset.from_pandas(eval_df.reset_index(drop=True))
dataset = DatasetDict({"train": hf_train, "eval": hf_eval})
# Add instruction + numeric target (log-transformed)
def add_instruction_and_target(ex):
    ex["instruction"] = "Estimate the fair market price of this product in USD. Return only a single number."
    price = float(ex["price_clean"])
    ex["target_log"] = np.log1p(price)  # log1p makes training easier
    ex["target_str"] = f"{ex['target_log']:.6f}"  # as string for LM
    return ex
dataset = dataset.map(add_instruction_and_target)
print(dataset)
print(dataset["train"][0])
print(df_clean[["title", "price", "price_clean"]].head(10))
print(f"\nNumber of valid price entries: {df_clean['price_clean'].notna().sum()}")
# Trying to downsample for RAM purposes, hoping the penalty to results wasn't too big
if len(df_clean) > SAMPLE_SIZE + EVAL_SIZE:
    df_sample = df_clean.sample(SAMPLE_SIZE + EVAL_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)
else:
    df_sample = df_clean.copy()
train_df, eval_df = train_test_split(df_sample, test_size=EVAL_SIZE, random_state=RANDOM_SEED)
# How to format the examples
def make_example(row):
    instruction = "Estimate the fair market price of this product in USD. Return only a single number."
    input_text = row["text"]
    output = f"{float(row['price_clean']):.2f}"
    return {"instruction": instruction, "input": input_text, "output": output}
train_examples = [make_example(r) for _, r in train_df.iterrows()]
eval_examples = [make_example(r) for _, r in eval_df.iterrows()]
# Saving into JSONL
with open("pricing_train.jsonl", "w") as f:
    for ex in train_examples:
        f.write(json.dumps(ex) + "\n")
with open("pricing_eval.jsonl", "w") as f:
    for ex in eval_examples:
        f.write(json.dumps(ex) + "\n")
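# Optional sketch (not used below): the saved JSONL files can be reloaded into a DatasetDict
# with the Hugging Face "json" loader, e.g. after a notebook restart.
reloaded = load_dataset("json", data_files={"train": "pricing_train.jsonl", "eval": "pricing_eval.jsonl"})
print(reloaded)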
#A good formatting for the LLM
def format_for_model(ex):
    return f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"
#Viewing a few formatted examples
print("Example formatted prompts (3):")
for ex in train_examples[:3]:
    print(format_for_model(ex))
    print("-"*80)
#Tokenization now
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
#Check that the model loaded successfully
print(f"{MODEL_NAME} loaded successfully")
# ===== Tokenization & dataset preprocessing =====
MAX_LENGTH = 128
def preprocess_for_training(examples):
    input_ids, attention_masks, labels = [], [], []
    for instr, inp, tgt in zip(examples["instruction"], examples["text"], examples["target_str"]):
        prompt = f"### Instruction:\n{instr}\n\n### Input:\n{inp}\n\n### Response:\n"
        full = prompt + tgt
        tok_full = tokenizer(full, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        tok_prompt = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH, padding="max_length")
        inp_ids = tok_full["input_ids"]
        attn_mask = tok_full["attention_mask"]
        # Count real prompt tokens via the attention mask (comparing against pad_token_id is
        # unreliable here because pad_token was set to eos_token), then mask the prompt and
        # the padding with -100 so the loss is only computed on the price target
        prompt_len = sum(tok_prompt["attention_mask"])
        label_ids = [-100] * prompt_len + [
            tok if mask == 1 else -100
            for tok, mask in zip(inp_ids[prompt_len:], attn_mask[prompt_len:])
        ]
        label_ids = label_ids[:MAX_LENGTH]
        input_ids.append(inp_ids)
        attention_masks.append(attn_mask)
        labels.append(label_ids)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }
# Map this to dataset
tokenized_datasets = dataset.map(
preprocess_for_training,
batched=True,
remove_columns=dataset["train"].column_names
)
print(" Tokenization complete:", tokenized_datasets)
# Sample random evaluation
sample_eval = random.sample(eval_examples, 10)
baseline_preds, baseline_truths = [], []
for ex in tqdm(sample_eval, desc="Evaluating"):
    true_price = float(ex["output"])
    prompt = (
        f"### Instruction:\nEstimate the fair market price of this product in USD. "
        f"Return only a number — no text, no currency symbols.\n\n"
        f"### Input:\n{ex['input']}\n\n### Response:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            temperature=0.0,
            do_sample=False
        )
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    text_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    # Extract number from output
    match = re.search(r"[-+]?\d*\.?\d+", text_output)
    pred_price = float(match.group()) if match else None
    # Apply inverse log if model outputs logs
    if pred_price is not None:
        if pred_price < 20:  # log range, assuming trained on log(price)
            pred_price = np.expm1(pred_price)
        else:
            print(f"Skipping log transform, raw pred {pred_price}")
        # Only keep realistic prices
        if 0 < pred_price < 1e4:
            baseline_preds.append(pred_price)
            baseline_truths.append(true_price)
        else:
            print(f"Skipping unreasonable pred {pred_price}")
    else:
        print(f"No number extracted from: {text_output}")
    print(f"Predicted: {pred_price}, True: {true_price}, Raw: {text_output}")
print(f"\nNumber of baseline predictions: {len(baseline_preds)}")
print(f"Number of baseline truths: {len(baseline_truths)}")
# Manual computation of metrics
if baseline_preds:
    mae = mean_absolute_error(baseline_truths, baseline_preds)
    mse = mean_squared_error(baseline_truths, baseline_preds)
    rmse = mse ** 0.5  # take square root manually
    print(f"\nBaseline MAE: ${mae:.2f}")
    print(f"Baseline RMSE: ${rmse:.2f}")
    print(f"Number of baseline predictions: {len(baseline_preds)}")
    print(f"Number of baseline truths: {len(baseline_truths)}")
#Inspect the data a little
print(dataset)
print(dataset["train"].column_names)
print(dataset["train"][0]) # show one sample
# === Create TrainingArguments and Trainer, compatible with older transformers ===
import transformers
print("transformers version:", transformers.__version__)
# decide whether evaluation_strategy is supported
supports_eval_strategy = False
try:
    import inspect
    sig = inspect.signature(transformers.TrainingArguments.__init__)
    if 'evaluation_strategy' in sig.parameters:
        supports_eval_strategy = True
except Exception:
    # fallback: assume older version
    supports_eval_strategy = False
from transformers import TrainingArguments, Trainer
# common args
common_args = dict(
output_dir="./price-predictor-checkpoints",
num_train_epochs=3,
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
learning_rate=2e-5,
fp16=True, # if you see fp16 errors, set this to False
save_total_limit=1,
logging_steps=10,
report_to="none",
)
if supports_eval_strategy:
    print("Using evaluation_strategy in TrainingArguments (newer transformers).")
    training_args = TrainingArguments(
        **common_args,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=200,
    )
else:
    print("evaluation_strategy not supported in this transformers version. Using minimal TrainingArguments and running trainer.evaluate() after training.")
    # remove args unknown to old versions
    # older versions may also not accept fp16 or report_to; if errors appear, set fp16=False and remove report_to
    training_args = TrainingArguments(**common_args)
# Build the Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["eval"],
tokenizer=tokenizer
)
# Train
train_result = trainer.train()
# If evaluation_strategy wasn't available, run a manual evaluate here
if not supports_eval_strategy:
    print("Running manual evaluation because evaluation_strategy was not available...")
    eval_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["eval"])
    print("Trainer.evaluate metrics:", eval_metrics)
# Save model and tokenizer
trainer.save_model("./price-predictor-finetuned")
try:
    tokenizer.save_pretrained("./price-predictor-finetuned")
except Exception as e:
    print("Could not save tokenizer:", e)
#outcomes
#train_result = trainer.train()
trainer.save_model("./price-predictor-finetuned")
# Loading fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./price-predictor-finetuned", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./price-predictor-finetuned")
#Small evaluation subset
eval_dataset_small = dataset["eval"].shuffle(seed=42).select(range(min(50, len(dataset["eval"]))))
pred_prices, true_prices = [], []
for ex in tqdm(eval_dataset_small, desc="Evaluating"):
    # check which column exists
    if "target_str" in ex:
        true_price_log = float(ex["target_str"])
        true_price = np.expm1(true_price_log)  # convert back from log1p
    elif "output" in ex:
        true_price = float(ex["output"])
    elif "price_clean" in ex:
        true_price = float(ex["price_clean"])
    else:
        raise KeyError("No valid price column found in eval example.")
    # Skip invalid prices
    if np.isnan(true_price):
        continue
    prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex.get('text', ex.get('input', ''))}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20, temperature=0.2, do_sample=False)
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract numeric prediction
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    if not numbers:
        continue
    pred = float(numbers[-1])
    # If the model was trained on log prices, exponentiate to get back to USD
    if "target_str" in ex:
        pred = np.expm1(pred)
    pred_prices.append(pred)
    true_prices.append(true_price)
# -- Compute metrics --
pred_prices = np.array(pred_prices)
true_prices = np.array(true_prices)
mae = mean_absolute_error(true_prices, pred_prices)
rmse = np.sqrt(mean_squared_error(true_prices, pred_prices))
r2 = r2_score(true_prices, pred_prices)
print(f"\nFine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")
# #iteration Over the tqdm
# for ex in tqdm(eval_dataset_small, desc="Evaluating"):
# prompt = f"### Instruction:\nEstimate the fair market price of this product in USD. Return only a single number.\n\n### Input:\n{ex['input']}\n\n### Response:"
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# with torch.no_grad():
# output = model.generate(**inputs, max_new_tokens=20)
# text = tokenizer.decode(output[0], skip_special_tokens=True)
# # Extract numeric prediction
# numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
# pred = float(numbers[-1]) if numbers else np.nan
# pred_prices.append(pred)
# true_prices.append(float(ex["output"]))
# Filter out invalid predictions
mask = ~np.isnan(pred_prices)
pred_prices = np.array(pred_prices)[mask]
true_prices = np.array(true_prices)[mask]
# Compute metrics manually again
mae = mean_absolute_error(true_prices, pred_prices)
mse = mean_squared_error(true_prices, pred_prices)
rmse = np.sqrt(mse)
r2 = r2_score(true_prices, pred_prices)
print(f"Fine-Tuned Evaluation:\nMAE: ${mae:.2f}, RMSE: ${rmse:.2f}, R²: {r2:.4f}")
#see what was predicted
plt.figure(figsize=(6,6))
plt.scatter(true_prices, pred_prices, alpha=0.5)
plt.plot([0, max(true_prices)], [0, max(true_prices)], 'r--', label="Perfect Prediction")
plt.xlabel("Actual Price (USD)")
plt.ylabel("Predicted Price (USD)")
plt.title("Predicted vs Actual Prices")
plt.legend()
plt.grid(True)
plt.show()


@@ -0,0 +1,2 @@
text,price
Smartwatch with GPS and heart-rate tracking,199.99


@@ -0,0 +1,9 @@
text,price
"Lightweight running shoes for men, size 9",79.99
Wireless bluetooth headphones with noise cancellation,129.99
Wooden study table with drawer and shelf,139.99
Gaming laptop with RTX 4070 GPU and 16GB RAM,1899.99
Microwave oven with grill and convection mode,229.99
4K Ultra HD 55-inch Smart TV with Dolby Vision,499.99
"Organic cotton t-shirt, medium size, white color",14.99
Professional DSLR camera with 24MP sensor,649.99


@@ -0,0 +1,2 @@
text,price
Portable Bluetooth speaker with deep bass,59.99


@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
"""week8_exercie.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
"""
!pip install -q pandas datasets matplotlib seaborn
!pip install datasets==3.0.1
!pip install anthropic -q
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#Check performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
import os
from anthropic import Anthropic
import re
pd.set_option("display.max_colwidth", 100)
# # Initialize client using environment variable
# client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# # Quick test
# print("Anthropic client initialized " if client else " Anthropic not detected.")
from google.colab import userdata
api_key = userdata.get('ANTHROPIC_API_KEY')
os.environ["ANTHROPIC_API_KEY"] = api_key
client = Anthropic(api_key=api_key)
# List models
models = client.models.list()
print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")
# Loading a sample from the full reviews data
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")
# Creating a DataFrame
df = pd.DataFrame(dataset)
df = df[["title", "text", "rating"]].dropna().reset_index(drop=True)
# Renaming the text column for clarity / easy reference
df.rename(columns={"text": "review_body"}, inplace=True)
print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()
#Inspect the data
# Basic info
print(df.info())
print(df.isnull().sum())
# Unique ratings dist
print(df["rating"].value_counts().sort_index())
# Check Random reviews
display(df.sample(5, random_state=42))
# Review length distribution
df["review_length"] = df["review_body"].apply(lambda x: len(str(x).split()))
#Summarize the review length
print(df["review_length"].describe())
# Plot the rating distribution
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1-5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()
# review length
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()
#Cleaning
def clean_text(text):
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # remove punctuation/special chars
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # normalize whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text
df["clean_review"] = df["review_body"].apply(clean_text)
df.head(3)
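# Illustrative only: clean_text on a made-up review, to show the URL stripping,
# punctuation removal and whitespace normalisation in one place.
print(clean_text("LOVED it!!!  See https://example.com for specs   - works great."))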
"""'#sentiment analysis"""
# Rating labelling
def label_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"
df["sentiment"] = df["rating"].apply(label_sentiment)
df["sentiment"].value_counts()
#Train/test split
X_train, X_test, y_train, y_test = train_test_split(
df["clean_review"], df["sentiment"], test_size=0.2, random_state=42, stratify=df["sentiment"]
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
#Train classifier
# Train lightweight model
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
sample_texts = [
"This blender broke after two days. Waste of money!",
"Works exactly as described, very satisfied!",
"Its okay, does the job but nothing special."
]
sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)
for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")
"""#Improving Model Balance & Realism"""
# Separate by sentiment
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]
# Undersample positives to roughly match the other two classes combined
pos_down = resample(pos, replace=False, n_samples=len(neg) + len(neu), random_state=42)
# Combine
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)
print(df_balanced["sentiment"].value_counts())
#Retrain classifier
X_train, X_test, y_train, y_test = train_test_split(
df_balanced["clean_review"], df_balanced["sentiment"],
test_size=0.2, random_state=42, stratify=df_balanced["sentiment"]
)
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)
print("Balanced model trained successfully ")
#Evaluate again
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
"""#Agents"""
# Base class for all agents
class BaseAgent:
    """A simple base agent with a name and a run() method."""
    def __init__(self, name):
        self.name = name
    def run(self, *args, **kwargs):
        raise NotImplementedError("Subclasses must implement run() method.")
    def log(self, message):
        print(f"[{self.name}] {message}")
#DataAgent for loading/cleaning
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""
    def __init__(self, data):
        super().__init__("DataAgent")
        self.data = data
    def run(self):
        self.log("Preprocessing data...")
        df_clean = self.data.copy()
        df_clean["review_body"] = df_clean["review_body"].str.strip()
        df_clean.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(df_clean)} reviews.")
        return df_clean
#AnalysisAgent --> uses the trained sentiment model (TF-IDF + Logistic Regression) to classify reviews
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""
    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        self.vectorizer = vectorizer
        self.model = model
    def run(self, reviews):
        self.log(f"Analyzing {len(reviews)} reviews...")
        X = self.vectorizer.transform(reviews)
        predictions = self.model.predict(X)
        return predictions
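# Illustrative: the AnalysisAgent can also be exercised on a few ad-hoc strings
# (reusing sample_texts defined earlier) before running it over the full dataframe.
demo_agent = AnalysisAgent(vectorizer, clf)
print(list(demo_agent.run(sample_texts)))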
#ReviewerAgent: serves as the summary agent, using the Anthropic API to produce LLM review insights
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""
    def __init__(self):
        super().__init__("ReviewerAgent")
        # Retrieve your key once; it's already stored in Colab userdata
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")
        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")
        self.client = Anthropic(api_key=api_key)
    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")
        prompt = f"""
You are a product insights assistant.
Based on the following summarized customer reviews, write a concise 3-4 sentence sentiment analysis report.
Clearly describe the main themes and tone in user feedback on these home appliance products.
Reviews Summary:
{summary_text}
"""
        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}]
        )
        output = response.content[0].text.strip()
        self.log("Summary generated successfully")
        return output
# Instantiate agents
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()
# Clean data
df_ready = data_agent.run()
# Classify sentiments
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])
# Prepare summary text by sentiment group
summary_text = df_ready.groupby("predicted_sentiment")["review_body"].apply(lambda x: " ".join(x[:3])).to_string()
# Generate AI summary using Anthropic
insight_summary = reviewer_agent.run(summary_text)
print(insight_summary)
"""#Evaluation & Visualization"""
# === Evaluation & Visualization ===
# Count predicted sentiments
sentiment_counts = df_ready["predicted_sentiment"].value_counts()
print(sentiment_counts)
# Plot sentiment distribution
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()
# Compute average review length per sentiment
df_ready["review_length"] = df_ready["review_body"].apply(lambda x: len(x.split()))
avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()
print(avg_length)
# Visualize it
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()