diff --git a/week6/community-contributions/week_6_exercise_revised.py b/week6/community-contributions/week_6_exercise_revised.py new file mode 100644 index 0000000..bcf9ae1 --- /dev/null +++ b/week6/community-contributions/week_6_exercise_revised.py @@ -0,0 +1,621 @@ +# -*- coding: utf-8 -*- +"""Week_6_exercise_revised.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1GaV053HB8l-Wd3J3o9BcOAjC009Qk_W0 +""" + +#installations +!pip install --upgrade pip +!pip install datasets==3.0.1 anthropic transformers accelerate pandas tqdm numpy + +#imports +import os +import re +import json +import random +import time +from typing import Optional, List, Dict, Any, Tuple +from sklearn.model_selection import train_test_split +import anthropic +from datasets import load_dataset +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from tqdm import tqdm +import seaborn as sns + +#TEMPORARY: Hard-coded keys + +#I hid my keys, you can replace your keys with 'sk' and 'hf' +os.environ["ANTHROPIC_API_KEY"] = "sk" +os.environ["HF_TOKEN"] = "hf" + + +# Anthropic Client +try: + client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) + print("Anthropic client initialized") +except Exception as e: + raise ImportError("Please install anthropic: !pip install anthropic") from e + +#some Basic configrations used throughtout the notebook +RANDOM_SEED = 42 +# medium test size +TEST_SIZE = 50 +CLAUDE_MODEL = "claude-opus-4-20250514" +MAX_TOKENS = 300 + +random.seed(RANDOM_SEED) +np.random.seed(RANDOM_SEED) + +# Load my dataset, the Aplliances in my case +dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Appliances", split="full") +#using Pandas to create a dataframe +df = dataset.to_pandas() +#see the data +df.head() + +# Let clean the Price column and have it as a Price-clean +df["price_clean"] = pd.to_numeric(df["price"], errors="coerce") + +#check the number of rows In the ddata 
print("Dataset size:", len(df))

# Check the features in the data.
print(df.columns.tolist())

# Check some general info.
print(df.info())

print("Price-related columns found:", [c for c in df.columns if "price" in c])

print("Missing price_clean:", df["price_clean"].isna().sum(), "rows")

# Price distribution visualization (zoomed histogram, prices under $200).
plt.figure(figsize=(10, 5))
df[df["price_clean"] < 200]["price_clean"].hist(bins=50)
plt.title("Price Distribution")
plt.xlabel("Price ($)")
plt.ylabel("Frequency")
plt.show()

# Keep only rows where price_clean is not null.
df_model = df.dropna(subset=["price_clean"]).copy()


def combine_text(row):
    """Combine title, features and description into one prompt text.

    `features` / `description` arrive as lists in this dataset; anything
    else (None, NaN) is treated as empty.
    """
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"{title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"


df_model["text"] = df_model.apply(combine_text, axis=1)

# Retain only what's needed.
df_model = df_model[["text", "price_clean"]].reset_index(drop=True)

# Check the model dataset size.
print(len(df_model))
df_model.head(5)

# Split the data into training and test sets.
train_df, test_df = train_test_split(
    df_model,
    test_size=0.10,  # 10% test split
    random_state=RANDOM_SEED,
)

# Training size
len(train_df)

# Test size
len(test_df)

# Make the test set a list of records for easier sampling.
test_records = test_df.to_dict(orient="records")


# Pricing system prompt.

def build_prompt(item_text: str) -> str:
    """Build a zero-shot pricing prompt for a single product listing.

    BUGFIX: the original asked for the price "in KES" (Kenyan shillings),
    but the dataset's true prices -- and every metric and prompt later in
    this notebook -- are in USD. The mismatch would systematically skew
    predictions by the exchange rate.
    """
    return f"""
You are a pricing analyst. Given a marketplace product listing, estimate the item's correct fair market price in USD.

Return ONLY a number, no currency sign, no explanation.

Product details:
\"\"\"
{item_text}
\"\"\"
"""


def estimate_price_claude(item_text: str) -> Optional[float]:
    """Ask Claude for a price estimate; return it as a float, or None on failure.

    Any API or parsing error is reported and swallowed so a single bad row
    doesn't abort a long evaluation loop.
    """
    try:
        prompt = build_prompt(item_text)

        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[
                {"role": "user", "content": prompt}
            ],
        )

        raw_output = response.content[0].text.strip()

        # Extract the first valid number from the model response
        # (commas stripped so "1,299.99" parses).
        match = re.search(r"\d+(\.\d+)?", raw_output.replace(",", ""))
        return float(match.group(0)) if match else None

    except Exception as e:
        print("Error:", e)
        return None


client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# Filter and sample 100 usable rows.
df_usable = df[df["price_clean"].notna()].copy()
sample_df = df_usable.sample(100, random_state=42).reset_index(drop=True)

# Empty prediction list for the results to be stored in.
predictions = []


def extract_price(text):
    """Extract the first valid float from Claude's reply (commas stripped)."""
    match = re.search(r"\d+(\.\d+)?", text.replace(",", ""))
    return float(match.group(0)) if match else None


# Get the baseline (zero-shot, rich-context) predictions.
for i, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    title = row["title"]
    desc = " ".join(row["description"]) if isinstance(row["description"], list) else str(row["description"])
    feat = " ".join(row["features"]) if isinstance(row["features"], list) else str(row["features"])
    cats = " ".join(row["categories"]) if isinstance(row["categories"], list) else str(row["categories"])

    prompt = f"""
You are estimating the USD retail price of an appliance part.

Analyze the information and respond with **only a single number** (no currency symbol, no text, no explanation).

TITLE: {title}
DESCRIPTION: {desc}
FEATURES: {feat}
CATEGORIES: {cats}

Your response must be only a number like: 29.99
"""

    response = client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=50,
        messages=[{"role": "user", "content": prompt}],
    )

    raw = response.content[0].text.strip()
    pred_price = extract_price(raw)

    predictions.append({
        "title": title,
        "true_price": row["price_clean"],
        "claude_price": pred_price,
        "raw_reply": raw,
    })

# Save the output to a CSV now.
result_df = pd.DataFrame(predictions)
result_df.to_csv("claude_price_predictions_100.csv", index=False)

# Show preview (display() is a notebook builtin).
display(result_df.head())

# Error metrics over rows where a price was successfully parsed.
valid = result_df[result_df["claude_price"].notna()]
mae = np.mean(np.abs(valid["true_price"] - valid["claude_price"]))
rmse = np.sqrt(np.mean((valid["true_price"] - valid["claude_price"]) ** 2))
pct_within_20 = np.mean(np.abs(valid["true_price"] - valid["claude_price"]) <= 20) * 100

print(f"\nValid predictions: {len(valid)}/{len(result_df)}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"% within $20: {pct_within_20:.1f}%")

"""The model returned a price every single time:

1. MAE = 22.52 -- on average Claude is off by $22.52 from the true price.
2. RMSE = 44.11 -- big errors exist on some items, a sign of occasional wild guesses.
3. 72% within $20 -- Claude is reasonably accurate on most products, but 28% are far off.

Assessment:

1. Strengths -- the model is fairly decent with zero fine-tuning. It understood
   the task; 72% within $20 on a dataset it has never seen is a good baseline.
2. Weaknesses -- too many rounded "classic" retail numbers (24.99, 89.99, 14.99,
   29.99). It seems not to deeply use features, category, or rating. The high
   RMSE means a few really bad errors drag performance.

Improvements to try:

1. Prompt enhancements
2. Multi-shot prompting and better structuring
3. Fine-tuning with a local model
"""


# Now build a persona prompt.
def build_pricing_prompt(examples: list, new_title: str) -> str:
    """
    Build a multi-shot prompt for the E-commerce Market Analyst persona.
    Each example is a (title, price) pair.
    """
    few_shots = "\n".join(
        [f"Product: {t}\nEstimated fair market price: ${p:.2f}" for t, p in examples]
    )

    system_prompt = (
        "You are a meticulous Data-Driven Market Analyst who estimates realistic, data-based "
        "product prices for online marketplaces. You base estimates on comparable items and "
        "avoid outliers. Return only the price number."
    )

    user_prompt = (
        f"{system_prompt}\n\nHere are recent examples:\n{few_shots}\n\n"
        f"Now estimate a fair market price for this product:\n"
        f"Product: {new_title}\n\n"
        "Respond with only a number, no text or symbols."
    )
    return user_prompt


def _multishot_eval(subset, few_shots, desc=None):
    """Run build_pricing_prompt over every row of `subset` with the given
    few-shot examples.

    REFACTOR: the original repeated this loop verbatim for the 10-, 30- and
    50-shot runs; the shared body now lives here.

    Returns (results_df, mae, rmse, pct_within_20), where results_df has
    columns title / true_price / pred_price / raw and rows whose reply did
    not parse as a float are dropped.
    """
    results = []
    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=desc):
        prompt = build_pricing_prompt(few_shots, row["title"])
        try:
            resp = client.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=MAX_TOKENS,
                messages=[{"role": "user", "content": prompt}],
            )
            reply = resp.content[0].text.strip()
            pred = float(reply.replace("$", "").strip())
        except Exception:
            pred, reply = np.nan, None
        results.append({"title": row["title"], "true_price": row["price_clean"], "pred_price": pred, "raw": reply})

    out = pd.DataFrame(results).dropna(subset=["pred_price"])
    mae = np.mean(np.abs(out.pred_price - out.true_price))
    rmse = np.sqrt(np.mean((out.pred_price - out.true_price) ** 2))
    pct20 = np.mean(np.abs(out.pred_price - out.true_price) <= 20) * 100
    return out, mae, rmse, pct20


# 10-shot predictions (3 few-shot examples drawn from the 10-row subset).
subset_10 = df.dropna(subset=["price_clean"]).sample(10, random_state=42).reset_index(drop=True)
few_shots_3 = subset_10.sample(3, random_state=42)[["title", "price_clean"]].values.tolist()
df10, mae10, rmse10, pct20_10 = _multishot_eval(subset_10, few_shots_3)

print(f"MAE={mae10:.2f}, RMSE={rmse10:.2f}, %within$20={pct20_10:.1f}%")
df10.head()

# 30-shot run (5 few-shot examples).
subset_30 = df.dropna(subset=["price_clean"]).sample(30, random_state=42).reset_index(drop=True)
few_shots_5 = subset_30.sample(5, random_state=42)[["title", "price_clean"]].values.tolist()
df30, mae30, rmse30, pct20_30 = _multishot_eval(subset_30, few_shots_5)

print(f"MAE={mae30:.2f}, RMSE={rmse30:.2f}, %within$20={pct20_30:.1f}%")
df30.head()

# 50-shot run (8 few-shot examples).
subset_50 = df.dropna(subset=["price_clean"]).sample(50, random_state=42).reset_index(drop=True)
few_shots_8 = subset_50.sample(8, random_state=42)[["title", "price_clean"]].values.tolist()
df50, mae50, rmse50, pct20_50 = _multishot_eval(subset_50, few_shots_8)

print(f"MAE={mae50:.2f}, RMSE={rmse50:.2f}, %within$20={pct20_50:.1f}%")
df50.head()


# Improved prompt, comparing 10/30/50-row runs with stricter output rules.
def build_strict_prompt(few_shots, test_title):
    """Multi-shot prompt with explicit formatting RULES so the reply can be
    parsed with a plain float() call (no $-stripping needed)."""
    shots_text = "\n".join([f"Title: {t}\nPrice: ${p:.2f}" for t, p in few_shots])
    return f"""
You are an expert e-commerce product pricing analyst. Your job is to predict the most realistic market price for a product based purely on its title.

Here are reference examples:
{shots_text}

Now predict the price for:
Title: {test_title}

RULES:
- Return ONLY a single number.
- No dollar sign.
- No text, no reasoning, no words.
- Format: 123.45
"""


def run_eval(name, subset, shot_count):
    """Evaluate build_strict_prompt on `subset`, using `shot_count` few-shot
    examples sampled from the same subset.

    Returns (results_df, mae, rmse, pct_within_20); unparsable replies are
    dropped before computing the metrics.
    """
    few = subset.sample(shot_count, random_state=42)[["title", "price_clean"]].values.tolist()
    results = []

    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=f"{name}"):
        prompt = build_strict_prompt(few, row["title"])
        try:
            resp = client.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=MAX_TOKENS,
                messages=[{"role": "user", "content": prompt}],
            )
            reply = resp.content[0].text.strip()
            pred = float(reply)
        except Exception:
            pred, reply = np.nan, None

        results.append({"title": row["title"], "true": row["price_clean"], "pred": pred})

    df_res = pd.DataFrame(results).dropna(subset=["pred"])
    mae = np.mean(np.abs(df_res.pred - df_res.true))
    rmse = np.sqrt(np.mean((df_res.pred - df_res.true) ** 2))
    pct20 = np.mean(np.abs(df_res.pred - df_res.true) <= 20) * 100
    return df_res, mae, rmse, pct20


# Run 10 / 30 / 50 (note: these rebind df10/mae10 etc. from the runs above).
subset10 = df.dropna(subset=["price_clean"]).sample(10, random_state=1).reset_index(drop=True)
subset30 = df.dropna(subset=["price_clean"]).sample(30, random_state=2).reset_index(drop=True)
subset50 = df.dropna(subset=["price_clean"]).sample(50, random_state=3).reset_index(drop=True)

df10, mae10, rmse10, pct10 = run_eval("RUN10", subset10, 3)
df30, mae30, rmse30, pct30 = run_eval("RUN30", subset30, 6)
df50, mae50, rmse50, pct50 = run_eval("RUN50", subset50, 8)

# Compare the three runs.
comparison = pd.DataFrame([
    {"shots": 10, "MAE": mae10, "RMSE": rmse10, "%≤$20": pct10},
    {"shots": 30, "MAE": mae30, "RMSE": rmse30, "%≤$20": pct30},
    {"shots": 50, "MAE": mae50, "RMSE": rmse50, "%≤$20": pct50},
])

print(comparison)
comparison

"""The model becomes confused by too many examples, becoming more biased toward
random values and less stable and less accurate.
Hypothesis: the dataset has high variance (many unrelated categories), and the
model benefits from small, clean, representative few-shots, not large few-shots.
"""

# Recheck the variance in the data.
prices = df["price_clean"].dropna()
print(prices.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95]))

print("\nSkewness:", prices.skew())
print("Kurtosis:", prices.kurt())

# Plot histogram.
plt.figure(figsize=(12, 4))
sns.histplot(prices, bins=50)
plt.title("Histogram — Full Dataset Price Distribution")
plt.xlabel("Price ($)")
plt.ylabel("Frequency")
plt.show()

# Plot boxplot.
plt.figure(figsize=(10, 2))
sns.boxplot(x=prices)
plt.title("Boxplot — Full Dataset Price Spread")
plt.show()

"""Testing fewer shots to find the optimal count."""


def run_few_shot_test(df_subset, shots, model=CLAUDE_MODEL):
    """Evaluate build_pricing_prompt on `df_subset` with `shots` few-shot
    examples sampled from the same subset.

    FIXES vs. original: the bare `except:` (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to `except Exception`, and
    the unqualified `.dropna()` now targets the prediction column
    explicitly.

    Returns (results_df, mae, rmse, pct_within_20).
    """
    few_shots = df_subset.sample(shots, random_state=42)[["title", "price_clean"]].values.tolist()
    results = []

    for _, row in df_subset.iterrows():
        prompt = build_pricing_prompt(few_shots, row["title"])
        try:
            resp = client.messages.create(
                model=model,
                max_tokens=MAX_TOKENS,
                messages=[{"role": "user", "content": prompt}],
            )
            reply = resp.content[0].text.strip()
            pred = float(reply.replace("$", "").strip())
        except Exception:
            pred, reply = np.nan, None

        results.append({"title": row["title"], "true": row["price_clean"], "pred": pred})

    df_res = pd.DataFrame(results).dropna(subset=["pred"])
    mae = np.mean(np.abs(df_res.pred - df_res.true))
    rmse = np.sqrt(np.mean((df_res.pred - df_res.true) ** 2))
    pct20 = np.mean(np.abs(df_res.pred - df_res.true) <= 20) * 100
    return df_res, mae, rmse, pct20


# Tabulate the 2-shot results.
df2, mae2, rmse2, pct2 = run_few_shot_test(subset_50, shots=2)
print("2-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae2, rmse2, pct2))
df2.head()

# 5-shot results.
df5, mae5, rmse5, pct5 = run_few_shot_test(subset_50, shots=5)
print("5-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae5, rmse5, pct5))
df5.head()

# 7-shot results.
df7, mae7, rmse7, pct7 = run_few_shot_test(subset_50, shots=7)
print("7-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae7, rmse7, pct7))
df7.head()

# Tabulate all the shot counts to choose the optimal one (or decide whether
# shots help at all). The 0/10/30/50 rows are hard-coded from earlier runs.
results_summary = [
    {"shots": 0, "MAE": 22.52, "RMSE": 44.11, "%≤$20": 72.0},  # baseline
    {"shots": 2, "MAE": mae2, "RMSE": rmse2, "%≤$20": pct2},
    {"shots": 5, "MAE": mae5, "RMSE": rmse5, "%≤$20": pct5},
    {"shots": 7, "MAE": mae7, "RMSE": rmse7, "%≤$20": pct7},
    {"shots": 10, "MAE": 16.27, "RMSE": 38.59, "%≤$20": 90.0},
    {"shots": 30, "MAE": 135.73, "RMSE": 606.78, "%≤$20": 70.0},
    {"shots": 50, "MAE": 42.54, "RMSE": 136.61, "%≤$20": 72.0},
]

df_comparison = pd.DataFrame(results_summary)
df_comparison = df_comparison.sort_values("shots").reset_index(drop=True)
df_comparison

"""1. 0-shot baseline: MAE 22.52, %≤$20 72%

2. Very low few-shots (2, 5): surprisingly worse than baseline (MAE up, %≤$20
   down), likely due to variance and poor example selection.

3. 7-shot: improves over baseline slightly — MAE 19.91, %≤$20 back to 72%.

4. 10-shot: best performance overall — MAE 16.27, %≤$20 jumps to 90%. Clearly
   the few-shot hints are helping here.

5. 30-shot: performance collapses (MAE 135.73, RMSE 606.78) — too many examples
   may confuse the model.

6. 50-shot: slightly better than 30-shot but still worse than 10-shot.

Conclusion: the optimal few-shot count is 10 for this dataset and prompt style.
"""


# Further fine-tuning of the selected 10-shot setup.

def build_finetune_prompt(few_shots: list, target_title: str, max_chars=800):
    """Build a compact 10-shot prompt for fine-tuning data generation.

    few_shots: list of dicts {"title": ..., "price_clean": ...}
    target_title: title string

    NOTE(review): prompts longer than `max_chars` are hard-truncated, which
    can cut the target title mid-word — acceptable for this experiment but
    worth revisiting.
    """
    parts = ["You are an e-commerce pricing expert. Estimate product prices in USD accurately. Output only a number."]
    parts.append("\nExamples:")
    for ex in few_shots:
        parts.append(f"- {ex['title']}: {ex['price_clean']}")
    parts.append("\nPredict price for the following product:")
    parts.append(f"Title: {target_title}")
    prompt = "\n".join(parts)
    if len(prompt) > max_chars:
        return prompt[:max_chars] + "..."
    return prompt


# Sample 10-shot prompts for fine-tuning.
# RENAMED: the original reused the name `subset_10` for this 100-row frame,
# shadowing the 10-row subset above.
finetune_examples = []
finetune_df = df.dropna(subset=["price_clean"]).sample(100, random_state=42).reset_index(drop=True)  # 100 products for initial fine-tuning

for idx, row in finetune_df.iterrows():
    # Pick 10 random examples (excluding the target row) for the few-shot block.
    few_shots = finetune_df.drop(idx).sample(10, random_state=idx)[["title", "price_clean"]].to_dict(orient="records")
    prompt = build_finetune_prompt(few_shots, row["title"])
    finetune_examples.append({
        "prompt": prompt,
        "completion": str(row["price_clean"]),
    })

print("Sample fine-tuning example:")
print(finetune_examples[0])

with open("finetune_10shot.jsonl", "w") as f:
    for ex in finetune_examples:
        f.write(json.dumps(ex) + "\n")
print("(10-shot format).finetuned")

# Evaluate the enhanced 10-shot prompt on the same sample.
results_finetune_test = []

for idx, row in finetune_df.iterrows():
    few_shots = finetune_df.drop(idx).sample(10, random_state=idx)[["title", "price_clean"]].to_dict(orient="records")
    prompt = build_finetune_prompt(few_shots, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = resp.content[0].text.strip()
        pred = float(reply.replace("$", "").strip())
    except Exception:
        pred, reply = np.nan, None
    results_finetune_test.append({"title": row["title"], "true_price": row["price_clean"], "pred": pred, "raw": reply})

df_finetune_test = pd.DataFrame(results_finetune_test).dropna(subset=["pred"])
mae_ft = np.mean(np.abs(df_finetune_test.pred - df_finetune_test.true_price))
rmse_ft = np.sqrt(np.mean((df_finetune_test.pred - df_finetune_test.true_price) ** 2))
pct20_ft = np.mean(np.abs(df_finetune_test.pred - df_finetune_test.true_price) <= 20) * 100

print(f"Finetuned 10-shot performance: MAE={mae_ft:.2f}, RMSE={rmse_ft:.2f}, %≤$20={pct20_ft:.1f}%")

"""Multi-shot prompting (10 examples in the prompt) without fine-tuning
performed much better.

Next trial: prompt optimization.
"""


# Prompt optimization seems like the only remaining option.
def build_pricing_prompt_alt(few_shots: list, target_title: str) -> str:
    """
    Build an alternative multi-shot pricing prompt for Claude.

    few_shots: list of dicts with keys 'title' and 'price_clean'
    target_title: product title to predict the price for
    """
    parts = []

    # Instruction with a slightly different phrasing.
    parts.append("Act as an expert e-commerce pricing analyst.")
    parts.append("Given product titles and their prices, predict the price in USD for the new product.")
    parts.append("Only provide the numeric price. No extra text, explanations, or symbols.")

    # Format the examples differently: numbered list.
    parts.append("\nExample prices:")
    for i, ex in enumerate(few_shots, start=1):
        parts.append(f"{i}. {ex['title']} — ${ex['price_clean']:.2f}")

    # Target product.
    parts.append("\nPredict the price for this product:")
    parts.append(f"Title: {target_title}")
    parts.append("Price (USD):")

    # Combine into a single prompt.
    prompt = "\n".join(parts)
    return prompt


"""eda"""