# -*- coding: utf-8 -*-
"""Week_6_exercise_revised.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1GaV053HB8l-Wd3J3o9BcOAjC009Qk_W0
"""
#installations
!pip install --upgrade pip
!pip install datasets==3.0.1 anthropic transformers accelerate pandas tqdm numpy scikit-learn matplotlib seaborn
#imports
import os
import re
import json
import random
import time
from typing import Optional, List, Dict, Any, Tuple
from sklearn.model_selection import train_test_split
import anthropic
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
# TEMPORARY: hard-coded keys
# I hid my keys; replace the 'sk' and 'hf' placeholders with your own keys
os.environ["ANTHROPIC_API_KEY"] = "sk"
os.environ["HF_TOKEN"] = "hf"
# Anthropic Client
try:
    client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
    print("Anthropic client initialized")
except Exception as e:
    # anthropic is already imported by this point, so a failure here is most
    # likely a missing or invalid key rather than a missing package
    raise RuntimeError("Failed to initialize the Anthropic client - check ANTHROPIC_API_KEY") from e
# Basic configuration used throughout the notebook
RANDOM_SEED = 42
# medium test size
TEST_SIZE = 50
CLAUDE_MODEL = "claude-opus-4-20250514"
MAX_TOKENS = 300
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Load my dataset, the Appliances category in my case
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Appliances", split="full")
#using Pandas to create a dataframe
df = dataset.to_pandas()
#see the data
df.head()
# Let's clean the price column and store it as price_clean
df["price_clean"] = pd.to_numeric(df["price"], errors="coerce")
# Check the number of rows in the data
print("Dataset size:", len(df))
# Check the features in the data
print(df.columns.tolist())
#checking some info
print(df.info())
print("Price-related columns found:", [c for c in df.columns if "price" in c])
print("Missing price_clean:", df["price_clean"].isna().sum(), "rows")
# Price distribution visualization (Zoomed histogram)
plt.figure(figsize=(10,5))
df[df["price_clean"] < 200]["price_clean"].hist(bins=50)
plt.title("Price Distribution")
plt.xlabel("Price ($)")
plt.ylabel("Frequency")
plt.show()
# Keep only rows where price_clean is not null
df_model = df.dropna(subset=["price_clean"]).copy()
# Combine title, features, and description into a single prompt text
def combine_text(row):
    title = row["title"] or ""
    features = " ".join(row["features"]) if isinstance(row["features"], list) else ""
    description = " ".join(row["description"]) if isinstance(row["description"], list) else ""
    return f"{title}\n\nFEATURES: {features}\n\nDESCRIPTION: {description}"
df_model["text"] = df_model.apply(combine_text, axis=1)
# Retain what's needed
df_model = df_model[["text", "price_clean"]].reset_index(drop=True)
# check the model dataset size
print(len(df_model))
df_model.head(5)
# Split the data into training and test sets
train_df, test_df = train_test_split(
    df_model,
    test_size=0.10,  # 10% test split
    random_state=RANDOM_SEED
)
#Training
len(train_df)
#Testing
len(test_df)
# Convert the test set to a list of records for easier sampling
test_records = test_df.to_dict(orient="records")
# Pricing system prompt
def build_prompt(item_text: str) -> str:
    return f"""
You are a pricing analyst. Given a marketplace product listing, estimate the item's fair market price in USD.
Return ONLY a number, no currency sign, no explanation.
Product details:
\"\"\"
{item_text}
\"\"\"
"""
def estimate_price_claude(item_text: str) -> Optional[float]:
    try:
        prompt = build_prompt(item_text)
        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        raw_output = response.content[0].text.strip()
        # Extract the first valid number from the model response
        match = re.search(r"\d+(\.\d+)?", raw_output.replace(",", ""))
        return float(match.group(0)) if match else None
    except Exception as e:
        print("Error:", e)
        return None
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
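# Quick smoke test of estimate_price_claude on one held-out item (a minimal
# sketch; it makes a single live API call and assumes test_records is populated
# as above).
sample_item = test_records[0]
estimate = estimate_price_claude(sample_item["text"])
print(f"True price: {sample_item['price_clean']:.2f} | Claude estimate: {estimate}")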
# Filter and Sample 100 usable Rows
df_usable = df[df["price_clean"].notna()].copy()
sample_df = df_usable.sample(100, random_state=42).reset_index(drop=True)
# Empty list to store the predictions
predictions = []
# Helper for parsing prices out of Claude's replies
def extract_price(text):
    """Extract the first valid float from Claude's reply."""
    match = re.search(r"\d+(\.\d+)?", text.replace(",", ""))
    return float(match.group(0)) if match else None
# Getting the predictions
for i, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    title = row["title"]
    desc = " ".join(row["description"]) if isinstance(row["description"], list) else str(row["description"])
    feat = " ".join(row["features"]) if isinstance(row["features"], list) else str(row["features"])
    cats = " ".join(row["categories"]) if isinstance(row["categories"], list) else str(row["categories"])
    prompt = f"""
You are estimating the USD retail price of an appliance part.
Analyze the information and respond with **only a single number** (no currency symbol, no text, no explanation).
TITLE: {title}
DESCRIPTION: {desc}
FEATURES: {feat}
CATEGORIES: {cats}
Your response must be only a number like: 29.99
"""
    response = client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=50,
        messages=[{"role": "user", "content": prompt}]
    )
    raw = response.content[0].text.strip()
    pred_price = extract_price(raw)
    predictions.append({
        "title": title,
        "true_price": row["price_clean"],
        "claude_price": pred_price,
        "raw_reply": raw
    })
# Save the output to a CSV now
result_df = pd.DataFrame(predictions)
result_df.to_csv("claude_price_predictions_100.csv", index=False)
# Show preview
display(result_df.head())
# Error metrics
valid = result_df[result_df["claude_price"].notna()]
mae = np.mean(np.abs(valid["true_price"] - valid["claude_price"]))
rmse = np.sqrt(np.mean((valid["true_price"] - valid["claude_price"])**2))
pct_within_20 = np.mean(np.abs(valid["true_price"] - valid["claude_price"]) <= 20) * 100
print(f"\nValid predictions: {len(valid)}/{len(result_df)}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"% within $20: {pct_within_20:.1f}%")
"""The model returned a price every single time:
1. -->MAE = 22.52 On average Claude is off by 22.52 from the true price
2. -->RMSE = 44.11 Big errors exist on some items — a sign of occasional wild guesses
2. -->RMSE = 44.11 Big errors exist on some items — a sign of occasional wild guesses
2. -->72% within $20 Claude predicts reasonable accuracy on most products, but 28% are far off.
;
1. Strengths- Model is somehow decent with zero/low fine-tuning. It understood the task, 72% within $20 on a dataset its never seen is a good baseline
1. Weaknesses- Too many rounded “classic” retail numbers (24.99, 89.99, 14.99, 29.99). Seems not to deeply use features, category, or rating. Also the RMSE is high → meaning a few really bad errors are dragging performance
Improvements
1. Prompt enhancements
2. Multi-shot and also better structuring
3. Fine-tuning with local model
"""
#Now we build a persona Prompt
def build_pricing_prompt(examples: list, new_title: str) -> str:
    """
    Build a multi-shot prompt for the E-commerce Market Analyst persona.
    Each example has (title, price).
    """
    few_shots = "\n".join(
        [f"Product: {t}\nEstimated fair market price: ${p:.2f}" for t, p in examples]
    )
    system_prompt = (
        "You are a meticulous Data-Driven Market Analyst who estimates realistic, data-based "
        "product prices for online marketplaces. You base estimates on comparable items and "
        "avoid outliers. Return only the price number."
    )
    user_prompt = (
        f"{system_prompt}\n\nHere are recent examples:\n{few_shots}\n\n"
        f"Now estimate a fair market price for this product:\n"
        f"Product: {new_title}\n\n"
        "Respond with only a number, no text or symbols."
    )
    return user_prompt
# 10-shot predictions
subset_10 = df.dropna(subset=["price_clean"]).sample(10, random_state=42).reset_index(drop=True)
few_shots_3 = subset_10.sample(3, random_state=42)[["title", "price_clean"]].values.tolist()
results_10 = []
for i, row in tqdm(subset_10.iterrows(), total=len(subset_10)):
    prompt = build_pricing_prompt(few_shots_3, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = resp.content[0].text.strip()
        pred = float(reply.replace("$", "").strip())
    except Exception:
        pred, reply = np.nan, None
    results_10.append({"title": row["title"], "true_price": row["price_clean"], "pred_price": pred, "raw": reply})
df10 = pd.DataFrame(results_10).dropna(subset=["pred_price"])
mae10 = np.mean(np.abs(df10.pred_price - df10.true_price))
rmse10 = np.sqrt(np.mean((df10.pred_price - df10.true_price)**2))
pct20_10 = np.mean(np.abs(df10.pred_price - df10.true_price) <= 20) * 100
print(f"MAE={mae10:.2f}, RMSE={rmse10:.2f}, %within$20={pct20_10:.1f}%")
df10.head()
#30 shot
subset_30 = df.dropna(subset=["price_clean"]).sample(30, random_state=42).reset_index(drop=True)
few_shots_5 = subset_30.sample(5, random_state=42)[["title", "price_clean"]].values.tolist()
results_30 = []
for i, row in tqdm(subset_30.iterrows(), total=len(subset_30)):
    prompt = build_pricing_prompt(few_shots_5, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = resp.content[0].text.strip()
        pred = float(reply.replace("$", "").strip())
    except Exception:
        pred, reply = np.nan, None
    results_30.append({"title": row["title"], "true_price": row["price_clean"], "pred_price": pred, "raw": reply})
df30 = pd.DataFrame(results_30).dropna(subset=["pred_price"])
mae30 = np.mean(np.abs(df30.pred_price - df30.true_price))
rmse30 = np.sqrt(np.mean((df30.pred_price - df30.true_price)**2))
pct20_30 = np.mean(np.abs(df30.pred_price - df30.true_price) <= 20) * 100
print(f"MAE={mae30:.2f}, RMSE={rmse30:.2f}, %within$20={pct20_30:.1f}%")
df30.head()
# 50-shot predictions
subset_50 = df.dropna(subset=["price_clean"]).sample(50, random_state=42).reset_index(drop=True)
few_shots_8 = subset_50.sample(8, random_state=42)[["title", "price_clean"]].values.tolist()
results_50 = []
for i, row in tqdm(subset_50.iterrows(), total=len(subset_50)):
    prompt = build_pricing_prompt(few_shots_8, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = resp.content[0].text.strip()
        pred = float(reply.replace("$", "").strip())
    except Exception:
        pred, reply = np.nan, None
    results_50.append({"title": row["title"], "true_price": row["price_clean"], "pred_price": pred, "raw": reply})
df50 = pd.DataFrame(results_50).dropna(subset=["pred_price"])
mae50 = np.mean(np.abs(df50.pred_price - df50.true_price))
rmse50 = np.sqrt(np.mean((df50.pred_price - df50.true_price)**2))
pct20_50 = np.mean(np.abs(df50.pred_price - df50.true_price) <= 20) * 100
print(f"MAE={mae50:.2f}, RMSE={rmse50:.2f}, %within$20={pct20_50:.1f}%")
df50.head()
# Improved prompt, and comparing the 10-, 30-, & 50-shot runs
def build_strict_prompt(few_shots, test_title):
    shots_text = "\n".join([f"Title: {t}\nPrice: ${p:.2f}" for t, p in few_shots])
    return f"""
You are an expert e-commerce product pricing analyst. Your job is to predict the most realistic market price for a product based purely on its title.
Here are reference examples:
{shots_text}
Now predict the price for:
Title: {test_title}
RULES:
- Return ONLY a single number.
- No dollar sign.
- No text, no reasoning, no words.
- Format: 123.45
"""
def run_eval(name, subset, shot_count):
    few = subset.sample(shot_count, random_state=42)[["title", "price_clean"]].values.tolist()
    results = []
    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=f"{name}"):
        prompt = build_strict_prompt(few, row["title"])
        try:
            resp = client.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=MAX_TOKENS,
                messages=[{"role": "user", "content": prompt}],
            )
            reply = resp.content[0].text.strip()
            pred = float(reply)
        except Exception:
            pred, reply = np.nan, None
        results.append({"title": row["title"], "true": row["price_clean"], "pred": pred})
    # Local name avoids shadowing the global df
    df_res = pd.DataFrame(results).dropna(subset=["pred"])
    mae = np.mean(np.abs(df_res.pred - df_res.true))
    rmse = np.sqrt(np.mean((df_res.pred - df_res.true)**2))
    pct20 = np.mean(np.abs(df_res.pred - df_res.true) <= 20) * 100
    return df_res, mae, rmse, pct20
# Run 10 / 30 / 50
subset10 = df.dropna(subset=["price_clean"]).sample(10, random_state=1).reset_index(drop=True)
subset30 = df.dropna(subset=["price_clean"]).sample(30, random_state=2).reset_index(drop=True)
subset50 = df.dropna(subset=["price_clean"]).sample(50, random_state=3).reset_index(drop=True)
df10, mae10, rmse10, pct10 = run_eval("RUN10", subset10, 3)
df30, mae30, rmse30, pct30 = run_eval("RUN30", subset30, 6)
df50, mae50, rmse50, pct50 = run_eval("RUN50", subset50, 8)
#compare
comparison = pd.DataFrame([
    {"shots": 10, "MAE": mae10, "RMSE": rmse10, "%≤$20": pct10},
    {"shots": 30, "MAE": mae30, "RMSE": rmse30, "%≤$20": pct30},
    {"shots": 50, "MAE": mae50, "RMSE": rmse50, "%≤$20": pct50},
])
print(comparison)
comparison
"""The model becomes confused by too many examples, became more biased toward random values and less less stable and less accurate.
Hypothesis: Possibly the dataset has high variance (many unrelated categories), and the model benefits from small, clean, representative few-shots, not large few-shots.
"""
#Rechecking the variance in the data
prices = df["price_clean"].dropna()
print(prices.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95]))
print("\nSkewness:", prices.skew())
print("Kurtosis:", prices.kurt())
# Plot histogram
plt.figure(figsize=(12,4))
sns.histplot(prices, bins=50)
plt.title("Histogram — Full Dataset Price Distribution")
plt.xlabel("Price ($)")
plt.ylabel("Frequency")
plt.show()
# Plot boxplot
plt.figure(figsize=(10,2))
sns.boxplot(x=prices)
plt.title("Boxplot — Full Dataset Price Spread")
plt.show()
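# Per the hypothesis above, "representative" few-shots could be drawn across the
# price distribution rather than uniformly at random. This selection strategy is
# an illustrative sketch (not part of the original runs): it picks one example
# from each price-quantile bin.
def sample_representative_shots(frame, n_shots=5, seed=RANDOM_SEED):
    """Pick one (title, price) example from each of n_shots price-quantile bins."""
    binned = frame.dropna(subset=["price_clean"]).copy()
    binned["bin"] = pd.qcut(binned["price_clean"], q=n_shots, labels=False, duplicates="drop")
    picks = binned.groupby("bin").sample(1, random_state=seed)
    return picks[["title", "price_clean"]].values.tolist()

print(sample_representative_shots(df, n_shots=5))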
"""Testing fewer shots to check fr the optimal"""
def run_few_shot_test(df_subset, shots, model=CLAUDE_MODEL):
    few_shots = df_subset.sample(shots, random_state=42)[["title", "price_clean"]].values.tolist()
    results = []
    for _, row in df_subset.iterrows():
        prompt = build_pricing_prompt(few_shots, row["title"])
        try:
            resp = client.messages.create(
                model=model,
                max_tokens=MAX_TOKENS,
                messages=[{"role": "user", "content": prompt}],
            )
            reply = resp.content[0].text.strip()
            pred = float(reply.replace("$", "").strip())
        except Exception:  # avoid a bare except so KeyboardInterrupt still propagates
            pred, reply = np.nan, None
        results.append({"title": row["title"], "true": row["price_clean"], "pred": pred})
    df_res = pd.DataFrame(results).dropna()
    mae = np.mean(np.abs(df_res.pred - df_res.true))
    rmse = np.sqrt(np.mean((df_res.pred - df_res.true)**2))
    pct20 = np.mean(np.abs(df_res.pred - df_res.true) <= 20) * 100
    return df_res, mae, rmse, pct20
#Tabulate the 2 shot results
df2, mae2, rmse2, pct2 = run_few_shot_test(subset_50, shots=2)
print("2-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae2, rmse2, pct2))
df2.head()
#5 shot results
df5, mae5, rmse5, pct5 = run_few_shot_test(subset_50, shots=5)
print("5-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae5, rmse5, pct5))
df5.head()
#7 shot results
df7, mae7, rmse7, pct7 = run_few_shot_test(subset_50, shots=7)
print("7-SHOT RESULTS → MAE={:.2f}, RMSE={:.2f}, %≤$20={:.1f}%".format(mae7, rmse7, pct7))
df7.head()
# Tabulate all the shot counts to choose the optimal one, or to see whether shots are needed at all
results_summary = [
    {"shots": 0, "MAE": 22.52, "RMSE": 44.11, "%≤$20": 72.0},  # baseline
    {"shots": 2, "MAE": mae2, "RMSE": rmse2, "%≤$20": pct2},
    {"shots": 5, "MAE": mae5, "RMSE": rmse5, "%≤$20": pct5},
    {"shots": 7, "MAE": mae7, "RMSE": rmse7, "%≤$20": pct7},
    {"shots": 10, "MAE": 16.27, "RMSE": 38.59, "%≤$20": 90.0},
    {"shots": 30, "MAE": 135.73, "RMSE": 606.78, "%≤$20": 70.0},
    {"shots": 50, "MAE": 42.54, "RMSE": 136.61, "%≤$20": 72.0},
]
df_comparison = pd.DataFrame(results_summary)
df_comparison = df_comparison.sort_values("shots").reset_index(drop=True)
df_comparison
"""1. 0-shot baseline: MAE 22.52, %≤$20 72%
2. Very low few-shots (2, 5): Surprisingly worse than baseline (MAE ↑, %≤$20 ↓), likely due to variance and poor example selection.
3. 7-shot: Improves over baseline slightly, MAE 19.91, %≤$20 back to 72%
4. 10-shot: Best performance overall — MAE 16.27, %≤$20 jumps to 90%! Clearly the few-shot hints are helping here.
5. 30-shot: Performance collapses (MAE 135.73, RMSE 606.78) — too many examples may confuse the model.
6. 50-shot: Slightly better than 30-shot but still worse than 10-shot.
Conclusion: Optimal few-shot count is 10 for this dataset and prompt style.
"""
# Further fine-tuning of the selected 10-shot setup
def build_finetune_prompt(few_shots: list, target_title: str, max_chars=800):
    """
    few_shots: list of dicts {"title":..., "price_clean":...}
    target_title: title string
    """
    parts = ["You are an e-commerce pricing expert. Estimate product prices in USD accurately. Output only a number."]
    parts.append("\nExamples:")
    for ex in few_shots:
        parts.append(f"- {ex['title']}: {ex['price_clean']}")
    parts.append("\nPredict price for the following product:")
    parts.append(f"Title: {target_title}")
    prompt = "\n".join(parts)
    if len(prompt) > max_chars:
        return prompt[:max_chars] + "..."
    return prompt
# Sample 10-shot prompts for fine-tuning
finetune_examples = []
subset_100 = df.dropna(subset=["price_clean"]).sample(100, random_state=42).reset_index(drop=True)  # 100 products for initial fine-tuning
for idx, row in subset_100.iterrows():
    # Pick 10 random examples from the subset for few-shot
    few_shots = subset_100.drop(idx).sample(10, random_state=idx)[["title","price_clean"]].to_dict(orient="records")
    prompt = build_finetune_prompt(few_shots, row["title"])
    finetune_examples.append({
        "prompt": prompt,
        "completion": str(row["price_clean"])
    })
print("Sample fine-tuning example:")
print(finetune_examples[0])
with open("finetune_10shot.jsonl", "w") as f:
for ex in finetune_examples:
f.write(json.dumps(ex) + "\n")
print("(10-shot format).finetuned")
# Evaluate enhanced 10-shot prompt on sample
results_finetune_test = []
for idx, row in subset_100.iterrows():
    few_shots = subset_100.drop(idx).sample(10, random_state=idx)[["title","price_clean"]].to_dict(orient="records")
    prompt = build_finetune_prompt(few_shots, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}]
        )
        reply = resp.content[0].text.strip()
        pred = float(reply.replace("$","").strip())
    except Exception:
        pred, reply = np.nan, None
    results_finetune_test.append({"title": row["title"], "true_price": row["price_clean"], "pred": pred, "raw": reply})
df_finetune_test = pd.DataFrame(results_finetune_test).dropna(subset=["pred"])
mae_ft = np.mean(np.abs(df_finetune_test.pred - df_finetune_test.true_price))
rmse_ft = np.sqrt(np.mean((df_finetune_test.pred - df_finetune_test.true_price)**2))
pct20_ft = np.mean(np.abs(df_finetune_test.pred - df_finetune_test.true_price) <= 20) * 100
print(f"Finetuned 10-shot performance: MAE={mae_ft:.2f}, RMSE={rmse_ft:.2f}, %≤$20={pct20_ft:.1f}%")
"""Multi-shot prompting (10 examples in the prompt) without fine-tuning performed much better.
Next trial: Prompt optimization
"""
# Prompt optimization seems like the only choice
def build_pricing_prompt_alt(few_shots: list, target_title: str) -> str:
    """
    Build an alternative multi-shot pricing prompt for Claude.
    few_shots: list of dicts with keys 'title' and 'price_clean'
    target_title: product title to predict the price for
    """
    parts = []
    # Instruction with slightly different phrasing
    parts.append("Act as an expert e-commerce pricing analyst.")
    parts.append("Given product titles and their prices, predict the price in USD for the new product.")
    parts.append("Only provide the numeric price. No extra text, explanations, or symbols.")
    # Format the examples differently: numbered list
    parts.append("\nExample prices:")
    for i, ex in enumerate(few_shots, start=1):
        parts.append(f"{i}. {ex['title']} — ${ex['price_clean']:.2f}")
    # Target product
    parts.append("\nPredict the price for this product:")
    parts.append(f"Title: {target_title}")
    parts.append("Price (USD):")
    # Combine into a single prompt
    prompt = "\n".join(parts)
    return prompt
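# The alternative prompt is defined but not evaluated above. A minimal evaluation
# sketch, reusing subset_50 and the 10-shot setting that won earlier (this wiring
# is an assumption, not part of the original notebook):
few_shots_alt = subset_50.sample(10, random_state=42)[["title", "price_clean"]].to_dict(orient="records")
results_alt = []
for _, row in tqdm(subset_50.iterrows(), total=len(subset_50)):
    prompt = build_pricing_prompt_alt(few_shots_alt, row["title"])
    try:
        resp = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}],
        )
        pred = float(resp.content[0].text.strip().replace("$", ""))
    except Exception:
        pred = np.nan
    results_alt.append({"title": row["title"], "true": row["price_clean"], "pred": pred})
df_alt = pd.DataFrame(results_alt).dropna(subset=["pred"])
mae_alt = np.mean(np.abs(df_alt.pred - df_alt.true))
print(f"Alt-prompt MAE: {mae_alt:.2f}")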
"""eda"""