# -*- coding: utf-8 -*-
"""week8_exercie.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
"""
!pip install -q pandas matplotlib seaborn
!pip install -q datasets==3.0.1
!pip install -q anthropic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# check performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
import os
from anthropic import Anthropic
import re
pd.set_option("display.max_colwidth", 100)
# # Alternative: initialize the client using an environment variable
# client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# # Quick test
# print("Anthropic client initialized." if client else "Anthropic client not detected.")
from google.colab import userdata
api_key = userdata.get('ANTHROPIC_API_KEY')
os.environ["ANTHROPIC_API_KEY"] = api_key
client = Anthropic(api_key=api_key)
# List available models
models = client.models.list()
print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")
# Loading a sample from the full reviews data
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")
# creating a DF
df = pd.DataFrame(dataset)
df = df[["title", "text", "rating"]].dropna().reset_index(drop=True)
# Rename the column for clarity / easy reference
df.rename(columns={"text": "review_body"}, inplace=True)
print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()
# Inspect the data
# Basic info
print(df.info())
print(df.isnull().sum())
# Rating distribution
print(df["rating"].value_counts().sort_index())
# Check random reviews
display(df.sample(5, random_state=42))
# Review length distribution
df["review_length"] = df["review_body"].apply(lambda x: len(str(x).split()))
# Summarize the review length
print(df["review_length"].describe())
# Plot the rating distribution
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1-5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()
# Review length
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()
# Cleaning
def clean_text(text):
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # remove punctuation/special chars
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # normalize whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text
df["clean_review"] = df["review_body"].apply(clean_text)
df.head(3)
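# Quick sanity check of clean_text (illustrative example): URLs, punctuation,
# and extra whitespace should be stripped, and everything lowercased.
print(clean_text("GREAT blender!!!  See https://example.com for details..."))
# -> "great blender see for details"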
"""'#sentiment analysis"""
# Label ratings by sentiment
def label_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"
df["sentiment"] = df["rating"].apply(label_sentiment)
df["sentiment"].value_counts()
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"], df["sentiment"], test_size=0.2, random_state=42, stratify=df["sentiment"]
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
# Train a lightweight classifier
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
sample_texts = [
    "This blender broke after two days. Waste of money!",
    "Works exactly as described, very satisfied!",
    "Its okay, does the job but nothing special."
]
sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)
for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")
"""#Improving Model Balance & Realism"""
# Separate by sentiment
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]
# Undersample the positive class to roughly match the size of the other two combined
pos_down = resample(pos, replace=False, n_samples=len(neg) + len(neu), random_state=42)
# Combine
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)
print(df_balanced["sentiment"].value_counts())
# Retrain the classifier on the balanced data
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["clean_review"], df_balanced["sentiment"],
    test_size=0.2, random_state=42, stratify=df_balanced["sentiment"]
)
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)
print("Balanced model trained successfully ")
# Evaluate again
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
"""#Agents"""
# Base class for all agents
class BaseAgent:
    """A simple base agent with a name and a run() method."""
    def __init__(self, name):
        self.name = name
    def run(self, *args, **kwargs):
        raise NotImplementedError("Subclasses must implement run() method.")
    def log(self, message):
        print(f"[{self.name}] {message}")
# DataAgent for loading/cleaning
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""
    def __init__(self, data):
        super().__init__("DataAgent")
        self.data = data
    def run(self):
        self.log("Preprocessing data...")
        df_clean = self.data.copy()
        df_clean["review_body"] = df_clean["review_body"].str.strip()
        df_clean.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(df_clean)} reviews.")
        return df_clean
# AnalysisAgent: uses the trained sentiment model (TF-IDF + Logistic Regression) to classify reviews
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""
    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        self.vectorizer = vectorizer
        self.model = model
    def run(self, reviews):
        self.log(f"Analyzing {len(reviews)} reviews...")
        X = self.vectorizer.transform(reviews)
        predictions = self.model.predict(X)
        return predictions
# ReviewerAgent: serves as the summary agent, using the Anthropic API to generate LLM review insights
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""
    def __init__(self):
        super().__init__("ReviewerAgent")
        # Retrieve the key once; it's already stored in Colab userdata
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")
        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")
        self.client = Anthropic(api_key=api_key)
    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")
        prompt = f"""
You are a product insights assistant.
Based on the following summarized customer reviews, write a concise 3-4 sentence sentiment analysis report.
Clearly describe the main themes and tone in user feedback on these home appliance products.
Reviews Summary:
{summary_text}
"""
        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}]
        )
        output = response.content[0].text.strip()
        self.log("Summary generated successfully.")
        return output
# Instantiate agents
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()
# Clean data
df_ready = data_agent.run()
# Classify sentiments (apply the same clean_text used at training time, so the
# TF-IDF vectorizer sees text in the same form it was fitted on)
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"].apply(clean_text))
# Prepare summary text by sentiment group
summary_text = df_ready.groupby("predicted_sentiment")["review_body"].apply(lambda x: " ".join(x[:3])).to_string()
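# Optional: preview the text that will be sent to Claude before spending tokens
# (illustrative addition):
print(summary_text[:500])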
# Generate AI summary using Anthropic
insight_summary = reviewer_agent.run(summary_text)
print(insight_summary)
"""#Evaluation & Visualization"""
# Evaluation & Visualization ===
# Count predicted sentiments
sentiment_counts = df_ready["predicted_sentiment"].value_counts()
print(sentiment_counts)
# Plot sentiment distribution
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()
# Compute average review length per sentiment
df_ready["review_length"] = df_ready["review_body"].apply(lambda x: len(x.split()))
avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()
print(avg_length)
# Visualize it
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()
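# Optional final step (illustrative; the filename is arbitrary): persist the labeled
# reviews so the analysis can be reloaded without rerunning the whole pipeline.
df_ready.to_csv("appliance_reviews_with_sentiment.csv", index=False)
print("Saved appliance_reviews_with_sentiment.csv")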