Added week8 exercise
This commit is contained in:
349
week8/community_contributions/bharat_puri/exercise.py
Normal file
349
week8/community_contributions/bharat_puri/exercise.py
Normal file
@@ -0,0 +1,349 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""week8_exercise.ipynb
|
||||||
|
|
||||||
|
Automatically generated by Colab.
|
||||||
|
|
||||||
|
Original file is located at
|
||||||
|
https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
|
||||||
|
"""
|
||||||
|
|
||||||
|
!pip install -q pandas datasets matplotlib seaborn
|
||||||
|
!pip install datasets==3.0.1
|
||||||
|
!pip install anthropic -q
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from datasets import load_dataset
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
# check performance
|
||||||
|
from sklearn.metrics import classification_report, confusion_matrix
|
||||||
|
from sklearn.utils import resample
|
||||||
|
import os
|
||||||
|
from anthropic import Anthropic
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pd.set_option("display.max_colwidth", 100)
|
||||||
|
|
||||||
|
# --- Anthropic client setup ---
# Pull the API key from Colab's secret store and also export it via the
# environment so any library that reads ANTHROPIC_API_KEY works too.
from google.colab import userdata

api_key = userdata.get('ANTHROPIC_API_KEY')
# Fail fast with a clear message: os.environ assignment would otherwise
# raise a confusing TypeError when the key is missing (None).
if not api_key:
    raise ValueError("ANTHROPIC_API_KEY not found in Colab userdata.")
os.environ["ANTHROPIC_API_KEY"] = api_key

client = Anthropic(api_key=api_key)

# List the models this key can access — doubles as a connectivity check.
models = client.models.list()

print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")
|
||||||
|
|
||||||
|
# --- Load a 5k-review sample from the Amazon Appliances review dump ---
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")

# Keep only the columns we need and drop incomplete rows.
df = (
    pd.DataFrame(dataset)[["title", "text", "rating"]]
    .dropna()
    .reset_index(drop=True)
)

# "text" is ambiguous downstream — call it review_body instead.
df.rename(columns={"text": "review_body"}, inplace=True)

print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()
|
||||||
|
|
||||||
|
# --- Quick data inspection ---
# Schema, dtypes and non-null counts.
print(df.info())
# Missing values per column (should all be zero after the dropna above).
print(df.isnull().sum())

# Distribution of star ratings, ordered 1..5.
print(df["rating"].value_counts().sort_index())

# Eyeball a reproducible random sample of reviews.
display(df.sample(5, random_state=42))
|
||||||
|
|
||||||
|
# --- Review length feature and distribution plots ---
# Word count per review; str() guards against stray non-string values.
df["review_length"] = df["review_body"].map(lambda x: len(str(x).split()))

# Summary statistics for the new length column.
print(df["review_length"].describe())

# Histogram of the star ratings.
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1–5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()

# Histogram of review lengths (in words).
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()
|
||||||
|
|
||||||
|
# Text normalization used before TF-IDF vectorization.
def clean_text(text):
    """Normalize a raw review string for vectorization.

    Lowercases, strips URLs, removes punctuation/special characters,
    and collapses runs of whitespace.

    Args:
        text: Raw review text. Non-string values (e.g. NaN floats that
            slip through pandas) are coerced to ``str`` first instead
            of raising ``AttributeError``.

    Returns:
        The cleaned, lowercase string.
    """
    # Coerce to str so None/NaN input doesn't crash .lower().
    text = str(text).lower()
    # Remove URLs (http/https/www). The https branch is already covered
    # by http\S+ but is kept for explicitness.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # Keep only lowercase letters, digits, and whitespace.
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # Collapse whitespace runs and trim the ends.
    text = re.sub(r"\s+", ' ', text).strip()
    return text
|
||||||
|
|
||||||
|
# Run the text cleaner over every review and store the result.
df["clean_review"] = df["review_body"].map(clean_text)

df.head(3)
|
||||||
|
|
||||||
|
"""#sentiment analysis"""
|
||||||
|
|
||||||
|
# Map 1-5 star ratings onto three coarse sentiment buckets.
def label_sentiment(rating):
    """Return "negative" for <=2 stars, "neutral" for 3, else "positive"."""
    if rating <= 2:
        return "negative"
    if rating == 3:
        return "neutral"
    return "positive"
|
||||||
|
|
||||||
|
# Derive the sentiment label from the star rating.
df["sentiment"] = df["rating"].map(label_sentiment)

df["sentiment"].value_counts()
|
||||||
|
|
||||||
|
# --- Train/test split ---
# Stratify on sentiment so both splits keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"],
)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Turn the cleaned text into TF-IDF features (uni- and bi-grams).
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
|
||||||
|
|
||||||
|
# --- Train the baseline classifier ---
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Sanity-check the model on a few hand-written reviews.
sample_texts = [
    "This blender broke after two days. Waste of money!",
    "Works exactly as described, very satisfied!",
    "It’s okay, does the job but nothing special.",
]

sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)

for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")
|
||||||
|
|
||||||
|
"""#Improving Model Balance & Realism"""
|
||||||
|
|
||||||
|
# --- Rebalance the classes ---
# Split the frame by sentiment label.
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]

# Undersample the (dominant) positive class down to roughly the size of
# the other two combined. Cap at len(pos): resample(replace=False)
# raises ValueError if asked for more samples than rows available.
n_target = min(len(pos), len(neg) + len(neu))
pos_down = resample(pos, replace=False, n_samples=n_target, random_state=42)

# Recombine and shuffle so rows aren't grouped by label.
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced["sentiment"].value_counts())
|
||||||
|
|
||||||
|
# --- Retrain the classifier on the balanced data ---
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["clean_review"],
    df_balanced["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df_balanced["sentiment"],
)

# Re-fit the vectorizer on the balanced training text only.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# class_weight="balanced" compensates for any residual imbalance.
clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)

print("Balanced model trained successfully ")

# Evaluate the retrained model.
y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
|
||||||
|
|
||||||
|
"""#Agents"""
|
||||||
|
|
||||||
|
# Base class shared by every agent in the pipeline.
class BaseAgent:
    """A simple base agent with a name and a run() method."""

    def __init__(self, name):
        # Human-readable agent name, used as the log prefix.
        self.name = name

    def run(self, *args, **kwargs):
        # Abstract hook — concrete agents do their work here.
        raise NotImplementedError("Subclasses must implement run() method.")

    def log(self, message):
        # Prefix output with the agent's name for traceability.
        print(f"[{self.name}] {message}")
|
||||||
|
|
||||||
|
# DataAgent: owns dataset preparation (trim + de-duplicate reviews).
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""

    def __init__(self, data):
        super().__init__("DataAgent")
        # The raw reviews DataFrame to prepare.
        self.data = data

    def run(self):
        """Return a copy of the data with trimmed, de-duplicated review bodies."""
        self.log("Preprocessing data...")
        cleaned = self.data.copy()
        cleaned["review_body"] = cleaned["review_body"].str.strip()
        cleaned.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(cleaned)} reviews.")
        return cleaned
|
||||||
|
|
||||||
|
# AnalysisAgent: applies the trained sentiment model (TF-IDF +
# LogisticRegression) to classify reviews.
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""

    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        # Fitted TF-IDF vectorizer and the classifier trained on it.
        self.vectorizer = vectorizer
        self.model = model

    def run(self, reviews):
        """Vectorize *reviews* and return the model's predicted labels."""
        self.log(f"Analyzing {len(reviews)} reviews...")
        features = self.vectorizer.transform(reviews)
        return self.model.predict(features)
|
||||||
|
|
||||||
|
# ReviewerAgent: asks Anthropic Claude for an LLM-written summary of
# the classified review sentiment.
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""

    def __init__(self):
        super().__init__("ReviewerAgent")
        self.client = Anthropic(api_key=self._resolve_api_key())

    @staticmethod
    def _resolve_api_key():
        # Prefer the environment; fall back to Colab's secret store.
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")

        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")

        return api_key

    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")

        prompt = f"""
You are a product insights assistant.
Based on the following summarized customer reviews, write a concise 3–4 sentence sentiment analysis report.
Clearly describe the main themes and tone in user feedback on these home appliance products.

Reviews Summary:
{summary_text}
"""

        # Low max_tokens keeps the report short; moderate temperature
        # keeps the wording natural without drifting off-topic.
        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}],
        )

        output = response.content[0].text.strip()
        self.log("Summary generated successfully ")
        return output
|
||||||
|
|
||||||
|
# --- Run the agent pipeline end-to-end ---
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()

# 1. Clean the raw reviews.
df_ready = data_agent.run()

# 2. Classify each review's sentiment with the trained model.
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])

# 3. Build a compact per-sentiment digest (first 3 reviews each) for the LLM.
summary_text = (
    df_ready.groupby("predicted_sentiment")["review_body"]
    .apply(lambda x: " ".join(x[:3]))
    .to_string()
)

# 4. Ask Claude for a narrative insight summary.
insight_summary = reviewer_agent.run(summary_text)

print(insight_summary)
|
||||||
|
|
||||||
|
"""#Evaluation & Visualization"""
|
||||||
|
|
||||||
|
# --- Evaluation & visualization of predicted sentiments ---
sentiment_counts = df_ready["predicted_sentiment"].value_counts()

print(sentiment_counts)

# Bar chart: number of reviews per predicted sentiment.
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()

# Average review length (in words) per predicted sentiment.
df_ready["review_length"] = df_ready["review_body"].map(lambda x: len(x.split()))
avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()

print(avg_length)

# Bar chart of the averages.
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()
|
||||||
Reference in New Issue
Block a user