Merge pull request #903 from CosmusMutuku/week4-exercise
Week 8 exercise
week8/community_contributions/week8_exercie (2).py (new file, 349 additions)
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
"""week8_exercie.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
"""

!pip install -q pandas datasets matplotlib seaborn
!pip install datasets==3.0.1
!pip install anthropic -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# check performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
import os
from anthropic import Anthropic
import re

pd.set_option("display.max_colwidth", 100)

# # Initialize client using environment variable
# client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# # Quick test
# print("Anthropic client initialized" if client else "Anthropic not detected.")

from google.colab import userdata
userdata.get('ANTHROPIC_API_KEY')

api_key = userdata.get('ANTHROPIC_API_KEY')
os.environ["ANTHROPIC_API_KEY"] = api_key

client = Anthropic(api_key=api_key)

# List models
models = client.models.list()

print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")

#dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")

# Loading a sample from the full reviews data
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")

# Creating a DataFrame
df = pd.DataFrame(dataset)
df = df[["title", "text", "rating"]].dropna().reset_index(drop=True)

# Renaming the columns for clarity/easy reference
df.rename(columns={"text": "review_body"}, inplace=True)

print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()

# Inspect the data
# Basic info
print(df.info())
print(df.isnull().sum())

# Unique ratings distribution
print(df["rating"].value_counts().sort_index())

# Check random reviews
display(df.sample(5, random_state=42))

# Review length distribution
df["review_length"] = df["review_body"].apply(lambda x: len(str(x).split()))

# Summarize the review length
print(df["review_length"].describe())

# Plot the rating distribution
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1–5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()

# Review length distribution
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()

# Cleaning
def clean_text(text):
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # remove punctuation/special chars
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # normalize whitespace
    text = re.sub(r"\s+", ' ', text).strip()
    return text

df["clean_review"] = df["review_body"].apply(clean_text)

df.head(3)
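
# Quick sanity check of clean_text on a made-up example string (illustrative only):
# it lowercases, drops the URL and punctuation, and collapses whitespace.
print(clean_text("Loved it!!! See https://example.com for details"))  # -> "loved it see for details"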

"""#Sentiment analysis"""

# Rating labelling
def label_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"

df["sentiment"] = df["rating"].apply(label_sentiment)

df["sentiment"].value_counts()

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"], df["sentiment"], test_size=0.2, random_state=42, stratify=df["sentiment"]
)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# Train classifier

# Train lightweight model
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

sample_texts = [
    "This blender broke after two days. Waste of money!",
    "Works exactly as described, very satisfied!",
    "It’s okay, does the job but nothing special."
]

sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)

for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")

"""#Improving Model Balance & Realism"""

# Separate by sentiment
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]

# Undersample positive to roughly match the others
pos_down = resample(pos, replace=False, n_samples=len(neg) + len(neu), random_state=42)

# Combine
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced["sentiment"].value_counts())

# Retrain classifier
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["clean_review"], df_balanced["sentiment"],
    test_size=0.2, random_state=42, stratify=df_balanced["sentiment"]
)

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)

print("Balanced model trained successfully")

# Evaluate again
y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

"""#Agents"""

# Base class for all agents
class BaseAgent:
    """A simple base agent with a name and a run() method."""

    def __init__(self, name):
        self.name = name

    def run(self, *args, **kwargs):
        raise NotImplementedError("Subclasses must implement run() method.")

    def log(self, message):
        print(f"[{self.name}] {message}")

# DataAgent for loading/cleaning
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""

    def __init__(self, data):
        super().__init__("DataAgent")
        self.data = data

    def run(self):
        self.log("Preprocessing data...")
        df_clean = self.data.copy()
        df_clean["review_body"] = df_clean["review_body"].str.strip()
        df_clean.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(df_clean)} reviews.")
        return df_clean

# AnalysisAgent --> uses the trained sentiment model (TF-IDF + Logistic Regression) to classify reviews
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""

    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        self.vectorizer = vectorizer
        self.model = model

    def run(self, reviews):
        self.log(f"Analyzing {len(reviews)} reviews...")
        X = self.vectorizer.transform(reviews)
        predictions = self.model.predict(X)
        return predictions

# ReviewerAgent: serves as the summary agent, using the Anthropic API to give LLM review insights
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""

    def __init__(self):
        super().__init__("ReviewerAgent")
        # Retrieve the key once; it's already stored in Colab userdata
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")

        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")

        self.client = Anthropic(api_key=api_key)

    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")

        prompt = f"""
        You are a product insights assistant.
        Based on the following summarized customer reviews, write a concise 3–4 sentence sentiment analysis report.
        Clearly describe the main themes and tone in user feedback on these home appliance products.

        Reviews Summary:
        {summary_text}
        """

        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}]
        )

        output = response.content[0].text.strip()
        self.log("Summary generated successfully")
        return output

# Instantiate agents
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()

# Clean data
df_ready = data_agent.run()

# Classify sentiments
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])

# Prepare summary text by sentiment group
summary_text = df_ready.groupby("predicted_sentiment")["review_body"].apply(lambda x: " ".join(x[:3])).to_string()

# Generate AI summary using Anthropic
insight_summary = reviewer_agent.run(summary_text)

print(insight_summary)
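
# Optional: persist the labeled reviews and the generated summary for later use
# (the file names below are placeholders, not part of the original notebook).
df_ready.to_csv("appliance_reviews_with_sentiment.csv", index=False)
with open("sentiment_insight_summary.txt", "w") as f:
    f.write(insight_summary)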

"""#Evaluation & Visualization"""

# === Evaluation & Visualization ===

# Count predicted sentiments
sentiment_counts = df_ready["predicted_sentiment"].value_counts()

print(sentiment_counts)

# Plot sentiment distribution
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()

# Compute average review length per sentiment
df_ready["review_length"] = df_ready["review_body"].apply(lambda x: len(x.split()))

avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()

print(avg_length)

# Visualize it
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()
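
# Optional: save the fitted vectorizer and classifier for reuse outside the notebook.
# joblib is a standard companion to scikit-learn; the file names are placeholders.
import joblib
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(clf, "sentiment_clf.joblib")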