Added week8 exercise
This commit is contained in:
349
week8/community_contributions/bharat_puri/exercise.py
Normal file
349
week8/community_contributions/bharat_puri/exercise.py
Normal file
@@ -0,0 +1,349 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""week8_exercise.ipynb
|
||||||
|
|
||||||
|
Automatically generated by Colab.
|
||||||
|
|
||||||
|
Original file is located at
|
||||||
|
https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
|
||||||
|
"""
|
||||||
|
|
||||||
|
!pip install -q pandas datasets matplotlib seaborn
|
||||||
|
!pip install datasets==3.0.1
|
||||||
|
!pip install anthropic -q
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from datasets import load_dataset
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
# check performance
|
||||||
|
from sklearn.metrics import classification_report, confusion_matrix
|
||||||
|
from sklearn.utils import resample
|
||||||
|
import os
|
||||||
|
from anthropic import Anthropic
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pd.set_option("display.max_colwidth", 100)
|
||||||
|
|
||||||
|
# --- Anthropic client setup ---
# Pull the API key from Colab's secret store and also export it via the
# environment so any library that reads ANTHROPIC_API_KEY works too.
from google.colab import userdata

api_key = userdata.get('ANTHROPIC_API_KEY')
# Fail fast with a clear message: os.environ assignment would otherwise
# raise a confusing TypeError when the key is missing (None).
if not api_key:
    raise ValueError("ANTHROPIC_API_KEY not found in Colab userdata.")
os.environ["ANTHROPIC_API_KEY"] = api_key

client = Anthropic(api_key=api_key)

# List the models this key can access — doubles as a connectivity check.
models = client.models.list()

print("Available Anthropic Models:\n")
for m in models.data:
    print(f"- {m.id}")
|
||||||
|
|
||||||
|
# --- Load a 5k-review sample from the Amazon Appliances review dump ---
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")

# Keep only the columns we need and drop incomplete rows.
df = (
    pd.DataFrame(dataset)[["title", "text", "rating"]]
    .dropna()
    .reset_index(drop=True)
)

# "text" is ambiguous downstream — call it review_body instead.
df.rename(columns={"text": "review_body"}, inplace=True)

print(f"Loaded {len(df)} rows with reviews and ratings")
df.head()
|
||||||
|
|
||||||
|
# --- Quick data inspection ---
# Schema, dtypes and non-null counts.
print(df.info())
# Missing values per column (should all be zero after the dropna above).
print(df.isnull().sum())

# Distribution of star ratings, ordered 1..5.
print(df["rating"].value_counts().sort_index())

# Eyeball a reproducible random sample of reviews.
display(df.sample(5, random_state=42))
|
||||||
|
|
||||||
|
# --- Review length feature and distribution plots ---
# Word count per review; str() guards against stray non-string values.
df["review_length"] = df["review_body"].map(lambda x: len(str(x).split()))

# Summary statistics for the new length column.
print(df["review_length"].describe())

# Histogram of the star ratings.
plt.figure(figsize=(6,4))
df["rating"].hist(bins=5, edgecolor='black')
plt.title("Ratings Distribution (1–5 stars)")
plt.xlabel("Rating")
plt.ylabel("Number of Reviews")
plt.show()

# Histogram of review lengths (in words).
plt.figure(figsize=(6,4))
df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
plt.title("Review Length Distribution")
plt.xlabel("Number of Words in Review")
plt.ylabel("Number of Reviews")
plt.show()
|
||||||
|
|
||||||
|
# Text normalization used before TF-IDF vectorization.
def clean_text(text):
    """Normalize a raw review string for vectorization.

    Lowercases, strips URLs, removes punctuation/special characters,
    and collapses runs of whitespace.

    Args:
        text: Raw review text. Non-string values (e.g. NaN floats that
            slip through pandas) are coerced to ``str`` first instead
            of raising ``AttributeError``.

    Returns:
        The cleaned, lowercase string.
    """
    # Coerce to str so None/NaN input doesn't crash .lower().
    text = str(text).lower()
    # Remove URLs (http/https/www). The https branch is already covered
    # by http\S+ but is kept for explicitness.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # Keep only lowercase letters, digits, and whitespace.
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # Collapse whitespace runs and trim the ends.
    text = re.sub(r"\s+", ' ', text).strip()
    return text
|
||||||
|
|
||||||
|
# Run the text cleaner over every review and store the result.
df["clean_review"] = df["review_body"].map(clean_text)

df.head(3)
|
||||||
|
|
||||||
|
"""#sentiment analysis"""
|
||||||
|
|
||||||
|
# Map 1-5 star ratings onto three coarse sentiment buckets.
def label_sentiment(rating):
    """Return "negative" for <=2 stars, "neutral" for 3, else "positive"."""
    if rating <= 2:
        return "negative"
    if rating == 3:
        return "neutral"
    return "positive"
|
||||||
|
|
||||||
|
# Derive the sentiment label from the star rating.
df["sentiment"] = df["rating"].map(label_sentiment)

df["sentiment"].value_counts()
|
||||||
|
|
||||||
|
# --- Train/test split ---
# Stratify on sentiment so both splits keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"],
)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Turn the cleaned text into TF-IDF features (uni- and bi-grams).
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
|
||||||
|
|
||||||
|
# --- Train the baseline classifier ---
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Sanity-check the model on a few hand-written reviews.
sample_texts = [
    "This blender broke after two days. Waste of money!",
    "Works exactly as described, very satisfied!",
    "It’s okay, does the job but nothing special.",
]

sample_features = vectorizer.transform(sample_texts)
sample_preds = clf.predict(sample_features)

for text, pred in zip(sample_texts, sample_preds):
    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")
|
||||||
|
|
||||||
|
"""#Improving Model Balance & Realism"""
|
||||||
|
|
||||||
|
# --- Rebalance the classes ---
# Split the frame by sentiment label.
pos = df[df["sentiment"] == "positive"]
neg = df[df["sentiment"] == "negative"]
neu = df[df["sentiment"] == "neutral"]

# Undersample the (dominant) positive class down to roughly the size of
# the other two combined. Cap at len(pos): resample(replace=False)
# raises ValueError if asked for more samples than rows available.
n_target = min(len(pos), len(neg) + len(neu))
pos_down = resample(pos, replace=False, n_samples=n_target, random_state=42)

# Recombine and shuffle so rows aren't grouped by label.
df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced["sentiment"].value_counts())
|
||||||
|
|
||||||
|
# --- Retrain the classifier on the balanced data ---
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["clean_review"],
    df_balanced["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=df_balanced["sentiment"],
)

# Re-fit the vectorizer on the balanced training text only.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# class_weight="balanced" compensates for any residual imbalance.
clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(X_train_tfidf, y_train)

print("Balanced model trained successfully ")

# Evaluate the retrained model.
y_pred = clf.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
|
||||||
|
|
||||||
|
"""#Agents"""
|
||||||
|
|
||||||
|
# Base class shared by every agent in the pipeline.
class BaseAgent:
    """A simple base agent with a name and a run() method."""

    def __init__(self, name):
        # Human-readable agent name, used as the log prefix.
        self.name = name

    def run(self, *args, **kwargs):
        # Abstract hook — concrete agents do their work here.
        raise NotImplementedError("Subclasses must implement run() method.")

    def log(self, message):
        # Prefix output with the agent's name for traceability.
        print(f"[{self.name}] {message}")
|
||||||
|
|
||||||
|
# DataAgent: owns dataset preparation (trim + de-duplicate reviews).
class DataAgent(BaseAgent):
    """Handles dataset preparation tasks."""

    def __init__(self, data):
        super().__init__("DataAgent")
        # The raw reviews DataFrame to prepare.
        self.data = data

    def run(self):
        """Return a copy of the data with trimmed, de-duplicated review bodies."""
        self.log("Preprocessing data...")
        cleaned = self.data.copy()
        cleaned["review_body"] = cleaned["review_body"].str.strip()
        cleaned.drop_duplicates(subset=["review_body"], inplace=True)
        self.log(f"Dataset ready with {len(cleaned)} reviews.")
        return cleaned
|
||||||
|
|
||||||
|
# AnalysisAgent: applies the trained sentiment model (TF-IDF +
# LogisticRegression) to classify reviews.
class AnalysisAgent(BaseAgent):
    """Analyzes text sentiment using a trained model."""

    def __init__(self, vectorizer, model):
        super().__init__("AnalysisAgent")
        # Fitted TF-IDF vectorizer and the classifier trained on it.
        self.vectorizer = vectorizer
        self.model = model

    def run(self, reviews):
        """Vectorize *reviews* and return the model's predicted labels."""
        self.log(f"Analyzing {len(reviews)} reviews...")
        features = self.vectorizer.transform(reviews)
        return self.model.predict(features)
|
||||||
|
|
||||||
|
# ReviewerAgent: asks Anthropic Claude for an LLM-written summary of
# the classified review sentiment.
class ReviewerAgent(BaseAgent):
    """Summarizes overall sentiment trends using Anthropic Claude."""

    def __init__(self):
        super().__init__("ReviewerAgent")
        self.client = Anthropic(api_key=self._resolve_api_key())

    @staticmethod
    def _resolve_api_key():
        # Prefer the environment; fall back to Colab's secret store.
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            from google.colab import userdata
            api_key = userdata.get("ANTHROPIC_API_KEY")

        if not api_key:
            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")

        return api_key

    def run(self, summary_text):
        """Generate an insights summary using Claude."""
        self.log("Generating summary using Claude...")

        prompt = f"""
You are a product insights assistant.
Based on the following summarized customer reviews, write a concise 3–4 sentence sentiment analysis report.
Clearly describe the main themes and tone in user feedback on these home appliance products.

Reviews Summary:
{summary_text}
"""

        # Low max_tokens keeps the report short; moderate temperature
        # keeps the wording natural without drifting off-topic.
        response = self.client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=250,
            temperature=0.6,
            messages=[{"role": "user", "content": prompt}],
        )

        output = response.content[0].text.strip()
        self.log("Summary generated successfully ")
        return output
|
||||||
|
|
||||||
|
# --- Run the agent pipeline end-to-end ---
data_agent = DataAgent(df)
analysis_agent = AnalysisAgent(vectorizer, clf)
reviewer_agent = ReviewerAgent()

# 1. Clean the raw reviews.
df_ready = data_agent.run()

# 2. Classify each review's sentiment with the trained model.
df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])

# 3. Build a compact per-sentiment digest (first 3 reviews each) for the LLM.
summary_text = (
    df_ready.groupby("predicted_sentiment")["review_body"]
    .apply(lambda x: " ".join(x[:3]))
    .to_string()
)

# 4. Ask Claude for a narrative insight summary.
insight_summary = reviewer_agent.run(summary_text)

print(insight_summary)
|
||||||
|
|
||||||
|
"""#Evaluation & Visualization"""
|
||||||
|
|
||||||
|
# --- Evaluation & visualization of predicted sentiments ---
sentiment_counts = df_ready["predicted_sentiment"].value_counts()

print(sentiment_counts)

# Bar chart: number of reviews per predicted sentiment.
plt.figure(figsize=(6,4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Sentiment Distribution of Reviews", fontsize=14)
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.show()

# Average review length (in words) per predicted sentiment.
df_ready["review_length"] = df_ready["review_body"].map(lambda x: len(x.split()))
avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()

print(avg_length)

# Bar chart of the averages.
plt.figure(figsize=(6,4))
sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
plt.title("Average Review Length per Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Average Word Count")
plt.show()
|
||||||
Reference in New Issue
Block a user