diff --git a/week8/community_contributions/bharat_puri/exercise.py b/week8/community_contributions/bharat_puri/exercise.py
new file mode 100644
index 0000000..5957853
--- /dev/null
+++ b/week8/community_contributions/bharat_puri/exercise.py
@@ -0,0 +1,349 @@
+# -*- coding: utf-8 -*-
+"""week8_exercie.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1jJ4pKoJat0ZnC99sTQjEEe9BMK--ArwQ
+"""
+
+!pip install -q pandas matplotlib seaborn
+!pip install -q datasets==3.0.1
+!pip install -q anthropic
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datasets import load_dataset
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+# Check performance
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.utils import resample
+import os
+from anthropic import Anthropic
+import re
+
+pd.set_option("display.max_colwidth", 100)
+
+# # Initialize client using environment variable (outside Colab)
+# client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+# # Quick test
+# print("Anthropic client initialized" if client else "Anthropic not detected.")
+
+from google.colab import userdata
+
+api_key = userdata.get('ANTHROPIC_API_KEY')
+os.environ["ANTHROPIC_API_KEY"] = api_key
+
+client = Anthropic(api_key=api_key)
+
+# List available models
+models = client.models.list()
+
+print("Available Anthropic Models:\n")
+for m in models.data:
+    print(f"- {m.id}")
+
+# Load a sample from the full reviews data
+dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", split="full[:5000]")
+
+# Create a DataFrame
+df = pd.DataFrame(dataset)
+df = df[["title", "text", "rating"]].dropna().reset_index(drop=True)
+
+# Rename the column for clarity / easy reference
+df.rename(columns={"text": "review_body"}, inplace=True)
+
+print(f"Loaded {len(df)} rows with reviews and ratings")
+df.head()
+
+# Inspect the data
+# Basic info
+print(df.info())
+print(df.isnull().sum())
+
+# Unique rating distribution
+print(df["rating"].value_counts().sort_index())
+
+# Check random reviews
+display(df.sample(5, random_state=42))
+
+# Review length distribution
+df["review_length"] = df["review_body"].apply(lambda x: len(str(x).split()))
+
+# Summarize the review length
+print(df["review_length"].describe())
+
+# Plot the rating distribution
+plt.figure(figsize=(6,4))
+df["rating"].hist(bins=5, edgecolor='black')
+plt.title("Ratings Distribution (1–5 stars)")
+plt.xlabel("Rating")
+plt.ylabel("Number of Reviews")
+plt.show()
+
+# Plot the review length distribution
+plt.figure(figsize=(6,4))
+df["review_length"].hist(bins=30, color="lightblue", edgecolor='black')
+plt.title("Review Length Distribution")
+plt.xlabel("Number of Words in Review")
+plt.ylabel("Number of Reviews")
+plt.show()
+
+# Text cleaning
+def clean_text(text):
+    text = text.lower()
+    # remove URLs
+    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
+    # remove punctuation/special chars
+    text = re.sub(r"[^a-z0-9\s]", '', text)
+    # normalize whitespace
+    text = re.sub(r"\s+", ' ', text).strip()
+    return text
+
+df["clean_review"] = df["review_body"].apply(clean_text)
+
+df.head(3)
+
+"""#Sentiment Analysis"""
+
+# Label sentiment from the star rating
+def label_sentiment(rating):
+    if rating <= 2:
+        return "negative"
+    elif rating == 3:
+        return "neutral"
+    else:
+        return "positive"
+
+df["sentiment"] = df["rating"].apply(label_sentiment)
+
+df["sentiment"].value_counts()
+
+# Train/test split
+X_train, X_test, y_train, y_test = train_test_split(
+    df["clean_review"], df["sentiment"], test_size=0.2, random_state=42, stratify=df["sentiment"]
+)
+
+print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
+
+# Convert text to TF-IDF features
+vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
+
+X_train_tfidf = vectorizer.fit_transform(X_train)
+
+X_test_tfidf = vectorizer.transform(X_test)
+
+print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
+
+# Train classifier
+
+# Train a lightweight model
+clf = LogisticRegression(max_iter=200)
+
+clf.fit(X_train_tfidf, y_train)
+
+y_pred = clf.predict(X_test_tfidf)
+
+print("Classification Report:\n", classification_report(y_test, y_pred))
+print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
+
+sample_texts = [
+    "This blender broke after two days. Waste of money!",
+    "Works exactly as described, very satisfied!",
+    "It’s okay, does the job but nothing special."
+]
+
+sample_features = vectorizer.transform(sample_texts)
+sample_preds = clf.predict(sample_features)
+
+for text, pred in zip(sample_texts, sample_preds):
+    print(f"\nReview: {text}\nPredicted Sentiment: {pred}")
+
+"""#Improving Model Balance & Realism"""
+
+# Separate by sentiment
+pos = df[df["sentiment"] == "positive"]
+neg = df[df["sentiment"] == "negative"]
+neu = df[df["sentiment"] == "neutral"]
+
+# Undersample the positive class to roughly match the other classes combined
+pos_down = resample(pos, replace=False, n_samples=len(neg) + len(neu), random_state=42)
+
+# Combine and shuffle
+df_balanced = pd.concat([pos_down, neg, neu]).sample(frac=1, random_state=42).reset_index(drop=True)
+
+print(df_balanced["sentiment"].value_counts())
+
+# Retrain the classifier on the balanced data
+X_train, X_test, y_train, y_test = train_test_split(
+    df_balanced["clean_review"], df_balanced["sentiment"],
+    test_size=0.2, random_state=42, stratify=df_balanced["sentiment"]
+)
+
+vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
+
+X_train_tfidf = vectorizer.fit_transform(X_train)
+
+X_test_tfidf = vectorizer.transform(X_test)
+
+clf = LogisticRegression(max_iter=300, class_weight="balanced")
+clf.fit(X_train_tfidf, y_train)
+
+print("Balanced model trained successfully")
+
+# Evaluate again
+y_pred = clf.predict(X_test_tfidf)
+
+print("Classification Report:\n", classification_report(y_test, y_pred))
+
+print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
+
+"""#Agents"""
+
+# Base class for all agents
+class BaseAgent:
+    """A simple base agent with a name and a run() method."""
+
+    def __init__(self, name):
+        self.name = name
+
+    def run(self, *args, **kwargs):
+        raise NotImplementedError("Subclasses must implement run() method.")
+
+    def log(self, message):
+        print(f"[{self.name}] {message}")
+
+# DataAgent for loading/cleaning
+class DataAgent(BaseAgent):
+    """Handles dataset preparation tasks."""
+
+    def __init__(self, data):
+        super().__init__("DataAgent")
+        self.data = data
+
+    def run(self):
+        self.log("Preprocessing data...")
+        df_clean = self.data.copy()
+        df_clean["review_body"] = df_clean["review_body"].str.strip()
+        df_clean.drop_duplicates(subset=["review_body"], inplace=True)
+        self.log(f"Dataset ready with {len(df_clean)} reviews.")
+        return df_clean
+
+# AnalysisAgent: uses the trained sentiment model (TF-IDF + Logistic Regression) to classify reviews
+class AnalysisAgent(BaseAgent):
+    """Analyzes text sentiment using a trained model."""
+
+    def __init__(self, vectorizer, model):
+        super().__init__("AnalysisAgent")
+        self.vectorizer = vectorizer
+        self.model = model
+
+    def run(self, reviews):
+        self.log(f"Analyzing {len(reviews)} reviews...")
+        X = self.vectorizer.transform(reviews)
+        predictions = self.model.predict(X)
+        return predictions
+
+# ReviewerAgent: serves as the summary agent, using the Anthropic API for LLM review insights
+class ReviewerAgent(BaseAgent):
+    """Summarizes overall sentiment trends using Anthropic Claude."""
+
+    def __init__(self):
+        super().__init__("ReviewerAgent")
+        # Retrieve the key once; it is already stored in Colab userdata
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            from google.colab import userdata
+            api_key = userdata.get("ANTHROPIC_API_KEY")
+
+        if not api_key:
+            raise ValueError("Anthropic API key not found. Make sure it's set in Colab userdata as 'ANTHROPIC_API_KEY'.")
+
+        self.client = Anthropic(api_key=api_key)
+
+    def run(self, summary_text):
+        """Generate an insights summary using Claude."""
+        self.log("Generating summary using Claude...")
+
+        prompt = f"""
+        You are a product insights assistant.
+        Based on the following summarized customer reviews, write a concise 3–4 sentence sentiment analysis report.
+        Clearly describe the main themes and tone in user feedback on these home appliance products.
+
+        Reviews Summary:
+        {summary_text}
+        """
+
+        response = self.client.messages.create(
+            model="claude-3-5-haiku-20241022",
+            max_tokens=250,
+            temperature=0.6,
+            messages=[{"role": "user", "content": prompt}]
+        )
+
+        output = response.content[0].text.strip()
+        self.log("Summary generated successfully")
+        return output
+
+# Instantiate agents
+data_agent = DataAgent(df)
+analysis_agent = AnalysisAgent(vectorizer, clf)
+reviewer_agent = ReviewerAgent()
+
+# Clean data
+df_ready = data_agent.run()
+
+# Classify sentiments
+df_ready["predicted_sentiment"] = analysis_agent.run(df_ready["review_body"])
+
+# Prepare summary text by sentiment group
+summary_text = df_ready.groupby("predicted_sentiment")["review_body"].apply(lambda x: " ".join(x[:3])).to_string()
+
+# Generate AI summary using Anthropic
+insight_summary = reviewer_agent.run(summary_text)
+
+print(insight_summary)
+
+"""#Evaluation & Visualization"""
+
+# Count predicted sentiments
+sentiment_counts = df_ready["predicted_sentiment"].value_counts()
+
+print(sentiment_counts)
+
+# Plot sentiment distribution
+plt.figure(figsize=(6,4))
+sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
+plt.title("Sentiment Distribution of Reviews", fontsize=14)
+plt.xlabel("Sentiment")
+plt.ylabel("Number of Reviews")
+plt.show()
+
+# Compute average review length per sentiment
+df_ready["review_length"] = df_ready["review_body"].apply(lambda x: len(x.split()))
+
+avg_length = df_ready.groupby("predicted_sentiment")["review_length"].mean()
+
+print(avg_length)
+
+# Visualize it
+plt.figure(figsize=(6,4))
+sns.barplot(x=avg_length.index, y=avg_length.values, palette="coolwarm")
+plt.title("Average Review Length per Sentiment")
+plt.xlabel("Sentiment")
+plt.ylabel("Average Word Count")
+plt.show()
\ No newline at end of file