Merge pull request #925 from hopeogbons/week6_exercise_hopeogbons

(Oct 2025 Bootcamp): Add week 6 exercise notebook for banking intent classification
2025-10-30 22:34:06 -04:00
parent 67c116fd45 a22a5cef2c
commit 875cbda5e0
4 changed files with 1194 additions and 0 deletions
--- a/week6/community-contributions/hopeogbons/banking_intents.py
+++ b/week6/community-contributions/hopeogbons/banking_intents.py
@@ -0,0 +1,148 @@
+"""
+Banking77 Intent Mapping
+Maps label numbers (0-76) to intent names
+"""
+
+# Intent names indexed by label number: INTENT_LABELS[n] is the name for label n.
+# get_intent() and get_label() in this module depend on this exact ordering and
+# on the exact spelling of every entry.
+# NOTE(review): "Refund_not_showing_up" (capital R) and "reverted_card_payment?"
+# (trailing '?') look irregular; presumably they mirror the upstream Banking77
+# label names verbatim -- confirm against the dataset before normalizing them.
+INTENT_LABELS = [
+    "activate_my_card",
+    "age_limit",
+    "apple_pay_or_google_pay",
+    "atm_support",
+    "automatic_top_up",
+    "balance_not_updated_after_bank_transfer",
+    "balance_not_updated_after_cheque_or_cash_deposit",
+    "beneficiary_not_allowed",
+    "cancel_transfer",
+    "card_about_to_expire",
+    "card_acceptance",
+    "card_arrival",
+    "card_delivery_estimate",
+    "card_linking",
+    "card_not_working",
+    "card_payment_fee_charged",
+    "card_payment_not_recognised",
+    "card_payment_wrong_exchange_rate",
+    "card_swallowed",
+    "cash_withdrawal_charge",
+    "cash_withdrawal_not_recognised",
+    "change_pin",
+    "compromised_card",
+    "contactless_not_working",
+    "country_support",
+    "declined_card_payment",
+    "declined_cash_withdrawal",
+    "declined_transfer",
+    "direct_debit_payment_not_recognised",
+    "disposable_card_limits",
+    "edit_personal_details",
+    "exchange_charge",
+    "exchange_rate",
+    "exchange_via_app",
+    "extra_charge_on_statement",
+    "failed_transfer",
+    "fiat_currency_support",
+    "get_disposable_virtual_card",
+    "get_physical_card",
+    "getting_spare_card",
+    "getting_virtual_card",
+    "lost_or_stolen_card",
+    "lost_or_stolen_phone",
+    "order_physical_card",
+    "passcode_forgotten",
+    "pending_card_payment",
+    "pending_cash_withdrawal",
+    "pending_top_up",
+    "pending_transfer",
+    "pin_blocked",
+    "receiving_money",
+    "Refund_not_showing_up",
+    "request_refund",
+    "reverted_card_payment?",
+    "supported_cards_and_currencies",
+    "terminate_account",
+    "top_up_by_bank_transfer_charge",
+    "top_up_by_card_charge",
+    "top_up_by_cash_or_cheque",
+    "top_up_failed",
+    "top_up_limits",
+    "top_up_reverted",
+    "topping_up_by_card",
+    "transaction_charged_twice",
+    "transfer_fee_charged",
+    "transfer_into_account",
+    "transfer_not_received_by_recipient",
+    "transfer_timing",
+    "unable_to_verify_identity",
+    "verify_my_identity",
+    "verify_source_of_funds",
+    "verify_top_up",
+    "virtual_card_not_working",
+    "visa_or_mastercard",
+    "why_verify_identity",
+    "wrong_amount_of_cash_received",
+    "wrong_exchange_rate_for_cash_withdrawal"
+]
+
+
+def get_intent(label_number):
+    """
+    Get intent name from label number.
+
+    Args:
+        label_number (int): Label from 0 to len(INTENT_LABELS) - 1
+            (0 to 76 for the 77-entry table)
+
+    Returns:
+        str: Intent name
+
+    Raises:
+        ValueError: If label_number is outside the valid range
+
+    Example:
+        >>> get_intent(0)
+        'activate_my_card'
+        >>> get_intent(25)
+        'declined_card_payment'
+    """
+    # Derive the upper bound from the table itself so the range check and the
+    # error message can never drift out of sync with INTENT_LABELS.
+    max_label = len(INTENT_LABELS) - 1
+    if not 0 <= label_number <= max_label:
+        raise ValueError(
+            f"Label must be between 0 and {max_label}, got {label_number}"
+        )
+    return INTENT_LABELS[label_number]
+
+
+def get_label(intent_name):
+    """
+    Get label number from intent name.
+
+    Args:
+        intent_name (str): Intent name, spelled exactly as in INTENT_LABELS
+
+    Returns:
+        int: Label number (position of the name in INTENT_LABELS)
+
+    Raises:
+        ValueError: If intent_name is not one of the known intents
+
+    Example:
+        >>> get_label('activate_my_card')
+        0
+        >>> get_label('declined_card_payment')
+        25
+    """
+    try:
+        return INTENT_LABELS.index(intent_name)
+    except ValueError:
+        # list.index's own message ("... is not in list") adds nothing useful,
+        # so suppress it explicitly instead of leaking it as implicit context.
+        raise ValueError(f"Intent '{intent_name}' not found in labels") from None
+
+
+# Quick access
+def show_all_intents():
+    """Print every intent on its own line as '<label number><TAB><intent name>'."""
+    rows = [f"{i}\t{intent}" for i, intent in enumerate(INTENT_LABELS)]
+    for row in rows:
+        print(row)
+
+
+if __name__ == "__main__":
+    # Smoke-test both lookup directions on a few known entries
+    print("Testing get_intent:")
+    for sample_label in (0, 25, 76):
+        print(f"Label {sample_label}: {get_intent(sample_label)}")
+
+    print("\nTesting get_label:")
+    for sample_intent in ('activate_my_card', 'declined_card_payment'):
+        print(f"'{sample_intent}': {get_label(sample_intent)}")
+
--- a/week6/community-contributions/hopeogbons/classifier_tester.py
+++ b/week6/community-contributions/hopeogbons/classifier_tester.py
@@ -0,0 +1,123 @@
+"""
+Classification Tester for Banking Intent Model
+Evaluates model accuracy on intent classification
+"""
+
+import matplotlib.pyplot as plt
+from collections import Counter
+from banking_intents import get_intent
+
+GREEN = "\033[92m"
+RED = "\033[91m"
+RESET = "\033[0m"
+
+
+class ClassifierTester:
+    """
+    Test framework for classification models.
+
+    Runs a predictor over the first `size` examples of a dataset, compares each
+    predicted intent name with the name looked up from the example's label, and
+    reports accuracy plus the most frequent confusions.
+    """
+
+    def __init__(self, predictor, data, title=None, size=100):
+        """
+        Args:
+            predictor: Callable taking one example and returning an intent name
+            data: Indexable collection of examples with 'text' and 'label' keys
+            title: Display name; derived from the predictor's name when omitted
+            size: Maximum number of examples to evaluate (capped at len(data))
+        """
+        self.predictor = predictor
+        self.data = data
+        # functools.partial objects and callable instances have no __name__,
+        # so fall back to the type name instead of raising AttributeError.
+        name = getattr(predictor, "__name__", None) or type(predictor).__name__
+        self.title = title or name.replace("_", " ").title()
+        # Clamp to [0, len(data)] so a negative size cannot yield a negative
+        # example count.
+        self.size = max(0, min(size, len(data)))
+        self._reset()
+
+    def _reset(self):
+        """Clear accumulated results so every run() starts from a clean slate."""
+        self.predictions = []
+        self.actuals = []
+        self.correct = 0
+        self.incorrect = 0
+
+    def _confusions(self):
+        """Count (actual, predicted) pairs over all misclassified examples."""
+        return Counter(
+            (actual, pred)
+            for pred, actual in zip(self.predictions, self.actuals)
+            if pred != actual
+        )
+
+    def run_datapoint(self, i):
+        """Test a single example and print a colored one-line verdict for it."""
+        item = self.data[i]
+
+        # Get prediction
+        predicted_intent = self.predictor(item)
+        actual_intent = get_intent(item['label'])
+
+        # Check if correct
+        is_correct = predicted_intent == actual_intent
+
+        if is_correct:
+            self.correct += 1
+            color = GREEN
+            status = "✓"
+        else:
+            self.incorrect += 1
+            color = RED
+            status = "✗"
+
+        self.predictions.append(predicted_intent)
+        self.actuals.append(actual_intent)
+
+        # Print result (query text truncated to 60 characters)
+        query = item['text'][:60] + "..." if len(item['text']) > 60 else item['text']
+        print(f"{color}{status} {i+1}: {query}")
+        print(f"   Predicted: {predicted_intent} | Actual: {actual_intent}{RESET}")
+
+    def chart(self):
+        """Visualize the top 10 confusion pairs as a horizontal bar chart."""
+        top_errors = self._confusions().most_common(10)
+
+        if not top_errors:
+            print("\n🎉 Perfect accuracy - no confusion to plot!")
+            return
+
+        labels = [f"{actual} → {pred}" for (actual, pred), _ in top_errors]
+        counts = [count for _, count in top_errors]
+
+        plt.figure(figsize=(12, 6))
+        plt.barh(labels, counts, color='coral')
+        plt.xlabel('Count')
+        plt.title('Top 10 Confusion Pairs (Actual → Predicted)')
+        plt.tight_layout()
+        plt.show()
+
+    def report(self):
+        """
+        Print final metrics and chart.
+
+        Returns:
+            float: Accuracy in percent over the examples evaluated so far
+                (0.0 when nothing has been evaluated)
+        """
+        # Base the metric on what was actually evaluated; this also avoids a
+        # ZeroDivisionError for an empty dataset or size=0.
+        evaluated = self.correct + self.incorrect
+        accuracy = (self.correct / evaluated) * 100 if evaluated else 0.0
+
+        print("\n" + "="*70)
+        print(f"MODEL: {self.title}")
+        print(f"TESTED: {evaluated} examples")
+        print(f"CORRECT: {self.correct} ({accuracy:.1f}%)")
+        print(f"INCORRECT: {self.incorrect}")
+        print("="*70)
+
+        # Show most common errors
+        if self.incorrect > 0:
+            print("\nMost Common Errors:")
+            for (actual, pred), count in self._confusions().most_common(5):
+                print(f"  {actual} → {pred}: {count} times")
+
+        # Chart
+        self.chart()
+
+        return accuracy
+
+    def run(self):
+        """Run the complete evaluation and return the accuracy in percent."""
+        # Start fresh so calling run() twice does not double-count results.
+        self._reset()
+        print(f"Testing {self.title} on {self.size} examples...\n")
+
+        for i in range(self.size):
+            self.run_datapoint(i)
+
+        return self.report()
+
+    @classmethod
+    def test(cls, function, data, size=100, title=None):
+        """Convenience method to build a tester for a predictor function and run it."""
+        return cls(function, data, title=title, size=size).run()
+
--- a/week6/community-contributions/hopeogbons/data_cleaner.py
+++ b/week6/community-contributions/hopeogbons/data_cleaner.py
@@ -0,0 +1,68 @@
+"""
+Data cleaning utilities for dataset preparation
+"""
+
+from collections import defaultdict
+
+
+def clean_dataset(data, min_length=10, max_samples_per_intent=None):
+    """
+    Clean and prepare dataset for fine-tuning
+
+    Whitespace in each text is collapsed to single spaces (and trimmed) before
+    the length filter is applied, so min_length is measured on the text that is
+    actually kept.
+
+    Args:
+        data: HuggingFace dataset or list of examples with 'text' and 'label'
+        min_length: Minimum normalized text length to keep (default: 10)
+        max_samples_per_intent: Max samples per intent for balancing
+            (default: None = no limit; 0 keeps nothing)
+
+    Returns:
+        list: Cleaned examples. Without a cap the input order is preserved;
+            with a cap the examples are grouped by label, labels in order of
+            first appearance.
+
+    Raises:
+        ValueError: If max_samples_per_intent is negative
+
+    Example:
+        >>> cleaned = clean_dataset(dataset['train'], min_length=10, max_samples_per_intent=200)
+        >>> print(f"Cleaned {len(cleaned)} examples")
+    """
+    if max_samples_per_intent is not None and max_samples_per_intent < 0:
+        # A negative cap would silently slice items off the end of each class.
+        raise ValueError(
+            f"max_samples_per_intent must be >= 0, got {max_samples_per_intent}"
+        )
+
+    cleaned = []
+    for example in data:
+        # Normalize first: split()/join collapses runs of whitespace and trims
+        # both ends, so padded text cannot sneak past the length filter.
+        text = ' '.join(example['text'].split())
+
+        # Skip if too short
+        if len(text) < min_length:
+            continue
+
+        cleaned.append({
+            'text': text,
+            'label': example['label']
+        })
+
+    # Compare against None explicitly so a cap of 0 is not mistaken for "no limit".
+    if max_samples_per_intent is None:
+        return cleaned
+
+    # Balance classes: keep at most max_samples_per_intent examples per label
+    by_label = defaultdict(list)
+    for item in cleaned:
+        by_label[item['label']].append(item)
+
+    balanced = []
+    for items in by_label.values():
+        balanced.extend(items[:max_samples_per_intent])
+    return balanced
+
+
+def analyze_distribution(data):
+    """
+    Count how many examples carry each label
+
+    Args:
+        data: Iterable of examples, each exposing a 'label' field
+
+    Returns:
+        Counter: Mapping of label -> number of occurrences
+    """
+    from collections import Counter
+    distribution = Counter()
+    for example in data:
+        distribution[example['label']] += 1
+    return distribution
+
--- a/week6/community-contributions/hopeogbons/week6
+++ b/week6/community-contributions/hopeogbons/week6