Add data cleaning utilities for dataset preparation
This commit introduces a new Python module, data_cleaner.py, which provides functions for cleaning and preparing datasets for fine-tuning. The module includes a function that cleans a dataset by filtering out texts below a minimum length, normalizing whitespace, and optionally capping the number of samples per intent to balance class distributions, as well as a function that reports the label distribution. These utilities strengthen the data preprocessing step of the application.
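A minimal usage sketch of the two helpers, assuming data_cleaner.py is importable from the working directory; the toy records below are illustrative:

from data_cleaner import clean_dataset, analyze_distribution

# Toy records standing in for a HuggingFace dataset split;
# any iterable of dicts with 'text' and 'label' fields works.
data = [
    {"text": "  reset   my password please ", "label": "account"},
    {"text": "hi", "label": "greeting"},  # shorter than min_length, dropped
    {"text": "what is my current balance", "label": "billing"},
    {"text": "how do I reset my password", "label": "account"},
]

cleaned = clean_dataset(data, min_length=10, max_samples_per_intent=1)
print(f"Cleaned {len(cleaned)} examples")   # Cleaned 2 examples
print(dict(analyze_distribution(cleaned)))  # {'account': 1, 'billing': 1}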
week6/community-contributions/hopeogbons/data_cleaner.py (new file, 68 lines)
@@ -0,0 +1,68 @@
"""
|
||||
Data cleaning utilities for dataset preparation
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def clean_dataset(data, min_length=10, max_samples_per_intent=None):
|
||||
"""
|
||||
Clean and prepare dataset for fine-tuning
|
||||
|
||||
Args:
|
||||
data: HuggingFace dataset or list of examples
|
||||
min_length: Minimum text length to keep (default: 10)
|
||||
max_samples_per_intent: Max samples per intent for balancing (default: None = no limit)
|
||||
|
||||
Returns:
|
||||
list: Cleaned examples
|
||||
|
||||
Example:
|
||||
>>> cleaned = clean_dataset(dataset['train'], min_length=10, max_samples_per_intent=200)
|
||||
>>> print(f"Cleaned {len(cleaned)} examples")
|
||||
"""
|
||||
cleaned = []
|
||||
|
||||
for example in data:
|
||||
text = example['text'].strip()
|
||||
|
||||
# Skip if too short
|
||||
if len(text) < min_length:
|
||||
continue
|
||||
|
||||
# Normalize text - remove extra whitespace
|
||||
text = ' '.join(text.split())
|
||||
|
||||
cleaned.append({
|
||||
'text': text,
|
||||
'label': example['label']
|
||||
})
|
||||
|
||||
# Balance classes if max_samples_per_intent is specified
|
||||
if max_samples_per_intent:
|
||||
balanced = defaultdict(list)
|
||||
|
||||
for item in cleaned:
|
||||
balanced[item['label']].append(item)
|
||||
|
||||
cleaned = []
|
||||
for label, items in balanced.items():
|
||||
cleaned.extend(items[:max_samples_per_intent])
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def analyze_distribution(data):
|
||||
"""
|
||||
Analyze label distribution in dataset
|
||||
|
||||
Args:
|
||||
data: List of examples with 'label' field
|
||||
|
||||
Returns:
|
||||
dict: Label counts
|
||||
"""
|
||||
from collections import Counter
|
||||
labels = [item['label'] for item in data]
|
||||
return Counter(labels)