"""
Data cleaning utilities for dataset preparation.
"""

from collections import Counter, defaultdict


def clean_dataset(data, min_length=10, max_samples_per_intent=None):
    """
    Clean and prepare a dataset for fine-tuning.

    Each example's text is stripped, runs of internal whitespace are
    collapsed to single spaces, and examples whose stripped text is
    shorter than ``min_length`` are dropped. Optionally caps the number
    of examples kept per label to balance class distributions.

    Args:
        data: HuggingFace dataset split or list of dicts; each example
            must have 'text' and 'label' fields.
        min_length: Minimum stripped-text length to keep (default: 10).
        max_samples_per_intent: Max samples kept per label for class
            balancing; None (or any falsy value) means no limit
            (default: None).

    Returns:
        list: Cleaned examples, each a dict with 'text' and 'label'.

    Example:
        >>> cleaned = clean_dataset(dataset['train'], min_length=10,
        ...                         max_samples_per_intent=200)
        >>> print(f"Cleaned {len(cleaned)} examples")
    """
    cleaned = []

    for example in data:
        text = example['text'].strip()

        # Skip examples too short to be useful training signal.
        if len(text) < min_length:
            continue

        # Normalize text: collapse all internal whitespace runs.
        text = ' '.join(text.split())

        cleaned.append({
            'text': text,
            'label': example['label']
        })

    # Balance classes if a per-label cap was requested.
    # NOTE: keeps the FIRST max_samples_per_intent examples per label in
    # encounter order; shuffle upstream if a random subsample is desired.
    if max_samples_per_intent:
        grouped = defaultdict(list)

        for item in cleaned:
            grouped[item['label']].append(item)

        cleaned = []
        # Iterate values only -- the label key is not needed here.
        for items in grouped.values():
            cleaned.extend(items[:max_samples_per_intent])

    return cleaned


def analyze_distribution(data):
    """
    Analyze the label distribution of a dataset.

    Args:
        data: Iterable of examples, each with a 'label' field.

    Returns:
        collections.Counter: Mapping of label -> occurrence count.
    """
    # Feed Counter lazily via a generator; no intermediate list needed.
    return Counter(item['label'] for item in data)