Add week6 contributions

2025-06-05 16:41:08 +02:00
parent 5782ca2b43
commit cf85184eab
8 changed files with 9383 additions and 0 deletions
--- a/week6/community-contributions/09_part1_data_curation.ipynb
+++ b/week6/community-contributions/09_part1_data_curation.ipynb
--- a/week6/community-contributions/09_part2_tradml_vs_frontier.ipynb
+++ b/week6/community-contributions/09_part2_tradml_vs_frontier.ipynb
--- a/week6/community-contributions/09_part3_e5embeddings_rag.ipynb
+++ b/week6/community-contributions/09_part3_e5embeddings_rag.ipynb
--- a/week6/community-contributions/09_part4_ft_gpt4omini.ipynb
+++ b/week6/community-contributions/09_part4_ft_gpt4omini.ipynb
--- a/week6/community-contributions/helpers/init.py
+++ b/week6/community-contributions/helpers/init.py
--- a/week6/community-contributions/helpers/items.py
+++ b/week6/community-contributions/helpers/items.py
@@ -0,0 +1,120 @@
 from typing import Optional  # A variable might be a certain type or None
 from transformers import AutoTokenizer
 import re
 # Hugging Face model id whose tokenizer is used to count and truncate tokens
 BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
 # An item is accepted only when its scrubbed text encodes to MORE than this many tokens
 MIN_TOKENS = 150 # Minimum tokens required to accept an item
 # Accepted text is truncated to this many tokens before the prompt is built
 MAX_TOKENS = 160 # We limit to 160 tokens so that after adding prompt text, the total stays around 180 tokens.
 # Raw contents must be LONGER than this many characters to be processed at all
 MIN_CHARS = 300 # Reject items with less than 300 characters
 # Character cap applied to the raw contents before scrubbing and tokenizing (160 * 7 = 1120)
 CEILING_CHARS = MAX_TOKENS * 7 # Truncate long text to about 1120 characters (approx 160 tokens)
 class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price.

    Constructing an Item immediately runs parse(), which assembles the product text,
    applies the character and token filters and, when the datapoint passes them, sets
    `prompt`, `token_count` and `include = True`. Items that fail the filters keep
    `include = False` and `prompt = None`.
    """
    # Load tokenizer for the model (executed once, when the class body runs)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Define PRICE_LABEL and question for the training prompt
    PRICE_LABEL = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    # A list of useless phrases to remove to reduce noise for price prediction
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]
    # Attributes for each item
    title: str
    price: float
    category: str  # Not assigned in this class; expected to be set by whoever loads the items
    token_count: int = 0 # How many tokens in the final prompt
    # Optional fields
    details: Optional[str] # The value can be a string or can be None
    prompt: Optional[str] = None
    include = False # Whether to keep the item or not

    def __init__(self, data, price):
        """
        Build an Item from a raw datapoint.

        data: mapping that provides 'title', 'description' and 'features' (iterables of
              strings) and 'details' (a string, or a falsy value when absent).
        price: numeric price of the product.
        """
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Removes useless phrases from details, which often has repeated specs or boilerplate text.
        Returns the cleaned string; self.details itself is left unchanged.
        """
        cleaned = self.details
        for phrase in self.REMOVALS:
            cleaned = cleaned.replace(phrase, "")
        return cleaned

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace
        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers
        """
        # Collapse runs of punctuation/brackets/whitespace into a single space
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        # Tidy up the commas left behind by the substitution above
        stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
        kept = [
            word for word in stuff.split(' ')
            if len(word) < 7 or not any(char.isdigit() for char in word)
        ]
        return " ".join(kept)

    def parse(self, data):
        """
        Prepares the text, checks length, tokenizes it, and sets include = True if it's valid.
        """
        # Builds a full contents string by combining description, features, and cleaned details.
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'
        # Too little raw text to be useful: leave include = False
        if len(contents) <= MIN_CHARS:
            return
        # Trim to the max char limit before the (more expensive) scrubbing and tokenizing
        contents = contents[:CEILING_CHARS]
        text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) <= MIN_TOKENS:
            return
        # Truncate tokens, decode them back and create the training prompt
        text = self.tokenizer.decode(tokens[:MAX_TOKENS])
        self.make_prompt(text)
        # Kept items had MORE than MIN_TOKENS tokens and were truncated to at most MAX_TOKENS
        self.include = True

    def make_prompt(self, text):
        """
        Builds the training prompt using the question, text, and price. Then counts the tokens.
        The price is rounded to the nearest whole number and written with a ".00" suffix.
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PRICE_LABEL}{round(self.price)}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Returns the prompt without the actual price, useful for testing/inference.

        Requires that the prompt has been built (i.e. the item was included); otherwise
        self.prompt is None and this raises AttributeError.
        """
        # The label is the LAST occurrence in the prompt (make_prompt appends it at the end),
        # so split from the right: product text may itself contain the phrase "Price is $".
        return self.prompt.rsplit(self.PRICE_LABEL, 1)[0] + self.PRICE_LABEL

    def __repr__(self):
        """
        Defines how the Item object looks when printed — it shows the title and price.
        """
        return f"<{self.title} = ${self.price}>"
--- a/week6/community-contributions/helpers/loaders.py
+++ b/week6/community-contributions/helpers/loaders.py
@@ -0,0 +1,106 @@
 from datetime import datetime # Measure how long loading takes
 from tqdm import tqdm # Shows a progress bar while processing data
 from datasets import load_dataset # Load a dataset from Hugging Face Hub
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor # For parallel processing (speed)
 from items import Item
 CHUNK_SIZE = 1000 # Process the dataset in chunks of 1000 datapoints at a time (for efficiency)
 # Inclusive price bounds: datapoints whose price falls outside [MIN_PRICE, MAX_PRICE] are skipped
 MIN_PRICE = 0.5
 MAX_PRICE = 999.49
 WORKER = 4 # Default number of worker processes used by ItemLoader.load
 class ItemLoader:
    """
    Loads one category of the Amazon Reviews 2023 metadata and converts it into Items.

    The dataset is split into chunks of CHUNK_SIZE datapoints which are processed in
    parallel worker processes; only Items flagged with include = True are returned.
    """

    def __init__(self, name):
        """
        Initialize the loader with a dataset name.
        """
        self.name = name # Store the category name
        self.dataset = None #Placeholder for the dataset (we load it later in load())

    def process_chunk(self, chunk):
        """
        Convert a chunk of datapoints into valid Item objects.

        Datapoints with an empty price, a price that cannot be parsed as a float, or a
        price outside [MIN_PRICE, MAX_PRICE] are skipped.
        """
        batch = [] # Holds the valid items of this chunk
        for datapoint in chunk:
            try:
                price_str = datapoint['price']
                if not price_str:
                    continue # No price available
                price = float(price_str)
                if not MIN_PRICE <= price <= MAX_PRICE:
                    continue # Price outside the accepted range
                item = Item(datapoint, price)
            except ValueError:
                continue # Skip datapoints with invalid price format
            # Keep only items that passed Item's own length filters
            if item.include:
                batch.append(item)
        return batch

    def load_in_parallel(self, workers):
        """
        Split the dataset into chunks and process them in parallel.

        workers: number of worker processes.
        Returns the list of valid Items, each tagged with this loader's category name.
        """
        size = len(self.dataset)
        # Build chunks directly here (no separate function)
        chunks = [
            self.dataset.select(range(start, min(start + CHUNK_SIZE, size)))
            for start in range(0, size, CHUNK_SIZE)
        ]
        results = []
        # Process chunks in parallel using multiple CPU cores
        with ProcessPoolExecutor(max_workers=workers) as pool:
            # total is the real number of chunks; size // CHUNK_SIZE + 1 over-counts by one
            # whenever size is an exact multiple of CHUNK_SIZE
            for batch in tqdm(pool.map(self.process_chunk, chunks), total=len(chunks)):
                results.extend(batch)
        # Add the category name to each result
        for result in results:
            result.category = self.name
        return results

    def load(self, workers=WORKER):
        """
        Load and process the dataset, returning valid items.

        workers: number of worker processes (defaults to WORKER).
        Prints a start message and a completion summary with the elapsed time in minutes.
        """
        start = datetime.now()
        print(f"Loading dataset {self.name}", flush=True)
        # Load dataset from Hugging Face (based on category name)
        self.dataset = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_meta_{self.name}",
            split="full",
            trust_remote_code=True
        )
        # Process the dataset in parallel and collect valid items
        results = self.load_in_parallel(workers)
        finish = datetime.now()
        print(
            f"Completed {self.name} with {len(results):,} datapoints in {(finish-start).total_seconds()/60:.1f} mins",
            flush=True
        )
        return results
--- a/week6/community-contributions/helpers/testing.py
+++ b/week6/community-contributions/helpers/testing.py
@@ -0,0 +1,84 @@
 import math
 import matplotlib.pyplot as plt
 # ANSI terminal escape sequences for colored console output
 GREEN = "\033[92m"
 YELLOW = "\033[93m"
 RED = "\033[91m"
 RESET = "\033[0m"  # Restores the terminal's default color
 # Keyed by the color names Tester.color_for returns; "orange" is shown as yellow in the terminal
 COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
 class Tester:
    """
    Evaluates a price predictor on the first `size` datapoints and charts the outcome.

    predictor: callable taking one datapoint and returning a numeric price estimate.
    data: indexable collection of datapoints; each must provide datapoint["price"].
    title: chart title prefix; defaults to a prettified name of the predictor.
    size: number of datapoints (taken from index 0) to evaluate.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        # Not every callable has __name__ (e.g. functools.partial objects), so fall back
        # to the type name instead of raising AttributeError.
        default_name = getattr(predictor, "__name__", type(predictor).__name__)
        self.title = title or default_name.replace("_", " ").title()
        self.size = size
        self._reset()

    def _reset(self):
        """Clear all per-datapoint results so a run starts from a clean slate."""
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """
        Classify an absolute error as "green", "orange" or "red".

        green: error below 40 or below 20% of the truth; orange: error below 80 or below
        40% of the truth; red otherwise.
        """
        # A truth of 0 has no meaningful relative error; treat it as infinite rather than
        # dividing by zero, so only the absolute thresholds apply.
        relative = error / truth if truth else math.inf
        if error < 40 or relative < 0.2:
            return "green"
        if error < 80 or relative < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """
        Predict datapoint `i` and record guess, truth, absolute error, squared log error and color.
        """
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint["price"]
        error = abs(guess - truth)
        # Squared log error on log(x + 1); math.log raises ValueError if guess <= -1
        log_error = math.log(truth+1) - math.log(guess+1)
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(log_error ** 2)
        self.colors.append(self.color_for(error, truth))

    def chart(self, title):
        """
        Scatter-plot guesses against truths, colored by accuracy, with a y = x reference line.
        """
        plt.figure(figsize=(15, 6))
        max_val = max(max(self.truths), max(self.guesses))
        # Diagonal marks a perfect prediction
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        # Add color legend
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)
        ]
        plt.legend(handles=legend_elements, loc='upper left')
        plt.show()

    def report(self):
        """
        Summarize the recorded results (mean error, RMSLE, share of green hits) and chart them.
        """
        # Average over what was actually recorded; equals self.size after run()
        count = len(self.errors)
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """
        Evaluate the first `size` datapoints and show the report.
        """
        self.error = 0  # Unused here; kept so existing readers of this attribute still find it
        # Start fresh so calling run() again does not double-count earlier results
        self._reset()
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data, title=None, size=250):
        """
        Convenience entry point: build a Tester for `function` over `data` and run it.
        `title` and `size` are optional and are passed through to the constructor.
        """
        cls(function, data, title=title, size=size).run()