Add week8 contributions

2025-06-05 16:42:02 +02:00
parent 5782ca2b43
commit 141216e8f7
13 changed files with 12066 additions and 0 deletions
--- a/week8/community_contributions/10_part1_ensemble_model.ipynb
+++ b/week8/community_contributions/10_part1_ensemble_model.ipynb
--- a/week8/community_contributions/10_part2_modal.ipynb
+++ b/week8/community_contributions/10_part2_modal.ipynb
--- a/week8/community_contributions/agents/init.py
+++ b/week8/community_contributions/agents/init.py
--- a/week8/community_contributions/agents/base_agent.py
+++ b/week8/community_contributions/agents/base_agent.py
@@ -0,0 +1,33 @@
+import logging
+
+class Agent:
+    """
+    Common base class for all Agents.
+    Provides ANSI color constants and a log helper that tags every message
+    with the Agent's name so interleaved output can be told apart.
+    Subclasses override `name` and `color`.
+    """
+
+    # ANSI escape sequences: foreground colors
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    MAGENTA = '\033[35m'
+    CYAN = '\033[36m'
+    WHITE = '\033[37m'
+
+    # ANSI escape sequence: black background
+    BG_BLACK = '\033[40m'
+
+    # ANSI escape sequence that restores the default colors
+    RESET = '\033[0m'
+
+    # Identity of the Agent; the defaults are an empty name and white text
+    name: str = ""
+    color: str = WHITE
+
+    def log(self, message):
+        """
+        Send the message to logging.info, prefixed with "[<name>] " and
+        wrapped in the black background plus this Agent's foreground color
+        """
+        tagged = f"[{self.name}] {message}"
+        logging.info("".join((self.BG_BLACK, self.color, tagged, self.RESET)))
--- a/week8/community_contributions/agents/ft_price_agent.py
+++ b/week8/community_contributions/agents/ft_price_agent.py
@@ -0,0 +1,29 @@
+import modal
+from agents.base_agent import Agent
+
+
+class FTPriceAgent(Agent):
+    """
+    An Agent that gets price estimates from the fine-tuned LLM running remotely on Modal
+    """
+
+    name = "FTPrice Agent"
+    color = Agent.RED
+
+    def __init__(self):
+        """
+        Look up the remote Modal class and keep an instance handle for later calls
+        """
+        self.log("FTPrice Agent is initializing - connecting to modal")
+        # 1st API call: fetch the "Pricer" class of the "llm-ft-pricer" app
+        remote_cls = modal.Cls.from_name("llm-ft-pricer", "Pricer")
+        self.pricer = remote_cls()
+        self.log("FTPrice Agent is ready")
+
+    def price(self, description: str) -> float:
+        """
+        Make a remote call and return the model's price estimate for this item
+        """
+        self.log("FTPrice Agent is calling remote fine-tuned model")
+        # 2nd API call: run the price method of the remote Pricer class
+        estimate = self.pricer.price.remote(description)
+        self.log(f"FTPrice Agent completed - predicting ${estimate:.2f}")
+        return estimate
--- a/week8/community_contributions/data/human_output.csv
+++ b/week8/community_contributions/data/human_output.csv
--- a/week8/community_contributions/helpers/init.py
+++ b/week8/community_contributions/helpers/init.py
--- a/week8/community_contributions/helpers/items.py
+++ b/week8/community_contributions/helpers/items.py
@@ -0,0 +1,120 @@
+from typing import Optional  # A variable might be a certain type or None
+from transformers import AutoTokenizer
+import re
+
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"  # Hugging Face model whose tokenizer Item loads
+
+MIN_TOKENS = 150 # An item is accepted only if its text has more than this many tokens
+MAX_TOKENS = 160 # Text is cut to at most 160 tokens so that after adding prompt text, the total stays around 180 tokens.
+
+MIN_CHARS = 300 # Items whose combined text is not longer than 300 characters are rejected
+CEILING_CHARS = MAX_TOKENS * 7 # Text is cut to 1120 characters before tokenizing (presumably a generous allowance for 160 tokens)
+
+class Item:
+    """
+    An Item is a cleaned, curated datapoint of a Product with a Price.
+
+    Constructing an Item runs parse(), which assembles the product text,
+    applies the character and token limits and, when the text is long enough,
+    builds the training prompt and sets include to True.
+    """
+
+    # Tokenizer for BASE_MODEL; loaded once when the class body runs and shared by all items
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+
+    # Label that precedes the price, and the question that opens every prompt
+    PRICE_LABEL = "Price is $"
+    QUESTION = "How much does this cost to the nearest dollar?"
+
+    # Phrases removed from the details text to reduce noise for price prediction
+    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]
+
+    # Attributes for each item
+    title: str
+    price: float
+    category: str  # Not assigned by this class; set from outside after construction
+    token_count: int = 0  # How many tokens in the final prompt
+
+    # Optional fields
+    details: Optional[str]  # Raw details value from the datapoint; can be None
+    prompt: Optional[str] = None  # Stays None until make_prompt has run
+    include = False  # True once the item has passed the length checks in parse
+
+    def __init__(self, data, price):
+        """
+        Store the title and price, then try to build the prompt.
+
+        data: mapping providing 'title', 'description', 'features' and 'details'
+        price: the price of the item
+        """
+        self.title = data['title']
+        self.price = price
+        self.parse(data)
+
+    def scrub_details(self):
+        """
+        Return self.details with every phrase in REMOVALS deleted.
+        The phrases are removed in list order.
+        """
+        details = self.details
+        for remove in self.REMOVALS:
+            details = details.replace(remove, "")
+        return details
+
+    def scrub(self, stuff):
+        """
+        Clean up the provided text by removing unnecessary characters and whitespace.
+        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers.
+        """
+        # Collapse runs of brackets, quotes, colons and whitespace into a single space
+        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
+        # Tidy up the stray commas that the substitution can leave behind
+        stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
+        words = stuff.split(' ')
+        select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]
+        return " ".join(select)
+
+    def parse(self, data):
+        """
+        Prepare the text, check its length, tokenize it, and set include = True if it is valid.
+        Items that fail a length check are left with include = False and no prompt.
+        """
+        # Build the contents string from description, features and cleaned details
+        contents = '\n'.join(data['description'])
+        if contents:
+            contents += '\n'
+        features = '\n'.join(data['features'])
+        if features:
+            contents += features + '\n'
+        self.details = data['details']
+        if self.details:
+            contents += self.scrub_details() + '\n'
+
+        # Not enough raw text to be worth tokenizing
+        if len(contents) <= MIN_CHARS:
+            return
+
+        # Cap the raw text before cleaning and tokenizing it
+        contents = contents[:CEILING_CHARS]
+        text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
+        tokens = self.tokenizer.encode(text, add_special_tokens=False)
+
+        # Too few tokens once cleaned
+        if len(tokens) <= MIN_TOKENS:
+            return
+
+        # Truncate to MAX_TOKENS, decode back to text and create the training prompt
+        tokens = tokens[:MAX_TOKENS]
+        text = self.tokenizer.decode(tokens)
+        self.make_prompt(text)
+
+        # Kept items had more than MIN_TOKENS tokens and were cut to at most MAX_TOKENS
+        self.include = True
+
+    def make_prompt(self, text):
+        """
+        Build the training prompt from the question, the text and the rounded price,
+        then record how many tokens the prompt has.
+        """
+        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
+        self.prompt += f"{self.PRICE_LABEL}{str(round(self.price))}.00"
+        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))
+
+    def test_prompt(self):
+        """
+        Return the prompt without the actual price, useful for testing/inference.
+        Only usable once make_prompt has run, because prompt is None before that.
+        """
+        # Split on the LAST label: the product text itself may contain "Price is $",
+        # and splitting on the first occurrence would cut the prompt short
+        return self.prompt.rsplit(self.PRICE_LABEL, 1)[0] + self.PRICE_LABEL
+
+    def __repr__(self):
+        """
+        Show the title and price when the Item is printed.
+        """
+        return f"<{self.title} = ${self.price}>"
+
+        
+
+    
+    
--- a/week8/community_contributions/helpers/loaders.py
+++ b/week8/community_contributions/helpers/loaders.py
@@ -0,0 +1,106 @@
+from datetime import datetime # Measure how long loading takes
+from tqdm import tqdm # Shows a progress bar while processing data
+from datasets import load_dataset # Load a dataset from Hugging Face Hub
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor # For parallel processing (speed)
+from items import Item
+
+CHUNK_SIZE = 1000 # Number of datapoints placed in each chunk that is handed to a worker
+MIN_PRICE = 0.5 # Lowest price (inclusive) an item may have to be kept
+MAX_PRICE = 999.49 # Highest price (inclusive) an item may have to be kept
+WORKER = 4 # Default number of worker processes used by ItemLoader.load
+
+class ItemLoader:
+    """
+    Loads one category of the Amazon-Reviews-2023 metadata from Hugging Face
+    and converts it into a list of valid Item objects.
+    """
+
+    def __init__(self, name):
+        """
+        Initialize the loader with a dataset name.
+        """
+        self.name = name  # Category name; also used to pick the dataset config
+        self.dataset = None  # Filled in by load()
+
+    def process_chunk(self, chunk):
+        """
+        Convert a chunk of datapoints into valid Item objects.
+
+        A datapoint is skipped when its price is empty, raises ValueError while
+        being processed, lies outside MIN_PRICE..MAX_PRICE, or when the
+        resulting Item is not marked for inclusion.
+        """
+        batch = []
+        for datapoint in chunk:
+            try:
+                price_str = datapoint['price']
+                if not price_str:
+                    continue
+                price = float(price_str)
+                if not MIN_PRICE <= price <= MAX_PRICE:
+                    continue
+                item = Item(datapoint, price)
+                if item.include:
+                    batch.append(item)
+            except ValueError:
+                continue  # Skip datapoints with invalid price format
+        return batch
+
+    def load_in_parallel(self, workers):
+        """
+        Split the dataset into chunks and process them in parallel.
+
+        workers: number of worker processes in the pool
+        Returns the Items from all chunks, each tagged with this loader's category name.
+        """
+        results = []
+        size = len(self.dataset)
+
+        # Slice the dataset into consecutive chunks of at most CHUNK_SIZE datapoints
+        chunks = [
+            self.dataset.select(range(i, min(i + CHUNK_SIZE, size)))
+            for i in range(0, size, CHUNK_SIZE)
+        ]
+
+        # The progress total is the real number of chunks; size // CHUNK_SIZE + 1
+        # over-counts by one whenever size is an exact multiple of CHUNK_SIZE
+        with ProcessPoolExecutor(max_workers=workers) as pool:
+            for batch in tqdm(pool.map(self.process_chunk, chunks), total=len(chunks)):
+                results.extend(batch)
+
+        # Add the category name to each result
+        for result in results:
+            result.category = self.name
+
+        return results
+
+    def load(self, workers=WORKER):
+        """
+        Load and process the dataset, returning valid items.
+
+        workers: number of worker processes to use (defaults to WORKER)
+        Prints a start message and a summary with the elapsed time in minutes.
+        """
+        start = datetime.now()
+        print(f"Loading dataset {self.name}", flush=True)
+
+        # Load dataset from Hugging Face (based on category name)
+        self.dataset = load_dataset(
+            "McAuley-Lab/Amazon-Reviews-2023",
+            f"raw_meta_{self.name}",
+            split="full",
+            trust_remote_code=True
+        )
+
+        results = self.load_in_parallel(workers)
+
+        finish = datetime.now()
+        print(
+            f"Completed {self.name} with {len(results):,} datapoints in {(finish-start).total_seconds()/60:.1f} mins",
+            flush=True
+        )
+        return results
+
+
+    
+    
--- a/week8/community_contributions/helpers/testing.py
+++ b/week8/community_contributions/helpers/testing.py
@@ -0,0 +1,84 @@
+import math
+import matplotlib.pyplot as plt
+
+# ANSI escape codes for bright terminal colors, plus the reset sequence
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+RED = "\033[91m"
+RESET = "\033[0m"
+# Maps the color names returned by Tester.color_for to escape codes ("orange" uses the yellow code)
+COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
+
+class Tester:
+    """
+    Runs a predictor over the first `size` datapoints, collects the errors and
+    shows a scatter chart of guesses against true prices.
+    """
+
+    def __init__(self, predictor, data, title=None, size=250):
+        """
+        predictor: callable that takes one datapoint and returns a numeric guess
+        data: indexable collection of datapoints, each providing "price" and "text"
+        title: chart title prefix; defaults to a prettified predictor.__name__
+        size: how many datapoints, from the start of data, are evaluated
+        """
+        self.predictor = predictor
+        self.data = data
+        self.title = title or predictor.__name__.replace("_", " ").title()
+        self.size = size
+        self.guesses = []
+        self.truths = []
+        self.errors = []
+        self.sles = []
+        self.colors = []
+
+    def color_for(self, error, truth):
+        """
+        Grade an absolute error as "green", "orange" or "red".
+
+        Green: error below 40 or below 20% of truth.
+        Orange: error below 80 or below 40% of truth.
+        A truth of 0 is graded on the absolute thresholds only, so it no
+        longer raises ZeroDivisionError.
+        """
+        if error < 40 or (truth != 0 and error / truth < 0.2):
+            return "green"
+        if error < 80 or (truth != 0 and error / truth < 0.4):
+            return "orange"
+        return "red"
+
+    def run_datapoint(self, i):
+        """
+        Evaluate datapoint i and append its guess, truth, absolute error,
+        squared log error and color to the result lists.
+        """
+        datapoint = self.data[i]
+        guess = self.predictor(datapoint)
+        truth = datapoint["price"]
+        error = abs(guess - truth)
+        log_error = math.log(truth+1) - math.log(guess+1)
+        sle = log_error ** 2
+        color = self.color_for(error, truth)
+        # Shortened item text, used only by the optional per-item print below
+        title = datapoint["text"][:40] + "..." if len(datapoint["text"]) > 40 else datapoint["text"]
+        self.guesses.append(guess)
+        self.truths.append(truth)
+        self.errors.append(error)
+        self.sles.append(sle)
+        self.colors.append(color)
+        # print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")
+
+    def chart(self, title):
+        """
+        Show a scatter plot of guesses against truths, colored by grade,
+        with a diagonal reference line and a color legend.
+        """
+        plt.figure(figsize=(15, 6))
+        max_val = max(max(self.truths), max(self.guesses))
+        # Diagonal on which a guess equals the truth
+        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
+        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
+        plt.xlabel('Ground Truth')
+        plt.ylabel('Model Estimate')
+        plt.xlim(0, max_val)
+        plt.ylim(0, max_val)
+        plt.title(title)
+
+        # Add color legend
+        from matplotlib.lines import Line2D
+        legend_elements = [
+            Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),
+            Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),
+            Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)
+        ]
+        plt.legend(handles=legend_elements, loc='upper left')
+        plt.show()
+
+    def report(self):
+        """
+        Compute the average error, RMSLE and green-hit percentage, then chart them.
+        """
+        # Average over the datapoints actually evaluated, so the figures stay
+        # correct even if that number differs from self.size
+        count = len(self.errors)
+        average_error = sum(self.errors) / count
+        rmsle = math.sqrt(sum(self.sles) / count)
+        hits = sum(1 for color in self.colors if color=="green")
+        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
+        self.chart(title)
+
+    def run(self):
+        """
+        Evaluate the first self.size datapoints and show the report.
+        """
+        self.error = 0  # Not read anywhere in this class; kept so the attribute still exists
+        for i in range(self.size):
+            self.run_datapoint(i)
+        self.report()
+
+    @classmethod
+    def test(cls, function, data):
+        """
+        Convenience entry point: build a Tester with default settings and run it.
+        """
+        cls(function, data).run()
--- a/week8/community_contributions/modal_services/init.py
+++ b/week8/community_contributions/modal_services/init.py
--- a/week8/community_contributions/modal_services/ft_pricer.py
+++ b/week8/community_contributions/modal_services/ft_pricer.py
@@ -0,0 +1,140 @@
+import modal
+from modal import App, Volume, Image
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Constants
+# ─────────────────────────────────────────────────────────────────────────────
+
+GPU = "T4"  # GPU type requested for the Pricer class
+CACHE_PATH = "/cache"  # Mount point for the Modal volume
+
+# Hugging Face model references
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
+FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
+REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"  # Commit of the fine-tuned model
+
+# Directories inside the mounted volume that the two models are downloaded into
+BASE_MODEL_DIR = f"{CACHE_PATH}/llama_base_model"
+FINETUNED_MODEL_DIR = f"{CACHE_PATH}/llama_finetuned_model"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Structure
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Container (App: llm-ft-pricer)
+# ├── /app                                     ← Code + installed Python packages (from image)
+# ├── /cache                                   ← Mounted Modal volume (`hf-hub-cache`)
+# │   ├── llama_base_model/                    ← Base model files downloaded via snapshot_download
+# │   └── llama_finetuned_model/               ← Fine-tuned model files downloaded via snapshot_download
+
+
+
+QUESTION = "How much does this cost to the nearest dollar?"  # First line of every prompt sent to the model
+PREFIX = "Price is $"  # Ends the prompt, so the model continues with the price
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Modal App, Image, Volume, Secrets
+# ─────────────────────────────────────────────────────────────────────────────
+
+app = modal.App("llm-ft-pricer")  # Define the Modal app
+
+image = (
+    Image.debian_slim()
+    # NOTE(review): the code imports `huggingface_hub`, not `huggingface`; presumably it arrives
+    # as a dependency of transformers - confirm whether the "huggingface" entry is needed
+    .pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")  # All needed libraries
+    .env({"HF_HUB_CACHE": CACHE_PATH})  # Point the Hugging Face hub cache at /cache
+)
+
+cache_vol = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)  # Named volume for caching models, created if it does not exist
+secrets = [modal.Secret.from_name("HF_TOKEN")]  # Modal secret named "HF_TOKEN" (presumably holds the Hugging Face auth token)
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Modal Class: Pricer
+# ─────────────────────────────────────────────────────────────────────────────
+
+# All methods in this class run inside the container with the image, volume, secrets, and GPU you configured.
+@app.cls(
+    image=image,
+    secrets=secrets,
+    volumes={CACHE_PATH: cache_vol},  # Mount volume into /cache
+    gpu=GPU,
+    timeout=1800,                     # 30-minute max runtime
+    min_containers=0,                 # 0 = scale to zero when idle; 1 would keep a container warm and use credits continuously
+    scaledown_window=300,             # Seconds an idle container is kept before it is shut down
+)
+class Pricer:
+    """
+    Modal class that serves price estimates from the fine-tuned Llama model.
+    """
+
+    @modal.enter()
+    def setup(self):
+        """
+        Download the base and fine-tuned models into the cache volume, then load
+        the tokenizer and the 4-bit quantized base model with the fine-tuned
+        weights applied. The results are stored on self for price() to use.
+        """
+        import os, torch
+        import logging
+        from huggingface_hub import snapshot_download
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+        from peft import PeftModel
+
+        # Create cache path if it doesn't exist
+        os.makedirs(CACHE_PATH, exist_ok=True)
+
+        # Download base and fine-tuned models into volume
+        logging.info("Downloading base model...")
+        snapshot_download(BASE_MODEL, local_dir=BASE_MODEL_DIR)
+
+        logging.info("Downloading fine-tuned model...")
+        snapshot_download(FINETUNED_MODEL, revision=REVISION, local_dir=FINETUNED_MODEL_DIR)
+
+        # Quantization config (4-bit)
+        quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load tokenizer; the end-of-sequence token doubles as the padding token
+        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.tokenizer.padding_side = "right"
+
+        # Load base model (quantized)
+        base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_DIR,
+            quantization_config=quant_config,
+            device_map="auto"
+        )
+
+        # Apply fine-tuned weights
+        self.fine_tuned_model = PeftModel.from_pretrained(
+            base_model,
+            FINETUNED_MODEL_DIR,
+            revision=REVISION
+        )
+        self.fine_tuned_model.generation_config.pad_token_id = self.tokenizer.pad_token_id
+
+    @modal.method()
+    def price(self, description: str) -> float:
+        """
+        Estimate the price of the described item.
+
+        Returns the first number found in the text the model generates after
+        the prompt, or 0.0 when that text contains no number.
+        """
+        import re, torch
+        from transformers import set_seed
+
+        set_seed(42)  # Fixed seed for repeatable output
+
+        # Construct prompt
+        prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
+        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
+        attention_mask = torch.ones(inputs.shape, device="cuda")
+
+        # Generate model output (max 5 tokens)
+        outputs = self.fine_tuned_model.generate(
+            inputs,
+            attention_mask=attention_mask,
+            max_new_tokens=5,
+            num_return_sequences=1
+        )
+
+        # Decode only the newly generated tokens. Splitting the full decoded text on
+        # "Price is $" picks the wrong segment when the description itself contains
+        # that phrase.
+        generated = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+
+        # Extract the first number, ignoring thousands separators
+        generated = generated.replace(',', '')
+        match = re.search(r"[-+]?\d*\.\d+|\d+", generated)
+        return float(match.group()) if match else 0.0  # 0.0 when no number was generated
+
+
--- a/week8/community_contributions/modal_services/get_started.py
+++ b/week8/community_contributions/modal_services/get_started.py
@@ -0,0 +1,12 @@
+import sys, modal
+
+# Modal app for the hello-world example
+app = modal.App("example-hello-world")
+
+@app.function()
+def f(i: int) -> int:
+    """
+    Print "hello" for an even i (stdout) or "world" for an odd i (stderr),
+    then return the square of i.
+    """
+    is_even = i % 2 == 0
+    if not is_even:
+        print("world", i, file=sys.stderr)
+    else:
+        print("hello", i)
+
+    return i * i