Add week6 contributions

2025-06-05 16:41:08 +02:00
parent 5782ca2b43
commit cf85184eab
8 changed files with 9383 additions and 0 deletions
--- a/week6/community-contributions/helpers/items.py
+++ b/week6/community-contributions/helpers/items.py
@@ -0,0 +1,120 @@
+from typing import Optional  # A variable might be a certain type or None
+from transformers import AutoTokenizer
+import re
+
+# Model whose tokenizer is used to count and truncate tokens
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
+
+# Token budget for the product text of one item:
+#   MIN_TOKENS - an item needs more tokens than this to be accepted
+#   MAX_TOKENS - text is cut to 160 tokens so that, once the prompt text is added,
+#                the total stays around 180 tokens
+MIN_TOKENS, MAX_TOKENS = 150, 160
+
+# Character budget applied before tokenizing:
+#   MIN_CHARS     - an item needs more characters than this to be considered
+#   CEILING_CHARS - long text is cut to 1120 characters (7 characters per allowed token)
+MIN_CHARS = 300
+CEILING_CHARS = 7 * MAX_TOKENS
+
+class Item:
+    """
+    An Item is a cleaned, curated datapoint of a Product with a Price.
+
+    Constructing an Item scrubs the product text, tokenizes it and, when enough
+    text remains, builds a training prompt that ends with the rounded price.
+    Check `include` to find out whether the datapoint passed the length filters.
+    """
+
+    # Tokenizer for the base model; created when the class body runs and shared by all items.
+    # NOTE(review): trust_remote_code=True allows code shipped with the model repo to run;
+    # it is presumably not required for this tokenizer - confirm before removing it.
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+
+    # Define PRICE_LABEL and question for the training prompt
+    PRICE_LABEL = "Price is $"
+    QUESTION = "How much does this cost to the nearest dollar?"
+
+    # A list of useless phrases to remove to reduce noise for price prediction.
+    # They are removed in list order, so longer phrases must stay ahead of their substrings
+    # (e.g. "Number of" before "Number").
+    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]
+
+    # Attributes for each item
+    title: str
+    price: float
+    category: str
+    token_count: int = 0  # How many tokens in the final prompt
+
+    # Optional fields
+    details: Optional[str] = None  # Raw details text of the product, or None
+    prompt: Optional[str] = None  # Training prompt; stays None when the item is rejected
+    include = False  # Whether to keep the item or not
+
+    def __init__(self, data, price: float) -> None:
+        """
+        Build an item from a raw datapoint and its price.
+
+        `data` is indexed with the keys 'title', 'description', 'features' and 'details'.
+        """
+        self.title = data['title']
+        self.price = price
+        self.parse(data)
+
+    def scrub_details(self) -> str:
+        """
+        Return the details text with every phrase listed in REMOVALS deleted.
+
+        Must only be called when `self.details` is a non-empty string.
+        """
+        details = self.details
+        for remove in self.REMOVALS:
+            details = details.replace(remove, "")
+        return details
+
+    def scrub(self, stuff: str) -> str:
+        """
+        Clean up the provided text by removing unnecessary characters and whitespace.
+
+        Also remove words that are 7+ chars and contain digits, as these are likely
+        irrelevant product numbers.
+        """
+        # Collapse runs of brackets, quotes, colons and whitespace into a single space
+        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
+        # Tidy up the commas left behind by the removals
+        stuff = stuff.replace(" ,", ",").replace(",,,", ",").replace(",,", ",")
+        words = stuff.split(' ')
+        select = [word for word in words if len(word) < 7 or not any(char.isdigit() for char in word)]
+        return " ".join(select)
+
+    def parse(self, data) -> None:
+        """
+        Prepare the text, check its length, tokenize it, and set include = True if it is valid.
+
+        An item is kept only when its combined text is longer than MIN_CHARS characters
+        and the scrubbed text encodes to more than MIN_TOKENS tokens.
+        """
+        # Build one contents string from the description, the features and the cleaned details
+        contents = '\n'.join(data['description'])
+        if contents:
+            contents += '\n'
+        features = '\n'.join(data['features'])
+        if features:
+            contents += features + '\n'
+        self.details = data['details']
+        if self.details:
+            contents += self.scrub_details() + '\n'
+
+        # Not enough raw text: leave include = False
+        if len(contents) <= MIN_CHARS:
+            return
+
+        # Trim to the character ceiling before the more expensive scrub and tokenize steps
+        contents = contents[:CEILING_CHARS]
+        text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
+        tokens = self.tokenizer.encode(text, add_special_tokens=False)
+
+        # Not enough tokens after cleaning: leave include = False
+        if len(tokens) <= MIN_TOKENS:
+            return
+
+        # Truncate tokens, decode them back and create the training prompt
+        tokens = tokens[:MAX_TOKENS]
+        text = self.tokenizer.decode(tokens)
+        self.make_prompt(text)
+
+        # Kept items had more than MIN_TOKENS tokens and were cut down to at most MAX_TOKENS
+        self.include = True
+
+    def make_prompt(self, text: str) -> None:
+        """
+        Build the training prompt from the question, the text and the price, then count its tokens.
+
+        The price is rounded to the nearest whole number and written with a ".00" suffix.
+        """
+        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
+        self.prompt += f"{self.PRICE_LABEL}{round(self.price)}.00"
+        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))
+
+    def test_prompt(self) -> str:
+        """
+        Return the prompt without the actual price, useful for testing/inference.
+
+        Only meaningful for items that have a prompt (include is True).
+        """
+        # Split on the LAST label: the product text itself may contain "Price is $",
+        # and the price to hide is always the one appended at the very end.
+        return self.prompt.rsplit(self.PRICE_LABEL, 1)[0] + self.PRICE_LABEL
+
+    def __repr__(self) -> str:
+        """
+        Define how the Item object looks when printed: it shows the title and price.
+        """
+        return f"<{self.title} = ${self.price}>"
+
+        
+
+    
+