Files
LLM_Engineering_OLD/week6/community-contributions/helpers/items.py
2025-06-05 16:41:08 +02:00

120 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from typing import Optional # A variable might be a certain type or None
from transformers import AutoTokenizer
import re
# Model identifier passed to AutoTokenizer.from_pretrained to load the tokenizer used for counting and truncating tokens
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
MIN_TOKENS = 150 # An item is accepted only when its cleaned text has MORE than this many tokens (strict > check in Item.parse)
MAX_TOKENS = 160 # Cleaned text is cut to at most this many tokens; the question and price label are added afterwards (original note: total stays around 180 tokens)
MIN_CHARS = 300 # Reject items whose combined raw text is not longer than this many characters (strict > check in Item.parse)
CEILING_CHARS = MAX_TOKENS * 7 # Character cap applied before tokenizing: 160 * 7 = 1120 characters, i.e. a budget of 7 characters per token
class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price.

    Constructing an Item immediately runs parse(), which assembles the product
    text, applies the length filters and, when the item passes, builds the
    training prompt. Check `include` afterwards to see whether it was kept.
    """

    # Load tokenizer for the model (executed once, when the class body runs)
    # NOTE(review): trust_remote_code=True allows code shipped in the model repo
    # to run locally - confirm it is actually required for this tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

    # Define PRICE_LABEL and question for the training prompt
    PRICE_LABEL = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"

    # A list of useless phrases to remove to reduce noise for price prediction
    REMOVALS = [
        '"Batteries Included?": "No"',
        '"Batteries Included?": "Yes"',
        '"Batteries Required?": "No"',
        '"Batteries Required?": "Yes"',
        "By Manufacturer",
        "Item",
        "Date First",
        "Package",
        ":",
        "Number of",
        "Best Sellers",
        "Number",
        "Product ",
    ]

    # Attributes for each item
    title: str
    price: float
    category: str  # Declared here but never assigned inside this class
    token_count: int = 0  # How many tokens in the final prompt

    # Optional fields
    details: Optional[str]  # The value can be a string or can be None
    prompt: Optional[str] = None  # Stays None unless parse() accepts the item
    include = False  # Whether to keep the item or not

    def __init__(self, data, price):
        """
        Store the title and price, then parse the remaining fields.

        data must provide the keys 'title', 'description', 'features' and
        'details'; price is the numeric target written into the prompt.
        """
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Return self.details with every phrase in REMOVALS deleted.

        Details often contain repeated specs or boilerplate text. Expects
        self.details to be a non-empty string (parse() only calls this when
        details is truthy).
        """
        details = self.details
        for remove in self.REMOVALS:
            details = details.replace(remove, "")
        return details

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace.

        Also remove words that are 7+ chars and contain numbers, as these are
        likely irrelevant product numbers.
        """
        # Collapse runs of punctuation/bracket/whitespace characters into one space
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        # Tidy the comma patterns left behind by the substitution above
        stuff = stuff.replace(" ,", ",").replace(",,,", ",").replace(",,", ",")
        words = stuff.split(' ')
        select = [
            word for word in words
            if len(word) < 7 or not any(char.isdigit() for char in word)
        ]
        return " ".join(select)

    def parse(self, data):
        """
        Prepare the text, check its length, tokenize it, and set include = True if it's valid.

        Items whose combined text is not longer than MIN_CHARS, or whose cleaned
        text does not exceed MIN_TOKENS tokens, are left with include = False
        and prompt = None.
        """
        # Build a full contents string by combining description, features, and cleaned details.
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'

        # Too little raw text: reject the item.
        if len(contents) <= MIN_CHARS:
            return

        # Trim to the character ceiling before the (more expensive) cleaning and tokenizing.
        contents = contents[:CEILING_CHARS]

        # Clean and tokenize text, then check token count.
        text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) <= MIN_TOKENS:
            return

        # Truncate tokens, decode them back and create the training prompt
        tokens = tokens[:MAX_TOKENS]
        text = self.tokenizer.decode(tokens)
        self.make_prompt(text)
        # Mark the item as valid and ready to be used in training.
        # Only items with more than MIN_TOKENS tokens get here; longer texts
        # were cut down to MAX_TOKENS above.
        self.include = True

    def make_prompt(self, text):
        """
        Build the training prompt from the question, text, and price, then count its tokens.

        Sets self.prompt and self.token_count.
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        # The price is written as a whole-dollar amount followed by ".00".
        # Python's round() rounds exact halves to the nearest even integer.
        self.prompt += f"{self.PRICE_LABEL}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Return the prompt without the actual price, useful for testing/inference.

        The prompt is cut at the LAST occurrence of PRICE_LABEL, which is the one
        appended by make_prompt(); an earlier "Price is $" inside the product
        text therefore no longer truncates the prompt. Requires that
        make_prompt() has run (self.prompt is None for rejected items).
        """
        return self.prompt.rsplit(self.PRICE_LABEL, 1)[0] + self.PRICE_LABEL

    def __repr__(self):
        """
        Define how the Item looks when printed: it shows the title and price.
        """
        return f"<{self.title} = ${self.price}>"