Organize week 6 files in lisekarimi folder
1500
week6/community-contributions/lisekarimi/data/human_output.csv
Normal file
File diff suppressed because it is too large
120
week6/community-contributions/lisekarimi/helpers/items.py
Normal file
@@ -0,0 +1,120 @@
from typing import Optional  # A variable might be a certain type or None
from transformers import AutoTokenizer
import re

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"

MIN_TOKENS = 150  # Minimum tokens required to accept an item
MAX_TOKENS = 160  # We limit to 160 tokens so that after adding prompt text, the total stays around 180 tokens

MIN_CHARS = 300  # Reject items with fewer than 300 characters
CEILING_CHARS = MAX_TOKENS * 7  # Truncate long text to about 1120 characters (approx 160 tokens)


class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price
    """

    # Load the tokenizer for the model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

    # Define PRICE_LABEL and the question for the training prompt
    PRICE_LABEL = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"

    # A list of useless phrases to remove to reduce noise for price prediction
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]

    # Attributes for each item
    title: str
    price: float
    category: str
    token_count: int = 0  # How many tokens in the final prompt

    # Optional fields
    details: Optional[str]  # The value can be a string or None
    prompt: Optional[str] = None
    include = False  # Whether to keep the item or not

    def __init__(self, data, price):
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Remove useless phrases from details, which often contains repeated specs or boilerplate text.
        """
        details = self.details
        for remove in self.REMOVALS:
            details = details.replace(remove, "")
        return details

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace.
        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers.
        """
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        stuff = stuff.replace(" ,", ",").replace(",,,", ",").replace(",,", ",")
        words = stuff.split(' ')
        select = [word for word in words if len(word) < 7 or not any(char.isdigit() for char in word)]
        return " ".join(select)

    def parse(self, data):
        """
        Prepare the text, check length, tokenize it, and set include = True if it's valid.
        """
        # Build a full contents string by combining description, features, and cleaned details
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'

        # If the content is long enough, trim it to the max character limit before processing
        if len(contents) > MIN_CHARS:
            contents = contents[:CEILING_CHARS]

            # Clean and tokenize the text, then check the token count
            text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)

            if len(tokens) > MIN_TOKENS:
                # Truncate the tokens, decode them back and create the training prompt
                tokens = tokens[:MAX_TOKENS]
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)

                # Mark the item as valid and ready to be used in training
                self.include = True  # Only items with more than MIN_TOKENS tokens are kept (then truncated to MAX_TOKENS)

    def make_prompt(self, text):
        """
        Build the training prompt using the question, text, and price, then count the tokens.
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PRICE_LABEL}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Return the prompt without the actual price, useful for testing/inference.
        """
        return self.prompt.split(self.PRICE_LABEL)[0] + self.PRICE_LABEL

    def __repr__(self):
        """
        Define how the Item object looks when printed: it shows the title and price.
        """
        return f"<{self.title} = ${self.price}>"
106
week6/community-contributions/lisekarimi/helpers/loaders.py
Normal file
@@ -0,0 +1,106 @@
from datetime import datetime  # Measure how long loading takes
from tqdm import tqdm  # Shows a progress bar while processing data
from datasets import load_dataset  # Load a dataset from the Hugging Face Hub
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor  # For parallel processing (speed)
from items import Item

CHUNK_SIZE = 1000  # Process the dataset in chunks of 1000 datapoints at a time (for efficiency)
MIN_PRICE = 0.5
MAX_PRICE = 999.49
WORKER = 4  # Set the number of workers here


class ItemLoader:

    def __init__(self, name):
        """
        Initialize the loader with a dataset name.
        """
        self.name = name  # Store the category name
        self.dataset = None  # Placeholder for the dataset (we load it later in load())

    def process_chunk(self, chunk):
        """
        Convert a chunk of datapoints into valid Item objects.
        """
        batch = []  # Initialize the list to hold valid items

        # Loop through each datapoint in the chunk
        for datapoint in chunk:
            try:
                # Extract price from datapoint
                price_str = datapoint['price']
                if price_str:
                    price = float(price_str)

                    # Check if price is within valid range
                    if MIN_PRICE <= price <= MAX_PRICE:
                        item = Item(datapoint, price)

                        # Keep only valid items
                        if item.include:
                            batch.append(item)
            except ValueError:
                continue  # Skip datapoints with invalid price format
        return batch  # Return the list of valid items

    def load_in_parallel(self, workers):
        """
        Split the dataset into chunks and process them in parallel.
        """
        results = []
        size = len(self.dataset)
        chunk_count = (size // CHUNK_SIZE) + 1

        # Build chunks directly here (no separate function)
        chunks = [
            self.dataset.select(range(i, min(i + CHUNK_SIZE, size)))
            for i in range(0, size, CHUNK_SIZE)
        ]

        # Process chunks in parallel using multiple CPU cores
        with ProcessPoolExecutor(max_workers=workers) as pool:
            for batch in tqdm(pool.map(self.process_chunk, chunks), total=chunk_count):
                results.extend(batch)

        # Add the category name to each result
        for result in results:
            result.category = self.name

        return results

    def load(self, workers=WORKER):
        """
        Load and process the dataset, returning valid items.
        """
        # Record start time
        start = datetime.now()

        # Print loading message
        print(f"Loading dataset {self.name}", flush=True)

        # Load dataset from Hugging Face (based on category name)
        self.dataset = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_meta_{self.name}",
            split="full",
            trust_remote_code=True
        )

        # Process the dataset in parallel and collect valid items
        results = self.load_in_parallel(workers)

        # Record end time and print summary
        finish = datetime.now()
        print(
            f"Completed {self.name} with {len(results):,} datapoints in {(finish-start).total_seconds()/60:.1f} mins",
            flush=True
        )

        # Return the list of valid items
        return results
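A short usage sketch for ItemLoader, not part of the file above: the "Appliances" category name is an example of a raw_meta_* subset, and running this downloads the full split and loads the tokenizer from items.py.

# Hypothetical driver script; the __main__ guard lets ProcessPoolExecutor spawn workers safely
if __name__ == "__main__":
    loader = ItemLoader("Appliances")   # any raw_meta_* category name from the dataset
    items = loader.load(workers=4)      # downloads the split, then filters chunks in parallel
    print(f"Kept {len(items):,} curated items")
    if items:
        print(items[0])                 # __repr__ shows "<title = $price>"
        print(items[0].category)        # "Appliances", set by load_in_parallel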
84
week6/community-contributions/lisekarimi/helpers/testing.py
Normal file
@@ -0,0 +1,84 @@
import math
import matplotlib.pyplot as plt

# ANSI color codes for optional console output
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}


class Tester:
    """
    Runs a predictor function over test datapoints and charts guesses against ground truth.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        # Classify a prediction: green = accurate, orange = medium error, red = high error
        if error < 40 or error / truth < 0.2:
            return "green"
        elif error < 80 or error / truth < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        # Predict one datapoint and record the guess, truth, absolute error, squared log error and color
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint["price"]
        error = abs(guess - truth)
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        title = datapoint["text"][:40] + "..." if len(datapoint["text"]) > 40 else datapoint["text"]
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        # print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        # Scatter plot of ground truth vs. model estimate, with a diagonal reference line
        max_error = max(self.errors)
        plt.figure(figsize=(15, 6))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)

        # Add color legend
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], marker='o', color='w', label='Accurate (green)', markerfacecolor='green', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='Medium error (orange)', markerfacecolor='orange', markersize=8),
            Line2D([0], [0], marker='o', color='w', label='High error (red)', markerfacecolor='red', markersize=8)
        ]
        plt.legend(handles=legend_elements, loc='upper left')
        plt.show()

    def report(self):
        # Compute average error, RMSLE and hit rate, then draw the chart
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        cls(function, data).run()
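A self-contained usage sketch for Tester, not part of the file above: the predictor and the 250 synthetic datapoints are invented for illustration and only mirror the "text" and "price" keys that run_datapoint reads.

import random

# Hypothetical synthetic test set: 250 dicts with the "text" and "price" keys Tester expects
random.seed(42)
test_data = [{"text": f"Sample product {i}", "price": round(random.uniform(1, 999), 2)}
             for i in range(250)]

# Hypothetical baseline predictor: always guess the mean price of the test set
def average_pricer(datapoint):
    return sum(d["price"] for d in test_data) / len(test_data)

Tester.test(average_pricer, test_data)  # draws the scatter chart titled "Average Pricer Error=... RMSLE=... Hits=...%"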