LLM_Engineering_OLD/week6/community-contributions/solisoma/end_of_week_assesment.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8153067f",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import json\n",
"from openai import OpenAI\n",
"import tiktoken\n",
"from dotenv import load_dotenv\n",
"import math\n",
"import matplotlib.pyplot as plt\n",
"from huggingface_hub import login\n",
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from transformers import AutoTokenizer\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import ast"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "939441c1",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')\n",
"\n",
"OUTLIER_EXECUTED = False\n",
"BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
"\n",
"# This is the my fine-tuned model you can use it or decide to train your own\n",
"FINE_TUNED_MODEL = \"ft:gpt-4o-mini-2024-07-18:quicksearch-plus::CV6dqS5l\"\n",
"GREEN = \"\\033[92m\"\n",
"YELLOW = \"\\033[93m\"\n",
"RED = \"\\033[91m\"\n",
"RESET = \"\\033[0m\"\n",
"COLOR_MAP = {\"red\": RED, \"orange\": YELLOW, \"green\": GREEN}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce01a10",
"metadata": {},
"outputs": [],
"source": [
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c9e57b0",
"metadata": {},
"outputs": [],
"source": [
"dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", f\"raw_meta_Appliances\", split=\"full\", trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64670e7f",
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(dataset,columns=[\"main_category\", \"title\", \"description\", \"features\", \"details\", \"price\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2c34577",
"metadata": {},
"outputs": [],
"source": [
"data[\"title\"] = data[\"title\"].apply(str)\n",
"data[\"description\"] = data[\"description\"].apply(str)\n",
"data[\"features\"] = data[\"features\"].apply(str)\n",
"\n",
"# Replace \"None\" and [] with None \n",
"data[\"price\"] = data[\"price\"].replace(\"None\", None)\n",
"data[\"title\"] = data[\"title\"].replace(\"\", None)\n",
"data[\"description\"] = data[\"description\"].replace(\"[]\", None)\n",
"data[\"features\"] = data[\"features\"].replace(\"[]\", None)\n"
]
},
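{
"cell_type": "code",
"execution_count": null,
"id": "b1e4a2d7",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (added): count missing values per column before the\n",
"# dropna() in the next cell, so it is clear how much data is lost\n",
"print(data.isna().sum())\n",
"print(f\"Rows before dropna: {len(data)}\")"
]
},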
{
"cell_type": "code",
"execution_count": null,
"id": "f99208b5",
"metadata": {},
"outputs": [],
"source": [
"data = data.dropna()\n",
"data[\"price\"] = data[\"price\"].apply(float)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c42b5c9",
"metadata": {},
"outputs": [],
"source": [
"data = data.drop_duplicates(subset=[\"title\", \"description\",\"price\"])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73856ce5",
"metadata": {},
"outputs": [],
"source": [
"# Handle outliers\n",
"# To do that we use the interquartile range\n",
"# First we need to calculate the first and third quartiles\n",
"# Make sure to run this just once \n",
"\n",
"q1 = data[\"price\"].quantile(0.25)\n",
"q3 = data[\"price\"].quantile(0.75)\n",
"iqr = q3 - q1\n",
"\n",
"lower_bound = q1 - 1.5 * iqr\n",
"higher_bound = q3 + 1.5 * iqr\n",
"\n",
"if not OUTLIER_EXECUTED:\n",
" OUTLIER_EXECUTED = True\n",
" data = data[(data[\"price\"] >= lower_bound) & (data[\"price\"] <= higher_bound) & (data[\"price\"] > 0)]\n",
"else:\n",
" print(\"Outlier already executed\")\n"
]
},
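{
"cell_type": "code",
"execution_count": null,
"id": "c7d2f810",
"metadata": {},
"outputs": [],
"source": [
"# Quick check (added): report the IQR bounds computed above and plot the\n",
"# resulting price distribution. With the usual k = 1.5 rule, any price\n",
"# outside [q1 - 1.5*iqr, q3 + 1.5*iqr] was treated as an outlier.\n",
"print(f\"q1=${q1:,.2f}  q3=${q3:,.2f}  iqr=${iqr:,.2f}\")\n",
"print(f\"Kept prices in [{max(lower_bound, 0):,.2f}, {higher_bound:,.2f}] -> {len(data)} rows\")\n",
"\n",
"data[\"price\"].plot(kind=\"hist\", bins=50, title=\"Price distribution after outlier removal\")\n",
"plt.xlabel(\"Price ($)\")\n",
"plt.show()"
]
},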
{
"cell_type": "code",
"execution_count": null,
"id": "567b9e4b",
"metadata": {},
"outputs": [],
"source": [
"#Further cleansing of the data (dealing with lists and dicts)\n",
"def clean_list_string(field):\n",
" \"\"\"Convert string representation of list to clean string\"\"\"\n",
" try:\n",
" # Try to parse as literal list\n",
" if field.startswith('[') and field.endswith(']'):\n",
" parsed = ast.literal_eval(field)\n",
" return ' '.join(str(item) for item in parsed)\n",
" except:\n",
" pass\n",
" return str(field)\n",
"\n",
"def clean_dict_string(field):\n",
" \"\"\"Convert string representation of dict to clean string\"\"\"\n",
" try:\n",
" # Try to parse as literal dict\n",
" if field.startswith('{') and field.endswith('}'):\n",
" parsed = ast.literal_eval(field)\n",
" parts = []\n",
" for key, value in parsed.items():\n",
" if isinstance(value, dict):\n",
" value = ', '.join(f\"{k}: {v}\" for k, v in value.items())\n",
" parts.append(f\"{key}: {value}\")\n",
" return ' | '.join(parts)\n",
" except:\n",
" pass\n",
" return str(field)\n",
"\n",
"\n",
"data[\"description\"] = data[\"description\"].apply(clean_list_string)\n",
"data[\"features\"] = data[\"features\"].apply(clean_list_string)\n",
"data[\"details\"] = data[\"details\"].apply(clean_dict_string)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0011b3fa",
"metadata": {},
"outputs": [],
"source": [
"SYSTEM_PROMPT = \"\"\"\n",
"You are a price prediction expert. Given a product's title, description, features, or details, predict its price in USD.\n",
"\n",
"Rules:\n",
"1. Analyze all available product information carefully\n",
"2. If information is incomplete or truncated, use your knowledge of similar products and market pricing to make informed predictions\n",
"3. Consider product quality indicators, brand reputation, features, and typical market values\n",
"4. Return ONLY the numeric price (e.g., \"29.99\") \n",
"5. Do not include currency symbols, explanations, or additional text\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "043cb9d7",
"metadata": {},
"outputs": [],
"source": [
"def truncate_by_tokens(text, max_tokens=300):\n",
" \"\"\"Truncate to max tokens\"\"\"\n",
" encoding = tiktoken.encoding_for_model(\"gpt-4o-mini\")\n",
" tokens = encoding.encode(text)\n",
" \n",
" if len(tokens) <= max_tokens:\n",
" return text\n",
" \n",
" truncated_tokens = tokens[:max_tokens]\n",
" return encoding.decode(truncated_tokens)\n",
"\n",
"def generate_prompt(data):\n",
" \"\"\"\n",
" Generate a prompt for the model to predict the price of a product\n",
" \"\"\"\n",
"\n",
" prompt = f\"\"\"\n",
" Below are the details of the product: \n",
" Title: {data['title']}\n",
" Description: {data['description']}\n",
" Features: {data['features']}\n",
" \"\"\"\n",
" return truncate_by_tokens(prompt)\n",
"\n",
"def generate_message(data):\n",
" \"\"\"\n",
" Generate a message for the model to predict the price of a product\n",
" \"\"\"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": data[\"prompt\"]},\n",
" {\"role\": \"assistant\", \"content\": str(data['price'])}\n",
" ]\n",
" return messages\n"
]
},
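{
"cell_type": "code",
"execution_count": null,
"id": "d93a5e41",
"metadata": {},
"outputs": [],
"source": [
"# Preview (added): print the prompt generated for one sample row, to\n",
"# verify the format and the token truncation before building the\n",
"# \"prompt\" column in the next cell\n",
"print(generate_prompt(data.iloc[0]))"
]
},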
{
"cell_type": "code",
"execution_count": null,
"id": "cdc8e3ff",
"metadata": {},
"outputs": [],
"source": [
"data[\"prompt\"] = data.apply(lambda x: generate_prompt(x), axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d1837c7",
"metadata": {},
"outputs": [],
"source": [
"train_data = data.sample(n=200, random_state=42)\n",
"train_set = train_data.sample(frac=0.8, random_state=42)\n",
"test_set = train_data.drop(train_set.index)"
]
},
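{
"cell_type": "code",
"execution_count": null,
"id": "9f3b7a52",
"metadata": {},
"outputs": [],
"source": [
"# Check (added): confirm the 80/20 split of the 200 sampled rows\n",
"print(f\"train: {len(train_set)} rows, validation: {len(test_set)} rows\")"
]
},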
{
"cell_type": "code",
"execution_count": null,
"id": "7abfec95",
"metadata": {},
"outputs": [],
"source": [
"# Create a jsonl file for the training set\n",
"\n",
"with open('training_data.jsonl', 'w') as f:\n",
" for index, row in train_set.iterrows():\n",
" messages = {\"messages\": generate_message(row)}\n",
" f.write(json.dumps(messages) + '\\n')\n",
"\n",
"with open('validation_data.jsonl', 'w') as f:\n",
" for index, row in test_set.iterrows():\n",
" messages = {\"messages\": generate_message(row)}\n",
" f.write(json.dumps(messages) + '\\n')\n"
]
},
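{
"cell_type": "code",
"execution_count": null,
"id": "e58b91c3",
"metadata": {},
"outputs": [],
"source": [
"# Validation (added): read back the first training example to confirm\n",
"# each JSONL line holds a system/user/assistant message list, the format\n",
"# expected by OpenAI chat fine-tuning\n",
"with open('training_data.jsonl') as f:\n",
"    first = json.loads(f.readline())\n",
"for msg in first[\"messages\"]:\n",
"    print(f\"{msg['role']}: {msg['content'][:80]}\")"
]
},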
{
"cell_type": "code",
"execution_count": null,
"id": "a69291cb",
"metadata": {},
"outputs": [],
"source": [
"client = OpenAI()\n",
"\n",
"# Uncoment the following code to train your own model\n",
"\n",
"# print(\"Uploading training file...\")\n",
"# training_file = client.files.create(\n",
"# file=open('training_data.jsonl', 'rb'),\n",
"# purpose='fine-tune'\n",
"# )\n",
"# print(f\"File uploaded: {training_file.id}\")\n",
"\n",
"# print(\"Uploading validation file...\")\n",
"# validation_file = client.files.create(\n",
"# file=open('validation_data.jsonl', 'rb'),\n",
"# purpose='fine-tune'\n",
"# )\n",
"# print(f\"Validation file uploaded: {validation_file.id}\")\n",
"\n",
"# print(\"Starting fine-tuning...\")\n",
"# job = client.fine_tuning.jobs.create(\n",
"# validation_file=validation_file.id,\n",
"# training_file=training_file.id,\n",
"# model='gpt-4o-mini-2024-07-18'\n",
"# )\n",
"# print(f\"Job created: {job.id}\")\n",
"\n",
"# status = client.fine_tuning.jobs.retrieve(job.id)\n",
"# print(f\"Status: {status.status}\")\n",
"\n",
"# import time\n",
"# while status.status not in ['succeeded', 'failed']:\n",
"# time.sleep(60)\n",
"# status = client.fine_tuning.jobs.retrieve(job.id)\n",
"# print(f\"Status: {status.status}\")\n",
"\n",
"# if status.status == 'succeeded':\n",
"# print(f\"Model ready: {status.fine_tuned_model}\")\n",
"# else:\n",
"# print(f\"Training failed: {status.error}\")"
]
},
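{
"cell_type": "code",
"execution_count": null,
"id": "f4a7c2b9",
"metadata": {},
"outputs": [],
"source": [
"# Optional (added): instead of only polling the job status, you can list\n",
"# the job's progress events (training loss, checkpoints, etc.). This\n",
"# assumes the `job` variable from the commented-out cell above, so\n",
"# uncomment it together with that code.\n",
"\n",
"# events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job.id, limit=10)\n",
"# for event in events.data:\n",
"#     print(event.message)"
]
},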
{
"cell_type": "code",
"execution_count": null,
"id": "1c0dfc1d",
"metadata": {},
"outputs": [],
"source": [
"class PriceTester:\n",
" \n",
" def __init__(self, predictor, data, title=\"Price Prediction Model\", size=None):\n",
" \"\"\"\n",
" predictor: function that takes a row and returns predicted price\n",
" data: pandas DataFrame with test data\n",
" \"\"\"\n",
" self.predictor = predictor\n",
" self.data = data\n",
" self.title = title\n",
" self.size = size or len(data)\n",
" self.guesses = []\n",
" self.truths = []\n",
" self.errors = []\n",
" self.sles = []\n",
" self.colors = []\n",
" \n",
" def color_for(self, error, truth):\n",
" \"\"\"Determine color based on error\"\"\"\n",
" if error < 40 or error/truth < 0.2:\n",
" return \"green\"\n",
" elif error < 80 or error/truth < 0.4:\n",
" return \"orange\"\n",
" else:\n",
" return \"red\"\n",
" \n",
" def run_datapoint(self, i):\n",
" \"\"\"Test single datapoint\"\"\"\n",
" row = self.data.iloc[i]\n",
" \n",
" # Get prediction\n",
" predict = self.predictor(row)\n",
" \n",
" # Try to convert to float, skip if fails\n",
" try:\n",
" guess = float(predict)\n",
" except (ValueError, TypeError):\n",
" print(f\"{YELLOW}{i+1}: Skipped - Non-numeric response: {predict[:50]}...{RESET}\")\n",
" return # Skip this datapoint\n",
" \n",
" truth = float(row['price']) \n",
" \n",
" # Calculate metrics\n",
" error = abs(guess - truth)\n",
" log_error = math.log(truth + 1) - math.log(guess + 1)\n",
" sle = log_error ** 2\n",
" color = self.color_for(error, truth)\n",
" \n",
" # Get title for display\n",
" title = row['title'] if len(row['title']) <= 40 else row['title'][:40] + \"...\"\n",
" \n",
" # Store results\n",
" self.guesses.append(guess)\n",
" self.truths.append(truth)\n",
" self.errors.append(error)\n",
" self.sles.append(sle)\n",
" self.colors.append(color)\n",
" \n",
" # Print result\n",
" print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:.4f} Item: {title}{RESET}\")\n",
" \n",
" def chart(self, title):\n",
" \"\"\"Create scatter plot of predictions vs truth\"\"\"\n",
" plt.figure(figsize=(12, 8))\n",
" max_val = max(max(self.truths), max(self.guesses))\n",
" \n",
" # Perfect prediction line\n",
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6, label='Perfect Prediction')\n",
" \n",
" # Scatter plot\n",
" plt.scatter(self.truths, self.guesses, s=30, c=self.colors, alpha=0.6)\n",
" \n",
" plt.xlabel('Ground Truth Price ($)', fontsize=12)\n",
" plt.ylabel('Predicted Price ($)', fontsize=12)\n",
" plt.xlim(0, max_val)\n",
" plt.ylim(0, max_val)\n",
" plt.title(title, fontsize=14)\n",
" plt.legend()\n",
" plt.grid(True, alpha=0.3)\n",
" plt.show()\n",
" \n",
" def report(self):\n",
" \"\"\"Generate final report with metrics\"\"\"\n",
" average_error = sum(self.errors) / self.size\n",
" rmsle = math.sqrt(sum(self.sles) / self.size)\n",
" hits = sum(1 for color in self.colors if color == \"green\")\n",
" hit_rate = hits / self.size * 100\n",
" \n",
" # Print summary\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"FINAL REPORT: {self.title}\")\n",
" print(f\"{'='*60}\")\n",
" print(f\"Total Predictions: {self.size}\")\n",
" print(f\"Average Error: ${average_error:,.2f}\")\n",
" print(f\"RMSLE: {rmsle:.4f}\")\n",
" print(f\"Hit Rate (Green): {hit_rate:.1f}% ({hits}/{self.size})\")\n",
" print(f\"{'='*60}\\n\")\n",
" \n",
" # Create chart\n",
" chart_title = f\"{self.title}\\nError=${average_error:,.2f} | RMSLE={rmsle:.4f} | Hits={hit_rate:.1f}%\"\n",
" self.chart(chart_title)\n",
" \n",
" # Return metrics\n",
" return {\n",
" 'average_error': average_error,\n",
" 'rmsle': rmsle,\n",
" 'hit_rate': hit_rate,\n",
" 'hits': hits,\n",
" 'guesses': self.guesses,\n",
" 'truths': self.truths,\n",
" 'errors': self.errors,\n",
" 'sles': self.sles,\n",
" 'colors': self.colors\n",
" }\n",
" \n",
" def run(self):\n",
" \"\"\"Run test on all datapoints\"\"\"\n",
" print(f\"Testing {self.size} predictions...\\n\")\n",
" \n",
" for i in range(self.size):\n",
" self.run_datapoint(i)\n",
" \n",
" return self.report()\n",
" \n",
" @classmethod\n",
" def test(cls, predictor, data, title=\"Price Prediction Model\", size=None):\n",
" \"\"\"Quick test method\"\"\"\n",
" return cls(predictor, data, title, size).run()"
]
},
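{
"cell_type": "code",
"execution_count": null,
"id": "a2c9e617",
"metadata": {},
"outputs": [],
"source": [
"# Baseline (added): a trivial predictor that always guesses the median\n",
"# training price. It costs no API calls and gives a floor the fine-tuned\n",
"# model should comfortably beat; a sketch for comparison, not part of\n",
"# the original assessment.\n",
"median_price = train_set[\"price\"].median()\n",
"\n",
"def baseline_predictor(row):\n",
"    return median_price\n",
"\n",
"# Example:\n",
"# PriceTester.test(baseline_predictor, test_set, title=\"Median Baseline\")"
]
},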
{
"cell_type": "code",
"execution_count": null,
"id": "4cc250e6",
"metadata": {},
"outputs": [],
"source": [
"def predictor(data):\n",
" user_prompt = data[\"description\"] \n",
" if not user_prompt or user_prompt.strip() == \"\":\n",
" print(\"Warning: Empty prompt!\")\n",
" return data[\"price\"]\n",
"\n",
" test = client.chat.completions.create(\n",
" # uncomment this line to use your own model\n",
" # model=status.fine_tuned_model, \n",
" model=FINE_TUNED_MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" )\n",
"\n",
" return test.choices[0].message.content\n",
"\n",
"\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "297f1aed",
"metadata": {},
"outputs": [],
"source": [
"test_array = data.sample(n=300, random_state=42)\n",
"result = PriceTester.test(predictor, test_array, title=\"GPT-4o-mini Fine-tuned\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}