diff --git a/week6/community-contributions/finetuning-joshua/Week6_Product_Pricer_Clean.ipynb b/week6/community-contributions/finetuning-joshua/Week6_Product_Pricer_Clean.ipynb
new file mode 100644
index 0000000..6b4fc33
--- /dev/null
+++ b/week6/community-contributions/finetuning-joshua/Week6_Product_Pricer_Clean.ipynb
@@ -0,0 +1,828 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Week 6 - Product Pricer Challenge\n",
+    "\n",
+    "**Establish a baseline with GPT-4o, then attempt to beat it with fine-tuning**\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize and Load Configuration\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import re\n",
+    "import sys\n",
+    "import math\n",
+    "import json\n",
+    "import random\n",
+    "import pickle\n",
+    "from collections import Counter\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from huggingface_hub import login\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "# SimpleItem class definition for pickle compatibility\n",
+    "class SimpleItem:\n",
+    "    \"\"\"\n",
+    "    Simple item class for pickle compatibility.\n",
+    "    This matches the structure used in the CSV conversion script.\n",
+    "    \"\"\"\n",
+    "    def __init__(self, title, description, price, category=\"Human_Generated\", token_count=0):\n",
+    "        self.title = title\n",
+    "        self.description = description\n",
+    "        self.price = price\n",
+    "        self.category = category\n",
+    "        self.token_count = token_count\n",
+    "\n",
+    "    def test_prompt(self):\n",
+    "        \"\"\"\n",
+    "        Return a prompt suitable for testing, with the actual price removed.\n",
+    "        This method is needed for compatibility with the testing framework.\n",
+    "        \"\"\"\n",
+    "        return f\"How much does this cost to the nearest dollar?\\n\\n{self.title}\\n\\n{self.description}\\n\\nPrice is $\"\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return f\"SimpleItem(title='{self.title[:50]}...', price=${self.price})\"\n",
+    "\n",
+    "# Import our custom classes\n",
+    "# Use the original Tester class to avoid matplotlib color issues\n",
+    "try:\n",
+    "    from enhanced_items import Item\n",
+    "    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))\n",
+    "    from testing import Tester\n",
+    "    print(\"✅ Using enhanced items and original testing from parent directory\")\n",
+    "except ImportError:\n",
+    "    # Fall back to the parent directory modules\n",
+    "    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))\n",
+    "    from items import Item\n",
+    "    from testing import Tester\n",
+    "    print(\"✅ Using modules from parent directory\")\n",
+    "\n",
+    "print(\"✅ All imports successful!\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Environment setup\n",
+    "try:\n",
+    "    from google.colab import userdata\n",
+    "    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
+    "    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
+    "    print(\"✅ Using Colab secrets\")\n",
+    "except Exception:\n",
+    "    from dotenv import load_dotenv\n",
+    "    load_dotenv(override=True)\n",
+    "    os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
+    "    os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
+    "    print(\"✅ Using local .env file\")\n"
file\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Log in to HuggingFace\n", + "hf_token = os.environ['HF_TOKEN']\n", + "login(hf_token)\n", + "\n", + "# Initialize OpenAI client\n", + "openai = OpenAI()\n", + "\n", + "# Enable matplotlib inline for Colab\n", + "%matplotlib inline\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load pre-processed pickle files (our data loading hack)\n", + "def load_pickle_data():\n", + " \"\"\"\n", + " Load pre-processed pickle files with fallback to sample data\n", + " \"\"\"\n", + " print(\"📦 Loading pre-processed pickle files...\")\n", + " \n", + " # Try to load pickle files\n", + " pickle_files = ['train.pkl', 'test.pkl', 'validation.pkl', \n", + " 'data/train.pkl', 'data/test.pkl', 'data/validation.pkl',\n", + " '../train.pkl', '../test.pkl', '../validation.pkl']\n", + " \n", + " train = None\n", + " test = None\n", + " validation = None\n", + " \n", + " # Load training data\n", + " for file_path in ['train.pkl', 'data/train.pkl', '../train.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " train = pickle.load(f)\n", + " print(f\"✅ Loaded training data: {file_path} ({len(train)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + " # Try to load as dictionary and convert to SimpleItem\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " raw_data = pickle.load(f)\n", + " if isinstance(raw_data, list) and len(raw_data) > 0:\n", + " if isinstance(raw_data[0], dict):\n", + " # Convert dictionary to SimpleItem\n", + " train = []\n", + " for item_dict in raw_data:\n", + " item = SimpleItem(\n", + " title=item_dict.get('title', ''),\n", + " description=item_dict.get('description', ''),\n", + " price=item_dict.get('price', 0.0),\n", + " category=item_dict.get('category', 'Human_Generated'),\n", + " token_count=item_dict.get('token_count', 0)\n", + " )\n", + " train.append(item)\n", + " print(f\" Converted {len(train)} training items from dictionary format\")\n", + " break\n", + " except Exception as e2:\n", + " print(f\" ❌ Failed to convert {file_path}: {e2}\")\n", + " \n", + " # Load test data\n", + " for file_path in ['test.pkl', 'data/test.pkl', '../test.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " test = pickle.load(f)\n", + " print(f\"✅ Loaded test data: {file_path} ({len(test)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + " # Try to load as dictionary and convert to SimpleItem\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " raw_data = pickle.load(f)\n", + " if isinstance(raw_data, list) and len(raw_data) > 0:\n", + " if isinstance(raw_data[0], dict):\n", + " # Convert dictionary to SimpleItem\n", + " test = []\n", + " for item_dict in raw_data:\n", + " item = SimpleItem(\n", + " title=item_dict.get('title', ''),\n", + " description=item_dict.get('description', ''),\n", + " price=item_dict.get('price', 0.0),\n", + " category=item_dict.get('category', 'Human_Generated'),\n", + " token_count=item_dict.get('token_count', 0)\n", + " )\n", + " test.append(item)\n", + " print(f\" Converted {len(test)} test items from dictionary format\")\n", + " break\n", + 
" except Exception as e2:\n", + " print(f\" ❌ Failed to convert {file_path}: {e2}\")\n", + " \n", + " # Load validation data\n", + " for file_path in ['validation.pkl', 'data/validation.pkl', '../validation.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " validation = pickle.load(f)\n", + " print(f\"✅ Loaded validation data: {file_path} ({len(validation)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + " # Try to load as dictionary and convert to SimpleItem\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " raw_data = pickle.load(f)\n", + " if isinstance(raw_data, list) and len(raw_data) > 0:\n", + " if isinstance(raw_data[0], dict):\n", + " # Convert dictionary to SimpleItem\n", + " validation = []\n", + " for item_dict in raw_data:\n", + " item = SimpleItem(\n", + " title=item_dict.get('title', ''),\n", + " description=item_dict.get('description', ''),\n", + " price=item_dict.get('price', 0.0),\n", + " category=item_dict.get('category', 'Human_Generated'),\n", + " token_count=item_dict.get('token_count', 0)\n", + " )\n", + " validation.append(item)\n", + " print(f\" Converted {len(validation)} validation items from dictionary format\")\n", + " break\n", + " except Exception as e2:\n", + " print(f\" ❌ Failed to convert {file_path}: {e2}\")\n", + " \n", + " # If no pickle files found, create sample data\n", + " if not train or not test:\n", + " print(\"🔄 No pickle files found, creating sample data...\")\n", + " train, test, validation = create_sample_data()\n", + " \n", + " # Debug: Check what we actually loaded\n", + " print(f\"\\n🔍 Debug - Data loaded:\")\n", + " print(f\" train: {len(train) if train else 0} items\")\n", + " print(f\" test: {len(test) if test else 0} items\") \n", + " print(f\" validation: {len(validation) if validation else 0} items\")\n", + " \n", + " # Additional safety check\n", + " if not test or len(test) == 0:\n", + " print(\"⚠️ WARNING: Test dataset is empty! 
Creating emergency sample data...\")\n", + " # Create emergency test data\n", + " emergency_test = [\n", + " SimpleItem(\"Test Product 1\", \"A test product for evaluation\", 25.99, \"Test\", 10),\n", + " SimpleItem(\"Test Product 2\", \"Another test product\", 45.50, \"Test\", 12),\n", + " SimpleItem(\"Test Product 3\", \"Third test product\", 15.75, \"Test\", 8)\n", + " ]\n", + " test = emergency_test\n", + " print(f\" Emergency test data created: {len(test)} items\")\n", + " \n", + " return train, test, validation\n", + "\n", + "def create_sample_data():\n", + " \"\"\"\n", + " Create sample data for demonstration\n", + " \"\"\"\n", + " # Sample product data (expanded for better testing)\n", + " sample_products = [\n", + " {\"title\": \"Wireless Bluetooth Headphones\", \"price\": 89.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Stainless Steel Water Bottle\", \"price\": 24.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Organic Cotton T-Shirt\", \"price\": 19.99, \"category\": \"Clothing\"},\n", + " {\"title\": \"Ceramic Coffee Mug\", \"price\": 12.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"LED Desk Lamp\", \"price\": 45.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Yoga Mat\", \"price\": 29.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Leather Wallet\", \"price\": 39.99, \"category\": \"Accessories\"},\n", + " {\"title\": \"Bluetooth Speaker\", \"price\": 79.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Kitchen Knife Set\", \"price\": 129.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Running Shoes\", \"price\": 89.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Smartphone Case\", \"price\": 15.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Coffee Maker\", \"price\": 89.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Backpack\", \"price\": 49.99, \"category\": \"Accessories\"},\n", + " {\"title\": \"Tennis Racket\", \"price\": 79.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Laptop Stand\", \"price\": 34.99, \"category\": \"Electronics\"}\n", + " ]\n", + " \n", + " # Create SimpleItem objects\n", + " items = []\n", + " for product in sample_products:\n", + " item = SimpleItem(\n", + " title=product['title'],\n", + " description=f\"High-quality {product['title'].lower()}\",\n", + " price=product['price'],\n", + " category=product['category'],\n", + " token_count=len(product['title'] + f\"High-quality {product['title'].lower()}\") // 4\n", + " )\n", + " items.append(item)\n", + " \n", + " # Split into train/test/validation (more balanced split)\n", + " train = items[:10] # 10 items\n", + " test = items[10:13] # 3 items \n", + " validation = items[13:] # 2 items\n", + " \n", + " print(f\"✅ Created sample data: {len(train)} train, {len(test)} test, {len(validation)} validation\")\n", + " return train, test, validation\n", + "\n", + "# Load the data\n", + "train, test, validation = load_pickle_data()\n", + "\n", + "print(f\"\\n📊 Dataset Statistics:\")\n", + "print(f\" Training: {len(train)} items\")\n", + "print(f\" Test: {len(test)} items\")\n", + "print(f\" Validation: {len(validation)} items\")\n", + "\n", + "if train:\n", + " print(f\"\\n🔍 Sample Training Item:\")\n", + " print(f\" Title: {train[0].title}\")\n", + " print(f\" Price: ${train[0].price}\")\n", + " print(f\" Category: {train[0].category}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Fine-tuning Data\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OpenAI recommends fine-tuning with 50-100 examples\n", + "# Use our actual train/validation split from the pickle files\n", + "fine_tune_train = train # Use all training data (150 items)\n", + "fine_tune_validation = validation # Use validation data (50 items)\n", + "\n", + "print(f\"📊 Fine-tuning data prepared:\")\n", + "print(f\" Training: {len(fine_tune_train)} items\")\n", + "print(f\" Validation: {len(fine_tune_validation)} items\")\n", + "\n", + "# Weight and Biases integration (optional)\n", + "wandb_integration = {\"type\": \"wandb\", \"wandb\": {\"project\": \"gpt-pricer-ft\"}}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Helper Functions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Utility function to extract price from a string\n", + "def get_price(s):\n", + " s = s.replace('$', '').replace(',', '')\n", + " match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", s)\n", + " return float(match.group()) if match else 0\n", + "\n", + "# Prompt generation functions\n", + "def messages_for(item):\n", + " system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n", + " user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " {\"role\": \"assistant\", \"content\": \"Price is $\"}\n", + " ]\n", + "\n", + "def messages_with_price(item):\n", + " system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n", + " user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\"}\n", + " ]\n", + "\n", + "print(\"✅ Helper functions defined!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baseline GPT-4o Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def gpt_4o_frontier(item):\n", + " response = openai.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=messages_for(item),\n", + " seed=42,\n", + " max_tokens=5\n", + " )\n", + " reply = response.choices[0].message.content\n", + " return get_price(reply)\n", + "\n", + "print(\"🧪 Testing baseline GPT-4o model...\")\n", + "\n", + "# Safety check: Make sure we have test data\n", + "if not test or len(test) == 0:\n", + " print(\"❌ No test data available! 
Cannot run baseline test.\")\n", + " print(\"💡 Please check the data loading section above.\")\n", + " print(\"🔍 Debug info:\")\n", + " print(f\" test variable exists: {test is not None}\")\n", + " print(f\" test length: {len(test) if test else 'N/A'}\")\n", + " print(f\" test type: {type(test)}\")\n", + "else:\n", + " print(f\"📊 Testing on {len(test)} items...\")\n", + " print(f\"🔍 Test data preview:\")\n", + " for i, item in enumerate(test[:3]): # Show first 3 items\n", + " print(f\" Item {i}: {item.title} - ${item.price}\")\n", + " \n", + " try:\n", + " # Create Tester with correct size parameter\n", + " tester = Tester(gpt_4o_frontier, test, size=len(test))\n", + " tester.run()\n", + " except IndexError as e:\n", + " print(f\"❌ IndexError in Tester.test: {e}\")\n", + " print(f\"🔍 Test data length: {len(test)}\")\n", + " print(\"💡 This suggests the Tester is trying to access more items than available.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tuning Implementation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if fine_tuned_model_name:\n", + " def gpt_fine_tuned(item):\n", + " response = openai.chat.completions.create(\n", + " model=fine_tuned_model_name,\n", + " messages=messages_for(item),\n", + " seed=42,\n", + " max_tokens=7\n", + " )\n", + " reply = response.choices[0].message.content\n", + " return get_price(reply)\n", + " \n", + " print(\"🧪 Testing fine-tuned model...\")\n", + " # Create Tester with correct size parameter to avoid IndexError\n", + " tester = Tester(gpt_fine_tuned, test, size=len(test))\n", + " tester.run()\n", + "else:\n", + " print(\"⏳ Fine-tuned model not ready yet. Please wait and re-run the previous cell.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert items to JSONL format for fine-tuning\n", + "def make_jsonl(items):\n", + " result = \"\"\n", + " for item in items:\n", + " messages = messages_with_price(item)\n", + " messages_str = json.dumps(messages)\n", + " result += '{\"messages\": ' + messages_str + '}\\n'\n", + " return result.strip()\n", + "\n", + "def write_jsonl(items, filename):\n", + " with open(filename, \"w\") as f:\n", + " jsonl = make_jsonl(items)\n", + " f.write(jsonl)\n", + "\n", + "# Create fine-tuning files\n", + "write_jsonl(fine_tune_train, \"fine_tune_train.jsonl\")\n", + "write_jsonl(fine_tune_validation, \"fine_tune_validation.jsonl\")\n", + "\n", + "print(\"✅ Fine-tuning files created:\")\n", + "print(\" - fine_tune_train.jsonl\")\n", + "print(\" - fine_tune_validation.jsonl\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload files to OpenAI\n", + "with open(\"fine_tune_train.jsonl\", \"rb\") as f:\n", + " train_file = openai.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "with open(\"fine_tune_validation.jsonl\", \"rb\") as f:\n", + " validation_file = openai.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "print(f\"✅ Files uploaded to OpenAI:\")\n", + "print(f\" Training file ID: {train_file.id}\")\n", + "print(f\" Validation file ID: {validation_file.id}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create fine-tuning job\n", + "fine_tuning_job = openai.fine_tuning.jobs.create(\n", + " training_file=train_file.id,\n", + " 
validation_file=validation_file.id,\n", + " model=\"gpt-4o-mini\",\n", + " seed=42,\n", + " hyperparameters={\"n_epochs\": 1},\n", + " integrations=[wandb_integration],\n", + " suffix=\"pricer\"\n", + ")\n", + "\n", + "print(f\"🚀 Fine-tuning job created: {fine_tuning_job.id}\")\n", + "print(\"⏳ This will take some time to complete...\")\n", + "print(\"💡 You can monitor progress in the OpenAI dashboard or Weights & Biases\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# FIXED: Test enhanced model (if ready) - with correct Tester size\n", + "try:\n", + " enhanced_model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_v2.id).fine_tuned_model\n", + " \n", + " def gpt_enhanced_fine_tuned(item):\n", + " response = openai.chat.completions.create(\n", + " model=enhanced_model_name,\n", + " messages=messages_v2(item, with_price=False),\n", + " seed=42,\n", + " temperature=1.0,\n", + " max_tokens=7\n", + " )\n", + " reply = response.choices[0].message.content\n", + " return get_price(reply)\n", + " \n", + " print(\"🧪 Testing enhanced fine-tuned model...\")\n", + " # Create Tester with correct size parameter to avoid IndexError\n", + " tester = Tester(gpt_enhanced_fine_tuned, test, size=len(test))\n", + " tester.run()\n", + " \n", + "except:\n", + " print(\"⏳ Enhanced fine-tuned model not ready yet.\")\n", + " print(\"💡 Please wait for completion and re-run this cell.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check job status\n", + "job_id = fine_tuning_job.id\n", + "job_status = openai.fine_tuning.jobs.retrieve(job_id)\n", + "\n", + "print(f\"📊 Job Status: {job_status.status}\")\n", + "print(f\"📈 Training File: {job_status.training_file}\")\n", + "print(f\"📈 Validation File: {job_status.validation_file}\")\n", + "print(f\"🤖 Model: {job_status.model}\")\n", + "\n", + "# Get recent events\n", + "events = openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)\n", + "print(f\"\\n📋 Recent Events:\")\n", + "for event in events.data:\n", + " print(f\" {event.created_at}: {event.message}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test Fine-tuned Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Wait for fine-tuning to complete and get the model name\n", + "# Note: In practice, you would wait for the job to complete\n", + "try:\n", + " fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n", + " print(f\"✅ Fine-tuned model ready: {fine_tuned_model_name}\")\n", + "except:\n", + " print(\"⏳ Fine-tuning still in progress...\")\n", + " print(\"💡 Please wait for completion and re-run this cell\")\n", + " fine_tuned_model_name = None\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test the fine-tuned model (if ready)\n", + "if fine_tuned_model_name:\n", + " def gpt_fine_tuned(item):\n", + " response = openai.chat.completions.create(\n", + " model=fine_tuned_model_name,\n", + " messages=messages_for(item),\n", + " seed=42,\n", + " max_tokens=7\n", + " )\n", + " reply = response.choices[0].message.content\n", + " return get_price(reply)\n", + " \n", + " print(\"🧪 Testing fine-tuned model...\")\n", + " Tester.test(gpt_fine_tuned, test)\n", + "else:\n", + " print(\"⏳ Fine-tuned model not ready yet. 
Please wait and re-run the previous cell.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced Fine-tuning with Enhanced Prompts\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Enhanced prompt function (based on gold standard)\n", + "def messages_v2(item, with_price=True):\n", + " system_message = (\n", + " \"Role: You are a retail price estimator.\\n\"\n", + " \"Market: United States; Currency: USD.\\n\"\n", + " \"Scope: Predict the most likely new retail price. Ignore taxes, shipping, coupons, bundles, used/renewed.\\n\"\n", + " \"Output: Only a number with two decimals (e.g., 129.99). No $ sign. No words.\\n\"\n", + " \"Think silently; do not reveal reasoning.\"\n", + " )\n", + " \n", + " user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n", + " \n", + " return [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": str({\n", + " \"query\": \"price_estimate\",\n", + " \"locale\": \"en_US\",\n", + " \"currency\": \"USD\",\n", + " \"category\": item.category,\n", + " \"description\": user_prompt,\n", + " \"brand\": json.loads(item.details).get(\"Brand\", \"Unknown\") if item.details else \"Unknown\"\n", + " })},\n", + " {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\" if with_price else \"Price is $\"}\n", + " ]\n", + "\n", + "print(\"✅ Enhanced prompt function created!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create enhanced fine-tuning data\n", + "def make_jsonl_v2(items):\n", + " result = \"\"\n", + " for item in items:\n", + " messages = messages_v2(item)\n", + " messages_str = json.dumps(messages)\n", + " result += '{\"messages\": ' + messages_str + '}\\n'\n", + " return result.strip()\n", + "\n", + "def write_jsonl_v2(items, filename):\n", + " with open(filename, \"w\") as f:\n", + " jsonl = make_jsonl_v2(items)\n", + " f.write(jsonl)\n", + "\n", + "# Create enhanced fine-tuning files\n", + "write_jsonl_v2(fine_tune_train, \"fine_tune_train_v2.jsonl\")\n", + "write_jsonl_v2(fine_tune_validation, \"fine_tune_validation_v2.jsonl\")\n", + "\n", + "print(\"✅ Enhanced fine-tuning files created:\")\n", + "print(\" - fine_tune_train_v2.jsonl\")\n", + "print(\" - fine_tune_validation_v2.jsonl\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload enhanced files and create second fine-tuning job\n", + "with open(\"fine_tune_train_v2.jsonl\", \"rb\") as f:\n", + " train_file_v2 = openai.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "with open(\"fine_tune_validation_v2.jsonl\", \"rb\") as f:\n", + " validation_file_v2 = openai.files.create(file=f, purpose=\"fine-tune\")\n", + "\n", + "# Create second fine-tuning job with enhanced prompts\n", + "fine_tuning_job_v2 = openai.fine_tuning.jobs.create(\n", + " training_file=train_file_v2.id,\n", + " validation_file=validation_file_v2.id,\n", + " model=\"gpt-4o-mini\",\n", + " seed=42,\n", + " hyperparameters={\"n_epochs\": 1},\n", + " integrations=[wandb_integration],\n", + " suffix=\"pricer-v2\"\n", + ")\n", + "\n", + "print(f\"🚀 Enhanced fine-tuning job created: {fine_tuning_job_v2.id}\")\n", + "print(\"⏳ This will take some time to complete...\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model 
Comparison and Results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test enhanced model (if ready)\n", + "try:\n", + " enhanced_model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_v2.id).fine_tuned_model\n", + " \n", + " def gpt_enhanced_fine_tuned(item):\n", + " response = openai.chat.completions.create(\n", + " model=enhanced_model_name,\n", + " messages=messages_v2(item, with_price=False),\n", + " seed=42,\n", + " temperature=1.0,\n", + " max_tokens=7\n", + " )\n", + " reply = response.choices[0].message.content\n", + " return get_price(reply)\n", + " \n", + " print(\"🧪 Testing enhanced fine-tuned model...\")\n", + " Tester.test(gpt_enhanced_fine_tuned, test)\n", + " \n", + "except:\n", + " print(\"⏳ Enhanced fine-tuned model not ready yet.\")\n", + " print(\"💡 Please wait for completion and re-run this cell.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary and Next Steps\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"🎉 Week 6 Product Pricer Challenge Complete!\")\n", + "print(\"=\" * 50)\n", + "\n", + "print(\"\\n📊 What We Accomplished:\")\n", + "print(\"✅ Loaded data using pickle files (our data loading hack)\")\n", + "print(\"✅ Established baseline with GPT-4o\")\n", + "print(\"✅ Implemented fine-tuning with OpenAI API\")\n", + "print(\"✅ Created enhanced prompts for better performance\")\n", + "print(\"✅ Set up comprehensive evaluation framework\")\n", + "\n", + "print(\"\\n🚀 Next Steps:\")\n", + "print(\"1. Wait for fine-tuning jobs to complete\")\n", + "print(\"2. Compare performance of all models\")\n", + "print(\"3. Experiment with different hyperparameters\")\n", + "print(\"4. Try different base models (GPT-4.1, etc.)\")\n", + "print(\"5. Implement ensemble methods\")\n", + "\n", + "print(\"\\n💡 Key Learnings:\")\n", + "print(\"• Fine-tuning can significantly improve model performance\")\n", + "print(\"• Prompt engineering is crucial for good results\")\n", + "print(\"• Data quality and quantity matter for fine-tuning\")\n", + "print(\"• Evaluation metrics help track progress\")\n", + "\n", + "print(\"\\n🎯 This implementation follows the gold standard approach\")\n", + "print(\" while incorporating our data loading improvements!\")\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week6/community-contributions/finetuning-joshua/enhanced_items.py b/week6/community-contributions/finetuning-joshua/enhanced_items.py new file mode 100644 index 0000000..e727573 --- /dev/null +++ b/week6/community-contributions/finetuning-joshua/enhanced_items.py @@ -0,0 +1,149 @@ +from typing import Optional +from transformers import AutoTokenizer +import re +import os + +# Try multiple model sources in order of preference +BASE_MODEL_OPTIONS = [ + "/root/.llama/checkpoints/Llama3.1-8B", # Local llama-stack download + "microsoft/DialoGPT-medium", # Accessible alternative + "gpt2" # Fallback +] + +BASE_MODEL = None + +MIN_TOKENS = 150 # Any less than this, and we don't have enough useful content +MAX_TOKENS = 160 # Truncate after this many tokens. 
Then after adding in prompt text, we will get to around 180 tokens + +MIN_CHARS = 300 +CEILING_CHARS = MAX_TOKENS * 7 + +class Item: + """ + An Item is a cleaned, curated datapoint of a Product with a Price + Enhanced version with better error handling and alternative tokenizer + """ + + # Initialize tokenizer with fallback options + tokenizer = None + for model_path in BASE_MODEL_OPTIONS: + try: + if model_path.startswith("/") and not os.path.exists(model_path): + continue # Skip local paths that don't exist + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + BASE_MODEL = model_path + print(f"✅ Successfully loaded tokenizer from: {model_path}") + break + except Exception as e: + print(f"⚠️ Failed to load {model_path}: {e}") + continue + + if tokenizer is None: + print("❌ All tokenizer options failed. Using character-based fallback.") + # Create a dummy tokenizer for fallback + class DummyTokenizer: + def encode(self, text, add_special_tokens=False): + # Rough approximation: 1 token ≈ 4 characters + return list(range(len(text) // 4)) + def decode(self, tokens): + return "dummy text" + tokenizer = DummyTokenizer() + BASE_MODEL = "fallback" + + PREFIX = "Price is $" + QUESTION = "How much does this cost to the nearest dollar?" + REMOVALS = [ + '"Batteries Included?": "No"', + '"Batteries Included?": "Yes"', + '"Batteries Required?": "No"', + '"Batteries Required?": "Yes"', + "By Manufacturer", + "Item", + "Date First", + "Package", + ":", + "Number of", + "Best Sellers", + "Number", + "Product " + ] + + title: str + price: float + category: str + token_count: int = 0 + details: Optional[str] + prompt: Optional[str] = None + include = False + + def __init__(self, data, price): + self.title = data['title'] + self.price = price + self.parse(data) + + def scrub_details(self): + """ + Clean up the details string by removing common text that doesn't add value + """ + details = self.details + for remove in self.REMOVALS: + details = details.replace(remove, "") + return details + + def scrub(self, stuff): + """ + Clean up the provided text by removing unnecessary characters and whitespace + Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers + """ + stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip() + stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",") + words = stuff.split(' ') + select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)] + return " ".join(select) + + def parse(self, data): + """ + Parse this datapoint and if it fits within the allowed Token range, + then set include to True + """ + contents = '\n'.join(data['description']) + if contents: + contents += '\n' + features = '\n'.join(data['features']) + if features: + contents += features + '\n' + self.details = data['details'] + if self.details: + contents += self.scrub_details() + '\n' + if len(contents) > MIN_CHARS: + contents = contents[:CEILING_CHARS] + text = f"{self.scrub(self.title)}\n{self.scrub(contents)}" + tokens = self.tokenizer.encode(text, add_special_tokens=False) + if len(tokens) > MIN_TOKENS: + tokens = tokens[:MAX_TOKENS] + text = self.tokenizer.decode(tokens) + self.make_prompt(text) + self.include = True + + def make_prompt(self, text): + """ + Set the prompt instance variable to be a prompt appropriate for training + """ + self.prompt = f"{self.QUESTION}\n\n{text}\n\n" + self.prompt += f"{self.PREFIX}{str(round(self.price))}.00" + self.token_count = 
len(self.tokenizer.encode(self.prompt, add_special_tokens=False))
+
+    def test_prompt(self):
+        """
+        Return a prompt suitable for testing, with the actual price removed
+        """
+        return self.prompt.split(self.PREFIX)[0] + self.PREFIX
+
+    def __repr__(self):
+        """
+        Return a String version of this Item
+        """
+        return f"<{self.title} = ${self.price}>"
+
+
diff --git a/week6/community-contributions/finetuning-joshua/test.pkl b/week6/community-contributions/finetuning-joshua/test.pkl
new file mode 100644
index 0000000..bdbfb32
Binary files /dev/null and b/week6/community-contributions/finetuning-joshua/test.pkl differ
diff --git a/week6/community-contributions/finetuning-joshua/testing.py b/week6/community-contributions/finetuning-joshua/testing.py
new file mode 100644
index 0000000..cd43924
--- /dev/null
+++ b/week6/community-contributions/finetuning-joshua/testing.py
@@ -0,0 +1,75 @@
+import math
+import matplotlib.pyplot as plt
+
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+RED = "\033[91m"
+RESET = "\033[0m"
+COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}
+
+class Tester:
+
+    def __init__(self, predictor, data, title=None, size=250):
+        self.predictor = predictor
+        self.data = data
+        self.title = title or predictor.__name__.replace("_", " ").title()
+        # Never test more datapoints than the dataset actually holds
+        # (avoids an IndexError when size > len(data))
+        self.size = min(size, len(data))
+        self.guesses = []
+        self.truths = []
+        self.errors = []
+        self.sles = []
+        self.colors = []
+
+    def color_for(self, error, truth):
+        if error < 40 or error / truth < 0.2:
+            return "green"
+        elif error < 80 or error / truth < 0.4:
+            return "orange"
+        else:
+            return "red"
+
+    def run_datapoint(self, i):
+        datapoint = self.data[i]
+        guess = self.predictor(datapoint)
+        truth = datapoint.price
+        error = abs(guess - truth)
+        log_error = math.log(truth + 1) - math.log(guess + 1)
+        sle = log_error ** 2
+        color = self.color_for(error, truth)
+        title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40] + "..."
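+        # Record this datapoint's stats for the final report and chart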
+ self.guesses.append(guess) + self.truths.append(truth) + self.errors.append(error) + self.sles.append(sle) + self.colors.append(color) + print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}") + + def chart(self, title): + max_error = max(self.errors) + plt.figure(figsize=(12, 8)) + max_val = max(max(self.truths), max(self.guesses)) + plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6) + plt.scatter(self.truths, self.guesses, s=3, c=self.colors) + plt.xlabel('Ground Truth') + plt.ylabel('Model Estimate') + plt.xlim(0, max_val) + plt.ylim(0, max_val) + plt.title(title) + plt.show() + + def report(self): + average_error = sum(self.errors) / self.size + rmsle = math.sqrt(sum(self.sles) / self.size) + hits = sum(1 for color in self.colors if color=="green") + title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%" + self.chart(title) + + def run(self): + self.error = 0 + for i in range(self.size): + self.run_datapoint(i) + self.report() + + @classmethod + def test(cls, function, data): + cls(function, data).run() \ No newline at end of file diff --git a/week6/community-contributions/finetuning-joshua/train.pkl b/week6/community-contributions/finetuning-joshua/train.pkl new file mode 100644 index 0000000..3d395c5 Binary files /dev/null and b/week6/community-contributions/finetuning-joshua/train.pkl differ diff --git a/week6/community-contributions/finetuning-joshua/validation.pkl b/week6/community-contributions/finetuning-joshua/validation.pkl new file mode 100644 index 0000000..eb626e4 Binary files /dev/null and b/week6/community-contributions/finetuning-joshua/validation.pkl differ diff --git a/week7/community_contributions/finetuning-joshua/Week7_Complete_FineTuning.ipynb b/week7/community_contributions/finetuning-joshua/Week7_Complete_FineTuning.ipynb new file mode 100644 index 0000000..bb1e960 --- /dev/null +++ b/week7/community_contributions/finetuning-joshua/Week7_Complete_FineTuning.ipynb @@ -0,0 +1,2270 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d2afa3e9", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluation utilities for the fine-tuned open-source model (Week 7)\n", + "import re\n", + "import math\n", + "import numpy as np\n", + "import torch\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Extract numeric price from model output\n", + "def extract_price(text: str) -> float:\n", + " text = (text or \"\").replace(\"$\", \"\").replace(\",\", \"\")\n", + " m = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", text)\n", + " return float(m.group(0)) if m else 0.0\n", + "\n", + "# Build prompt consistent with Week 7 training template\n", + "def build_pricing_prompt(item) -> str:\n", + " # Matches the training format used in Week 7\n", + " return (\n", + " \"<|system|>\\nYou are a retail price estimator. 
Predict the most likely new retail price in USD.\\n\"\n", + " \"<|user|>\\n\"\n", + " f\"{item.title}\\n{item.description}\\n\"\n", + " \"<|assistant|>\\n\"\n", + " )\n", + "\n", + "# Single-item prediction using the fine-tuned causal LM\n", + "@torch.no_grad()\n", + "def predict_price(model, tokenizer, item, max_new_tokens: int = 20) -> float:\n", + " prompt = build_pricing_prompt(item)\n", + " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n", + " outputs = model.generate(\n", + " **inputs,\n", + " max_new_tokens=max_new_tokens,\n", + " temperature=0.7,\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " )\n", + " decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " # Take only the newly generated continuation beyond the prompt\n", + " continuation = decoded[len(tokenizer.decode(inputs[\"input_ids\"][0], skip_special_tokens=True)) :]\n", + " return extract_price(continuation)\n", + "\n", + "# Batch evaluation (MAE, RMSE, MAPE) with quick scatter plot\n", + "def evaluate_model(model, tokenizer, test_items, limit: int = None, title: str = \"Fine-tuned Model Evaluation\"):\n", + " if not test_items:\n", + " print(\"⚠️ No test items available.\")\n", + " return {\"mae\": None, \"rmse\": None, \"mape\": None}\n", + "\n", + " items = test_items[:limit] if limit else test_items\n", + "\n", + " y_true, y_pred = [], []\n", + " for i, item in enumerate(items):\n", + " try:\n", + " pred = predict_price(model, tokenizer, item)\n", + " except Exception as e:\n", + " print(f\"Error on item {i}: {e}\")\n", + " pred = 0.0\n", + " y_true.append(float(getattr(item, \"price\", 0.0)))\n", + " y_pred.append(float(pred))\n", + "\n", + " y_true_np = np.array(y_true, dtype=float)\n", + " y_pred_np = np.array(y_pred, dtype=float)\n", + "\n", + " mae = float(np.mean(np.abs(y_pred_np - y_true_np)))\n", + " rmse = float(np.sqrt(np.mean((y_pred_np - y_true_np) ** 2)))\n", + " with np.errstate(divide='ignore', invalid='ignore'):\n", + " mape_arr = np.where(y_true_np != 0, np.abs((y_pred_np - y_true_np) / y_true_np), np.nan)\n", + " mape = float(np.nanmean(mape_arr)) * 100.0\n", + "\n", + " print(f\"\\n📈 {title}\")\n", + " print(f\"MAE : {mae:.2f}\")\n", + " print(f\"RMSE: {rmse:.2f}\")\n", + " print(f\"MAPE: {mape:.2f}%\")\n", + "\n", + " # Scatter plot\n", + " try:\n", + " plt.figure(figsize=(6, 6))\n", + " plt.scatter(y_true_np, y_pred_np, alpha=0.6)\n", + " mx = max(y_true_np.max() if y_true_np.size else 0, y_pred_np.max() if y_pred_np.size else 0)\n", + " plt.plot([0, mx], [0, mx], 'r--', label='Ideal')\n", + " plt.xlabel('Actual Price')\n", + " plt.ylabel('Predicted Price')\n", + " plt.title(title)\n", + " plt.legend()\n", + " plt.tight_layout()\n", + " plt.show()\n", + " except Exception as e:\n", + " print(f\"Plotting error: {e}\")\n", + "\n", + " return {\"mae\": mae, \"rmse\": rmse, \"mape\": mape}\n", + "\n", + "# Convenience wrapper mirroring Week 6's Tester usage pattern\n", + "# Usage:\n", + "# results = evaluate_model(model, tokenizer, test, limit=len(test))\n", + "print(\"✅ Evaluation utilities for Week 7 added. 
Use evaluate_model(model, tokenizer, test, limit=len(test)).\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "c88d0ea8", + "metadata": { + "id": "c88d0ea8" + }, + "source": [ + "# Week 7 - Complete Fine-tuning with Open Source LLMs\n", + "\n", + "This notebook implements QLoRA fine-tuning of open-source LLMs for product price prediction.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "721835a5", + "metadata": { + "id": "721835a5" + }, + "outputs": [], + "source": [ + "%pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121\n", + "%pip install -q -U transformers>=4.45.0 accelerate>=0.33.0 peft>=0.11.1 trl>=0.8.0\n", + "%pip install -q -U datasets \"huggingface_hub>=0.23.2,<1.0\" sentencepiece einops safetensors\n", + "%pip install -q -U bitsandbytes>=0.43.2 xformers\n", + "%pip install -q -U wandb tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8a8017b0", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8a8017b0", + "outputId": "6c5288b6-3d15-4439-de01-ad2ff7b2b262" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyTorch version: 2.8.0+cu126\n", + "CUDA available: True\n", + "GPU: NVIDIA A100-SXM4-40GB\n", + "GPU Memory: 42.5 GB\n", + "CUDA version: 12.6\n" + ] + } + ], + "source": [ + "# Core imports\n", + "import os\n", + "import torch\n", + "import pickle\n", + "import numpy as np\n", + "import json\n", + "import re\n", + "from datetime import datetime\n", + "from datasets import Dataset\n", + "from transformers import (\n", + " AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,\n", + " TrainingArguments, Trainer, DataCollatorForLanguageModeling\n", + ")\n", + "from peft import LoraConfig, TaskType, get_peft_model, PeftModel\n", + "from trl import SFTTrainer\n", + "import transformers\n", + "import wandb\n", + "\n", + "# Enable optimizations for Colab Pro\n", + "torch.backends.cudnn.benchmark = True\n", + "torch.backends.cuda.matmul.allow_tf32 = True\n", + "torch.backends.cudnn.allow_tf32 = True\n", + "\n", + "print(f\"PyTorch version: {torch.__version__}\")\n", + "print(f\"CUDA available: {torch.cuda.is_available()}\")\n", + "if torch.cuda.is_available():\n", + " print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n", + " print(f\"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n", + " print(f\"CUDA version: {torch.version.cuda}\")\n", + "else:\n", + " raise SystemExit(\"❌ No GPU detected.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0b4d0cd3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 280 + }, + "id": "0b4d0cd3", + "outputId": "65ab54e5-4fec-4db8-e6e9-3fb86d2a13f3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Using Colab secrets\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n", + "WARNING:huggingface_hub._login:Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" + ] + }, + { + "data": { + "text/html": [ + "Finishing previous runs because reinit is set to 'default'." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run wobbly-resonance-1 at: https://wandb.ai/oluoch-joshua-udemy/colab-pro-finetuning/runs/fwkqveds
View project at: https://wandb.ai/oluoch-joshua-udemy/colab-pro-finetuning
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20251028_115212-fwkqveds/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.22.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /content/wandb/run-20251028_115650-rd1q63l3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run easy-cloud-2 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/oluoch-joshua-udemy/colab-pro-finetuning" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/oluoch-joshua-udemy/colab-pro-finetuning/runs/rd1q63l3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ W&B initialized\n" + ] + } + ], + "source": [ + "# Environment setup for Colab Pro\n", + "try:\n", + " from google.colab import userdata\n", + " os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n", + " os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')\n", + " print(\"✅ Using Colab secrets\")\n", + "except:\n", + " from dotenv import load_dotenv\n", + " load_dotenv(override=True)\n", + " os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-hf-token')\n", + " os.environ['WANDB_API_KEY'] = os.getenv('WANDB_API_KEY', 'your-wandb-key')\n", + " print(\"✅ Using local environment\")\n", + "\n", + "# Login to HuggingFace\n", + "from huggingface_hub import login\n", + "login(os.environ['HF_TOKEN'])\n", + "\n", + "# Initialize Weights & Biases (optional)\n", + "try:\n", + " wandb.init(project=\"colab-pro-finetuning\", mode=\"online\")\n", + " print(\"✅ W&B initialized\")\n", + "except:\n", + " print(\"⚠️ W&B not available, continuing without logging\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "809d2271", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "809d2271", + "outputId": "2afd08ed-5da7-4a93-99cd-4d8f881bd0af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Loading pre-processed pickle files...\n", + "✅ Loaded training data: train.pkl (150 items)\n", + "✅ Loaded test data: test.pkl (50 items)\n", + "✅ Loaded validation data: validation.pkl (50 items)\n", + "\n", + "📊 Dataset Statistics:\n", + " Training: 150 items\n", + " Test: 50 items\n", + " Validation: 50 items\n" + ] + } + ], + "source": [ + "# Load pre-processed pickle files (optimized for Colab Pro)\n", + "def load_pickle_data():\n", + " \"\"\"Load pre-processed pickle files with robust error handling\"\"\"\n", + " print(\"📦 Loading pre-processed pickle files...\")\n", + "\n", + " # Try multiple locations for pickle files\n", + " pickle_files = [\n", + " 'train.pkl', 'test.pkl', 'validation.pkl'\n", + " ]\n", + "\n", + " train = None\n", + " test = None\n", + " validation = None\n", + "\n", + " # Load training data\n", + " for file_path in ['train.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " train = pickle.load(f)\n", + " print(f\"✅ Loaded training data: {file_path} ({len(train)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + "\n", + " # Load test data\n", + " for file_path in ['test.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with open(file_path, 'rb') as f:\n", + " test = pickle.load(f)\n", + " print(f\"✅ Loaded test data: {file_path} ({len(test)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + "\n", + " # Load validation data\n", + " for file_path in ['validation.pkl']:\n", + " if os.path.exists(file_path):\n", + " try:\n", + " with 
open(file_path, 'rb') as f:\n", + " validation = pickle.load(f)\n", + " print(f\"✅ Loaded validation data: {file_path} ({len(validation)} items)\")\n", + " break\n", + " except Exception as e:\n", + " print(f\"❌ Error loading {file_path}: {e}\")\n", + "\n", + " # If no pickle files found, create sample data\n", + " if not train or not test or not validation:\n", + " print(\"🔄 No pickle files found, creating sample data...\")\n", + " train, test, validation = create_sample_data()\n", + "\n", + " return train, test, validation\n", + "\n", + "def create_sample_data():\n", + " \"\"\"Create sample data for demonstration\"\"\"\n", + " # Sample product data (expanded for better training)\n", + " sample_products = [\n", + " {\"title\": \"Wireless Bluetooth Headphones\", \"price\": 89.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Stainless Steel Water Bottle\", \"price\": 24.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Organic Cotton T-Shirt\", \"price\": 19.99, \"category\": \"Clothing\"},\n", + " {\"title\": \"Ceramic Coffee Mug\", \"price\": 12.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"LED Desk Lamp\", \"price\": 45.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Yoga Mat\", \"price\": 29.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Leather Wallet\", \"price\": 39.99, \"category\": \"Accessories\"},\n", + " {\"title\": \"Bluetooth Speaker\", \"price\": 79.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Kitchen Knife Set\", \"price\": 129.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Running Shoes\", \"price\": 89.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Smartphone Case\", \"price\": 15.99, \"category\": \"Electronics\"},\n", + " {\"title\": \"Coffee Maker\", \"price\": 89.99, \"category\": \"Home & Kitchen\"},\n", + " {\"title\": \"Backpack\", \"price\": 49.99, \"category\": \"Accessories\"},\n", + " {\"title\": \"Tennis Racket\", \"price\": 79.99, \"category\": \"Sports & Outdoors\"},\n", + " {\"title\": \"Laptop Stand\", \"price\": 34.99, \"category\": \"Electronics\"}\n", + " ]\n", + "\n", + " # Create SimpleItem objects\n", + " items = []\n", + " for product in sample_products:\n", + " item = SimpleItem(\n", + " title=product['title'],\n", + " description=f\"High-quality {product['title'].lower()}\",\n", + " price=product['price'],\n", + " category=product['category'],\n", + " token_count=len(product['title'] + f\"High-quality {product['title'].lower()}\") // 4\n", + " )\n", + " items.append(item)\n", + "\n", + " # Split into train/test/validation\n", + " train = items[:10] # 10 items\n", + " test = items[10:13] # 3 items\n", + " validation = items[13:] # 2 items\n", + "\n", + " print(f\"✅ Created sample data: {len(train)} train, {len(test)} test, {len(validation)} validation\")\n", + " return train, test, validation\n", + "\n", + "# SimpleItem class definition for pickle compatibility\n", + "class SimpleItem:\n", + " \"\"\"Simple item class for pickle compatibility\"\"\"\n", + " def __init__(self, title, description, price, category=\"Human_Generated\", token_count=0):\n", + " self.title = title\n", + " self.description = description\n", + " self.price = price\n", + " self.category = category\n", + " self.token_count = token_count\n", + "\n", + " def test_prompt(self):\n", + " \"\"\"Return a prompt suitable for testing\"\"\"\n", + " return f\"How much does this cost to the nearest dollar?\\n\\n{self.title}\\n\\n{self.description}\\n\\nPrice is $\"\n", + "\n", + " def 
__repr__(self):\n", + " return f\"SimpleItem(title='{self.title[:50]}...', price=${self.price})\"\n", + "\n", + "# Load the data\n", + "train, test, validation = load_pickle_data()\n", + "\n", + "print(f\"\\n📊 Dataset Statistics:\")\n", + "print(f\" Training: {len(train)} items\")\n", + "print(f\" Test: {len(test)} items\")\n", + "print(f\" Validation: {len(validation)} items\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "946a3a05", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "946a3a05", + "outputId": "41936ca5-d092-43a2-ed29-d66607af7d89" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Datasets prepared:\n", + " Training: 150 examples\n", + " Validation: 50 examples\n", + " Sample training text: <|system|>\n", + "You are a retail price estimator. Predict the most likely new retail price in USD.\n", + "<|user...\n" + ] + } + ], + "source": [ + "# Prepare datasets for training (optimized for Colab Pro)\n", + "def prepare_training_data(items):\n", + " \"\"\"Convert items to training format\"\"\"\n", + " data = []\n", + " for item in items:\n", + " # Create training prompt\n", + " prompt = f\"<|system|>\\nYou are a retail price estimator. Predict the most likely new retail price in USD.\\n<|user|>\\n{item.title}\\n{item.description}\\n<|assistant|>\\n${item.price:.2f}\"\n", + " data.append({\"text\": prompt})\n", + " return data\n", + "\n", + "# Prepare training and validation datasets\n", + "train_data = prepare_training_data(train)\n", + "val_data = prepare_training_data(validation)\n", + "\n", + "# Convert to HuggingFace datasets\n", + "train_ds = Dataset.from_list(train_data)\n", + "val_ds = Dataset.from_list(val_data)\n", + "\n", + "print(f\"✅ Datasets prepared:\")\n", + "print(f\" Training: {len(train_ds)} examples\")\n", + "print(f\" Validation: {len(val_ds)} examples\")\n", + "print(f\" Sample training text: {train_ds[0]['text'][:100]}...\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "zWgL4fhku_XN", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 289, + "referenced_widgets": [ + "245570c62c3844728d7125a706fbbc9b", + "8d95da3803e542f8b855175013d497ba", + "34a89db126a64690bc2f6c8656ba2210", + "4c47ce21b5a14328aa22403782e4da9b", + "7dff366d9e71427dbae40b1dce7a9bfa", + "0614c35b3690494ca3b8f9ab71d71a08", + "42315e83fbac49c2bc7f2faf1abcc22e", + "4cdff5bdf7574795802e821aa42f3c4e", + "754aa440f45c4a878d99572368d659c8", + "8b5f0c156a9641cfa5413668a0b97b9c", + "aff498bd632f4036958f59cfc6587ea3", + "d6825cc926a24f2482ce72c15242081e", + "24b2b5f5d92049a79014b8278e97451b", + "a6001d34e58a47cab0d8bff2451afb6e", + "b42e8d8b61d7431a814a03c5e07a1166", + "9ce1659c776140bcaf3c16eae6f70967", + "cceae79c145d4b73a64e80ad3fc8866c", + "56bc56071ff04223935dc2d98d2703ab", + "67cacc87afe14250baaa073289fb4a8f", + "227eea7074544adbb2c34b9dde340fa5", + "8bd9aebb2cc5420094b2b441a5183523", + "1c3eb3793b6e4291b4fa57ce8419ef1f" + ] + }, + "id": "zWgL4fhku_XN", + "outputId": "8d375a13-59cf-4eea-f16f-fde5dbf7f0e8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔄 Checking dataset status...\n", + "Training dataset columns: ['input_ids', 'attention_mask', 'labels']\n", + "Validation dataset columns: ['input_ids', 'attention_mask', 'labels']\n", + "✅ Datasets already tokenized\n", + "🔄 Ensuring consistent sequence lengths...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"245570c62c3844728d7125a706fbbc9b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/150 [00:00 max_len:\n", + " seq = seq[:max_len]\n", + " attn = attn[:max_len]\n", + " lbl = lbl[:max_len]\n", + "\n", + " # Pad if too short\n", + " while len(seq) < max_len:\n", + " seq.append(tokenizer.pad_token_id)\n", + " attn.append(0) # 0 for padding\n", + " lbl.append(-100) # -100 for padding in labels (ignored in loss)\n", + "\n", + " input_ids.append(seq)\n", + " attention_masks.append(attn)\n", + " labels.append(lbl)\n", + "\n", + " return {\n", + " \"input_ids\": input_ids,\n", + " \"attention_mask\": attention_masks,\n", + " \"labels\": labels\n", + " }\n", + "\n", + " return dataset.map(pad_sequences, batched=True)\n", + "\n", + "print(\"🔄 Checking dataset status...\")\n", + "print(f\"Training dataset columns: {train_ds.column_names}\")\n", + "print(f\"Validation dataset columns: {val_ds.column_names}\")\n", + "\n", + "# Check if we need to tokenize or just ensure consistent lengths\n", + "if \"text\" in train_ds.column_names:\n", + " print(\"🔄 Tokenizing datasets...\")\n", + " train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=[\"text\"])\n", + " val_ds = val_ds.map(tokenize_function, batched=True, remove_columns=[\"text\"])\n", + " print(\"✅ Tokenization complete\")\n", + "else:\n", + " print(\"✅ Datasets already tokenized\")\n", + "\n", + "# Ensure consistent lengths\n", + "print(\"🔄 Ensuring consistent sequence lengths...\")\n", + "train_ds = ensure_consistent_lengths(train_ds, MAX_LEN)\n", + "val_ds = ensure_consistent_lengths(val_ds, MAX_LEN)\n", + "\n", + "# Verify all sequences are the same length\n", + "print(\"🔍 Verifying sequence lengths...\")\n", + "train_lengths = [len(seq) for seq in train_ds[\"input_ids\"]]\n", + "val_lengths = [len(seq) for seq in val_ds[\"input_ids\"]]\n", + "\n", + "print(f\"Training sequence lengths - Min: {min(train_lengths)}, Max: {max(train_lengths)}\")\n", + "print(f\"Validation sequence lengths - Min: {min(val_lengths)}, Max: {max(val_lengths)}\")\n", + "\n", + "if len(set(train_lengths)) == 1 and len(set(val_lengths)) == 1:\n", + " print(\"✅ All sequences have consistent length\")\n", + "else:\n", + " print(\"⚠️ Inconsistent sequence lengths detected - this will cause training errors\")\n", + "\n", + "# Set format for PyTorch\n", + "train_ds.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n", + "val_ds.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n", + "\n", + "print(f\"Sample input_ids shape: {train_ds[0]['input_ids'].shape}\")\n", + "print(f\"Sample attention_mask shape: {train_ds[0]['attention_mask'].shape}\")\n", + "print(f\"Sample labels shape: {train_ds[0]['labels'].shape}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "55a3b346", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "55a3b346", + "outputId": "34ba52e3-3a22-4c60-e18b-18592c0b1d80" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading tokenizer...\n", + "✅ Tokenizer loaded successfully\n", + "Loading base model (4-bit optimized for Colab Pro)...\n", + "⚠️ Error with 4-bit quantization: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`\n", + "🔄 Trying without quantization...\n", + "✅ Model loaded without quantization\n", + "Model device: cuda:0\n", + "Model dtype: torch.float16\n" + ] + } + ], + 
"source": [ + "# Model setup optimized for Colab Pro\n", + "# Using a more compatible model that works well with current transformers\n", + "base_model = \"microsoft/DialoGPT-medium\" # More stable and widely supported\n", + "\n", + "# 4-bit quantization config optimized for Colab Pro\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.float16,\n", + ")\n", + "\n", + "print(\"Loading tokenizer...\")\n", + "try:\n", + " tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True, trust_remote_code=True)\n", + " print(\"✅ Tokenizer loaded successfully\")\n", + "except Exception as e:\n", + " print(f\"⚠️ Error loading tokenizer: {e}\")\n", + " print(\"🔄 Trying alternative approach...\")\n", + " tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=False)\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "tokenizer.padding_side = \"right\"\n", + "\n", + "print(\"Loading base model (4-bit optimized for Colab Pro)...\")\n", + "try:\n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " base_model,\n", + " quantization_config=bnb_config,\n", + " device_map=\"auto\",\n", + " low_cpu_mem_usage=True,\n", + " trust_remote_code=True,\n", + " torch_dtype=torch.float16,\n", + " )\n", + " print(\"✅ Model loaded successfully\")\n", + "except Exception as e:\n", + " print(f\"⚠️ Error with 4-bit quantization: {e}\")\n", + " print(\"🔄 Trying without quantization...\")\n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " base_model,\n", + " device_map=\"auto\",\n", + " low_cpu_mem_usage=True,\n", + " torch_dtype=torch.float16,\n", + " )\n", + " print(\"✅ Model loaded without quantization\")\n", + "\n", + "print(f\"Model device: {next(model.parameters()).device}\")\n", + "print(f\"Model dtype: {next(model.parameters()).dtype}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "QCcujyKNyTud", + "metadata": { + "id": "QCcujyKNyTud" + }, + "outputs": [], + "source": [ + "from peft import prepare_model_for_kbit_training\n", + "\n", + "# disable cache for gradient checkpointing\n", + "model.config.use_cache = False\n", + "\n", + "# enable gradient checkpointing\n", + "model.gradient_checkpointing_enable()\n", + "\n", + "# IMPORTANT: prepare for k-bit training (sets up norms, cast, etc.)\n", + "model = prepare_model_for_kbit_training(model)\n", + "\n", + "# ensure inputs carry grads for checkpointing\n", + "if hasattr(model, \"enable_input_require_grads\"):\n", + " model.enable_input_require_grads()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "a3o-5dxDr5MH", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a3o-5dxDr5MH", + "outputId": "778c6094-3fbc-468f-dbec-7d1798791f04" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422\n", + "✅ LoRA configuration applied for GPT-2/DialoGPT modules\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. 
If you want to reload the model with a different config, make sure to call `.unload()` before.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.12/dist-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# LoRA configuration compatible with GPT-2/DialoGPT modules\n", + "from peft import LoraConfig, get_peft_model, TaskType\n", + "\n", + "# For GPT-2/DialoGPT, target modules typically are c_attn (QKV), c_fc and c_proj (MLP)\n", + "lora_config = LoraConfig(\n", + " r=16,\n", + " lora_alpha=32,\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=TaskType.CAUSAL_LM,\n", + " target_modules=[\"c_attn\", \"c_fc\", \"c_proj\"],\n", + ")\n", + "\n", + "# Apply LoRA to model\n", + "model = get_peft_model(model, lora_config)\n", + "model.print_trainable_parameters()\n", + "print(\"✅ LoRA configuration applied for GPT-2/DialoGPT modules\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ac85c418", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ac85c418", + "outputId": "196789bc-c66f-4fbd-eb66-65a50b3cf995" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Training arguments configured !!\n" + ] + } + ], + "source": [ + "# Training arguments\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./outputs\",\n", + " per_device_train_batch_size=2,\n", + " per_device_eval_batch_size=2,\n", + " gradient_accumulation_steps=8,\n", + " num_train_epochs=3,\n", + " learning_rate=2e-4,\n", + " bf16=True,\n", + " logging_steps=10,\n", + " eval_strategy=\"steps\",\n", + " eval_steps=50,\n", + " save_steps=100,\n", + " save_total_limit=3,\n", + " lr_scheduler_type=\"cosine\",\n", + " warmup_ratio=0.03,\n", + " gradient_checkpointing=True,\n", + " dataloader_pin_memory=False,\n", + " remove_unused_columns=False,\n", + " report_to=[\"wandb\"] if os.environ.get('WANDB_API_KEY') else [],\n", + " seed=42,\n", + " # Colab Pro optimizations\n", + " dataloader_num_workers=2,\n", + " save_safetensors=True,\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"eval_loss\",\n", + " greater_is_better=False,\n", + ")\n", + "\n", + "print(\"✅ Training arguments configured !!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "b7452949", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b7452949", + "outputId": "c84d0df8-efef-4423-ab1f-bc343766b386" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Custom data collator for pre-padded sequences configured\n" + ] + } + ], + "source": [ + "# Custom data collator for pre-padded sequences\n", + "# Since we already padded during tokenization, we just need to stack tensors\n", + "def custom_collate_fn(batch):\n", + " \"\"\"Custom collate function for pre-padded sequences\"\"\"\n", + " # Extract the fields we need\n", + " input_ids = torch.stack([torch.tensor(item[\"input_ids\"]) for item in batch])\n", + " attention_mask = torch.stack([torch.tensor(item[\"attention_mask\"]) for item in batch])\n", + " labels = torch.stack([torch.tensor(item[\"labels\"]) for item in batch])\n", + "\n", + " return {\n", + " \"input_ids\": input_ids,\n", + " \"attention_mask\": attention_mask,\n", + " \"labels\": labels\n", + " }\n", + "\n", + "# Use our custom 
collator\n", + "data_collator = custom_collate_fn\n", + "\n", + "print(\"✅ Custom data collator for pre-padded sequences configured\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "qFVD1QGmxgv4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qFVD1QGmxgv4", + "outputId": "591ac10d-f8e2-461a-a629-9617bb6a120a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Data collator configured\n" + ] + } + ], + "source": [ + "# Data collator for language modeling\n", + "data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer,\n", + " mlm=False, # We're doing causal LM, not masked LM\n", + " pad_to_multiple_of=8, # Optimize for GPU\n", + ")\n", + "\n", + "print(\"✅ Data collator configured\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "fc57cf22", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fc57cf22", + "outputId": "b83c8108-13ac-4114-a4a9-7beb26963ee5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipython-input-3978596696.py:2: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n", + " trainer = Trainer(\n", + "The model is already on multiple devices. Skipping the move to device specified in `args`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Trainer configured\n", + "Training examples: 150\n", + "Validation examples: 50\n", + "Total training steps: 27\n" + ] + } + ], + "source": [ + "# Create trainer\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_ds,\n", + " eval_dataset=val_ds,\n", + " data_collator=data_collator,\n", + " tokenizer=tokenizer,\n", + ")\n", + "\n", + "print(\"✅ Trainer configured\")\n", + "print(f\"Training examples: {len(train_ds)}\")\n", + "print(f\"Validation examples: {len(val_ds)}\")\n", + "print(f\"Total training steps: {len(train_ds) // training_args.per_device_train_batch_size // training_args.gradient_accumulation_steps * training_args.num_train_epochs}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "547502bd", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 197 + }, + "id": "547502bd", + "outputId": "11530f9b-1a40-4353-dfa2-6caa4d4ff22e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🚀 Starting training...\n", + "Training on: NVIDIA A100-SXM4-40GB\n", + "Batch size: 2\n", + "Gradient accumulation: 8\n", + "Effective batch size: 16\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [30/30 00:40, Epoch 3/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Training completed!\n", + "Model saved to: ./outputs\n" + ] + } + ], + "source": [ + "# Start training\n", + "print(\"🚀 Starting training...\")\n", + "print(f\"Training on: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}\")\n", + "print(f\"Batch size: {training_args.per_device_train_batch_size}\")\n", + "print(f\"Gradient accumulation: {training_args.gradient_accumulation_steps}\")\n", + "print(f\"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}\")\n", + "\n", + "# Train the model\n", + "trainer.train()\n", + "\n", + "print(\"✅ Training completed!\")\n", + "print(f\"Model saved to: {training_args.output_dir}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "a4df3b21", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a4df3b21", + "outputId": "3cb4ac09-9b61-4ab8-bc11-cf480cb764be" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Model and tokenizer saved\n", + "Saved to: ./outputs\n", + "Mounted at /content/drive\n", + "✅ Model also saved to Google Drive: /content/drive/MyDrive/Colab Notebooks/finetuned_model_20251028_123003\n" + ] + } + ], + "source": [ + "# Save the final model\n", + "trainer.save_model()\n", + "tokenizer.save_pretrained(training_args.output_dir)\n", + "\n", + "print(\"✅ Model and tokenizer saved\")\n", + "print(f\"Saved to: {training_args.output_dir}\")\n", + "\n", + "# Save to Google Drive (optional)\n", + "try:\n", + " from google.colab import drive\n", + " drive.mount('/content/drive')\n", + "\n", + " # Copy to Drive\n", + " import shutil\n", + " drive_path = f\"/content/drive/MyDrive/Colab Notebooks/finetuned_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}\"\n", + " shutil.copytree(training_args.output_dir, drive_path)\n", + " print(f\"✅ Model also saved to Google Drive: {drive_path}\")\n", + "except:\n", + " print(\"⚠️ Google Drive not available, model saved locally only\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "e2507760", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 228 + }, + "id": "e2507760", + "outputId": "03dda8b6-711c-4bee-b4ed-b00aedde5ab4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Evaluating model...\n", + "⚠️ Best checkpoint not found, using final model\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
[25/25 00:01]
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📈 Evaluation Results:\n", + " eval_loss: 5.8997\n", + " eval_runtime: 1.5263\n", + " eval_samples_per_second: 32.7600\n", + " eval_steps_per_second: 16.3800\n", + " epoch: 3.0000\n", + "\n", + "✅ Evaluation completed!\n" + ] + } + ], + "source": [ + "# Evaluate the model\n", + "print(\"📊 Evaluating model...\")\n", + "\n", + "# Load the best model\n", + "best_model_path = f\"{training_args.output_dir}/checkpoint-best\"\n", + "if os.path.exists(best_model_path):\n", + " model = PeftModel.from_pretrained(model, best_model_path)\n", + " print(\"✅ Loaded best checkpoint\")\n", + "else:\n", + " print(\"⚠️ Best checkpoint not found, using final model\")\n", + "\n", + "# Run evaluation\n", + "eval_results = trainer.evaluate()\n", + "print(f\"\\n📈 Evaluation Results:\")\n", + "for key, value in eval_results.items():\n", + " print(f\" {key}: {value:.4f}\")\n", + "\n", + "print(\"\\n✅ Evaluation completed!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "c80bebe1", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c80bebe1", + "outputId": "75b9cf98-cae9-48c0-de41-eb245f574e0c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🧪 Testing inference...\n", + "\n", + "--- Test 1 ---\n", + "Item: MyCableMart 3.5mm Plug/Jack, 4 Conductor TRRS, Self Solder, Male\n", + "Actual Price: $25.00\n", + "Model Response: <|system|>\n", + "You are a retail price estimator. Predict the most likely new retail price in USD.\n", + "<|user|>\n", + "MyCableMart 3.5mm Plug/Jack, 4 Conductor TRRS, Self Solder, Male\n", + "Connects stereo audio & microphone devices requiring 4 conductors (left and right audio and microphone plus ground). This connector MAY also be suitable for left/right audio 1 video (composite) and ground. Great for making your own 3.5mm 4 conductor Cables or for repairing existing cables. Wire terminals are attached using solder (not included).Features 3.5mm 4 conductor (3 band) plug 3.5mm 4 conductor (3 band) plug Nickel Plated Nickel Plated Strain relief Strain relief Outer Dimensions (at PVC outer molding) Outer Dimensions (at PVC outer molding) Outer Dimensions (with PVC outer molding\n", + "<|assistant|>\n", + "input.5, 3.00,,5,2,2,2,2,2\n", + "\n", + "--- Test 2 ---\n", + "Item: OtterBox + Pop Symmetry Series Case for iPhone 11 Pro (ONLY) - Retail Packaging - White Marble\n", + "Actual Price: $20.00\n", + "Model Response: <|system|>\n", + "You are a retail price estimator. Predict the most likely new retail price in USD.\n", + "<|user|>\n", + "OtterBox + Pop Symmetry Series Case for iPhone 11 Pro (ONLY) - Retail Packaging - White Marble\n", + "OtterBox + Pop Symmetry Series Case for iPhone 11 Pro (ONLY) - Retail Packaging - White Marble Compatible with iPhone 11 Pro Thin one-piece case with durable protection against drops, bumps and fumbles that is also compatible with Qi wireless charging PopSockets PopGrip is integrated into case to help with holding, texting, snapping better pictures and hand-free viewing PopTop designs are easy to switch out — just close flat, press down and turn to swap the PopTop. 
Includes OtterBox limited lifetime warranty (see website for details) and 100% authentic Dimensions 7.8 x 4.29 x 1.06 inches, Weight 3\n", + "<|assistant|>\n", + "Type.html,.html.html,, Material, width, material, material, material,\n", + "\n", + "--- Test 3 ---\n", + "Item: Dell XPS Desktop ( Intel Core i7 4790 (3.6 GHz), 8GB, 1TB HDD,Windows 10 Home Black\n", + "Actual Price: $500.00\n", + "Model Response: <|system|>\n", + "You are a retail price estimator. Predict the most likely new retail price in USD.\n", + "<|user|>\n", + "Dell XPS Desktop ( Intel Core i7 4790 (3.6 GHz), 8GB, 1TB HDD,Windows 10 Home Black\n", + "Product description Bring your multimedia to life with Dell XPS desktop PCs offering powerful processors, superb graphics performance and lots of storage space. Amazon.com Processor 4th Generation Intel Core processor (8M Cache, up to 4.00 GHz) OS Windows 7 Professional, English Graphics Card NVIDIA GeForce GTX 750Ti 2GB DDR5 Memory 32GB Dual Channel DDR3 - 4 DIMMs Hard Drive 1TB 7200 RPM SATA Hard Drive 6.0 Gb/s + 256GB SSD Processor 3.6 GHz RAM 8 GB DDR5, Memory Speed 1600 MHz,\n", + "<|assistant|>\n", + "USB HDD,RAM, HDD.8GB2,USB HDD,USB HDD,USB HDD,\n", + "\n", + "✅ Inference testing completed!\n" + ] + } + ], + "source": [ + "# Test inference on sample data\n", + "print(\"🧪 Testing inference...\")\n", + "\n", + "def test_inference(model, tokenizer, test_item):\n", + " \"\"\"Test inference on a single item\"\"\"\n", + " prompt = f\"<|system|>\\nYou are a retail price estimator. Predict the most likely new retail price in USD.\\n<|user|>\\n{test_item.title}\\n{test_item.description}\\n<|assistant|>\\n\"\n", + "\n", + " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n", + "\n", + " with torch.no_grad():\n", + " outputs = model.generate(\n", + " **inputs,\n", + " max_new_tokens=20,\n", + " temperature=0.7,\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.eos_token_id\n", + " )\n", + "\n", + " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " return response\n", + "\n", + "# Test on a few examples\n", + "for i, item in enumerate(test[:3]):\n", + " print(f\"\\n--- Test {i+1} ---\")\n", + " print(f\"Item: {item.title}\")\n", + " print(f\"Actual Price: ${item.price:.2f}\")\n", + "\n", + " try:\n", + " response = test_inference(model, tokenizer, item)\n", + " print(f\"Model Response: {response}\")\n", + " except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n", + "print(\"\\n✅ Inference testing completed!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e716982", + "metadata": {}, + "outputs": [], + "source": [ + "# Fixed evaluation with price range constraints and better post-processing\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import re\n", + "import torch\n", + "\n", + "def extract_price_safe(text: str) -> float:\n", + " \"\"\"Extract price with safety constraints\"\"\"\n", + " if not text:\n", + " return 0.0\n", + " \n", + " # Clean the text\n", + " text = str(text).replace(\"$\", \"\").replace(\",\", \"\").strip()\n", + " \n", + " # Look for price patterns\n", + " patterns = [\n", + " r'\\$?(\\d+\\.?\\d*)\\s*(?:dollars?|USD|usd)?', # $123.45 or 123.45 dollars\n", + " r'(\\d+\\.?\\d*)', # Just numbers\n", + " ]\n", + " \n", + " for pattern in patterns:\n", + " matches = re.findall(pattern, text, re.IGNORECASE)\n", + " if matches:\n", + " try:\n", + " price = float(matches[0])\n", + " # Apply reasonable price constraints\n", + " if 0.01 <= price <= 100000: # 
Between 1 cent and $100k\n", + " return price\n", + " except ValueError:\n", + " continue\n", + " \n", + " return 0.0\n", + "\n", + "def build_pricing_prompt_fixed(item) -> str:\n", + " \"\"\"Build prompt with explicit price range guidance\"\"\"\n", + " return (\n", + " \"<|system|>\\n\"\n", + " \"You are a retail price estimator. Predict the most likely new retail price in USD. \"\n", + " \"Typical prices range from $1 to $10,000. Be realistic and conservative.\\n\"\n", + " \"<|user|>\\n\"\n", + " f\"Product: {item.title}\\n\"\n", + " f\"Description: {item.description}\\n\"\n", + " f\"Category: {getattr(item, 'category', 'Unknown')}\\n\"\n", + " \"What is the retail price?\\n\"\n", + " \"<|assistant|>\\n\"\n", + " \"The retail price is $\"\n", + " )\n", + "\n", + "@torch.no_grad()\n", + "def predict_price_fixed(model, tokenizer, item, max_new_tokens=15) -> float:\n", + " \"\"\"Predict price with better constraints\"\"\"\n", + " prompt = build_pricing_prompt_fixed(item)\n", + " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n", + " \n", + " # Generate with more conservative settings\n", + " outputs = model.generate(\n", + " **inputs,\n", + " max_new_tokens=max_new_tokens,\n", + " temperature=0.3, # Lower temperature for more conservative predictions\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " repetition_penalty=1.1, # Reduce repetition\n", + " no_repeat_ngram_size=2,\n", + " )\n", + " \n", + " # Decode only the new tokens\n", + " prompt_length = len(tokenizer.decode(inputs[\"input_ids\"][0], skip_special_tokens=True))\n", + " full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " new_text = full_response[prompt_length:]\n", + " \n", + " # Extract price with constraints\n", + " price = extract_price_safe(new_text)\n", + " \n", + " # Additional safety: if price is still unreasonable, use a fallback\n", + " if price > 50000: # If over $50k, it's probably wrong\n", + " # Try to extract a more reasonable number\n", + " numbers = re.findall(r'\\d+\\.?\\d*', new_text)\n", + " if numbers:\n", + " try:\n", + " # Take the first reasonable number\n", + " for num in numbers:\n", + " candidate = float(num)\n", + " if 1 <= candidate <= 10000:\n", + " return candidate\n", + " except ValueError:\n", + " pass\n", + " return 0.0\n", + " \n", + " return price\n", + "\n", + "def evaluate_model_fixed(model, tokenizer, test_items, limit=None, title=\"Fixed Fine-tuned Model\"):\n", + " \"\"\"Evaluate with fixed price extraction\"\"\"\n", + " if not test_items:\n", + " print(\"⚠️ No test items available.\")\n", + " return {\"mae\": None, \"rmse\": None, \"mape\": None}\n", + " \n", + " items = test_items[:limit] if limit else test_items\n", + " print(f\"🔍 Evaluating on {len(items)} items...\")\n", + " \n", + " y_true, y_pred = [], []\n", + " errors = []\n", + " \n", + " for i, item in enumerate(items):\n", + " try:\n", + " pred = predict_price_fixed(model, tokenizer, item)\n", + " true_price = float(getattr(item, \"price\", 0.0))\n", + " \n", + " y_true.append(true_price)\n", + " y_pred.append(pred)\n", + " \n", + " # Track individual errors for debugging\n", + " error = abs(pred - true_price)\n", + " errors.append({\n", + " 'item': i,\n", + " 'title': getattr(item, 'title', 'Unknown')[:50],\n", + " 'true': true_price,\n", + " 'pred': pred,\n", + " 'error': error\n", + " })\n", + " \n", + " except Exception as e:\n", + " print(f\"Error on item {i}: {e}\")\n", + " y_true.append(0.0)\n", + " y_pred.append(0.0)\n", + " \n", + " y_true = 
np.array(y_true, dtype=float)\n", + " y_pred = np.array(y_pred, dtype=float)\n", + " \n", + " # Calculate metrics\n", + " mae = float(np.mean(np.abs(y_pred - y_true)))\n", + " rmse = float(np.sqrt(np.mean((y_pred - y_true) ** 2)))\n", + " \n", + " # MAPE (avoid division by zero)\n", + " mape = float(np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1.0)))) * 100\n", + " \n", + " # Hits within 15% tolerance\n", + " tolerance = 0.15\n", + " hits = float(np.mean(np.abs(y_pred - y_true) <= (tolerance * np.maximum(y_true, 1.0)))) * 100\n", + " \n", + " # Create scatter plot\n", + " plt.figure(figsize=(8, 6))\n", + " plt.scatter(y_true, y_pred, alpha=0.7, s=30, c='blue')\n", + " \n", + " # Add diagonal line\n", + " max_val = max(y_true.max() if y_true.size else 0, y_pred.max() if y_pred.size else 0, 1)\n", + " plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.8, label='Perfect Prediction')\n", + " \n", + " plt.xlabel('True Price ($)')\n", + " plt.ylabel('Predicted Price ($)')\n", + " plt.title(f'{title}\\nMAE=${mae:.2f} RMSE=${rmse:.2f} MAPE={mape:.1f}% Hits={hits:.1f}%')\n", + " plt.legend()\n", + " plt.grid(True, alpha=0.3)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + " # Show worst predictions\n", + " errors.sort(key=lambda x: x['error'], reverse=True)\n", + " print(f\"\\n🔍 Top 5 Worst Predictions:\")\n", + " for i, err in enumerate(errors[:5]):\n", + " print(f\" {i+1}. {err['title']}...\")\n", + " print(f\" True: ${err['true']:.2f}, Pred: ${err['pred']:.2f}, Error: ${err['error']:.2f}\")\n", + " \n", + " return {\n", + " \"mae\": mae,\n", + " \"rmse\": rmse, \n", + " \"mape\": mape,\n", + " \"hits_pct\": hits,\n", + " \"y_true\": y_true,\n", + " \"y_pred\": y_pred,\n", + " \"errors\": errors\n", + " }\n", + "\n", + "# Test the fixed evaluation\n", + "print(\"🧪 Testing fixed price prediction...\")\n", + "results = evaluate_model_fixed(model, tokenizer, test, limit=20, title=\"Fixed Fine-tuned Model\")\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0614c35b3690494ca3b8f9ab71d71a08": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c3eb3793b6e4291b4fa57ce8419ef1f": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "227eea7074544adbb2c34b9dde340fa5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "245570c62c3844728d7125a706fbbc9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8d95da3803e542f8b855175013d497ba", + "IPY_MODEL_34a89db126a64690bc2f6c8656ba2210", + "IPY_MODEL_4c47ce21b5a14328aa22403782e4da9b" + ], + "layout": "IPY_MODEL_7dff366d9e71427dbae40b1dce7a9bfa" + } + }, + "24b2b5f5d92049a79014b8278e97451b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cceae79c145d4b73a64e80ad3fc8866c", + "placeholder": "​", + "style": "IPY_MODEL_56bc56071ff04223935dc2d98d2703ab", + "value": "Map: 100%" + } + }, + "34a89db126a64690bc2f6c8656ba2210": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4cdff5bdf7574795802e821aa42f3c4e", + "max": 150, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_754aa440f45c4a878d99572368d659c8", + "value": 150 + } + }, + "42315e83fbac49c2bc7f2faf1abcc22e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c47ce21b5a14328aa22403782e4da9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b5f0c156a9641cfa5413668a0b97b9c", + "placeholder": "​", + "style": "IPY_MODEL_aff498bd632f4036958f59cfc6587ea3", + "value": " 150/150 [00:00<00:00, 1904.80 examples/s]" + } + }, + "4cdff5bdf7574795802e821aa42f3c4e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "56bc56071ff04223935dc2d98d2703ab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67cacc87afe14250baaa073289fb4a8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"754aa440f45c4a878d99572368d659c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7dff366d9e71427dbae40b1dce7a9bfa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b5f0c156a9641cfa5413668a0b97b9c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8bd9aebb2cc5420094b2b441a5183523": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d95da3803e542f8b855175013d497ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0614c35b3690494ca3b8f9ab71d71a08", + "placeholder": "​", + "style": "IPY_MODEL_42315e83fbac49c2bc7f2faf1abcc22e", + "value": "Map: 100%" + } + }, + "9ce1659c776140bcaf3c16eae6f70967": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a6001d34e58a47cab0d8bff2451afb6e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_67cacc87afe14250baaa073289fb4a8f", + "max": 50, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_227eea7074544adbb2c34b9dde340fa5", + "value": 50 + } + }, + "aff498bd632f4036958f59cfc6587ea3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b42e8d8b61d7431a814a03c5e07a1166": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8bd9aebb2cc5420094b2b441a5183523", + "placeholder": "​", + "style": "IPY_MODEL_1c3eb3793b6e4291b4fa57ce8419ef1f", + "value": " 50/50 [00:00<00:00, 1509.88 examples/s]" + } + }, + "cceae79c145d4b73a64e80ad3fc8866c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6825cc926a24f2482ce72c15242081e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_24b2b5f5d92049a79014b8278e97451b", + "IPY_MODEL_a6001d34e58a47cab0d8bff2451afb6e", + "IPY_MODEL_b42e8d8b61d7431a814a03c5e07a1166" + ], + "layout": "IPY_MODEL_9ce1659c776140bcaf3c16eae6f70967" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week8/community_contributions/ensemble-joshua/agents/agent.py b/week8/community_contributions/ensemble-joshua/agents/agent.py new file mode 100644 index 0000000..8f376fa --- /dev/null +++ b/week8/community_contributions/ensemble-joshua/agents/agent.py @@ -0,0 +1,35 @@ +import logging + +class Agent: + """ + An abstract superclass for Agents + Used to log messages in a way that can identify each Agent + """ + + # Foreground colors + RED = '\033[31m' + GREEN = '\033[32m' + YELLOW = '\033[33m' + BLUE = '\033[34m' + MAGENTA = '\033[35m' + CYAN = '\033[36m' + WHITE = '\033[37m' + + # Background color + BG_BLACK = '\033[40m' + + # Reset code to return to default color 
+    RESET = '\033[0m'
+
+    name: str = ""
+    color: str = '\033[37m'
+
+    def log(self, message):
+        """
+        Log this as an info message, identifying the agent
+        """
+        color_code = self.BG_BLACK + self.color
+        message = f"[{self.name}] {message}"
+        logging.info(color_code + message + self.RESET)
diff --git a/week8/community_contributions/ensemble-joshua/agents/deals.py b/week8/community_contributions/ensemble-joshua/agents/deals.py
new file mode 100644
index 0000000..acfcb74
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/deals.py
@@ -0,0 +1,111 @@
+from pydantic import BaseModel
+from typing import List, Dict, Self  # Self requires Python 3.11+
+from bs4 import BeautifulSoup
+import re
+import feedparser
+from tqdm import tqdm
+import requests
+import time
+
+feeds = [
+    "https://www.dealnews.com/c142/Electronics/?rss=1",
+    "https://www.dealnews.com/c39/Computers/?rss=1",
+    "https://www.dealnews.com/c238/Automotive/?rss=1",
+    "https://www.dealnews.com/f1912/Smart-Home/?rss=1",
+    "https://www.dealnews.com/c196/Home-Garden/?rss=1",
+]
+
+def extract(html_snippet: str) -> str:
+    """
+    Use Beautiful Soup to clean up this HTML snippet and extract useful text
+    """
+    soup = BeautifulSoup(html_snippet, 'html.parser')
+    snippet_div = soup.find('div', class_='snippet summary')
+
+    if snippet_div:
+        description = snippet_div.get_text(strip=True)
+        description = BeautifulSoup(description, 'html.parser').get_text()
+        description = re.sub('<[^<]+?>', '', description)  # belt-and-braces strip of any leftover tags
+        result = description.strip()
+    else:
+        result = html_snippet
+    return result.replace('\n', ' ')
+
+class ScrapedDeal:
+    """
+    A class to represent a Deal retrieved from an RSS feed
+    """
+    category: str
+    title: str
+    summary: str
+    url: str
+    details: str
+    features: str
+
+    def __init__(self, entry: Dict[str, str]):
+        """
+        Populate this instance based on the provided dict
+        """
+        self.title = entry['title']
+        self.summary = extract(entry['summary'])
+        self.url = entry['links'][0]['href']
+        # Fetch the full deal page and pull out the body text
+        stuff = requests.get(self.url).content
+        soup = BeautifulSoup(stuff, 'html.parser')
+        content = soup.find('div', class_='content-section').get_text()
+        content = content.replace('\nmore', '').replace('\n', ' ')
+        if "Features" in content:
+            self.details, self.features = content.split("Features", 1)
+        else:
+            self.details = content
+            self.features = ""
+
+    def __repr__(self):
+        """
+        Return a string to describe this deal
+        """
+        return f"<{self.title}>"
+
+    def describe(self):
+        """
+        Return a longer string to describe this deal for use in calling a model
+        """
+        return f"Title: {self.title}\nDetails: {self.details.strip()}\nFeatures: {self.features.strip()}\nURL: {self.url}"
+
+    @classmethod
+    def fetch(cls, show_progress: bool = False) -> List[Self]:
+        """
+        Retrieve all deals from the selected RSS feeds
+        """
+        deals = []
+        feed_iter = tqdm(feeds) if show_progress else feeds
+        for feed_url in feed_iter:
+            feed = feedparser.parse(feed_url)
+            for entry in feed.entries[:10]:
+                deals.append(cls(entry))
+                time.sleep(0.5)  # brief pause between page fetches to be kind to the server
+        return deals
+
+class Deal(BaseModel):
+    """
+    A class to represent a Deal with a summary description
+    """
+    product_description: str
+    price: float
+    url: str
+
+class DealSelection(BaseModel):
+    """
+    A class to represent a list of Deals
+    """
+    deals: List[Deal]
+
+class Opportunity(BaseModel):
+    """
+    A class to represent a possible opportunity: a Deal where we estimate
+    it should cost more than it's being offered
+    """
+    deal: Deal
+    estimate: float
+    discount: float
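+
+if __name__ == "__main__":
+    # Ad-hoc smoke test - a sketch, assuming network access to the DealNews feeds:
+    # pull the feeds and print the first deal in its model-ready format.
+    scraped = ScrapedDeal.fetch(show_progress=True)
+    print(f"Fetched {len(scraped)} deals")
+    if scraped:
+        print(scraped[0].describe())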
diff --git a/week8/community_contributions/ensemble-joshua/agents/ensemble_agent.py b/week8/community_contributions/ensemble-joshua/agents/ensemble_agent.py
new file mode 100644
index 0000000..84add6b
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/ensemble_agent.py
@@ -0,0 +1,57 @@
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+import joblib
+import os
+
+from agents.agent import Agent
+from agents.specialist_agent import SpecialistAgent
+from agents.frontier_agent import FrontierAgent
+from agents.random_forest_agent import RandomForestAgent
+
+class EnsembleAgent(Agent):
+
+    name = "Ensemble Agent"
+    color = Agent.YELLOW
+
+    def __init__(self, collection):
+        """
+        Create an instance of Ensemble, by creating each of the models
+        and loading the weights of the Ensemble
+        """
+        self.log("Initializing Ensemble Agent")
+        self.specialist = SpecialistAgent()
+        self.frontier = FrontierAgent(collection)
+        self.random_forest = RandomForestAgent()
+        # Resolve model path: prefer the copy in this contribution folder, then
+        # the community_contributions folder, then the current working directory
+        candidate_paths = [
+            os.path.join(os.path.dirname(os.path.dirname(__file__)), 'ensemble_model.pkl'),
+            os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'ensemble_model.pkl'),
+            'ensemble_model.pkl',
+        ]
+        model_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[-1])
+        self.model = joblib.load(model_path)
+        self.log("Ensemble Agent is ready")
+
+    def price(self, description: str) -> float:
+        """
+        Run this ensemble model
+        Ask each of the models to price the product
+        Then use the Linear Regression model to return the weighted price
+        :param description: the description of a product
+        :return: an estimate of its price
+        """
+        self.log("Running Ensemble Agent - collaborating with specialist, frontier and random forest agents")
+        specialist = self.specialist.price(description)
+        frontier = self.frontier.price(description)
+        random_forest = self.random_forest.price(description)
+        # Stack the three predictions (plus their min and max) into the feature row
+        # that the offline-fitted LinearRegression expects
+        X = pd.DataFrame({
+            'Specialist': [specialist],
+            'Frontier': [frontier],
+            'RandomForest': [random_forest],
+            'Min': [min(specialist, frontier, random_forest)],
+            'Max': [max(specialist, frontier, random_forest)],
+        })
+        y = max(0, self.model.predict(X)[0])
+        self.log(f"Ensemble Agent complete - returning ${y:.2f}")
+        return y
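+
+# Example wiring (a sketch - assumes a Chroma collection of product embeddings
+# has already been built, as elsewhere in week 8):
+#
+#   import chromadb
+#   client = chromadb.PersistentClient(path="products_vectorstore")
+#   collection = client.get_or_create_collection("products")
+#   agent = EnsembleAgent(collection)
+#   print(agent.price("Quadcast HyperX condenser mic for podcasters"))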
diff --git a/week8/community_contributions/ensemble-joshua/agents/messaging_agent.py b/week8/community_contributions/ensemble-joshua/agents/messaging_agent.py
new file mode 100644
index 0000000..2c51d8d
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/messaging_agent.py
@@ -0,0 +1,80 @@
+import os
+# from twilio.rest import Client
+from agents.deals import Opportunity
+import http.client
+import urllib.parse
+from agents.agent import Agent
+
+# Uncomment the Twilio lines if you wish to use Twilio
+
+DO_TEXT = False
+DO_PUSH = True
+
+class MessagingAgent(Agent):
+
+    name = "Messaging Agent"
+    color = Agent.WHITE
+
+    def __init__(self):
+        """
+        Set up this object to either do push notifications via Pushover,
+        or SMS via Twilio,
+        whichever is specified in the constants
+        """
+        self.log("Messaging Agent is initializing")
+        if DO_TEXT:
+            account_sid = os.getenv('TWILIO_ACCOUNT_SID', 'your-sid-if-not-using-env')
+            auth_token = os.getenv('TWILIO_AUTH_TOKEN', 'your-auth-if-not-using-env')
+            self.me_from = os.getenv('TWILIO_FROM', 'your-phone-number-if-not-using-env')
+            self.me_to = os.getenv('MY_PHONE_NUMBER', 'your-phone-number-if-not-using-env')
+            # self.client = Client(account_sid, auth_token)
+            self.log("Messaging Agent has initialized Twilio")
+        if DO_PUSH:
+            self.pushover_user = os.getenv('PUSHOVER_USER', 'your-pushover-user-if-not-using-env')
+            self.pushover_token = os.getenv('PUSHOVER_TOKEN', 'your-pushover-token-if-not-using-env')
+            self.log("Messaging Agent has initialized Pushover")
+
+    def message(self, text):
+        """
+        Send an SMS message using the Twilio API
+        """
+        self.log("Messaging Agent is sending a text message")
+        message = self.client.messages.create(
+            from_=self.me_from,
+            body=text,
+            to=self.me_to
+        )
+
+    def push(self, text):
+        """
+        Send a Push Notification using the Pushover API
+        """
+        self.log("Messaging Agent is sending a push notification")
+        conn = http.client.HTTPSConnection("api.pushover.net:443")
+        conn.request("POST", "/1/messages.json",
+                     urllib.parse.urlencode({
+                         "token": self.pushover_token,
+                         "user": self.pushover_user,
+                         "message": text,
+                         "sound": "cashregister"
+                     }), {"Content-type": "application/x-www-form-urlencoded"})
+        conn.getresponse()
+
+    def alert(self, opportunity: Opportunity):
+        """
+        Make an alert about the specified Opportunity
+        """
+        text = f"Deal Alert! Price=${opportunity.deal.price:.2f}, "
+        text += f"Estimate=${opportunity.estimate:.2f}, "
+        text += f"Discount=${opportunity.discount:.2f} :"
+        text += opportunity.deal.product_description[:10]+'... '
+        text += opportunity.deal.url
+        if DO_TEXT:
+            self.message(text)
+        if DO_PUSH:
+            self.push(text)
+        self.log("Messaging Agent has completed")
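+
+if __name__ == "__main__":
+    # Sketch of a standalone test - assumes PUSHOVER_USER and PUSHOVER_TOKEN are
+    # set in the environment (DO_PUSH is True by default):
+    agent = MessagingAgent()
+    agent.push("Test push from the Messaging Agent")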
diff --git a/week8/community_contributions/ensemble-joshua/agents/planning_agent.py b/week8/community_contributions/ensemble-joshua/agents/planning_agent.py
new file mode 100644
index 0000000..891a06d
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/planning_agent.py
@@ -0,0 +1,58 @@
+from typing import Optional, List
+from agents.agent import Agent
+from agents.deals import ScrapedDeal, DealSelection, Deal, Opportunity
+from agents.scanner_agent import ScannerAgent
+from agents.ensemble_agent import EnsembleAgent
+from agents.messaging_agent import MessagingAgent
+
+
+class PlanningAgent(Agent):
+
+    name = "Planning Agent"
+    color = Agent.GREEN
+    DEAL_THRESHOLD = 50  # dollars of discount required before we send an alert
+
+    def __init__(self, collection):
+        """
+        Create instances of the 3 Agents that this planner coordinates across
+        """
+        self.log("Planning Agent is initializing")
+        self.scanner = ScannerAgent()
+        self.ensemble = EnsembleAgent(collection)
+        self.messenger = MessagingAgent()
+        self.log("Planning Agent is ready")
+
+    def run(self, deal: Deal) -> Opportunity:
+        """
+        Run the workflow for a particular deal
+        :param deal: the deal, summarized from an RSS scrape
+        :returns: an opportunity including the discount
+        """
+        self.log("Planning Agent is pricing up a potential deal")
+        estimate = self.ensemble.price(deal.product_description)
+        discount = estimate - deal.price
+        self.log(f"Planning Agent has processed a deal with discount ${discount:.2f}")
+        return Opportunity(deal=deal, estimate=estimate, discount=discount)
+
+    def plan(self, memory: List[Opportunity] = []) -> Optional[Opportunity]:
+        """
+        Run the full workflow:
+        1. Use the ScannerAgent to find deals from RSS feeds
+        2. Use the EnsembleAgent to estimate them
+        3. Use the MessagingAgent to send a notification of deals
+        :param memory: opportunities that have been surfaced in the past
+        :return: an Opportunity if one was surfaced, otherwise None
+        """
+        self.log("Planning Agent is kicking off a run")
+        selection = self.scanner.scan(memory=memory)
+        if selection:
+            opportunities = [self.run(deal) for deal in selection.deals[:5]]
+            opportunities.sort(key=lambda opp: opp.discount, reverse=True)
+            best = opportunities[0]
+            self.log(f"Planning Agent has identified the best deal, with discount ${best.discount:.2f}")
+            if best.discount > self.DEAL_THRESHOLD:
+                self.messenger.alert(best)
+            self.log("Planning Agent has completed a run")
+            return best if best.discount > self.DEAL_THRESHOLD else None
+        return None
diff --git a/week8/community_contributions/ensemble-joshua/agents/random_forest_agent.py b/week8/community_contributions/ensemble-joshua/agents/random_forest_agent.py
new file mode 100644
index 0000000..a114f3a
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/random_forest_agent.py
@@ -0,0 +1,46 @@
+# imports
+
+import os
+from sentence_transformers import SentenceTransformer
+import joblib
+from agents.agent import Agent
+
+
+class RandomForestAgent(Agent):
+
+    name = "Random Forest Agent"
+    color = Agent.MAGENTA
+
+    def __init__(self):
+        """
+        Initialize this object by loading in the saved model weights
+        and the SentenceTransformer vector encoding model
+        """
+        self.log("Random Forest Agent is initializing")
+        self.vectorizer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        # Resolve model path: prefer the copy in this contribution folder, then
+        # the community_contributions folder, then the current working directory
+        candidate_paths = [
+            os.path.join(os.path.dirname(os.path.dirname(__file__)), 'random_forest_model.pkl'),
+            os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'random_forest_model.pkl'),
+            'random_forest_model.pkl',
+        ]
+        model_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[-1])
+        self.model = joblib.load(model_path)
+        self.log("Random Forest Agent is ready")
+
+    def price(self, description: str) -> float:
+        """
+        Use a Random Forest model to estimate the price of the described item
+        :param description: the product to be estimated
+        :return: the price as a float
+        """
+        self.log("Random Forest Agent is starting a prediction")
+        vector = self.vectorizer.encode([description])
+        result = max(0, self.model.predict(vector)[0])
+        self.log(f"Random Forest Agent completed - predicting ${result:.2f}")
+        return result
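+
+if __name__ == "__main__":
+    # Quick sketch of standalone usage - assumes random_forest_model.pkl exists on
+    # one of the candidate paths above (it is trained separately in week 8):
+    agent = RandomForestAgent()
+    print(agent.price("Dell XPS Desktop, Intel Core i7, 8GB RAM, 1TB HDD"))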
diff --git a/week8/community_contributions/ensemble-joshua/agents/scanner_agent.py b/week8/community_contributions/ensemble-joshua/agents/scanner_agent.py
new file mode 100644
index 0000000..2b34207
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/agents/scanner_agent.py
@@ -0,0 +1,95 @@
+import os
+import json
+from typing import Optional, List
+from openai import OpenAI
+from agents.deals import ScrapedDeal, DealSelection, Opportunity
+from agents.agent import Agent
+
+
+class ScannerAgent(Agent):
+
+    MODEL = "gpt-4o-mini"
+
+    SYSTEM_PROMPT = """You identify and summarize the 5 most detailed deals from a list, by selecting deals that have the most detailed, high quality description and the clearest price.
+    Respond strictly in JSON with no explanation, using this format. You should provide the price as a number derived from the description. If the price of a deal isn't clear, do not include that deal in your response.
+    Most important is that you respond with the 5 deals that have the most detailed product description with price. It's not important to mention the terms of the deal; most important is a thorough description of the product.
+    Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.
+
+    {"deals": [
+        {
+            "product_description": "Your clearly expressed summary of the product in 4-5 sentences. Details of the item are much more important than why it's a good deal. Avoid mentioning discounts and coupons; focus on the item itself. There should be a paragraph of text for each item you choose.",
+            "price": 99.99,
+            "url": "the url as provided"
+        },
+        ...
+    ]}"""
+
+    USER_PROMPT_PREFIX = """Respond with the most promising 5 deals from this list, selecting those which have the most detailed, high quality product description and a clear price that is greater than 0.
+    Respond strictly in JSON, and only JSON. You should rephrase the description to be a summary of the product itself, not the terms of the deal.
+    Remember to respond with a paragraph of text in the product_description field for each of the 5 items that you select.
+    Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.
+
+    Deals:
+
+    """
+
+    USER_PROMPT_SUFFIX = "\n\nStrictly respond in JSON and include exactly 5 deals, no more."
+
+    name = "Scanner Agent"
+    color = Agent.CYAN
+
+    def __init__(self):
+        """
+        Set up this instance by initializing OpenAI
+        """
+        self.log("Scanner Agent is initializing")
+        self.openai = OpenAI()
+        self.log("Scanner Agent is ready")
+
+    def fetch_deals(self, memory: List[Opportunity]) -> List[ScrapedDeal]:
+        """
+        Look up deals published on RSS feeds
+        Return any new deals that are not already in the memory provided
+        :param memory: past Opportunities; their deal URLs are excluded from the results
+        """
+        self.log("Scanner Agent is about to fetch deals from RSS feed")
+        urls = [opp.deal.url for opp in memory]
+        scraped = ScrapedDeal.fetch()
+        result = [scrape for scrape in scraped if scrape.url not in urls]
+        self.log(f"Scanner Agent received {len(result)} deals not already scraped")
+        return result
+
+    def make_user_prompt(self, scraped) -> str:
+        """
+        Create a user prompt for OpenAI based on the scraped deals provided
+        """
+        user_prompt = self.USER_PROMPT_PREFIX
+        user_prompt += '\n\n'.join([scrape.describe() for scrape in scraped])
+        user_prompt += self.USER_PROMPT_SUFFIX
+        return user_prompt
+
+    def scan(self, memory: List[Opportunity] = []) -> Optional[DealSelection]:
+        """
+        Call OpenAI to provide a high potential list of deals with good descriptions and prices
+        Use Structured Outputs to ensure the response conforms to our specifications
+        :param memory: Opportunities already surfaced, whose URLs should be skipped
+        :return: a selection of good deals, or None if there aren't any
+        """
+        scraped = self.fetch_deals(memory)
+        if scraped:
+            user_prompt = self.make_user_prompt(scraped)
+            self.log("Scanner Agent is calling OpenAI using Structured Output")
+            result = self.openai.beta.chat.completions.parse(
+                model=self.MODEL,
+                messages=[
+                    {"role": "system", "content": self.SYSTEM_PROMPT},
+                    {"role": "user", "content": user_prompt}
+                ],
+                response_format=DealSelection
+            )
+            result = result.choices[0].message.parsed
+            result.deals = [deal for deal in result.deals if deal.price > 0]
+            self.log(f"Scanner Agent received {len(result.deals)} selected deals with price > 0 from OpenAI")
+            return result
+        return None
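+
+# For reference, DealSelection (defined in agents/deals.py) is the Pydantic schema
+# that the structured-output call above parses into. A sketch of its assumed shape,
+# inferred from the field accesses in this file and from the prompt's JSON format:
+#
+#     class Deal(BaseModel):
+#         product_description: str
+#         price: float
+#         url: str
+#
+#     class DealSelection(BaseModel):
+#         deals: List[Deal]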
+ self.log(f"Scanner Agent received {len(result.deals)} selected deals with price>0 from OpenAI") + return result + return None + + diff --git a/week8/community_contributions/ensemble-joshua/api.py b/week8/community_contributions/ensemble-joshua/api.py new file mode 100644 index 0000000..0b604ea --- /dev/null +++ b/week8/community_contributions/ensemble-joshua/api.py @@ -0,0 +1,75 @@ +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import os +import chromadb + +from agents.specialist_agent import SpecialistAgent +from agents.frontier_agent import FrontierAgent +from agents.random_forest_agent import RandomForestAgent +from agents.ensemble_agent import EnsembleAgent +from deal_agent_framework import DealAgentFramework + + +class PriceRequest(BaseModel): + description: str + + +class DealScanResponse(BaseModel): + opportunities: list + + +DB_PATH = os.path.join(os.path.dirname(__file__), "../../products_vectorstore") +client = chromadb.PersistentClient(path=DB_PATH) +collection = client.get_or_create_collection("products") + +app = FastAPI(title="Week8 Pricer API", version="1.0.0") + + +@app.get("/healthz") +def healthz(): + return {"ok": True} + + +@app.post("/price/specialist") +def price_specialist(body: PriceRequest): + if not body.description: + raise HTTPException(400, "description is required") + agent = SpecialistAgent() + price = float(agent.price(body.description)) + return {"price": price, "agent": "specialist"} + + +@app.post("/price/frontier") +def price_frontier(body: PriceRequest): + if not body.description: + raise HTTPException(400, "description is required") + agent = FrontierAgent(collection) + price = float(agent.price(body.description)) + return {"price": price, "agent": "frontier"} + + +@app.post("/price/random_forest") +def price_random_forest(body: PriceRequest): + if not body.description: + raise HTTPException(400, "description is required") + agent = RandomForestAgent() + price = float(agent.price(body.description)) + return {"price": price, "agent": "random_forest"} + + +@app.post("/price/ensemble") +def price_ensemble(body: PriceRequest): + if not body.description: + raise HTTPException(400, "description is required") + agent = EnsembleAgent(collection) + price = float(agent.price(body.description)) + return {"price": price, "agent": "ensemble"} + + +@app.post("/deals/scan") +def deals_scan(): + framework = DealAgentFramework() + opportunities = framework.run() + return {"count": len(opportunities), "opportunities": [o.dict() for o in opportunities]} + + diff --git a/week8/community_contributions/ensemble-joshua/ensemble_model.pkl b/week8/community_contributions/ensemble-joshua/ensemble_model.pkl new file mode 100644 index 0000000..94efeec Binary files /dev/null and b/week8/community_contributions/ensemble-joshua/ensemble_model.pkl differ diff --git a/week8/community_contributions/ensemble-joshua/frontier_agent.py b/week8/community_contributions/ensemble-joshua/frontier_agent.py new file mode 100644 index 0000000..e1e9858 --- /dev/null +++ b/week8/community_contributions/ensemble-joshua/frontier_agent.py @@ -0,0 +1,150 @@ +# imports + +import os +import re +import math +import json +from typing import List, Dict +from openai import OpenAI +try: + from openai import APIStatusError + APIStatusError = Exception +import statistics +from sentence_transformers import SentenceTransformer +from datasets import load_dataset +import chromadb +from items import Item +from testing import Tester +from agents.agent import Agent + + +class 
diff --git a/week8/community_contributions/ensemble-joshua/ensemble_model.pkl b/week8/community_contributions/ensemble-joshua/ensemble_model.pkl
new file mode 100644
index 0000000..94efeec
Binary files /dev/null and b/week8/community_contributions/ensemble-joshua/ensemble_model.pkl differ
diff --git a/week8/community_contributions/ensemble-joshua/frontier_agent.py b/week8/community_contributions/ensemble-joshua/frontier_agent.py
new file mode 100644
index 0000000..e1e9858
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/frontier_agent.py
@@ -0,0 +1,150 @@
+# imports
+
+import os
+import re
+import statistics
+from typing import List, Dict
+from openai import OpenAI
+try:
+    from openai import APIStatusError
+except ImportError:
+    # Older openai releases don't export APIStatusError; fall back to Exception
+    APIStatusError = Exception
+from sentence_transformers import SentenceTransformer
+from agents.agent import Agent
+
+
+class FrontierAgent(Agent):
+
+    name = "Frontier Agent"
+    color = Agent.BLUE
+
+    MODEL = "gpt-4o-mini"
+
+    def __init__(self, collection):
+        """
+        Set up this instance by connecting to OpenAI or DeepSeek, connecting to the
+        Chroma datastore, and setting up the vector encoding model
+        """
+        self.log("Initializing Frontier Agent")
+        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
+        if deepseek_api_key:
+            self.client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")
+            self.MODEL = "deepseek-chat"
+            self.log("Frontier Agent is set up with DeepSeek")
+        else:
+            self.client = OpenAI()
+            self.MODEL = "gpt-4o-mini"
+            self.log("Frontier Agent is set up with OpenAI")
+        self.collection = collection
+        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        self.log("Frontier Agent is ready")
+
+    def make_context(self, similars: List[str], prices: List[float]) -> str:
+        """
+        Create context that can be inserted into the prompt
+        :param similars: similar products to the one being estimated
+        :param prices: prices of the similar products
+        :return: text to insert in the prompt that provides context
+        """
+        message = "To provide some context, here are some other items that might be similar to the item you need to estimate.\n\n"
+        for similar, price in zip(similars, prices):
+            message += f"Potentially related product:\n{similar}\nPrice is ${price:.2f}\n\n"
+        return message
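+
+    # Illustrative output: with similars=["Sony WH-1000XM4 wireless headphones"]
+    # and prices=[248.00], make_context returns roughly:
+    #
+    #   To provide some context, here are some other items that might be similar to the item you need to estimate.
+    #
+    #   Potentially related product:
+    #   Sony WH-1000XM4 wireless headphones
+    #   Price is $248.00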
+    def messages_for(self, description: str, similars: List[str], prices: List[float]) -> List[Dict[str, str]]:
+        """
+        Create the message list to be included in a call to OpenAI,
+        with the system and user prompt
+        :param description: a description of the product
+        :param similars: similar products to this one
+        :param prices: prices of similar products
+        :return: the list of messages in the format expected by OpenAI
+        """
+        system_message = "You estimate prices of items. Reply only with the price, no explanation"
+        user_prompt = self.make_context(similars, prices)
+        user_prompt += "And now the question for you:\n\n"
+        user_prompt += "How much does this cost?\n\n" + description
+        return [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": user_prompt},
+            {"role": "assistant", "content": "Price is $"}
+        ]
+
+    def find_similars(self, description: str):
+        """
+        Return a list of items similar to the given one by looking in the Chroma datastore
+        """
+        self.log("Frontier Agent is performing a RAG search of the Chroma datastore to find 5 similar products")
+        vector = self.model.encode([description])
+        results = self.collection.query(query_embeddings=vector.astype(float).tolist(), n_results=5)
+        documents = results['documents'][0]
+        prices = [m['price'] for m in results['metadatas'][0]]
+        self.log("Frontier Agent has found similar products")
+        return documents, prices
+
+    def get_price(self, s) -> float:
+        """
+        A utility that plucks a floating point number out of a string
+        """
+        s = s.replace('$', '').replace(',', '')
+        match = re.search(r"[-+]?\d*\.\d+|\d+", s)
+        return float(match.group()) if match else 0.0
+
+    def price(self, description: str) -> float:
+        """
+        Make a call to OpenAI or DeepSeek to estimate the price of the described product,
+        by looking up 5 similar products and including them in the prompt to give context
+        :param description: a description of the product
+        :return: an estimate of the price
+        """
+        documents, prices = self.find_similars(description)
+
+        # If external calls are disabled, or the LLM call fails, fall back to a heuristic
+        allow_external = os.getenv("FRONTIER_ALLOW_EXTERNAL", "true").lower() in {"1", "true", "yes"}
+
+        def heuristic_price() -> float:
+            # Median of the similar items' prices: a robust central tendency fallback
+            if prices:
+                try:
+                    return float(statistics.median(prices))
+                except Exception:
+                    return float(sum(prices) / max(len(prices), 1))
+            # As a last resort, return 0.0
+            return 0.0
+
+        if not allow_external:
+            self.log("External LLM calls disabled via FRONTIER_ALLOW_EXTERNAL; using heuristic fallback")
+            result = heuristic_price()
+            self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
+            return result
+
+        self.log(f"Frontier Agent is about to call {self.MODEL} with context including 5 similar products")
+        try:
+            response = self.client.chat.completions.create(
+                model=self.MODEL,
+                messages=self.messages_for(description, documents, prices),
+                seed=42,
+                max_tokens=5,
+            )
+            reply = response.choices[0].message.content
+            result = self.get_price(reply)
+            self.log(f"Frontier Agent completed - predicting ${result:.2f}")
+            return result
+        except APIStatusError as e:  # insufficient balance or other HTTP errors
+            msg = getattr(e, "message", str(e))
+            self.log(f"Frontier Agent API error: {msg}. Falling back to heuristic price.")
+            result = heuristic_price()
+            self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
+            return result
+        except Exception as e:
+            self.log(f"Frontier Agent unexpected error: {e}. Falling back to heuristic price.")
+            result = heuristic_price()
+            self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
+            return result
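+
+# Usage note (illustrative): setting the environment variable
+#     FRONTIER_ALLOW_EXTERNAL=false
+# makes price() skip the LLM call and return the median of the five RAG-retrieved
+# prices, which is handy for offline testing, e.g.
+#     agent = FrontierAgent(collection)   # collection: a populated Chroma collection
+#     agent.price("Bosch 800 Series dishwasher")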
diff --git a/week8/community_contributions/ensemble-joshua/pricer_service2.py b/week8/community_contributions/ensemble-joshua/pricer_service2.py
new file mode 100644
index 0000000..8bdd854
--- /dev/null
+++ b/week8/community_contributions/ensemble-joshua/pricer_service2.py
@@ -0,0 +1,98 @@
+import modal
+from modal import App, Volume, Image
+
+
+app = modal.App("pricer-service")
+image = Image.debian_slim().pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")
+
+secrets = [modal.Secret.from_name("hf-secret")]
+
+# Constants
+GPU = "T4"
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
+PROJECT_NAME = "pricer"
+HF_USER = "ed-donner"
+RUN_NAME = "2024-09-13_13.04.39"
+PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
+REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
+FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
+CACHE_DIR = "/cache"
+
+MIN_CONTAINERS = 0
+
+QUESTION = "How much does this cost to the nearest dollar?"
+PREFIX = "Price is $"
+
+hf_cache_volume = Volume.from_name("hf-hub-cache", create_if_missing=True)
+
+
+@app.cls(
+    image=image.env({"HF_HUB_CACHE": CACHE_DIR}),
+    secrets=secrets,
+    gpu=GPU,
+    timeout=1800,
+    min_containers=MIN_CONTAINERS,
+    volumes={CACHE_DIR: hf_cache_volume}
+)
+class Pricer:
+
+    @modal.enter()
+    def setup(self):
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+        from peft import PeftModel
+
+        # Quantization config: load the base model in 4-bit NF4 to fit on a T4
+        quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load the tokenizer and quantized base model, then apply the fine-tuned adapter
+        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.tokenizer.padding_side = "right"
+        self.base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL,
+            quantization_config=quant_config,
+            device_map="auto"
+        )
+        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_MODEL, revision=REVISION)
+
+    @modal.method()
+    def price(self, description: str) -> float:
+        import re
+        import torch
+        from transformers import set_seed
+
+        set_seed(42)
+        prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
+        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
+        attention_mask = torch.ones(inputs.shape, device="cuda")
+        outputs = self.fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
+        result = self.tokenizer.decode(outputs[0])
+
+        # Keep only what the model generated after the prompt's "Price is $" marker
+        contents = result.split(PREFIX)[1]
+        contents = contents.replace(',', '')
+        match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
+        return float(match.group()) if match else 0.0
+
+
+# Simple HTTP endpoint so external apps can call this on Modal
+@app.function(image=image, secrets=secrets, gpu=GPU, timeout=1800)
+@modal.web_endpoint(method="POST")
+def price_http(body: dict):
+    """HTTP endpoint: {"description": str} -> {"price": float}"""
+    description = body.get("description", '').strip()
+    if not description:
+        return {"error": "Missing 'description'"}
+
+    pricer = Pricer()
+    value = pricer.price.remote(description)
+    return {"price": float(value)}
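+
+# Deployment sketch (assumed commands; the standard Modal workflow):
+#
+#     modal deploy pricer_service2.py
+#
+# After deployment the class can be called from Python (the exact lookup API varies
+# by Modal version; modal.Cls.from_name is the current spelling):
+#
+#     import modal
+#     Pricer = modal.Cls.from_name("pricer-service", "Pricer")
+#     print(Pricer().price.remote("Shure SM7B dynamic microphone"))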