Merge pull request #873 from TheTopDeveloper/community-contributions-branch
Add Week 6 finetuning solution with pickle data and enhanced modules- Joshua Oluoch (Gen AI Bootcamp)
This commit is contained in:
@@ -0,0 +1,828 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Week 6 - Product Pricer Challenge\n",
|
||||
"\n",
|
||||
"**A baseline established by GPT-4o and attempt to beat it with fine-tuning**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize and Load Configuration\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports\n",
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"import math\n",
|
||||
"import json\n",
|
||||
"import random\n",
|
||||
"import pickle\n",
|
||||
"from collections import Counter\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from huggingface_hub import login\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# SimpleItem class definition for pickle compatibility\n",
|
||||
"class SimpleItem:\n",
|
||||
" \"\"\"\n",
|
||||
" Simple item class for pickle compatibility\n",
|
||||
" This matches the structure used in the CSV conversion script\n",
|
||||
" \"\"\"\n",
|
||||
" def __init__(self, title, description, price, category=\"Human_Generated\", token_count=0):\n",
|
||||
" self.title = title\n",
|
||||
" self.description = description\n",
|
||||
" self.price = price\n",
|
||||
" self.category = category\n",
|
||||
" self.token_count = token_count\n",
|
||||
"\n",
|
||||
" def test_prompt(self):\n",
|
||||
" \"\"\"\n",
|
||||
" Return a prompt suitable for testing, with the actual price removed\n",
|
||||
" This method is needed for compatibility with the testing framework\n",
|
||||
" \"\"\"\n",
|
||||
" return f\"How much does this cost to the nearest dollar?\\n\\n{self.title}\\n\\n{self.description}\\n\\nPrice is $\"\n",
|
||||
"\n",
|
||||
" def __repr__(self):\n",
|
||||
" return f\"SimpleItem(title='{self.title[:50]}...', price=${self.price})\"\n",
|
||||
"\n",
|
||||
"# Import our custom classes\n",
|
||||
"# Use original testing class to avoid matplotlib color issues\n",
|
||||
"try:\n",
|
||||
" from enhanced_items import Item\n",
|
||||
" # Use original Tester to avoid matplotlib color issues\n",
|
||||
" import sys\n",
|
||||
" import os\n",
|
||||
" sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))\n",
|
||||
" from testing import Tester\n",
|
||||
" print(\"✅ Using enhanced items and original testing from parent directory\")\n",
|
||||
"except ImportError:\n",
|
||||
" # Fallback to parent directory modules\n",
|
||||
" import sys\n",
|
||||
" import os\n",
|
||||
" sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))\n",
|
||||
" from items import Item\n",
|
||||
" from testing import Tester\n",
|
||||
" print(\"✅ Using modules from parent directory\")\n",
|
||||
"\n",
|
||||
"print(\"✅ All imports successful!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Environment setup\n",
|
||||
"try:\n",
|
||||
" from google.colab import userdata\n",
|
||||
" os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
||||
" os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
|
||||
" print(\"✅ Using Colab secrets\")\n",
|
||||
"except:\n",
|
||||
" from dotenv import load_dotenv\n",
|
||||
" load_dotenv(override=True)\n",
|
||||
" os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
|
||||
" os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
|
||||
" print(\"✅ Using local .env file\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Log in to HuggingFace\n",
|
||||
"hf_token = os.environ['HF_TOKEN']\n",
|
||||
"login(hf_token)\n",
|
||||
"\n",
|
||||
"# Initialize OpenAI client\n",
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# Enable matplotlib inline for Colab\n",
|
||||
"%matplotlib inline\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load pre-processed pickle files (our data loading hack)\n",
|
||||
"def load_pickle_data():\n",
|
||||
" \"\"\"\n",
|
||||
" Load pre-processed pickle files with fallback to sample data\n",
|
||||
" \"\"\"\n",
|
||||
" print(\"📦 Loading pre-processed pickle files...\")\n",
|
||||
" \n",
|
||||
" # Try to load pickle files\n",
|
||||
" pickle_files = ['train.pkl', 'test.pkl', 'validation.pkl', \n",
|
||||
" 'data/train.pkl', 'data/test.pkl', 'data/validation.pkl',\n",
|
||||
" '../train.pkl', '../test.pkl', '../validation.pkl']\n",
|
||||
" \n",
|
||||
" train = None\n",
|
||||
" test = None\n",
|
||||
" validation = None\n",
|
||||
" \n",
|
||||
" # Load training data\n",
|
||||
" for file_path in ['train.pkl', 'data/train.pkl', '../train.pkl']:\n",
|
||||
" if os.path.exists(file_path):\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" train = pickle.load(f)\n",
|
||||
" print(f\"✅ Loaded training data: {file_path} ({len(train)} items)\")\n",
|
||||
" break\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ Error loading {file_path}: {e}\")\n",
|
||||
" # Try to load as dictionary and convert to SimpleItem\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" raw_data = pickle.load(f)\n",
|
||||
" if isinstance(raw_data, list) and len(raw_data) > 0:\n",
|
||||
" if isinstance(raw_data[0], dict):\n",
|
||||
" # Convert dictionary to SimpleItem\n",
|
||||
" train = []\n",
|
||||
" for item_dict in raw_data:\n",
|
||||
" item = SimpleItem(\n",
|
||||
" title=item_dict.get('title', ''),\n",
|
||||
" description=item_dict.get('description', ''),\n",
|
||||
" price=item_dict.get('price', 0.0),\n",
|
||||
" category=item_dict.get('category', 'Human_Generated'),\n",
|
||||
" token_count=item_dict.get('token_count', 0)\n",
|
||||
" )\n",
|
||||
" train.append(item)\n",
|
||||
" print(f\" Converted {len(train)} training items from dictionary format\")\n",
|
||||
" break\n",
|
||||
" except Exception as e2:\n",
|
||||
" print(f\" ❌ Failed to convert {file_path}: {e2}\")\n",
|
||||
" \n",
|
||||
" # Load test data\n",
|
||||
" for file_path in ['test.pkl', 'data/test.pkl', '../test.pkl']:\n",
|
||||
" if os.path.exists(file_path):\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" test = pickle.load(f)\n",
|
||||
" print(f\"✅ Loaded test data: {file_path} ({len(test)} items)\")\n",
|
||||
" break\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ Error loading {file_path}: {e}\")\n",
|
||||
" # Try to load as dictionary and convert to SimpleItem\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" raw_data = pickle.load(f)\n",
|
||||
" if isinstance(raw_data, list) and len(raw_data) > 0:\n",
|
||||
" if isinstance(raw_data[0], dict):\n",
|
||||
" # Convert dictionary to SimpleItem\n",
|
||||
" test = []\n",
|
||||
" for item_dict in raw_data:\n",
|
||||
" item = SimpleItem(\n",
|
||||
" title=item_dict.get('title', ''),\n",
|
||||
" description=item_dict.get('description', ''),\n",
|
||||
" price=item_dict.get('price', 0.0),\n",
|
||||
" category=item_dict.get('category', 'Human_Generated'),\n",
|
||||
" token_count=item_dict.get('token_count', 0)\n",
|
||||
" )\n",
|
||||
" test.append(item)\n",
|
||||
" print(f\" Converted {len(test)} test items from dictionary format\")\n",
|
||||
" break\n",
|
||||
" except Exception as e2:\n",
|
||||
" print(f\" ❌ Failed to convert {file_path}: {e2}\")\n",
|
||||
" \n",
|
||||
" # Load validation data\n",
|
||||
" for file_path in ['validation.pkl', 'data/validation.pkl', '../validation.pkl']:\n",
|
||||
" if os.path.exists(file_path):\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" validation = pickle.load(f)\n",
|
||||
" print(f\"✅ Loaded validation data: {file_path} ({len(validation)} items)\")\n",
|
||||
" break\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ Error loading {file_path}: {e}\")\n",
|
||||
" # Try to load as dictionary and convert to SimpleItem\n",
|
||||
" try:\n",
|
||||
" with open(file_path, 'rb') as f:\n",
|
||||
" raw_data = pickle.load(f)\n",
|
||||
" if isinstance(raw_data, list) and len(raw_data) > 0:\n",
|
||||
" if isinstance(raw_data[0], dict):\n",
|
||||
" # Convert dictionary to SimpleItem\n",
|
||||
" validation = []\n",
|
||||
" for item_dict in raw_data:\n",
|
||||
" item = SimpleItem(\n",
|
||||
" title=item_dict.get('title', ''),\n",
|
||||
" description=item_dict.get('description', ''),\n",
|
||||
" price=item_dict.get('price', 0.0),\n",
|
||||
" category=item_dict.get('category', 'Human_Generated'),\n",
|
||||
" token_count=item_dict.get('token_count', 0)\n",
|
||||
" )\n",
|
||||
" validation.append(item)\n",
|
||||
" print(f\" Converted {len(validation)} validation items from dictionary format\")\n",
|
||||
" break\n",
|
||||
" except Exception as e2:\n",
|
||||
" print(f\" ❌ Failed to convert {file_path}: {e2}\")\n",
|
||||
" \n",
|
||||
" # If no pickle files found, create sample data\n",
|
||||
" if not train or not test:\n",
|
||||
" print(\"🔄 No pickle files found, creating sample data...\")\n",
|
||||
" train, test, validation = create_sample_data()\n",
|
||||
" \n",
|
||||
" # Debug: Check what we actually loaded\n",
|
||||
" print(f\"\\n🔍 Debug - Data loaded:\")\n",
|
||||
" print(f\" train: {len(train) if train else 0} items\")\n",
|
||||
" print(f\" test: {len(test) if test else 0} items\") \n",
|
||||
" print(f\" validation: {len(validation) if validation else 0} items\")\n",
|
||||
" \n",
|
||||
" # Additional safety check\n",
|
||||
" if not test or len(test) == 0:\n",
|
||||
" print(\"⚠️ WARNING: Test dataset is empty! Creating emergency sample data...\")\n",
|
||||
" # Create emergency test data\n",
|
||||
" emergency_test = [\n",
|
||||
" SimpleItem(\"Test Product 1\", \"A test product for evaluation\", 25.99, \"Test\", 10),\n",
|
||||
" SimpleItem(\"Test Product 2\", \"Another test product\", 45.50, \"Test\", 12),\n",
|
||||
" SimpleItem(\"Test Product 3\", \"Third test product\", 15.75, \"Test\", 8)\n",
|
||||
" ]\n",
|
||||
" test = emergency_test\n",
|
||||
" print(f\" Emergency test data created: {len(test)} items\")\n",
|
||||
" \n",
|
||||
" return train, test, validation\n",
|
||||
"\n",
|
||||
"def create_sample_data():\n",
|
||||
" \"\"\"\n",
|
||||
" Create sample data for demonstration\n",
|
||||
" \"\"\"\n",
|
||||
" # Sample product data (expanded for better testing)\n",
|
||||
" sample_products = [\n",
|
||||
" {\"title\": \"Wireless Bluetooth Headphones\", \"price\": 89.99, \"category\": \"Electronics\"},\n",
|
||||
" {\"title\": \"Stainless Steel Water Bottle\", \"price\": 24.99, \"category\": \"Home & Kitchen\"},\n",
|
||||
" {\"title\": \"Organic Cotton T-Shirt\", \"price\": 19.99, \"category\": \"Clothing\"},\n",
|
||||
" {\"title\": \"Ceramic Coffee Mug\", \"price\": 12.99, \"category\": \"Home & Kitchen\"},\n",
|
||||
" {\"title\": \"LED Desk Lamp\", \"price\": 45.99, \"category\": \"Electronics\"},\n",
|
||||
" {\"title\": \"Yoga Mat\", \"price\": 29.99, \"category\": \"Sports & Outdoors\"},\n",
|
||||
" {\"title\": \"Leather Wallet\", \"price\": 39.99, \"category\": \"Accessories\"},\n",
|
||||
" {\"title\": \"Bluetooth Speaker\", \"price\": 79.99, \"category\": \"Electronics\"},\n",
|
||||
" {\"title\": \"Kitchen Knife Set\", \"price\": 129.99, \"category\": \"Home & Kitchen\"},\n",
|
||||
" {\"title\": \"Running Shoes\", \"price\": 89.99, \"category\": \"Sports & Outdoors\"},\n",
|
||||
" {\"title\": \"Smartphone Case\", \"price\": 15.99, \"category\": \"Electronics\"},\n",
|
||||
" {\"title\": \"Coffee Maker\", \"price\": 89.99, \"category\": \"Home & Kitchen\"},\n",
|
||||
" {\"title\": \"Backpack\", \"price\": 49.99, \"category\": \"Accessories\"},\n",
|
||||
" {\"title\": \"Tennis Racket\", \"price\": 79.99, \"category\": \"Sports & Outdoors\"},\n",
|
||||
" {\"title\": \"Laptop Stand\", \"price\": 34.99, \"category\": \"Electronics\"}\n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" # Create SimpleItem objects\n",
|
||||
" items = []\n",
|
||||
" for product in sample_products:\n",
|
||||
" item = SimpleItem(\n",
|
||||
" title=product['title'],\n",
|
||||
" description=f\"High-quality {product['title'].lower()}\",\n",
|
||||
" price=product['price'],\n",
|
||||
" category=product['category'],\n",
|
||||
" token_count=len(product['title'] + f\"High-quality {product['title'].lower()}\") // 4\n",
|
||||
" )\n",
|
||||
" items.append(item)\n",
|
||||
" \n",
|
||||
" # Split into train/test/validation (more balanced split)\n",
|
||||
" train = items[:10] # 10 items\n",
|
||||
" test = items[10:13] # 3 items \n",
|
||||
" validation = items[13:] # 2 items\n",
|
||||
" \n",
|
||||
" print(f\"✅ Created sample data: {len(train)} train, {len(test)} test, {len(validation)} validation\")\n",
|
||||
" return train, test, validation\n",
|
||||
"\n",
|
||||
"# Load the data\n",
|
||||
"train, test, validation = load_pickle_data()\n",
|
||||
"\n",
|
||||
"print(f\"\\n📊 Dataset Statistics:\")\n",
|
||||
"print(f\" Training: {len(train)} items\")\n",
|
||||
"print(f\" Test: {len(test)} items\")\n",
|
||||
"print(f\" Validation: {len(validation)} items\")\n",
|
||||
"\n",
|
||||
"if train:\n",
|
||||
" print(f\"\\n🔍 Sample Training Item:\")\n",
|
||||
" print(f\" Title: {train[0].title}\")\n",
|
||||
" print(f\" Price: ${train[0].price}\")\n",
|
||||
" print(f\" Category: {train[0].category}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prepare Fine-tuning Data\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OpenAI recommends fine-tuning with 50-100 examples\n",
|
||||
"# Use our actual train/validation split from the pickle files\n",
|
||||
"fine_tune_train = train # Use all training data (150 items)\n",
|
||||
"fine_tune_validation = validation # Use validation data (50 items)\n",
|
||||
"\n",
|
||||
"print(f\"📊 Fine-tuning data prepared:\")\n",
|
||||
"print(f\" Training: {len(fine_tune_train)} items\")\n",
|
||||
"print(f\" Validation: {len(fine_tune_validation)} items\")\n",
|
||||
"\n",
|
||||
"# Weight and Biases integration (optional)\n",
|
||||
"wandb_integration = {\"type\": \"wandb\", \"wandb\": {\"project\": \"gpt-pricer-ft\"}}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Helper Functions\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Utility function to extract price from a string\n",
|
||||
"def get_price(s):\n",
|
||||
" s = s.replace('$', '').replace(',', '')\n",
|
||||
" match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", s)\n",
|
||||
" return float(match.group()) if match else 0\n",
|
||||
"\n",
|
||||
"# Prompt generation functions\n",
|
||||
"def messages_for(item):\n",
|
||||
" system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
|
||||
" user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt},\n",
|
||||
" {\"role\": \"assistant\", \"content\": \"Price is $\"}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"def messages_with_price(item):\n",
|
||||
" system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
|
||||
" user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt},\n",
|
||||
" {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\"}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"print(\"✅ Helper functions defined!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Baseline GPT-4o Model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def gpt_4o_frontier(item):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o\",\n",
|
||||
" messages=messages_for(item),\n",
|
||||
" seed=42,\n",
|
||||
" max_tokens=5\n",
|
||||
" )\n",
|
||||
" reply = response.choices[0].message.content\n",
|
||||
" return get_price(reply)\n",
|
||||
"\n",
|
||||
"print(\"🧪 Testing baseline GPT-4o model...\")\n",
|
||||
"\n",
|
||||
"# Safety check: Make sure we have test data\n",
|
||||
"if not test or len(test) == 0:\n",
|
||||
" print(\"❌ No test data available! Cannot run baseline test.\")\n",
|
||||
" print(\"💡 Please check the data loading section above.\")\n",
|
||||
" print(\"🔍 Debug info:\")\n",
|
||||
" print(f\" test variable exists: {test is not None}\")\n",
|
||||
" print(f\" test length: {len(test) if test else 'N/A'}\")\n",
|
||||
" print(f\" test type: {type(test)}\")\n",
|
||||
"else:\n",
|
||||
" print(f\"📊 Testing on {len(test)} items...\")\n",
|
||||
" print(f\"🔍 Test data preview:\")\n",
|
||||
" for i, item in enumerate(test[:3]): # Show first 3 items\n",
|
||||
" print(f\" Item {i}: {item.title} - ${item.price}\")\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Create Tester with correct size parameter\n",
|
||||
" tester = Tester(gpt_4o_frontier, test, size=len(test))\n",
|
||||
" tester.run()\n",
|
||||
" except IndexError as e:\n",
|
||||
" print(f\"❌ IndexError in Tester.test: {e}\")\n",
|
||||
" print(f\"🔍 Test data length: {len(test)}\")\n",
|
||||
" print(\"💡 This suggests the Tester is trying to access more items than available.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Fine-tuning Implementation\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if fine_tuned_model_name:\n",
|
||||
" def gpt_fine_tuned(item):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=fine_tuned_model_name,\n",
|
||||
" messages=messages_for(item),\n",
|
||||
" seed=42,\n",
|
||||
" max_tokens=7\n",
|
||||
" )\n",
|
||||
" reply = response.choices[0].message.content\n",
|
||||
" return get_price(reply)\n",
|
||||
" \n",
|
||||
" print(\"🧪 Testing fine-tuned model...\")\n",
|
||||
" # Create Tester with correct size parameter to avoid IndexError\n",
|
||||
" tester = Tester(gpt_fine_tuned, test, size=len(test))\n",
|
||||
" tester.run()\n",
|
||||
"else:\n",
|
||||
" print(\"⏳ Fine-tuned model not ready yet. Please wait and re-run the previous cell.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Convert items to JSONL format for fine-tuning\n",
|
||||
"def make_jsonl(items):\n",
|
||||
" result = \"\"\n",
|
||||
" for item in items:\n",
|
||||
" messages = messages_with_price(item)\n",
|
||||
" messages_str = json.dumps(messages)\n",
|
||||
" result += '{\"messages\": ' + messages_str + '}\\n'\n",
|
||||
" return result.strip()\n",
|
||||
"\n",
|
||||
"def write_jsonl(items, filename):\n",
|
||||
" with open(filename, \"w\") as f:\n",
|
||||
" jsonl = make_jsonl(items)\n",
|
||||
" f.write(jsonl)\n",
|
||||
"\n",
|
||||
"# Create fine-tuning files\n",
|
||||
"write_jsonl(fine_tune_train, \"fine_tune_train.jsonl\")\n",
|
||||
"write_jsonl(fine_tune_validation, \"fine_tune_validation.jsonl\")\n",
|
||||
"\n",
|
||||
"print(\"✅ Fine-tuning files created:\")\n",
|
||||
"print(\" - fine_tune_train.jsonl\")\n",
|
||||
"print(\" - fine_tune_validation.jsonl\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload files to OpenAI\n",
|
||||
"with open(\"fine_tune_train.jsonl\", \"rb\") as f:\n",
|
||||
" train_file = openai.files.create(file=f, purpose=\"fine-tune\")\n",
|
||||
"\n",
|
||||
"with open(\"fine_tune_validation.jsonl\", \"rb\") as f:\n",
|
||||
" validation_file = openai.files.create(file=f, purpose=\"fine-tune\")\n",
|
||||
"\n",
|
||||
"print(f\"✅ Files uploaded to OpenAI:\")\n",
|
||||
"print(f\" Training file ID: {train_file.id}\")\n",
|
||||
"print(f\" Validation file ID: {validation_file.id}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create fine-tuning job\n",
|
||||
"fine_tuning_job = openai.fine_tuning.jobs.create(\n",
|
||||
" training_file=train_file.id,\n",
|
||||
" validation_file=validation_file.id,\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" seed=42,\n",
|
||||
" hyperparameters={\"n_epochs\": 1},\n",
|
||||
" integrations=[wandb_integration],\n",
|
||||
" suffix=\"pricer\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"🚀 Fine-tuning job created: {fine_tuning_job.id}\")\n",
|
||||
"print(\"⏳ This will take some time to complete...\")\n",
|
||||
"print(\"💡 You can monitor progress in the OpenAI dashboard or Weights & Biases\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# FIXED: Test enhanced model (if ready) - with correct Tester size\n",
|
||||
"try:\n",
|
||||
" enhanced_model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_v2.id).fine_tuned_model\n",
|
||||
" \n",
|
||||
" def gpt_enhanced_fine_tuned(item):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=enhanced_model_name,\n",
|
||||
" messages=messages_v2(item, with_price=False),\n",
|
||||
" seed=42,\n",
|
||||
" temperature=1.0,\n",
|
||||
" max_tokens=7\n",
|
||||
" )\n",
|
||||
" reply = response.choices[0].message.content\n",
|
||||
" return get_price(reply)\n",
|
||||
" \n",
|
||||
" print(\"🧪 Testing enhanced fine-tuned model...\")\n",
|
||||
" # Create Tester with correct size parameter to avoid IndexError\n",
|
||||
" tester = Tester(gpt_enhanced_fine_tuned, test, size=len(test))\n",
|
||||
" tester.run()\n",
|
||||
" \n",
|
||||
"except:\n",
|
||||
" print(\"⏳ Enhanced fine-tuned model not ready yet.\")\n",
|
||||
" print(\"💡 Please wait for completion and re-run this cell.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check job status\n",
|
||||
"job_id = fine_tuning_job.id\n",
|
||||
"job_status = openai.fine_tuning.jobs.retrieve(job_id)\n",
|
||||
"\n",
|
||||
"print(f\"📊 Job Status: {job_status.status}\")\n",
|
||||
"print(f\"📈 Training File: {job_status.training_file}\")\n",
|
||||
"print(f\"📈 Validation File: {job_status.validation_file}\")\n",
|
||||
"print(f\"🤖 Model: {job_status.model}\")\n",
|
||||
"\n",
|
||||
"# Get recent events\n",
|
||||
"events = openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)\n",
|
||||
"print(f\"\\n📋 Recent Events:\")\n",
|
||||
"for event in events.data:\n",
|
||||
" print(f\" {event.created_at}: {event.message}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test Fine-tuned Model\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Wait for fine-tuning to complete and get the model name\n",
|
||||
"# Note: In practice, you would wait for the job to complete\n",
|
||||
"try:\n",
|
||||
" fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n",
|
||||
" print(f\"✅ Fine-tuned model ready: {fine_tuned_model_name}\")\n",
|
||||
"except:\n",
|
||||
" print(\"⏳ Fine-tuning still in progress...\")\n",
|
||||
" print(\"💡 Please wait for completion and re-run this cell\")\n",
|
||||
" fine_tuned_model_name = None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test the fine-tuned model (if ready)\n",
|
||||
"if fine_tuned_model_name:\n",
|
||||
" def gpt_fine_tuned(item):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=fine_tuned_model_name,\n",
|
||||
" messages=messages_for(item),\n",
|
||||
" seed=42,\n",
|
||||
" max_tokens=7\n",
|
||||
" )\n",
|
||||
" reply = response.choices[0].message.content\n",
|
||||
" return get_price(reply)\n",
|
||||
" \n",
|
||||
" print(\"🧪 Testing fine-tuned model...\")\n",
|
||||
" Tester.test(gpt_fine_tuned, test)\n",
|
||||
"else:\n",
|
||||
" print(\"⏳ Fine-tuned model not ready yet. Please wait and re-run the previous cell.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Advanced Fine-tuning with Enhanced Prompts\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Enhanced prompt function (based on gold standard)\n",
|
||||
"def messages_v2(item, with_price=True):\n",
|
||||
" system_message = (\n",
|
||||
" \"Role: You are a retail price estimator.\\n\"\n",
|
||||
" \"Market: United States; Currency: USD.\\n\"\n",
|
||||
" \"Scope: Predict the most likely new retail price. Ignore taxes, shipping, coupons, bundles, used/renewed.\\n\"\n",
|
||||
" \"Output: Only a number with two decimals (e.g., 129.99). No $ sign. No words.\\n\"\n",
|
||||
" \"Think silently; do not reveal reasoning.\"\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" user_prompt = item.test_prompt().replace(\" to the nearest dollar\", \"\").replace(\"\\n\\nPrice is $\", \"\")\n",
|
||||
" \n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": str({\n",
|
||||
" \"query\": \"price_estimate\",\n",
|
||||
" \"locale\": \"en_US\",\n",
|
||||
" \"currency\": \"USD\",\n",
|
||||
" \"category\": item.category,\n",
|
||||
" \"description\": user_prompt,\n",
|
||||
" \"brand\": json.loads(item.details).get(\"Brand\", \"Unknown\") if item.details else \"Unknown\"\n",
|
||||
" })},\n",
|
||||
" {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\" if with_price else \"Price is $\"}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"print(\"✅ Enhanced prompt function created!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create enhanced fine-tuning data\n",
|
||||
"def make_jsonl_v2(items):\n",
|
||||
" result = \"\"\n",
|
||||
" for item in items:\n",
|
||||
" messages = messages_v2(item)\n",
|
||||
" messages_str = json.dumps(messages)\n",
|
||||
" result += '{\"messages\": ' + messages_str + '}\\n'\n",
|
||||
" return result.strip()\n",
|
||||
"\n",
|
||||
"def write_jsonl_v2(items, filename):\n",
|
||||
" with open(filename, \"w\") as f:\n",
|
||||
" jsonl = make_jsonl_v2(items)\n",
|
||||
" f.write(jsonl)\n",
|
||||
"\n",
|
||||
"# Create enhanced fine-tuning files\n",
|
||||
"write_jsonl_v2(fine_tune_train, \"fine_tune_train_v2.jsonl\")\n",
|
||||
"write_jsonl_v2(fine_tune_validation, \"fine_tune_validation_v2.jsonl\")\n",
|
||||
"\n",
|
||||
"print(\"✅ Enhanced fine-tuning files created:\")\n",
|
||||
"print(\" - fine_tune_train_v2.jsonl\")\n",
|
||||
"print(\" - fine_tune_validation_v2.jsonl\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload enhanced files and create second fine-tuning job\n",
|
||||
"with open(\"fine_tune_train_v2.jsonl\", \"rb\") as f:\n",
|
||||
" train_file_v2 = openai.files.create(file=f, purpose=\"fine-tune\")\n",
|
||||
"\n",
|
||||
"with open(\"fine_tune_validation_v2.jsonl\", \"rb\") as f:\n",
|
||||
" validation_file_v2 = openai.files.create(file=f, purpose=\"fine-tune\")\n",
|
||||
"\n",
|
||||
"# Create second fine-tuning job with enhanced prompts\n",
|
||||
"fine_tuning_job_v2 = openai.fine_tuning.jobs.create(\n",
|
||||
" training_file=train_file_v2.id,\n",
|
||||
" validation_file=validation_file_v2.id,\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" seed=42,\n",
|
||||
" hyperparameters={\"n_epochs\": 1},\n",
|
||||
" integrations=[wandb_integration],\n",
|
||||
" suffix=\"pricer-v2\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"🚀 Enhanced fine-tuning job created: {fine_tuning_job_v2.id}\")\n",
|
||||
"print(\"⏳ This will take some time to complete...\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Model Comparison and Results\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test enhanced model (if ready)\n",
|
||||
"try:\n",
|
||||
" enhanced_model_name = openai.fine_tuning.jobs.retrieve(fine_tuning_job_v2.id).fine_tuned_model\n",
|
||||
" \n",
|
||||
" def gpt_enhanced_fine_tuned(item):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=enhanced_model_name,\n",
|
||||
" messages=messages_v2(item, with_price=False),\n",
|
||||
" seed=42,\n",
|
||||
" temperature=1.0,\n",
|
||||
" max_tokens=7\n",
|
||||
" )\n",
|
||||
" reply = response.choices[0].message.content\n",
|
||||
" return get_price(reply)\n",
|
||||
" \n",
|
||||
" print(\"🧪 Testing enhanced fine-tuned model...\")\n",
|
||||
" Tester.test(gpt_enhanced_fine_tuned, test)\n",
|
||||
" \n",
|
||||
"except:\n",
|
||||
" print(\"⏳ Enhanced fine-tuned model not ready yet.\")\n",
|
||||
" print(\"💡 Please wait for completion and re-run this cell.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary and Next Steps\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"🎉 Week 6 Product Pricer Challenge Complete!\")\n",
|
||||
"print(\"=\" * 50)\n",
|
||||
"\n",
|
||||
"print(\"\\n📊 What We Accomplished:\")\n",
|
||||
"print(\"✅ Loaded data using pickle files (our data loading hack)\")\n",
|
||||
"print(\"✅ Established baseline with GPT-4o\")\n",
|
||||
"print(\"✅ Implemented fine-tuning with OpenAI API\")\n",
|
||||
"print(\"✅ Created enhanced prompts for better performance\")\n",
|
||||
"print(\"✅ Set up comprehensive evaluation framework\")\n",
|
||||
"\n",
|
||||
"print(\"\\n🚀 Next Steps:\")\n",
|
||||
"print(\"1. Wait for fine-tuning jobs to complete\")\n",
|
||||
"print(\"2. Compare performance of all models\")\n",
|
||||
"print(\"3. Experiment with different hyperparameters\")\n",
|
||||
"print(\"4. Try different base models (GPT-4.1, etc.)\")\n",
|
||||
"print(\"5. Implement ensemble methods\")\n",
|
||||
"\n",
|
||||
"print(\"\\n💡 Key Learnings:\")\n",
|
||||
"print(\"• Fine-tuning can significantly improve model performance\")\n",
|
||||
"print(\"• Prompt engineering is crucial for good results\")\n",
|
||||
"print(\"• Data quality and quantity matter for fine-tuning\")\n",
|
||||
"print(\"• Evaluation metrics help track progress\")\n",
|
||||
"\n",
|
||||
"print(\"\\n🎯 This implementation follows the gold standard approach\")\n",
|
||||
"print(\" while incorporating our data loading improvements!\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,149 @@
|
||||
from typing import Optional
|
||||
from transformers import AutoTokenizer
|
||||
import re
|
||||
import os
|
||||
|
||||
# Try multiple model sources in order of preference
BASE_MODEL_OPTIONS = [
    "/root/.llama/checkpoints/Llama3.1-8B",  # Local llama-stack download
    "microsoft/DialoGPT-medium",  # Accessible alternative
    "gpt2"  # Fallback
]

# Set to whichever entry in BASE_MODEL_OPTIONS successfully loads a tokenizer
# (assigned during class Item's tokenizer initialization below)
BASE_MODEL = None

MIN_TOKENS = 150  # Any less than this, and we don't have enough useful content
MAX_TOKENS = 160  # Truncate after this many tokens. Then after adding in prompt text, we will get to around 180 tokens

MIN_CHARS = 300  # Minimum character count before a datapoint is worth tokenizing
CEILING_CHARS = MAX_TOKENS * 7  # Character cap applied before tokenizing (~7 chars per token heuristic)
|
||||
|
||||
class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price
    Enhanced version with better error handling and alternative tokenizer

    The tokenizer is loaded once at class-definition time and shared by all
    instances; it may download model files the first time it runs.
    """

    # Initialize tokenizer with fallback options
    # (executes at class-definition time, in class scope)
    tokenizer = None
    for model_path in BASE_MODEL_OPTIONS:
        try:
            if model_path.startswith("/") and not os.path.exists(model_path):
                continue  # Skip local paths that don't exist
            tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            # NOTE(review): this binds BASE_MODEL as a *class* attribute; the
            # module-level BASE_MODEL stays None — confirm which one callers read
            BASE_MODEL = model_path
            print(f"✅ Successfully loaded tokenizer from: {model_path}")
            break
        except Exception as e:
            print(f"⚠️ Failed to load {model_path}: {e}")
            continue

    if tokenizer is None:
        print("❌ All tokenizer options failed. Using character-based fallback.")
        # Create a dummy tokenizer for fallback
        class DummyTokenizer:
            def encode(self, text, add_special_tokens=False):
                # Rough approximation: 1 token ≈ 4 characters
                return list(range(len(text) // 4))
            def decode(self, tokens):
                # NOTE(review): placeholder output — with this fallback active,
                # parse() stores this literal text instead of the truncated
                # product description; verify before training on the result
                return "dummy text"
        tokenizer = DummyTokenizer()
        BASE_MODEL = "fallback"

    PREFIX = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    # Boilerplate fragments stripped from the product details before tokenizing
    REMOVALS = [
        '"Batteries Included?": "No"',
        '"Batteries Included?": "Yes"',
        '"Batteries Required?": "No"',
        '"Batteries Required?": "Yes"',
        "By Manufacturer",
        "Item",
        "Date First",
        "Package",
        ":",
        "Number of",
        "Best Sellers",
        "Number",
        "Product "
    ]

    # Instance attribute declarations
    title: str
    price: float
    category: str
    token_count: int = 0
    details: Optional[str]
    prompt: Optional[str] = None
    include = False  # set True by parse() once the datapoint qualifies

    def __init__(self, data, price):
        """
        :param data: dict with 'title', 'description', 'features' and 'details' keys
        :param price: the ground-truth price for this product
        """
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Clean up the details string by removing common text that doesn't add value
        """
        details = self.details
        for remove in self.REMOVALS:
            details = details.replace(remove, "")
        return details

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace
        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers
        """
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
        words = stuff.split(' ')
        select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]
        return " ".join(select)

    def parse(self, data):
        """
        Parse this datapoint and if it fits within the allowed Token range,
        then set include to True
        """
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'
        if len(contents) > MIN_CHARS:
            # Cap by characters first so the tokenizer call stays cheap
            contents = contents[:CEILING_CHARS]
            text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) > MIN_TOKENS:
                tokens = tokens[:MAX_TOKENS]
                # Re-decode so the prompt text matches the truncated token span
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)
                self.include = True

    def make_prompt(self, text):
        """
        Set the prompt instance variable to be a prompt appropriate for training
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Return a prompt suitable for testing, with the actual price removed
        """
        return self.prompt.split(self.PREFIX)[0] + self.PREFIX

    def __repr__(self):
        """
        Return a String version of this Item
        """
        return f"<{self.title} = ${self.price}>"
|
||||
|
||||
|
||||
|
||||
BIN
week6/community-contributions/finetuning-joshua/test.pkl
Normal file
BIN
week6/community-contributions/finetuning-joshua/test.pkl
Normal file
Binary file not shown.
75
week6/community-contributions/finetuning-joshua/testing.py
Normal file
75
week6/community-contributions/finetuning-joshua/testing.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import math
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# ANSI color codes used to color-code each printed result line
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
# "orange" results render in yellow — terminals have no true orange
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
|
||||
|
||||
class Tester:
    """
    Run a pricing predictor over a slice of the dataset, print a color-coded
    line per datapoint, and finish with a scatter chart plus summary metrics
    (average absolute error, RMSLE, and green-hit rate).
    """

    def __init__(self, predictor, data, title=None, size=250):
        """
        :param predictor: callable taking a datapoint and returning a price guess
        :param data: sequence of datapoints exposing .price and .title
        :param title: chart title; defaults to the predictor's name, prettified
        :param size: number of datapoints to evaluate
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify one prediction as green/orange/red by absolute or relative error."""
        if error < 40 or error / truth < 0.2:
            return "green"
        elif error < 80 or error / truth < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Score datapoint i, record its metrics, and print a color-coded summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        # squared log error; +1 guards against log(0) for zero prices
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter model estimates against ground truth, with a y=x reference line."""
        # (removed an unused `max_error = max(self.errors)` local)
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute average error, RMSLE and hit-rate, then chart the results."""
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first self.size datapoints and report."""
        # (removed a dead `self.error = 0` assignment that nothing ever read)
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with defaults and run it."""
        cls(function, data).run()
|
||||
BIN
week6/community-contributions/finetuning-joshua/train.pkl
Normal file
BIN
week6/community-contributions/finetuning-joshua/train.pkl
Normal file
Binary file not shown.
BIN
week6/community-contributions/finetuning-joshua/validation.pkl
Normal file
BIN
week6/community-contributions/finetuning-joshua/validation.pkl
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,35 @@
|
||||
import logging
|
||||
|
||||
class Agent:
    """
    An abstract superclass for Agents
    Used to log messages in a way that can identify each Agent
    """

    # ANSI foreground colors — subclasses pick one as their identity color
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'

    # Background color
    BG_BLACK = '\033[40m'

    # Reset code to return to default color
    RESET = '\033[0m'

    # Overridden by subclasses to identify themselves in log output
    name: str = ""
    color: str = '\033[37m'

    def log(self, message):
        """
        Log this as an info message, identifying the agent

        The line is prefixed with this agent's name, rendered in its color on
        a black background, and followed by a terminal reset code.
        """
        tagged = f"[{self.name}] {message}"
        logging.info(f"{self.BG_BLACK}{self.color}{tagged}{self.RESET}")
|
||||
|
||||
|
||||
111
week8/community_contributions/ensemble-joshua/agents/deals.py
Normal file
111
week8/community_contributions/ensemble-joshua/agents/deals.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Self
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import feedparser
|
||||
from tqdm import tqdm
|
||||
import requests
|
||||
import time
|
||||
|
||||
# RSS feeds to scrape for deals — one dealnews.com category per URL
feeds = [
    "https://www.dealnews.com/c142/Electronics/?rss=1",
    "https://www.dealnews.com/c39/Computers/?rss=1",
    "https://www.dealnews.com/c238/Automotive/?rss=1",
    "https://www.dealnews.com/f1912/Smart-Home/?rss=1",
    "https://www.dealnews.com/c196/Home-Garden/?rss=1",
]
|
||||
|
||||
def extract(html_snippet: str) -> str:
    """
    Use Beautiful Soup to clean up this HTML snippet and extract useful text

    Looks for a div with class "snippet summary"; if present, returns its text
    with any residual markup stripped, otherwise returns the snippet unchanged.
    Newlines are flattened to spaces either way.
    """
    soup = BeautifulSoup(html_snippet, 'html.parser')
    snippet_div = soup.find('div', class_='snippet summary')

    if snippet_div:
        description = snippet_div.get_text(strip=True)
        # Second parse plus regex sweep catch any tags that survived get_text
        description = BeautifulSoup(description, 'html.parser').get_text()
        description = re.sub('<[^<]+?>', '', description)
        result = description.strip()
    else:
        result = html_snippet
    return result.replace('\n', ' ')
|
||||
|
||||
class ScrapedDeal:
    """
    A class to represent a Deal retrieved from an RSS feed
    """
    category: str  # NOTE(review): never assigned in __init__ — confirm callers set it
    title: str     # deal headline from the feed entry
    summary: str   # cleaned-up text of the feed's summary snippet
    url: str       # link to the full deal page
    details: str   # deal page text before the "Features" marker
    features: str  # deal page text after "Features" ("" when absent)

    def __init__(self, entry: Dict[str, str]):
        """
        Populate this instance based on the provided dict

        Note: fetches the deal page synchronously, so constructing a
        ScrapedDeal performs network I/O.
        """
        self.title = entry['title']
        self.summary = extract(entry['summary'])
        self.url = entry['links'][0]['href']
        stuff = requests.get(self.url).content
        soup = BeautifulSoup(stuff, 'html.parser')
        # assumes the deal page always contains a div.content-section — TODO confirm
        content = soup.find('div', class_='content-section').get_text()
        content = content.replace('\nmore', '').replace('\n', ' ')
        if "Features" in content:
            # Split only on the FIRST occurrence: the word "Features" can appear
            # again inside the text, and a bare split() would then produce more
            # than two parts and raise ValueError on unpacking
            self.details, self.features = content.split("Features", 1)
        else:
            self.details = content
            self.features = ""

    def __repr__(self):
        """
        Return a string to describe this deal
        """
        return f"<{self.title}>"

    def describe(self):
        """
        Return a longer string to describe this deal for use in calling a model
        """
        return f"Title: {self.title}\nDetails: {self.details.strip()}\nFeatures: {self.features.strip()}\nURL: {self.url}"

    @classmethod
    def fetch(cls, show_progress : bool = False) -> List[Self]:
        """
        Retrieve all deals from the selected RSS feeds
        :param show_progress: wrap the feed loop in a tqdm progress bar
        """
        deals = []
        feed_iter = tqdm(feeds) if show_progress else feeds
        for feed_url in feed_iter:
            feed = feedparser.parse(feed_url)
            for entry in feed.entries[:10]:
                deals.append(cls(entry))
                time.sleep(0.5)  # be polite to the server between page fetches
        return deals
|
||||
|
||||
class Deal(BaseModel):
    """
    A class to Represent a Deal with a summary description
    """
    product_description: str  # model-written summary of the product itself
    price: float  # the deal price in dollars
    url: str  # link to the deal page


class DealSelection(BaseModel):
    """
    A class to Represent a list of Deals
    """
    deals: List[Deal]  # the deals chosen by the scanner's structured output


class Opportunity(BaseModel):
    """
    A class to represent a possible opportunity: a Deal where we estimate
    it should cost more than it's being offered
    """
    deal: Deal  # the underlying deal
    estimate: float  # our models' estimated fair price
    discount: float  # estimate minus the deal's asking price
|
||||
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import joblib
|
||||
import os
|
||||
|
||||
from agents.agent import Agent
|
||||
from agents.specialist_agent import SpecialistAgent
|
||||
from agents.frontier_agent import FrontierAgent
|
||||
from agents.random_forest_agent import RandomForestAgent
|
||||
|
||||
class EnsembleAgent(Agent):
    """
    Blends the Specialist, Frontier and Random Forest agents' price estimates
    using a pre-trained Linear Regression model.
    """

    name = "Ensemble Agent"
    color = Agent.YELLOW

    def __init__(self, collection):
        """
        Create an instance of Ensemble, by creating each of the models
        And loading the weights of the Ensemble
        :param collection: the Chroma collection handed to the FrontierAgent for RAG
        """
        self.log("Initializing Ensemble Agent")
        self.specialist = SpecialistAgent()
        self.frontier = FrontierAgent(collection)
        self.random_forest = RandomForestAgent()
        # Resolve model path: prefer local contribution folder copy, fallback to week8 root
        candidate_paths = [
            os.path.join(os.path.dirname(os.path.dirname(__file__)), 'ensemble_model.pkl'),  # ../../ensemble_model.pkl
            os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'ensemble_model.pkl'),  # ../../../ensemble_model.pkl (week8 root)
            'ensemble_model.pkl',
        ]
        # If no candidate exists, fall back to the bare filename and let joblib raise
        model_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[-1])
        self.model = joblib.load(model_path)
        self.log("Ensemble Agent is ready")

    def price(self, description: str) -> float:
        """
        Run this ensemble model
        Ask each of the models to price the product
        Then use the Linear Regression model to return the weighted price
        :param description: the description of a product
        :return: an estimate of its price
        """
        self.log("Running Ensemble Agent - collaborating with specialist, frontier and random forest agents")
        specialist = self.specialist.price(description)
        frontier = self.frontier.price(description)
        random_forest = self.random_forest.price(description)
        # Column names must match the features the regression was trained on
        X = pd.DataFrame({
            'Specialist': [specialist],
            'Frontier': [frontier],
            'RandomForest': [random_forest],
            'Min': [min(specialist, frontier, random_forest)],
            'Max': [max(specialist, frontier, random_forest)],
        })
        # Clamp at zero — a linear blend can dip negative for cheap items
        y = max(0, self.model.predict(X)[0])
        self.log(f"Ensemble Agent complete - returning ${y:.2f}")
        return y
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
import os
import http.client
import urllib
import urllib.parse  # required: "import urllib" alone does not guarantee urllib.parse is loaded

# from twilio.rest import Client
from agents.deals import Opportunity
from agents.agent import Agent
|
||||
|
||||
# Uncomment the Twilio lines if you wish to use Twilio
|
||||
|
||||
DO_TEXT = False
|
||||
DO_PUSH = True
|
||||
|
||||
class MessagingAgent(Agent):
    """
    Sends alerts about deal opportunities — push notifications via Pushover
    and/or SMS via Twilio, controlled by the DO_PUSH / DO_TEXT module flags.
    """

    name = "Messaging Agent"
    color = Agent.WHITE

    def __init__(self):
        """
        Set up this object to either do push notifications via Pushover,
        or SMS via Twilio,
        whichever is specified in the constants
        """
        self.log(f"Messaging Agent is initializing")
        if DO_TEXT:
            account_sid = os.getenv('TWILIO_ACCOUNT_SID', 'your-sid-if-not-using-env')
            auth_token = os.getenv('TWILIO_AUTH_TOKEN', 'your-auth-if-not-using-env')
            self.me_from = os.getenv('TWILIO_FROM', 'your-phone-number-if-not-using-env')
            self.me_to = os.getenv('MY_PHONE_NUMBER', 'your-phone-number-if-not-using-env')
            # self.client = Client(account_sid, auth_token)
            # NOTE(review): the Client line above is commented out, so message()
            # will raise AttributeError if DO_TEXT is enabled — uncomment the
            # Twilio import and this line before turning SMS on.
            self.log("Messaging Agent has initialized Twilio")
        if DO_PUSH:
            self.pushover_user = os.getenv('PUSHOVER_USER', 'your-pushover-user-if-not-using-env')
            self.pushover_token = os.getenv('PUSHOVER_TOKEN', 'your-pushover-user-if-not-using-env')
            self.log("Messaging Agent has initialized Pushover")

    def message(self, text):
        """
        Send an SMS message using the Twilio API
        """
        self.log("Messaging Agent is sending a text message")
        message = self.client.messages.create(
            from_=self.me_from,
            body=text,
            to=self.me_to
        )

    def push(self, text):
        """
        Send a Push Notification using the Pushover API
        """
        self.log("Messaging Agent is sending a push notification")
        conn = http.client.HTTPSConnection("api.pushover.net:443")
        conn.request("POST", "/1/messages.json",
            urllib.parse.urlencode({
                "token": self.pushover_token,
                "user": self.pushover_user,
                "message": text,
                "sound": "cashregister"
            }), { "Content-type": "application/x-www-form-urlencoded" })
        conn.getresponse()

    def alert(self, opportunity: Opportunity):
        """
        Make an alert about the specified Opportunity

        Builds a short text with price, estimate, discount and a truncated
        description, then dispatches it over whichever channels are enabled.
        """
        text = f"Deal Alert! Price=${opportunity.deal.price:.2f}, "
        text += f"Estimate=${opportunity.estimate:.2f}, "
        text += f"Discount=${opportunity.discount:.2f} :"
        text += opportunity.deal.product_description[:10]+'... '
        text += opportunity.deal.url
        if DO_TEXT:
            self.message(text)
        if DO_PUSH:
            self.push(text)
        self.log("Messaging Agent has completed")
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
from typing import Optional, List
|
||||
from agents.agent import Agent
|
||||
from agents.deals import ScrapedDeal, DealSelection, Deal, Opportunity
|
||||
from agents.scanner_agent import ScannerAgent
|
||||
from agents.ensemble_agent import EnsembleAgent
|
||||
from agents.messaging_agent import MessagingAgent
|
||||
|
||||
|
||||
class PlanningAgent(Agent):
    """
    Coordinates the Scanner, Ensemble and Messaging agents: find deals,
    price them, and alert on any whose discount clears DEAL_THRESHOLD.
    """

    name = "Planning Agent"
    color = Agent.GREEN
    DEAL_THRESHOLD = 50  # minimum discount (in dollars) before we alert

    def __init__(self, collection):
        """
        Create instances of the 3 Agents that this planner coordinates across
        :param collection: the Chroma collection passed through to the EnsembleAgent
        """
        self.log("Planning Agent is initializing")
        self.scanner = ScannerAgent()
        self.ensemble = EnsembleAgent(collection)
        self.messenger = MessagingAgent()
        self.log("Planning Agent is ready")

    def run(self, deal: Deal) -> Opportunity:
        """
        Run the workflow for a particular deal
        :param deal: the deal, summarized from an RSS scrape
        :returns: an opportunity including the discount
        """
        self.log("Planning Agent is pricing up a potential deal")
        estimate = self.ensemble.price(deal.product_description)
        discount = estimate - deal.price
        self.log(f"Planning Agent has processed a deal with discount ${discount:.2f}")
        return Opportunity(deal=deal, estimate=estimate, discount=discount)

    def plan(self, memory: Optional[List[str]] = None) -> Optional[Opportunity]:
        """
        Run the full workflow:
        1. Use the ScannerAgent to find deals from RSS feeds
        2. Use the EnsembleAgent to estimate them
        3. Use the MessagingAgent to send a notification of deals
        :param memory: a list of URLs that have been surfaced in the past
        :return: an Opportunity if one was surfaced, otherwise None
        """
        # Fix: avoid the mutable-default-argument pitfall (was `memory=[]`,
        # shared across calls); None now means "no memory".
        if memory is None:
            memory = []
        self.log("Planning Agent is kicking off a run")
        selection = self.scanner.scan(memory=memory)
        if selection:
            opportunities = [self.run(deal) for deal in selection.deals[:5]]
            opportunities.sort(key=lambda opp: opp.discount, reverse=True)
            best = opportunities[0]
            self.log(f"Planning Agent has identified the best deal has discount ${best.discount:.2f}")
            if best.discount > self.DEAL_THRESHOLD:
                self.messenger.alert(best)
            self.log("Planning Agent has completed a run")
            return best if best.discount > self.DEAL_THRESHOLD else None
        return None
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
# imports
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import List
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import joblib
|
||||
import os
|
||||
from agents.agent import Agent
|
||||
|
||||
|
||||
|
||||
class RandomForestAgent(Agent):
    """
    Prices a product with a pre-trained Random Forest over sentence embeddings.
    """

    name = "Random Forest Agent"
    color = Agent.MAGENTA

    def __init__(self):
        """
        Initialize this object by loading in the saved model weights
        and the SentenceTransformer vector encoding model
        """
        self.log("Random Forest Agent is initializing")
        self.vectorizer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # Resolve model path: prefer local contribution folder copy, fallback to week8 root
        candidate_paths = [
            os.path.join(os.path.dirname(os.path.dirname(__file__)), 'random_forest_model.pkl'),  # ../../random_forest_model.pkl
            os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'random_forest_model.pkl'),  # ../../../random_forest_model.pkl (week8 root)
            'random_forest_model.pkl',
        ]
        # If no candidate exists, fall back to the bare filename and let joblib raise
        model_path = next((p for p in candidate_paths if os.path.exists(p)), candidate_paths[-1])
        self.model = joblib.load(model_path)
        self.log("Random Forest Agent is ready")

    def price(self, description: str) -> float:
        """
        Use a Random Forest model to estimate the price of the described item
        :param description: the product to be estimated
        :return: the price as a float
        """
        self.log("Random Forest Agent is starting a prediction")
        vector = self.vectorizer.encode([description])
        # Clamp at zero — regression output can dip negative for cheap items
        result = max(0, self.model.predict(vector)[0])
        self.log(f"Random Forest Agent completed - predicting ${result:.2f}")
        return result
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
import os
|
||||
import json
|
||||
from typing import Optional, List
|
||||
from openai import OpenAI
|
||||
from agents.deals import ScrapedDeal, DealSelection
|
||||
from agents.agent import Agent
|
||||
|
||||
|
||||
class ScannerAgent(Agent):
    """
    Finds promising deals by scraping RSS feeds, then asking OpenAI (with
    Structured Outputs) to select the 5 with the best descriptions and prices.
    """

    MODEL = "gpt-4o-mini"

    SYSTEM_PROMPT = """You identify and summarize the 5 most detailed deals from a list, by selecting deals that have the most detailed, high quality description and the most clear price.
    Respond strictly in JSON with no explanation, using this format. You should provide the price as a number derived from the description. If the price of a deal isn't clear, do not include that deal in your response.
    Most important is that you respond with the 5 deals that have the most detailed product description with price. It's not important to mention the terms of the deal; most important is a thorough description of the product.
    Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.

    {"deals": [
        {
            "product_description": "Your clearly expressed summary of the product in 4-5 sentences. Details of the item are much more important than why it's a good deal. Avoid mentioning discounts and coupons; focus on the item itself. There should be a paragpraph of text for each item you choose.",
            "price": 99.99,
            "url": "the url as provided"
        },
        ...
    ]}"""

    USER_PROMPT_PREFIX = """Respond with the most promising 5 deals from this list, selecting those which have the most detailed, high quality product description and a clear price that is greater than 0.
    Respond strictly in JSON, and only JSON. You should rephrase the description to be a summary of the product itself, not the terms of the deal.
    Remember to respond with a paragraph of text in the product_description field for each of the 5 items that you select.
    Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.

    Deals:

    """

    USER_PROMPT_SUFFIX = "\n\nStrictly respond in JSON and include exactly 5 deals, no more."

    name = "Scanner Agent"
    color = Agent.CYAN

    def __init__(self):
        """
        Set up this instance by initializing OpenAI
        """
        self.log("Scanner Agent is initializing")
        self.openai = OpenAI()
        self.log("Scanner Agent is ready")

    def fetch_deals(self, memory) -> List[ScrapedDeal]:
        """
        Look up deals published on RSS feeds
        Return any new deals that are not already in the memory provided
        :param memory: previously surfaced opportunities; each element must
                       expose a .deal.url attribute
        """
        self.log("Scanner Agent is about to fetch deals from RSS feed")
        urls = [opp.deal.url for opp in memory]
        scraped = ScrapedDeal.fetch()
        result = [scrape for scrape in scraped if scrape.url not in urls]
        self.log(f"Scanner Agent received {len(result)} deals not already scraped")
        return result

    def make_user_prompt(self, scraped) -> str:
        """
        Create a user prompt for OpenAI based on the scraped deals provided
        """
        user_prompt = self.USER_PROMPT_PREFIX
        user_prompt += '\n\n'.join([scrape.describe() for scrape in scraped])
        user_prompt += self.USER_PROMPT_SUFFIX
        return user_prompt

    def scan(self, memory: Optional[List] = None) -> Optional[DealSelection]:
        """
        Call OpenAI to provide a high potential list of deals with good descriptions and prices
        Use StructuredOutputs to ensure it conforms to our specifications
        :param memory: opportunities already raised (elements are objects with
                       .deal.url — the old List[str] hint was misleading)
        :return: a selection of good deals, or None if there aren't any
        """
        # Fix: avoid the mutable-default-argument pitfall (was `memory=[]`)
        if memory is None:
            memory = []
        scraped = self.fetch_deals(memory)
        if scraped:
            user_prompt = self.make_user_prompt(scraped)
            self.log("Scanner Agent is calling OpenAI using Structured Output")
            result = self.openai.beta.chat.completions.parse(
                model=self.MODEL,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                response_format=DealSelection
            )
            result = result.choices[0].message.parsed
            # Drop anything with a non-positive price despite the prompt's instructions
            result.deals = [deal for deal in result.deals if deal.price > 0]
            self.log(f"Scanner Agent received {len(result.deals)} selected deals with price>0 from OpenAI")
            return result
        return None
|
||||
|
||||
|
||||
75
week8/community_contributions/ensemble-joshua/api.py
Normal file
75
week8/community_contributions/ensemble-joshua/api.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
import os
|
||||
import chromadb
|
||||
|
||||
from agents.specialist_agent import SpecialistAgent
|
||||
from agents.frontier_agent import FrontierAgent
|
||||
from agents.random_forest_agent import RandomForestAgent
|
||||
from agents.ensemble_agent import EnsembleAgent
|
||||
from deal_agent_framework import DealAgentFramework
|
||||
|
||||
|
||||
class PriceRequest(BaseModel):
    """Request body for the /price/* endpoints."""
    description: str  # free-text product description to price


class DealScanResponse(BaseModel):
    """Response shape for /deals/scan."""
    # NOTE(review): declared but not used as a response_model on any route — confirm intent
    opportunities: list


# Shared Chroma vector store of products, used by the RAG-based agents
DB_PATH = os.path.join(os.path.dirname(__file__), "../../products_vectorstore")
client = chromadb.PersistentClient(path=DB_PATH)
collection = client.get_or_create_collection("products")

app = FastAPI(title="Week8 Pricer API", version="1.0.0")
|
||||
|
||||
|
||||
@app.get("/healthz")
def healthz():
    """Liveness probe — trivial payload so orchestrators can check health."""
    return {"ok": True}


@app.post("/price/specialist")
def price_specialist(body: PriceRequest):
    """
    Price a product with the fine-tuned Specialist model.

    NOTE(review): a new agent is constructed per request, re-loading its
    state each call — consider caching agents at module scope.
    """
    if not body.description:
        raise HTTPException(400, "description is required")
    agent = SpecialistAgent()
    price = float(agent.price(body.description))
    return {"price": price, "agent": "specialist"}


@app.post("/price/frontier")
def price_frontier(body: PriceRequest):
    """Price a product with the RAG-based Frontier agent (constructed per request)."""
    if not body.description:
        raise HTTPException(400, "description is required")
    agent = FrontierAgent(collection)
    price = float(agent.price(body.description))
    return {"price": price, "agent": "frontier"}


@app.post("/price/random_forest")
def price_random_forest(body: PriceRequest):
    """Price a product with the Random Forest model (constructed per request)."""
    if not body.description:
        raise HTTPException(400, "description is required")
    agent = RandomForestAgent()
    price = float(agent.price(body.description))
    return {"price": price, "agent": "random_forest"}


@app.post("/price/ensemble")
def price_ensemble(body: PriceRequest):
    """Price a product by blending all agents via the Ensemble model (constructed per request)."""
    if not body.description:
        raise HTTPException(400, "description is required")
    agent = EnsembleAgent(collection)
    price = float(agent.price(body.description))
    return {"price": price, "agent": "ensemble"}


@app.post("/deals/scan")
def deals_scan():
    """
    Run the full deal-scanning workflow and return surfaced opportunities.

    NOTE(review): .dict() is the pydantic v1 API (deprecated in v2 in favor
    of .model_dump()) — confirm which pydantic version is pinned.
    """
    framework = DealAgentFramework()
    opportunities = framework.run()
    return {"count": len(opportunities), "opportunities": [o.dict() for o in opportunities]}
|
||||
|
||||
|
||||
BIN
week8/community_contributions/ensemble-joshua/ensemble_model.pkl
Normal file
BIN
week8/community_contributions/ensemble-joshua/ensemble_model.pkl
Normal file
Binary file not shown.
150
week8/community_contributions/ensemble-joshua/frontier_agent.py
Normal file
150
week8/community_contributions/ensemble-joshua/frontier_agent.py
Normal file
@@ -0,0 +1,150 @@
|
||||
# imports
|
||||
|
||||
import os
|
||||
import re
|
||||
import math
|
||||
import json
|
||||
from typing import List, Dict
|
||||
from openai import OpenAI
|
||||
# Older openai SDKs don't export APIStatusError; fall back to Exception so
# downstream except-clauses still work. (The original was missing the
# `except ImportError:` line, which is a SyntaxError as written.)
try:
    from openai import APIStatusError
except ImportError:
    APIStatusError = Exception
|
||||
import statistics
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from datasets import load_dataset
|
||||
import chromadb
|
||||
from items import Item
|
||||
from testing import Tester
|
||||
from agents.agent import Agent
|
||||
|
||||
|
||||
class FrontierAgent(Agent):
|
||||
|
||||
name = "Frontier Agent"
|
||||
color = Agent.BLUE
|
||||
|
||||
MODEL = "gpt-4o-mini"
|
||||
|
||||
    def __init__(self, collection):
        """
        Set up this instance by connecting to OpenAI or DeepSeek, to the Chroma Datastore,
        And setting up the vector encoding model
        :param collection: a Chroma collection of product descriptions with prices
        """
        self.log("Initializing Frontier Agent")
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        if deepseek_api_key:
            # Prefer DeepSeek when a key is configured (OpenAI-compatible API)
            self.client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")
            self.MODEL = "deepseek-chat"
            self.log("Frontier Agent is set up with DeepSeek")
        else:
            self.client = OpenAI()
            self.MODEL = "gpt-4o-mini"
            self.log("Frontier Agent is setting up with OpenAI")
        self.collection = collection
        # Sentence encoder used for RAG similarity search against the collection
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.log("Frontier Agent is ready")
|
||||
|
||||
def make_context(self, similars: List[str], prices: List[float]) -> str:
|
||||
"""
|
||||
Create context that can be inserted into the prompt
|
||||
:param similars: similar products to the one being estimated
|
||||
:param prices: prices of the similar products
|
||||
:return: text to insert in the prompt that provides context
|
||||
"""
|
||||
message = "To provide some context, here are some other items that might be similar to the item you need to estimate.\n\n"
|
||||
for similar, price in zip(similars, prices):
|
||||
message += f"Potentially related product:\n{similar}\nPrice is ${price:.2f}\n\n"
|
||||
return message
|
||||
|
||||
def messages_for(self, description: str, similars: List[str], prices: List[float]) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Create the message list to be included in a call to OpenAI
|
||||
With the system and user prompt
|
||||
:param description: a description of the product
|
||||
:param similars: similar products to this one
|
||||
:param prices: prices of similar products
|
||||
:return: the list of messages in the format expected by OpenAI
|
||||
"""
|
||||
system_message = "You estimate prices of items. Reply only with the price, no explanation"
|
||||
user_prompt = self.make_context(similars, prices)
|
||||
user_prompt += "And now the question for you:\n\n"
|
||||
user_prompt += "How much does this cost?\n\n" + description
|
||||
return [
|
||||
{"role": "system", "content": system_message},
|
||||
{"role": "user", "content": user_prompt},
|
||||
{"role": "assistant", "content": "Price is $"}
|
||||
]
|
||||
|
||||
def find_similars(self, description: str):
|
||||
"""
|
||||
Return a list of items similar to the given one by looking in the Chroma datastore
|
||||
"""
|
||||
self.log("Frontier Agent is performing a RAG search of the Chroma datastore to find 5 similar products")
|
||||
vector = self.model.encode([description])
|
||||
results = self.collection.query(query_embeddings=vector.astype(float).tolist(), n_results=5)
|
||||
documents = results['documents'][0][:]
|
||||
prices = [m['price'] for m in results['metadatas'][0][:]]
|
||||
self.log("Frontier Agent has found similar products")
|
||||
return documents, prices
|
||||
|
||||
def get_price(self, s) -> float:
|
||||
"""
|
||||
A utility that plucks a floating point number out of a string
|
||||
"""
|
||||
s = s.replace('$','').replace(',','')
|
||||
match = re.search(r"[-+]?\d*\.\d+|\d+", s)
|
||||
return float(match.group()) if match else 0.0
|
||||
|
||||
def price(self, description: str) -> float:
|
||||
"""
|
||||
Make a call to OpenAI or DeepSeek to estimate the price of the described product,
|
||||
by looking up 5 similar products and including them in the prompt to give context
|
||||
:param description: a description of the product
|
||||
:return: an estimate of the price
|
||||
"""
|
||||
documents, prices = self.find_similars(description)
|
||||
|
||||
# If external calls are disabled, or similar pricing is empty, use heuristic
|
||||
allow_external = os.getenv("FRONTIER_ALLOW_EXTERNAL", "true").lower() in {"1", "true", "yes"}
|
||||
|
||||
def heuristic_price() -> float:
|
||||
if prices:
|
||||
# Robust central tendency fallback
|
||||
try:
|
||||
return float(statistics.median(prices))
|
||||
except Exception:
|
||||
return float(sum(prices) / max(len(prices), 1))
|
||||
# As a last resort, return 0.0
|
||||
return 0.0
|
||||
|
||||
if not allow_external:
|
||||
self.log("External LLM calls disabled via FRONTIER_ALLOW_EXTERNAL; using heuristic fallback")
|
||||
result = heuristic_price()
|
||||
self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
|
||||
return result
|
||||
|
||||
self.log(f"Frontier Agent is about to call {self.MODEL} with context including 5 similar products")
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.MODEL,
|
||||
messages=self.messages_for(description, documents, prices),
|
||||
seed=42,
|
||||
max_tokens=5,
|
||||
)
|
||||
reply = response.choices[0].message.content
|
||||
result = self.get_price(reply)
|
||||
self.log(f"Frontier Agent completed - predicting ${result:.2f}")
|
||||
return result
|
||||
except APIStatusError as e: # Insufficient balance or other HTTP errors
|
||||
msg = getattr(e, "message", str(e))
|
||||
self.log(f"Frontier Agent API error: {msg}. Falling back to heuristic price.")
|
||||
result = heuristic_price()
|
||||
self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
|
||||
return result
|
||||
except Exception as e:
|
||||
self.log(f"Frontier Agent unexpected error: {e}. Falling back to heuristic price.")
|
||||
result = heuristic_price()
|
||||
self.log(f"Frontier Agent (fallback) - predicting ${result:.2f}")
|
||||
return result
|
||||
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
import modal
from modal import App, Volume, Image

# Modal application hosting the fine-tuned product pricer.
app = modal.App("pricer-service")
# Container image with the HF / PyTorch stack needed to load the model.
image = Image.debian_slim().pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")

# Hugging Face token injected from the "hf-secret" Modal secret.
secrets = [modal.Secret.from_name("hf-secret")]

# Constants
GPU = "T4"
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "ed-donner"
RUN_NAME = "2024-09-13_13.04.39"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
# Pinned adapter commit on the Hub so results are reproducible.
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
CACHE_DIR = "/cache"

# Scale to zero when idle (cold starts accepted in exchange for cost).
MIN_CONTAINERS = 0

# Prompt fragments must match the fine-tuning format exactly.
QUESTION = "How much does this cost to the nearest dollar?"
PREFIX = "Price is $"

# Persistent volume so downloaded weights survive container restarts.
hf_cache_volume = Volume.from_name("hf-hub-cache", create_if_missing=True)
|
||||
|
||||
@app.cls(
    image=image.env({"HF_HUB_CACHE": CACHE_DIR}),
    secrets=secrets,
    gpu=GPU,
    timeout=1800,
    min_containers=MIN_CONTAINERS,
    volumes={CACHE_DIR: hf_cache_volume}
)
class Pricer:
    """Modal service class that predicts product prices with the fine-tuned model."""

    @modal.enter()
    def setup(self):
        """Load the quantized base model and the fine-tuned PEFT adapter once per container."""
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        from peft import PeftModel

        # Quant Config: 4-bit NF4 keeps the 8B model within a T4's memory budget.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        self.base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            quantization_config=quant_config,
            device_map="auto"
        )
        # Apply the LoRA adapter at the pinned revision on top of the base model.
        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_MODEL, revision=REVISION)

    @modal.method()
    def price(self, description: str) -> float:
        """
        Generate a price estimate for the described product.

        :param description: free-text product description
        :return: estimated price, or 0 when no number can be parsed
        """
        # Only import what this method actually uses (the original also pulled
        # in os / AutoTokenizer / AutoModelForCausalLM / BitsAndBytesConfig /
        # PeftModel here without using them).
        import re
        import torch
        from transformers import set_seed

        set_seed(42)  # deterministic generation across calls
        prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        attention_mask = torch.ones(inputs.shape, device="cuda")
        outputs = self.fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
        result = self.tokenizer.decode(outputs[0])

        # BUGFIX: the original indexed split(...)[1] unconditionally, raising
        # IndexError whenever the decoded text did not contain the prefix.
        # Also use the PREFIX constant instead of repeating the literal.
        parts = result.split(PREFIX)
        if len(parts) < 2:
            return 0
        contents = parts[1].replace(',', '')
        match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
        return float(match.group()) if match else 0
|
||||
|
||||
|
||||
# Simple HTTP endpoint so external apps can call this on Modal
@app.function(image=image, secrets=secrets, gpu=GPU, timeout=1800)
@modal.web_endpoint(method="POST")
def price_http(body: dict):
    """HTTP endpoint: {"description": str} -> {"price": float}"""
    # Guard clause: an absent or blank description is a client error.
    description = body.get("description", '').strip()
    if description:
        # Delegate the actual inference to the Pricer class running remotely.
        estimate = Pricer().price.remote(description)
        return {"price": float(estimate)}
    return {"error": "Missing 'description'"}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user