{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "db8736a7-ed94-441c-9556-831fa57b5a10",
   "metadata": {},
   "source": [
    "# The Product Pricer\n",
    "\n",
    "A model that estimates how much something costs from its description.\n",
    "\n",
    "## Fine-tuning a model!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import re\n",
    "import math\n",
    "import json\n",
    "import random\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pickle\n",
    "from collections import Counter\n",
    "from openai import OpenAI\n",
    "from anthropic import Anthropic\n",
    "\n",
    "from items import Item\n",
    "from testing import Tester"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d05bdc-0155-4c72-a7ee-aa4e614ffd3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# environment\n",
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
    "\n",
    "hf_token = os.environ['HF_TOKEN']\n",
    "login(hf_token, add_to_git_credential=True)\n",
    "\n",
    "openai = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "5c9b05f4-c9eb-462c-8d86-de9140a2d985",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's avoid curating all our data again! Load in the pickle files:\n",
    "\n",
    "with open('train.pkl', 'rb') as file:\n",
    "    train = pickle.load(file)\n",
    "\n",
    "with open('test.pkl', 'rb') as file:\n",
    "    test = pickle.load(file)\n",
    "\n",
    "# OpenAI recommends fine-tuning with populations of 50-100 examples\n",
    "# But as our examples are very small, I'm suggesting we go with 2,000 training examples (and 200 for validation)\n",
    "\n",
    "fine_tune_train = train[:2000]\n",
    "fine_tune_validation = train[2000:2200]"
   ]
  },
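  {
   "cell_type": "code",
   "execution_count": null,
   "id": "data-sanity-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A quick, optional sanity check before launching a paid fine-tuning job:\n",
    "# confirm the splits are the sizes we expect and peek at one training example.\n",
    "# This is just a sketch; it relies only on the price attribute and test_prompt() method used elsewhere in this notebook.\n",
    "\n",
    "print(f\"Train: {len(train):,} items, Test: {len(test):,} items\")\n",
    "print(f\"Fine-tune train: {len(fine_tune_train)}, validation: {len(fine_tune_validation)}\")\n",
    "print(f\"Example price: ${fine_tune_train[0].price:.2f}\")\n",
    "print(fine_tune_train[0].test_prompt()[:200])"
   ]
  },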
  {
   "cell_type": "markdown",
   "id": "8be4a889-81c3-42b1-a2fc-034cdc7321a6",
   "metadata": {},
   "source": [
    "# Step 1\n",
    "\n",
    "Prepare our data for fine-tuning in JSONL (JSON Lines) format and upload to OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae2fb3c-1cff-4ce3-911e-627c970edd7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First let's work on a good prompt for a Frontier model\n",
    "\n",
    "def messages_for(item):\n",
    "    system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
    "    user_prompt = item.test_prompt().replace(\" to the nearest dollar\",\"\").replace(\"\\n\\nPrice is $\",\"\")\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": system_message},\n",
    "        {\"role\": \"user\", \"content\": user_prompt},\n",
    "        {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\"}\n",
    "    ]\n",
    "\n",
    "messages_for(train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0e5b56c-8a0b-4d8e-a112-ce87efb4e152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# {\"messages\" : [{\"role\": \"system\", \"content\": \"You estimate prices...\n",
    "\n",
    "def make_jsonl(items):\n",
    "    result = \"\"\n",
    "    for item in items:\n",
    "        messages = messages_for(item)\n",
    "        messages_str = json.dumps(messages)\n",
    "        result += '{\"messages\": ' + messages_str + '}\\n'\n",
    "    return result.strip()\n",
    "\n",
    "print(make_jsonl(train[:3]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "7734bff0-95c4-4e67-a87e-7e2254e2c67d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the items into jsonl and write them to a file\n",
    "\n",
    "def write_jsonl(items, filename):\n",
    "    with open(filename, \"w\") as f:\n",
    "        jsonl = make_jsonl(items)\n",
    "        f.write(jsonl)\n",
    "\n",
    "write_jsonl(fine_tune_train, \"fine_tune_train.jsonl\")\n",
    "write_jsonl(fine_tune_validation, \"fine_tune_validation.jsonl\")"
   ]
  },
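  {
   "cell_type": "code",
   "execution_count": null,
   "id": "jsonl-sanity-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another optional sanity check, just a sketch: every line we wrote should parse as JSON\n",
    "# with a \"messages\" list, which is the shape OpenAI's chat fine-tuning format expects.\n",
    "\n",
    "for filename in [\"fine_tune_train.jsonl\", \"fine_tune_validation.jsonl\"]:\n",
    "    with open(filename, \"r\") as f:\n",
    "        lines = f.read().splitlines()\n",
    "    assert all(\"messages\" in json.loads(line) for line in lines)\n",
    "    print(f\"{filename}: {len(lines)} valid records\")"
   ]
  },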
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d59ad8d2-c61a-448e-b7ed-232f1606970f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the training and validation files to OpenAI\n",
    "\n",
    "with open(\"fine_tune_train.jsonl\", \"rb\") as f:\n",
    "    train_file = openai.files.create(file=f, purpose=\"fine-tune\")\n",
    "\n",
    "with open(\"fine_tune_validation.jsonl\", \"rb\") as f:\n",
    "    validation_file = openai.files.create(file=f, purpose=\"fine-tune\")\n",
    "\n",
    "print(train_file)\n",
    "print(validation_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "466052b9-9fb9-48f6-8cf9-c74e6ddc1394",
   "metadata": {},
   "source": [
    "# Step 2\n",
    "\n",
    "## And now time to Fine-tune!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45421b86-5531-4e42-ab19-d6abbb8f4c13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kick off the fine-tuning job with our uploaded files\n",
    "\n",
    "openai.fine_tuning.jobs.create(\n",
    "    training_file=train_file.id,\n",
    "    validation_file=validation_file.id,\n",
    "    model=\"gpt-4o-mini-2024-07-18\",\n",
    "    seed=42,\n",
    "    hyperparameters={\n",
    "        \"n_epochs\": 6,\n",
    "        \"batch_size\": 32,\n",
    "        \"learning_rate_multiplier\": 0.8\n",
    "    },\n",
    "    suffix=\"ft-accuracy\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aeb9de2e-542c-4e83-81c7-b6745133e48b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the job we just launched, then check its status and most recent events\n",
    "\n",
    "job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id\n",
    "\n",
    "print(openai.fine_tuning.jobs.retrieve(job_id).status)\n",
    "\n",
    "openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data"
   ]
  },
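  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fine-tune-status-poll",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optionally, rather than re-running the cell above by hand, poll the job until it finishes.\n",
    "# This is just a sketch: it checks the status every 60 seconds and stops on a terminal state\n",
    "# (succeeded, failed or cancelled). Fine-tuning jobs can take a while to complete.\n",
    "\n",
    "import time\n",
    "\n",
    "while True:\n",
    "    job = openai.fine_tuning.jobs.retrieve(job_id)\n",
    "    print(f\"Status: {job.status}\")\n",
    "    if job.status in (\"succeeded\", \"failed\", \"cancelled\"):\n",
    "        break\n",
    "    time.sleep(60)"
   ]
  },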
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2062e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "job_id"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "066fef03-8338-4526-9df3-89b649ad4f0a",
   "metadata": {},
   "source": [
    "# Step 3\n",
    "\n",
    "Test our fine-tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa4488cb-3c17-4eda-abd1-53c1c68a491b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Once the job has succeeded, retrieve the name of our fine-tuned model\n",
    "\n",
    "fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n",
    "fine_tuned_model_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2206d9d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(fine_tuned_model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ea68e8-ab1b-4f0d-aba4-a59574d8f85e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The prompt for inference - this time the assistant's reply is left as just the prefix \"Price is $\"\n",
    "\n",
    "def messages_for(item):\n",
    "    system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
    "    user_prompt = item.test_prompt().replace(\" to the nearest dollar\",\"\").replace(\"\\n\\nPrice is $\",\"\")\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": system_message},\n",
    "        {\"role\": \"user\", \"content\": user_prompt},\n",
    "        {\"role\": \"assistant\", \"content\": \"Price is $\"}\n",
    "    ]\n",
    "\n",
    "# A utility to pull the first number out of the model's reply\n",
    "\n",
    "def get_price(s):\n",
    "    s = s.replace('$','').replace(',','')\n",
    "    match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", s)\n",
    "    return float(match.group()) if match else 0\n",
    "\n",
    "messages_for(test[0])\n",
    "get_price(\"The price is roughly $99.99 because blah blah\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "501a2a7a-69c8-451b-bbc0-398bcb9e1612",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The function for gpt-4o-mini\n",
    "\n",
    "def gpt_fine_tuned(item):\n",
    "    response = openai.chat.completions.create(\n",
    "        model=fine_tuned_model_name,\n",
    "        messages=messages_for(item),\n",
    "        seed=42,\n",
    "        max_tokens=7\n",
    "    )\n",
    "    reply = response.choices[0].message.content\n",
    "    return get_price(reply)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84e3813a",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(test[0].test_prompt())\n",
    "\n",
    "print(test[0].price)\n",
    "print(gpt_fine_tuned(test[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36bdd2c9-1859-4f99-a09f-3ec83b845b30",
   "metadata": {},
   "outputs": [],
   "source": [
    "Tester.test(gpt_fine_tuned, test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}