{ "cells": [ { "cell_type": "markdown", "id": "41fb78a4-5aa1-4288-9cc2-6f742062f0a3", "metadata": { "id": "41fb78a4-5aa1-4288-9cc2-6f742062f0a3" }, "source": [ "# Fine Tuning with OpenAI" ] }, { "cell_type": "markdown", "id": "f8d0713f-0f79-460f-8acb-47afb877d24a", "metadata": { "jp-MarkdownHeadingCollapsed": true, "id": "f8d0713f-0f79-460f-8acb-47afb877d24a" }, "source": [ "## Utilities" ] }, { "cell_type": "code", "execution_count": null, "id": "2cdfe762-3200-4459-981e-0ded7c14b4de", "metadata": { "id": "2cdfe762-3200-4459-981e-0ded7c14b4de" }, "outputs": [], "source": [ "# Constants - used for printing to stdout in color\n", "\n", "GREEN = \"\\033[92m\"\n", "YELLOW = \"\\033[93m\"\n", "RED = \"\\033[91m\"\n", "RESET = \"\\033[0m\"\n", "COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}" ] }, { "cell_type": "markdown", "id": "d9f325d5-fb67-475c-aca0-01c0f0ea5ec1", "metadata": { "jp-MarkdownHeadingCollapsed": true, "id": "d9f325d5-fb67-475c-aca0-01c0f0ea5ec1" }, "source": [ "### Item" ] }, { "cell_type": "code", "execution_count": null, "id": "0832e74b-2779-4822-8e6c-4361ec165c7f", "metadata": { "id": "0832e74b-2779-4822-8e6c-4361ec165c7f" }, "outputs": [], "source": [ "from typing import Optional\n", "from transformers import AutoTokenizer\n", "import re\n", "\n", "BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n", "\n", "MIN_TOKENS = 150 # Any less than this, and we don't have enough useful content\n", "MAX_TOKENS = 160 # Truncate after this many tokens. Then after adding in prompt text, we will get to around 180 tokens\n", "\n", "MIN_CHARS = 300\n", "CEILING_CHARS = MAX_TOKENS * 7\n", "\n", "class Item:\n", " \"\"\"\n", " An Item is a cleaned, curated datapoint of a Product with a Price\n", " \"\"\"\n", "\n", " tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", " PREFIX = \"Price is $\"\n", " QUESTION = \"How much does this cost to the nearest dollar?\"\n", " REMOVALS = ['\"Batteries Included?\": \"No\"', '\"Batteries Included?\": \"Yes\"', '\"Batteries Required?\": \"No\"', '\"Batteries Required?\": \"Yes\"', \"By Manufacturer\", \"Item\", \"Date First\", \"Package\", \":\", \"Number of\", \"Best Sellers\", \"Number\", \"Product \"]\n", "\n", " title: str\n", " price: float\n", " category: str\n", " token_count: int = 0\n", " details: Optional[str]\n", " prompt: Optional[str] = None\n", " include = False\n", "\n", " def __init__(self, data, price):\n", " self.title = data['title']\n", " self.price = price\n", " self.parse(data)\n", "\n", " def scrub_details(self):\n", " \"\"\"\n", " Clean up the details string by removing common text that doesn't add value\n", " \"\"\"\n", " details = self.details\n", " for remove in self.REMOVALS:\n", " details = details.replace(remove, \"\")\n", " return details\n", "\n", " def scrub(self, stuff):\n", " \"\"\"\n", " Clean up the provided text by removing unnecessary characters and whitespace\n", " Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers\n", " \"\"\"\n", " stuff = re.sub(r'[:\\[\\]\"{}【】\\s]+', ' ', stuff).strip()\n", " stuff = stuff.replace(\" ,\", \",\").replace(\",,,\",\",\").replace(\",,\",\",\")\n", " words = stuff.split(' ')\n", " select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]\n", " return \" \".join(select)\n", "\n", " def parse(self, data):\n", " \"\"\"\n", " Parse this datapoint and if it fits within the allowed Token range,\n", " then set include to True\n", " \"\"\"\n", " contents = '\\n'.join(data['description'])\n", " if contents:\n", " contents += '\\n'\n", " features = '\\n'.join(data['features'])\n", " if features:\n", " contents += features + '\\n'\n", " self.details = data['details']\n", " if self.details:\n", " contents += self.scrub_details() + '\\n'\n", " if len(contents) > MIN_CHARS:\n", " contents = contents[:CEILING_CHARS]\n", " text = f\"{self.scrub(self.title)}\\n{self.scrub(contents)}\"\n", " tokens = self.tokenizer.encode(text, add_special_tokens=False)\n", " if len(tokens) > MIN_TOKENS:\n", " tokens = tokens[:MAX_TOKENS]\n", " text = self.tokenizer.decode(tokens)\n", " self.make_prompt(text)\n", " self.include = True\n", "\n", " def make_prompt(self, text):\n", " \"\"\"\n", " Set the prompt instance variable to be a prompt appropriate for training\n", " \"\"\"\n", " self.prompt = f\"{self.QUESTION}\\n\\n{text}\\n\\n\"\n", " self.prompt += f\"{self.PREFIX}{str(round(self.price))}.00\"\n", " self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))\n", "\n", " def test_prompt(self):\n", " \"\"\"\n", " Return a prompt suitable for testing, with the actual price removed\n", " \"\"\"\n", " return self.prompt.split(self.PREFIX)[0] + self.PREFIX\n", "\n", " def __repr__(self):\n", " \"\"\"\n", " Return a String version of this Item\n", " \"\"\"\n", " return f\"<{self.title} = ${self.price}>\"\n" ] }, { "cell_type": "markdown", "source": [ "### Tester" ], "metadata": { "id": "LaIwYGzItsEi" }, "id": "LaIwYGzItsEi" }, { "cell_type": "code", "execution_count": null, "id": "129470d7-a5b1-4851-8800-970cccc8bcf5", "metadata": { "id": "129470d7-a5b1-4851-8800-970cccc8bcf5" }, "outputs": [], "source": [ "class Tester:\n", "\n", " def __init__(self, predictor, data, title=None, size=250):\n", " self.predictor = predictor\n", " self.data = data\n", " self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n", " self.size = size\n", " self.guesses = []\n", " self.truths = []\n", " self.errors = []\n", " self.sles = []\n", " self.colors = []\n", "\n", " def color_for(self, error, truth):\n", " if error<40 or error/truth < 0.2:\n", " return \"green\"\n", " elif error<80 or error/truth < 0.4:\n", " return \"orange\"\n", " else:\n", " return \"red\"\n", "\n", " def run_datapoint(self, i):\n", " datapoint = self.data[i]\n", " guess = self.predictor(datapoint)\n", " truth = datapoint.price\n", " error = abs(guess - truth)\n", " log_error = math.log(truth+1) - math.log(guess+1)\n", " sle = log_error ** 2\n", " color = self.color_for(error, truth)\n", " title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+\"...\"\n", " self.guesses.append(guess)\n", " self.truths.append(truth)\n", " self.errors.append(error)\n", " self.sles.append(sle)\n", " self.colors.append(color)\n", " print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n", "\n", " def chart(self, title):\n", " max_error = max(self.errors)\n", " plt.figure(figsize=(12, 8))\n", " max_val = max(max(self.truths), max(self.guesses))\n", " plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n", " plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n", " plt.xlabel('Ground Truth')\n", " plt.ylabel('Model Estimate')\n", " plt.xlim(0, max_val)\n", " plt.ylim(0, max_val)\n", " plt.title(title)\n", " plt.show()\n", "\n", " def report(self):\n", " average_error = sum(self.errors) / self.size\n", " rmsle = math.sqrt(sum(self.sles) / self.size)\n", " hits = sum(1 for color in self.colors if color==\"green\")\n", " title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n", " self.chart(title)\n", "\n", " def run(self):\n", " self.error = 0\n", " for i in range(self.size):\n", " self.run_datapoint(i)\n", " self.report()\n", "\n", " @classmethod\n", " def test(cls, function, data):\n", " cls(function, data).run()" ] }, { "cell_type": "code", "source": [ "# A utility function to extract the price from a string\n", "\n", "def get_price(s):\n", " s = s.replace('$','').replace(',','')\n", " match = re.search(r'[-+]?\\d*\\.?\\d+', s) # Simplify regex\n", " return float(match.group()) if match else 0" ], "metadata": { "id": "6XywRUiUro69" }, "id": "6XywRUiUro69", "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "id": "10af1228-30b7-4dfc-a364-059ea099af81", "metadata": { "id": "10af1228-30b7-4dfc-a364-059ea099af81" }, "source": [ "## Data Curation" ] }, { "cell_type": "code", "execution_count": null, "id": "5faa087c-bdf7-42e5-9c32-c0b0a4d4160f", "metadata": { "id": "5faa087c-bdf7-42e5-9c32-c0b0a4d4160f" }, "outputs": [], "source": [ "%pip install --upgrade --quiet jupyterlab ipython ipywidgets huggingface_hub datasets transformers\n", "\n", "%matplotlib notebook\n" ] }, { "cell_type": "markdown", "source": [ "### Load Dataset from Hugging Face" ], "metadata": { "id": "3XTxVhq0xC8Z" }, "id": "3XTxVhq0xC8Z" }, { "cell_type": "code", "execution_count": null, "id": "2bd6fc25-77c4-47a6-a2d2-ce80403f3c22", "metadata": { "id": "2bd6fc25-77c4-47a6-a2d2-ce80403f3c22" }, "outputs": [], "source": [ "from datasets import load_dataset, Dataset, DatasetDict\n", "from transformers import AutoTokenizer\n", "\n", "\n", "dataset = load_dataset('ranskills/Amazon-Reviews-2023-raw_meta_All_Beauty', split='full')" ] }, { "cell_type": "code", "execution_count": null, "id": "b66b59c2-80b2-4d47-b739-c59423cf9d7d", "metadata": { "id": "b66b59c2-80b2-4d47-b739-c59423cf9d7d" }, "outputs": [], "source": [ "from IPython.display import display, JSON\n", "\n", "\n", "print(f'Number of datapoints: {dataset.num_rows:,}')\n", "display(JSON(dataset.features.to_dict()))" ] }, { "cell_type": "code", "execution_count": null, "id": "e9620ed3-205e-48ee-b67a-e56b30bf6b6b", "metadata": { "id": "e9620ed3-205e-48ee-b67a-e56b30bf6b6b" }, "outputs": [], "source": [ "def non_zero_price_filter(datapoint: dict):\n", " try:\n", " price = float(datapoint['price'])\n", " return price > 0\n", " except:\n", " return False\n", "\n", "filtered_dataset = dataset.filter(non_zero_price_filter)\n", "\n", "print(f'Prices with non-zero prices:{filtered_dataset.num_rows:,} = {filtered_dataset.num_rows / dataset.num_rows * 100:,.2f}%')" ] }, { "cell_type": "code", "execution_count": null, "id": "834a3c4b-fc9c-4bc7-b6b9-bdf7e8d6d585", "metadata": { "id": "834a3c4b-fc9c-4bc7-b6b9-bdf7e8d6d585" }, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "\n", "data = defaultdict(lambda: [])\n", "for datapoint in filtered_dataset:\n", " price = float(datapoint['price'])\n", " contents = datapoint[\"title\"] + str(datapoint[\"description\"]) + str(datapoint[\"features\"]) + str(datapoint[\"details\"])\n", "\n", " data['price'].append(price)\n", " data['characters'].append(len(contents))\n", "\n", "%matplotlib inline\n", "\n", "df = pd.DataFrame(data)\n", "\n", "combined_describe = pd.concat(\n", " [df['price'].describe(), df['characters'].describe()],\n", " axis=1\n", ")\n", "\n", "display(combined_describe)\n", "\n", "prices = data['price']\n", "lengths = data['characters']\n", "\n", "plt.figure(figsize=(15, 6))\n", "plt.title(f\"Prices: Avg {df['price'].mean():,.2f} and highest {df['price'].max():,}\\n\")\n", "plt.xlabel('Length (chars)')\n", "plt.ylabel('Count')\n", "plt.hist(prices, rwidth=0.7, color=\"orange\", bins=range(0, 300, 10))\n", "plt.show()\n", "\n", "plt.figure(figsize=(15, 6))\n", "plt.title(f\"Characters: Avg {sum(lengths)/len(lengths):,.0f} and highest {max(lengths):,}\\n\")\n", "plt.xlabel('Length (characters)')\n", "plt.ylabel('Count')\n", "plt.hist(lengths, rwidth=0.7, color=\"lightblue\", bins=range(0, 2500, 50))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "a506f42c-81c0-4198-bc0b-1e0653620be8", "metadata": { "id": "a506f42c-81c0-4198-bc0b-1e0653620be8" }, "outputs": [], "source": [ "BASE_MODEL = 'meta-llama/Meta-Llama-3.1-8B'\n", "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n", "\n", "tokenizer.encode('114', add_special_tokens=False)\n", "\n", "items = []\n", "for datapoint in filtered_dataset:\n", " price = float(datapoint['price'])\n", " items.append(Item(datapoint, price))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5842ace6-332d-46da-a853-5ea5a2a1cf88", "metadata": { "id": "5842ace6-332d-46da-a853-5ea5a2a1cf88" }, "outputs": [], "source": [ "print(items[0].test_prompt())" ] }, { "cell_type": "code", "execution_count": null, "id": "42ee0099-0d2a-4331-a01c-3462363a6987", "metadata": { "id": "42ee0099-0d2a-4331-a01c-3462363a6987" }, "outputs": [], "source": [ "# filter out items with None prompt as a result of their content being below the minimum threshold\n", "valid_items = [item for item in items if item.prompt is not None]\n", "\n", "data_size = len(valid_items)\n", "\n", "\n", "training_size = int(data_size * 0.9)\n", "train = valid_items[:training_size]\n", "test = valid_items[training_size:]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1146d5a2-f93e-4fe9-864e-4ce7e01e257b", "metadata": { "id": "1146d5a2-f93e-4fe9-864e-4ce7e01e257b" }, "outputs": [], "source": [ "train_prompts = [item.prompt for item in train]\n", "train_prices = [item.price for item in train]\n", "test_prompts = [item.test_prompt() for item in test]\n", "test_prices = [item.price for item in test]" ] }, { "cell_type": "code", "execution_count": null, "id": "31ca360d-5fc6-487a-91c6-d61758b2ff16", "metadata": { "id": "31ca360d-5fc6-487a-91c6-d61758b2ff16" }, "outputs": [], "source": [ "# Create a Dataset from the lists\n", "\n", "train_dataset = Dataset.from_dict({\"text\": train_prompts, \"price\": train_prices})\n", "test_dataset = Dataset.from_dict({\"text\": test_prompts, \"price\": test_prices})\n", "dataset = DatasetDict({\n", " \"train\": train_dataset,\n", " \"test\": test_dataset\n", "})" ] }, { "cell_type": "markdown", "id": "05e6ca7e-bf40-49f9-bffb-a5b22e5800d8", "metadata": { "id": "05e6ca7e-bf40-49f9-bffb-a5b22e5800d8" }, "source": [ "### Export Data" ] }, { "cell_type": "code", "execution_count": null, "id": "b0ff2fe3-78bf-49e3-a682-6a46742d010c", "metadata": { "id": "b0ff2fe3-78bf-49e3-a682-6a46742d010c" }, "outputs": [], "source": [ "import pickle\n", "\n", "DATA_DIR = 'data'\n", "\n", "train_storage_file = lambda ext: f'{DATA_DIR}/all_beauty_train{ext}'\n", "test_storage_file = lambda ext: f'{DATA_DIR}/all_beauty_test{ext}'\n", "\n", "with open(train_storage_file('.pkl'), 'wb') as file:\n", " pickle.dump(train, file)\n", "\n", "with open(test_storage_file('.pkl'), 'wb') as file:\n", " pickle.dump(test, file)" ] }, { "cell_type": "code", "execution_count": null, "id": "b2164662-9bc9-4a66-9e4e-a8a955a45753", "metadata": { "id": "b2164662-9bc9-4a66-9e4e-a8a955a45753" }, "outputs": [], "source": [ "dataset['train'].to_parquet(train_storage_file('.parquet'))\n", "dataset['test'].to_parquet(test_storage_file('.parquet'))\n", "\n", "# How to load back the data\n", "# loaded_dataset = load_dataset(\"parquet\", data_files='amazon_polarity_train.parquet')" ] }, { "cell_type": "markdown", "id": "6fe428a2-41c4-4f7f-a43f-e8ba2f344013", "metadata": { "id": "6fe428a2-41c4-4f7f-a43f-e8ba2f344013" }, "source": [ "### Predictions" ] }, { "cell_type": "markdown", "source": [ "#### Random Pricer" ], "metadata": { "id": "qX0c_prppnyZ" }, "id": "qX0c_prppnyZ" }, { "cell_type": "code", "execution_count": null, "id": "7323252b-db50-4b8a-a7fc-8504bb3d218b", "metadata": { "id": "7323252b-db50-4b8a-a7fc-8504bb3d218b" }, "outputs": [], "source": [ "import random\n", "import math\n", "\n", "\n", "def random_pricer(item):\n", " return random.randrange(1,200)\n", "\n", "random.seed(42)\n", "\n", "# Run our TestRunner\n", "Tester.test(random_pricer, test)" ] }, { "cell_type": "markdown", "source": [ "#### Constant Pricer" ], "metadata": { "id": "O0xVXRXkp9sQ" }, "id": "O0xVXRXkp9sQ" }, { "cell_type": "code", "execution_count": null, "id": "6a932b0e-ba6e-45d2-8436-b740c3681272", "metadata": { "id": "6a932b0e-ba6e-45d2-8436-b740c3681272" }, "outputs": [], "source": [ "training_prices = [item.price for item in train]\n", "training_average = sum(training_prices) / len(training_prices)\n", "\n", "def constant_pricer(item):\n", " return training_average\n", "\n", "Tester.test(constant_pricer, test)" ] }, { "cell_type": "code", "execution_count": null, "id": "d3410bd4-98e4-42a6-a702-4423cfd034b4", "metadata": { "id": "d3410bd4-98e4-42a6-a702-4423cfd034b4" }, "outputs": [], "source": [ "train[0].details" ] }, { "cell_type": "markdown", "id": "44537051-7b4e-4b8c-95a7-a989ea51e517", "metadata": { "id": "44537051-7b4e-4b8c-95a7-a989ea51e517" }, "source": [ "### Prepare Fine-Tuning Data" ] }, { "cell_type": "code", "execution_count": null, "id": "47d03b0b-4a93-4f9d-80ac-10f3fc11ccec", "metadata": { "id": "47d03b0b-4a93-4f9d-80ac-10f3fc11ccec" }, "outputs": [], "source": [ "fine_tune_train = train[:100]\n", "fine_tune_validation = train[100:125]" ] }, { "cell_type": "code", "execution_count": null, "id": "4d7b6f35-890c-4227-8990-6b62694a332d", "metadata": { "id": "4d7b6f35-890c-4227-8990-6b62694a332d" }, "outputs": [], "source": [ "def messages_for(item):\n", " system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n", " user_prompt = item.test_prompt().replace(\" to the nearest dollar\",\"\").replace(\"\\n\\nPrice is $\",\"\")\n", " return [\n", " {\"role\": \"system\", \"content\": system_message},\n", " {\"role\": \"user\", \"content\": user_prompt},\n", " {\"role\": \"assistant\", \"content\": f\"Price is ${item.price:.2f}\"}\n", " ]\n", "\n", "messages_for(train[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "1a6e06f3-614f-4687-bd43-9ac03aaface8", "metadata": { "id": "1a6e06f3-614f-4687-bd43-9ac03aaface8" }, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "DATA_DIR = 'data'\n", "\n", "data_path = Path(DATA_DIR)\n", "\n", "def make_jsonl(items):\n", " result = \"\"\n", " for item in items:\n", " messages = messages_for(item)\n", " messages_str = json.dumps(messages)\n", " result += '{\"messages\": ' + messages_str +'}\\n'\n", " return result.strip()\n", "\n", "# print(make_jsonl(train[:3]))\n", "data_path.absolute()\n", "if not data_path.exists():\n", " data_path.mkdir(parents=True)\n", "\n", "\n", "\n", "train_jsonl_path = f'{data_path}/pricer_train.jsonl'\n", "validation_jsonl_path = f'{data_path}/pricer_validation.jsonl'" ] }, { "cell_type": "code", "execution_count": null, "id": "d8dda552-8003-4fdc-b36a-7d0afa9b0b42", "metadata": { "id": "d8dda552-8003-4fdc-b36a-7d0afa9b0b42" }, "outputs": [], "source": [ "def write_jsonl(items, filename):\n", " with open(filename, \"w\") as f:\n", " jsonl = make_jsonl(items)\n", " f.write(jsonl)" ] }, { "cell_type": "code", "execution_count": null, "id": "189e959c-d70c-4509-bff6-1cbd8e8db637", "metadata": { "id": "189e959c-d70c-4509-bff6-1cbd8e8db637" }, "outputs": [], "source": [ "\n", "write_jsonl(fine_tune_train, train_jsonl_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "6b1480e2-ed19-4d0e-bc5d-a00086d104a2", "metadata": { "id": "6b1480e2-ed19-4d0e-bc5d-a00086d104a2" }, "outputs": [], "source": [ "write_jsonl(fine_tune_validation, validation_jsonl_path)" ] }, { "cell_type": "markdown", "source": [ "## Training" ], "metadata": { "id": "ga-f4JK7sPU2" }, "id": "ga-f4JK7sPU2" }, { "cell_type": "code", "execution_count": null, "id": "de958a51-69ba-420c-84b7-d32765898fd2", "metadata": { "id": "de958a51-69ba-420c-84b7-d32765898fd2" }, "outputs": [], "source": [ "import os\n", "from openai import OpenAI\n", "from dotenv import load_dotenv\n", "from google.colab import userdata\n", "\n", "load_dotenv()\n", "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n", "\n", "openai = OpenAI()" ] }, { "cell_type": "code", "source": [ "with open(train_jsonl_path, 'rb') as f:\n", " train_file = openai.files.create(file=f, purpose='fine-tune')" ], "metadata": { "id": "QFDAoNnoRCk1" }, "id": "QFDAoNnoRCk1", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "train_file" ], "metadata": { "id": "kBVWisusQwDq" }, "id": "kBVWisusQwDq", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "with open(validation_jsonl_path, 'rb') as f:\n", " validation_file = openai.files.create(file=f, purpose='fine-tune')\n", "\n", "validation_file" ], "metadata": { "id": "wgth1KvMSEOb" }, "id": "wgth1KvMSEOb", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "wandb_integration = {\"type\": \"wandb\", \"wandb\": {\"project\": \"gpt-pricer\"}}" ], "metadata": { "id": "-ohEia37Sjtx" }, "id": "-ohEia37Sjtx", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "openai.fine_tuning.jobs.create(\n", " training_file=train_file.id,\n", " validation_file=validation_file.id,\n", " model=\"gpt-4o-mini-2024-07-18\",\n", " seed=42,\n", " hyperparameters={\"n_epochs\": 1},\n", " integrations = [wandb_integration],\n", " suffix=\"pricer\"\n", ")" ], "metadata": { "id": "g7uz8SC5S3_s" }, "id": "g7uz8SC5S3_s", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "openai.fine_tuning.jobs.list(limit=1)" ], "metadata": { "id": "_zHswJwzWCHZ" }, "id": "_zHswJwzWCHZ", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id\n", "job_id" ], "metadata": { "id": "rSHYkQojWH8Q" }, "id": "rSHYkQojWH8Q", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "openai.fine_tuning.jobs.retrieve(job_id)" ], "metadata": { "id": "Yqq-jd1yWMuO" }, "id": "Yqq-jd1yWMuO", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data" ], "metadata": { "id": "37BH0u-QWOiY" }, "id": "37BH0u-QWOiY", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import wandb\n", "from wandb.integration.openai.fine_tuning import WandbLogger\n", "\n", "\n", "wandb.login()\n", "# Sync the fine-tuning job with Weights & Biases.\n", "WandbLogger.sync(fine_tune_job_id=job_id, project=\"gpt-pricer\")" ], "metadata": { "id": "2nNSE_AzWYMq" }, "id": "2nNSE_AzWYMq", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model\n", "fine_tuned_model_name" ], "metadata": { "id": "ASiJUw-Fh8Ul" }, "id": "ASiJUw-Fh8Ul", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def messages_for(item):\n", " system_message = \"You estimate prices of items. Reply only with the price, no explanation\"\n", " user_prompt = item.test_prompt().replace(\" to the nearest dollar\",\"\").replace(\"\\n\\nPrice is $\",\"\")\n", " return [\n", " {\"role\": \"system\", \"content\": system_message},\n", " {\"role\": \"user\", \"content\": user_prompt},\n", " {\"role\": \"assistant\", \"content\": \"Price is $\"}\n", " ]" ], "metadata": { "id": "7jB_7gqBiH_r" }, "id": "7jB_7gqBiH_r", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# The function for gpt-4o-mini\n", "\n", "def gpt_fine_tuned(item):\n", " response = openai.chat.completions.create(\n", " model=fine_tuned_model_name,\n", " messages=messages_for(item),\n", " seed=42,\n", " max_tokens=7\n", " )\n", " reply = response.choices[0].message.content\n", " return get_price(reply)" ], "metadata": { "id": "BHfLSadhiVQE" }, "id": "BHfLSadhiVQE", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(test[0].price)\n", "print(gpt_fine_tuned(test[0]))" ], "metadata": { "id": "C0CiTZ4jkjrI" }, "id": "C0CiTZ4jkjrI", "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "Tester.test(gpt_fine_tuned, test)" ], "metadata": { "id": "WInQE0ObkuBl" }, "id": "WInQE0ObkuBl", "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "sagemaker-distribution:Python", "language": "python", "name": "conda-env-sagemaker-distribution-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 5 }