From beacdf756ceac9e8c2bbfc476adb42e71d58d5ca Mon Sep 17 00:00:00 2001
From: Nik <nikhil.raut94@gmail.com>
Date: Sun, 26 Oct 2025 22:11:51 +0530
Subject: [PATCH] Copy base notebook.

---
 .../nikhil_raut/week6_challenge.ipynb         | 571 ++++++++++++++++++
 1 file changed, 571 insertions(+)
 create mode 100644 week6/community-contributions/nikhil_raut/week6_challenge.ipynb

diff --git a/week6/community-contributions/nikhil_raut/week6_challenge.ipynb b/week6/community-contributions/nikhil_raut/week6_challenge.ipynb
new file mode 100644
index 0000000..14abeab
--- /dev/null
+++ b/week6/community-contributions/nikhil_raut/week6_challenge.ipynb
@@ -0,0 +1,571 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "db8736a7-ed94-441c-9556-831fa57b5a10",
+   "metadata": {},
+   "source": [
+    "# The Product Pricer Continued\n",
+    "\n",
+    "A model that can estimate how much something costs, from its description.\n",
+    "\n",
+    "## AT LAST - it's time for Fine Tuning!\n",
+    "\n",
+    "After all this data preparation and old-school machine learning, we've finally arrived at the moment you've been waiting for: fine-tuning a model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import re\n",
+    "import math\n",
+    "import json\n",
+    "import random\n",
+    "from dotenv import load_dotenv\n",
+    "from huggingface_hub import login\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pickle\n",
+    "from collections import Counter\n",
+    "from openai import OpenAI\n",
+    "from anthropic import Anthropic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36d05bdc-0155-4c72-a7ee-aa4e614ffd3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# environment\n",
+    "\n",
+    "load_dotenv(override=True)\n",
+    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
+    "os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
+    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4dd3aad2-6f99-433c-8792-e461d2f06622",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Log in to HuggingFace\n",
+    "\n",
+    "hf_token = os.environ['HF_TOKEN']\n",
+    "login(hf_token, add_to_git_credential=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "884a50bd-8cae-425e-8e56-f079fc3e65ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# moved our Tester into a separate package\n",
+    "# call it with Tester.test(function_name, test_dataset)\n",
+    "\n",
+    "from items import Item\n",
+    "from testing import Tester"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0a6fb86-74a4-403c-ab25-6db2d74e9d2b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c9b05f4-c9eb-462c-8d86-de9140a2d985",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's avoid curating all our data again! Load in the pickle files (only unpickle files you trust):\n",
+    "\n",
+    "with open('train.pkl', 'rb') as file:\n",
+    "    train = pickle.load(file)\n",
+    "\n",
+    "with open('test.pkl', 'rb') as file:\n",
+    "    test = pickle.load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8367135-f40e-43e1-8f3c-09e990ab1194",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# OpenAI recommends fine-tuning with populations of 50-100 examples\n",
+    "# But as our examples are very small, I'm suggesting we go with 200 examples (and 1 epoch)\n",
+    "\n",
+    "fine_tune_train = train[:200]\n",
+    "fine_tune_validation = train[200:250]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8be4a889-81c3-42b1-a2fc-034cdc7321a6",
+   "metadata": {},
+   "source": [
+    "# Step 1\n",
+    "\n",
+    "Prepare our data for fine-tuning in JSONL (JSON Lines) format and upload to OpenAI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ae2fb3c-1cff-4ce3-911e-627c970edd7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First let's work on a good prompt for a Frontier model\n",
+    "# Notice that I'm removing the \" to the nearest dollar\"\n",
+    "# When we train our own models, we'll need to make the problem as easy as possible,\n",
+    "# but a Frontier model needs no such simplification.\n",
+    "\n",
+    "def messages_for(item):\n",
+    "    \"\"\"Return the system, user and assistant messages used as one fine-tuning example.\"\"\"\n",
+    "    question = item.test_prompt()\n",
+    "    for unwanted in (\" to the nearest dollar\", \"\\n\\nPrice is $\"):\n",
+    "        question = question.replace(unwanted, \"\")\n",
+    "    instruction = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
+    "    roles_and_text = [(\"system\", instruction), (\"user\", question), (\"assistant\", f\"Price is ${item.price:.2f}\")]\n",
+    "    return [{\"role\": role, \"content\": text} for role, text in roles_and_text]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1aa280f6-1227-426a-a2e2-1ce985feba1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages_for(train[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0e5b56c-8a0b-4d8e-a112-ce87efb4e152",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the items into a list of json objects - a \"jsonl\" string\n",
+    "# Each row represents one training example (a list of messages) in the form:\n",
+    "# {\"messages\" : [{\"role\": \"system\", \"content\": \"You estimate prices...\n",
+    "\n",
+    "\n",
+    "def make_jsonl(items):\n",
+    "    \"\"\"Return the items as JSONL text: one {\"messages\": [...]} object per line, no trailing newline.\"\"\"\n",
+    "    # json.dumps serializes each whole record (same text as the old hand-built wrapper),\n",
+    "    # and str.join avoids growing one big string with += inside the loop.\n",
+    "    rows = [json.dumps({\"messages\": messages_for(item)}) for item in items]\n",
+    "    # Every row starts with '{' and ends with '}', so no strip() is needed after joining.\n",
+    "    return \"\\n\".join(rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e72de93-a6a6-4b35-855e-15786b97bf5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(make_jsonl(train[:3]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7734bff0-95c4-4e67-a87e-7e2254e2c67d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the items into jsonl and write them to a file\n",
+    "\n",
+    "def write_jsonl(items, filename):\n",
+    "    jsonl = make_jsonl(items)  # build first: an error here no longer leaves a truncated file\n",
+    "    with open(filename, \"w\", encoding=\"utf-8\") as f:  # explicit encoding, not the platform default\n",
+    "        f.write(jsonl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "393d3ad8-999a-4f99-8c04-339d9166d604",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "write_jsonl(fine_tune_train, \"fine_tune_train.jsonl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e23927f-d73e-4668-ac20-abe6f14a56cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "write_jsonl(fine_tune_validation, \"fine_tune_validation.jsonl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d59ad8d2-c61a-448e-b7ed-232f1606970f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"fine_tune_train.jsonl\", \"rb\") as f:\n",
+    "    train_file = openai.files.create(file=f, purpose=\"fine-tune\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "083fefba-fd54-47ce-9ff3-aabbc200846f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97df3360-0760-4422-a556-5f26d23de6dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"fine_tune_validation.jsonl\", \"rb\") as f:\n",
+    "    validation_file = openai.files.create(file=f, purpose=\"fine-tune\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1abb8f3-9e52-4061-970c-fcf399d8ffa3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validation_file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "466052b9-9fb9-48f6-8cf9-c74e6ddc1394",
+   "metadata": {},
+   "source": [
+    "# Step 2\n",
+    "\n",
+    "I love Weights and Biases - a beautiful, free platform for monitoring training runs.  \n",
+    "Weights and Biases is integrated with OpenAI for fine-tuning.\n",
+    "\n",
+    "First set up your weights & biases free account at:\n",
+    "\n",
+    "https://wandb.ai\n",
+    "\n",
+    "From the Avatar >> Settings menu, near the bottom, you can create an API key.\n",
+    "\n",
+    "Then visit the OpenAI dashboard at:\n",
+    "\n",
+    "https://platform.openai.com/account/organization\n",
+    "\n",
+    "In the integrations section, you can add your Weights & Biases key.\n",
+    "\n",
+    "## And now time to Fine-tune!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7add1a7-a746-4d6e-a5f8-e25629b8b527",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wandb_integration = {\"type\": \"wandb\", \"wandb\": {\"project\": \"gpt-pricer\"}}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49801e69-9277-4deb-9f33-99efb6b45ac2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_file.id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "45421b86-5531-4e42-ab19-d6abbb8f4c13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.fine_tuning.jobs.create(\n",
+    "    training_file=train_file.id,\n",
+    "    validation_file=validation_file.id,\n",
+    "    model=\"gpt-4o-mini-2024-07-18\",\n",
+    "    seed=42,\n",
+    "    hyperparameters={\"n_epochs\": 1},\n",
+    "    integrations = [wandb_integration],\n",
+    "    suffix=\"pricer\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aeb9de2e-542c-4e83-81c7-b6745133e48b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.fine_tuning.jobs.list(limit=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40d24873-8ff5-413f-b0d4-8f77c28f18e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "job_id = openai.fine_tuning.jobs.list(limit=1).data[0].id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a32aef35-4b38-436c-ad00-d082f758efa7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "job_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7e01247-c133-48e1-93d3-c79c399e6178",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.fine_tuning.jobs.retrieve(job_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f5150e1-b8de-485f-8eba-cf1e5b00c117",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10).data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b19ea9e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wandb\n",
+    "from wandb.integration.openai.fine_tuning import WandbLogger\n",
+    "\n",
+    "# Log in to Weights & Biases.\n",
+    "wandb.login()\n",
+    "# Sync the fine-tuning job with Weights & Biases.\n",
+    "WandbLogger.sync(fine_tune_job_id=job_id, project=\"gpt-pricer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "066fef03-8338-4526-9df3-89b649ad4f0a",
+   "metadata": {},
+   "source": [
+    "# Step 3\n",
+    "\n",
+    "Test our fine tuned model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa4488cb-3c17-4eda-abd1-53c1c68a491b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fine_tuned_model_name = openai.fine_tuning.jobs.retrieve(job_id).fine_tuned_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9370937-5a6f-4724-8265-b208663b4450",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fine_tuned_model_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "66ea68e8-ab1b-4f0d-aba4-a59574d8f85e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The prompt for inference (this redefines messages_for; the assistant turn is now just \"Price is $\")\n",
+    "\n",
+    "def messages_for(item):\n",
+    "    \"\"\"Return the inference messages: same prompt, but the assistant turn holds only 'Price is $'.\"\"\"\n",
+    "    question = item.test_prompt()\n",
+    "    for unwanted in (\" to the nearest dollar\", \"\\n\\nPrice is $\"):\n",
+    "        question = question.replace(unwanted, \"\")\n",
+    "    instruction = \"You estimate prices of items. Reply only with the price, no explanation\"\n",
+    "    roles_and_text = [(\"system\", instruction), (\"user\", question), (\"assistant\", \"Price is $\")]\n",
+    "    return [{\"role\": role, \"content\": text} for role, text in roles_and_text]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ff92d61-0d27-4b0d-8b32-c9891016509b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Try this out\n",
+    "\n",
+    "messages_for(test[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1af1888-f94a-4106-b0d8-8a70939eec4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A utility function to extract the price from a string\n",
+    "\n",
+    "def get_price(s):\n",
+    "    \"\"\"Return the first number in s as a float ('$' and ',' are ignored), or 0.0 if there is none.\"\"\"\n",
+    "    match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", s.replace('$', '').replace(',', ''))\n",
+    "    return float(match.group()) if match else 0.0  # always a float, never the int 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f138c5b7-bcc1-4085-aced-68dad1bf36b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_price(\"The price is roughly $99.99 because blah blah\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "501a2a7a-69c8-451b-bbc0-398bcb9e1612",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The function for our fine-tuned gpt-4o-mini\n",
+    "\n",
+    "def gpt_fine_tuned(item):\n",
+    "    \"\"\"Ask the fine-tuned model for a price estimate of item and return it as a number.\"\"\"\n",
+    "    response = openai.chat.completions.create(\n",
+    "        model=fine_tuned_model_name,\n",
+    "        messages=messages_for(item),\n",
+    "        seed=42, max_tokens=7\n",
+    "    )\n",
+    "    # message.content may be None in the OpenAI SDK; fall back to \"\" so get_price returns 0\n",
+    "    return get_price(response.choices[0].message.content or \"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "843d88b4-364a-431b-b48b-8a7c1f68b786",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(test[0].price)\n",
+    "print(gpt_fine_tuned(test[0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edd7ada0-15b7-42ec-bbbb-1250e0eb9af1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(test[0].test_prompt())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36bdd2c9-1859-4f99-a09f-3ec83b845b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Tester.test(gpt_fine_tuned, test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}