{ "cells": [
{ "cell_type": "markdown", "id": "db8736a7-ed94-441c-9556-831fa57b5a10", "metadata": {}, "source": [ "# The Product Pricer Fine Tuning\n", "\n", "Submitted By: Bharat Puri\n", "\n", "A model that can estimate how much something costs, from its description.\n" ] },
{ "cell_type": "code", "execution_count": 5, "id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a", "metadata": {}, "outputs": [], "source": [ "# imports\n", "\n", "import os\n", "import re\n", "import math\n", "import json\n", "import random\n", "from dotenv import load_dotenv\n", "from huggingface_hub import login\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "from collections import Counter\n", "import sys\n", "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n", "from openai import OpenAI\n", "from anthropic import Anthropic\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_absolute_error\n" ] },
{ "cell_type": "code", "execution_count": 2, "id": "36d05bdc-0155-4c72-a7ee-aa4e614ffd3c", "metadata": {}, "outputs": [], "source": [
"# environment\n",
"\n",
"# Read variables from a local .env file; override=True lets the file take\n",
"# precedence over values already present in the process environment.\n",
"load_dotenv(override=True)\n",
"\n",
"# Re-export each credential, substituting a visible placeholder when it is unset.\n",
"os.environ.update({\n",
"    name: os.getenv(name, 'your-key-if-not-using-env')\n",
"    for name in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'HF_TOKEN')\n",
"})"
] },
{ "cell_type": "code", "execution_count": 3, "id": "4dd3aad2-6f99-433c-8792-e461d2f06622", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" ] } ], "source": [
"# Log in to HuggingFace\n",
"\n",
"# Authenticate with the HF_TOKEN value from the environment and ask the hub\n",
"# client to also store it as a git credential.\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(token=hf_token, add_to_git_credential=True)"
] },
{ "cell_type": "code", "execution_count": 7, "id":
"884a50bd-8cae-425e-8e56-f079fc3e65ce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Input columns: [\"How much does this cost to the nearest dollar?\\n\\nOEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln Mark LT 2007 2008 - BuyAutoParts NEW\\nAs one of the world's largest automotive parts suppliers, our parts are trusted every day by mechanics and vehicle owners worldwide. This A/C Compressor and Components Kit is manufactured and tested to the strictest OE standards for unparalleled performance. Built for trouble-free ownership and 100% visually inspected and quality tested, this A/C Compressor and Components Kit is backed by our 100% satisfaction guarantee. Guaranteed Exact Fit for easy installation 100% BRAND NEW, premium ISO/TS 16949 quality - tested to meet or exceed OEM specifications Engineered for superior durability, backed by industry-leading unlimited-mileage warranty Included in this K\\n\\nPrice is $\", '0']\n", "Output columns: [\"How much does this cost to the nearest dollar?\\n\\nOEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln Mark LT 2007 2008 - BuyAutoParts NEW\\nAs one of the world's largest automotive parts suppliers, our parts are trusted every day by mechanics and vehicle owners worldwide. This A/C Compressor and Components Kit is manufactured and tested to the strictest OE standards for unparalleled performance. Built for trouble-free ownership and 100% visually inspected and quality tested, this A/C Compressor and Components Kit is backed by our 100% satisfaction guarantee. 
Guaranteed Exact Fit for easy installation 100% BRAND NEW, premium ISO/TS 16949 quality - tested to meet or exceed OEM specifications Engineered for superior durability, backed by industry-leading unlimited-mileage warranty Included in this K\\n\\nPrice is $\", '120']\n" ] } ], "source": [
"# =============================================\n",
"# Step 1 – Load and Inspect Dataset (CSV files)\n",
"# =============================================\n",
"\n",
"# Neither CSV has a header row: every line is a (prompt text, price) record.\n",
"# With the default header handling pandas used the first record as column\n",
"# names and dropped it from the data, so read with header=None instead.\n",
"df_input = pd.read_csv(\"../../human_input.csv\", header=None)\n",
"df_output = pd.read_csv(\"../../human_output.csv\", header=None)\n",
"\n",
"print(\"Input columns:\", df_input.columns.tolist())\n",
"print(\"Output columns:\", df_output.columns.tolist())\n",
"\n",
"# Rows are paired by position, so both files must be the same length.\n",
"assert len(df_input) == len(df_output), \"input and output CSVs must have the same number of rows\"\n",
"\n",
"# Column 0 of both files is the prompt text. The target price is column 1 of\n",
"# the output file (column 1 of the input file is a 0 placeholder), so the\n",
"# completion must come from column 1, not column 0.\n",
"input_col = df_input.columns[0]    # prompt text\n",
"output_col = df_output.columns[1]  # price\n",
"\n",
"data = pd.DataFrame({\n",
"    \"prompt\": df_input[input_col].astype(str),\n",
"    \"completion\": df_output[output_col].astype(str)\n",
"})"
] },
{ "cell_type": "code", "execution_count": 8, "id": "b0a6fb86-74a4-403c-ab25-6db2d74e9d2b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training samples: 199 | Validation samples: 50\n", "✅ Train and validation data prepared successfully.\n" ] } ], "source": [
"# =============================================\n",
"# Step 2 – Split into Train and Validation Sets\n",
"# =============================================\n",
"\n",
"# train_test_split is already imported in the imports cell at the top.\n",
"# Keep this small to minimize cost; the fixed random_state makes the split reproducible.\n",
"train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)\n",
"\n",
"print(f\"Training samples: {len(train_df)} | Validation samples: {len(val_df)}\")\n",
"\n",
"# Save to JSONL format (required by OpenAI fine-tuning API)\n",
"# NOTE(review): each record is written as {\"prompt\", \"completion\"}; chat-model\n",
"# fine-tuning expects a \"messages\" list per record - confirm a later step converts these.\n",
"train_df.to_json(\"train.jsonl\", orient=\"records\", lines=True)\n",
"val_df.to_json(\"val.jsonl\", orient=\"records\", lines=True)\n",
"\n",
"print(\"✅ Train and validation data prepared successfully.\")"
] },
{ "cell_type": "code", "execution_count": 9, "id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | prompt | \n", "completion | \n", "
|---|---|---|
| 137 | \n", "How much does this cost to the nearest dollar?... | \n", "How much does this cost to the nearest dollar?... | \n", "
| 6 | \n", "How much does this cost to the nearest dollar?... | \n", "How much does this cost to the nearest dollar?... | \n", "
| 97 | \n", "How much does this cost to the nearest dollar?... | \n", "How much does this cost to the nearest dollar?... | \n", "