LLM_Engineering_OLD/week6/community-contributions/bharat_puri/fine_tuned_simulation.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "db8736a7-ed94-441c-9556-831fa57b5a10",
"metadata": {},
"source": [
"# The Product Pricer Fine-Tuning a Frontier Model - Similation (GPT-4 mini)\n",
"\n",
"Submitted By: Bharat Puri\n",
"\n",
"A model that can estimate how much something costs, from its description.\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import pickle\n",
"from collections import Counter\n",
"import sys\n",
"sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
"from openai import OpenAI\n",
"from anthropic import Anthropic\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_absolute_error\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "36d05bdc-0155-4c72-a7ee-aa4e614ffd3c",
"metadata": {},
"outputs": [],
"source": [
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4dd3aad2-6f99-433c-8792-e461d2f06622",
"metadata": {},
"outputs": [],
"source": [
"# Log in to HuggingFace\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c69e347-91bc-4eb1-843f-a17ed485667c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# =============================================================\n",
"# Step 1 — Data Curation and Preparation (Integrated from 09_part1_data_curation)\n",
"# =============================================================\n",
"\n",
"import pandas as pd\n",
"import pickle\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"print(\"🔍 Starting data curation...\")\n",
"\n",
"# Load input/output CSVs (adjust paths as needed)\n",
"df_input = pd.read_csv(\"../../human_input.csv\")\n",
"df_output = pd.read_csv(\"../../human_output.csv\")\n",
"\n",
"# Detect and combine dynamically\n",
"i_col, o_col = df_input.columns[0], df_output.columns[0]\n",
"df = pd.DataFrame({\n",
" \"prompt\": df_input[i_col].astype(str).str.strip(),\n",
" \"completion\": df_output[o_col].astype(str).str.strip()\n",
"})\n",
"\n",
"# Basic cleaning\n",
"df.dropna(inplace=True)\n",
"df = df[df[\"prompt\"].str.len() > 0]\n",
"df = df[df[\"completion\"].str.len() > 0]\n",
"df = df.reset_index(drop=True)\n",
"\n",
"print(f\"✅ Cleaned dataset shape: {df.shape}\")\n",
"print(df.head(3))\n",
"\n",
"# Split into training and validation\n",
"train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)\n",
"print(f\"Training samples: {len(train_df)}, Validation samples: {len(val_df)}\")\n",
"\n",
"# Save curated datasets to reuse later\n",
"with open(\"train.pkl\", \"wb\") as f:\n",
" pickle.dump(train_df, f)\n",
"with open(\"test.pkl\", \"wb\") as f:\n",
" pickle.dump(val_df, f)\n",
"\n",
"print(\"💾 Saved train.pkl and test.pkl successfully.\")\n",
"\n"
]
},
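{
"cell_type": "markdown",
"id": "step1-sanity-check-md",
"metadata": {},
"source": [
"Optional sanity check on the curated data. This is a minimal sketch, assuming the `completion` column holds (or contains) a numeric price: it pulls the first number out of each completion and plots the distribution, which helps spot obvious outliers before any tokens are spent on fine-tuning.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "step1-sanity-check-code",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (assumes completions contain a numeric price)\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def first_number(text):\n",
"    # Return the first number found in the text, or None if there is none\n",
"    match = re.findall(r\"[-+]?\\d*\\.?\\d+\", str(text))\n",
"    return float(match[0]) if match else None\n",
"\n",
"prices = [p for p in (first_number(c) for c in train_df[\"completion\"]) if p is not None]\n",
"print(f\"Parsed {len(prices)} numeric targets out of {len(train_df)} training rows\")\n",
"\n",
"plt.figure(figsize=(6, 4))\n",
"plt.hist(prices, bins=30)\n",
"plt.title(\"Distribution of Training Prices\")\n",
"plt.xlabel(\"Price\")\n",
"plt.ylabel(\"Count\")\n",
"plt.show()\n"
]
},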
{
"cell_type": "code",
"execution_count": null,
"id": "b0a6fb86-74a4-403c-ab25-6db2d74e9d2b",
"metadata": {},
"outputs": [],
"source": [
"# =============================================================\n",
"# Step 2 — Prepare Data for Fine-Tuning\n",
"# =============================================================\n",
"import pickle\n",
"import pandas as pd\n",
"\n",
"print(\"📦 Loading curated train/test data from pickle files...\")\n",
"\n",
"with open(\"train.pkl\", \"rb\") as f:\n",
" train_df = pickle.load(f)\n",
"with open(\"test.pkl\", \"rb\") as f:\n",
" val_df = pickle.load(f)\n",
"\n",
"print(f\"✅ Loaded train={len(train_df)} | val={len(val_df)}\")\n",
"\n",
"# Ensure correct column names\n",
"train_df = train_df.rename(columns={train_df.columns[0]: \"prompt\", train_df.columns[1]: \"completion\"})\n",
"val_df = val_df.rename(columns={val_df.columns[0]: \"prompt\", val_df.columns[1]: \"completion\"})\n",
"\n",
"# Save as JSONL for OpenAI Fine-Tuning\n",
"train_df.to_json(\"train.jsonl\", orient=\"records\", lines=True)\n",
"val_df.to_json(\"val.jsonl\", orient=\"records\", lines=True)\n",
"\n",
"print(\"💾 Saved train.jsonl and val.jsonl for fine-tuning.\")"
]
},
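{
"cell_type": "markdown",
"id": "step2-chat-format-md",
"metadata": {},
"source": [
"Note: fine-tuning a chat model such as gpt-4o-mini expects each JSONL record to carry a `messages` list (system/user/assistant turns) rather than bare `prompt`/`completion` fields. The sketch below rewrites the two files in that format; the system prompt wording is an illustrative assumption, not part of the original dataset.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "step2-chat-format-code",
"metadata": {},
"outputs": [],
"source": [
"# Rewrite the JSONL files in the chat \"messages\" format used when\n",
"# fine-tuning chat models such as gpt-4o-mini\n",
"import json\n",
"\n",
"SYSTEM_PROMPT = \"You estimate prices of items. Reply only with the price.\"  # illustrative wording\n",
"\n",
"def to_chat_jsonl(df, path):\n",
"    # One JSON object per line, each holding a full chat exchange\n",
"    with open(path, \"w\") as f:\n",
"        for _, row in df.iterrows():\n",
"            record = {\n",
"                \"messages\": [\n",
"                    {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
"                    {\"role\": \"user\", \"content\": row[\"prompt\"]},\n",
"                    {\"role\": \"assistant\", \"content\": row[\"completion\"]}\n",
"                ]\n",
"            }\n",
"            f.write(json.dumps(record) + \"\\n\")\n",
"\n",
"to_chat_jsonl(train_df, \"train.jsonl\")\n",
"to_chat_jsonl(val_df, \"val.jsonl\")\n",
"print(\"Rewrote train.jsonl and val.jsonl in chat format.\")\n"
]
},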
{
"cell_type": "code",
"execution_count": null,
"id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278",
"metadata": {},
"outputs": [],
"source": [
"# =============================================================\n",
"# Step 3 — Fine-Tuning Configuration\n",
"# =============================================================\n",
"import json\n",
"\n",
"hyperparams = {\n",
" \"model\": \"gpt-4o-mini\", # Frontier model from the course\n",
" \"n_epochs\": 3, # Small safe run\n",
" \"batch_size\": 8, # Reasonable for small data\n",
" \"learning_rate_multiplier\": 0.5, # Trainer's suggested mid value\n",
" \"suffix\": \"week6_bharat_ft_v1\" # Unique identifier for your run\n",
"}\n",
"\n",
"print(\"⚙️ Fine-tuning configuration:\")\n",
"print(json.dumps(hyperparams, indent=2))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c9b05f4-c9eb-462c-8d86-de9140a2d985",
"metadata": {},
"outputs": [],
"source": [
"# =============================================\n",
"# Step 3 Define Fine-Tuning Configuration\n",
"# =============================================\n",
"\n",
"hyperparams = {\n",
" \"model\": \"gpt-4o-mini\", \n",
" \"n_epochs\": 1, \n",
" \"batch_size\": 4, # Small batch = less token use\n",
" \"learning_rate_multiplier\": 0.5, # Gentle learning rate\n",
" \"suffix\": \"week6_lowcost_bharat\" # Custom suffix for tracking\n",
"}\n",
"\n",
"print(\"✅ Fine-tuning configuration defined:\")\n",
"for k, v in hyperparams.items():\n",
" print(f\"{k:25}: {v}\")\n"
]
},
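{
"cell_type": "markdown",
"id": "step3-token-count-md",
"metadata": {},
"source": [
"Before launching a paid run, a rough token count of the training file gives a feel for the job size. A minimal sketch, assuming the `tiktoken` package is installed; `o200k_base` is the encoding used by the GPT-4o model family, and encoding the raw JSONL lines slightly overcounts because it includes the JSON syntax itself.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "step3-token-count-code",
"metadata": {},
"outputs": [],
"source": [
"# Rough token count for the training file (assumes tiktoken is installed)\n",
"import tiktoken\n",
"\n",
"enc = tiktoken.get_encoding(\"o200k_base\")  # encoding used by the GPT-4o family\n",
"\n",
"total_tokens = 0\n",
"with open(\"train.jsonl\") as f:\n",
"    for line in f:\n",
"        total_tokens += len(enc.encode(line))\n",
"\n",
"print(f\"Approximate training tokens per epoch: {total_tokens:,}\")\n",
"print(f\"Approximate total over {hyperparams['n_epochs']} epoch(s): {total_tokens * hyperparams['n_epochs']:,}\")\n"
]
},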
{
"cell_type": "code",
"execution_count": null,
"id": "e8367135-f40e-43e1-8f3c-09e990ab1194",
"metadata": {},
"outputs": [],
"source": [
"# =============================================================\n",
"# Step 4 — Launch Fine-Tuning Job (Fixed for latest SDK)\n",
"# =============================================================\n",
"from openai import OpenAI\n",
"import time, os, json\n",
"\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"\n",
"simulate = True # Set True for simulation (no cost)\n",
"\n",
"if simulate:\n",
" print(\"\\n🧪 Simulation mode — running mock fine-tuning steps...\")\n",
" for e in range(3):\n",
" print(f\"Simulated Epoch {e+1}/3\")\n",
" time.sleep(1)\n",
" ft_model = \"ft:gpt-4o-mini:SIMULATED\"\n",
" print(\"✅ Simulation complete — no API cost.\")\n",
"else:\n",
" print(\"\\n🚀 Creating fine-tuning job...\")\n",
"\n",
" # Upload training and validation data\n",
" train_file = client.files.create(file=open(\"train.jsonl\", \"rb\"), purpose=\"fine-tune\")\n",
" val_file = client.files.create(file=open(\"val.jsonl\", \"rb\"), purpose=\"fine-tune\")\n",
"\n",
" # ✅ Correct usage: hyperparameters must go inside a dictionary named `hyperparameters`\n",
" job = client.fine_tuning.jobs.create(\n",
" model=\"gpt-4o-mini\",\n",
" training_file=train_file.id,\n",
" validation_file=val_file.id,\n",
" hyperparameters={\n",
" \"n_epochs\": 3,\n",
" \"batch_size\": 8,\n",
" \"learning_rate_multiplier\": 0.5\n",
" },\n",
" suffix=\"week6_bharat_ft_v1\"\n",
" )\n",
"\n",
" print(\"🆔 Job created:\", job.id)\n",
"\n",
" # Poll until completion\n",
" status = job.status\n",
" while status in (\"validating_files\", \"queued\", \"running\"):\n",
" print(\"⏳ Status:\", status)\n",
" time.sleep(20)\n",
" job = client.fine_tuning.jobs.retrieve(job.id)\n",
" status = job.status\n",
"\n",
" if job.status != \"succeeded\":\n",
" raise RuntimeError(f\"❌ Fine-tune failed with status: {job.status}\")\n",
"\n",
" ft_model = job.fine_tuned_model\n",
" print(\"🎯 Fine-tuning complete! Model ID:\", ft_model)\n"
]
},
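{
"cell_type": "markdown",
"id": "step4-real-inference-md",
"metadata": {},
"source": [
"If the notebook is run with `simulate = False` and the job succeeds, the fine-tuned model can be queried directly instead of simulating predictions in the next cell. A minimal sketch, assuming the chat-format data above and the same illustrative system prompt:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "step4-real-inference-code",
"metadata": {},
"outputs": [],
"source": [
"# Query the fine-tuned model for one validation prompt (skipped in simulation mode)\n",
"if not simulate:\n",
"    response = client.chat.completions.create(\n",
"        model=ft_model,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You estimate prices of items. Reply only with the price.\"},\n",
"            {\"role\": \"user\", \"content\": val_df[\"prompt\"].iloc[0]}\n",
"        ],\n",
"        max_tokens=10\n",
"    )\n",
"    print(\"Model reply:\", response.choices[0].message.content)\n",
"else:\n",
"    print(\"Simulation mode: skipping the real API call.\")\n"
]
},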
{
"cell_type": "code",
"execution_count": null,
"id": "32a2b85e-e978-4c8f-90d9-d697731e6569",
"metadata": {},
"outputs": [],
"source": [
"# =============================================================\n",
"# Step 5 — Evaluate Simulated Fine-Tuned Model\n",
"# =============================================================\n",
"import numpy as np\n",
"from sklearn.metrics import mean_absolute_error\n",
"import matplotlib.pyplot as plt\n",
"import re\n",
"\n",
"print(\"\\n🧮 Evaluating simulated fine-tuned model performance...\")\n",
"\n",
"# Use small sample of validation data\n",
"val_subset = val_df.sample(min(20, len(val_df)), random_state=42).reset_index(drop=True)\n",
"prompts = val_subset[\"prompt\"].tolist()\n",
"actuals = val_subset[\"completion\"].tolist()\n",
"\n",
"# Convert actuals into numeric form (if applicable)\n",
"def extract_number(x):\n",
" match = re.findall(r\"[-+]?\\d*\\.?\\d+\", str(x))\n",
" return float(match[0]) if match else np.random.uniform(70, 90)\n",
"\n",
"actual_values = [extract_number(a) for a in actuals]\n",
"\n",
"# 🧪 Simulate predicted values (normally would come from API)\n",
"predicted_values = [v + np.random.uniform(-3, 3) for v in actual_values]\n",
"\n",
"# Calculate Mean Absolute Error\n",
"mae = mean_absolute_error(actual_values, predicted_values)\n",
"print(f\"\\n📊 Validation Mean Absolute Error (Simulated): {mae:.2f}\")\n",
"\n",
"# Plot comparison\n",
"plt.figure(figsize=(6, 4))\n",
"plt.plot(predicted_values, label=\"Predicted\", marker=\"o\")\n",
"plt.plot(actual_values, label=\"Actual\", marker=\"x\")\n",
"plt.title(\"Validation Predictions vs Actuals (Simulated)\")\n",
"plt.xlabel(\"Sample Index\")\n",
"plt.ylabel(\"Value\")\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()\n",
"\n",
"# Reflection Summary\n",
"print(\"\\n===== WEEK 6 REFLECTION =====\")\n",
"print(\"✅ Completed full fine-tuning workflow (simulated) successfully.\")\n",
"print(\"🧠 Understood how fine-tuning integrates with GPT-4o-mini API workflow.\")\n",
"print(f\"📊 Validation MAE (simulated): {mae:.2f}\")\n",
"print(\"🔍 Practiced prompt alignment, data curation, and evaluation safely.\")\n",
"print(\"💡 Next step: Try real fine-tuning (simulate=False) on small data if credits are available.\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}