Bootcamp: Solisoma (Updated to use the test_lite.pkl)
This commit is contained in:
@@ -7,11 +7,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"import sys\n",
|
||||
"sys.path.append(\"../..\")\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import pickle\n",
|
||||
"import json\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from items import Item\n",
|
||||
"import tiktoken\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import math\n",
|
||||
@@ -200,7 +203,8 @@
|
||||
"2. If information is incomplete or truncated, use your knowledge of similar products and market pricing to make informed predictions\n",
|
||||
"3. Consider product quality indicators, brand reputation, features, and typical market values\n",
|
||||
"4. Return ONLY the numeric price (e.g., \"29.99\") \n",
|
||||
"5. Do not include currency symbols, explanations, or additional text\n",
|
||||
"5. Do not include currency symbols, explanations, or additional text \n",
|
||||
"6. Return just the raw float number\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
@@ -266,7 +270,7 @@
|
||||
"source": [
|
||||
"train_data = data.sample(n=200, random_state=42)\n",
|
||||
"train_set = train_data.sample(frac=0.8, random_state=42)\n",
|
||||
"test_set = train_data.drop(train_set.index)"
|
||||
"validation_set = train_data.drop(train_set.index)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -284,7 +288,7 @@
|
||||
" f.write(json.dumps(messages) + '\\n')\n",
|
||||
"\n",
|
||||
"with open('validation_data.jsonl', 'w') as f:\n",
|
||||
" for index, row in test_set.iterrows():\n",
|
||||
" for index, row in validation_set.iterrows():\n",
|
||||
" messages = {\"messages\": generate_message(row)}\n",
|
||||
" f.write(json.dumps(messages) + '\\n')\n"
|
||||
]
|
||||
@@ -373,56 +377,38 @@
|
||||
" def run_datapoint(self, i):\n",
|
||||
" \"\"\"Test single datapoint\"\"\"\n",
|
||||
" row = self.data.iloc[i]\n",
|
||||
" \n",
|
||||
" # Get prediction\n",
|
||||
" predict = self.predictor(row)\n",
|
||||
" \n",
|
||||
" # Try to convert to float, skip if fails\n",
|
||||
" try:\n",
|
||||
" guess = float(predict)\n",
|
||||
" except (ValueError, TypeError):\n",
|
||||
" print(f\"{YELLOW}{i+1}: Skipped - Non-numeric response: {predict[:50]}...{RESET}\")\n",
|
||||
" return # Skip this datapoint\n",
|
||||
" return \n",
|
||||
" \n",
|
||||
" truth = float(row['price']) \n",
|
||||
" \n",
|
||||
" # Calculate metrics\n",
|
||||
" truth = float(row['price']) \n",
|
||||
" error = abs(guess - truth)\n",
|
||||
" log_error = math.log(truth + 1) - math.log(guess + 1)\n",
|
||||
" sle = log_error ** 2\n",
|
||||
" color = self.color_for(error, truth)\n",
|
||||
" \n",
|
||||
" # Get title for display\n",
|
||||
" title = row['title'] if len(row['title']) <= 40 else row['title'][:40] + \"...\"\n",
|
||||
" \n",
|
||||
" # Store results\n",
|
||||
" self.guesses.append(guess)\n",
|
||||
" self.truths.append(truth)\n",
|
||||
" self.errors.append(error)\n",
|
||||
" self.sles.append(sle)\n",
|
||||
" self.colors.append(color)\n",
|
||||
" \n",
|
||||
" # Print result\n",
|
||||
" print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:.4f} Item: {title}{RESET}\")\n",
|
||||
" \n",
|
||||
" def chart(self, title):\n",
|
||||
" \"\"\"Create scatter plot of predictions vs truth\"\"\"\n",
|
||||
" plt.figure(figsize=(12, 8))\n",
|
||||
" max_val = max(max(self.truths), max(self.guesses))\n",
|
||||
" \n",
|
||||
" # Perfect prediction line\n",
|
||||
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6, label='Perfect Prediction')\n",
|
||||
" \n",
|
||||
" # Scatter plot\n",
|
||||
" plt.scatter(self.truths, self.guesses, s=30, c=self.colors, alpha=0.6)\n",
|
||||
" \n",
|
||||
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
|
||||
" plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
|
||||
" plt.xlabel('Ground Truth Price ($)', fontsize=12)\n",
|
||||
" plt.ylabel('Predicted Price ($)', fontsize=12)\n",
|
||||
" plt.xlim(0, max_val)\n",
|
||||
" plt.ylim(0, max_val)\n",
|
||||
" plt.title(title, fontsize=14)\n",
|
||||
" plt.legend()\n",
|
||||
" plt.grid(True, alpha=0.3)\n",
|
||||
" plt.show()\n",
|
||||
" \n",
|
||||
" def report(self):\n",
|
||||
@@ -443,7 +429,7 @@
|
||||
" print(f\"{'='*60}\\n\")\n",
|
||||
" \n",
|
||||
" # Create chart\n",
|
||||
" chart_title = f\"{self.title}\\nError=${average_error:,.2f} | RMSLE={rmsle:.4f} | Hits={hit_rate:.1f}%\"\n",
|
||||
" chart_title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:.2f} Hits={hit_rate:.1f}%\"\n",
|
||||
" self.chart(chart_title)\n",
|
||||
" \n",
|
||||
" # Return metrics\n",
|
||||
@@ -463,15 +449,16 @@
|
||||
" \"\"\"Run test on all datapoints\"\"\"\n",
|
||||
" print(f\"Testing {self.size} predictions...\\n\")\n",
|
||||
" \n",
|
||||
" self.error = 0\n",
|
||||
" for i in range(self.size):\n",
|
||||
" self.run_datapoint(i)\n",
|
||||
" \n",
|
||||
" return self.report()\n",
|
||||
" \n",
|
||||
" @classmethod\n",
|
||||
" def test(cls, predictor, data, title=\"Price Prediction Model\", size=None):\n",
|
||||
" def test(cls, predictor, data, title=\"Price Prediction Model\"):\n",
|
||||
" \"\"\"Quick test method\"\"\"\n",
|
||||
" return cls(predictor, data, title, size).run()"
|
||||
" return cls(predictor, data, title).run()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -487,6 +474,20 @@
|
||||
" print(\"Warning: Empty prompt!\")\n",
|
||||
" return data[\"price\"]\n",
|
||||
"\n",
|
||||
" user_prompt = f\"\"\"\n",
|
||||
" Return the price of the product in USD.\n",
|
||||
" Return just the raw float number.\n",
|
||||
"\n",
|
||||
" Product Description: {user_prompt}\n",
|
||||
" Note: Numbers in this description show product specifications like:\n",
|
||||
" - Dimensions (size measurements)\n",
|
||||
" - Weight (ounces/pounds)\n",
|
||||
" - Rankings (popularity/sales rank)\n",
|
||||
" - Part/model numbers\n",
|
||||
" \n",
|
||||
" Price prediction:\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" test = client.chat.completions.create(\n",
|
||||
" # uncomment this line to use your own model\n",
|
||||
" # model=status.fine_tuned_model, \n",
|
||||
@@ -497,12 +498,50 @@
|
||||
" ]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" result = test.choices[0].message.content\n",
|
||||
" return test.choices[0].message.content\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8f480630",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# I prepared test set from the test_lite.pkl file\n",
|
||||
"# I converted it from a list of objects to a pandas DataFrame\n",
|
||||
"# I cleaned the data to remove None values and duplicates\n",
|
||||
"\n",
|
||||
"with open('../../test_lite.pkl', 'rb') as file:\n",
|
||||
" test = pickle.load(file)\n",
|
||||
"\n",
|
||||
"test_set_in_obj_format = []\n",
|
||||
"for t in test:\n",
|
||||
" desc = \" \".join(t.prompt.split(\"\\n\")[2:4])\n",
|
||||
" title = t.title\n",
|
||||
" price = t.price\n",
|
||||
" test_set_in_obj_format.append({\"description\": desc, \"price\": price, \"title\": title})\n",
|
||||
"\n",
|
||||
"test_set = pd.DataFrame(test_set_in_obj_format)\n",
|
||||
"\n",
|
||||
"test_set[\"title\"] = test_set[\"title\"].apply(str)\n",
|
||||
"test_set[\"description\"] = test_set[\"description\"].apply(str)\n",
|
||||
"\n",
|
||||
"# Replace \"None\" and [] with None \n",
|
||||
"test_set[\"price\"] = test_set[\"price\"].replace(\"None\", None)\n",
|
||||
"test_set[\"title\"] = test_set[\"title\"].replace(\"\", None)\n",
|
||||
"test_set[\"description\"] = test_set[\"description\"].replace(\"[]\", None)\n",
|
||||
"\n",
|
||||
"test_set = test_set.dropna()\n",
|
||||
"test_set[\"price\"] = test_set[\"price\"].apply(float)\n",
|
||||
"\n",
|
||||
"test_set = test_set.drop_duplicates(subset=[\"title\", \"description\",\"price\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -510,8 +549,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_array = data.sample(n=300, random_state=42)\n",
|
||||
"result = PriceTester.test(predictor, test_array, title=\"GPT-4o-mini Fine-tuned\")"
|
||||
"result = PriceTester.test(predictor, test_set, title=\"GPT-4o-mini Fine-tuned\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user