LLM_Engineering_OLD/week6/community-contributions/solisoma/end_of_week_assesment.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8153067f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../..\")\n",
    "\n",
    "import os\n",
    "import pickle\n",
    "import json\n",
    "from openai import OpenAI\n",
    "from items import Item\n",
    "import tiktoken\n",
    "from dotenv import load_dotenv\n",
    "import math\n",
    "import matplotlib.pyplot as plt\n",
    "from huggingface_hub import login\n",
    "from datasets import load_dataset, Dataset, DatasetDict\n",
    "from transformers import AutoTokenizer\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "939441c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')\n",
    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')\n",
    "\n",
    "OUTLIER_EXECUTED = False\n",
    "BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
    "\n",
    "# This is the my fine-tuned model you can use it or decide to train your own\n",
    "FINE_TUNED_MODEL = \"ft:gpt-4o-mini-2024-07-18:quicksearch-plus::CV6dqS5l\"\n",
    "GREEN = \"\\033[92m\"\n",
    "YELLOW = \"\\033[93m\"\n",
    "RED = \"\\033[91m\"\n",
    "RESET = \"\\033[0m\"\n",
    "COLOR_MAP = {\"red\": RED, \"orange\": YELLOW, \"green\": GREEN}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ce01a10",
   "metadata": {},
   "outputs": [],
   "source": [
    "hf_token = os.environ['HF_TOKEN']\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c9e57b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", f\"raw_meta_Appliances\", split=\"full\", trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64670e7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(dataset,columns=[\"main_category\", \"title\", \"description\", \"features\", \"details\", \"price\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2c34577",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"title\"] = data[\"title\"].apply(str)\n",
    "data[\"description\"] = data[\"description\"].apply(str)\n",
    "data[\"features\"] = data[\"features\"].apply(str)\n",
    "\n",
    "# Replace \"None\" and [] with None \n",
    "data[\"price\"] = data[\"price\"].replace(\"None\", None)\n",
    "data[\"title\"] = data[\"title\"].replace(\"\", None)\n",
    "data[\"description\"] = data[\"description\"].replace(\"[]\", None)\n",
    "data[\"features\"] = data[\"features\"].replace(\"[]\", None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f99208b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.dropna()\n",
    "data[\"price\"] = data[\"price\"].apply(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c42b5c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop_duplicates(subset=[\"title\", \"description\",\"price\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73856ce5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Handle outliers\n",
    "# To do that we use the interquartile range\n",
    "# First we need to calculate the first and third quartiles\n",
    "# Make sure to run this just once  \n",
    "\n",
    "q1 = data[\"price\"].quantile(0.25)\n",
    "q3 = data[\"price\"].quantile(0.75)\n",
    "iqr = q3 - q1\n",
    "\n",
    "lower_bound = q1 - 1.5 * iqr\n",
    "higher_bound = q3 + 1.5 * iqr\n",
    "\n",
    "if not OUTLIER_EXECUTED:\n",
    "    OUTLIER_EXECUTED = True\n",
    "    data = data[(data[\"price\"] >= lower_bound) & (data[\"price\"] <= higher_bound) & (data[\"price\"] > 0)]\n",
    "else:\n",
    "   print(\"Outlier already executed\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "567b9e4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Further cleansing of the data (dealing with lists and dicts)\n",
    "def clean_list_string(field):\n",
    "    \"\"\"Convert string representation of list to clean string\"\"\"\n",
    "    try:\n",
    "        # Try to parse as literal list\n",
    "        if field.startswith('[') and field.endswith(']'):\n",
    "            parsed = ast.literal_eval(field)\n",
    "            return ' '.join(str(item) for item in parsed)\n",
    "    except:\n",
    "        pass\n",
    "    return str(field)\n",
    "\n",
    "def clean_dict_string(field):\n",
    "    \"\"\"Convert string representation of dict to clean string\"\"\"\n",
    "    try:\n",
    "        # Try to parse as literal dict\n",
    "        if field.startswith('{') and field.endswith('}'):\n",
    "            parsed = ast.literal_eval(field)\n",
    "            parts = []\n",
    "            for key, value in parsed.items():\n",
    "                if isinstance(value, dict):\n",
    "                    value = ', '.join(f\"{k}: {v}\" for k, v in value.items())\n",
    "                parts.append(f\"{key}: {value}\")\n",
    "            return ' | '.join(parts)\n",
    "    except:\n",
    "        pass\n",
    "    return str(field)\n",
    "\n",
    "\n",
    "data[\"description\"] = data[\"description\"].apply(clean_list_string)\n",
    "data[\"features\"] = data[\"features\"].apply(clean_list_string)\n",
    "data[\"details\"] = data[\"details\"].apply(clean_dict_string)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0011b3fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "SYSTEM_PROMPT = \"\"\"\n",
    "You are a price prediction expert. Given a product's title, description, features, or details, predict its price in USD.\n",
    "\n",
    "Rules:\n",
    "1. Analyze all available product information carefully\n",
    "2. If information is incomplete or truncated, use your knowledge of similar products and market pricing to make informed predictions\n",
    "3. Consider product quality indicators, brand reputation, features, and typical market values\n",
    "4. Return ONLY the numeric price (e.g., \"29.99\") \n",
    "5. Do not include currency symbols, explanations, or additional text \n",
    "6. Return just the raw float number\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "043cb9d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def truncate_by_tokens(text, max_tokens=300):\n",
    "    \"\"\"Truncate to max tokens\"\"\"\n",
    "    encoding = tiktoken.encoding_for_model(\"gpt-4o-mini\")\n",
    "    tokens = encoding.encode(text)\n",
    "    \n",
    "    if len(tokens) <= max_tokens:\n",
    "        return text\n",
    "    \n",
    "    truncated_tokens = tokens[:max_tokens]\n",
    "    return encoding.decode(truncated_tokens)\n",
    "\n",
    "def generate_prompt(data):\n",
    "    \"\"\"\n",
    "    Generate a prompt for the model to predict the price of a product\n",
    "    \"\"\"\n",
    "\n",
    "    prompt = f\"\"\"\n",
    "    Below are the details of the product: \n",
    "    Title: {data['title']}\n",
    "    Description: {data['description']}\n",
    "    Features: {data['features']}\n",
    "    \"\"\"\n",
    "    return truncate_by_tokens(prompt)\n",
    "\n",
    "def generate_message(data):\n",
    "    \"\"\"\n",
    "    Generate a message for the model to predict the price of a product\n",
    "    \"\"\"\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
    "        {\"role\": \"user\", \"content\": data[\"prompt\"]},\n",
    "        {\"role\": \"assistant\", \"content\": str(data['price'])}\n",
    "    ]\n",
    "    return messages\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdc8e3ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"prompt\"] = data.apply(lambda x: generate_prompt(x), axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d1837c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = data.sample(n=200, random_state=42)\n",
    "train_set = train_data.sample(frac=0.8, random_state=42)\n",
    "validation_set = train_data.drop(train_set.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7abfec95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a jsonl file for the training set\n",
    "\n",
    "with open('training_data.jsonl', 'w') as f:\n",
    "    for index, row in train_set.iterrows():\n",
    "        messages = {\"messages\": generate_message(row)}\n",
    "        f.write(json.dumps(messages) + '\\n')\n",
    "\n",
    "with open('validation_data.jsonl', 'w') as f:\n",
    "    for index, row in validation_set.iterrows():\n",
    "        messages = {\"messages\": generate_message(row)}\n",
    "        f.write(json.dumps(messages) + '\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a69291cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = OpenAI()\n",
    "\n",
    "# Uncoment the following code to train your own model\n",
    "\n",
    "# print(\"Uploading training file...\")\n",
    "# training_file = client.files.create(\n",
    "#     file=open('training_data.jsonl', 'rb'),\n",
    "#     purpose='fine-tune'\n",
    "# )\n",
    "# print(f\"File uploaded: {training_file.id}\")\n",
    "\n",
    "# print(\"Uploading validation file...\")\n",
    "# validation_file = client.files.create(\n",
    "#     file=open('validation_data.jsonl', 'rb'),\n",
    "#     purpose='fine-tune'\n",
    "# )\n",
    "# print(f\"Validation file uploaded: {validation_file.id}\")\n",
    "\n",
    "# print(\"Starting fine-tuning...\")\n",
    "# job = client.fine_tuning.jobs.create(\n",
    "#     validation_file=validation_file.id,\n",
    "#     training_file=training_file.id,\n",
    "#     model='gpt-4o-mini-2024-07-18'\n",
    "# )\n",
    "# print(f\"Job created: {job.id}\")\n",
    "\n",
    "# status = client.fine_tuning.jobs.retrieve(job.id)\n",
    "# print(f\"Status: {status.status}\")\n",
    "\n",
    "# import time\n",
    "# while status.status not in ['succeeded', 'failed']:\n",
    "#     time.sleep(60)\n",
    "#     status = client.fine_tuning.jobs.retrieve(job.id)\n",
    "#     print(f\"Status: {status.status}\")\n",
    "\n",
    "# if status.status == 'succeeded':\n",
    "#     print(f\"Model ready: {status.fine_tuned_model}\")\n",
    "# else:\n",
    "#     print(f\"Training failed: {status.error}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c0dfc1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "class PriceTester:\n",
    "    \n",
    "    def __init__(self, predictor, data, title=\"Price Prediction Model\", size=None):\n",
    "        \"\"\"\n",
    "        predictor: function that takes a row and returns predicted price\n",
    "        data: pandas DataFrame with test data\n",
    "        \"\"\"\n",
    "        self.predictor = predictor\n",
    "        self.data = data\n",
    "        self.title = title\n",
    "        self.size = size or len(data)\n",
    "        self.guesses = []\n",
    "        self.truths = []\n",
    "        self.errors = []\n",
    "        self.sles = []\n",
    "        self.colors = []\n",
    "    \n",
    "    def color_for(self, error, truth):\n",
    "        \"\"\"Determine color based on error\"\"\"\n",
    "        if error < 40 or error/truth < 0.2:\n",
    "            return \"green\"\n",
    "        elif error < 80 or error/truth < 0.4:\n",
    "            return \"orange\"\n",
    "        else:\n",
    "            return \"red\"\n",
    "    \n",
    "    def run_datapoint(self, i):\n",
    "        \"\"\"Test single datapoint\"\"\"\n",
    "        row = self.data.iloc[i]\n",
    "        predict = self.predictor(row)\n",
    "        try:\n",
    "            guess = float(predict)\n",
    "        except (ValueError, TypeError):\n",
    "            print(f\"{YELLOW}{i+1}: Skipped - Non-numeric response: {predict[:50]}...{RESET}\")\n",
    "            return \n",
    "        \n",
    "        truth = float(row['price']) \n",
    "        error = abs(guess - truth)\n",
    "        log_error = math.log(truth + 1) - math.log(guess + 1)\n",
    "        sle = log_error ** 2\n",
    "        color = self.color_for(error, truth)\n",
    "        title = row['title'] if len(row['title']) <= 40 else row['title'][:40] + \"...\"\n",
    "        \n",
    "        self.guesses.append(guess)\n",
    "        self.truths.append(truth)\n",
    "        self.errors.append(error)\n",
    "        self.sles.append(sle)\n",
    "        self.colors.append(color)\n",
    "        print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:.4f} Item: {title}{RESET}\")\n",
    "    \n",
    "    def chart(self, title):\n",
    "        \"\"\"Create scatter plot of predictions vs truth\"\"\"\n",
    "        plt.figure(figsize=(12, 8))\n",
    "        max_val = max(max(self.truths), max(self.guesses))\n",
    "        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
    "        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
    "        plt.xlabel('Ground Truth Price ($)', fontsize=12)\n",
    "        plt.ylabel('Predicted Price ($)', fontsize=12)\n",
    "        plt.xlim(0, max_val)\n",
    "        plt.ylim(0, max_val)\n",
    "        plt.title(title, fontsize=14)\n",
    "        plt.show()\n",
    "    \n",
    "    def report(self):\n",
    "        \"\"\"Generate final report with metrics\"\"\"\n",
    "        average_error = sum(self.errors) / self.size\n",
    "        rmsle = math.sqrt(sum(self.sles) / self.size)\n",
    "        hits = sum(1 for color in self.colors if color == \"green\")\n",
    "        hit_rate = hits / self.size * 100\n",
    "        \n",
    "        # Print summary\n",
    "        print(f\"\\n{'='*60}\")\n",
    "        print(f\"FINAL REPORT: {self.title}\")\n",
    "        print(f\"{'='*60}\")\n",
    "        print(f\"Total Predictions: {self.size}\")\n",
    "        print(f\"Average Error: ${average_error:,.2f}\")\n",
    "        print(f\"RMSLE: {rmsle:.4f}\")\n",
    "        print(f\"Hit Rate (Green): {hit_rate:.1f}% ({hits}/{self.size})\")\n",
    "        print(f\"{'='*60}\\n\")\n",
    "        \n",
    "        # Create chart\n",
    "        chart_title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:.2f} Hits={hit_rate:.1f}%\"\n",
    "        self.chart(chart_title)\n",
    "        \n",
    "        # Return metrics\n",
    "        return {\n",
    "            'average_error': average_error,\n",
    "            'rmsle': rmsle,\n",
    "            'hit_rate': hit_rate,\n",
    "            'hits': hits,\n",
    "            'guesses': self.guesses,\n",
    "            'truths': self.truths,\n",
    "            'errors': self.errors,\n",
    "            'sles': self.sles,\n",
    "            'colors': self.colors\n",
    "        }\n",
    "    \n",
    "    def run(self):\n",
    "        \"\"\"Run test on all datapoints\"\"\"\n",
    "        print(f\"Testing {self.size} predictions...\\n\")\n",
    "        \n",
    "        self.error = 0\n",
    "        for i in range(self.size):\n",
    "            self.run_datapoint(i)\n",
    "        \n",
    "        return self.report()\n",
    "    \n",
    "    @classmethod\n",
    "    def test(cls, predictor, data, title=\"Price Prediction Model\"):\n",
    "        \"\"\"Quick test method\"\"\"\n",
    "        return cls(predictor, data, title).run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cc250e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predictor(data):\n",
    "    user_prompt = data[\"description\"]    \n",
    "    if not user_prompt or user_prompt.strip() == \"\":\n",
    "        print(\"Warning: Empty prompt!\")\n",
    "        return data[\"price\"]\n",
    "\n",
    "    user_prompt = f\"\"\"\n",
    "    Return the price of the product in USD.\n",
    "    Return just the raw float number.\n",
    "\n",
    "    Product Description: {user_prompt}\n",
    "    Note: Numbers in this description show product specifications like:\n",
    "    - Dimensions (size measurements)\n",
    "    - Weight (ounces/pounds)\n",
    "    - Rankings (popularity/sales rank)\n",
    "    - Part/model numbers\n",
    "    \n",
    "    Price prediction:\n",
    "    \"\"\"\n",
    "\n",
    "    test = client.chat.completions.create(\n",
    "        # uncomment this line to use your own model\n",
    "        # model=status.fine_tuned_model, \n",
    "        model=FINE_TUNED_MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    result = test.choices[0].message.content\n",
    "    return test.choices[0].message.content\n",
    "\n",
    "\n",
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f480630",
   "metadata": {},
   "outputs": [],
   "source": [
    "# I prepared test set from the test_lite.pkl file\n",
    "# I converted it from a list of objects to a pandas DataFrame\n",
    "# I cleaned the data to remove None values and duplicates\n",
    "\n",
    "with open('../../test_lite.pkl', 'rb') as file:\n",
    "    test = pickle.load(file)\n",
    "\n",
    "test_set_in_obj_format = []\n",
    "for t in test:\n",
    "    desc = \" \".join(t.prompt.split(\"\\n\")[2:4])\n",
    "    title = t.title\n",
    "    price = t.price\n",
    "    test_set_in_obj_format.append({\"description\": desc, \"price\": price, \"title\": title})\n",
    "\n",
    "test_set = pd.DataFrame(test_set_in_obj_format)\n",
    "\n",
    "test_set[\"title\"] = test_set[\"title\"].apply(str)\n",
    "test_set[\"description\"] = test_set[\"description\"].apply(str)\n",
    "\n",
    "# Replace \"None\" and [] with None \n",
    "test_set[\"price\"] = test_set[\"price\"].replace(\"None\", None)\n",
    "test_set[\"title\"] = test_set[\"title\"].replace(\"\", None)\n",
    "test_set[\"description\"] = test_set[\"description\"].replace(\"[]\", None)\n",
    "\n",
    "test_set = test_set.dropna()\n",
    "test_set[\"price\"] = test_set[\"price\"].apply(float)\n",
    "\n",
    "test_set = test_set.drop_duplicates(subset=[\"title\", \"description\",\"price\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "297f1aed",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = PriceTester.test(predictor, test_set, title=\"GPT-4o-mini Fine-tuned\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}