{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mzYB4XYQeWRQ",
   "metadata": { "id": "mzYB4XYQeWRQ" },
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the install targets the running kernel's environment.\n",
    "%pip install -q tqdm huggingface_hub numpy sentence-transformers datasets chromadb catboost peft torch bitsandbytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f",
   "metadata": { "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f" },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import zipfile\n",
    "import chromadb\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from google.colab import userdata\n",
    "from huggingface_hub import HfApi, hf_hub_download, login\n",
    "from openai import OpenAI\n",
    "from peft import PeftModel\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score, mean_squared_error\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
    "from catboost import CatBoostRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d9523f-b6c9-4132-bd2b-6712772b3cd2",
   "metadata": {
    "colab": { "base_uri": "https://localhost:8080/" },
    "id": "05d9523f-b6c9-4132-bd2b-6712772b3cd2",
    "outputId": "f6bb70c7-58f8-4e3c-a592-83cfb4f395a7"
   },
   "outputs": [],
   "source": [
    "from google.colab import drive\n",
    "drive.mount(\"/content/drive\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "z9735RD_TUHw",
   "metadata": { "id": "z9735RD_TUHw" },
   "outputs": [],
   "source": [
    "# Credentials come from Colab's userdata secrets store -- never hardcoded.\n",
    "openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n",
    "openai = OpenAI(api_key=openai_api_key)\n",
    "\n",
    "hf_token = userdata.get(\"HF_TOKEN\")\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "DtswsfBQxxJF",
   "metadata": { "id": "DtswsfBQxxJF" },
   "outputs": [],
   "source": [
    "# Configuration\n",
    "HF_USER = \"qshaikh\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eKakxSFTVcA",
   "metadata": { "collapsed": true, "id": "0eKakxSFTVcA" },
   "outputs": [],
   "source": [
    "DATASET_NAME = f\"{HF_USER}/pricer-data\"\n",
    "dataset = load_dataset(DATASET_NAME)\n",
    "test = dataset[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cWqvs8JRTggE",
   "metadata": { "collapsed": true, "id": "cWqvs8JRTggE" },
   "outputs": [],
   "source": [
    "def description(item):\n",
    "    \"\"\"Strip the prompt scaffolding from a dataset item and return the bare\n",
    "    product text, prefixed with 'passage: ' as the e5 embedding models expect.\"\"\"\n",
    "    text = item[\"text\"].replace(\n",
    "        \"How much does this cost to the nearest dollar?\\n\\n\", \"\"\n",
    "    )\n",
    "    text = text.split(\"\\n\\nPrice is $\")[0]\n",
    "    return f\"passage: {text}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pjPBEgXqmHOA",
   "metadata": { "id": "pjPBEgXqmHOA" },
   "outputs": [],
   "source": [
    "CHROMA_PATH = \"/content/drive/MyDrive/chroma\"\n",
    "COLLECTION_NAME = \"price_items\"\n",
    "\n",
    "print(f\"Attempting to load ChromaDB from: {CHROMA_PATH}\")\n",
    "\n",
    "client = chromadb.PersistentClient(path=CHROMA_PATH)\n",
    "collection = client.get_or_create_collection(name=COLLECTION_NAME)\n",
    "\n",
    "print(f\"Successfully loaded ChromaDB collection '{COLLECTION_NAME}'.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fi1BS71XCv1",
   "metadata": { "id": "8fi1BS71XCv1" },
   "outputs": [],
   "source": [
    "embedding_model = SentenceTransformer(\"intfloat/e5-small-v2\", device=\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "zmwIbufXUzMo",
   "metadata": { "id": "zmwIbufXUzMo" },
   "outputs": [],
   "source": [
    "BASE_MODEL = \"meta-llama/Llama-3.1-8B\"\n",
    "FINETUNED_MODEL = \"ed-donner/pricer-2024-09-13_13.04.39\"\n",
    "REVISION = \"e8d637df551603dc86cd7a1598a8f44af4d7ae36\"\n",
    "\n",
    "# 4-bit NF4 quantization so the 8B model fits on a T4 GPU.\n",
    "quant_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "tokenizer.padding_side = \"right\"\n",
    "\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    BASE_MODEL, quantization_config=quant_config, device_map=\"auto\"\n",
    ")\n",
    "\n",
    "# Attach the fine-tuned LoRA adapter (pinned to a specific revision).\n",
    "fine_tuned_model = PeftModel.from_pretrained(\n",
    "    base_model, FINETUNED_MODEL, revision=REVISION\n",
    ")\n",
    "\n",
    "fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
    "\n",
    "print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0IHiJNU7a4XC",
   "metadata": { "id": "0IHiJNU7a4XC" },
   "outputs": [],
   "source": [
    "# CatBoost model trained previously and saved to Drive.\n",
    "catboost_model_path = \"/content/drive/MyDrive/catboost_model.pkl\"\n",
    "catboost_model = joblib.load(catboost_model_path)\n",
    "print(f\"Successfully loaded CatBoost model from {catboost_model_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "LgGmUKJxayZ6",
   "metadata": { "id": "LgGmUKJxayZ6" },
   "outputs": [],
   "source": [
    "def extract_tagged_price(output: str):\n",
    "    \"\"\"Pull the first number following 'Price is $' out of generated text.\n",
    "    Returns 0.0 when no price can be parsed.\"\"\"\n",
    "    try:\n",
    "        contents = output.split(\"Price is $\")[1].replace(\",\", \"\")\n",
    "        match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n",
    "        return float(match.group()) if match else 0.0\n",
    "    except Exception:\n",
    "        return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ggKf1nSQbAnv",
   "metadata": { "id": "ggKf1nSQbAnv" },
   "outputs": [],
   "source": [
    "def ft_llama_price(description: str):\n",
    "    \"\"\"Price estimate from the fine-tuned LLaMA model for one description.\"\"\"\n",
    "    prompt = (\n",
    "        f\"How much does this cost to the nearest dollar?\\n\\n{description}\\n\\nPrice is $\"\n",
    "    )\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "    outputs = fine_tuned_model.generate(\n",
    "        **inputs, max_new_tokens=5, num_return_sequences=1\n",
    "    )\n",
    "\n",
    "    result = tokenizer.decode(outputs[0])\n",
    "    price = extract_tagged_price(result)\n",
    "    return price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "_cWyYUd4Ub-K",
   "metadata": { "id": "_cWyYUd4Ub-K" },
   "outputs": [],
   "source": [
    "def catboost_price(description: str):\n",
    "    \"\"\"Price estimate from the CatBoost model over an e5 embedding,\n",
    "    clamped to be non-negative and rounded to cents.\"\"\"\n",
    "    vector = embedding_model.encode([description], normalize_embeddings=True)[0]\n",
    "    pred = catboost_model.predict([vector])[0]\n",
    "    return round(float(max(0, pred)), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3Skod8juXgnN",
   "metadata": { "id": "3Skod8juXgnN" },
   "outputs": [],
   "source": [
    "def gpt4o_price(item):\n",
    "    \"\"\"RAG price estimate: retrieve 5 similar products from Chroma, show them\n",
    "    to gpt-4o-mini as context, and parse the numeric reply (0.0 on failure).\"\"\"\n",
    "\n",
    "    def get_embedding(text):\n",
    "        return embedding_model.encode([text], normalize_embeddings=True)\n",
    "\n",
    "    def find_similars(text):\n",
    "        results = collection.query(\n",
    "            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5\n",
    "        )\n",
    "        docs = results[\"documents\"][0]\n",
    "        prices = [m[\"price\"] for m in results[\"metadatas\"][0]]\n",
    "        return docs, prices\n",
    "\n",
    "    def format_context(similars, prices):\n",
    "        context = (\n",
    "            \"To provide some context, here are similar products and their prices:\\n\\n\"\n",
    "        )\n",
    "        for sim, price in zip(similars, prices):\n",
    "            context += f\"Product:\\n{sim}\\nPrice is ${price:.2f}\\n\\n\"\n",
    "        return context\n",
    "\n",
    "    def build_messages(description, similars, prices):\n",
    "        system_message = (\n",
    "            \"You are a pricing expert. \"\n",
    "            \"Given a product description and a few similar products with their prices, \"\n",
    "            \"estimate the most likely price. \"\n",
    "            \"Respond ONLY with a number, no words.\"\n",
    "        )\n",
    "        context = format_context(similars, prices)\n",
    "        user_prompt = (\n",
    "            \"Estimate the price for the following product:\\n\\n\"\n",
    "            + description\n",
    "            + \"\\n\\n\"\n",
    "            + context\n",
    "        )\n",
    "        return [\n",
    "            {\"role\": \"system\", \"content\": system_message},\n",
    "            {\"role\": \"user\", \"content\": user_prompt},\n",
    "            {\"role\": \"assistant\", \"content\": \"Price is $\"},\n",
    "        ]\n",
    "\n",
    "    # Compute the description once instead of re-deriving it for each helper.\n",
    "    desc = description(item)\n",
    "    docs, prices = find_similars(desc)\n",
    "    messages = build_messages(desc, docs, prices)\n",
    "    response = openai.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\", messages=messages, seed=42, max_tokens=5\n",
    "    )\n",
    "    reply = response.choices[0].message.content\n",
    "    # Guard against a non-numeric reply: re.search returns None there, and the\n",
    "    # previous `.group() or 0` raised AttributeError instead of falling back.\n",
    "    match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", reply.replace(\"$\", \"\").replace(\",\", \"\"))\n",
    "    return float(match.group()) if match else 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8XQK5yrk8On4",
   "metadata": { "id": "8XQK5yrk8On4" },
   "outputs": [],
   "source": [
    "print(\"Splitting entire dataset...\")\n",
    "np.random.seed(42)\n",
    "all_indices = list(range(len(test)))\n",
    "np.random.shuffle(all_indices)\n",
    "\n",
    "train_split_size = int(0.8 * len(all_indices))\n",
    "train_indices = all_indices[:train_split_size]  # 80%\n",
    "test_indices = all_indices[train_split_size:]  # 20%\n",
    "\n",
    "# Cap sizes to keep the (slow) LLM calls affordable.\n",
    "train_indices = train_indices[:250]\n",
    "test_indices = test_indices[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "XN7P5fkkXfgP",
   "metadata": { "id": "XN7P5fkkXfgP" },
   "outputs": [],
   "source": [
    "ft_llama_preds_train = []\n",
    "gpt4omini_preds_train = []\n",
    "catboost_preds_train = []\n",
    "true_prices_train = []\n",
    "\n",
    "for i in tqdm(train_indices):\n",
    "    item = test[i]\n",
    "    text = description(item)\n",
    "    true_prices_train.append(item[\"price\"])\n",
    "    ft_llama_preds_train.append(ft_llama_price(text))\n",
    "    gpt4omini_preds_train.append(gpt4o_price(item))\n",
    "    catboost_preds_train.append(catboost_price(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1_6_atEgHnFR",
   "metadata": { "id": "1_6_atEgHnFR" },
   "outputs": [],
   "source": [
    "print(\"True Prices:\", true_prices_train)\n",
    "print(\"FT-LLaMA Predictions:\", ft_llama_preds_train)\n",
    "print(\"GPT-4o-mini Predictions:\", gpt4omini_preds_train)\n",
    "print(\"CatBoost Predictions:\", catboost_preds_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tYWMhTrXcA7x",
   "metadata": { "id": "tYWMhTrXcA7x" },
   "outputs": [],
   "source": [
    "# Ensemble features: the three base predictions plus their max and mean.\n",
    "maxes_train = [\n",
    "    max(a, b, c)\n",
    "    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)\n",
    "]\n",
    "means_train = [\n",
    "    np.mean([a, b, c])\n",
    "    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)\n",
    "]\n",
    "\n",
    "X_train = pd.DataFrame(\n",
    "    {\n",
    "        \"FT_LLaMA\": ft_llama_preds_train,\n",
    "        \"GPT4oMini\": gpt4omini_preds_train,\n",
    "        \"CatBoost\": catboost_preds_train,\n",
    "        \"Max\": maxes_train,\n",
    "        \"Mean\": means_train,\n",
    "    }\n",
    ")\n",
    "\n",
    "y_train = pd.Series(true_prices_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "-WsFABEicOyo",
   "metadata": { "id": "-WsFABEicOyo" },
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "lr = LinearRegression()\n",
    "lr.fit(X_train, y_train)\n",
    "\n",
    "feature_columns = X_train.columns.tolist()\n",
    "for feature, coef in zip(feature_columns, lr.coef_):\n",
    "    print(f\"{feature}: {coef:.2f}\")\n",
    "print(f\"Intercept={lr.intercept_:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "W3F0nNBXlrUJ",
   "metadata": { "id": "W3F0nNBXlrUJ" },
   "outputs": [],
   "source": [
    "ft_llama_preds_test = []\n",
    "gpt4omini_preds_test = []\n",
    "catboost_preds_test = []\n",
    "true_prices_test = []\n",
    "\n",
    "print(\"Processing TEST data (50 items)...\")\n",
    "for i in tqdm(test_indices):\n",
    "    item = test[i]\n",
    "    text = description(item)\n",
    "    true_prices_test.append(item[\"price\"])\n",
    "    ft_llama_preds_test.append(ft_llama_price(text))\n",
    "    gpt4omini_preds_test.append(gpt4o_price(item))\n",
    "    catboost_preds_test.append(catboost_price(text))\n",
    "\n",
    "maxes_test = [\n",
    "    max(a, b, c)\n",
    "    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)\n",
    "]\n",
    "means_test = [\n",
    "    np.mean([a, b, c])\n",
    "    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)\n",
    "]\n",
    "\n",
    "X_test = pd.DataFrame(\n",
    "    {\n",
    "        \"FT_LLaMA\": ft_llama_preds_test,\n",
    "        \"GPT4oMini\": gpt4omini_preds_test,\n",
    "        \"CatBoost\": catboost_preds_test,\n",
    "        \"Max\": maxes_test,\n",
    "        \"Mean\": means_test,\n",
    "    }\n",
    ")\n",
    "\n",
    "y_test = pd.Series(true_prices_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "y25l8rR791wG",
   "metadata": { "id": "y25l8rR791wG" },
   "outputs": [],
   "source": [
    "print(\"Evaluating model...\")\n",
    "y_pred = lr.predict(X_test)\n",
    "r2 = r2_score(y_test, y_pred)\n",
    "print(f\"R² score: {r2:.4f}\")\n",
    "\n",
    "rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
    "print(f\"RMSE: {rmse:.2f}\")\n",
    "\n",
    "# Exclude zero-priced items so MAPE does not divide by zero.\n",
    "nonzero = y_test != 0\n",
    "mape = np.mean(np.abs((y_test[nonzero] - y_pred[nonzero]) / y_test[nonzero])) * 100\n",
    "print(f\"MAPE: {mape:.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": { "gpuType": "T4", "provenance": [] },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}