{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mzYB4XYQeWRQ",
   "metadata": { "id": "mzYB4XYQeWRQ" },
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the install targets the running kernel's environment.\n",
    "%pip install -q tqdm huggingface_hub numpy sentence-transformers datasets chromadb catboost peft torch bitsandbytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f",
   "metadata": { "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f" },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import zipfile\n",
    "import chromadb\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from google.colab import userdata\n",
    "from huggingface_hub import HfApi, hf_hub_download, login\n",
    "from openai import OpenAI\n",
    "from peft import PeftModel\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score, mean_squared_error\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
    "from catboost import CatBoostRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d9523f-b6c9-4132-bd2b-6712772b3cd2",
   "metadata": {
    "colab": { "base_uri": "https://localhost:8080/" },
    "id": "05d9523f-b6c9-4132-bd2b-6712772b3cd2",
    "outputId": "f6bb70c7-58f8-4e3c-a592-83cfb4f395a7"
   },
   "outputs": [],
   "source": [
    "from google.colab import drive\n",
    "drive.mount(\"/content/drive\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "z9735RD_TUHw",
   "metadata": { "id": "z9735RD_TUHw" },
   "outputs": [],
   "source": [
    "# Credentials come from Colab's userdata secrets store -- never hardcoded.\n",
    "openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n",
    "openai = OpenAI(api_key=openai_api_key)\n",
    "\n",
    "hf_token = userdata.get(\"HF_TOKEN\")\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "DtswsfBQxxJF",
   "metadata": { "id": "DtswsfBQxxJF" },
   "outputs": [],
   "source": [
    "# Configuration\n",
    "HF_USER = \"qshaikh\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eKakxSFTVcA",
   "metadata": { "collapsed": true, "id": "0eKakxSFTVcA" },
   "outputs": [],
   "source": [
    "DATASET_NAME = f\"{HF_USER}/pricer-data\"\n",
    "dataset = load_dataset(DATASET_NAME)\n",
    "test = dataset[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cWqvs8JRTggE",
   "metadata": { "collapsed": true, "id": "cWqvs8JRTggE" },
   "outputs": [],
   "source": [
    "def description(item):\n",
    "    \"\"\"Strip the prompt scaffolding from a dataset item and return the bare\n",
    "    product text, prefixed with 'passage: ' as the e5 embedding models expect.\"\"\"\n",
    "    text = item[\"text\"].replace(\n",
    "        \"How much does this cost to the nearest dollar?\\n\\n\", \"\"\n",
    "    )\n",
    "    text = text.split(\"\\n\\nPrice is $\")[0]\n",
    "    return f\"passage: {text}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pjPBEgXqmHOA",
   "metadata": { "id": "pjPBEgXqmHOA" },
   "outputs": [],
   "source": [
    "CHROMA_PATH = \"/content/drive/MyDrive/chroma\"\n",
    "COLLECTION_NAME = \"price_items\"\n",
    "\n",
    "print(f\"Attempting to load ChromaDB from: {CHROMA_PATH}\")\n",
    "\n",
    "client = chromadb.PersistentClient(path=CHROMA_PATH)\n",
    "collection = client.get_or_create_collection(name=COLLECTION_NAME)\n",
    "\n",
    "print(f\"Successfully loaded ChromaDB collection '{COLLECTION_NAME}'.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fi1BS71XCv1",
   "metadata": { "id": "8fi1BS71XCv1" },
   "outputs": [],
   "source": [
    "embedding_model = SentenceTransformer(\"intfloat/e5-small-v2\", device=\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "zmwIbufXUzMo",
   "metadata": { "id": "zmwIbufXUzMo" },
   "outputs": [],
   "source": [
    "BASE_MODEL = \"meta-llama/Llama-3.1-8B\"\n",
    "FINETUNED_MODEL = \"ed-donner/pricer-2024-09-13_13.04.39\"\n",
    "REVISION = \"e8d637df551603dc86cd7a1598a8f44af4d7ae36\"\n",
    "\n",
    "# 4-bit NF4 quantization so the 8B model fits on a T4 GPU.\n",
    "quant_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "tokenizer.padding_side = \"right\"\n",
    "\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    BASE_MODEL, quantization_config=quant_config, device_map=\"auto\"\n",
    ")\n",
    "\n",
    "# Attach the fine-tuned LoRA adapter (pinned to a specific revision).\n",
    "fine_tuned_model = PeftModel.from_pretrained(\n",
    "    base_model, FINETUNED_MODEL, revision=REVISION\n",
    ")\n",
    "\n",
    "fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id\n",
    "\n",
    "print(f\"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0IHiJNU7a4XC",
   "metadata": { "id": "0IHiJNU7a4XC" },
   "outputs": [],
   "source": [
    "# CatBoost model trained previously and saved to Drive.\n",
    "catboost_model_path = \"/content/drive/MyDrive/catboost_model.pkl\"\n",
    "catboost_model = joblib.load(catboost_model_path)\n",
    "print(f\"Successfully loaded CatBoost model from {catboost_model_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "LgGmUKJxayZ6",
   "metadata": { "id": "LgGmUKJxayZ6" },
   "outputs": [],
   "source": [
    "def extract_tagged_price(output: str):\n",
    "    \"\"\"Pull the first number following 'Price is $' out of generated text.\n",
    "    Returns 0.0 when no price can be parsed.\"\"\"\n",
    "    try:\n",
    "        contents = output.split(\"Price is $\")[1].replace(\",\", \"\")\n",
    "        match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", contents)\n",
    "        return float(match.group()) if match else 0.0\n",
    "    except Exception:\n",
    "        return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ggKf1nSQbAnv",
   "metadata": { "id": "ggKf1nSQbAnv" },
   "outputs": [],
   "source": [
    "def ft_llama_price(description: str):\n",
    "    \"\"\"Price estimate from the fine-tuned LLaMA model for one description.\"\"\"\n",
    "    prompt = (\n",
    "        f\"How much does this cost to the nearest dollar?\\n\\n{description}\\n\\nPrice is $\"\n",
    "    )\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "    outputs = fine_tuned_model.generate(\n",
    "        **inputs, max_new_tokens=5, num_return_sequences=1\n",
    "    )\n",
    "\n",
    "    result = tokenizer.decode(outputs[0])\n",
    "    price = extract_tagged_price(result)\n",
    "    return price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "_cWyYUd4Ub-K",
   "metadata": { "id": "_cWyYUd4Ub-K" },
   "outputs": [],
   "source": [
    "def catboost_price(description: str):\n",
    "    \"\"\"Price estimate from the CatBoost model over an e5 embedding,\n",
    "    clamped to be non-negative and rounded to cents.\"\"\"\n",
    "    vector = embedding_model.encode([description], normalize_embeddings=True)[0]\n",
    "    pred = catboost_model.predict([vector])[0]\n",
    "    return round(float(max(0, pred)), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3Skod8juXgnN",
   "metadata": { "id": "3Skod8juXgnN" },
   "outputs": [],
   "source": [
    "def gpt4o_price(item):\n",
    "    \"\"\"RAG price estimate: retrieve 5 similar products from Chroma, show them\n",
    "    to gpt-4o-mini as context, and parse the numeric reply (0.0 on failure).\"\"\"\n",
    "\n",
    "    def get_embedding(text):\n",
    "        return embedding_model.encode([text], normalize_embeddings=True)\n",
    "\n",
    "    def find_similars(text):\n",
    "        results = collection.query(\n",
    "            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5\n",
    "        )\n",
    "        docs = results[\"documents\"][0]\n",
    "        prices = [m[\"price\"] for m in results[\"metadatas\"][0]]\n",
    "        return docs, prices\n",
    "\n",
    "    def format_context(similars, prices):\n",
    "        context = (\n",
    "            \"To provide some context, here are similar products and their prices:\\n\\n\"\n",
    "        )\n",
    "        for sim, price in zip(similars, prices):\n",
    "            context += f\"Product:\\n{sim}\\nPrice is ${price:.2f}\\n\\n\"\n",
    "        return context\n",
    "\n",
    "    def build_messages(description, similars, prices):\n",
    "        system_message = (\n",
    "            \"You are a pricing expert. \"\n",
    "            \"Given a product description and a few similar products with their prices, \"\n",
    "            \"estimate the most likely price. \"\n",
    "            \"Respond ONLY with a number, no words.\"\n",
    "        )\n",
    "        context = format_context(similars, prices)\n",
    "        user_prompt = (\n",
    "            \"Estimate the price for the following product:\\n\\n\"\n",
    "            + description\n",
    "            + \"\\n\\n\"\n",
    "            + context\n",
    "        )\n",
    "        return [\n",
    "            {\"role\": \"system\", \"content\": system_message},\n",
    "            {\"role\": \"user\", \"content\": user_prompt},\n",
    "            {\"role\": \"assistant\", \"content\": \"Price is $\"},\n",
    "        ]\n",
    "\n",
    "    # Compute the description once instead of re-deriving it for each helper.\n",
    "    desc = description(item)\n",
    "    docs, prices = find_similars(desc)\n",
    "    messages = build_messages(desc, docs, prices)\n",
    "    response = openai.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\", messages=messages, seed=42, max_tokens=5\n",
    "    )\n",
    "    reply = response.choices[0].message.content\n",
    "    # Guard against a non-numeric reply: re.search returns None there, and the\n",
    "    # previous `.group() or 0` raised AttributeError instead of falling back.\n",
    "    match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", reply.replace(\"$\", \"\").replace(\",\", \"\"))\n",
    "    return float(match.group()) if match else 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8XQK5yrk8On4",
   "metadata": { "id": "8XQK5yrk8On4" },
   "outputs": [],
   "source": [
    "print(\"Splitting entire dataset...\")\n",
    "np.random.seed(42)\n",
    "all_indices = list(range(len(test)))\n",
    "np.random.shuffle(all_indices)\n",
    "\n",
    "train_split_size = int(0.8 * len(all_indices))\n",
    "train_indices = all_indices[:train_split_size]  # 80%\n",
    "test_indices = all_indices[train_split_size:]  # 20%\n",
    "\n",
    "# Cap sizes to keep the (slow) LLM calls affordable.\n",
    "train_indices = train_indices[:250]\n",
    "test_indices = test_indices[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "XN7P5fkkXfgP",
   "metadata": { "id": "XN7P5fkkXfgP" },
   "outputs": [],
   "source": [
    "ft_llama_preds_train = []\n",
    "gpt4omini_preds_train = []\n",
    "catboost_preds_train = []\n",
    "true_prices_train = []\n",
    "\n",
    "for i in tqdm(train_indices):\n",
    "    item = test[i]\n",
    "    text = description(item)\n",
    "    true_prices_train.append(item[\"price\"])\n",
    "    ft_llama_preds_train.append(ft_llama_price(text))\n",
    "    gpt4omini_preds_train.append(gpt4o_price(item))\n",
    "    catboost_preds_train.append(catboost_price(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1_6_atEgHnFR",
   "metadata": { "id": "1_6_atEgHnFR" },
   "outputs": [],
   "source": [
    "print(\"True Prices:\", true_prices_train)\n",
    "print(\"FT-LLaMA Predictions:\", ft_llama_preds_train)\n",
    "print(\"GPT-4o-mini Predictions:\", gpt4omini_preds_train)\n",
    "print(\"CatBoost Predictions:\", catboost_preds_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tYWMhTrXcA7x",
   "metadata": { "id": "tYWMhTrXcA7x" },
   "outputs": [],
   "source": [
    "# Ensemble features: the three base predictions plus their max and mean.\n",
    "maxes_train = [\n",
    "    max(a, b, c)\n",
    "    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)\n",
    "]\n",
    "means_train = [\n",
    "    np.mean([a, b, c])\n",
    "    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)\n",
    "]\n",
    "\n",
    "X_train = pd.DataFrame(\n",
    "    {\n",
    "        \"FT_LLaMA\": ft_llama_preds_train,\n",
    "        \"GPT4oMini\": gpt4omini_preds_train,\n",
    "        \"CatBoost\": catboost_preds_train,\n",
    "        \"Max\": maxes_train,\n",
    "        \"Mean\": means_train,\n",
    "    }\n",
    ")\n",
    "\n",
    "y_train = pd.Series(true_prices_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "-WsFABEicOyo",
   "metadata": { "id": "-WsFABEicOyo" },
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "lr = LinearRegression()\n",
    "lr.fit(X_train, y_train)\n",
    "\n",
    "feature_columns = X_train.columns.tolist()\n",
    "for feature, coef in zip(feature_columns, lr.coef_):\n",
    "    print(f\"{feature}: {coef:.2f}\")\n",
    "print(f\"Intercept={lr.intercept_:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "W3F0nNBXlrUJ",
   "metadata": { "id": "W3F0nNBXlrUJ" },
   "outputs": [],
   "source": [
    "ft_llama_preds_test = []\n",
    "gpt4omini_preds_test = []\n",
    "catboost_preds_test = []\n",
    "true_prices_test = []\n",
    "\n",
    "print(\"Processing TEST data (50 items)...\")\n",
    "for i in tqdm(test_indices):\n",
    "    item = test[i]\n",
    "    text = description(item)\n",
    "    true_prices_test.append(item[\"price\"])\n",
    "    ft_llama_preds_test.append(ft_llama_price(text))\n",
    "    gpt4omini_preds_test.append(gpt4o_price(item))\n",
    "    catboost_preds_test.append(catboost_price(text))\n",
    "\n",
    "maxes_test = [\n",
    "    max(a, b, c)\n",
    "    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)\n",
    "]\n",
    "means_test = [\n",
    "    np.mean([a, b, c])\n",
    "    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)\n",
    "]\n",
    "\n",
    "X_test = pd.DataFrame(\n",
    "    {\n",
    "        \"FT_LLaMA\": ft_llama_preds_test,\n",
    "        \"GPT4oMini\": gpt4omini_preds_test,\n",
    "        \"CatBoost\": catboost_preds_test,\n",
    "        \"Max\": maxes_test,\n",
    "        \"Mean\": means_test,\n",
    "    }\n",
    ")\n",
    "\n",
    "y_test = pd.Series(true_prices_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "y25l8rR791wG",
   "metadata": { "id": "y25l8rR791wG" },
   "outputs": [],
   "source": [
    "print(\"Evaluating model...\")\n",
    "y_pred = lr.predict(X_test)\n",
    "r2 = r2_score(y_test, y_pred)\n",
    "print(f\"R² score: {r2:.4f}\")\n",
    "\n",
    "rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
    "print(f\"RMSE: {rmse:.2f}\")\n",
    "\n",
    "# Exclude zero-priced items so MAPE does not divide by zero.\n",
    "nonzero = y_test != 0\n",
    "mape = np.mean(np.abs((y_test[nonzero] - y_pred[nonzero]) / y_test[nonzero])) * 100\n",
    "print(f\"MAPE: {mape:.2f}%\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": { "gpuType": "T4", "provenance": [] },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}