From 651e3e6c8ea829aa19ffe33d0de38b763ad4f9a8 Mon Sep 17 00:00:00 2001 From: aashahid Date: Wed, 29 Oct 2025 17:49:41 +0500 Subject: [PATCH] Add Week 8 submission for muhammad_qasim_sheikh --- .../Week 8/Ensemble_Model.ipynb | 538 ++++++++++++++++++ 1 file changed, 538 insertions(+) create mode 100644 community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb diff --git a/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb b/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb new file mode 100644 index 0000000..1269895 --- /dev/null +++ b/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb @@ -0,0 +1,538 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "mzYB4XYQeWRQ", + "metadata": { + "id": "mzYB4XYQeWRQ" + }, + "outputs": [], + "source": [ + "!pip install tqdm huggingface_hub numpy sentence-transformers datasets chromadb catboost peft torch bitsandbytes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f", + "metadata": { + "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f" + }, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import zipfile\n", + "import chromadb\n", + "import joblib\n", + "import numpy as np\n", + "import pandas as pd\n", + "import requests\n", + "import torch\n", + "from datasets import load_dataset\n", + "from google.colab import userdata\n", + "from huggingface_hub import HfApi, hf_hub_download, login\n", + "from openai import OpenAI\n", + "from peft import PeftModel\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import r2_score, mean_squared_error\n", + "from tqdm import tqdm\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "from catboost import CatBoostRegressor" + ] + }, + { + "cell_type": "code", + 
# --- Colab environment setup -------------------------------------------------
# Mount Google Drive so the persisted ChromaDB store and the trained CatBoost
# model can be read from /content/drive.
from google.colab import drive
drive.mount("/content/drive")

# Credentials come from Colab's secret store (userdata), never hard-coded.
openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(api_key=openai_api_key)

hf_token = userdata.get("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

# Configuration
HF_USER = "qshaikh"

# Only the test split is loaded; the ensemble below is built entirely from it
# (it is re-partitioned 80/20 further down).
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset = load_dataset(DATASET_NAME)
test = dataset["test"]


def description(item):
    """Strip the prompt scaffolding from an item's text, leaving just the
    product description, prefixed with "passage: " as expected by the
    e5 embedding model family."""
    text = item["text"].replace(
        "How much does this cost to the nearest dollar?\n\n", ""
    )
    text = text.split("\n\nPrice is $")[0]
    return f"passage: {text}"


# --- Vector store ------------------------------------------------------------
CHROMA_PATH = "/content/drive/MyDrive/chroma"
COLLECTION_NAME = "price_items"

print(f"Attempting to load ChromaDB from: {CHROMA_PATH}")

client = chromadb.PersistentClient(path=CHROMA_PATH)
# NOTE(review): get_or_create_collection silently creates an EMPTY collection
# if CHROMA_PATH is wrong, and the RAG lookups below would then return no
# neighbours — get_collection() would fail loudly instead. Confirm intent.
collection = client.get_or_create_collection(name=COLLECTION_NAME)

print(f"Successfully loaded ChromaDB collection '{COLLECTION_NAME}'.")

# e5-small-v2 is the same embedding model the Chroma store and the CatBoost
# model were built with — presumably; verify against how the store was made.
embedding_model = SentenceTransformer("intfloat/e5-small-v2", device="cuda")

# --- Fine-tuned LLaMA price model -------------------------------------------
BASE_MODEL = "meta-llama/Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"  # pin the exact adapter commit

# 4-bit NF4 quantisation with double quantisation so the 8B model fits on a T4.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA ships with no pad token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=quant_config, device_map="auto"
)

# Attach the LoRA adapter from the fine-tuning run on top of the base weights.
fine_tuned_model = PeftModel.from_pretrained(
    base_model, FINETUNED_MODEL, revision=REVISION
)

fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
# --- CatBoost regressor trained on e5 embeddings -----------------------------
catboost_model_path = "/content/drive/MyDrive/catboost_model.pkl"
catboost_model = joblib.load(catboost_model_path)
print(f"Successfully loaded CatBoost model from {catboost_model_path}")


def extract_tagged_price(output: str) -> float:
    """Pull the first number following "Price is $" out of a model completion.

    Returns 0.0 when the tag is missing or no number can be parsed.
    """
    try:
        contents = output.split("Price is $")[1].replace(",", "")
        # FIX: the original pattern [-+]?\d*\.\d+|\d+ applied the optional
        # sign only to the decimal alternative (alternation binds loosest);
        # group the alternation so signed integers are matched too.
        match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", contents)
        return float(match.group()) if match else 0.0
    except Exception:
        # Covers a missing "Price is $" tag (IndexError) or any other
        # malformed completion — best-effort parse, never raise.
        return 0.0


def ft_llama_price(description: str) -> float:
    """Price estimate from the fine-tuned LLaMA: generate up to 5 tokens
    after "Price is $" and parse the number out of the decoded output."""
    prompt = (
        f"How much does this cost to the nearest dollar?\n\n{description}\n\nPrice is $"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = fine_tuned_model.generate(
        **inputs, max_new_tokens=5, num_return_sequences=1
    )

    result = tokenizer.decode(outputs[0])
    return extract_tagged_price(result)


def catboost_price(description: str) -> float:
    """Price estimate from the CatBoost model over a normalised e5 embedding,
    clamped at zero (prices cannot be negative) and rounded to cents."""
    vector = embedding_model.encode([description], normalize_embeddings=True)[0]
    pred = catboost_model.predict([vector])[0]
    return round(float(max(0, pred)), 2)


def gpt4o_price(item) -> float:
    """RAG price estimate: retrieve the 5 most similar priced products from
    ChromaDB and ask gpt-4o-mini for a number given that context.

    Returns 0.0 when the reply contains no parseable number (the original
    crashed with AttributeError in that case — see FIX at the bottom).
    """

    def get_embedding(text):
        # Returns a (1, dim) array; Chroma wants a list of query embeddings.
        return embedding_model.encode([text], normalize_embeddings=True)

    def find_similars(text):
        # Nearest-neighbour lookup in the persisted price_items collection.
        results = collection.query(
            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5
        )
        docs = results["documents"][0]
        prices = [m["price"] for m in results["metadatas"][0]]
        return docs, prices

    def format_context(similars, prices):
        # Few-shot style context block of neighbour product -> price pairs.
        context = (
            "To provide some context, here are similar products and their prices:\n\n"
        )
        for sim, price in zip(similars, prices):
            context += f"Product:\n{sim}\nPrice is ${price:.2f}\n\n"
        return context

    def build_messages(description, similars, prices):
        system_message = (
            "You are a pricing expert. "
            "Given a product description and a few similar products with their prices, "
            "estimate the most likely price. "
            "Respond ONLY with a number, no words."
        )
        context = format_context(similars, prices)
        user_prompt = (
            "Estimate the price for the following product:\n\n"
            + description
            + "\n\n"
            + context
        )
        # Seeding the assistant turn with "Price is $" nudges a bare number.
        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": "Price is $"},
        ]

    # description here is the module-level helper, not build_messages' param.
    docs, prices = find_similars(description(item))
    messages = build_messages(description(item), docs, prices)
    response = openai.chat.completions.create(
        model="gpt-4o-mini", messages=messages, seed=42, max_tokens=5
    )
    reply = response.choices[0].message.content
    # FIX: re.search returns None when the reply holds no digits; the original
    # called .group() on it unconditionally — "... or 0" never guarded that
    # (AttributeError fires before the `or` is evaluated). Guard explicitly.
    match = re.search(
        r"[-+]?(?:\d*\.\d+|\d+)", reply.replace("$", "").replace(",", "")
    )
    return float(match.group()) if match else 0.0
# --- Build the stacking training set -----------------------------------------
# NOTE(review): the ensemble is trained AND evaluated on re-splits of the
# *test* split of pricer-data (80/20, then capped at 250/50 items). That is
# only sound if the three base models were fitted on the train split —
# confirm against how they were trained.
print("Splitting entire dataset...")
np.random.seed(42)  # make the shuffle (and thus the split) reproducible
all_indices = list(range(len(test)))
np.random.shuffle(all_indices)

train_split_size = int(0.8 * len(all_indices))
train_indices = all_indices[:train_split_size]  # 80%
test_indices = all_indices[train_split_size:]  # 20%

# Cap the workload: every item costs one LLaMA generation plus one OpenAI call.
train_indices = train_indices[:250]
test_indices = test_indices[:50]

ft_llama_preds_train = []
gpt4omini_preds_train = []
catboost_preds_train = []
true_prices_train = []

# Collect the three base-model predictions per training item (slow: GPU
# inference + network round-trips inside the loop).
for i in tqdm(train_indices):
    item = test[i]
    text = description(item)
    true_prices_train.append(item["price"])
    ft_llama_preds_train.append(ft_llama_price(text))
    gpt4omini_preds_train.append(gpt4o_price(item))
    catboost_preds_train.append(catboost_price(text))

print("True Prices:", true_prices_train)
print("FT-LLaMA Predictions:", ft_llama_preds_train)
print("GPT-4o-mini Predictions:", gpt4omini_preds_train)
print("CatBoost Predictions:", catboost_preds_train)

# Derived features: per-item max and mean over the three base predictions.
maxes_train = [
    max(a, b, c)
    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)
]
means_train = [
    np.mean([a, b, c])
    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)
]

X_train = pd.DataFrame(
    {
        "FT_LLaMA": ft_llama_preds_train,
        "GPT4oMini": gpt4omini_preds_train,
        "CatBoost": catboost_preds_train,
        "Max": maxes_train,
        "Mean": means_train,
    }
)

y_train = pd.Series(true_prices_train)

# --- Fit the linear stacking model -------------------------------------------
# NOTE: the seed here is a no-op — LinearRegression's least-squares fit is
# deterministic — but it is kept to preserve the original notebook exactly.
np.random.seed(42)
lr = LinearRegression()
lr.fit(X_train, y_train)

# Show how the ensemble weights each base model / derived feature.
feature_columns = X_train.columns.tolist()
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

# --- Collect held-out predictions --------------------------------------------
ft_llama_preds_test = []
gpt4omini_preds_test = []
catboost_preds_test = []
true_prices_test = []

print("Processing TEST data (50 items)...")
for i in tqdm(test_indices):
    item = test[i]
    text = description(item)
    true_prices_test.append(item["price"])
    ft_llama_preds_test.append(ft_llama_price(text))
    gpt4omini_preds_test.append(gpt4o_price(item))
    catboost_preds_test.append(catboost_price(text))

maxes_test = [
    max(a, b, c)
    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)
]
means_test = [
    np.mean([a, b, c])
    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)
]

# Same feature layout and column order as X_train — lr.predict depends on it.
X_test = pd.DataFrame(
    {
        "FT_LLaMA": ft_llama_preds_test,
        "GPT4oMini": gpt4omini_preds_test,
        "CatBoost": catboost_preds_test,
        "Max": maxes_test,
        "Mean": means_test,
    }
)

y_test = pd.Series(true_prices_test)
# --- Evaluate the stacked ensemble on the held-out 50 items ------------------
print("Evaluating model...")
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² score: {r2:.4f}")

# mean_squared_error returns MSE; take the square root manually for RMSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

# NOTE(review): MAPE divides by the true price — any zero-priced item in
# y_test yields inf/NaN here. Presumably the dataset has no zero prices;
# confirm, or mask zeros before dividing.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"MAPE: {mape:.2f}%")