From 651e3e6c8ea829aa19ffe33d0de38b763ad4f9a8 Mon Sep 17 00:00:00 2001 From: aashahid Date: Wed, 29 Oct 2025 17:49:41 +0500 Subject: [PATCH] Add Week 8 submission for muhammad_qasim_sheikh --- .../Week 8/Ensemble_Model.ipynb | 538 ++++++++++++++++++ 1 file changed, 538 insertions(+) create mode 100644 community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb diff --git a/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb b/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb new file mode 100644 index 0000000..1269895 --- /dev/null +++ b/community-contributions/muhammad_qasim_sheikh/Week 8/Ensemble_Model.ipynb @@ -0,0 +1,538 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "mzYB4XYQeWRQ", + "metadata": { + "id": "mzYB4XYQeWRQ" + }, + "outputs": [], + "source": [ + "!pip install tqdm huggingface_hub numpy sentence-transformers datasets chromadb catboost peft torch bitsandbytes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f", + "metadata": { + "id": "b3caecd1-8712-4acd-80b5-e8059c16f43f" + }, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import zipfile\n", + "import chromadb\n", + "import joblib\n", + "import numpy as np\n", + "import pandas as pd\n", + "import requests\n", + "import torch\n", + "from datasets import load_dataset\n", + "from google.colab import userdata\n", + "from huggingface_hub import HfApi, hf_hub_download, login\n", + "from openai import OpenAI\n", + "from peft import PeftModel\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import r2_score, mean_squared_error\n", + "from tqdm import tqdm\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "from catboost import CatBoostRegressor" + ] + }, + { + "cell_type": "code", + 
# --- Colab environment setup -------------------------------------------------
# Mount Google Drive so the persisted ChromaDB store and the trained CatBoost
# model can be read from /content/drive.
from google.colab import drive
drive.mount("/content/drive")

# Credentials come from Colab's secret store (userdata), never hard-coded.
openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(api_key=openai_api_key)

hf_token = userdata.get("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

# Configuration
HF_USER = "qshaikh"

# Only the test split is loaded; the ensemble below is built entirely from it
# (it is re-partitioned 80/20 further down).
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset = load_dataset(DATASET_NAME)
test = dataset["test"]


def description(item):
    """Strip the prompt scaffolding from an item's text, leaving just the
    product description, prefixed with "passage: " as expected by the
    e5 embedding model family."""
    text = item["text"].replace(
        "How much does this cost to the nearest dollar?\n\n", ""
    )
    text = text.split("\n\nPrice is $")[0]
    return f"passage: {text}"


# --- Vector store ------------------------------------------------------------
CHROMA_PATH = "/content/drive/MyDrive/chroma"
COLLECTION_NAME = "price_items"

print(f"Attempting to load ChromaDB from: {CHROMA_PATH}")

client = chromadb.PersistentClient(path=CHROMA_PATH)
# NOTE(review): get_or_create_collection silently creates an EMPTY collection
# if CHROMA_PATH is wrong, and the RAG lookups below would then return no
# neighbours — get_collection() would fail loudly instead. Confirm intent.
collection = client.get_or_create_collection(name=COLLECTION_NAME)

print(f"Successfully loaded ChromaDB collection '{COLLECTION_NAME}'.")

# e5-small-v2 is the same embedding model the Chroma store and the CatBoost
# model were built with — presumably; verify against how the store was made.
embedding_model = SentenceTransformer("intfloat/e5-small-v2", device="cuda")

# --- Fine-tuned LLaMA price model -------------------------------------------
BASE_MODEL = "meta-llama/Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"  # pin the exact adapter commit

# 4-bit NF4 quantisation with double quantisation so the 8B model fits on a T4.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA ships with no pad token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=quant_config, device_map="auto"
)

# Attach the LoRA adapter from the fine-tuning run on top of the base weights.
fine_tuned_model = PeftModel.from_pretrained(
    base_model, FINETUNED_MODEL, revision=REVISION
)

fine_tuned_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
# --- CatBoost regressor trained on e5 embeddings -----------------------------
catboost_model_path = "/content/drive/MyDrive/catboost_model.pkl"
catboost_model = joblib.load(catboost_model_path)
print(f"Successfully loaded CatBoost model from {catboost_model_path}")


def extract_tagged_price(output: str) -> float:
    """Pull the first number following "Price is $" out of a model completion.

    Returns 0.0 when the tag is missing or no number can be parsed.
    """
    try:
        contents = output.split("Price is $")[1].replace(",", "")
        # FIX: the original pattern [-+]?\d*\.\d+|\d+ applied the optional
        # sign only to the decimal alternative (alternation binds loosest);
        # group the alternation so signed integers are matched too.
        match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", contents)
        return float(match.group()) if match else 0.0
    except Exception:
        # Covers a missing "Price is $" tag (IndexError) or any other
        # malformed completion — best-effort parse, never raise.
        return 0.0


def ft_llama_price(description: str) -> float:
    """Price estimate from the fine-tuned LLaMA: generate up to 5 tokens
    after "Price is $" and parse the number out of the decoded output."""
    prompt = (
        f"How much does this cost to the nearest dollar?\n\n{description}\n\nPrice is $"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = fine_tuned_model.generate(
        **inputs, max_new_tokens=5, num_return_sequences=1
    )

    result = tokenizer.decode(outputs[0])
    return extract_tagged_price(result)


def catboost_price(description: str) -> float:
    """Price estimate from the CatBoost model over a normalised e5 embedding,
    clamped at zero (prices cannot be negative) and rounded to cents."""
    vector = embedding_model.encode([description], normalize_embeddings=True)[0]
    pred = catboost_model.predict([vector])[0]
    return round(float(max(0, pred)), 2)


def gpt4o_price(item) -> float:
    """RAG price estimate: retrieve the 5 most similar priced products from
    ChromaDB and ask gpt-4o-mini for a number given that context.

    Returns 0.0 when the reply contains no parseable number (the original
    crashed with AttributeError in that case — see FIX at the bottom).
    """

    def get_embedding(text):
        # Returns a (1, dim) array; Chroma wants a list of query embeddings.
        return embedding_model.encode([text], normalize_embeddings=True)

    def find_similars(text):
        # Nearest-neighbour lookup in the persisted price_items collection.
        results = collection.query(
            query_embeddings=get_embedding(text).astype(float).tolist(), n_results=5
        )
        docs = results["documents"][0]
        prices = [m["price"] for m in results["metadatas"][0]]
        return docs, prices

    def format_context(similars, prices):
        # Few-shot style context block of neighbour product -> price pairs.
        context = (
            "To provide some context, here are similar products and their prices:\n\n"
        )
        for sim, price in zip(similars, prices):
            context += f"Product:\n{sim}\nPrice is ${price:.2f}\n\n"
        return context

    def build_messages(description, similars, prices):
        system_message = (
            "You are a pricing expert. "
            "Given a product description and a few similar products with their prices, "
            "estimate the most likely price. "
            "Respond ONLY with a number, no words."
        )
        context = format_context(similars, prices)
        user_prompt = (
            "Estimate the price for the following product:\n\n"
            + description
            + "\n\n"
            + context
        )
        # Seeding the assistant turn with "Price is $" nudges a bare number.
        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": "Price is $"},
        ]

    # description here is the module-level helper, not build_messages' param.
    docs, prices = find_similars(description(item))
    messages = build_messages(description(item), docs, prices)
    response = openai.chat.completions.create(
        model="gpt-4o-mini", messages=messages, seed=42, max_tokens=5
    )
    reply = response.choices[0].message.content
    # FIX: re.search returns None when the reply holds no digits; the original
    # called .group() on it unconditionally — "... or 0" never guarded that
    # (AttributeError fires before the `or` is evaluated). Guard explicitly.
    match = re.search(
        r"[-+]?(?:\d*\.\d+|\d+)", reply.replace("$", "").replace(",", "")
    )
    return float(match.group()) if match else 0.0
# --- Build the stacking training set -----------------------------------------
# NOTE(review): the ensemble is trained AND evaluated on re-splits of the
# *test* split of pricer-data (80/20, then capped at 250/50 items). That is
# only sound if the three base models were fitted on the train split —
# confirm against how they were trained.
print("Splitting entire dataset...")
np.random.seed(42)  # make the shuffle (and thus the split) reproducible
all_indices = list(range(len(test)))
np.random.shuffle(all_indices)

train_split_size = int(0.8 * len(all_indices))
train_indices = all_indices[:train_split_size]  # 80%
test_indices = all_indices[train_split_size:]  # 20%

# Cap the workload: every item costs one LLaMA generation plus one OpenAI call.
train_indices = train_indices[:250]
test_indices = test_indices[:50]

ft_llama_preds_train = []
gpt4omini_preds_train = []
catboost_preds_train = []
true_prices_train = []

# Collect the three base-model predictions per training item (slow: GPU
# inference + network round-trips inside the loop).
for i in tqdm(train_indices):
    item = test[i]
    text = description(item)
    true_prices_train.append(item["price"])
    ft_llama_preds_train.append(ft_llama_price(text))
    gpt4omini_preds_train.append(gpt4o_price(item))
    catboost_preds_train.append(catboost_price(text))

print("True Prices:", true_prices_train)
print("FT-LLaMA Predictions:", ft_llama_preds_train)
print("GPT-4o-mini Predictions:", gpt4omini_preds_train)
print("CatBoost Predictions:", catboost_preds_train)

# Derived features: per-item max and mean over the three base predictions.
maxes_train = [
    max(a, b, c)
    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)
]
means_train = [
    np.mean([a, b, c])
    for a, b, c in zip(ft_llama_preds_train, gpt4omini_preds_train, catboost_preds_train)
]

X_train = pd.DataFrame(
    {
        "FT_LLaMA": ft_llama_preds_train,
        "GPT4oMini": gpt4omini_preds_train,
        "CatBoost": catboost_preds_train,
        "Max": maxes_train,
        "Mean": means_train,
    }
)

y_train = pd.Series(true_prices_train)

# --- Fit the linear stacking model -------------------------------------------
# NOTE: the seed here is a no-op — LinearRegression's least-squares fit is
# deterministic — but it is kept to preserve the original notebook exactly.
np.random.seed(42)
lr = LinearRegression()
lr.fit(X_train, y_train)

# Show how the ensemble weights each base model / derived feature.
feature_columns = X_train.columns.tolist()
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

# --- Collect held-out predictions --------------------------------------------
ft_llama_preds_test = []
gpt4omini_preds_test = []
catboost_preds_test = []
true_prices_test = []

print("Processing TEST data (50 items)...")
for i in tqdm(test_indices):
    item = test[i]
    text = description(item)
    true_prices_test.append(item["price"])
    ft_llama_preds_test.append(ft_llama_price(text))
    gpt4omini_preds_test.append(gpt4o_price(item))
    catboost_preds_test.append(catboost_price(text))

maxes_test = [
    max(a, b, c)
    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)
]
means_test = [
    np.mean([a, b, c])
    for a, b, c in zip(ft_llama_preds_test, gpt4omini_preds_test, catboost_preds_test)
]

# Same feature layout and column order as X_train — lr.predict depends on it.
X_test = pd.DataFrame(
    {
        "FT_LLaMA": ft_llama_preds_test,
        "GPT4oMini": gpt4omini_preds_test,
        "CatBoost": catboost_preds_test,
        "Max": maxes_test,
        "Mean": means_test,
    }
)

y_test = pd.Series(true_prices_test)
# --- Evaluate the stacked ensemble on the held-out 50 items ------------------
print("Evaluating model...")
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² score: {r2:.4f}")

# mean_squared_error returns MSE; take the square root manually for RMSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

# NOTE(review): MAPE divides by the true price — any zero-priced item in
# y_test yields inf/NaN here. Presumably the dataset has no zero prices;
# confirm, or mask zeros before dividing.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"MAPE: {mape:.2f}%")