{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbcdfea8-7241-46d7-a771-c0381a3e7063",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import re\n",
    "import math\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import random\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "import numpy as np\n",
    "import pickle\n",
    "from openai import OpenAI\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from datasets import load_dataset\n",
    "import chromadb\n",
    "from items import Item\n",
    "from testing import Tester\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression, ElasticNet\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "import joblib\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CONSTANTS\n",
    "\n",
    "# Question text; description() further down splits item prompts on the tail of this same phrase.\n",
    "QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n",
    "# Path handed to the persistent Chroma client below.\n",
    "DB = \"products_vectorstore\"\n",
    "# environment\n",
    "\n",
    "# Load variables from .env, overriding values already present in the process environment.\n",
    "load_dotenv(override=True)\n",
    "# If a variable is missing, the literal placeholder string is stored and passed on (e.g. to login below).\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
    "\n",
    "hf_token = os.environ['HF_TOKEN']\n",
    "# Authenticate with the Hugging Face Hub; add_to_git_credential=True also registers the token with git.\n",
    "login(hf_token, add_to_git_credential=True)\n",
    "\n",
    "# NOTE(review): Item is already imported in the imports cell; presumably repeated here so the\n",
    "# class is in scope before unpickling - confirm.\n",
    "from items import Item\n",
    "\n",
    "# pickle.load can execute arbitrary code: only load a test.pkl that you produced yourself.\n",
    "with open('test.pkl', 'rb') as file:\n",
    "    test = pickle.load(file)\n",
    "\n",
    "# Open (or create) the 'products' collection and fetch its embeddings, documents and metadata.\n",
    "client = chromadb.PersistentClient(path=DB)\n",
    "collection = client.get_or_create_collection('products')\n",
    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
    "# Inputs for the CatBoost cell: embeddings as a numpy array, plus the 'price' field of each metadata entry.\n",
    "vectors = np.array(result['embeddings'])\n",
    "documents = result['documents']\n",
    "# NOTE: the name 'prices' is re-bound to a different list in the prediction-collection cell further down.\n",
    "prices = [metadata['price'] for metadata in result['metadatas']]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949",
   "metadata": {},
   "source": [
    "# CatBoost GBT\n",
    "\n",
    "We will now train a CatBoost gradient-boosted tree regressor in place of the Random Forest model.\n",
    "\n",
    "Can you spot the difference from what we did in Week 6? In Week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d25befe",
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostRegressor\n",
    "import numpy as np\n",
    "\n",
    "# CatBoost hyperparameters, gathered in one place so they are easy to tune\n",
    "catboost_params = {\n",
    "    'iterations': 1000,\n",
    "    'learning_rate': 0.03,\n",
    "    'depth': 6,\n",
    "    'loss_function': 'RMSE',\n",
    "    'verbose': 100,  # report training progress every 100 iterations\n",
    "}\n",
    "model = CatBoostRegressor(**catboost_params)\n",
    "\n",
    "# Fit on the Chroma embeddings (features) against the stored prices (targets)\n",
    "model.fit(vectors, prices)\n",
    "\n",
    "# Persist the fitted model; the file keeps the 'random_forest' name even though this is CatBoost\n",
    "joblib.dump(model, 'random_forest_model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a78e1e02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the CatBoost model on the unpickled test items.\n",
    "# NOTE(review): the later Tester.test call passes a function that takes an item (ensemble_pricer);\n",
    "# confirm that Tester also accepts a fitted model object here.\n",
    "Tester.test(model, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from agents.specialist_agent import SpecialistAgent\n",
    "from agents.frontier_agent import FrontierAgent\n",
    "from agents.random_forest_agent import RandomForestAgent\n",
    "from agents.my_specialist_agent import MySpecialistAgent\n",
    "\n",
    "# One instance of each pricing agent; FrontierAgent is the only one that is given the Chroma collection\n",
    "specialist = SpecialistAgent()\n",
    "my_specialist = MySpecialistAgent()\n",
    "frontier = FrontierAgent(collection)\n",
    "random_forest = RandomForestAgent()\n",
    "\n",
    "def description(item):\n",
    "    \"\"\"\n",
    "    Return the product text of an item's prompt: the part after the\n",
    "    'to the nearest dollar?' question and before the 'Price is $' suffix.\n",
    "    Raises IndexError when the question is not present in the prompt.\n",
    "    \"\"\"\n",
    "    after_question = item.prompt.split(\"to the nearest dollar?\\n\\n\")[1]\n",
    "    product_text, *_ = after_question.split(\"\\n\\nPrice is $\")\n",
    "    return product_text\n",
    "def rf(item):\n",
    "    \"\"\"Price an item with the random_forest agent, using only its product description.\"\"\"\n",
    "    product_text = description(item)\n",
    "    return random_forest.price(product_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: ask every agent to price the same free-text product, one estimate per line\n",
    "product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"\n",
    "for agent in (specialist, my_specialist, frontier, random_forest):\n",
    "    print(agent.price(product))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1779b353-e2bb-4fc7-be7c-93057e4d688a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One list of estimates per agent, plus the true prices used as the ensemble target.\n",
    "# NOTE: this re-binds `prices`, which until now held the prices read from Chroma.\n",
    "specialists, my_specialists, frontiers, random_forests, prices = ([] for _ in range(5))\n",
    "\n",
    "for item in tqdm(test[1040:1250]):\n",
    "    text = description(item)\n",
    "    # Agents are queried in a fixed order: specialist, my specialist, frontier, random forest\n",
    "    for bucket, agent in (\n",
    "        (specialists, specialist),\n",
    "        (my_specialists, my_specialist),\n",
    "        (frontiers, frontier),\n",
    "        (random_forests, random_forest),\n",
    "    ):\n",
    "        bucket.append(agent.price(text))\n",
    "    prices.append(item.price)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0bca725-4e34-405b-8d90-41d67086a25d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One tuple of (specialist, my_specialist, frontier, random_forest) estimates per item\n",
    "estimate_rows = list(zip(specialists, my_specialists, frontiers, random_forests))\n",
    "\n",
    "# Min and Max look at specialist, frontier and random forest only; Mean averages all four.\n",
    "# EnsembleAgent.price builds its features the same way, so keep the two in sync.\n",
    "mins = [min(s, f, r) for s, _ms, f, r in estimate_rows]\n",
    "maxes = [max(s, f, r) for s, _ms, f, r in estimate_rows]\n",
    "means = [np.mean([s, ms, f, r]) for s, ms, f, r in estimate_rows]\n",
    "\n",
    "feature_data = {\n",
    "    'Specialist': specialists,\n",
    "    'MySpecialist': my_specialists,\n",
    "    'Frontier': frontiers,\n",
    "    'RandomForest': random_forests,\n",
    "    'Min': mins,\n",
    "    'Max': maxes,\n",
    "    'Mean': means,\n",
    "}\n",
    "X = pd.DataFrame(feature_data)\n",
    "\n",
    "# Target: the true prices collected alongside the agent estimates\n",
    "y = pd.Series(prices)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bdb37a84",
   "metadata": {},
   "source": [
    "# Ensemble GBT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1be5be8a-3e7f-42a2-be54-0c7e380f7cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
    "np.random.seed(42)\n",
    "\n",
    "# Hyperparameters of the gradient-boosted model that combines the agents' estimates\n",
    "gbt_params = dict(\n",
    "    n_estimators=150,\n",
    "    max_depth=3,\n",
    "    random_state=42,\n",
    "    learning_rate=0.05,\n",
    "    subsample=0.8,\n",
    "    min_samples_split=4,\n",
    "    min_samples_leaf=2,\n",
    "    max_features='sqrt',\n",
    ")\n",
    "\n",
    "# The variable keeps the name `lr` although the estimator is a GradientBoostingRegressor\n",
    "lr = GradientBoostingRegressor(**gbt_params)\n",
    "lr.fit(X, y)\n",
    "\n",
    "feature_columns = X.columns.tolist()\n",
    "\n",
    "print(\"Feature importances:\")\n",
    "for column_name, weight in zip(feature_columns, lr.feature_importances_):\n",
    "    print(f\"{column_name}: {weight:.4f}\")\n",
    "\n",
    "# Save the fitted model under the filename that EnsembleAgent loads\n",
    "joblib.dump(lr, 'ensemble_model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e762441a-9470-4dd7-8a8f-ec0430e908c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from agents.ensemble_agent import EnsembleAgent\n",
    "# Build the ensemble agent, handing it the same Chroma collection that FrontierAgent received above\n",
    "ensemble = EnsembleAgent(collection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a29f03c-8010-43b7-ae7d-1bc85ca6e8e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Price the sample product defined earlier with the ensemble agent\n",
    "ensemble.price(product)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def ensemble_pricer(item):\n",
    "    \"\"\"Ensemble estimate for an item's description, floored at zero.\"\"\"\n",
    "    estimate = ensemble.price(description(item))\n",
    "    return max(0, estimate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8397b1ef-2ea3-4af8-bb34-36594e0600cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the ensemble pricing function on the test items via Tester.test\n",
    "Tester.test(ensemble_pricer, test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29c1bcdd",
   "metadata": {},
   "source": [
    "# More changes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16c3f35f",
   "metadata": {},
   "source": [
    "## Added my_specialist_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8f0334",
   "metadata": {},
   "outputs": [],
   "source": [
    "import modal\n",
    "from agents.agent import Agent\n",
    "\n",
    "\n",
    "class MySpecialistAgent(Agent):\n",
    "    \"\"\"\n",
    "    Agent that prices items with a custom fine-tuned model served remotely on Modal\n",
    "    \"\"\"\n",
    "\n",
    "    # NOTE(review): the name and log text still say 'Specialist Agent' - confirm whether\n",
    "    # this agent should be distinguishable from SpecialistAgent in the logs\n",
    "    name = \"Specialist Agent\"\n",
    "    color = Agent.RED\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"\n",
    "        Look up the deployed Modal class and keep an instance of it for later calls\n",
    "        \"\"\"\n",
    "        self.log(\"Specialist Agent is initializing - connecting to modal\")\n",
    "        # Resolve the 'Pricer' class from the 'my_pricer-service' Modal app\n",
    "        pricer_cls = modal.Cls.from_name(\"my_pricer-service\", \"Pricer\")\n",
    "        self.pricer = pricer_cls()\n",
    "        self.log(\"Specialist Agent is ready\")\n",
    "\n",
    "    def price(self, description: str) -> float:\n",
    "        \"\"\"\n",
    "        Ask the remote model for a price estimate\n",
    "        :param description: the description of a product\n",
    "        :return: the value returned by the remote call\n",
    "        \"\"\"\n",
    "        self.log(\"Specialist Agent is calling remote fine-tuned model\")\n",
    "        estimate = self.pricer.price.remote(description)\n",
    "        self.log(f\"Specialist Agent completed - predicting ${estimate:.2f}\")\n",
    "        return estimate\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "161c5e77",
   "metadata": {},
   "source": [
    "## Modified ensemble_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44398889",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import joblib\n",
    "import numpy as np\n",
    "from agents.agent import Agent\n",
    "from agents.specialist_agent import SpecialistAgent\n",
    "from agents.frontier_agent import FrontierAgent\n",
    "from agents.random_forest_agent import RandomForestAgent\n",
    "from agents.my_specialist_agent import MySpecialistAgent\n",
    "\n",
    "# NOTE(review): module-level instance; EnsembleAgent below creates its own SpecialistAgent in __init__,\n",
    "# so this looks redundant - confirm before removing. In this notebook it re-binds the earlier `specialist`.\n",
    "specialist = SpecialistAgent()\n",
    "\n",
    "class EnsembleAgent(Agent):\n",
    "    \"\"\"\n",
    "    Agent that combines the estimates of the other pricing agents using a saved model\n",
    "    \"\"\"\n",
    "\n",
    "    name = \"Ensemble Agent\"\n",
    "    color = Agent.YELLOW\n",
    "\n",
    "    def __init__(self, collection):\n",
    "        \"\"\"\n",
    "        Instantiate each underlying agent and load the fitted ensemble model from disk\n",
    "        :param collection: passed through to FrontierAgent\n",
    "        \"\"\"\n",
    "        self.log(\"Initializing Ensemble Agent\")\n",
    "        self.specialist = SpecialistAgent()\n",
    "        self.my_specialist = MySpecialistAgent()\n",
    "        self.frontier = FrontierAgent(collection)\n",
    "        # Despite the class name, the notebook saves a CatBoost model for this agent\n",
    "        self.random_forest = RandomForestAgent()\n",
    "        # The notebook saves a GradientBoostingRegressor under this filename\n",
    "        self.model = joblib.load('ensemble_model.pkl')\n",
    "        self.log(\"Ensemble Agent is ready\")\n",
    "\n",
    "    def price(self, description: str) -> float:\n",
    "        \"\"\"\n",
    "        Ask every underlying agent for an estimate, assemble the feature row\n",
    "        the saved model was fitted on, and return its prediction floored at zero\n",
    "        :param description: the description of a product\n",
    "        :return: an estimate of its price\n",
    "        \"\"\"\n",
    "        self.log(\"Running Ensemble Agent - collaborating with specialist, frontier and random forest agents\")\n",
    "        specialist_est = self.specialist.price(description)\n",
    "        my_specialist_est = self.my_specialist.price(description)\n",
    "        frontier_est = self.frontier.price(description)\n",
    "        forest_est = self.random_forest.price(description)\n",
    "\n",
    "        # Min/Max cover specialist, frontier and random forest only (MySpecialist is not included),\n",
    "        # matching how the training frame is built in this notebook; Mean averages all four\n",
    "        core_estimates = (specialist_est, frontier_est, forest_est)\n",
    "        features = {\n",
    "            'Specialist': specialist_est,\n",
    "            'MySpecialist': my_specialist_est,\n",
    "            'Frontier': frontier_est,\n",
    "            'RandomForest': forest_est,\n",
    "            'Min': min(core_estimates),\n",
    "            'Max': max(core_estimates),\n",
    "            'Mean': np.mean([specialist_est, my_specialist_est, frontier_est, forest_est]),\n",
    "        }\n",
    "        # Single-row frame, columns in the same order as the training frame\n",
    "        X = pd.DataFrame({column: [value] for column, value in features.items()})\n",
    "        estimate = max(0, self.model.predict(X)[0])\n",
    "        self.log(f\"Ensemble Agent complete - returning ${estimate:.2f}\")\n",
    "        return estimate"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}