{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbcdfea8-7241-46d7-a771-c0381a3e7063",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import json\n",
    "import math\n",
    "import os\n",
    "import pickle\n",
    "import random\n",
    "import re\n",
    "\n",
    "import chromadb\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from catboost import CatBoostRegressor\n",
    "from datasets import load_dataset\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "from openai import OpenAI\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n",
    "from sklearn.linear_model import ElasticNet, LinearRegression\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from tqdm import tqdm\n",
    "\n",
    "from items import Item\n",
    "from testing import Tester\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CONSTANTS\n",
    "\n",
    "QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n",
    "DB = \"products_vectorstore\"\n",
    "\n",
    "# test[ENSEMBLE_START:ENSEMBLE_END] is the slice of test items used to fit the ensemble model\n",
    "ENSEMBLE_START, ENSEMBLE_END = 1040, 1250\n",
    "\n",
    "# environment\n",
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
    "\n",
    "hf_token = os.environ['HF_TOKEN']\n",
    "login(hf_token, add_to_git_credential=True)\n",
    "\n",
    "# test data - presumably pickled Item objects, so Item (imported in the imports cell) must be in scope\n",
    "# NOTE(review): pickle.load can execute arbitrary code - only load a test.pkl you created yourself\n",
    "with open('test.pkl', 'rb') as file:\n",
    "    test = pickle.load(file)\n",
    "\n",
    "# training data: the embeddings and prices already stored in the Chroma collection\n",
    "client = chromadb.PersistentClient(path=DB)\n",
    "collection = client.get_or_create_collection('products')\n",
    "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
    "vectors = np.array(result['embeddings'])\n",
    "documents = result['documents']\n",
    "prices = [metadata['price'] for metadata in result['metadatas']]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949",
   "metadata": {},
   "source": [
    "# CatBoost GBT\n",
    "\n",
    "We will now train a CatBoost gradient-boosted tree model, which takes the place of the Random Forest model.\n",
    "\n",
    "Can you spot the difference from what we did in Week 6? In Week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d25befe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the model\n",
    "model = CatBoostRegressor(\n",
    "    iterations=1000,\n",
    "    learning_rate=0.03,\n",
    "    depth=6,\n",
    "    loss_function='RMSE',\n",
    "    verbose=100\n",
    ")\n",
    "\n",
    "# `prices` here must be the Chroma training prices loaded in the constants cell\n",
    "model.fit(vectors, prices)\n",
    "\n",
    "# Saved under the random-forest filename: presumably this is the file that\n",
    "# RandomForestAgent loads - TODO confirm against agents/random_forest_agent.py\n",
    "joblib.dump(model, 'random_forest_model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Agent imports stay below the environment cell rather than in the imports cell\n",
    "# NOTE(review): the agent modules may need the environment at import time - confirm before hoisting\n",
    "from agents.specialist_agent import SpecialistAgent\n",
    "from agents.frontier_agent import FrontierAgent\n",
    "from agents.random_forest_agent import RandomForestAgent\n",
    "from agents.my_specialist_agent import MySpecialistAgent\n",
    "\n",
    "specialist = SpecialistAgent()\n",
    "my_specialist = MySpecialistAgent()\n",
    "frontier = FrontierAgent(collection)\n",
    "random_forest = RandomForestAgent()\n",
    "\n",
    "def description(item):\n",
    "    \"\"\"Return the text of item.prompt that sits between the question and the price line.\"\"\"\n",
    "    return item.prompt.split(\"to the nearest dollar?\\n\\n\")[1].split(\"\\n\\nPrice is $\")[0]\n",
    "\n",
    "def rf(item):\n",
    "    \"\"\"Price a single item with the RandomForestAgent, using the description from its prompt.\"\"\"\n",
    "    return random_forest.price(description(item))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a78e1e02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Elsewhere in this notebook Tester.test is given a function that prices one item (ensemble_pricer),\n",
    "# so the trained model is evaluated through the rf wrapper rather than by passing `model` itself\n",
    "Tester.test(rf, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105",
   "metadata": {},
   "outputs": [],
   "source": [
    "product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"\n",
    "print(specialist.price(product))\n",
    "print(my_specialist.price(product))\n",
    "\n",
    "print(frontier.price(product))\n",
    "print(random_forest.price(product))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1779b353-e2bb-4fc7-be7c-93057e4d688a",
   "metadata": {},
   "outputs": [],
   "source": [
    "specialists = []\n",
    "my_specialists = []\n",
    "frontiers = []\n",
    "random_forests = []\n",
    "# deliberately not called `prices`: that name holds the Chroma training prices used to fit CatBoost\n",
    "ensemble_prices = []\n",
    "for item in tqdm(test[ENSEMBLE_START:ENSEMBLE_END]):\n",
    "    text = description(item)\n",
    "    specialists.append(specialist.price(text))\n",
    "    my_specialists.append(my_specialist.price(text))\n",
    "    frontiers.append(frontier.price(text))\n",
    "    random_forests.append(random_forest.price(text))\n",
    "    ensemble_prices.append(item.price)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0bca725-4e34-405b-8d90-41d67086a25d",
   "metadata": {},
   "outputs": [],
   "source": [
    "agent_estimates = list(zip(specialists, my_specialists, frontiers, random_forests))\n",
    "\n",
    "# Min and Max leave MySpecialist out, exactly as EnsembleAgent.price does; the two must stay in step\n",
    "mins = [min(s, f, r) for s, _, f, r in agent_estimates]\n",
    "maxes = [max(s, f, r) for s, _, f, r in agent_estimates]\n",
    "means = [np.mean([s, ms, f, r]) for s, ms, f, r in agent_estimates]\n",
    "\n",
    "X = pd.DataFrame({\n",
    "    'Specialist': specialists,\n",
    "    'MySpecialist': my_specialists,\n",
    "    'Frontier': frontiers,\n",
    "    'RandomForest': random_forests,\n",
    "    'Min': mins,\n",
    "    'Max': maxes,\n",
    "    'Mean': means,\n",
    "})\n",
    "\n",
    "# Convert y to a Series\n",
    "y = pd.Series(ensemble_prices)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bdb37a84",
   "metadata": {},
   "source": [
    "# Ensemble GBT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1be5be8a-3e7f-42a2-be54-0c7e380f7cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "\n",
    "ensemble_gbt = GradientBoostingRegressor(\n",
    "    n_estimators=150,\n",
    "    max_depth=3,\n",
    "    random_state=42,\n",
    "    learning_rate=0.05,\n",
    "    subsample=0.8,\n",
    "    min_samples_split=4,\n",
    "    min_samples_leaf=2,\n",
    "    max_features='sqrt'\n",
    ")\n",
    "\n",
    "ensemble_gbt.fit(X, y)\n",
    "\n",
    "feature_columns = X.columns.tolist()\n",
    "\n",
    "print(\"Feature importances:\")\n",
    "for feature, importance in zip(feature_columns, ensemble_gbt.feature_importances_):\n",
    "    print(f\"{feature}: {importance:.4f}\")\n",
    "\n",
    "# EnsembleAgent loads this file in its __init__\n",
    "joblib.dump(ensemble_gbt, 'ensemble_model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e762441a-9470-4dd7-8a8f-ec0430e908c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from agents.ensemble_agent import EnsembleAgent\n",
    "ensemble = EnsembleAgent(collection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a29f03c-8010-43b7-ae7d-1bc85ca6e8e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "ensemble.price(product)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def ensemble_pricer(item):\n",
    "    \"\"\"Price a single item with the ensemble agent, never returning less than zero.\"\"\"\n",
    "    return max(0,ensemble.price(description(item)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8397b1ef-2ea3-4af8-bb34-36594e0600cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "Tester.test(ensemble_pricer, test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "29c1bcdd",
   "metadata": {},
   "source": [
    "# More changes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16c3f35f",
   "metadata": {},
   "source": [
    "## Added my_specialist_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8f0334",
   "metadata": {},
   "outputs": [],
   "source": [
    "import modal\n",
    "from agents.agent import Agent\n",
    "\n",
    "\n",
    "class MySpecialistAgent(Agent):\n",
    "    \"\"\"\n",
    "    An Agent that runs my own fine-tuned LLM that's running remotely on Modal\n",
    "    \"\"\"\n",
    "\n",
    "    # a distinct name, so log lines are not mistaken for the SpecialistAgent's\n",
    "    name = \"My Specialist Agent\"\n",
    "    color = Agent.RED\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"\n",
    "        Set up this Agent by creating an instance of the modal class\n",
    "        \"\"\"\n",
    "        self.log(\"My Specialist Agent is initializing - connecting to modal\")\n",
    "        # look up the Pricer class of my own Modal service, which serves the custom model\n",
    "        Pricer = modal.Cls.from_name(\"my_pricer-service\", \"Pricer\")\n",
    "        self.pricer = Pricer()\n",
    "        self.log(\"My Specialist Agent is ready\")\n",
    "\n",
    "    def price(self, description: str) -> float:\n",
    "        \"\"\"\n",
    "        Make a remote call to return the estimate of the price of this item\n",
    "        :param description: the description of a product\n",
    "        :return: the value returned by the remote price call\n",
    "        \"\"\"\n",
    "        self.log(\"My Specialist Agent is calling remote fine-tuned model\")\n",
    "        result = self.pricer.price.remote(description)\n",
    "        self.log(f\"My Specialist Agent completed - predicting ${result:.2f}\")\n",
    "        return result\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "161c5e77",
   "metadata": {},
   "source": [
    "## Modified ensemble_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44398889",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import joblib\n",
    "import numpy as np\n",
    "from agents.agent import Agent\n",
    "from agents.specialist_agent import SpecialistAgent\n",
    "from agents.frontier_agent import FrontierAgent\n",
    "from agents.random_forest_agent import RandomForestAgent\n",
    "from agents.my_specialist_agent import MySpecialistAgent\n",
    "\n",
    "\n",
    "class EnsembleAgent(Agent):\n",
    "    \"\"\"\n",
    "    An Agent that combines the estimates of the other pricing agents with a saved ensemble model\n",
    "    \"\"\"\n",
    "\n",
    "    name = \"Ensemble Agent\"\n",
    "    color = Agent.YELLOW\n",
    "\n",
    "    def __init__(self, collection):\n",
    "        \"\"\"\n",
    "        Create an instance of Ensemble, by creating each of the models\n",
    "        And loading the saved Ensemble model\n",
    "        :param collection: passed through to the FrontierAgent\n",
    "        \"\"\"\n",
    "        self.log(\"Initializing Ensemble Agent\")\n",
    "        self.specialist = SpecialistAgent()\n",
    "        self.my_specialist = MySpecialistAgent()  # added my specialist\n",
    "        self.frontier = FrontierAgent(collection)\n",
    "        self.random_forest = RandomForestAgent()  # my model here is a regularized and pruned CatBoost\n",
    "        self.model = joblib.load('ensemble_model.pkl')  # my model is actually a GBT\n",
    "        self.log(\"Ensemble Agent is ready\")\n",
    "\n",
    "    def price(self, description: str) -> float:\n",
    "        \"\"\"\n",
    "        Run this ensemble model\n",
    "        Ask each of the models to price the product\n",
    "        Then use the saved gradient boosting model to return the combined price\n",
    "        :param description: the description of a product\n",
    "        :return: an estimate of its price, never less than zero\n",
    "        \"\"\"\n",
    "        self.log(\"Running Ensemble Agent - collaborating with specialist, frontier and random forest agents\")\n",
    "        specialist_price = self.specialist.price(description)\n",
    "        my_specialist_price = self.my_specialist.price(description)  # added my specialist estimate\n",
    "        frontier_price = self.frontier.price(description)\n",
    "        random_forest_price = self.random_forest.price(description)\n",
    "        # Columns must match the frame the ensemble model was fitted on:\n",
    "        # Min and Max leave MySpecialist out, Mean includes it\n",
    "        X = pd.DataFrame({\n",
    "            'Specialist': [specialist_price],\n",
    "            'MySpecialist': [my_specialist_price],\n",
    "            'Frontier': [frontier_price],\n",
    "            'RandomForest': [random_forest_price],\n",
    "            'Min': [min(specialist_price, frontier_price, random_forest_price)],\n",
    "            'Max': [max(specialist_price, frontier_price, random_forest_price)],\n",
    "            'Mean': [np.mean([specialist_price, my_specialist_price, frontier_price, random_forest_price])],\n",
    "        })\n",
    "        y = max(0, self.model.predict(X)[0])\n",
    "        self.log(f\"Ensemble Agent complete - returning ${y:.2f}\")\n",
    "        return y"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}