{ "cells": [
{ "cell_type": "markdown", "id": "40d49349-faaa-420c-9b65-0bdc9edfabce", "metadata": {}, "source": [
 "# The Price is Right\n",
 "\n",
 "## Finishing off with Random Forests, XGBoost & Ensemble" ] },
{ "cell_type": "code", "execution_count": null, "id": "6cd8b15e-f88a-470d-a9a6-b6370effaff9", "metadata": {}, "outputs": [], "source": [
 "# %pip (unlike !pip) installs into the environment of the running kernel\n",
 "%pip install xgboost" ] },
{ "cell_type": "code", "execution_count": null, "id": "fbcdfea8-7241-46d7-a771-c0381a3e7063", "metadata": {}, "outputs": [], "source": [
 "# imports\n",
 "\n",
 "import os\n",
 "import re\n",
 "import math\n",
 "import json\n",
 "from tqdm import tqdm\n",
 "import random\n",
 "from dotenv import load_dotenv\n",
 "from huggingface_hub import login\n",
 "import numpy as np\n",
 "import pickle\n",
 "from openai import OpenAI\n",
 "from sentence_transformers import SentenceTransformer\n",
 "from datasets import load_dataset\n",
 "import chromadb\n",
 "from items import Item\n",
 "from testing import Tester\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "from sklearn.ensemble import RandomForestRegressor\n",
 "from sklearn.linear_model import LinearRegression\n",
 "from sklearn.metrics import mean_squared_error, r2_score\n",
 "import joblib\n",
 "import xgboost as xgb" ] },
{ "cell_type": "code", "execution_count": null, "id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca", "metadata": {}, "outputs": [], "source": [
 "# CONSTANTS\n",
 "\n",
 "# Path handed to chromadb.PersistentClient below\n",
 "DB = \"products_vectorstore\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "98666e73-938e-469d-8987-e6e55ba5e034", "metadata": {}, "outputs": [], "source": [
 "# environment\n",
 "\n",
 "# Values from .env override anything already set in the process environment.\n",
 "# The fallback strings are placeholders only - they are not working credentials.\n",
 "load_dotenv(override=True)\n",
 "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
 "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')" ] },
{ "cell_type": "code", "execution_count": null, "id": "dc696493-0b6f-48aa-9fa8-b1ae0ecaf3cd", "metadata": {}, "outputs": [], "source": [
 "# Load in the test pickle file:\n",
 "# NOTE: pickle.load can execute arbitrary code - only load a test.pkl that you produced yourself\n",
 "\n",
 "with open('test.pkl', 'rb') as file:\n",
 "    test = pickle.load(file)\n",
 "\n",
 "# training data is already in Chroma" ] },
{ "cell_type": "code", "execution_count": null, "id": "d26a1104-cd11-4361-ab25-85fb576e0582", "metadata": {}, "outputs": [], "source": [
 "client = chromadb.PersistentClient(path=DB)\n",
 "collection = client.get_or_create_collection('products')\n",
 "\n",
 "# get_or_create_collection creates an empty collection when none exists yet,\n",
 "# so fail here with a clear message instead of later inside model training\n",
 "if collection.count() == 0:\n",
 "    raise RuntimeError(f\"Chroma collection 'products' at '{DB}' is empty - build the vectorstore first\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "e00b82a9-a8dc-46f1-8ea9-2f07cbc8e60d", "metadata": {}, "outputs": [], "source": [
 "# Pull every stored embedding together with its document text and its price metadata\n",
 "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
 "vectors = np.array(result['embeddings'])\n",
 "documents = result['documents']\n",
 "prices = [metadata['price'] for metadata in result['metadatas']]\n",
 "\n",
 "assert len(vectors) == len(prices), 'every embedding needs exactly one price label'" ] },
{ "cell_type": "markdown", "id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949", "metadata": {}, "source": [
 "# Random Forest\n",
 "\n",
 "We will now train a Random Forest model.\n",
 "\n",
 "Can you spot the difference from what we did in Week 6? In Week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model."
] },
{ "cell_type": "code", "execution_count": null, "id": "48894777-101f-4fe5-998c-47079407f340", "metadata": {}, "outputs": [], "source": [
 "# Training takes about an hour on an M1 Mac, so reuse the saved model when one exists.\n",
 "# random_state is fixed, so a cached model matches a fresh fit as long as the data is unchanged.\n",
 "# Set FORCE_RETRAIN = True (or delete the file) to train from scratch.\n",
 "\n",
 "RF_MODEL_PATH = 'random_forest_model.pkl'\n",
 "FORCE_RETRAIN = False\n",
 "\n",
 "if FORCE_RETRAIN or not os.path.exists(RF_MODEL_PATH):\n",
 "    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)\n",
 "    rf_model.fit(vectors, prices)\n",
 "else:\n",
 "    rf_model = joblib.load(RF_MODEL_PATH)" ] },
{ "cell_type": "markdown", "id": "90a07dde-6f57-4488-8d08-e8e5646754e7", "metadata": {}, "source": [
 "`n_jobs=-1` means the forest is trained using every available CPU core." ] },
{ "cell_type": "code", "execution_count": null, "id": "62eb7ddf-e1da-481e-84c6-1256547566bd", "metadata": {}, "outputs": [], "source": [
 "# Save the model to a file\n",
 "\n",
 "joblib.dump(rf_model, RF_MODEL_PATH)" ] },
{ "cell_type": "code", "execution_count": null, "id": "d281dc5e-761e-4a5e-86b3-29d9c0a33d4a", "metadata": {}, "outputs": [], "source": [
 "# Load it back in again\n",
 "\n",
 "rf_model = joblib.load(RF_MODEL_PATH)" ] },
{ "cell_type": "markdown", "id": "23760bf5-fe52-473d-bfbe-def6b7a67a77", "metadata": {}, "source": [
 "# XGBoost Model" ] },
{ "cell_type": "code", "execution_count": null, "id": "c65dcfb9-d2c1-431c-843d-c5908bc39e3f", "metadata": {}, "outputs": [], "source": [
 "XGB_MODEL_PATH = 'xg_boost_model.pkl'\n",
 "\n",
 "# Wrap the Chroma embeddings and their prices in XGBoost's DMatrix container\n",
 "train_dmatrix = xgb.DMatrix(vectors, label=prices)\n",
 "\n",
 "params = {\n",
 "    \"objective\": \"reg:squarederror\",\n",
 "    \"max_depth\": 6,\n",
 "    \"learning_rate\": 0.1,\n",
 "    \"nthread\": -1,\n",
 "    \"verbosity\": 1,\n",
 "    \"subsample\": 0.8,\n",
 "}\n",
 "\n",
 "# Named xgb_model (not just model) so it matches the name used when the file is loaded back\n",
 "xgb_model = xgb.train(params, train_dmatrix, num_boost_round=100)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a6980ca7-fc38-482c-8346-80c435058886", "metadata": {}, "outputs": [], "source": [
 "# Save the trained booster to a file\n",
 "\n",
 "joblib.dump(xgb_model, XGB_MODEL_PATH)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a0605f48-04f8-44a3-8d8c-c7be4cd840b2", "metadata": {}, "outputs": [], "source": [
 "# Load it back in again\n",
 "\n",
 "xgb_model = joblib.load(XGB_MODEL_PATH)" ] },
{ "cell_type": "markdown", "id": "22d10315-2b11-43b0-b042-679a2814dea1", "metadata": {}, "source": [ "# 
Agents" ] },
{ "cell_type": "code", "execution_count": null, "id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9", "metadata": {}, "outputs": [], "source": [
 "from agents.specialist_agent import SpecialistAgent\n",
 "from agents.frontier_agent import FrontierAgent\n",
 "from agents.random_forest_agent import RandomForestAgent\n",
 "from agents.xg_boost_agent import XGBoostAgent" ] },
{ "cell_type": "code", "execution_count": null, "id": "afc39369-b97b-4a90-b17e-b20ef501d3c9", "metadata": {}, "outputs": [], "source": [
 "# One instance per pricing agent; only the frontier agent is handed the Chroma collection\n",
 "specialist = SpecialistAgent()\n",
 "frontier = FrontierAgent(collection)\n",
 "random_forest = RandomForestAgent()\n",
 "xg_boost = XGBoostAgent()" ] },
{ "cell_type": "code", "execution_count": null, "id": "8e2d0d0a-8bb8-4b39-b046-322828c39244", "metadata": {}, "outputs": [], "source": [
 "def description(item):\n",
 "    \"\"\"Return the product description embedded in an item's prompt.\n",
 "\n",
 "    The description is the text after the 'to the nearest dollar?' question\n",
 "    and before the trailing 'Price is $' line.\n",
 "    \"\"\"\n",
 "    after_question = item.prompt.split(\"to the nearest dollar?\\n\\n\")[1]\n",
 "    before_price = after_question.split(\"\\n\\nPrice is $\")[0]\n",
 "    return before_price" ] },
{ "cell_type": "code", "execution_count": null, "id": "bfe0434f-b29e-4cc0-bad9-b07624665727", "metadata": {}, "outputs": [], "source": [
 "def rf(item):\n",
 "    \"\"\"Price an item by passing its description to the random_forest agent.\"\"\"\n",
 "    text = description(item)\n",
 "    return random_forest.price(text)" ] },
{ "cell_type": "code", "execution_count": null, "id": "cdf233ec-264f-4b34-9f2b-27c39692137b", "metadata": { "scrolled": true }, "outputs": [], "source": [
 "# Evaluate the Random Forest pricer on the test set\n",
 "Tester.test(rf, test)" ] },
{ "cell_type": "code", "execution_count": null, "id": "192b94ac-37d0-4569-bc7c-8fc4f92d129b", "metadata": {}, "outputs": [], "source": [
 "def xg_b(item):\n",
 "    \"\"\"Price an item by passing its description to the xg_boost agent.\"\"\"\n",
 "    text = description(item)\n",
 "    return xg_boost.price(text)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a3fa01c2-42d9-4ce7-ae36-1d874a0003c1", "metadata": {}, "outputs": [], "source": [
 "# Quick check on a single item before running the full evaluation\n",
 "xg_b(test[0])" ] },
{ "cell_type": "code", "execution_count": null, "id": "9183aab7-0586-4d43-b212-c40442c7ab34", "metadata": { "scrolled": true }, "outputs": [], "source": [
 "# Evaluate the XGBoost pricer on the test set\n",
 "Tester.test(xg_b, test)" ] },
{ "cell_type": "markdown", "id": "0045825e-2df0-429a-8ebb-2617517a2e75", "metadata": {}, "source": [ "# Moving towards the 
ensemble model" ] },
{ "cell_type": "code", "execution_count": null, "id": "9f759bd2-7a7e-4c1a-80a0-e12470feca89", "metadata": {}, "outputs": [], "source": [
 "product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105", "metadata": {}, "outputs": [], "source": [
 "# Compare what each individual agent predicts for the same product\n",
 "print(specialist.price(product))\n",
 "print(frontier.price(product))\n",
 "print(random_forest.price(product))\n",
 "print(xg_boost.price(product))" ] },
{ "cell_type": "code", "execution_count": null, "id": "1779b353-e2bb-4fc7-be7c-93057e4d688a", "metadata": {}, "outputs": [], "source": [
 "# Slice of the test set used to fit the ensemble weights\n",
 "ENSEMBLE_START, ENSEMBLE_END = 1000, 1250\n",
 "\n",
 "specialists = []\n",
 "frontiers = []\n",
 "random_forests = []\n",
 "xg_boosts = []\n",
 "# Kept separate from `prices`, which holds the training labels read by the\n",
 "# Random Forest and XGBoost training cells - reusing that name would clobber them\n",
 "actual_prices = []\n",
 "\n",
 "for item in tqdm(test[ENSEMBLE_START:ENSEMBLE_END]):\n",
 "    text = description(item)\n",
 "    specialists.append(specialist.price(text))\n",
 "    frontiers.append(frontier.price(text))\n",
 "    random_forests.append(random_forest.price(text))\n",
 "    xg_boosts.append(xg_boost.price(text))\n",
 "    actual_prices.append(item.price)" ] },
{ "cell_type": "code", "execution_count": null, "id": "f0bca725-4e34-405b-8d90-41d67086a25d", "metadata": {}, "outputs": [], "source": [
 "# Lowest and highest of the four predictions for each item become extra features\n",
 "per_item_predictions = list(zip(specialists, frontiers, random_forests, xg_boosts))\n",
 "mins = [min(row) for row in per_item_predictions]\n",
 "maxes = [max(row) for row in per_item_predictions]\n",
 "\n",
 "X = pd.DataFrame({\n",
 "    'Specialist': specialists,\n",
 "    'Frontier': frontiers,\n",
 "    'RandomForest': random_forests,\n",
 "    'XGBoost' : xg_boosts,\n",
 "    'Min': mins,\n",
 "    'Max': maxes,\n",
 "})\n",
 "\n",
 "# Convert y to a Series\n",
 "y = pd.Series(actual_prices)" ] },
{ "cell_type": "code", "execution_count": null, "id": "baac4947-02d8-4d12-82ed-9ace3c0bee39", "metadata": {}, "outputs": [], "source": [
 "# Train a Linear Regression that weights the individual model predictions\n",
 "np.random.seed(42)\n",
 "\n",
 "lr = LinearRegression()\n",
 "lr.fit(X, y)\n",
 "\n",
 "feature_columns = X.columns.tolist()\n",
 "\n",
 "for feature, coef in zip(feature_columns, lr.coef_):\n",
 "    print(f\"{feature}: {coef:.2f}\")\n",
 "print(f\"Intercept={lr.intercept_:.2f}\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "0bdf6e68-28a3-4ed2-b17e-de0ede923d34", "metadata": {}, "outputs": [], "source": [
 "# Save the fitted ensemble weights to a file\n",
 "joblib.dump(lr, 'ensemble_model.pkl')" ] },
{ "cell_type": "code", "execution_count": null, "id": "e762441a-9470-4dd7-8a8f-ec0430e908c7", "metadata": {}, "outputs": [], "source": [
 "# NOTE(review): presumably EnsembleAgent reads ensemble_model.pkl written above - confirm in agents/ensemble_agent.py\n",
 "from agents.ensemble_agent import EnsembleAgent\n",
 "ensemble = EnsembleAgent(collection)" ] },
{ "cell_type": "code", "execution_count": null, "id": "13dbf002-eba6-4c7a-898f-d697f68ca28e", "metadata": {}, "outputs": [], "source": [
 "ensemble.price(product)" ] },
{ "cell_type": "code", "execution_count": null, "id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf", "metadata": {}, "outputs": [], "source": [
 "def ensemble_pricer(item):\n",
 "    \"\"\"Price an item with the ensemble agent, never returning less than 0.\"\"\"\n",
 "    estimate = ensemble.price(description(item))\n",
 "    # a linear model can produce a negative value; clamp it at zero\n",
 "    return max(0, estimate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "0d26c9ff-994b-4799-af51-09d00ddc0c06", "metadata": {}, "outputs": [], "source": [
 "# Evaluate the ensemble pricer on the test set\n",
 "Tester.test(ensemble_pricer, test)" ] }
], "metadata": { 
"kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.12" } }, "nbformat": 4, "nbformat_minor": 5 }