From 7e09e1011e9a108296fe88da087c2586953c063c Mon Sep 17 00:00:00 2001 From: sach91 Date: Wed, 29 Oct 2025 22:29:19 +0530 Subject: [PATCH] sach91 bootcamp week6 exercise --- .../sach91-bootcamp/week6-exercise.ipynb | 1167 +++++++++++++++++ 1 file changed, 1167 insertions(+) create mode 100644 community-contributions/sach91-bootcamp/week6-exercise.ipynb diff --git a/community-contributions/sach91-bootcamp/week6-exercise.ipynb b/community-contributions/sach91-bootcamp/week6-exercise.ipynb new file mode 100644 index 0000000..79bdcbb --- /dev/null +++ b/community-contributions/sach91-bootcamp/week6-exercise.ipynb @@ -0,0 +1,1167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "db8736a7-ed94-441c-9556-831fa57b5a10", + "metadata": {}, + "source": [ + "# Trying some approaches inline in the course notebook\n", + "## 1. XGBoost Algorithm\n", + "## 2. Mix of experts (XGBoost, RandomForest, ) - Two variants: (1) Simple average (2) Average of 2 nearest predictions\n", + "## 3. Named Entity Recognition (NER) - Filter text to include only entities in the feature vectors\n", + "## 4. MinMax and Standard Scaler for normalizing word2vec embeddings\n", + "## Alas, None improved the performance from Random Forest. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import math\n", + "import json\n", + "import random\n", + "from dotenv import load_dotenv\n", + "from huggingface_hub import login\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pickle\n", + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "933b6e75-3661-4f30-b0b5-c28d04e3748e", + "metadata": {}, + "outputs": [], + "source": [ + "# More imports for our traditional machine learning\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42cf33b7-7abd-44ba-9780-c156b70473b5", + "metadata": {}, + "outputs": [], + "source": [ + "# NLP related imports\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from gensim.models import Word2Vec\n", + "from gensim.utils import simple_preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1ac3ec0-183c-4a12-920b-b06397f86815", + "metadata": {}, + "outputs": [], + "source": [ + "# Finally, more imports for more advanced machine learning\n", + "from sklearn.svm import LinearSVR\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from polire import IDW" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c01ee5f-c4fc-44fe-9d3a-907e8a0426d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Constants - used for printing to stdout in color\n", + "\n", + "GREEN = \"\\033[92m\"\n", + "YELLOW = \"\\033[93m\"\n", + "RED = \"\\033[91m\"\n", + 
"RESET = \"\\033[0m\"\n", + "COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "5105e13c-bca0-4c70-bfaa-649345f53322", + "metadata": {}, + "source": [ + "## Loading the pkl files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c9b05f4-c9eb-462c-8d86-de9140a2d985", + "metadata": {}, + "outputs": [], + "source": [ + "with open('./train.pkl', 'rb') as file:\n", + " train = pickle.load(file)\n", + "\n", + "with open('./test.pkl', 'rb') as file:\n", + " test = pickle.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66459184-535b-4195-9dea-a0de1b349605", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5793f5c-e23e-4a74-9496-1e30dd1e8935", + "metadata": {}, + "outputs": [], + "source": [ + "class Tester:\n", + "\n", + " def __init__(self, predictor, title=None, data=test, size=250):\n", + " self.predictor = predictor\n", + " self.data = data\n", + " self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n", + " self.size = size\n", + " self.guesses = []\n", + " self.truths = []\n", + " self.errors = []\n", + " self.sles = []\n", + " self.colors = []\n", + "\n", + " def color_for(self, error, truth):\n", + " if error<40 or error/truth < 0.2:\n", + " return \"green\"\n", + " elif error<80 or error/truth < 0.4:\n", + " return \"orange\"\n", + " else:\n", + " return \"red\"\n", + " \n", + " def run_datapoint(self, i):\n", + " datapoint = self.data[i]\n", + " guess = self.predictor(datapoint)\n", + " truth = datapoint.price\n", + " error = abs(guess - truth)\n", + " log_error = math.log(truth+1) - math.log(guess+1)\n", + " sle = log_error ** 2\n", + " color = self.color_for(error, 
truth)\n", + " title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+\"...\"\n", + " self.guesses.append(guess)\n", + " self.truths.append(truth)\n", + " self.errors.append(error)\n", + " self.sles.append(sle)\n", + " self.colors.append(color)\n", + " print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n", + "\n", + " def chart(self, title):\n", + " max_error = max(self.errors)\n", + " plt.figure(figsize=(12, 8))\n", + " max_val = max(max(self.truths), max(self.guesses))\n", + " plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n", + " plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n", + " plt.xlabel('Ground Truth')\n", + " plt.ylabel('Model Estimate')\n", + " plt.xlim(0, max_val)\n", + " plt.ylim(0, max_val)\n", + " plt.title(title)\n", + " plt.show()\n", + "\n", + " def report(self):\n", + " average_error = sum(self.errors) / self.size\n", + " rmsle = math.sqrt(sum(self.sles) / self.size)\n", + " hits = sum(1 for color in self.colors if color==\"green\")\n", + " title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n", + " self.chart(title)\n", + "\n", + " def run(self):\n", + " self.error = 0\n", + " for i in range(self.size):\n", + " self.run_datapoint(i)\n", + " self.report()\n", + "\n", + " @classmethod\n", + " def test(cls, function):\n", + " cls(function).run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66ea68e8-ab1b-4f0d-aba4-a59574d8f85e", + "metadata": {}, + "outputs": [], + "source": [ + "def random_pricer(item):\n", + " return random.randrange(1,1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53d941cb-5b73-44ea-b893-3a0ce9997066", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Set the random seed\n", + "\n", + "random.seed(42)\n", + "\n", + "# Run our TestRunner\n", + 
"Tester.test(random_pricer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97451c73-9c1b-43a8-b3b9-9c41942e48a2", + "metadata": {}, + "outputs": [], + "source": [ + "# That was fun!\n", + "# We can do better - here's another rather trivial model\n", + "\n", + "training_prices = [item.price for item in train]\n", + "training_average = sum(training_prices) / len(training_prices)\n", + "\n", + "def constant_pricer(item):\n", + " return training_average" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cf384eb-30c2-40d8-b7e5-48942ac6a969", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Run our constant predictor\n", + "Tester.test(constant_pricer)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce16eee8-bb34-4914-9aa5-57e30a567842", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new \"features\" field on items, and populate it with json parsed from the details dict\n", + "\n", + "for item in train:\n", + " item.features = json.loads(item.details)\n", + "for item in test:\n", + " item.features = json.loads(item.details)\n", + "\n", + "# Look at one" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac702a10-dccb-43d4-887b-6f92a0fb298f", + "metadata": {}, + "outputs": [], + "source": [ + "train[0].features.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd7a41c5-0c51-41be-a61d-8e80c3e90930", + "metadata": {}, + "outputs": [], + "source": [ + "# Look at 20 most common features in training set\n", + "\n", + "feature_count = Counter()\n", + "for item in train:\n", + " for f in item.features.keys():\n", + " feature_count[f]+=1\n", + "\n", + "feature_count.most_common(40)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cef84a9-4932-48fd-9f7a-51cfc06e3216", + "metadata": {}, + "outputs": [], + "source": [ + "# Now some janky code to pluck out the Item Weight\n", + "# Don't worry 
too much about this: spoiler alert, it's not going to be much use in training!\n", + "\n", + "def get_weight(item):\n", + " weight_str = item.features.get('Item Weight')\n", + " if weight_str:\n", + " parts = weight_str.split(' ')\n", + " amount = float(parts[0])\n", + " unit = parts[1].lower()\n", + " if unit==\"pounds\":\n", + " return amount\n", + " elif unit==\"ounces\":\n", + " return amount / 16\n", + " elif unit==\"grams\":\n", + " return amount / 453.592\n", + " elif unit==\"milligrams\":\n", + " return amount / 453592\n", + " elif unit==\"kilograms\":\n", + " return amount / 0.453592\n", + " elif unit==\"hundredths\" and parts[2].lower()==\"pounds\":\n", + " return amount / 100\n", + " else:\n", + " print(weight_str)\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4848b4a-3c5a-4168-83a5-57a1f3ff270d", + "metadata": {}, + "outputs": [], + "source": [ + "weights = [get_weight(t) for t in train]\n", + "weights = [w for w in weights if w]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cd11cc8-f16e-4991-b531-482189ddc4b6", + "metadata": {}, + "outputs": [], + "source": [ + "average_weight = sum(weights)/len(weights)\n", + "average_weight" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efe8ec7f-9777-464f-a809-b06b7033bdb2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_weight_with_default(item):\n", + " weight = get_weight(item)\n", + " return weight or average_weight" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2659fef-a455-431a-9a0e-59342b80084b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_rank(item):\n", + " rank_dict = item.features.get(\"Best Sellers Rank\")\n", + " if rank_dict:\n", + " ranks = rank_dict.values()\n", + " return sum(ranks)/len(ranks)\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20b9b5be-30bc-4d3a-8492-fbae119421a0", + "metadata": {}, + 
"outputs": [], + "source": [ + "ranks = [get_rank(t) for t in train]\n", + "ranks = [r for r in ranks if r]\n", + "average_rank = sum(ranks)/len(ranks)\n", + "average_rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "081e646a-ea50-4ec3-9512-6d5f96f8aef6", + "metadata": {}, + "outputs": [], + "source": [ + "def get_rank_with_default(item):\n", + " rank = get_rank(item)\n", + " return rank or average_rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afd5daf7-cb2b-47af-bf17-dd71a9db65d0", + "metadata": {}, + "outputs": [], + "source": [ + "def get_text_length(item):\n", + " return len(item.test_prompt())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c89012-a922-401b-8a3b-94af641bf27a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# investigate the brands\n", + "\n", + "brands = Counter()\n", + "for t in train:\n", + " brand = t.features.get(\"Brand\")\n", + " if brand:\n", + " brands[brand]+=1\n", + "\n", + "# Look at most common 40 brands\n", + "\n", + "brands.most_common(40)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "386dde54-e028-4a6d-b291-cce889ac1fa3", + "metadata": {}, + "outputs": [], + "source": [ + "TOP_ELECTRONICS_BRANDS = [\"hp\", \"dell\", \"lenovo\", \"samsung\", \"asus\", \"sony\", \"canon\", \"apple\", \"intel\"]\n", + "def is_top_electronics_brand(item):\n", + " brand = item.features.get(\"Brand\")\n", + " return brand and brand.lower() in TOP_ELECTRONICS_BRANDS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c31c9c59-9d0d-47a8-a046-f20ed8d38d4c", + "metadata": {}, + "outputs": [], + "source": [ + "def get_features(item):\n", + " return {\n", + " \"weight\": get_weight_with_default(item),\n", + " \"rank\": get_rank_with_default(item),\n", + " \"text_length\": get_text_length(item),\n", + " \"is_top_electronics_brand\": 1 if is_top_electronics_brand(item) else 0\n", + " }" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "88850855-f5bd-4be2-9d7c-75bf8a21609b", + "metadata": {}, + "outputs": [], + "source": [ + "# Look at features in a training item\n", + "get_features(train[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee9b5298-68b7-497d-8b2e-875287bb25b2", + "metadata": {}, + "outputs": [], + "source": [ + "# A utility function to convert our features into a pandas dataframe\n", + "\n", + "def list_to_dataframe(items):\n", + " features = [get_features(item) for item in items]\n", + " df = pd.DataFrame(features)\n", + " df['price'] = [item.price for item in items]\n", + " return df\n", + "\n", + "train_df = list_to_dataframe(train)\n", + "test_df = list_to_dataframe(test[:250])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc1d68e0-ab33-40f4-9334-461d426af25c", + "metadata": {}, + "outputs": [], + "source": [ + "# Traditional Linear Regression!\n", + "\n", + "np.random.seed(42)\n", + "\n", + "# Separate features and target\n", + "feature_columns = ['weight', 'rank', 'text_length', 'is_top_electronics_brand']\n", + "\n", + "X_train = train_df[feature_columns]\n", + "y_train = train_df['price']\n", + "X_test = test_df[feature_columns]\n", + "y_test = test_df['price']\n", + "\n", + "# Train a Linear Regression\n", + "model = LinearRegression()\n", + "model.fit(X_train, y_train)\n", + "\n", + "for feature, coef in zip(feature_columns, model.coef_):\n", + " print(f\"{feature}: {coef}\")\n", + "print(f\"Intercept: {model.intercept_}\")\n", + "\n", + "# Predict the test set and evaluate\n", + "y_pred = model.predict(X_test)\n", + "mse = mean_squared_error(y_test, y_pred)\n", + "r2 = r2_score(y_test, y_pred)\n", + "\n", + "print(f\"Mean Squared Error: {mse}\")\n", + "print(f\"R-squared Score: {r2}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6561c3c7-ac7f-458b-983c-4a164b9d02c3", + "metadata": {}, + "outputs": [], + "source": [ + "# Function to 
predict price for a new item\n", + "\n", + "def linear_regression_pricer(item):\n", + " features = get_features(item)\n", + " features_df = pd.DataFrame([features])\n", + " return model.predict(features_df)[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bf2caa4-657a-4fc6-9dcb-bed7eaf8dd65", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# test it\n", + "\n", + "Tester.test(linear_regression_pricer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79e1574b-52ef-49cc-bfb5-e97252ed5db8", + "metadata": {}, + "outputs": [], + "source": [ + "# For the next few models, we prepare our documents and prices\n", + "# Note that we use the test prompt for the documents, otherwise we'll reveal the answer!!\n", + "\n", + "prices = np.array([float(item.price) for item in train])\n", + "documents = [item.test_prompt() for item in train]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e126c22e-53e7-4967-9ebb-6b7dd7fe4ade", + "metadata": {}, + "outputs": [], + "source": [ + "# Use the CountVectorizer for a Bag of Words model\n", + "\n", + "np.random.seed(42)\n", + "vectorizer = CountVectorizer(max_features=1000, stop_words='english')\n", + "X = vectorizer.fit_transform(documents)\n", + "regressor = LinearRegression()\n", + "regressor.fit(X, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7148d3-3202-4536-a75c-1627495c51d3", + "metadata": {}, + "outputs": [], + "source": [ + "def bow_lr_pricer(item):\n", + " x = vectorizer.transform([item.test_prompt()])\n", + " return max(regressor.predict(x)[0], 0)\n", + "pred_lr = {}\n", + "for i in range(len(test)):\n", + " pred_lr[test[i]] = bow_lr_pricer(test[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38f7f7d0-d22c-4282-92e5-9666a7b8535d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# test it\n", + "def get_pred_lr(item):\n", + " return 
pred_lr[item]\n", + "Tester.test(get_pred_lr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a82f0a3-191c-4653-b9d6-5622730a6c94", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "354b631d-de40-4dee-91fc-adb2ab094c4f", + "metadata": {}, + "outputs": [], + "source": [ + "xgb = GradientBoostingRegressor()\n", + "xgb.fit(X, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05e7d00b-db89-4f80-bca6-69d251d8cf62", + "metadata": {}, + "outputs": [], + "source": [ + "def bow_xgb_pricer(item):\n", + " x = vectorizer.transform([item.test_prompt()])\n", + " return max(xgb.predict(x)[0], 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a3109d3-ecb6-42df-970c-b798968d467c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# test\n", + "pred_xgb = {}\n", + "for i in range(len(test)):\n", + " pred_xgb[test[i]] = bow_xgb_pricer(test[i])\n", + "def get_pred_xgb(item):\n", + " return pred_xgb[item]\n", + "Tester.test(get_pred_xgb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21a5ce93-e58e-48f7-ab94-407e4c2269c6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a19b662c-d0ad-40c8-8116-f57c368a841d", + "metadata": {}, + "outputs": [], + "source": [ + "import spacy \n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c52fc528-d8af-4b01-9e58-8502033884cc", + "metadata": {}, + "outputs": [], + "source": [ + "def ner_doc(doc):\n", + " d = nlp(doc.replace('\\n',' '))\n", + " ents = []\n", + " for ent in d.ents:\n", + " ents.extend(ent.text.split(' '))\n", + " return ' '.join(list(set(ents)))\n", + "def ner_docs(docs):\n", + " ret = []\n", + " for i,doc in enumerate(docs):\n", + " ret.append(ner_doc(doc))\n", + " if i%1000 == 0:\n", + " print(i, 
ret[-1])\n", + "    return ret" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b1745e1-5aca-4239-b33c-99a1d13ba567", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if os.path.exists('docs.pkl'):\n", + "    docs2 = pickle.load(open('docs.pkl','rb'))\n", + "else:\n", + "    docs2 = ner_docs(documents)\n", + "    pickle.dump(docs2, open('docs.pkl','wb'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19f69fc0-fbc5-4069-b45c-e5f5c0fb14a1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90597133-f057-4597-bb2b-d0b8986342ae", + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(42)\n", + "vectorizer2 = CountVectorizer(max_features=1000, stop_words='english')\n", + "X2 = vectorizer2.fit_transform(docs2)\n", + "regressor2 = LinearRegression()\n", + "regressor2.fit(X2, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "362875d7-a6b1-4965-ae9b-0d7a0104245f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def ner_pricer(item):\n", + "    x = vectorizer2.transform([ner_doc(item.test_prompt())])\n", + "    return max(regressor2.predict(x)[0], 0)\n", + "# test\n", + "pred_ner = {}\n", + "for i in range(len(test)):\n", + "    pred_ner[test[i]] = ner_pricer(test[i])\n", + "def get_pred_ner(item):\n", + "    return pred_ner[item]\n", + "Tester.test(get_pred_ner)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a44bd333-9577-4338-9da5-6947f7e19849", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b623079e-54fa-418f-b209-7d54ebbcc23a", + "metadata": {}, + "outputs": [], + "source": [ + "# The amazing word2vec model, implemented in gensim NLP library\n", + "\n", + "np.random.seed(42)\n", + "\n", + "# Preprocess the documents\n", + "processed_docs = [simple_preprocess(doc) for doc in documents]\n", + 
"\n", + "# Train Word2Vec model\n", + "w2v_model = Word2Vec(sentences=processed_docs, vector_size=400, window=5, min_count=1, workers=8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3de4efc7-68a6-4443-b9fd-70ee9d722362", + "metadata": {}, + "outputs": [], + "source": [ + "# This step of averaging vectors across the document is a weakness in our approach\n", + "\n", + "def document_vector(doc):\n", + " doc_words = simple_preprocess(doc)\n", + " word_vectors = [w2v_model.wv[word] for word in doc_words if word in w2v_model.wv]\n", + " return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)\n", + "\n", + "# Create feature matrix\n", + "X_w2v = np.array([document_vector(doc) for doc in documents])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f05eeec-dab8-4007-8e8c-dcf4175b8861", + "metadata": {}, + "outputs": [], + "source": [ + "# Run Linear Regression on word2vec\n", + "\n", + "word2vec_lr_regressor = LinearRegression()\n", + "word2vec_lr_regressor.fit(X_w2v, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e43d3fb9-e013-4573-90bf-9a522132b555", + "metadata": {}, + "outputs": [], + "source": [ + "def word2vec_lr_pricer(item):\n", + " doc = item.test_prompt()\n", + " doc_vector = document_vector(doc)\n", + " return max(0, word2vec_lr_regressor.predict([doc_vector])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6740319d-5c8e-4125-9106-97e2e8ab72c7", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pred_lr_w2v = {}\n", + "for i in range(len(test)):\n", + " pred_lr_w2v[test[i]] = word2vec_lr_pricer(test[i])\n", + "def get_pred_lr_w2v(item):\n", + " return pred_lr_w2v[item]\n", + "Tester.test(get_pred_lr_w2v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd894bae-ef2a-47e1-b2dd-7aed0d8da75c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "5edf290b-ed91-42a2-ae6a-1a6cfdf190ee", + "metadata": {}, + "outputs": [], + "source": [ + "# MIXTURE OF EXPERTS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a500794-9fc2-473e-adc8-955789cf49c4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pred_best2_mean(item):\n", + " v1 = pred_lr[item]\n", + " v2 = pred_lr_w2v[item]\n", + " v3 = pred_xgb[item]\n", + " d1 = abs(v1-v2)\n", + " d2 = abs(v2-v3)\n", + " d3 = abs(v3-v1)\n", + " if d1 <= min(d2,d3):\n", + " v = (v1+v2)/2\n", + " elif d2 <= min(d1,d3):\n", + " v = (v2+v3)/2\n", + " else:\n", + " v = (v1+v3)/2\n", + " return v\n", + "Tester.test(get_pred_best2_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134029c2-a9a4-44e5-b057-712a7147aec8", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pred_mean(item):\n", + " v1 = pred_lr[item]\n", + " v2 = pred_lr_w2v[item]\n", + " v3 = pred_xgb[item]\n", + " return (v1+v2+v3)/3\n", + "Tester.test(get_pred_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61318267-222d-43de-a63d-4bef09ac8838", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9913c2f-d004-44dc-ac71-485811bbad73", + "metadata": {}, + "outputs": [], + "source": [ + "# Apply MinMax and Standard Scaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec519cbe-6d96-4d04-be56-274cc360d8d9", + "metadata": {}, + "outputs": [], + "source": [ + "scalar = [MinMaxScaler, StandardScaler][0]().fit(X_w2v)\n", + "X_w2v_scaled = scalar.transform(X_w2v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600df04d-8c81-448a-aab7-2da70650274c", + "metadata": {}, + "outputs": [], + "source": [ + "word2vec_lr_reg_scaled = LinearRegression().fit(X_w2v_scaled, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"9f5f1f33-a4d8-44bb-a899-5c71478cf7f1", + "metadata": {}, + "outputs": [], + "source": [ + "def word2vec_lr_pricer_scaled(item):\n", + "    doc = item.test_prompt()\n", + "    doc_vector = document_vector(doc)\n", + "    doc_vector_scaled = scalar.transform([doc_vector])\n", + "    return max(0, word2vec_lr_reg_scaled.predict([doc_vector_scaled[0]])[0])\n", + "\n", + "Tester.test(word2vec_lr_pricer_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "194796e9-c07e-47d9-8908-f75dd1787ab4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bb5957e-780d-4e85-80bc-50d52350aa58", + "metadata": {}, + "outputs": [], + "source": [ + "# Run XGB on word2vec\n", + "\n", + "word2vec_xgb_regressor = GradientBoostingRegressor()\n", + "word2vec_xgb_regressor.fit(X_w2v, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4281d224-b16a-4276-a48f-fa1829c49c71", + "metadata": {}, + "outputs": [], + "source": [ + "def word2vec_xgb_pricer(item):\n", + "    doc = item.test_prompt()\n", + "    doc_vector = document_vector(doc)\n", + "    return max(0, word2vec_xgb_regressor.predict([doc_vector])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72adf5b4-3781-452e-ba0e-69dfaceec91d", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(word2vec_xgb_pricer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40791fa8-5ff4-4cc7-8813-9abdc6a037fa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d6d3265-37c1-464c-a489-5be4df0a7276", + "metadata": {}, + "outputs": [], + "source": [ + "# Support Vector Machines\n", + "\n", + "np.random.seed(42)\n", + "svr_regressor = LinearSVR()\n", + "\n", + "svr_regressor.fit(X_w2v, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcc289e6-56a1-4119-864f-2fdf8efde643", + "metadata": {}, + 
"outputs": [], + "source": [ + "def svr_pricer(item):\n", + " np.random.seed(42)\n", + " doc = item.test_prompt()\n", + " doc_vector = document_vector(doc)\n", + " return max(float(svr_regressor.predict([doc_vector])[0]),0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80286a48-7cca-40e6-af76-a814a23bb9dc", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(svr_pricer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b5b2758-35ce-4401-943c-3221ac2c6d2d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6c44fe4-e4d9-4559-a8ed-d8f97e25b69f", + "metadata": {}, + "outputs": [], + "source": [ + "# And the powerful Random Forest regression\n", + "mfile = 'random_forest_model.pkl'\n", + "if os.path.exists(mfile):\n", + " rf_model = pickle.load(open(mfile,'rb'))\n", + "else:\n", + " rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=8)\n", + " rf_model.fit(X_w2v, prices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a38812d0-913b-400b-804f-51434d895d05", + "metadata": {}, + "outputs": [], + "source": [ + "def random_forest_pricer(item):\n", + " doc = item.test_prompt()\n", + " doc_vector = document_vector(doc)\n", + " return max(0, rf_model.predict([doc_vector])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b51c01-c791-4fdc-8010-00b2e486b8ce", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(random_forest_pricer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc85b271-4c92-480c-8843-2d7713b0fa57", + "metadata": {}, + "outputs": [], + "source": [ + "pred_rf = {}\n", + "for i in range(len(test)):\n", + " pred_rf[test[i]] = random_forest_pricer(test[i])\n", + "def get_pred_rf(item):\n", + " return pred_rf[item]\n", + "Tester.test(get_pred_rf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"222ce4e3-9816-4674-bdf7-40ece20d23eb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}