{
"cells": [
{
"cell_type": "markdown",
"id": "db8736a7-ed94-441c-9556-831fa57b5a10",
"metadata": {},
"source": [
"## Testing baseline algorithms for toys and games dataset\n",
"### Train Data: https://drive.google.com/file/d/180ZI9OIdivkO0T-H1wki1-K514iYYK_n\n",
"### Test Data: https://drive.google.com/file/d/1cW5doBO4jpbLQfZwygKFhy0CSSf6U1MQ"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "681c717b-4c24-4ac3-a5f3-3c5881d6e70a",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import math\n",
"import json\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pickle\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "933b6e75-3661-4f30-b0b5-c28d04e3748e",
"metadata": {},
"outputs": [],
"source": [
"# More imports for our traditional machine learning\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42cf33b7-7abd-44ba-9780-c156b70473b5",
"metadata": {},
"outputs": [],
"source": [
"# NLP related imports\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from gensim.models import Word2Vec\n",
"from gensim.utils import simple_preprocess"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1ac3ec0-183c-4a12-920b-b06397f86815",
"metadata": {},
"outputs": [],
"source": [
"# Finally, more imports for more advanced machine learning\n",
"from sklearn.svm import LinearSVR\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from polire import IDW"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c01ee5f-c4fc-44fe-9d3a-907e8a0426d2",
"metadata": {},
"outputs": [],
"source": [
"# Constants - used for printing to stdout in color\n",
"\n",
"GREEN = \"\\033[92m\"\n",
"YELLOW = \"\\033[93m\"\n",
"RED = \"\\033[91m\"\n",
"RESET = \"\\033[0m\"\n",
"COLOR_MAP = {\"red\":RED, \"orange\": YELLOW, \"green\": GREEN}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c830ed3e-24ee-4af6-a07b-a1bfdcd39278",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff2017ee-a5b0-4bda-96ff-ba962c05fb4c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b34ba46-c506-412a-b785-c6774f1cdd26",
"metadata": {},
"outputs": [],
"source": [
"from typing import Optional\n",
"from transformers import AutoTokenizer\n",
"import re\n",
"\n",
"BASE_MODEL = \"meta-llama/Meta-Llama-3.1-8B\"\n",
"\n",
"MIN_TOKENS = 150 # Any less than this, and we don't have enough useful content\n",
"MAX_TOKENS = 160 # Truncate after this many tokens. Then after adding in prompt text, we will get to around 180 tokens\n",
"\n",
"MIN_CHARS = 300\n",
"CEILING_CHARS = MAX_TOKENS * 7\n",
"\n",
"class Item:\n",
" \"\"\"\n",
" An Item is a cleaned, curated datapoint of a Product with a Price\n",
" \"\"\"\n",
" \n",
" tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
" PREFIX = \"Price is $\"\n",
" QUESTION = \"How much does this cost to the nearest dollar?\"\n",
" REMOVALS = ['\"Batteries Included?\": \"No\"', '\"Batteries Included?\": \"Yes\"', '\"Batteries Required?\": \"No\"', '\"Batteries Required?\": \"Yes\"', \"By Manufacturer\", \"Item\", \"Date First\", \"Package\", \":\", \"Number of\", \"Best Sellers\", \"Number\", \"Product \"]\n",
"\n",
" title: str\n",
" price: float\n",
" category: str\n",
" token_count: int = 0\n",
" details: Optional[str]\n",
" prompt: Optional[str] = None\n",
" include = False\n",
"\n",
" def __init__(self, data, price):\n",
" self.title = data['title']\n",
" self.price = price\n",
" self.parse(data)\n",
"\n",
" def scrub_details(self):\n",
" \"\"\"\n",
" Clean up the details string by removing common text that doesn't add value\n",
" \"\"\"\n",
" details = self.details\n",
" for remove in self.REMOVALS:\n",
" details = details.replace(remove, \"\")\n",
" return details\n",
"\n",
" def scrub(self, stuff):\n",
" \"\"\"\n",
" Clean up the provided text by removing unnecessary characters and whitespace\n",
" Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers\n",
" \"\"\"\n",
" stuff = re.sub(r'[:\\[\\]\"{}【】\\s]+', ' ', stuff).strip()\n",
" stuff = stuff.replace(\" ,\", \",\").replace(\",,,\",\",\").replace(\",,\",\",\")\n",
" words = stuff.split(' ')\n",
" select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]\n",
" return \" \".join(select)\n",
" \n",
" def parse(self, data):\n",
" \"\"\"\n",
" Parse this datapoint and if it fits within the allowed Token range,\n",
" then set include to True\n",
" \"\"\"\n",
" contents = '\\n'.join(data['description'])\n",
" if contents:\n",
" contents += '\\n'\n",
" features = '\\n'.join(data['features'])\n",
" if features:\n",
" contents += features + '\\n'\n",
" self.details = data['details']\n",
" if self.details:\n",
" contents += self.scrub_details() + '\\n'\n",
" if len(contents) > MIN_CHARS:\n",
" contents = contents[:CEILING_CHARS]\n",
" text = f\"{self.scrub(self.title)}\\n{self.scrub(contents)}\"\n",
" tokens = self.tokenizer.encode(text, add_special_tokens=False)\n",
" if len(tokens) > MIN_TOKENS:\n",
" tokens = tokens[:MAX_TOKENS]\n",
" text = self.tokenizer.decode(tokens)\n",
" self.make_prompt(text)\n",
" self.include = True\n",
"\n",
" def make_prompt(self, text):\n",
" \"\"\"\n",
" Set the prompt instance variable to be a prompt appropriate for training\n",
" \"\"\"\n",
" self.prompt = f\"{self.QUESTION}\\n\\n{text}\\n\\n\"\n",
" self.prompt += f\"{self.PREFIX}{str(round(self.price))}.00\"\n",
" self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))\n",
"\n",
" def test_prompt(self):\n",
" \"\"\"\n",
" Return a prompt suitable for testing, with the actual price removed\n",
" \"\"\"\n",
" return self.prompt.split(self.PREFIX)[0] + self.PREFIX\n",
"\n",
" def __repr__(self):\n",
" \"\"\"\n",
" Return a String version of this Item\n",
" \"\"\"\n",
" return f\"<{self.title} = ${self.price}>\""
]
},
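{
"cell_type": "markdown",
"id": "item-prompt-format",
"metadata": {},
"source": [
"Each included `Item` carries a training prompt of the form:\n",
"\n",
"```\n",
"How much does this cost to the nearest dollar?\n",
"\n",
"<scrubbed title and description, truncated to 160 tokens>\n",
"\n",
"Price is $<rounded price>.00\n",
"```\n",
"\n",
"`test_prompt()` returns everything up to and including `Price is $`, so a predictor has to supply the number itself."
]
},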
{
"cell_type": "code",
"execution_count": null,
"id": "5c9b05f4-c9eb-462c-8d86-de9140a2d985",
"metadata": {},
"outputs": [],
"source": [
"# Download files from link\n",
"\n",
"with open('./train_lite.pkl', 'rb') as file:\n",
" train = pickle.load(file)\n",
"\n",
"with open('./test_lite.pkl', 'rb') as file:\n",
" test = pickle.load(file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66459184-535b-4195-9dea-a0de1b349605",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5793f5c-e23e-4a74-9496-1e30dd1e8935",
"metadata": {},
"outputs": [],
"source": [
"class Tester:\n",
"\n",
" def __init__(self, predictor, title=None, data=test, size=200):\n",
" self.predictor = predictor\n",
" self.data = data\n",
" self.title = title or predictor.__name__.replace(\"_\", \" \").title()\n",
" self.size = size\n",
" self.guesses = []\n",
" self.truths = []\n",
" self.errors = []\n",
" self.sles = []\n",
" self.colors = []\n",
"\n",
" def color_for(self, error, truth):\n",
" if error<5 or error/truth < 0.1:\n",
" return \"green\"\n",
" elif error<15 or error/truth < 0.3:\n",
" return \"orange\"\n",
" else:\n",
" return \"red\"\n",
" \n",
" def run_datapoint(self, i):\n",
" datapoint = self.data[i]\n",
" guess = self.predictor(datapoint)\n",
" truth = datapoint.price\n",
" error = abs(guess - truth)\n",
" log_error = math.log(truth+1) - math.log(guess+1)\n",
" sle = log_error ** 2\n",
" color = self.color_for(error, truth)\n",
" title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+\"...\"\n",
" self.guesses.append(guess)\n",
" self.truths.append(truth)\n",
" self.errors.append(error)\n",
" self.sles.append(sle)\n",
" self.colors.append(color)\n",
" print(f\"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}\")\n",
"\n",
" def chart(self, title):\n",
" max_error = max(self.errors)\n",
" plt.figure(figsize=(12, 8))\n",
" max_val = max(max(self.truths), max(self.guesses))\n",
" plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)\n",
" plt.scatter(self.truths, self.guesses, s=3, c=self.colors)\n",
" plt.xlabel('Ground Truth')\n",
" plt.ylabel('Model Estimate')\n",
" plt.xlim(0, max_val)\n",
" plt.ylim(0, max_val)\n",
" plt.title(title)\n",
" plt.show()\n",
"\n",
" def report(self):\n",
" average_error = sum(self.errors) / self.size\n",
" rmsle = math.sqrt(sum(self.sles) / self.size)\n",
" hits = sum(1 for color in self.colors if color==\"green\")\n",
" title = f\"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%\"\n",
" self.chart(title)\n",
"\n",
" def run(self):\n",
" self.error = 0\n",
" for i in range(self.size):\n",
" self.run_datapoint(i)\n",
" self.report()\n",
"\n",
" @classmethod\n",
" def test(cls, function):\n",
" cls(function).run()"
]
},
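{
"cell_type": "markdown",
"id": "tester-metrics",
"metadata": {},
"source": [
"`Tester` scores the first `size` test items (200 by default) and reports the mean absolute error, the hit rate (the share of green guesses, i.e. within \\$5 or 10% of the truth), and the root mean squared log error:\n",
"\n",
"$$\\mathrm{RMSLE}=\\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}\\left(\\log(1+y_i)-\\log(1+\\hat{y}_i)\\right)^2}$$\n",
"\n",
"which penalizes relative rather than absolute mistakes, so a \\$5 miss on a \\$10 toy counts for more than a \\$5 miss on a \\$200 one."
]
},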
{
"cell_type": "code",
"execution_count": null,
"id": "66ea68e8-ab1b-4f0d-aba4-a59574d8f85e",
"metadata": {},
"outputs": [],
"source": [
"def random_pricer(item):\n",
" return random.randrange(1,60)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53d941cb-5b73-44ea-b893-3a0ce9997066",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Set the random seed\n",
"\n",
"random.seed(42)\n",
"\n",
"# Run our TestRunner\n",
"Tester.test(random_pricer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97451c73-9c1b-43a8-b3b9-9c41942e48a2",
"metadata": {},
"outputs": [],
"source": [
"# That was fun!\n",
"# We can do better - here's another rather trivial model\n",
"\n",
"training_prices = [item.price for item in train]\n",
"training_average = sum(training_prices) / len(training_prices)\n",
"\n",
"def constant_pricer(item):\n",
" return training_average"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cf384eb-30c2-40d8-b7e5-48942ac6a969",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Run our constant predictor\n",
"Tester.test(constant_pricer)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce16eee8-bb34-4914-9aa5-57e30a567842",
"metadata": {},
"outputs": [],
"source": [
"# Create a new \"features\" field on items, and populate it with json parsed from the details dict\n",
"\n",
"for item in train:\n",
" item.features = json.loads(item.details)\n",
"for item in test:\n",
" item.features = json.loads(item.details)\n",
"\n",
"# Look at one"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac702a10-dccb-43d4-887b-6f92a0fb298f",
"metadata": {},
"outputs": [],
"source": [
"train[0].features.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd7a41c5-0c51-41be-a61d-8e80c3e90930",
"metadata": {},
"outputs": [],
"source": [
"# Look at 20 most common features in training set\n",
"\n",
"feature_count = Counter()\n",
"for item in train:\n",
" for f in item.features.keys():\n",
" feature_count[f]+=1\n",
"\n",
"feature_count.most_common(40)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cef84a9-4932-48fd-9f7a-51cfc06e3216",
"metadata": {},
"outputs": [],
"source": [
"# Now some janky code to pluck out the Item Weight\n",
"# Don't worry too much about this: spoiler alert, it's not going to be much use in training!\n",
"\n",
"def get_weight(item):\n",
" weight_str = item.features.get('Item Weight')\n",
" if weight_str:\n",
" parts = weight_str.split(' ')\n",
" amount = float(parts[0])\n",
" unit = parts[1].lower()\n",
" if unit==\"pounds\":\n",
" return amount\n",
" elif unit==\"ounces\":\n",
" return amount / 16\n",
" elif unit==\"grams\":\n",
" return amount / 453.592\n",
" elif unit==\"milligrams\":\n",
" return amount / 453592\n",
" elif unit==\"kilograms\":\n",
" return amount / 0.453592\n",
" elif unit==\"hundredths\" and parts[2].lower()==\"pounds\":\n",
" return amount / 100\n",
" else:\n",
" print(weight_str)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4848b4a-3c5a-4168-83a5-57a1f3ff270d",
"metadata": {},
"outputs": [],
"source": [
"weights = [get_weight(t) for t in train]\n",
"weights = [w for w in weights if w]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cd11cc8-f16e-4991-b531-482189ddc4b6",
"metadata": {},
"outputs": [],
"source": [
"average_weight = sum(weights)/len(weights)\n",
"average_weight"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efe8ec7f-9777-464f-a809-b06b7033bdb2",
"metadata": {},
"outputs": [],
"source": [
"def get_weight_with_default(item):\n",
" weight = get_weight(item)\n",
" return weight or average_weight"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2659fef-a455-431a-9a0e-59342b80084b",
"metadata": {},
"outputs": [],
"source": [
"def get_rank(item):\n",
" rank_dict = item.features.get(\"Best Sellers Rank\")\n",
" if rank_dict:\n",
" ranks = rank_dict.values()\n",
" return sum(ranks)/len(ranks)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20b9b5be-30bc-4d3a-8492-fbae119421a0",
"metadata": {},
"outputs": [],
"source": [
"ranks = [get_rank(t) for t in train]\n",
"ranks = [r for r in ranks if r]\n",
"average_rank = sum(ranks)/len(ranks)\n",
"average_rank"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "081e646a-ea50-4ec3-9512-6d5f96f8aef6",
"metadata": {},
"outputs": [],
"source": [
"def get_rank_with_default(item):\n",
" rank = get_rank(item)\n",
" return rank or average_rank"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afd5daf7-cb2b-47af-bf17-dd71a9db65d0",
"metadata": {},
"outputs": [],
"source": [
"def get_text_length(item):\n",
" return len(item.test_prompt())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85c89012-a922-401b-8a3b-94af641bf27a",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# investigate the brands\n",
"\n",
"brands = Counter()\n",
"for t in train:\n",
" brand = t.features.get(\"Brand\")\n",
" if brand:\n",
" brands[brand]+=1\n",
"\n",
"# Look at most common 40 brands\n",
"\n",
"brands.most_common(40)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "386dde54-e028-4a6d-b291-cce889ac1fa3",
"metadata": {},
"outputs": [],
"source": [
"TOP_ELECTRONICS_BRANDS = [\"hp\", \"dell\", \"lenovo\", \"samsung\", \"asus\", \"sony\", \"canon\", \"apple\", \"intel\"]\n",
"def is_top_electronics_brand(item):\n",
" brand = item.features.get(\"Brand\")\n",
" return brand and brand.lower() in TOP_ELECTRONICS_BRANDS"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c31c9c59-9d0d-47a8-a046-f20ed8d38d4c",
"metadata": {},
"outputs": [],
"source": [
"def get_features(item):\n",
" return {\n",
" \"weight\": get_weight_with_default(item),\n",
" \"rank\": get_rank_with_default(item),\n",
" \"text_length\": get_text_length(item),\n",
" \"is_top_electronics_brand\": 1 if is_top_electronics_brand(item) else 0\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88850855-f5bd-4be2-9d7c-75bf8a21609b",
"metadata": {},
"outputs": [],
"source": [
"# Look at features in a training item\n",
"get_features(train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee9b5298-68b7-497d-8b2e-875287bb25b2",
"metadata": {},
"outputs": [],
"source": [
"# A utility function to convert our features into a pandas dataframe\n",
"\n",
"def list_to_dataframe(items):\n",
" features = [get_features(item) for item in items]\n",
" df = pd.DataFrame(features)\n",
" df['price'] = [item.price for item in items]\n",
" return df\n",
"\n",
"train_df = list_to_dataframe(train)\n",
"test_df = list_to_dataframe(test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc1d68e0-ab33-40f4-9334-461d426af25c",
"metadata": {},
"outputs": [],
"source": [
"# Traditional Linear Regression!\n",
"\n",
"np.random.seed(42)\n",
"\n",
"# Separate features and target\n",
"feature_columns = ['weight', 'rank', 'text_length', 'is_top_electronics_brand']\n",
"\n",
"X_train = train_df[feature_columns]\n",
"y_train = train_df['price']\n",
"X_test = test_df[feature_columns]\n",
"y_test = test_df['price']\n",
"\n",
"# Train a Linear Regression\n",
"model = LinearRegression()\n",
"model.fit(X_train, y_train)\n",
"\n",
"for feature, coef in zip(feature_columns, model.coef_):\n",
" print(f\"{feature}: {coef}\")\n",
"print(f\"Intercept: {model.intercept_}\")\n",
"\n",
"# Predict the test set and evaluate\n",
"y_pred = model.predict(X_test)\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"r2 = r2_score(y_test, y_pred)\n",
"\n",
"print(f\"Mean Squared Error: {mse}\")\n",
"print(f\"R-squared Score: {r2}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6561c3c7-ac7f-458b-983c-4a164b9d02c3",
"metadata": {},
"outputs": [],
"source": [
"# Function to predict price for a new item\n",
"\n",
"def linear_regression_pricer(item):\n",
" features = get_features(item)\n",
" features_df = pd.DataFrame([features])\n",
" return model.predict(features_df)[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bf2caa4-657a-4fc6-9dcb-bed7eaf8dd65",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# test it\n",
"\n",
"Tester.test(linear_regression_pricer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79e1574b-52ef-49cc-bfb5-e97252ed5db8",
"metadata": {},
"outputs": [],
"source": [
"# For the next few models, we prepare our documents and prices\n",
"# Note that we use the test prompt for the documents, otherwise we'll reveal the answer!!\n",
"\n",
"prices = np.array([float(item.price) for item in train])\n",
"documents = [item.test_prompt() for item in train]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e126c22e-53e7-4967-9ebb-6b7dd7fe4ade",
"metadata": {},
"outputs": [],
"source": [
"# Use the CountVectorizer for a Bag of Words model\n",
"\n",
"np.random.seed(42)\n",
"vectorizer = CountVectorizer(max_features=10000, stop_words='english')\n",
"X = vectorizer.fit_transform(documents)\n",
"regressor = LinearRegression()\n",
"regressor.fit(X, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b7148d3-3202-4536-a75c-1627495c51d3",
"metadata": {},
"outputs": [],
"source": [
"def bow_lr_pricer(item):\n",
" x = vectorizer.transform([item.test_prompt()])\n",
" return max(regressor.predict(x)[0], 0)\n",
"pred_lr = {}\n",
"for i in range(len(test)):\n",
" pred_lr[test[i]] = bow_lr_pricer(test[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38f7f7d0-d22c-4282-92e5-9666a7b8535d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# test it\n",
"def get_pred_lr(item):\n",
" return pred_lr[item]\n",
"Tester.test(get_pred_lr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a82f0a3-191c-4653-b9d6-5622730a6c94",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "354b631d-de40-4dee-91fc-adb2ab094c4f",
"metadata": {},
"outputs": [],
"source": [
"xgb = GradientBoostingRegressor()\n",
"xgb.fit(X, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05e7d00b-db89-4f80-bca6-69d251d8cf62",
"metadata": {},
"outputs": [],
"source": [
"def bow_xgb_pricer(item):\n",
" x = vectorizer.transform([item.test_prompt()])\n",
" return max(xgb.predict(x)[0], 0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a3109d3-ecb6-42df-970c-b798968d467c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# test\n",
"pred_xgb = {}\n",
"for i in range(len(test)):\n",
" pred_xgb[test[i]] = bow_xgb_pricer(test[i])\n",
"def get_pred_xgb(item):\n",
" return pred_xgb[item]\n",
"Tester.test(get_pred_xgb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21a5ce93-e58e-48f7-ab94-407e4c2269c6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "a19b662c-d0ad-40c8-8116-f57c368a841d",
"metadata": {},
"outputs": [],
"source": [
"import spacy \n",
"nlp = spacy.load(\"en_core_web_sm\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c52fc528-d8af-4b01-9e58-8502033884cc",
"metadata": {},
"outputs": [],
"source": [
"def ner_doc(doc):\n",
" d = nlp(doc.replace('\\n',' '))\n",
" ents = []\n",
" for ent in d.ents:\n",
" ents.extend(ent.text.split(' '))\n",
" return ' '.join(list(set(ents)))\n",
"def ner_docs(docs):\n",
" ret = []\n",
" for i,doc in enumerate(docs):\n",
" ret.append(ner_doc(doc))\n",
" if i%1000 == 0:\n",
" print(i, ret[-1])\n",
" return ret"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b1745e1-5aca-4239-b33c-99a1d13ba567",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"if 0 and os.path.exists('docs.pkl'):\n",
" docs2 = pickle.load(open('docs.pkl','rb'))\n",
"else:\n",
" docs2 = ner_docs(documents)\n",
" # pickle.dump(docs2, open('docs.pkl','wb'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "827b0934-1671-4698-82a8-72d1cf564e8f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "90597133-f057-4597-bb2b-d0b8986342ae",
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)\n",
"vectorizer2 = CountVectorizer(max_features=10000, stop_words='english')\n",
"X2 = vectorizer2.fit_transform(docs2)\n",
"regressor2 = LinearRegression()\n",
"regressor2.fit(X2, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "362875d7-a6b1-4965-ae9b-0d7a0104245f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def ner_pricer(item):\n",
" x = vectorizer2.transform([ner_doc(item.test_prompt())])\n",
" return max(regressor2.predict(x)[0], 0)\n",
"# test\n",
"pred_ner = {}\n",
"for i in range(len(test)):\n",
" pred_ner[test[i]] = ner_pricer(test[i])\n",
"def get_pred_ner(item):\n",
" return pred_ner[item]\n",
"Tester.test(get_pred_ner)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a44bd333-9577-4338-9da5-6947f7e19849",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b623079e-54fa-418f-b209-7d54ebbcc23a",
"metadata": {},
"outputs": [],
"source": [
"# The amazing word2vec model, implemented in gensim NLP library\n",
"\n",
"np.random.seed(42)\n",
"\n",
"# Preprocess the documents\n",
"processed_docs = [simple_preprocess(doc) for doc in documents]\n",
"\n",
"# Train Word2Vec model\n",
"w2v_model = Word2Vec(sentences=processed_docs, vector_size=400, window=5, min_count=1, workers=8)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3de4efc7-68a6-4443-b9fd-70ee9d722362",
"metadata": {},
"outputs": [],
"source": [
"# This step of averaging vectors across the document is a weakness in our approach\n",
"\n",
"def document_vector(doc):\n",
" doc_words = simple_preprocess(doc)\n",
" word_vectors = [w2v_model.wv[word] for word in doc_words if word in w2v_model.wv]\n",
" return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)\n",
"\n",
"# Create feature matrix\n",
"X_w2v = np.array([document_vector(doc) for doc in documents])"
]
},
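{
"cell_type": "code",
"execution_count": null,
"id": "w2v-idf-weighted-sketch",
"metadata": {},
"outputs": [],
"source": [
"# A sketch of one way to soften the averaging weakness noted above (not used below):\n",
"# weight each word vector by its inverse document frequency, so rare, informative words\n",
"# dominate the average instead of being drowned out by common ones. The helper name and\n",
"# the TfidfVectorizer settings are illustrative choices, not part of the original flow.\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"tfidf = TfidfVectorizer(stop_words='english')\n",
"tfidf.fit(documents)\n",
"idf_lookup = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))\n",
"\n",
"def document_vector_idf(doc):\n",
"    doc_words = simple_preprocess(doc)\n",
"    pairs = [(w2v_model.wv[word], idf_lookup.get(word, 1.0)) for word in doc_words if word in w2v_model.wv]\n",
"    if not pairs:\n",
"        return np.zeros(w2v_model.vector_size)\n",
"    vectors, idf_weights = zip(*pairs)\n",
"    return np.average(vectors, axis=0, weights=idf_weights)"
]
},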
{
"cell_type": "code",
"execution_count": null,
"id": "9f05eeec-dab8-4007-8e8c-dcf4175b8861",
"metadata": {},
"outputs": [],
"source": [
"# Run Linear Regression on word2vec\n",
"\n",
"word2vec_lr_regressor = LinearRegression()\n",
"word2vec_lr_regressor.fit(X_w2v, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e43d3fb9-e013-4573-90bf-9a522132b555",
"metadata": {},
"outputs": [],
"source": [
"def word2vec_lr_pricer(item):\n",
" doc = item.test_prompt()\n",
" doc_vector = document_vector(doc)\n",
" return max(0, word2vec_lr_regressor.predict([doc_vector])[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6740319d-5c8e-4125-9106-97e2e8ab72c7",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pred_lr_w2v = {}\n",
"for i in range(len(test)):\n",
" pred_lr_w2v[test[i]] = word2vec_lr_pricer(test[i])\n",
"def get_pred_lr_w2v(item):\n",
" return pred_lr_w2v[item]\n",
"Tester.test(get_pred_lr_w2v)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd894bae-ef2a-47e1-b2dd-7aed0d8da75c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5edf290b-ed91-42a2-ae6a-1a6cfdf190ee",
"metadata": {},
"outputs": [],
"source": [
"# MIXTURE OF EXPERTS"
]
},
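{
"cell_type": "markdown",
"id": "ensemble-heuristics",
"metadata": {},
"source": [
"Two simple ensembles of the bag-of-words, word2vec and gradient-boosting pricers follow: `get_pred_best2_mean` averages whichever two predictions agree most closely (treating the outlier as the likelier mistake), while `get_pred_mean` simply averages all three."
]
},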
{
"cell_type": "code",
"execution_count": null,
"id": "0a500794-9fc2-473e-adc8-955789cf49c4",
"metadata": {},
"outputs": [],
"source": [
"def get_pred_best2_mean(item):\n",
" v1 = pred_lr[item]\n",
" v2 = pred_lr_w2v[item]\n",
" v3 = pred_xgb[item]\n",
" d1 = abs(v1-v2)\n",
" d2 = abs(v2-v3)\n",
" d3 = abs(v3-v1)\n",
" if d1 <= min(d2,d3):\n",
" v = (v1+v2)/2\n",
" elif d2 <= min(d1,d3):\n",
" v = (v2+v3)/2\n",
" else:\n",
" v = (v1+v3)/2\n",
" return v\n",
"Tester.test(get_pred_best2_mean)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "134029c2-a9a4-44e5-b057-712a7147aec8",
"metadata": {},
"outputs": [],
"source": [
"def get_pred_mean(item):\n",
" v1 = pred_lr[item]\n",
" v2 = pred_lr_w2v[item]\n",
" v3 = pred_xgb[item]\n",
" return (v1+v2+v3)/3\n",
"Tester.test(get_pred_mean)"
]
},
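{
"cell_type": "code",
"execution_count": null,
"id": "stacked-blend-sketch",
"metadata": {},
"outputs": [],
"source": [
"# A sketch of a learned blend as an alternative to the hand-rolled rules above:\n",
"# fit linear weights over the three pricers' outputs. Fitting on the same test items\n",
"# we evaluate on leaks the answers, so treat this purely as an illustration - a fair\n",
"# version would fit the weights on predictions for a held-out slice of the training set.\n",
"stack_X = np.array([[pred_lr[t], pred_lr_w2v[t], pred_xgb[t]] for t in test])\n",
"stack_y = np.array([t.price for t in test])\n",
"stack_model = LinearRegression().fit(stack_X, stack_y)\n",
"print(dict(zip(['bow_lr', 'w2v_lr', 'bow_xgb'], stack_model.coef_)), stack_model.intercept_)"
]
},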
{
"cell_type": "code",
"execution_count": null,
"id": "61318267-222d-43de-a63d-4bef09ac8838",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9913c2f-d004-44dc-ac71-485811bbad73",
"metadata": {},
"outputs": [],
"source": [
"# Apply MinMax and Standard Scaler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec519cbe-6d96-4d04-be56-274cc360d8d9",
"metadata": {},
"outputs": [],
"source": [
"scalar = [MinMaxScaler, StandardScaler][1]().fit(X_w2v)\n",
"X_w2v_scaled = scalar.transform(X_w2v)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "600df04d-8c81-448a-aab7-2da70650274c",
"metadata": {},
"outputs": [],
"source": [
"word2vec_lr_reg_scaled = LinearRegression().fit(X_w2v_scaled, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f5f1f33-a4d8-44bb-a899-5c71478cf7f1",
"metadata": {},
"outputs": [],
"source": [
"def word2vec_lr_pricer_scaled(item):\n",
" doc = item.test_prompt()\n",
" doc_vector = document_vector(doc)\n",
" doc_vector_scaled = scalar.transform([doc_vector])\n",
" return max(0, word2vec_lr_reg_scaled.predict([doc_vector_scaled[0]])[0])\n",
"\n",
"Tester.test(word2vec_lr_pricer_scaled)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "194796e9-c07e-47d9-8908-f75dd1787ab4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bb5957e-780d-4e85-80bc-50d52350aa58",
"metadata": {},
"outputs": [],
"source": [
"# Run XGB on word2vec\n",
"\n",
"word2vec_xgb_regressor = GradientBoostingRegressor()\n",
"word2vec_xgb_regressor.fit(X_w2v, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4281d224-b16a-4276-a48f-fa1829c49c71",
"metadata": {},
"outputs": [],
"source": [
"def word2vec_xgb_pricer(item):\n",
" doc = item.test_prompt()\n",
" doc_vector = document_vector(doc)\n",
" return max(0, word2vec_xgb_regressor.predict([doc_vector])[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72adf5b4-3781-452e-ba0e-69dfaceec91d",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(word2vec_xgb_pricer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9956920d-db99-4e3a-a219-af29c8734a0b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d6d3265-37c1-464c-a489-5be4df0a7276",
"metadata": {},
"outputs": [],
"source": [
"# Support Vector Machines\n",
"\n",
"np.random.seed(42)\n",
"svr_regressor = LinearSVR()\n",
"\n",
"svr_regressor.fit(X_w2v, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcc289e6-56a1-4119-864f-2fdf8efde643",
"metadata": {},
"outputs": [],
"source": [
"def svr_pricer(item):\n",
" np.random.seed(42)\n",
" doc = item.test_prompt()\n",
" doc_vector = document_vector(doc)\n",
" return max(float(svr_regressor.predict([doc_vector])[0]),0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80286a48-7cca-40e6-af76-a814a23bb9dc",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(svr_pricer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be953b13-e91f-4d27-bd05-e05598873be2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6c44fe4-e4d9-4559-a8ed-d8f97e25b69f",
"metadata": {},
"outputs": [],
"source": [
"# And the powerful Random Forest regression\n",
"mfile = 'random_forest_model.pkl'\n",
"if 0 and os.path.exists(mfile):\n",
" rf_model = pickle.load(open(mfile,'rb'))\n",
"else:\n",
" rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=8)\n",
" rf_model.fit(X_w2v, prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a38812d0-913b-400b-804f-51434d895d05",
"metadata": {},
"outputs": [],
"source": [
"def random_forest_pricer(item):\n",
" doc = item.test_prompt()\n",
" doc_vector = document_vector(doc)\n",
" return max(0, rf_model.predict([doc_vector])[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc85b271-4c92-480c-8843-2d7713b0fa57",
"metadata": {},
"outputs": [],
"source": [
"pred_rf = {}\n",
"for i in range(len(test)):\n",
" pred_rf[test[i]] = random_forest_pricer(test[i])\n",
"def get_pred_rf(item):\n",
" return pred_rf[item]\n",
"Tester.test(get_pred_rf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "222ce4e3-9816-4674-bdf7-40ece20d23eb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}