Exercise week 8 and cleanup week 5 (no outputs)

This commit is contained in:
unknown
2025-10-27 14:21:38 +01:00
parent d6718a658f
commit fbc8e68196
2 changed files with 446 additions and 108 deletions

View File

@@ -47,21 +47,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a9aeb363",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key exists and begins sk-ant-\n",
"Google API Key exists and begins AI\n",
"OLLAMA API Key exists and begins 36\n"
]
}
],
"outputs": [],
"source": [
"# imports\n",
"\n",
@@ -120,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "2e250912",
"metadata": {},
"outputs": [],
@@ -144,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "1f67fdb3",
"metadata": {},
"outputs": [],
@@ -200,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "cec185e3",
"metadata": {},
"outputs": [],
@@ -292,18 +281,10 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": null,
"id": "be31f352",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"now\n"
]
}
],
"outputs": [],
"source": [
"mails_2 = generate_synthetic_emails(\n",
" persona_description = persona_description,\n",
@@ -316,18 +297,10 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": null,
"id": "24d844f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 101 emails to emails2.json\n"
]
}
],
"outputs": [],
"source": [
"save_emails_to_json(mails_2, 'emails2.json')"
]
@@ -343,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "777012f8",
"metadata": {},
"outputs": [],
@@ -371,19 +344,10 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": null,
"id": "ce95d9c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total number of chunks: 206\n",
"Sample metadata fields: ['sender', 'timestamp', 'category']\n"
]
}
],
"outputs": [],
"source": [
"# Read in emails from the emails.json file and construct LangChain documents\n",
"\n",
@@ -427,7 +391,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": null,
"id": "a99dd2d6",
"metadata": {},
"outputs": [],
@@ -474,7 +438,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": null,
"id": "161144ac",
"metadata": {},
"outputs": [],
@@ -503,58 +467,10 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": null,
"id": "16a4d8d1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n",
"\n",
"The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"* Running on local URL: http://127.0.0.1:7878\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7878/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n"
]
}
],
"outputs": [],
"source": [
"\n",
"import gradio as gr\n",
@@ -589,14 +505,6 @@
"demo.launch(inbrowser=True)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "221a9d98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,430 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fbcdfea8-7241-46d7-a771-c0381a3e7063",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import numpy as np\n",
"import pickle\n",
"from openai import OpenAI\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from testing import Tester\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression, ElasticNet\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"import joblib\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca",
"metadata": {},
"outputs": [],
"source": [
"# CONSTANTS\n",
"\n",
"QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n",
"DB = \"products_vectorstore\"\n",
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)\n",
"\n",
"from items import Item\n",
"\n",
"with open('test.pkl', 'rb') as file:\n",
" test = pickle.load(file)\n",
"\n",
"client = chromadb.PersistentClient(path=DB)\n",
"collection = client.get_or_create_collection('products')\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"prices = [metadata['price'] for metadata in result['metadatas']]\n"
]
},
{
"cell_type": "markdown",
"id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949",
"metadata": {},
"source": [
"# Catboost GBT\n",
"\n",
"We will now train a CatBoost gradient-boosted tree model.\n",
"\n",
"Can you spot the difference from what we did in Week 6? In week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d25befe",
"metadata": {},
"outputs": [],
"source": [
"# Train a CatBoost gradient-boosted tree on the SentenceTransformer vectors\n",
"# loaded from Chroma in the setup cell above (`vectors`, `prices`).\n",
"from catboost import CatBoostRegressor\n",
"\n",
"# Initialize the model\n",
"model = CatBoostRegressor(\n",
"    iterations=1000,\n",
"    learning_rate=0.03,\n",
"    depth=6,\n",
"    loss_function='RMSE',\n",
"    verbose=100  # log training progress every 100 iterations\n",
")\n",
"\n",
"model.fit(vectors, prices)\n",
"# NOTE: the filename stays 'random_forest_model.pkl' because RandomForestAgent\n",
"# loads its model from that exact path, even though this is a CatBoost model.\n",
"joblib.dump(model, 'random_forest_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a78e1e02",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(model, test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate the four base pricing agents used by the ensemble.\n",
"from agents.specialist_agent import SpecialistAgent\n",
"from agents.frontier_agent import FrontierAgent\n",
"from agents.random_forest_agent import RandomForestAgent\n",
"from agents.my_specialist_agent import MySpecialistAgent\n",
"\n",
"specialist = SpecialistAgent()\n",
"my_specialist = MySpecialistAgent()\n",
"frontier = FrontierAgent(collection)\n",
"random_forest = RandomForestAgent()\n",
"\n",
"\n",
"def description(item):\n",
"    \"\"\"Pull the product description out of an item's training prompt.\"\"\"\n",
"    body = item.prompt.split(\"to the nearest dollar?\\n\\n\")[1]\n",
"    return body.split(\"\\n\\nPrice is $\")[0]\n",
"\n",
"\n",
"def rf(item):\n",
"    \"\"\"Price an item with the random-forest agent, from its description.\"\"\"\n",
"    return random_forest.price(description(item))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105",
"metadata": {},
"outputs": [],
"source": [
"product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"\n",
"print(specialist.price(product))\n",
"print(my_specialist.price(product))\n",
"\n",
"print(frontier.price(product))\n",
"print(random_forest.price(product))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1779b353-e2bb-4fc7-be7c-93057e4d688a",
"metadata": {},
"outputs": [],
"source": [
"# Collect predictions from each base model over a held-out slice of the test set.\n",
"specialists, my_specialists, frontiers, random_forests, prices = [], [], [], [], []\n",
"for item in tqdm(test[1040:1250]):\n",
"    text = description(item)\n",
"    specialists.append(specialist.price(text))\n",
"    my_specialists.append(my_specialist.price(text))\n",
"    frontiers.append(frontier.price(text))\n",
"    random_forests.append(random_forest.price(text))\n",
"    prices.append(item.price)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0bca725-4e34-405b-8d90-41d67086a25d",
"metadata": {},
"outputs": [],
"source": [
"# Build the ensemble feature matrix from the four base-model predictions.\n",
"# BUG FIX: the original unpacked `ms` (my_specialist) in each zip but left it\n",
"# out of min()/max(), so Min/Max ignored one predictor while Mean used all\n",
"# four. All three aggregate features now cover all four predictions.\n",
"# NOTE: any change here must be mirrored in EnsembleAgent.price's feature\n",
"# construction, or the meta-model will see skewed inputs at inference time.\n",
"mins = [min(s, ms, f, r) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"maxes = [max(s, ms, f, r) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"means = [np.mean([s, ms, f, r]) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"\n",
"X = pd.DataFrame({\n",
"    'Specialist': specialists,\n",
"    'MySpecialist': my_specialists,\n",
"    'Frontier': frontiers,\n",
"    'RandomForest': random_forests,\n",
"    'Min': mins,\n",
"    'Max': maxes,\n",
"    'Mean': means,\n",
"})\n",
"\n",
"# Target prices as a Series aligned with X's rows\n",
"y = pd.Series(prices)"
]
},
{
"cell_type": "markdown",
"id": "bdb37a84",
"metadata": {},
"source": [
"# Ensemble GBT"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1be5be8a-3e7f-42a2-be54-0c7e380f7cc4",
"metadata": {},
"outputs": [],
"source": [
"# Fit the meta-model that blends the four base predictions into one price.\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"np.random.seed(42)\n",
"\n",
"# Renamed from `lr`: this is a gradient-boosted tree, not a linear regression;\n",
"# the old name actively misled readers about the model family.\n",
"gbt = GradientBoostingRegressor(\n",
"    n_estimators=150,\n",
"    max_depth=3,\n",
"    random_state=42,\n",
"    learning_rate=0.05,\n",
"    subsample=0.8,\n",
"    min_samples_split=4,\n",
"    min_samples_leaf=2,\n",
"    max_features='sqrt'\n",
")\n",
"\n",
"gbt.fit(X, y)\n",
"\n",
"feature_columns = X.columns.tolist()\n",
"\n",
"print(\"Feature importances:\")\n",
"for feature, importance in zip(feature_columns, gbt.feature_importances_):\n",
"    print(f\"{feature}: {importance:.4f}\")\n",
"\n",
"# EnsembleAgent loads the meta-model from this path.\n",
"joblib.dump(gbt, 'ensemble_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e762441a-9470-4dd7-8a8f-ec0430e908c7",
"metadata": {},
"outputs": [],
"source": [
"from agents.ensemble_agent import EnsembleAgent\n",
"ensemble = EnsembleAgent(collection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a29f03c-8010-43b7-ae7d-1bc85ca6e8e2",
"metadata": {},
"outputs": [],
"source": [
"ensemble.price(product)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf",
"metadata": {},
"outputs": [],
"source": [
"def ensemble_pricer(item):\n",
"    \"\"\"Price an item via the ensemble, clamping negative estimates to zero.\"\"\"\n",
"    estimate = ensemble.price(description(item))\n",
"    return max(0, estimate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8397b1ef-2ea3-4af8-bb34-36594e0600cc",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(ensemble_pricer, test)"
]
},
{
"cell_type": "markdown",
"id": "29c1bcdd",
"metadata": {},
"source": [
"# More changes"
]
},
{
"cell_type": "markdown",
"id": "16c3f35f",
"metadata": {},
"source": [
"## Added my_specialist_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d8f0334",
"metadata": {},
"outputs": [],
"source": [
"import modal\n",
"from agents.agent import Agent\n",
"\n",
"\n",
"class MySpecialistAgent(Agent):\n",
"    \"\"\"\n",
"    An Agent that runs our fine-tuned LLM that's running remotely on Modal.\n",
"    \"\"\"\n",
"\n",
"    # BUG FIX: this previously reused \"Specialist Agent\", so its log lines\n",
"    # were indistinguishable from SpecialistAgent's.\n",
"    name = \"My Specialist Agent\"\n",
"    color = Agent.RED\n",
"\n",
"    def __init__(self):\n",
"        \"\"\"\n",
"        Set up this Agent by creating an instance of the modal class.\n",
"        \"\"\"\n",
"        self.log(\"My Specialist Agent is initializing - connecting to modal\")\n",
"        # Points at the custom Modal service that serves the fine-tuned model\n",
"        Pricer = modal.Cls.from_name(\"my_pricer-service\", \"Pricer\")\n",
"        self.pricer = Pricer()\n",
"        self.log(\"My Specialist Agent is ready\")\n",
"\n",
"    def price(self, description: str) -> float:\n",
"        \"\"\"\n",
"        Make a remote call to return the estimate of the price of this item.\n",
"        \"\"\"\n",
"        self.log(\"My Specialist Agent is calling remote fine-tuned model\")\n",
"        result = self.pricer.price.remote(description)\n",
"        self.log(f\"My Specialist Agent completed - predicting ${result:.2f}\")\n",
"        return result\n"
]
},
{
"cell_type": "markdown",
"id": "161c5e77",
"metadata": {},
"source": [
"## Modified ensemble_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44398889",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"import joblib\n",
"import numpy as np\n",
"from agents.agent import Agent\n",
"from agents.specialist_agent import SpecialistAgent\n",
"from agents.frontier_agent import FrontierAgent\n",
"from agents.random_forest_agent import RandomForestAgent\n",
"from agents.my_specialist_agent import MySpecialistAgent\n",
"\n",
"# BUG FIX: removed stray module-level `specialist = SpecialistAgent()` - it was\n",
"# never read (the method shadows it with a local) and opened a needless Modal\n",
"# connection every time this module was imported.\n",
"\n",
"\n",
"class EnsembleAgent(Agent):\n",
"\n",
"    name = \"Ensemble Agent\"\n",
"    color = Agent.YELLOW\n",
"\n",
"    def __init__(self, collection):\n",
"        \"\"\"\n",
"        Create an instance of Ensemble, by creating each of the models\n",
"        And loading the weights of the Ensemble\n",
"        \"\"\"\n",
"        self.log(\"Initializing Ensemble Agent\")\n",
"        self.specialist = SpecialistAgent()\n",
"        self.my_specialist = MySpecialistAgent()  # added my specialist\n",
"        self.frontier = FrontierAgent(collection)\n",
"        self.random_forest = RandomForestAgent()  # CatBoost model under the hood\n",
"        # Meta-model trained in the notebook (a GradientBoostingRegressor)\n",
"        self.model = joblib.load('ensemble_model.pkl')\n",
"        self.log(\"Ensemble Agent is ready\")\n",
"\n",
"    def price(self, description: str) -> float:\n",
"        \"\"\"\n",
"        Run this ensemble model.\n",
"        Ask each of the models to price the product,\n",
"        then use the trained meta-model to return the blended price.\n",
"        :param description: the description of a product\n",
"        :return: an estimate of its price\n",
"        \"\"\"\n",
"        self.log(\"Running Ensemble Agent - collaborating with specialist, frontier and random forest agents\")\n",
"        specialist = self.specialist.price(description)\n",
"        my_specialist = self.my_specialist.price(description)\n",
"        frontier = self.frontier.price(description)\n",
"        random_forest = self.random_forest.price(description)\n",
"        predictions = [specialist, my_specialist, frontier, random_forest]\n",
"        # BUG FIX: Min/Max previously omitted my_specialist while Mean included\n",
"        # it. All aggregates now cover all four predictions - this must mirror\n",
"        # the feature construction used when training ensemble_model.pkl.\n",
"        X = pd.DataFrame({\n",
"            'Specialist': [specialist],\n",
"            'MySpecialist': [my_specialist],\n",
"            'Frontier': [frontier],\n",
"            'RandomForest': [random_forest],\n",
"            'Min': [min(predictions)],\n",
"            'Max': [max(predictions)],\n",
"            'Mean': [np.mean(predictions)],\n",
"        })\n",
"        y = max(0, self.model.predict(X)[0])\n",
"        self.log(f\"Ensemble Agent complete - returning ${y:.2f}\")\n",
"        return y"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}