Exercise week 8 and cleanup week 5 (no outputs)

This commit is contained in:
unknown
2025-10-27 14:21:38 +01:00
parent d6718a658f
commit fbc8e68196
2 changed files with 446 additions and 108 deletions

View File

@@ -47,21 +47,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a9aeb363",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key exists and begins sk-ant-\n",
"Google API Key exists and begins AI\n",
"OLLAMA API Key exists and begins 36\n"
]
}
],
"outputs": [],
"source": [
"# imports\n",
"\n",
@@ -120,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "2e250912",
"metadata": {},
"outputs": [],
@@ -144,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "1f67fdb3",
"metadata": {},
"outputs": [],
@@ -200,7 +189,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "cec185e3",
"metadata": {},
"outputs": [],
@@ -292,18 +281,10 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": null,
"id": "be31f352",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"now\n"
]
}
],
"outputs": [],
"source": [
"mails_2 = generate_synthetic_emails(\n",
" persona_description = persona_description,\n",
@@ -316,18 +297,10 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": null,
"id": "24d844f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 101 emails to emails2.json\n"
]
}
],
"outputs": [],
"source": [
"save_emails_to_json(mails_2, 'emails2.json')"
]
@@ -343,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "777012f8",
"metadata": {},
"outputs": [],
@@ -371,19 +344,10 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": null,
"id": "ce95d9c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total number of chunks: 206\n",
"Sample metadata fields: ['sender', 'timestamp', 'category']\n"
]
}
],
"outputs": [],
"source": [
"# Read in emails from the emails.json file and construct LangChain documents\n",
"\n",
@@ -427,7 +391,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": null,
"id": "a99dd2d6",
"metadata": {},
"outputs": [],
@@ -474,7 +438,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": null,
"id": "161144ac",
"metadata": {},
"outputs": [],
@@ -503,58 +467,10 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": null,
"id": "16a4d8d1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n",
"\n",
"The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"* Running on local URL: http://127.0.0.1:7878\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7878/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
"Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n"
]
}
],
"outputs": [],
"source": [
"\n",
"import gradio as gr\n",
@@ -589,14 +505,6 @@
"demo.launch(inbrowser=True)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "221a9d98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,430 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fbcdfea8-7241-46d7-a771-c0381a3e7063",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import numpy as np\n",
"import pickle\n",
"from openai import OpenAI\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from testing import Tester\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression, ElasticNet\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"import joblib\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca",
"metadata": {},
"outputs": [],
"source": [
"# CONSTANTS\n",
"\n",
"QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n",
"DB = \"products_vectorstore\"\n",
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)\n",
"\n",
"from items import Item\n",
"\n",
"with open('test.pkl', 'rb') as file:\n",
" test = pickle.load(file)\n",
"\n",
"client = chromadb.PersistentClient(path=DB)\n",
"collection = client.get_or_create_collection('products')\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"prices = [metadata['price'] for metadata in result['metadatas']]\n"
]
},
{
"cell_type": "markdown",
"id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949",
"metadata": {},
"source": [
"# Catboost GBT\n",
"\n",
"We will now train a CatBoost gradient-boosted tree model.\n",
"\n",
"Can you spot the difference from what we did in Week 6? In week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d25befe",
"metadata": {},
"outputs": [],
"source": [
"# Train a CatBoost gradient-boosted tree on the SentenceTransformer vectors\n",
"# loaded from Chroma in the setup cell above (`vectors`, `prices`).\n",
"from catboost import CatBoostRegressor\n",
"\n",
"# Initialize the model\n",
"model = CatBoostRegressor(\n",
"    iterations=1000,\n",
"    learning_rate=0.03,\n",
"    depth=6,\n",
"    loss_function='RMSE',\n",
"    verbose=100  # log training progress every 100 iterations\n",
")\n",
"\n",
"model.fit(vectors, prices)\n",
"# NOTE: the filename stays 'random_forest_model.pkl' because RandomForestAgent\n",
"# loads its model from that exact path, even though this is a CatBoost model.\n",
"joblib.dump(model, 'random_forest_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a78e1e02",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(model, test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate the four base pricing agents used by the ensemble.\n",
"from agents.specialist_agent import SpecialistAgent\n",
"from agents.frontier_agent import FrontierAgent\n",
"from agents.random_forest_agent import RandomForestAgent\n",
"from agents.my_specialist_agent import MySpecialistAgent\n",
"\n",
"specialist = SpecialistAgent()\n",
"my_specialist = MySpecialistAgent()\n",
"frontier = FrontierAgent(collection)\n",
"random_forest = RandomForestAgent()\n",
"\n",
"\n",
"def description(item):\n",
"    \"\"\"Pull the product description out of an item's training prompt.\"\"\"\n",
"    body = item.prompt.split(\"to the nearest dollar?\\n\\n\")[1]\n",
"    return body.split(\"\\n\\nPrice is $\")[0]\n",
"\n",
"\n",
"def rf(item):\n",
"    \"\"\"Price an item with the random-forest agent, from its description.\"\"\"\n",
"    return random_forest.price(description(item))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105",
"metadata": {},
"outputs": [],
"source": [
"product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"\n",
"print(specialist.price(product))\n",
"print(my_specialist.price(product))\n",
"\n",
"print(frontier.price(product))\n",
"print(random_forest.price(product))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1779b353-e2bb-4fc7-be7c-93057e4d688a",
"metadata": {},
"outputs": [],
"source": [
"# Collect predictions from each base model over a held-out slice of the test set.\n",
"specialists, my_specialists, frontiers, random_forests, prices = [], [], [], [], []\n",
"for item in tqdm(test[1040:1250]):\n",
"    text = description(item)\n",
"    specialists.append(specialist.price(text))\n",
"    my_specialists.append(my_specialist.price(text))\n",
"    frontiers.append(frontier.price(text))\n",
"    random_forests.append(random_forest.price(text))\n",
"    prices.append(item.price)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0bca725-4e34-405b-8d90-41d67086a25d",
"metadata": {},
"outputs": [],
"source": [
"# Build the ensemble feature matrix from the four base-model predictions.\n",
"# BUG FIX: the original unpacked `ms` (my_specialist) in each zip but left it\n",
"# out of min()/max(), so Min/Max ignored one predictor while Mean used all\n",
"# four. All three aggregate features now cover all four predictions.\n",
"# NOTE: any change here must be mirrored in EnsembleAgent.price's feature\n",
"# construction, or the meta-model will see skewed inputs at inference time.\n",
"mins = [min(s, ms, f, r) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"maxes = [max(s, ms, f, r) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"means = [np.mean([s, ms, f, r]) for s, ms, f, r in zip(specialists, my_specialists, frontiers, random_forests)]\n",
"\n",
"X = pd.DataFrame({\n",
"    'Specialist': specialists,\n",
"    'MySpecialist': my_specialists,\n",
"    'Frontier': frontiers,\n",
"    'RandomForest': random_forests,\n",
"    'Min': mins,\n",
"    'Max': maxes,\n",
"    'Mean': means,\n",
"})\n",
"\n",
"# Target prices as a Series aligned with X's rows\n",
"y = pd.Series(prices)"
]
},
{
"cell_type": "markdown",
"id": "bdb37a84",
"metadata": {},
"source": [
"# Ensemble GBT"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1be5be8a-3e7f-42a2-be54-0c7e380f7cc4",
"metadata": {},
"outputs": [],
"source": [
"# Fit the meta-model that blends the four base predictions into one price.\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"np.random.seed(42)\n",
"\n",
"# Renamed from `lr`: this is a gradient-boosted tree, not a linear regression;\n",
"# the old name actively misled readers about the model family.\n",
"gbt = GradientBoostingRegressor(\n",
"    n_estimators=150,\n",
"    max_depth=3,\n",
"    random_state=42,\n",
"    learning_rate=0.05,\n",
"    subsample=0.8,\n",
"    min_samples_split=4,\n",
"    min_samples_leaf=2,\n",
"    max_features='sqrt'\n",
")\n",
"\n",
"gbt.fit(X, y)\n",
"\n",
"feature_columns = X.columns.tolist()\n",
"\n",
"print(\"Feature importances:\")\n",
"for feature, importance in zip(feature_columns, gbt.feature_importances_):\n",
"    print(f\"{feature}: {importance:.4f}\")\n",
"\n",
"# EnsembleAgent loads the meta-model from this path.\n",
"joblib.dump(gbt, 'ensemble_model.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e762441a-9470-4dd7-8a8f-ec0430e908c7",
"metadata": {},
"outputs": [],
"source": [
"from agents.ensemble_agent import EnsembleAgent\n",
"ensemble = EnsembleAgent(collection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a29f03c-8010-43b7-ae7d-1bc85ca6e8e2",
"metadata": {},
"outputs": [],
"source": [
"ensemble.price(product)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf",
"metadata": {},
"outputs": [],
"source": [
"def ensemble_pricer(item):\n",
"    \"\"\"Price an item via the ensemble, clamping negative estimates to zero.\"\"\"\n",
"    estimate = ensemble.price(description(item))\n",
"    return max(0, estimate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8397b1ef-2ea3-4af8-bb34-36594e0600cc",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(ensemble_pricer, test)"
]
},
{
"cell_type": "markdown",
"id": "29c1bcdd",
"metadata": {},
"source": [
"# More changes"
]
},
{
"cell_type": "markdown",
"id": "16c3f35f",
"metadata": {},
"source": [
"## Added my_specialist_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d8f0334",
"metadata": {},
"outputs": [],
"source": [
"import modal\n",
"from agents.agent import Agent\n",
"\n",
"\n",
"class MySpecialistAgent(Agent):\n",
"    \"\"\"\n",
"    An Agent that runs our fine-tuned LLM that's running remotely on Modal.\n",
"    \"\"\"\n",
"\n",
"    # BUG FIX: this previously reused \"Specialist Agent\", so its log lines\n",
"    # were indistinguishable from SpecialistAgent's.\n",
"    name = \"My Specialist Agent\"\n",
"    color = Agent.RED\n",
"\n",
"    def __init__(self):\n",
"        \"\"\"\n",
"        Set up this Agent by creating an instance of the modal class.\n",
"        \"\"\"\n",
"        self.log(\"My Specialist Agent is initializing - connecting to modal\")\n",
"        # Points at the custom Modal service that serves the fine-tuned model\n",
"        Pricer = modal.Cls.from_name(\"my_pricer-service\", \"Pricer\")\n",
"        self.pricer = Pricer()\n",
"        self.log(\"My Specialist Agent is ready\")\n",
"\n",
"    def price(self, description: str) -> float:\n",
"        \"\"\"\n",
"        Make a remote call to return the estimate of the price of this item.\n",
"        \"\"\"\n",
"        self.log(\"My Specialist Agent is calling remote fine-tuned model\")\n",
"        result = self.pricer.price.remote(description)\n",
"        self.log(f\"My Specialist Agent completed - predicting ${result:.2f}\")\n",
"        return result\n"
]
},
{
"cell_type": "markdown",
"id": "161c5e77",
"metadata": {},
"source": [
"## Modified ensemble_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44398889",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"import joblib\n",
"import numpy as np\n",
"from agents.agent import Agent\n",
"from agents.specialist_agent import SpecialistAgent\n",
"from agents.frontier_agent import FrontierAgent\n",
"from agents.random_forest_agent import RandomForestAgent\n",
"from agents.my_specialist_agent import MySpecialistAgent\n",
"\n",
"# BUG FIX: removed stray module-level `specialist = SpecialistAgent()` - it was\n",
"# never read (the method shadows it with a local) and opened a needless Modal\n",
"# connection every time this module was imported.\n",
"\n",
"\n",
"class EnsembleAgent(Agent):\n",
"\n",
"    name = \"Ensemble Agent\"\n",
"    color = Agent.YELLOW\n",
"\n",
"    def __init__(self, collection):\n",
"        \"\"\"\n",
"        Create an instance of Ensemble, by creating each of the models\n",
"        And loading the weights of the Ensemble\n",
"        \"\"\"\n",
"        self.log(\"Initializing Ensemble Agent\")\n",
"        self.specialist = SpecialistAgent()\n",
"        self.my_specialist = MySpecialistAgent()  # added my specialist\n",
"        self.frontier = FrontierAgent(collection)\n",
"        self.random_forest = RandomForestAgent()  # CatBoost model under the hood\n",
"        # Meta-model trained in the notebook (a GradientBoostingRegressor)\n",
"        self.model = joblib.load('ensemble_model.pkl')\n",
"        self.log(\"Ensemble Agent is ready\")\n",
"\n",
"    def price(self, description: str) -> float:\n",
"        \"\"\"\n",
"        Run this ensemble model.\n",
"        Ask each of the models to price the product,\n",
"        then use the trained meta-model to return the blended price.\n",
"        :param description: the description of a product\n",
"        :return: an estimate of its price\n",
"        \"\"\"\n",
"        self.log(\"Running Ensemble Agent - collaborating with specialist, frontier and random forest agents\")\n",
"        specialist = self.specialist.price(description)\n",
"        my_specialist = self.my_specialist.price(description)\n",
"        frontier = self.frontier.price(description)\n",
"        random_forest = self.random_forest.price(description)\n",
"        predictions = [specialist, my_specialist, frontier, random_forest]\n",
"        # BUG FIX: Min/Max previously omitted my_specialist while Mean included\n",
"        # it. All aggregates now cover all four predictions - this must mirror\n",
"        # the feature construction used when training ensemble_model.pkl.\n",
"        X = pd.DataFrame({\n",
"            'Specialist': [specialist],\n",
"            'MySpecialist': [my_specialist],\n",
"            'Frontier': [frontier],\n",
"            'RandomForest': [random_forest],\n",
"            'Min': [min(predictions)],\n",
"            'Max': [max(predictions)],\n",
"            'Mean': [np.mean(predictions)],\n",
"        })\n",
"        y = max(0, self.model.predict(X)[0])\n",
"        self.log(f\"Ensemble Agent complete - returning ${y:.2f}\")\n",
"        return y"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}