diff --git a/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb b/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb index 8881804..c7c4e2d 100644 --- a/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb +++ b/week5/community-contributions/week5_jom/Exercise_week5_jom.ipynb @@ -47,21 +47,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "a9aeb363", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OpenAI API Key exists and begins sk-proj-\n", - "Anthropic API Key exists and begins sk-ant-\n", - "Google API Key exists and begins AI\n", - "OLLAMA API Key exists and begins 36\n" - ] - } - ], + "outputs": [], "source": [ "# imports\n", "\n", @@ -120,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "2e250912", "metadata": {}, "outputs": [], @@ -144,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "1f67fdb3", "metadata": {}, "outputs": [], @@ -200,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "cec185e3", "metadata": {}, "outputs": [], @@ -292,18 +281,10 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "id": "be31f352", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "now\n" - ] - } - ], + "outputs": [], "source": [ "mails_2 = generate_synthetic_emails(\n", " persona_description = persona_description,\n", @@ -316,18 +297,10 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "24d844f2", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saved 101 emails to emails2.json\n" - ] - } - ], + "outputs": [], "source": [ "save_emails_to_json(mails_2, 'emails2.json')" ] @@ -343,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "777012f8", "metadata": {}, "outputs": [], @@ -371,19 +344,10 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "ce95d9c7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total number of chunks: 206\n", - "Sample metadata fields: ['sender', 'timestamp', 'category']\n" - ] - } - ], + "outputs": [], "source": [ "# Read in emails from the emails.json file and construct LangChain documents\n", "\n", @@ -427,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "a99dd2d6", "metadata": {}, "outputs": [], @@ -474,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "161144ac", "metadata": {}, "outputs": [], @@ -503,58 +467,10 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "16a4d8d1", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\Javi\\Desktop\\course\\llm_engineering\\.venv\\Lib\\site-packages\\gradio\\chat_interface.py:347: UserWarning:\n", - "\n", - "The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. 
Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n", - "* Running on local URL: http://127.0.0.1:7878\n", - "* To create a public link, set `share=True` in `launch()`.\n" - ] - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n", - "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n" - ] - } - ], + "outputs": [], "source": [ "\n", "import gradio as gr\n", @@ -589,14 +505,6 @@ "demo.launch(inbrowser=True)\n", "\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "221a9d98", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/week8/community_contributions/Exercise_Week_8_jom.ipynb b/week8/community_contributions/Exercise_Week_8_jom.ipynb new file mode 100644 index 0000000..3b4be5e --- /dev/null +++ b/week8/community_contributions/Exercise_Week_8_jom.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fbcdfea8-7241-46d7-a771-c0381a3e7063", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import re\n", + "import math\n", + "import json\n", + "from tqdm import tqdm\n", + "import random\n", + "from dotenv import load_dotenv\n", + "from huggingface_hub import login\n", + "import numpy as np\n", + "import pickle\n", + "from openai import OpenAI\n", + "from sentence_transformers import SentenceTransformer\n", + "from datasets import load_dataset\n", + "import chromadb\n", + "from items import Item\n", + "from testing import Tester\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.linear_model import LinearRegression, ElasticNet\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "import joblib\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6e88bd1-f89c-4b98-92fa-aa4bc1575bca", + "metadata": {}, + "outputs": [], + "source": [ + "# CONSTANTS\n", + "\n", + "QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n", + "DB = \"products_vectorstore\"\n", + "# environment\n", + "\n", + "load_dotenv(override=True)\n", + "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n", + "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n", + "\n", + "hf_token = os.environ['HF_TOKEN']\n", + "login(hf_token, add_to_git_credential=True)\n", + "\n", + "from items import Item\n", + "\n", + "with open('test.pkl', 'rb') as file:\n", + " test = pickle.load(file)\n", + "\n", + "client = chromadb.PersistentClient(path=DB)\n", + "collection = client.get_or_create_collection('products')\n", + "result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n", + "vectors = np.array(result['embeddings'])\n", + "documents = result['documents']\n", + "prices = [metadata['price'] for metadata in result['metadatas']]\n" + ] + }, + { + "cell_type": "markdown", + "id": "bf6492cb-b11a-4ad5-859b-a71a78ffb949", + "metadata": {}, + "source": [ + "# Catboost GBT\n", + "\n", + "We will now train a Random Forest model.\n", + "\n", + "Can you spot the difference from 
what we did in Week 6? In week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Chroma, from the SentenceTransformer model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d25befe", + "metadata": {}, + "outputs": [], + "source": [ + "from catboost import CatBoostRegressor\n", + "import numpy as np\n", + "\n", + "# Initialize the model\n", + "model = CatBoostRegressor(\n", + " iterations=1000,\n", + " learning_rate=0.03,\n", + " depth=6,\n", + " loss_function='RMSE',\n", + " verbose=100\n", + ")\n", + "\n", + "model.fit(vectors, prices)\n", + "joblib.dump(model, 'random_forest_model.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a78e1e02", + "metadata": {}, + "outputs": [], + "source": [ + "Tester.test(model, test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d438dec-8e5b-4e60-bb6f-c3f82e522dd9", + "metadata": {}, + "outputs": [], + "source": [ + "from agents.specialist_agent import SpecialistAgent\n", + "from agents.frontier_agent import FrontierAgent\n", + "from agents.random_forest_agent import RandomForestAgent\n", + "from agents.my_specialist_agent import MySpecialistAgent\n", + "\n", + "specialist = SpecialistAgent()\n", + "my_specialist = MySpecialistAgent()\n", + "frontier = FrontierAgent(collection)\n", + "random_forest = RandomForestAgent()\n", + "\n", + "def description(item):\n", + " return item.prompt.split(\"to the nearest dollar?\\n\\n\")[1].split(\"\\n\\nPrice is $\")[0]\n", + "def rf(item):\n", + " return random_forest.price(description(item))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105", + "metadata": {}, + "outputs": [], + "source": [ + "product = \"Quadcast HyperX condenser mic for high quality audio for podcasting\"\n", + "print(specialist.price(product))\n", + "print(my_specialist.price(product))\n", + "\n", + "print(frontier.price(product))\n", + "print(random_forest.price(product))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1779b353-e2bb-4fc7-be7c-93057e4d688a", + "metadata": {}, + "outputs": [], + "source": [ + "specialists = []\n", + "my_specialists = []\n", + "frontiers = []\n", + "random_forests = []\n", + "prices = []\n", + "for item in tqdm(test[1040:1250]):\n", + " text = description(item)\n", + " specialists.append(specialist.price(text))\n", + " my_specialists.append(my_specialist.price(text))\n", + " frontiers.append(frontier.price(text))\n", + " random_forests.append(random_forest.price(text))\n", + " prices.append(item.price)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0bca725-4e34-405b-8d90-41d67086a25d", + "metadata": {}, + "outputs": [], + "source": [ + "mins = [min(s,f,r) for s,ms,f,r in zip(specialists, my_specialists, frontiers, random_forests)]\n", + "maxes = [max(s,f,r) for s,ms,f,r in zip(specialists, my_specialists, frontiers, random_forests)]\n", + "means = [np.mean([s,ms,f,r]) for s, ms, f, r, in zip(specialists, my_specialists, frontiers, random_forests)]\n", + "\n", + "X = pd.DataFrame({\n", + " 'Specialist': specialists,\n", + " 'MySpecialist': my_specialists,\n", + " 'Frontier': frontiers,\n", + " 'RandomForest': random_forests,\n", + " 'Min': mins,\n", + " 'Max': maxes,\n", + " 'Mean': means,\n", + "})\n", + "\n", + "# Convert y to a Series\n", + "y = pd.Series(prices)" + ] + }, + { + "cell_type": "markdown", + "id": "bdb37a84", + "metadata": {}, + "source": [ + "# Ensemble GBT" + ] 
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1be5be8a-3e7f-42a2-be54-0c7e380f7cc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import GradientBoostingRegressor\n",
+    "\n",
+    "np.random.seed(42)\n",
+    "\n",
+    "# Gradient boosting ensemble over the individual agents' predictions\n",
+    "gbr = GradientBoostingRegressor(\n",
+    "    n_estimators=150,\n",
+    "    max_depth=3,\n",
+    "    random_state=42,\n",
+    "    learning_rate=0.05,\n",
+    "    subsample=0.8,\n",
+    "    min_samples_split=4,\n",
+    "    min_samples_leaf=2,\n",
+    "    max_features='sqrt'\n",
+    ")\n",
+    "\n",
+    "gbr.fit(X, y)\n",
+    "\n",
+    "feature_columns = X.columns.tolist()\n",
+    "\n",
+    "print(\"Feature importances:\")\n",
+    "for feature, importance in zip(feature_columns, gbr.feature_importances_):\n",
+    "    print(f\"{feature}: {importance:.4f}\")\n",
+    "\n",
+    "joblib.dump(gbr, 'ensemble_model.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e762441a-9470-4dd7-8a8f-ec0430e908c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from agents.ensemble_agent import EnsembleAgent\n",
+    "ensemble = EnsembleAgent(collection)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a29f03c-8010-43b7-ae7d-1bc85ca6e8e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ensemble.price(product)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6a5e226-a508-43d5-aa42-cefbde72ffdf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ensemble_pricer(item):\n",
+    "    return max(0, ensemble.price(description(item)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8397b1ef-2ea3-4af8-bb34-36594e0600cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Tester.test(ensemble_pricer, test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29c1bcdd",
+   "metadata": {},
+   "source": [
+    "# More changes to the agents"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16c3f35f",
+   "metadata": {},
+   "source": [
+    "## Added my_specialist_agent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d8f0334",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import modal\n",
+    "from agents.agent import Agent\n",
+    "\n",
+    "\n",
+    "class MySpecialistAgent(Agent):\n",
+    "    \"\"\"\n",
+    "    An Agent that runs my custom fine-tuned LLM remotely on Modal\n",
+    "    \"\"\"\n",
+    "\n",
+    "    name = \"My Specialist Agent\"\n",
+    "    color = Agent.RED\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        \"\"\"\n",
+    "        Set up this Agent by creating an instance of the Modal class\n",
+    "        \"\"\"\n",
+    "        self.log(\"My Specialist Agent is initializing - connecting to Modal\")\n",
+    "        Pricer = modal.Cls.from_name(\"my_pricer-service\", \"Pricer\")  # points to my Modal service running the custom model\n",
+    "        self.pricer = Pricer()\n",
+    "        self.log(\"My Specialist Agent is ready\")\n",
+    "\n",
+    "    def price(self, description: str) -> float:\n",
+    "        \"\"\"\n",
+    "        Make a remote call to return the estimate of the price of this item\n",
+    "        \"\"\"\n",
+    "        self.log(\"My Specialist Agent is calling the remote fine-tuned model\")\n",
+    "        result = self.pricer.price.remote(description)\n",
+    "        self.log(f\"My Specialist Agent completed - predicting ${result:.2f}\")\n",
+    "        return result\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "161c5e77",
+   "metadata": {},
+   "source": [
+    "## Modified ensemble_agent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44398889",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import joblib\n",
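+    "# NOTE: modified version of agents/ensemble_agent.py - it now also uses MySpecialistAgent (defined above)\n",
+    "# and loads ensemble_model.pkl, the gradient boosting ensemble trained earlier in this notebook.\n",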
"import numpy as np\n", + "from agents.agent import Agent\n", + "from agents.specialist_agent import SpecialistAgent\n", + "from agents.frontier_agent import FrontierAgent\n", + "from agents.random_forest_agent import RandomForestAgent\n", + "from agents.my_specialist_agent import MySpecialistAgent\n", + "\n", + "specialist = SpecialistAgent()\n", + "\n", + "class EnsembleAgent(Agent):\n", + "\n", + " name = \"Ensemble Agent\"\n", + " color = Agent.YELLOW\n", + " \n", + " def __init__(self, collection):\n", + " \"\"\"\n", + " Create an instance of Ensemble, by creating each of the models\n", + " And loading the weights of the Ensemble\n", + " \"\"\"\n", + " self.log(\"Initializing Ensemble Agent\")\n", + " self.specialist = SpecialistAgent()\n", + " self.my_specialist = MySpecialistAgent() #added my specialist\n", + " self.frontier = FrontierAgent(collection)\n", + " self.random_forest = RandomForestAgent() #my model here is a cabtoost regularized and pruned\n", + " self.model = joblib.load('ensemble_model.pkl') #my model is actually a gbt\n", + " self.log(\"Ensemble Agent is ready\")\n", + "\n", + " def price(self, description: str) -> float:\n", + " \"\"\"\n", + " Run this ensemble model\n", + " Ask each of the models to price the product\n", + " Then use the Linear Regression model to return the weighted price\n", + " :param description: the description of a product\n", + " :return: an estimate of its price\n", + " \"\"\"\n", + " self.log(\"Running Ensemble Agent - collaborating with specialist, frontier and random forest agents\")\n", + " specialist = self.specialist.price(description)\n", + " my_specialist = self.my_specialist.price(description) #added my specialist estimate\n", + " frontier = self.frontier.price(description)\n", + " random_forest = self.random_forest.price(description)\n", + " X = pd.DataFrame({\n", + " 'Specialist': [specialist],\n", + " 'MySpecialist': [my_specialist],\n", + " 'Frontier': [frontier],\n", + " 'RandomForest': [random_forest],\n", + " 'Min': [min(specialist, frontier, random_forest)],\n", + " 'Max': [max(specialist, frontier, random_forest)],\n", + " 'Mean': [np.mean([specialist, my_specialist, frontier, random_forest])], #added the mean and myspecialist prediction.\n", + " })\n", + " y = max(0, self.model.predict(X)[0])\n", + " self.log(f\"Ensemble Agent complete - returning ${y:.2f}\")\n", + " return y" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}