From 17f797c0248537e7f671740f08737118ba835699 Mon Sep 17 00:00:00 2001
From: Edward Donner <ed.donner@gmail.com>
Date: Wed, 30 Apr 2025 15:33:27 -0400
Subject: [PATCH] Fixed modal issue

---
 week8/day1.ipynb         | 13 ++++-----
 week8/pricer_service2.py | 63 ++++++++++++++++++----------------------
 2 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/week8/day1.ipynb b/week8/day1.ipynb
index 1c54283..d59e021 100644
--- a/week8/day1.ipynb
+++ b/week8/day1.ipynb
@@ -322,15 +322,14 @@
     "\n",
     "## A way to improve the speed of the Modal pricer service\n",
     "\n",
-    "A student mentioned to me that he was concerned by how slow Modal seems to be. The reason is that Modal puts our service to sleep if we don't use it, and then it takes 2.5 minutes to spin back up.\n",
+    "The first time you run this Modal class, it might take as long as 10 minutes to build.  \n",
+    "Subsequently it should be much faster: 30 seconds if it needs to wake up, otherwise 2 seconds.  \n",
+    "If you want it to always be 2 seconds, you can keep the container from going to sleep by adding this constant to pricer_service2.py and passing it to the `@app.cls(...)` decorator as `min_containers=MIN_CONTAINERS`:\n",
     "\n",
-    "I've added a utility called `keep_warm.py` that will keep our Modal warm by pinging it every 30 seconds.\n",
+    "`MIN_CONTAINERS = 0`\n",
     "\n",
-    "To use the utliity, bring up a new Terminal (Mac) or Anaconda prompt (Windows), ensure the environment is activated with `conda activate llms`\n",
-    "\n",
-    "Then run: `python keep_warm.py` from within the week8 drectory.\n",
-    "\n",
-    "Remember to press ctrl+C or exit the window when you no longer need Modal running.\n"
+    "Make it 1 to keep a container alive.  \n",
+    "But please note: this will eat up credits! Only do this if you are comfortable having a process running continuously.\n"
    ]
   },
   {
diff --git a/week8/pricer_service2.py b/week8/pricer_service2.py
index 031df1c..babe318 100644
--- a/week8/pricer_service2.py
+++ b/week8/pricer_service2.py
@@ -1,22 +1,13 @@
 import modal
-from pathlib import PurePosixPath
-
+from modal import App, Volume, Image
 # Setup - define our infrastructure with code!
+
 app = modal.App("pricer-service")
-secrets = [modal.Secret.from_name("huggingface-secret")]
-
-image = modal.Image.debian_slim().pip_install(
-    "huggingface", "torch", "transformers", "bitsandbytes", 
-    "accelerate", "peft", "huggingface_hub[hf_transfer]"
-).env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
-
-# This is where we cache model files to avoid redownloading each time a container is started
-hf_cache_vol = modal.Volume.from_name("hf-cache", create_if_missing=True)
+image = Image.debian_slim().pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")
+secrets = [modal.Secret.from_name("hf-secret")]
 
+# Constants
 GPU = "T4"
-# Keep N containers active to avoid cold starts
-MIN_CONTAINERS = 0
-
 BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
 PROJECT_NAME = "pricer"
 HF_USER = "ed-donner" # your HF name here! Or use mine if you just want to reproduce my results.
@@ -24,28 +15,28 @@ RUN_NAME = "2024-09-13_13.04.39"
 PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
 REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
 FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
-
-# Mount for cache location
-MODEL_DIR = PurePosixPath("/models")
-BASE_DIR = MODEL_DIR / BASE_MODEL
-FINETUNED_DIR = MODEL_DIR / FINETUNED_MODEL
+CACHE_DIR = "/cache" 
 
 QUESTION = "How much does this cost to the nearest dollar?"
 PREFIX = "Price is $"
 
-@app.cls(image=image, secrets=secrets, gpu=GPU, timeout=1800, min_containers=MIN_CONTAINERS, volumes={MODEL_DIR: hf_cache_vol})
+hf_cache_volume = Volume.from_name("hf-hub-cache", create_if_missing=True)
+
+@app.cls(
+    image=image.env({"HF_HUB_CACHE": CACHE_DIR}),
+    secrets=secrets, 
+    gpu=GPU, 
+    timeout=1800,
+    volumes={CACHE_DIR: hf_cache_volume}
+)
 class Pricer:
+
     @modal.enter()
     def setup(self):
         import torch
-        from huggingface_hub import snapshot_download
-        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
         from peft import PeftModel
-
-        # Download and cache model files to the volume
-        snapshot_download(BASE_MODEL, local_dir=BASE_DIR)
-        snapshot_download(FINETUNED_MODEL, revision=REVISION, local_dir=FINETUNED_DIR)
-
+        
         # Quant Config
         quant_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -55,22 +46,23 @@ class Pricer:
         )
 
         # Load model and tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)
+        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "right"
-        
         self.base_model = AutoModelForCausalLM.from_pretrained(
-            BASE_DIR, 
+            BASE_MODEL, 
             quantization_config=quant_config,
             device_map="auto"
         )
-    
-        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_DIR, revision=REVISION)
+        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_MODEL, revision=REVISION)
 
     @modal.method()
     def price(self, description: str) -> float:
-        import re, torch
-        from transformers import set_seed
+        import os
+        import re
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
+        from peft import PeftModel
     
         set_seed(42)
         prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
@@ -84,3 +76,6 @@ class Pricer:
         match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
         return float(match.group()) if match else 0
 
+    @modal.method()
+    def wake_up(self) -> str:
+        return "ok"
\ No newline at end of file