Fixed modal issue
@@ -322,15 +322,14 @@
     "\n",
     "## A way to improve the speed of the Modal pricer service\n",
     "\n",
-    "The first time you run this Modal class, it might take as much as 10 minutes to build.\n",
-    "Subsequently it should be much faster: about 30 seconds if it needs to wake up, otherwise 2 seconds.\n",
-    "If you want it to always be 2 seconds, you can keep the container from going to sleep by editing this constant in pricer_service2.py:\n",
-    "\n",
-    "`MIN_CONTAINERS = 0`\n",
-    "\n",
-    "Make it 1 to keep a container alive.\n",
-    "But please note: this will eat up credits! Only do this if you are comfortable having a process running continually.\n"
+    "A student mentioned to me that he was concerned by how slow Modal seems to be. The reason is that Modal puts our service to sleep if we don't use it, and then it takes 2.5 minutes to spin back up.\n",
+    "\n",
+    "I've added a utility called `keep_warm.py` that will keep our Modal warm by pinging it every 30 seconds.\n",
+    "\n",
+    "To use the utility, bring up a new Terminal (Mac) or Anaconda prompt (Windows), and ensure the environment is activated with `conda activate llms`.\n",
+    "\n",
+    "Then run: `python keep_warm.py` from within the week8 directory.\n",
+    "\n",
+    "Remember to press ctrl+C or exit the window when you no longer need Modal running.\n"
    ]
   },
   {
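The contents of `keep_warm.py` are not part of this diff. Based on the `wake_up` method added to `pricer_service2.py` below, a minimal sketch of what it could look like (assumptions: the app and class names come from the code below; on older Modal versions `modal.Cls.from_name` may be spelled `modal.Cls.lookup`):

```python
# keep_warm.py - minimal sketch, not necessarily the utility shipped with the course
import time

import modal

# Look up the deployed Pricer class on the "pricer-service" app
Pricer = modal.Cls.from_name("pricer-service", "Pricer")
pricer = Pricer()

while True:
    reply = pricer.wake_up.remote()  # any remote call keeps a container warm
    print(f"Pinged Modal: {reply}")
    time.sleep(30)  # ping every 30 seconds, per the notebook text
```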
@@ -1,22 +1,13 @@
 import modal
-from pathlib import PurePosixPath
+from modal import App, Volume, Image
 
 # Setup - define our infrastructure with code!
 
 app = modal.App("pricer-service")
-secrets = [modal.Secret.from_name("huggingface-secret")]
-
-image = modal.Image.debian_slim().pip_install(
-    "huggingface", "torch", "transformers", "bitsandbytes",
-    "accelerate", "peft", "huggingface_hub[hf_transfer]"
-).env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
-
-# This is where we cache model files to avoid redownloading each time a container is started
-hf_cache_vol = modal.Volume.from_name("hf-cache", create_if_missing=True)
+image = Image.debian_slim().pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")
+secrets = [modal.Secret.from_name("hf-secret")]
 
 # Constants
 GPU = "T4"
-# Keep N containers active to avoid cold starts
-MIN_CONTAINERS = 0
-
 BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
 PROJECT_NAME = "pricer"
 HF_USER = "ed-donner" # your HF name here! Or use mine if you just want to reproduce my results.
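Note that the secret is now referenced as `hf-secret` rather than `huggingface-secret`. If your Hugging Face token is stored under the old name, create a secret under the new name first (via the Modal dashboard, or something like `modal secret create hf-secret HF_TOKEN=...`), then redeploy with `modal deploy pricer_service2.py` to pick up these changes.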
@@ -24,28 +15,28 @@ RUN_NAME = "2024-09-13_13.04.39"
 PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
 REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
 FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
-
-# Mount for cache location
-MODEL_DIR = PurePosixPath("/models")
-BASE_DIR = MODEL_DIR / BASE_MODEL
-FINETUNED_DIR = MODEL_DIR / FINETUNED_MODEL
+CACHE_DIR = "/cache"
 
 QUESTION = "How much does this cost to the nearest dollar?"
 PREFIX = "Price is $"
 
-@app.cls(image=image, secrets=secrets, gpu=GPU, timeout=1800, min_containers=MIN_CONTAINERS, volumes={MODEL_DIR: hf_cache_vol})
+hf_cache_volume = Volume.from_name("hf-hub-cache", create_if_missing=True)
+
+@app.cls(
+    image=image.env({"HF_HUB_CACHE": CACHE_DIR}),
+    secrets=secrets,
+    gpu=GPU,
+    timeout=1800,
+    volumes={CACHE_DIR: hf_cache_volume}
+)
 class Pricer:
 
     @modal.enter()
     def setup(self):
         import torch
-        from huggingface_hub import snapshot_download
-        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
         from peft import PeftModel
-
-        # Download and cache model files to the volume
-        snapshot_download(BASE_MODEL, local_dir=BASE_DIR)
-        snapshot_download(FINETUNED_MODEL, revision=REVISION, local_dir=FINETUNED_DIR)
 
         # Quant Config
         quant_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -55,22 +46,23 @@ class Pricer:
         )
 
         # Load model and tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)
+        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "right"
 
         self.base_model = AutoModelForCausalLM.from_pretrained(
-            BASE_DIR,
+            BASE_MODEL,
             quantization_config=quant_config,
             device_map="auto"
         )
 
-        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_DIR, revision=REVISION)
+        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_MODEL, revision=REVISION)
 
     @modal.method()
     def price(self, description: str) -> float:
-        import re, torch
-        from transformers import set_seed
+        import os
+        import re
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
+        from peft import PeftModel
 
         set_seed(42)
         prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
@@ -84,3 +76,6 @@ class Pricer:
         match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
         return float(match.group()) if match else 0
 
+    @modal.method()
+    def wake_up(self) -> str:
+        return "ok"