Updated README and Week 8 coming together
This commit is contained in:
294
week8_wip/day1.ipynb
Normal file
294
week8_wip/day1.ipynb
Normal file
@@ -0,0 +1,294 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "56297249-4a8c-4e67-b8c3-a0d8652c104e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import modal"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "0d240622-8422-4c99-8464-c04d063e4cb6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !modal setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3b133701-f550-44a1-a67f-eb7ccc4769a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from hello import app, hello"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "0f3f73ae-1295-49f3-9099-b8b41fc3429b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Hello from Seaport, New York, US!!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with app.run(show_progress=False):\n",
|
||||
" reply=hello.local()\n",
|
||||
"reply"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c1d8c6f9-edc7-4e52-9b3a-c07d7cff1ac7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Hello from Frankfurt am Main, Hesse, DE!!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with app.run(show_progress=False):\n",
|
||||
" reply=hello.remote()\n",
|
||||
"reply"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cb8b6c41-8259-4329-b1c4-a1f67d26d1be",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import modal\n",
|
||||
"from llama import app, generate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "db4a718a-d95d-4f61-9688-c9df21d88fe6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with modal.enable_output():\n",
|
||||
" with app.run():\n",
|
||||
" result=generate.remote(\"Life is a mystery, everyone must stand alone, I hear\")\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a9a6844-29ec-4264-8e72-362d976b3968",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import modal\n",
|
||||
"from pricer_ephemeral import app, price"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "50e6cf99-8959-4ae3-ba02-e325cb7fff94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with modal.enable_output():\n",
|
||||
" with app.run():\n",
|
||||
" result=price.remote(\"Quadcast HyperX condenser mic, connects via usb-c to your computer for crystal clear audio\")\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7f90d857-2f12-4521-bb90-28efd917f7d1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!modal deploy pricer_service"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1dec70ff-1986-4405-8624-9bbbe0ce1f4a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pricer = modal.Function.lookup(\"pricer-service\", \"price\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "17776139-0d9e-4ad0-bcd0-82d3a92ca61f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pricer.remote(\"Quadcast HyperX condenser mic, connects via usb-c to your computer for crystal clear audio\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "58f5d19f-8ffc-496c-832b-04e0d5892f54",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import modal"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f56d1e55-2a03-4ce2-bb47-2ab6b9175a02",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[2K\u001b[34m⠸\u001b[0m Creating objects.....\n",
|
||||
"\u001b[38;5;244m└── \u001b[0m\u001b[34m⠋\u001b[0m Creating mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py: \n",
|
||||
"\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[34m⠦\u001b[0m Creating objects...\n",
|
||||
"\u001b[38;5;244m└── \u001b[0m\u001b[34m⠸\u001b[0m Creating mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py: \n",
|
||||
"\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[34m⠏\u001b[0m Creating objects...\n",
|
||||
"\u001b[38;5;244m└── \u001b[0m\u001b[34m⠦\u001b[0m Creating mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py: \n",
|
||||
"\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[34m⠹\u001b[0m Creating objects...\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py\n",
|
||||
"\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[34m⠴\u001b[0m Creating objects...load_model_to_folder.\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.download_model_to_folder.\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.*.\n",
|
||||
"\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[34m⠴\u001b[0m Creating objects...\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.download_model_to_folder.\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.*.\n",
|
||||
"\u001b[38;5;244m└── \u001b[0m🔨 Created function Pricer.price.\n",
|
||||
"\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[32m✓\u001b[0m Created objects.\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created mount /Users/ed/dev/llm_engineering/week8/pricer_service2.py\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.download_model_to_folder.\n",
|
||||
"\u001b[38;5;244m├── \u001b[0m🔨 Created function Pricer.*.\n",
|
||||
"\u001b[38;5;244m└── \u001b[0m🔨 Created function Pricer.price.\n",
|
||||
"\u001b[32m✓\u001b[0m App deployed in 1.570s! 🎉\n",
|
||||
"\n",
|
||||
"View Deployment: \u001b[35mhttps://modal.com/apps/ed-donner/main/deployed/pricer-service\u001b[0m\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!modal deploy pricer_service2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "9e19daeb-1281-484b-9d2f-95cc6fed2622",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"133.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import modal\n",
|
||||
"\n",
|
||||
"Pricer = modal.Cls.lookup(\"pricer-service\", \"Pricer\")\n",
|
||||
"pricer = Pricer()\n",
|
||||
"reply = pricer.price.remote(\"Quadcast HyperX condenser mic, connects via usb-c to your computer for crystal clear audio\")\n",
|
||||
"print(reply)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "ac331454-21e2-4b37-9602-4667006e34ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reply = pricer.price.remote(\"iphone SE\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "c3a71dcd-b71d-4c48-b0d9-3ac296d2046a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"299.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"reply"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba9aedca-6a7b-4d30-9f64-59d76f76fb6d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
328
week8_wip/day2.0.ipynb
Normal file
328
week8_wip/day2.0.ipynb
Normal file
@@ -0,0 +1,328 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"import math\n",
|
||||
"import json\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import random\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from huggingface_hub import login\n",
|
||||
"import numpy as np\n",
|
||||
"import pickle\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"import chromadb\n",
|
||||
"from items import Item\n",
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"import plotly.graph_objects as go"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "0e31676f-6f31-465f-a80e-02d51ff8425a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# CONSTANTS\n",
|
||||
"\n",
|
||||
"HF_USER = \"ed-donner\" # your HF name here! Or use mine if you just want to reproduce my results.\n",
|
||||
"DATASET_NAME = f\"{HF_USER}/pricer-data\"\n",
|
||||
"QUESTION = \"How much does this cost to the nearest dollar?\\n\\n\"\n",
|
||||
"DB = \"products_vectorstore\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "2359ccc0-dbf2-4b1e-9473-e472b32f548b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# environment\n",
|
||||
"\n",
|
||||
"load_dotenv()\n",
|
||||
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
|
||||
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a29fcc4e-e4d7-4c54-aa6b-e5d1111ea9c4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Token is valid (permission: write).\n",
|
||||
"Your token has been saved in your configured git credential helpers (osxkeychain).\n",
|
||||
"Your token has been saved to /Users/ed/.cache/huggingface/token\n",
|
||||
"Login successful\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Log in to HuggingFace\n",
|
||||
"\n",
|
||||
"hf_token = os.environ['HF_TOKEN']\n",
|
||||
"login(hf_token, add_to_git_credential=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "688bd995-ec3e-43cd-8179-7fe14b275877",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's avoid curating all our data again! Load in the pickle files:\n",
|
||||
"\n",
|
||||
"with open('train.pkl', 'rb') as file:\n",
|
||||
" train = pickle.load(file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "2817eaf5-4302-4a18-9148-d1062e3b3dbb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"400000"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"items = train\n",
|
||||
"len(items)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client = chromadb.PersistentClient(path=DB)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Deleted existing collection: products\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check if the collection exists and delete it if it does\n",
|
||||
"collection_name = \"products\"\n",
|
||||
"existing_collection_names = [collection.name for collection in client.list_collections()]\n",
|
||||
"if collection_name in existing_collection_names:\n",
|
||||
" client.delete_collection(collection_name)\n",
|
||||
" print(f\"Deleted existing collection: {collection_name}\")\n",
|
||||
"\n",
|
||||
"collection = client.create_collection(collection_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "a87db200-d19d-44bf-acbd-15c45c70f5c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "38de1bf8-c9b5-45b4-9f4b-86af93b3f80d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def description(item):\n",
|
||||
" text = item.prompt.replace(\"How much does this cost to the nearest dollar?\\n\\n\", \"\")\n",
|
||||
" return text.split(\"\\n\\nPrice is $\")[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "8c79e2fe-1f50-4ebf-9a93-34f3088f2996",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [21:47<00:00, 3.27s/it]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in tqdm(range(0, len(items), 1000)):\n",
|
||||
" documents = [description(item) for item in items[i: i+1000]]\n",
|
||||
" vectors = model.encode(documents).astype(float).tolist()\n",
|
||||
" metadatas = [{\"category\": item.category, \"price\": item.price} for item in items[i: i+1000]]\n",
|
||||
" ids = [f\"doc_{j}\" for j in range(i, i+1000)]\n",
|
||||
" collection.add(\n",
|
||||
" ids=ids,\n",
|
||||
" documents=documents,\n",
|
||||
" embeddings=vectors,\n",
|
||||
" metadatas=metadatas\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "525fc313-8a16-4ac0-8c42-6a6d1ba1c9b8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']\n",
|
||||
"COLORS = ['red', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'cyan']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4cf1c9a-1ced-48d4-974c-3c850905034e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prework\n",
|
||||
"\n",
|
||||
"vectors_np = np.array(vectors)\n",
|
||||
"colors = [COLORS[CATEGORIES.index(t)] for t in categories]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0c6718b3-e0fd-4319-a1b5-d9d34d6b1dd9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We humans find it easier to visalize things in 2D!\n",
|
||||
"# Reduce the dimensionality of the vectors to 2D using t-SNE\n",
|
||||
"# (t-distributed stochastic neighbor embedding)\n",
|
||||
"\n",
|
||||
"tsne = TSNE(n_components=2, random_state=42)\n",
|
||||
"reduced_vectors = tsne.fit_transform(vectors_np)\n",
|
||||
"\n",
|
||||
"# Create the 2D scatter plot\n",
|
||||
"fig = go.Figure(data=[go.Scatter(\n",
|
||||
" x=reduced_vectors[:, 0],\n",
|
||||
" y=reduced_vectors[:, 1],\n",
|
||||
" mode='markers',\n",
|
||||
" marker=dict(size=3, color=colors, opacity=0.8),\n",
|
||||
" text=[f\"Category: {c}<br>Text: {d[:100]}...\" for c, d in zip(categories, descriptions)],\n",
|
||||
" hoverinfo='text'\n",
|
||||
")])\n",
|
||||
"\n",
|
||||
"fig.update_layout(\n",
|
||||
" title='2D Chroma Vector Store Visualization',\n",
|
||||
" scene=dict(xaxis_title='x',yaxis_title='y'),\n",
|
||||
" width=1200,\n",
|
||||
" height=800,\n",
|
||||
" margin=dict(r=20, b=10, l=10, t=40)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c54df150-c8d8-4bc3-8877-6759691eeb42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try 3D!\n",
|
||||
"\n",
|
||||
"tsne = TSNE(n_components=3, random_state=42)\n",
|
||||
"reduced_vectors = tsne.fit_transform(vectors_np)\n",
|
||||
"\n",
|
||||
"# Create the 3D scatter plot\n",
|
||||
"fig = go.Figure(data=[go.Scatter3d(\n",
|
||||
" x=reduced_vectors[:, 0],\n",
|
||||
" y=reduced_vectors[:, 1],\n",
|
||||
" z=reduced_vectors[:, 2],\n",
|
||||
" mode='markers',\n",
|
||||
" marker=dict(size=3, color=colors, opacity=0.7),\n",
|
||||
" text=[f\"Category: {c}<br>Text: {d[:100]}...\" for c, d in zip(categories, descriptions)],\n",
|
||||
" hoverinfo='text'\n",
|
||||
")])\n",
|
||||
"\n",
|
||||
"fig.update_layout(\n",
|
||||
" title='3D Chroma Vector Store Visualization',\n",
|
||||
" scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
|
||||
" width=1200,\n",
|
||||
" height=800,\n",
|
||||
" margin=dict(r=20, b=10, l=10, t=40)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e8fb2a63-24c5-4dce-9e63-aa208272f82d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
150
week8_wip/day2.2.ipynb
Normal file
150
week8_wip/day2.2.ipynb
Normal file
@@ -0,0 +1,150 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"import math\n",
|
||||
"import json\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import random\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from huggingface_hub import login\n",
|
||||
"import numpy as np\n",
|
||||
"import pickle\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"import chromadb\n",
|
||||
"from items import Item\n",
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"import plotly.graph_objects as go"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DB = \"products_vectorstore\"\n",
|
||||
"client = chromadb.PersistentClient(path=DB)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Deleted existing collection: products\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"collection = client.get_or_create_collection('products')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "525fc313-8a16-4ac0-8c42-6a6d1ba1c9b8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']\n",
|
||||
"COLORS = ['red', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'cyan']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4cf1c9a-1ced-48d4-974c-3c850905034e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prework\n",
|
||||
"result = collection.get(include=['embeddings', 'documents', 'metadatas'])\n",
|
||||
"vectors = np.array(result['embeddings'])\n",
|
||||
"documents = result['documents']\n",
|
||||
"categories = [metadata['category'] for metadata in result['metadatas']]\n",
|
||||
"colors = [COLORS[CATEGORIES.index(c)] for c in categories]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c54df150-c8d8-4bc3-8877-6759691eeb42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try 3D!\n",
|
||||
"\n",
|
||||
"tsne = TSNE(n_components=3, random_state=42, max_iter=250, n_jobs=-1)\n",
|
||||
"reduced_vectors = tsne.fit_transform(vectors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e8fb2a63-24c5-4dce-9e63-aa208272f82d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Create the 3D scatter plot\n",
|
||||
"fig = go.Figure(data=[go.Scatter3d(\n",
|
||||
" x=reduced_vectors[:, 0],\n",
|
||||
" y=reduced_vectors[:, 1],\n",
|
||||
" z=reduced_vectors[:, 2],\n",
|
||||
" mode='markers',\n",
|
||||
" marker=dict(size=3, color=colors, opacity=0.7),\n",
|
||||
" text=[f\"Category: {c}<br>Text: {d[:100]}...\" for c, d in zip(categories, documents)],\n",
|
||||
" hoverinfo='text'\n",
|
||||
")])\n",
|
||||
"\n",
|
||||
"fig.update_layout(\n",
|
||||
" title='3D Chroma Vector Store Visualization',\n",
|
||||
" scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
|
||||
" width=1200,\n",
|
||||
" height=800,\n",
|
||||
" margin=dict(r=20, b=10, l=10, t=40)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
618
week8_wip/day2.3.ipynb
Normal file
618
week8_wip/day2.3.ipynb
Normal file
File diff suppressed because one or more lines are too long
1257
week8_wip/day2.4.ipynb
Normal file
1257
week8_wip/day2.4.ipynb
Normal file
File diff suppressed because one or more lines are too long
19
week8_wip/hello.py
Normal file
19
week8_wip/hello.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import modal
|
||||
from modal import App, Volume, Image
|
||||
|
||||
# Setup
|
||||
|
||||
app = modal.App("hello")
|
||||
image = Image.debian_slim().pip_install("requests")
|
||||
gpu = "T4"
|
||||
|
||||
# Hello!
|
||||
|
||||
@app.function(image=image)
|
||||
def hello() -> str:
|
||||
import requests
|
||||
|
||||
response = requests.get('https://ipinfo.io/json')
|
||||
data = response.json()
|
||||
city, region, country = data['city'], data['region'], data['country']
|
||||
return f"Hello from {city}, {region}, {country}!!"
|
||||
101
week8_wip/items.py
Normal file
101
week8_wip/items.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from typing import Optional
|
||||
from transformers import AutoTokenizer
|
||||
import re
|
||||
|
||||
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
|
||||
MIN_TOKENS = 150
|
||||
MAX_TOKENS = 160
|
||||
MIN_CHARS = 300
|
||||
CEILING_CHARS = MAX_TOKENS * 7
|
||||
|
||||
class Item:
|
||||
"""
|
||||
An Item is a cleaned, curated datapoint of a Product with a Price
|
||||
"""
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
||||
PREFIX = "Price is $"
|
||||
QUESTION = "How much does this cost to the nearest dollar?"
|
||||
REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]
|
||||
|
||||
title: str
|
||||
price: float
|
||||
category: str
|
||||
token_count: int = 0
|
||||
details: Optional[str]
|
||||
prompt: Optional[str] = None
|
||||
include = False
|
||||
|
||||
def __init__(self, data, price):
|
||||
self.title = data['title']
|
||||
self.price = price
|
||||
self.parse(data)
|
||||
|
||||
def scrub_details(self):
|
||||
"""
|
||||
Clean up the details string by removing common text that doesn't add value
|
||||
"""
|
||||
details = self.details
|
||||
for remove in self.REMOVALS:
|
||||
details = details.replace(remove, "")
|
||||
return details
|
||||
|
||||
def scrub(self, stuff):
|
||||
"""
|
||||
Clean up the provided text by removing unnecessary characters and whitespace
|
||||
Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers
|
||||
"""
|
||||
stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
|
||||
stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
|
||||
words = stuff.split(' ')
|
||||
select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]
|
||||
return " ".join(select)
|
||||
|
||||
def parse(self, data):
|
||||
"""
|
||||
Parse this datapoint and if it fits within the allowed Token range,
|
||||
then set include to True
|
||||
"""
|
||||
contents = '\n'.join(data['description'])
|
||||
if contents:
|
||||
contents += '\n'
|
||||
features = '\n'.join(data['features'])
|
||||
if features:
|
||||
contents += features + '\n'
|
||||
self.details = data['details']
|
||||
if self.details:
|
||||
contents += self.scrub_details() + '\n'
|
||||
if len(contents) > MIN_CHARS:
|
||||
contents = contents[:CEILING_CHARS]
|
||||
text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
|
||||
tokens = self.tokenizer.encode(text, add_special_tokens=False)
|
||||
if len(tokens) > MIN_TOKENS:
|
||||
tokens = tokens[:MAX_TOKENS]
|
||||
text = self.tokenizer.decode(tokens)
|
||||
self.make_prompt(text)
|
||||
self.include = True
|
||||
|
||||
def make_prompt(self, text):
|
||||
"""
|
||||
Set the prompt instance variable to be a prompt appropriate for training
|
||||
"""
|
||||
self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
|
||||
self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
|
||||
self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))
|
||||
|
||||
def test_prompt(self):
|
||||
"""
|
||||
Return a prompt suitable for testing, with the actual price removed
|
||||
"""
|
||||
return self.prompt.split(self.PREFIX)[0] + self.PREFIX
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
Return a String version of this Item
|
||||
"""
|
||||
return f"<{self.title} = ${self.price}>"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
44
week8_wip/llama.py
Normal file
44
week8_wip/llama.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import modal
|
||||
from modal import App, Volume, Image
|
||||
|
||||
# Setup
|
||||
|
||||
app = modal.App("llama")
|
||||
image = Image.debian_slim().pip_install("torch", "transformers", "bitsandbytes", "accelerate")
|
||||
secrets = [modal.Secret.from_name("hf-secret")]
|
||||
GPU = "T4"
|
||||
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B"
|
||||
|
||||
|
||||
|
||||
@app.function(image=image, secrets=secrets, gpu=GPU)
|
||||
def generate(prompt: str) -> str:
|
||||
import os
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
|
||||
|
||||
# Quant Config
|
||||
quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4"
|
||||
)
|
||||
|
||||
# Load model and tokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_NAME,
|
||||
quantization_config=quant_config,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
set_seed(42)
|
||||
inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
|
||||
attention_mask = torch.ones(inputs.shape, device="cuda")
|
||||
outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
|
||||
return tokenizer.decode(outputs[0])
|
||||
66
week8_wip/pricer_ephemeral.py
Normal file
66
week8_wip/pricer_ephemeral.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import modal
|
||||
from modal import App, Volume, Image
|
||||
|
||||
# Setup
|
||||
|
||||
app = modal.App("pricer")
|
||||
image = Image.debian_slim().pip_install("torch", "transformers", "bitsandbytes", "accelerate", "peft")
|
||||
secrets = [modal.Secret.from_name("hf-secret")]
|
||||
|
||||
# Constants
|
||||
|
||||
GPU = "T4"
|
||||
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
|
||||
PROJECT_NAME = "pricer"
|
||||
HF_USER = "ed-donner" # your HF name here! Or use mine if you just want to reproduce my results.
|
||||
RUN_NAME = "2024-09-13_13.04.39"
|
||||
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
|
||||
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
|
||||
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
|
||||
|
||||
|
||||
@app.function(image=image, secrets=secrets, gpu=GPU)
|
||||
def price(description: str) -> float:
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
|
||||
from peft import PeftModel
|
||||
|
||||
QUESTION = "How much does this cost to the nearest dollar?"
|
||||
PREFIX = "Price is $"
|
||||
|
||||
prompt = f"{QUESTION}\n{description}\n{PREFIX}"
|
||||
|
||||
# Quant Config
|
||||
quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4"
|
||||
)
|
||||
|
||||
# Load model and tokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
BASE_MODEL,
|
||||
quantization_config=quant_config,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)
|
||||
|
||||
set_seed(42)
|
||||
inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
|
||||
attention_mask = torch.ones(inputs.shape, device="cuda")
|
||||
outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
|
||||
result = tokenizer.decode(outputs[0])
|
||||
|
||||
contents = result.split("Price is $")[1]
|
||||
contents = contents.replace(',','')
|
||||
match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
|
||||
return float(match.group()) if match else 0
|
||||
66
week8_wip/pricer_service.py
Normal file
66
week8_wip/pricer_service.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import modal
|
||||
from modal import App, Volume, Image
|
||||
|
||||
# Setup - define our infrastructure with code!
|
||||
|
||||
app = modal.App("pricer-service")
|
||||
image = Image.debian_slim().pip_install("torch", "transformers", "bitsandbytes", "accelerate", "peft")
|
||||
secrets = [modal.Secret.from_name("hf-secret")]
|
||||
|
||||
# Constants
|
||||
|
||||
GPU = "T4"
|
||||
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
|
||||
PROJECT_NAME = "pricer"
|
||||
HF_USER = "ed-donner" # your HF name here! Or use mine if you just want to reproduce my results.
|
||||
RUN_NAME = "2024-09-13_13.04.39"
|
||||
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
|
||||
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
|
||||
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
|
||||
|
||||
|
||||
@app.function(image=image, secrets=secrets, gpu=GPU)
|
||||
def price(description: str) -> float:
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
|
||||
from peft import PeftModel
|
||||
|
||||
QUESTION = "How much does this cost to the nearest dollar?"
|
||||
PREFIX = "Price is $"
|
||||
|
||||
prompt = f"{QUESTION}\n{description}\n{PREFIX}"
|
||||
|
||||
# Quant Config
|
||||
quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4"
|
||||
)
|
||||
|
||||
# Load model and tokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.padding_side = "right"
|
||||
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
BASE_MODEL,
|
||||
quantization_config=quant_config,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)
|
||||
|
||||
set_seed(42)
|
||||
inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
|
||||
attention_mask = torch.ones(inputs.shape, device="cuda")
|
||||
outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
|
||||
result = tokenizer.decode(outputs[0])
|
||||
|
||||
contents = result.split("Price is $")[1]
|
||||
contents = contents.replace(',','')
|
||||
match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
|
||||
return float(match.group()) if match else 0
|
||||
84
week8_wip/pricer_service2.py
Normal file
84
week8_wip/pricer_service2.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import modal
|
||||
from modal import App, Volume, Image
|
||||
|
||||
# Setup - define our infrastructure with code!
|
||||
|
||||
app = modal.App("pricer-service")
|
||||
image = Image.debian_slim().pip_install("huggingface", "torch", "transformers", "bitsandbytes", "accelerate", "peft")
|
||||
secrets = [modal.Secret.from_name("hf-secret")]
|
||||
|
||||
# Constants
|
||||
|
||||
GPU = "T4"
|
||||
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
|
||||
PROJECT_NAME = "pricer"
|
||||
HF_USER = "ed-donner" # your HF name here! Or use mine if you just want to reproduce my results.
|
||||
RUN_NAME = "2024-09-13_13.04.39"
|
||||
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
|
||||
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
|
||||
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
|
||||
|
||||
QUESTION = "How much does this cost to the nearest dollar?"
|
||||
PREFIX = "Price is $"
|
||||
|
||||
|
||||
@app.cls(image=image, secrets=secrets, gpu=GPU)
|
||||
class Pricer:
|
||||
@modal.build()
|
||||
def download_model_to_folder(self):
|
||||
from huggingface_hub import snapshot_download
|
||||
import os
|
||||
MODEL_DIR = "~/.cache/huggingface/hub/"
|
||||
os.makedirs(MODEL_DIR, exist_ok=True)
|
||||
snapshot_download(BASE_MODEL, local_dir=MODEL_DIR)
|
||||
snapshot_download(FINETUNED_MODEL, revision=REVISION, local_dir=MODEL_DIR)
|
||||
|
||||
@modal.enter()
|
||||
def setup(self):
|
||||
import os
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
|
||||
from peft import PeftModel
|
||||
|
||||
# Quant Config
|
||||
quant_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4"
|
||||
)
|
||||
|
||||
# Load model and tokenizer
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
||||
self.tokenizer.pad_token = self.tokenizer.eos_token
|
||||
self.tokenizer.padding_side = "right"
|
||||
|
||||
self.base_model = AutoModelForCausalLM.from_pretrained(
|
||||
BASE_MODEL,
|
||||
quantization_config=quant_config,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_MODEL, revision=REVISION)
|
||||
|
||||
@modal.method()
|
||||
def price(self, description: str) -> float:
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
|
||||
from peft import PeftModel
|
||||
|
||||
set_seed(42)
|
||||
prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
|
||||
inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
|
||||
attention_mask = torch.ones(inputs.shape, device="cuda")
|
||||
outputs = self.fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)
|
||||
result = self.tokenizer.decode(outputs[0])
|
||||
|
||||
contents = result.split("Price is $")[1]
|
||||
contents = contents.replace(',','')
|
||||
match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
|
||||
return float(match.group()) if match else 0
|
||||
|
||||
75
week8_wip/testing.py
Normal file
75
week8_wip/testing.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import math
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
GREEN = "\033[92m"
|
||||
YELLOW = "\033[93m"
|
||||
RED = "\033[91m"
|
||||
RESET = "\033[0m"
|
||||
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
|
||||
|
||||
class Tester:
|
||||
|
||||
def __init__(self, predictor, data, title=None, size=250):
|
||||
self.predictor = predictor
|
||||
self.data = data
|
||||
self.title = title or predictor.__name__.replace("_", " ").title()
|
||||
self.size = size
|
||||
self.guesses = []
|
||||
self.truths = []
|
||||
self.errors = []
|
||||
self.sles = []
|
||||
self.colors = []
|
||||
|
||||
def color_for(self, error, truth):
|
||||
if error<40 or error/truth < 0.2:
|
||||
return "green"
|
||||
elif error<80 or error/truth < 0.4:
|
||||
return "orange"
|
||||
else:
|
||||
return "red"
|
||||
|
||||
def run_datapoint(self, i):
|
||||
datapoint = self.data[i]
|
||||
guess = self.predictor(datapoint)
|
||||
truth = datapoint.price
|
||||
error = abs(guess - truth)
|
||||
log_error = math.log(truth+1) - math.log(guess+1)
|
||||
sle = log_error ** 2
|
||||
color = self.color_for(error, truth)
|
||||
title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+"..."
|
||||
self.guesses.append(guess)
|
||||
self.truths.append(truth)
|
||||
self.errors.append(error)
|
||||
self.sles.append(sle)
|
||||
self.colors.append(color)
|
||||
print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")
|
||||
|
||||
def chart(self, title):
|
||||
max_error = max(self.errors)
|
||||
plt.figure(figsize=(12, 8))
|
||||
max_val = max(max(self.truths), max(self.guesses))
|
||||
plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
|
||||
plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
|
||||
plt.xlabel('Ground Truth')
|
||||
plt.ylabel('Model Estimate')
|
||||
plt.xlim(0, max_val)
|
||||
plt.ylim(0, max_val)
|
||||
plt.title(title)
|
||||
plt.show()
|
||||
|
||||
def report(self):
|
||||
average_error = sum(self.errors) / self.size
|
||||
rmsle = math.sqrt(sum(self.sles) / self.size)
|
||||
hits = sum(1 for color in self.colors if color=="green")
|
||||
title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
|
||||
self.chart(title)
|
||||
|
||||
def run(self):
|
||||
self.error = 0
|
||||
for i in range(self.size):
|
||||
self.run_datapoint(i)
|
||||
self.report()
|
||||
|
||||
@classmethod
|
||||
def test(cls, function, data):
|
||||
cls(function, data).run()
|
||||
Reference in New Issue
Block a user