Added my contributions to community-contributions

This commit is contained in:
Sameer Khadatkar
2025-06-19 10:05:55 +05:30
parent 3a8a1f2b1c
commit 8cef23aec6
26 changed files with 4850 additions and 0 deletions

View File

@@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a71ed017-e1b0-4299-88b3-f0eb05adc4df",
"metadata": {},
"source": [
"# Build UI\n",
"\n",
"We will use more advanced aspects of Gradio - building piece by piece."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "614c6202-4575-448d-98ee-78b735775d2b",
"metadata": {},
"outputs": [],
"source": [
"import gradio as gr\n",
"from deal_agent_framework import DealAgentFramework\n",
"from agents.deals import Opportunity, Deal"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0534e714-5a9c-45c6-998c-3472ac0bb8b5",
"metadata": {},
"outputs": [],
"source": [
"with gr.Blocks(title=\"Deal Intel\", fill_width=True) as ui:\n",
"\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:24px\">Deal Intel - Deal Hunting Agentic AI</div>')\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:14px\">Autonomous agent framework that finds online deals, collaborating with a proprietary fine-tuned LLM deployed on Modal, and a RAG pipeline with a frontier model and Chroma.</div>')\n",
" \n",
"\n",
"ui.launch(inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18c12c10-750c-4da3-8df5-f2bc3393f9e0",
"metadata": {},
"outputs": [],
"source": [
"# Updated to change from height to max_height due to change in Gradio v5\n",
"# With much thanks to student Ed B. for raising this\n",
"\n",
"with gr.Blocks(title=\"Deal Intel\", fill_width=True) as ui:\n",
"\n",
" initial_deal = Deal(product_description=\"Example description\", price=100.0, url=\"https://cnn.com\")\n",
" initial_opportunity = Opportunity(deal=initial_deal, estimate=200.0, discount=100.0)\n",
" opportunities = gr.State([initial_opportunity])\n",
"\n",
" def get_table(opps):\n",
" return [[opp.deal.product_description, opp.deal.price, opp.estimate, opp.discount, opp.deal.url] for opp in opps]\n",
"\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:24px\">\"Deal Intel\" - Deal Hunting Agentic AI</div>')\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:14px\">Deals surfaced so far:</div>')\n",
" with gr.Row():\n",
" opportunities_dataframe = gr.Dataframe(\n",
" headers=[\"Description\", \"Price\", \"Estimate\", \"Discount\", \"URL\"],\n",
" wrap=True,\n",
" column_widths=[4, 1, 1, 1, 2],\n",
" row_count=10,\n",
" col_count=5,\n",
" max_height=400,\n",
" )\n",
"\n",
" ui.load(get_table, inputs=[opportunities], outputs=[opportunities_dataframe])\n",
"\n",
"ui.launch(inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87106328-a17a-447e-90b9-c547613468da",
"metadata": {},
"outputs": [],
"source": [
"agent_framework = DealAgentFramework()\n",
"agent_framework.init_agents_as_needed()\n",
"\n",
"with gr.Blocks(title=\"Deal Intel\", fill_width=True) as ui:\n",
"\n",
" initial_deal = Deal(product_description=\"Example description\", price=100.0, url=\"https://cnn.com\")\n",
" initial_opportunity = Opportunity(deal=initial_deal, estimate=200.0, discount=100.0)\n",
" opportunities = gr.State([initial_opportunity])\n",
"\n",
" def get_table(opps):\n",
" return [[opp.deal.product_description, opp.deal.price, opp.estimate, opp.discount, opp.deal.url] for opp in opps]\n",
"\n",
" def do_select(opportunities, selected_index: gr.SelectData):\n",
" row = selected_index.index[0]\n",
" opportunity = opportunities[row]\n",
" agent_framework.planner.messenger.alert(opportunity)\n",
"\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:24px\">\"Deal Intel\" - Deal Hunting Agentic AI</div>')\n",
" with gr.Row():\n",
" gr.Markdown('<div style=\"text-align: center;font-size:14px\">Deals surfaced so far:</div>')\n",
" with gr.Row():\n",
" opportunities_dataframe = gr.Dataframe(\n",
" headers=[\"Description\", \"Price\", \"Estimate\", \"Discount\", \"URL\"],\n",
" wrap=True,\n",
" column_widths=[4, 1, 1, 1, 2],\n",
" row_count=10,\n",
" col_count=5,\n",
" max_height=400,\n",
" )\n",
"\n",
" ui.load(get_table, inputs=[opportunities], outputs=[opportunities_dataframe])\n",
" opportunities_dataframe.select(do_select, inputs=[opportunities], outputs=[])\n",
"\n",
"ui.launch(inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48506465-1c7a-433f-a665-b277a8b4665c",
"metadata": {},
"outputs": [],
"source": [
"!python price_is_right_final.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9dd0a27-7d46-4c9e-bbe4-a61c9c899c99",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1504cb8-7bf7-4dc4-9b1a-eaba79404aac",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ed84afd-4a04-43d6-8a3b-5143deaf96b2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,119 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "23f53670-1a73-46ba-a754-4a497e8e0e64",
"metadata": {},
"source": [
"# Messaging Agent and Planning Agent\n",
"\n",
"Then we'll put it all together into an Agent Framework."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80d683d9-9e92-44ae-af87-a413ca84db21",
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"from agents.messaging_agent import MessagingAgent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ba769cc-5301-4810-b01f-cab584cfb3b3",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"DB = \"products_vectorstore\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e05cc427-3d2c-4792-ade1-d356f95a82a9",
"metadata": {},
"outputs": [],
"source": [
"agent = MessagingAgent()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ec518f5-dae4-44b1-a185-d7eaf853ec00",
"metadata": {},
"outputs": [],
"source": [
"agent.push(\"MASSIVE NEWS!!!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57b3a014-0b15-425a-a29b-6fefc5006dee",
"metadata": {},
"outputs": [],
"source": [
"import chromadb\n",
"DB = \"products_vectorstore\"\n",
"client = chromadb.PersistentClient(path=DB)\n",
"collection = client.get_or_create_collection('products')\n",
"from agents.planning_agent import PlanningAgent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5c31c39-e357-446e-9cec-b4775c298941",
"metadata": {},
"outputs": [],
"source": [
"planner = PlanningAgent(collection)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9ac771b-ea12-41c0-a7ce-05f12e27ad9e",
"metadata": {},
"outputs": [],
"source": [
"planner.plan()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d91ac0bb-738e-4be5-9074-d583190b1e2a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,342 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fbcdfea8-7241-46d7-a771-c0381a3e7063",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pickle\n",
"from openai import OpenAI\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from testing import Tester"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98666e73-938e-469d-8987-e6e55ba5e034",
"metadata": {},
"outputs": [],
"source": [
"# environment\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a25a5cf-8f6c-4b5d-ad98-fdd096f5adf8",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc696493-0b6f-48aa-9fa8-b1ae0ecaf3cd",
"metadata": {},
"outputs": [],
"source": [
"# Load in the test pickle file\n",
"with open('test.pkl', 'rb') as file:\n",
" test = pickle.load(file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33d38a06-0c0d-4e96-94d1-35ee183416ce",
"metadata": {},
"outputs": [],
"source": [
"def make_context(similars, prices):\n",
" message = \"To provide some context, here are some other items that might be similar to the item you need to estimate.\\n\\n\"\n",
" for similar, price in zip(similars, prices):\n",
" message += f\"Potentially related product:\\n{similar}\\nPrice is ${price:.2f}\\n\\n\"\n",
" return message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61f203b7-63b6-48ed-869b-e393b5bfcad3",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(item, similars, prices):\n",
" system_message = \"You estimate prices of items. Reply only with the price, no explanation. Price is always below $1000.\"\n",
" user_prompt = make_context(similars, prices)\n",
" user_prompt += \"And now the question for you:\\n\\n\"\n",
" user_prompt += item.test_prompt().replace(\" to the nearest dollar\",\"\").replace(\"\\n\\nPrice is $\",\"\")\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt},\n",
" {\"role\": \"assistant\", \"content\": \"Price is $\"}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b26f405d-6e1f-4caa-b97f-1f62cd9d1ebc",
"metadata": {},
"outputs": [],
"source": [
"DB = \"products_vectorstore\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d26a1104-cd11-4361-ab25-85fb576e0582",
"metadata": {},
"outputs": [],
"source": [
"client = chromadb.PersistentClient(path=DB)\n",
"collection = client.get_or_create_collection('products')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e339760-96d8-4485-bec7-43fadcd30c4d",
"metadata": {},
"outputs": [],
"source": [
"def description(item):\n",
" text = item.prompt.replace(\"How much does this cost to the nearest dollar?\\n\\n\", \"\")\n",
" return text.split(\"\\n\\nPrice is $\")[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f759bd2-7a7e-4c1a-80a0-e12470feca89",
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e44dbd25-fb95-4b6b-bbbb-8da5fc817105",
"metadata": {},
"outputs": [],
"source": [
"def vector(item):\n",
" return model.encode([description(item)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffd5ee47-db5d-4263-b0d9-80d568c91341",
"metadata": {},
"outputs": [],
"source": [
"def find_similars(item):\n",
" results = collection.query(query_embeddings=vector(item).astype(float).tolist(), n_results=5)\n",
" documents = results['documents'][0][:]\n",
" prices = [m['price'] for m in results['metadatas'][0][:]]\n",
" return documents, prices"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7b9ff9-fd90-4627-bb17-7c2f7bbd21f3",
"metadata": {},
"outputs": [],
"source": [
"print(test[1].prompt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff1b2659-cc6b-47aa-a797-dd1cd3d1d6c3",
"metadata": {},
"outputs": [],
"source": [
"documents, prices = find_similars(test[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24756d4d-edac-41ce-bb80-c3b6f1cea7ee",
"metadata": {},
"outputs": [],
"source": [
"print(make_context(documents, prices))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b81eca2-0b58-4fe8-9dd6-47f13ba5f8ee",
"metadata": {},
"outputs": [],
"source": [
"print(messages_for(test[1], documents, prices))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d11f1c8d-7480-4d64-a274-b030d701f1b8",
"metadata": {},
"outputs": [],
"source": [
"def get_price(s):\n",
" s = s.replace('$','').replace(',','')\n",
" match = re.search(r\"[-+]?\\d*\\.\\d+|\\d+\", s)\n",
" return float(match.group()) if match else 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06743833-c362-47f8-b02a-139be2cd52ab",
"metadata": {},
"outputs": [],
"source": [
"get_price(\"The price for this is $99.99\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a919cf7d-b3d3-4968-8c96-54a0da0b0219",
"metadata": {},
"outputs": [],
"source": [
"# The function for gpt-4o-mini\n",
"\n",
"def gpt_4o_mini_rag(item):\n",
" documents, prices = find_similars(item)\n",
" response = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\", \n",
" messages=messages_for(item, documents, prices),\n",
" seed=42,\n",
" max_tokens=5\n",
" )\n",
" reply = response.choices[0].message.content\n",
" return get_price(reply)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b42e1b9-eaa0-4b45-a847-e8932367f596",
"metadata": {},
"outputs": [],
"source": [
"# The function for gpt-4.1\n",
"\n",
"# def gpt_4_1_rag(item):\n",
"# documents, prices = find_similars(item)\n",
"# response = openai.chat.completions.create(\n",
"# model=\"gpt-4.1\", \n",
"# messages=messages_for(item, documents, prices),\n",
"# seed=42,\n",
"# max_tokens=5\n",
"# )\n",
"# reply = response.choices[0].message.content\n",
"# return get_price(reply)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e519e26-ff15-4425-90bb-bfbf55deb39b",
"metadata": {},
"outputs": [],
"source": [
"gpt_4o_mini_rag(test[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "082c6a5a-0f2a-4941-a465-ffb3137a2e8d",
"metadata": {},
"outputs": [],
"source": [
"# gpt_4_1_rag(test[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce78741b-2966-41d2-9831-cbf8f8d176be",
"metadata": {},
"outputs": [],
"source": [
"test[1].price"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16d90455-ff7d-4f5f-8b8c-8e061263d1c7",
"metadata": {},
"outputs": [],
"source": [
"Tester.test(gpt_4o_mini_rag, test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26d5ddc6-baa6-4760-a430-05671847ac47",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0df0d850-49eb-4a0b-a27a-146969db710d",
"metadata": {},
"source": [
"# ScanningAgent\n",
"\n",
"Looks for promising deals by subscribing to RSS feeds."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3763a79-8a5a-4300-8de4-93e85475af10",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"from agents.deals import ScrapedDeal, DealSelection"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6469e32-16c3-4443-9475-ade710ef6933",
"metadata": {},
"outputs": [],
"source": [
"# Initialize and constants\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"MODEL = 'gpt-4o-mini'\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afece9db-8cd4-46be-ac57-0b472e84da7d",
"metadata": {},
"outputs": [],
"source": [
"deals = ScrapedDeal.fetch(show_progress=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cd15c4d-eb44-4601-bf0c-f945c1d8e3ec",
"metadata": {},
"outputs": [],
"source": [
"len(deals)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4259f30a-6455-49ed-8863-2f9ddd4776cb",
"metadata": {},
"outputs": [],
"source": [
"deals[44].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8100e5ac-38f5-40c1-a712-08ae12c85038",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"\"\"You identify and summarize the 5 most detailed deals from a list, by selecting deals that have the most detailed, high quality description and the most clear price.\n",
"Respond strictly in JSON with no explanation, using this format. You should provide the price as a number derived from the description. If the price of a deal isn't clear, do not include that deal in your response.\n",
"Most important is that you respond with the 5 deals that have the most detailed product description with price. It's not important to mention the terms of the deal; most important is a thorough description of the product.\n",
"Be careful with products that are described as \"$XXX off\" or \"reduced by $XXX\" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price. \n",
"\n",
"{\"deals\": [\n",
" {\n",
" \"product_description\": \"Your clearly expressed summary of the product in 4-5 sentences. Details of the item are much more important than why it's a good deal. Avoid mentioning discounts and coupons; focus on the item itself. There should be a paragpraph of text for each item you choose.\",\n",
" \"price\": 99.99,\n",
" \"url\": \"the url as provided\"\n",
" },\n",
" ...\n",
"]}\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4bca170-af71-40c9-9597-1d72980c74d8",
"metadata": {},
"outputs": [],
"source": [
"user_prompt = \"\"\"Respond with the most promising 5 deals from this list, selecting those which have the most detailed, high quality product description and a clear price.\n",
"Respond strictly in JSON, and only JSON. You should rephrase the description to be a summary of the product itself, not the terms of the deal.\n",
"Remember to respond with a paragraph of text in the product_description field for each of the 5 items that you select.\n",
"Be careful with products that are described as \"$XXX off\" or \"reduced by $XXX\" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price. \n",
"\n",
"Deals:\n",
"\n",
"\"\"\"\n",
"user_prompt += '\\n\\n'.join([deal.describe() for deal in deals])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "020947a6-561b-417b-98a0-a085e31d2ce3",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt[:2000])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7de46f74-868c-4127-8a68-cf2da7d600bb",
"metadata": {},
"outputs": [],
"source": [
"def get_recommendations():\n",
" completion = openai.beta.chat.completions.parse(\n",
" model=\"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" response_format=DealSelection\n",
" )\n",
" result = completion.choices[0].message.parsed\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c06270d-8c17-4d5a-9cfe-b6cefe788d5e",
"metadata": {},
"outputs": [],
"source": [
"result = get_recommendations()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84e62845-3338-441a-8161-c70097af4773",
"metadata": {},
"outputs": [],
"source": [
"len(result.deals)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5554a0a-ae40-4684-ad3e-faa3d22e030c",
"metadata": {},
"outputs": [],
"source": [
"result.deals[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bdc57fb-7497-47af-a643-6ba5a21cc17e",
"metadata": {},
"outputs": [],
"source": [
"from agents.scanner_agent import ScannerAgent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "132278bc-217a-43a6-b6c4-724140c6a225",
"metadata": {},
"outputs": [],
"source": [
"agent = ScannerAgent()\n",
"result = agent.scan()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e1d013a-c930-4dad-901b-41433379e14b",
"metadata": {},
"outputs": [],
"source": [
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ee2e837-1f1d-42d4-8bc4-51cccc343006",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,208 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import numpy as np\n",
"import pickle\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2359ccc0-dbf2-4b1e-9473-e472b32f548b",
"metadata": {},
"outputs": [],
"source": [
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')\n",
"DB = \"products_vectorstore\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "645167e6-cf0d-42d2-949f-1089a25a2841",
"metadata": {},
"outputs": [],
"source": [
"# Log in to HuggingFace\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "688bd995-ec3e-43cd-8179-7fe14b275877",
"metadata": {},
"outputs": [],
"source": [
"# With train.pkl in this folder\n",
"with open('train.pkl', 'rb') as file:\n",
" train = pickle.load(file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
"metadata": {},
"outputs": [],
"source": [
"client = chromadb.PersistentClient(path=DB)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
"metadata": {},
"outputs": [],
"source": [
"# Check if the collection exists and delete it if it does\n",
"collection_name = \"products\"\n",
"existing_collection_names = [collection.name for collection in client.list_collections()]\n",
"if collection_name in existing_collection_names:\n",
" client.delete_collection(collection_name)\n",
" print(f\"Deleted existing collection: {collection_name}\")\n",
"\n",
"collection = client.create_collection(collection_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a87db200-d19d-44bf-acbd-15c45c70f5c9",
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b23a025-4c35-4d3a-96ad-b956cad37b0a",
"metadata": {},
"outputs": [],
"source": [
"# Pass in a list of texts, get back a numpy array of vectors\n",
"vector = model.encode([\"Well hi there\"])[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8adde63f-e732-4f7c-bba9-f8b2a469f14e",
"metadata": {},
"outputs": [],
"source": [
"vector"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38de1bf8-c9b5-45b4-9f4b-86af93b3f80d",
"metadata": {},
"outputs": [],
"source": [
"def description(item):\n",
" text = item.prompt.replace(\"How much does this cost to the nearest dollar?\\n\\n\", \"\")\n",
" return text.split(\"\\n\\nPrice is $\")[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c1205bd-4692-44ef-8ea4-69f255354537",
"metadata": {},
"outputs": [],
"source": [
"description(train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c79e2fe-1f50-4ebf-9a93-34f3088f2996",
"metadata": {},
"outputs": [],
"source": [
"for i in tqdm(range(0, len(train), 1000)):\n",
" documents = [description(item) for item in train[i: i+1000]]\n",
" vectors = model.encode(documents).astype(float).tolist()\n",
" metadatas = [{\"category\": item.category, \"price\": item.price} for item in train[i: i+1000]]\n",
" ids = [f\"doc_{j}\" for j in range(i, i+1000)]\n",
" collection.add(\n",
" ids=ids,\n",
" documents=documents,\n",
" embeddings=vectors,\n",
" metadatas=metadatas\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a9395db-7bc9-47f9-902f-af8d380c9c09",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "745f73d9-f1a6-4e9f-96d9-1c38a1dd7559",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "bc0e1c1c-be6a-4395-bbbd-eeafc9330d7e",
"metadata": {},
"outputs": [],
"source": [
"# import modal\n",
"import modal"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d240622-8422-4c99-8464-c04d063e4cb6",
"metadata": {},
"outputs": [],
"source": [
"# !modal setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0050c070-146f-4c26-8045-5ff284761199",
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebf35de4-ef8f-4e5b-8d4e-9a1771bfbe25",
"metadata": {},
"outputs": [],
"source": [
"os.environ['PYTHONIOENCODING'] = 'utf-8'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f90d857-2f12-4521-bb90-28efd917f7d1",
"metadata": {},
"outputs": [],
"source": [
"!modal deploy pricer_service"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1dec70ff-1986-4405-8624-9bbbe0ce1f4a",
"metadata": {},
"outputs": [],
"source": [
"pricer = modal.Cls.from_name(\"pricer-service\", \"Pricer\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17776139-0d9e-4ad0-bcd0-82d3a92ca61f",
"metadata": {},
"outputs": [],
"source": [
"pricer().price.remote(\"Quadcast HyperX condenser mic, connects via usb-c to your computer for crystal clear audio\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "deb6cdf6-bcb0-49fb-8671-bb5eb22f02e3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,195 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import numpy as np\n",
"import pickle\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cc1fe53-612f-4228-aa02-8758f4c2098f",
"metadata": {},
"outputs": [],
"source": [
"MAXIMUM_DATAPOINTS = 30_000"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
"metadata": {},
"outputs": [],
"source": [
"DB = \"products_vectorstore\"\n",
"client = chromadb.PersistentClient(path=DB)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
"metadata": {},
"outputs": [],
"source": [
"collection = client.get_or_create_collection('products')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "525fc313-8a16-4ac0-8c42-6a6d1ba1c9b8",
"metadata": {},
"outputs": [],
"source": [
"CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']\n",
"COLORS = ['red', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'cyan']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4cf1c9a-1ced-48d4-974c-3c850905034e",
"metadata": {},
"outputs": [],
"source": [
"# Prework\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=MAXIMUM_DATAPOINTS)\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"categories = [metadata['category'] for metadata in result['metadatas']]\n",
"colors = [COLORS[CATEGORIES.index(c)] for c in categories]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c54df150-c8d8-4bc3-8877-6759691eeb42",
"metadata": {},
"outputs": [],
"source": [
"# Let's try a 2D chart\n",
"tsne_2d = TSNE(n_components=2, random_state=42, n_jobs=-1)\n",
"reduced_vectors_2d = tsne_2d.fit_transform(vectors)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c93457ab-d895-4d9c-8e5c-1173e2089cfd",
"metadata": {},
"outputs": [],
"source": [
"# Let's try 3D!\n",
"tsne_3d = TSNE(n_components=3, random_state=42, n_jobs=-1)\n",
"reduced_vectors_3d = tsne_3d.fit_transform(vectors)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8fb2a63-24c5-4dce-9e63-aa208272f82d",
"metadata": {},
"outputs": [],
"source": [
"# Create the 2D scatter plot\n",
"fig = go.Figure(data=[go.Scatter(\n",
" x=reduced_vectors_2d[:, 0],\n",
" y=reduced_vectors_2d[:, 1],\n",
" mode='markers',\n",
" marker=dict(size=3, color=colors, opacity=0.7),\n",
")])\n",
"\n",
"fig.update_layout(\n",
" title='2D Chroma Vectorstore Visualization',\n",
" scene=dict(xaxis_title='x', yaxis_title='y'),\n",
" width=1200,\n",
" height=800,\n",
" margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e4ae088-3d29-45d3-87a2-fea805fe2c65",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Create the 3D scatter plot\n",
"fig = go.Figure(data=[go.Scatter3d(\n",
" x=reduced_vectors_3d[:, 0],\n",
" y=reduced_vectors_3d[:, 1],\n",
" z=reduced_vectors_3d[:, 2],\n",
" mode='markers',\n",
" marker=dict(size=3, color=colors, opacity=0.7),\n",
")])\n",
"\n",
"fig.update_layout(\n",
" title='3D Chroma Vector Store Visualization',\n",
" scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),\n",
" width=1200,\n",
" height=800,\n",
" margin=dict(r=20, b=10, l=10, t=40)\n",
")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a12d1e8-7da8-401d-8c8d-ba0098096ded",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,33 @@
import logging
class Agent:
    """
    Common base class for the agents.
    Supplies the ANSI color palette and a log helper that tags every
    message with the agent's name and color so agents can be told apart
    """

    # Foreground colors
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'

    # Background color
    BG_BLACK = '\033[40m'

    # Reset code to return to default color
    RESET = '\033[0m'

    # Defaults that each concrete agent overrides
    name: str = ""
    color: str = '\033[37m'

    def log(self, message):
        """
        Send the message to the root logger at INFO level, prefixed with
        this agent's name and wrapped in its background/foreground colors
        """
        prefix = f"{self.BG_BLACK}{self.color}"
        logging.info(f"{prefix}[{self.name}] {message}{self.RESET}")

View File

@@ -0,0 +1,109 @@
from pydantic import BaseModel
from typing import List, Dict, Self
from bs4 import BeautifulSoup
import re
import feedparser
from tqdm import tqdm
import requests
import time
# RSS feed URLs (one per dealnews.com category) that ScrapedDeal.fetch iterates over
feeds = [
"https://www.dealnews.com/c142/Electronics/?rss=1",
"https://www.dealnews.com/c39/Computers/?rss=1",
"https://www.dealnews.com/c238/Automotive/?rss=1",
"https://www.dealnews.com/f1912/Smart-Home/?rss=1",
"https://www.dealnews.com/c196/Home-Garden/?rss=1",
]
def extract(html_snippet: str) -> str:
    """
    Pull the readable text out of an HTML snippet.
    When the snippet contains a <div class="snippet summary">, return that
    div's text with remaining markup stripped; otherwise return the snippet
    as given. Newlines are replaced with spaces in both cases.
    """
    summary_div = BeautifulSoup(html_snippet, 'html.parser').find('div', class_='snippet summary')
    if not summary_div:
        return html_snippet.replace('\n', ' ')
    text = summary_div.get_text(strip=True)
    # Re-parse and regex-strip to drop any tags still present in the text
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub('<[^<]+?>', '', text)
    return text.strip().replace('\n', ' ')
class ScrapedDeal:
    """
    A class to represent a Deal retrieved from an RSS feed
    """
    category: str
    title: str
    summary: str
    url: str
    details: str
    features: str

    def __init__(self, entry: Dict[str, str]):
        """
        Populate this instance based on the provided dict, then download the
        deal's page to fill in details and features
        :param entry: a feed entry providing 'title', 'summary' and 'links'
        """
        self.title = entry['title']
        self.summary = extract(entry['summary'])
        self.url = entry['links'][0]['href']
        # A timeout stops one unresponsive page from hanging the whole scrape
        stuff = requests.get(self.url, timeout=30).content
        soup = BeautifulSoup(stuff, 'html.parser')
        section = soup.find('div', class_='content-section')
        # A page without a content section yields empty details instead of an AttributeError
        content = section.get_text() if section is not None else ""
        content = content.replace('\nmore', '').replace('\n', ' ')
        if "Features" in content:
            # Split on the first occurrence only; the word may appear again in the feature text
            self.details, self.features = content.split("Features", 1)
        else:
            self.details = content
            self.features = ""

    def __repr__(self):
        """
        Return a string to describe this deal
        """
        return f"<{self.title}>"

    def describe(self):
        """
        Return a longer string to describe this deal for use in calling a model
        """
        return f"Title: {self.title}\nDetails: {self.details.strip()}\nFeatures: {self.features.strip()}\nURL: {self.url}"

    @classmethod
    def fetch(cls, show_progress : bool = False) -> List[Self]:
        """
        Retrieve all deals from the selected RSS feeds
        :param show_progress: wrap the feed loop in a tqdm progress bar
        :return: one instance per entry, taking at most 10 entries from each feed
        """
        deals = []
        feed_iter = tqdm(feeds) if show_progress else feeds
        for feed_url in feed_iter:
            feed = feedparser.parse(feed_url)
            for entry in feed.entries[:10]:
                deals.append(cls(entry))
                # Pause between page downloads
                time.sleep(0.5)
        return deals
class Deal(BaseModel):
    """
    A single deal: a summary description of the product, its price and its URL
    """
    product_description: str
    price: float
    url: str
class DealSelection(BaseModel):
    """
    A container holding a list of Deal objects
    """
    deals: List[Deal]
class Opportunity(BaseModel):
    """
    A possible opportunity: a Deal together with our estimate of what the
    product should cost and the resulting discount
    """
    deal: Deal
    estimate: float
    discount: float

View File

@@ -0,0 +1,52 @@
import pandas as pd
from sklearn.linear_model import LinearRegression
import joblib
from agents.agent import Agent
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent
from agents.gradient_boosting_agent import GradientBoostingAgent
class EnsembleAgent(Agent):
    """
    Agent that combines the estimates of the individual pricing agents
    using a saved model loaded from disk
    """

    name = "Ensemble Agent"
    color = Agent.YELLOW

    def __init__(self, collection):
        """
        Build each of the underlying pricing agents and load the saved
        ensemble model
        :param collection: the datastore collection handed to the FrontierAgent
        """
        self.log("Initializing Ensemble Agent")
        self.specialist = SpecialistAgent()
        self.frontier = FrontierAgent(collection)
        self.random_forest = RandomForestAgent()
        self.gradient_boosting = GradientBoostingAgent()
        # NOTE: joblib.load unpickles the file, so it must come from a trusted source
        self.model = joblib.load('ensemble_model.pkl')
        self.log("Ensemble Agent is ready")

    def price(self, description: str) -> float:
        """
        Ask every underlying agent for a price, then feed those prices
        (plus their min and max) to the loaded model for the final figure
        :param description: the description of a product
        :return: an estimate of its price, never below zero
        """
        self.log("Running Ensemble Agent - collaborating with specialist, frontier and random forest agents")
        estimates = {
            'Specialist': self.specialist.price(description),
            'Frontier': self.frontier.price(description),
            'RandomForest': self.random_forest.price(description),
            'GradientBoosting': self.gradient_boosting.price(description),
        }
        # NOTE(review): Min/Max cover only the first three estimates, not
        # GradientBoosting - confirm this matches how the model was trained
        core = [estimates['Specialist'], estimates['Frontier'], estimates['RandomForest']]
        features = {column: [value] for column, value in estimates.items()}
        features['Min'] = [min(core)]
        features['Max'] = [max(core)]
        X = pd.DataFrame(features)
        y = max(0, self.model.predict(X)[0])
        self.log(f"Ensemble Agent complete - returning ${y:.2f}")
        return y

View File

@@ -0,0 +1,109 @@
# imports
import os
import re
import math
import json
from typing import List, Dict
import openai
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from testing import Tester
from agents.agent import Agent
class FrontierAgent(Agent):
    """
    Agent that prices a product by asking an OpenAI chat model, giving it
    similar products looked up in the Chroma datastore as context
    """

    name = "Frontier Agent"
    color = Agent.BLUE

    MODEL = "gpt-4o-mini"

    def __init__(self, collection):
        """
        Set up this instance by connecting to OpenAI, to the Chroma Datastore,
        And setting up the vector encoding model
        :param collection: the Chroma collection queried for similar products
        """
        self.log("Initializing Frontier Agent")
        openai.api_key = os.getenv("OPENAI_API_KEY")
        self.client = OpenAI()
        self.MODEL = "gpt-4o-mini"
        self.log("Frontier Agent is setting up with OpenAI")
        self.collection = collection
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.log("Frontier Agent is ready")

    def make_context(self, similars: List[str], prices: List[float]) -> str:
        """
        Create context that can be inserted into the prompt
        :param similars: similar products to the one being estimated
        :param prices: prices of the similar products
        :return: text to insert in the prompt that provides context
        """
        header = "To provide some context, here are some other items that might be similar to the item you need to estimate.\n\n"
        # Join once instead of growing the string inside a loop
        return header + "".join(
            f"Potentially related product:\n{similar}\nPrice is ${price:.2f}\n\n"
            for similar, price in zip(similars, prices)
        )

    def messages_for(self, description: str, similars: List[str], prices: List[float]) -> List[Dict[str, str]]:
        """
        Create the message list to be included in a call to OpenAI
        With the system and user prompt
        :param description: a description of the product
        :param similars: similar products to this one
        :param prices: prices of similar products
        :return: the list of messages in the format expected by OpenAI
        """
        system_message = "You estimate prices of items. Reply only with the price, no explanation. Price is always below $1000."
        user_prompt = self.make_context(similars, prices)
        user_prompt += "And now the question for you:\n\n"
        user_prompt += "How much does this cost?\n\n" + description
        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
            # Trailing assistant turn so the reply continues straight after "Price is $"
            {"role": "assistant", "content": "Price is $"}
        ]

    def find_similars(self, description: str):
        """
        Return a list of items similar to the given one by looking in the Chroma datastore
        :param description: the product description to search with
        :return: a tuple of (documents, prices) for the 5 nearest results
        """
        self.log("Frontier Agent is performing a RAG search of the Chroma datastore to find 5 similar products")
        vector = self.model.encode([description])
        results = self.collection.query(query_embeddings=vector.astype(float).tolist(), n_results=5)
        documents = results['documents'][0][:]
        prices = [m['price'] for m in results['metadatas'][0][:]]
        self.log("Frontier Agent has found similar products")
        return documents, prices

    def get_price(self, s) -> float:
        """
        A utility that plucks a floating point number out of a string
        :param s: the model's reply; may be None or empty
        :return: the first number found, or 0.0 when there is none
        """
        # The API can return a message with no content; treat that as "no price"
        if not s:
            return 0.0
        s = s.replace('$','').replace(',','')
        match = re.search(r"[-+]?\d*\.\d+|\d+", s)
        return float(match.group()) if match else 0.0

    def price(self, description: str) -> float:
        """
        Make a call to OpenAI to estimate the price of the described product,
        by looking up 5 similar products and including them in the prompt to give context
        :param description: a description of the product
        :return: an estimate of the price
        """
        documents, prices = self.find_similars(description)
        self.log(f"Frontier Agent is about to call {self.MODEL} with context including 5 similar products")
        response = self.client.chat.completions.create(
            model=self.MODEL,
            messages=self.messages_for(description, documents, prices),
            seed=42,
            max_tokens=5
        )
        reply = response.choices[0].message.content
        result = self.get_price(reply)
        self.log(f"Frontier Agent completed - predicting ${result:.2f}")
        return result

View File

@@ -0,0 +1,37 @@
# imports
import os
import re
from typing import List
from sentence_transformers import SentenceTransformer
import joblib
from agents.agent import Agent
class GradientBoostingAgent(Agent):
    """
    Agent that prices a product from its sentence embedding using the
    model saved in gradient_boosting_model.pkl
    """

    name = "Gradient Boosting Agent"
    color = Agent.MAGENTA

    def __init__(self):
        """
        Load the SentenceTransformer vector encoding model and the saved
        model weights
        """
        self.log("Gradient Boosting Agent is initializing")
        self.vectorizer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # NOTE: joblib.load unpickles the file, so it must come from a trusted source
        self.model = joblib.load('gradient_boosting_model.pkl')
        self.log("Gradient Boosting Agent is ready")

    def price(self, description: str) -> float:
        """
        Encode the description and use the loaded model to estimate the price
        :param description: the product to be estimated
        :return: the price as a float, never below zero
        """
        self.log("Gradient Boosting Agent is starting a prediction")
        embedding = self.vectorizer.encode([description])
        prediction = self.model.predict(embedding)[0]
        result = max(0, prediction)
        self.log(f"Gradient Boosting Agent completed - predicting ${result:.2f}")
        return result

View File

@@ -0,0 +1,79 @@
import os
# from twilio.rest import Client
from agents.deals import Opportunity
import http.client
import urllib
from agents.agent import Agent
# Switches selecting which channels MessagingAgent.alert uses:
# DO_TEXT sends an SMS via Twilio, DO_PUSH sends a Pushover notification.
# Uncomment the Twilio lines if you wish to use Twilio
DO_TEXT = False
DO_PUSH = True
class MessagingAgent(Agent):
    """
    Agent that notifies the user about an Opportunity by push notification
    (Pushover) and/or SMS (Twilio), depending on DO_PUSH and DO_TEXT
    """

    name = "Messaging Agent"
    color = Agent.WHITE

    def __init__(self):
        """
        Set up this object to either do push notifications via Pushover,
        or SMS via Twilio,
        whichever is specified in the constants
        """
        self.log("Messaging Agent is initializing")
        if DO_TEXT:
            account_sid = os.getenv('TWILIO_ACCOUNT_SID', 'your-sid-if-not-using-env')
            auth_token = os.getenv('TWILIO_AUTH_TOKEN', 'your-auth-if-not-using-env')
            self.me_from = os.getenv('TWILIO_FROM', 'your-phone-number-if-not-using-env')
            self.me_to = os.getenv('MY_PHONE_NUMBER', 'your-phone-number-if-not-using-env')
            # self.client is only created once the Twilio lines are uncommented;
            # message() needs it
            # self.client = Client(account_sid, auth_token)
            self.log("Messaging Agent has initialized Twilio")
        if DO_PUSH:
            self.pushover_user = os.getenv('PUSHOVER_USER', 'your-pushover-user-if-not-using-env')
            self.pushover_token = os.getenv('PUSHOVER_TOKEN', 'your-pushover-user-if-not-using-env')
            self.log("Messaging Agent has initialized Pushover")

    def message(self, text):
        """
        Send an SMS message using the Twilio API
        :param text: the body of the SMS
        """
        self.log("Messaging Agent is sending a text message")
        self.client.messages.create(
            from_=self.me_from,
            body=text,
            to=self.me_to
        )

    def push(self, text):
        """
        Send a Push Notification using the Pushover API
        :param text: the notification message
        """
        # Import the submodule explicitly: a bare `import urllib` does not
        # guarantee that urllib.parse has been loaded
        from urllib.parse import urlencode

        self.log("Messaging Agent is sending a push notification")
        conn = http.client.HTTPSConnection("api.pushover.net:443")
        try:
            conn.request("POST", "/1/messages.json",
                urlencode({
                    "token": self.pushover_token,
                    "user": self.pushover_user,
                    "message": text,
                    "sound": "cashregister"
                }), { "Content-type": "application/x-www-form-urlencoded" })
            conn.getresponse()
        finally:
            # Release the socket even when the request fails
            conn.close()

    def alert(self, opportunity: Opportunity):
        """
        Make an alert about the specified Opportunity
        :param opportunity: the opportunity whose price, estimate, discount,
        shortened description and URL go into the message
        """
        text = f"Deal Alert! Price=${opportunity.deal.price:.2f}, "
        text += f"Estimate=${opportunity.estimate:.2f}, "
        text += f"Discount=${opportunity.discount:.2f} :"
        text += opportunity.deal.product_description[:10]+'... '
        text += opportunity.deal.url
        if DO_TEXT:
            self.message(text)
        if DO_PUSH:
            self.push(text)
        self.log("Messaging Agent has completed")

View File

@@ -0,0 +1,57 @@
from typing import Optional, List
from agents.agent import Agent
from agents.deals import ScrapedDeal, DealSelection, Deal, Opportunity
from agents.scanner_agent import ScannerAgent
from agents.ensemble_agent import EnsembleAgent
from agents.messaging_agent import MessagingAgent
class PlanningAgent(Agent):
    """
    Agent that coordinates a run: scan for deals, price them, and alert on
    the best one when its discount is large enough
    """

    name = "Planning Agent"
    color = Agent.GREEN
    # Minimum discount (estimate minus price) that triggers an alert
    DEAL_THRESHOLD = 50

    def __init__(self, collection):
        """
        Create instances of the 3 Agents that this planner coordinates across
        :param collection: the datastore collection passed to the EnsembleAgent
        """
        self.log("Planning Agent is initializing")
        self.scanner = ScannerAgent()
        self.ensemble = EnsembleAgent(collection)
        self.messenger = MessagingAgent()
        self.log("Planning Agent is ready")

    def run(self, deal: Deal) -> Opportunity:
        """
        Run the workflow for a particular deal
        :param deal: the deal, summarized from an RSS scrape
        :returns: an opportunity including the discount
        """
        self.log("Planning Agent is pricing up a potential deal")
        estimate = self.ensemble.price(deal.product_description)
        discount = estimate - deal.price
        self.log(f"Planning Agent has processed a deal with discount ${discount:.2f}")
        return Opportunity(deal=deal, estimate=estimate, discount=discount)

    def plan(self, memory: Optional[List[Opportunity]] = None) -> Optional[Opportunity]:
        """
        Run the full workflow:
        1. Use the ScannerAgent to find deals from RSS feeds
        2. Use the EnsembleAgent to estimate them
        3. Use the MessagingAgent to send a notification of deals
        :param memory: opportunities that have been surfaced in the past
        :return: an Opportunity if one was surfaced, otherwise None
        """
        self.log("Planning Agent is kicking off a run")
        if memory is None:
            memory = []
        selection = self.scanner.scan(memory=memory)
        # The scanner may return a selection whose deals were all filtered
        # out; without this guard opportunities[0] would raise IndexError
        if not selection or not selection.deals:
            return None
        opportunities = [self.run(deal) for deal in selection.deals[:5]]
        opportunities.sort(key=lambda opp: opp.discount, reverse=True)
        best = opportunities[0]
        self.log(f"Planning Agent has identified the best deal has discount ${best.discount:.2f}")
        if best.discount > self.DEAL_THRESHOLD:
            self.messenger.alert(best)
        self.log("Planning Agent has completed a run")
        return best if best.discount > self.DEAL_THRESHOLD else None

View File

@@ -0,0 +1,37 @@
# imports
import os
import re
from typing import List
from sentence_transformers import SentenceTransformer
import joblib
from agents.agent import Agent
class RandomForestAgent(Agent):
    """
    Agent that prices a product from its sentence embedding using the
    model saved in random_forest_model.pkl
    """

    name = "Random Forest Agent"
    color = Agent.MAGENTA

    def __init__(self):
        """
        Load the SentenceTransformer vector encoding model and the saved
        model weights
        """
        self.log("Random Forest Agent is initializing")
        self.vectorizer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # NOTE: joblib.load unpickles the file, so it must come from a trusted source
        self.model = joblib.load('random_forest_model.pkl')
        self.log("Random Forest Agent is ready")

    def price(self, description: str) -> float:
        """
        Encode the description and use the loaded model to estimate the price
        :param description: the product to be estimated
        :return: the price as a float, never below zero
        """
        self.log("Random Forest Agent is starting a prediction")
        embedding = self.vectorizer.encode([description])
        prediction = self.model.predict(embedding)[0]
        result = max(0, prediction)
        self.log(f"Random Forest Agent completed - predicting ${result:.2f}")
        return result

View File

@@ -0,0 +1,94 @@
import os
import json
from typing import Optional, List
from openai import OpenAI
from agents.deals import ScrapedDeal, DealSelection
from agents.agent import Agent
class ScannerAgent(Agent):
    """
    Agent that scrapes the RSS feeds for new deals and asks an OpenAI model
    to pick and summarize the ones with the clearest description and price
    """

    MODEL = "gpt-4o-mini"

    SYSTEM_PROMPT = """You identify and summarize the 5 most detailed deals from a list, by selecting deals that have the most detailed, high quality description and the most clear price.
Respond strictly in JSON with no explanation, using this format. You should provide the price as a number derived from the description. If the price of a deal isn't clear, do not include that deal in your response.
Most important is that you respond with the 5 deals that have the most detailed product description with price. It's not important to mention the terms of the deal; most important is a thorough description of the product.
Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.
{"deals": [
{
"product_description": "Your clearly expressed summary of the product in 4-5 sentences. Details of the item are much more important than why it's a good deal. Avoid mentioning discounts and coupons; focus on the item itself. There should be a paragpraph of text for each item you choose.",
"price": 99.99,
"url": "the url as provided"
},
...
]}"""

    USER_PROMPT_PREFIX = """Respond with the most promising 5 deals from this list, selecting those which have the most detailed, high quality product description and a clear price that is greater than 0.
Respond strictly in JSON, and only JSON. You should rephrase the description to be a summary of the product itself, not the terms of the deal.
Remember to respond with a paragraph of text in the product_description field for each of the 5 items that you select.
Be careful with products that are described as "$XXX off" or "reduced by $XXX" - this isn't the actual price of the product. Only respond with products when you are highly confident about the price.
Deals:
"""

    USER_PROMPT_SUFFIX = "\n\nStrictly respond in JSON and include exactly 5 deals, no more."

    name = "Scanner Agent"
    color = Agent.CYAN

    def __init__(self):
        """
        Set up this instance by initializing OpenAI
        """
        self.log("Scanner Agent is initializing")
        self.openai = OpenAI()
        self.log("Scanner Agent is ready")

    def fetch_deals(self, memory) -> List[ScrapedDeal]:
        """
        Look up deals published on RSS feeds
        Return any new deals that are not already in the memory provided
        :param memory: previously surfaced items, each exposing .deal.url
        """
        self.log("Scanner Agent is about to fetch deals from RSS feed")
        # A set gives constant-time membership checks for the filter below
        seen_urls = {opp.deal.url for opp in memory}
        scraped = ScrapedDeal.fetch()
        result = [scrape for scrape in scraped if scrape.url not in seen_urls]
        self.log(f"Scanner Agent received {len(result)} deals not already scraped")
        return result

    def make_user_prompt(self, scraped) -> str:
        """
        Create a user prompt for OpenAI based on the scraped deals provided
        """
        user_prompt = self.USER_PROMPT_PREFIX
        user_prompt += '\n\n'.join([scrape.describe() for scrape in scraped])
        user_prompt += self.USER_PROMPT_SUFFIX
        return user_prompt

    def scan(self, memory: Optional[List] = None) -> Optional[DealSelection]:
        """
        Call OpenAI to provide a high potential list of deals with good descriptions and prices
        Use StructuredOutputs to ensure it conforms to our specifications
        :param memory: previously surfaced items (each exposing .deal.url) representing deals already raised
        :return: a selection of good deals, or None if there aren't any
        """
        if memory is None:
            memory = []
        scraped = self.fetch_deals(memory)
        if not scraped:
            return None
        user_prompt = self.make_user_prompt(scraped)
        self.log("Scanner Agent is calling OpenAI using Structured Output")
        completion = self.openai.beta.chat.completions.parse(
            model=self.MODEL,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            response_format=DealSelection
        )
        result = completion.choices[0].message.parsed
        # parsed is empty when the model returned no structured output
        # (for example a refusal); report "no deals" instead of crashing
        if result is None:
            self.log("Scanner Agent received no parsed deals from OpenAI")
            return None
        result.deals = [deal for deal in result.deals if deal.price>0]
        self.log(f"Scanner Agent received {len(result.deals)} selected deals with price>0 from OpenAI")
        return result

View File

@@ -0,0 +1,29 @@
import modal
from agents.agent import Agent
class SpecialistAgent(Agent):
    """
    Agent that obtains prices from the fine-tuned LLM running remotely on Modal
    """

    name = "Specialist Agent"
    color = Agent.RED

    def __init__(self):
        """
        Look up the deployed "Pricer" class of the "pricer-service" app on
        Modal and keep an instance of it
        """
        self.log("Specialist Agent is initializing - connecting to modal")
        pricer_cls = modal.Cls.from_name("pricer-service", "Pricer")
        self.pricer = pricer_cls()
        self.log("Specialist Agent is ready")

    def price(self, description: str) -> float:
        """
        Ask the remote model for a price estimate of the described item
        """
        self.log("Specialist Agent is calling remote fine-tuned model")
        estimate = self.pricer.price.remote(description)
        self.log(f"Specialist Agent completed - predicting ${estimate:.2f}")
        return estimate

View File

@@ -0,0 +1,99 @@
import os
import sys
import logging
import json
from typing import List, Optional
from twilio.rest import Client
from dotenv import load_dotenv
import chromadb
from agents.planning_agent import PlanningAgent
from agents.deals import Opportunity
from sklearn.manifold import TSNE
import numpy as np
# Colors for logging
# ANSI escape codes that DealAgentFramework.log wraps around its messages
BG_BLUE = '\033[44m'
WHITE = '\033[37m'
RESET = '\033[0m'
# Colors for plot
# The two lists are paired by index: get_plot_data draws CATEGORIES[i] in COLORS[i]
CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']
COLORS = ['red', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'cyan']
def init_logging():
    """
    Configure the root logger to write INFO-level messages to stdout.

    Safe to call repeatedly: the stdout handler is installed only once, so
    creating several DealAgentFramework instances does not duplicate every
    log line.
    """
    handler_name = "deal-agent-framework-stdout"
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # A previous call already installed our handler - nothing more to do
    if any(existing.get_name() == handler_name for existing in root.handlers):
        return

    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(handler_name)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "[%(asctime)s] [Agents] [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S %z",
    )
    handler.setFormatter(formatter)
    root.addHandler(handler)
class DealAgentFramework:
    """
    Top-level coordinator: owns the vector store collection, the persisted
    memory of surfaced opportunities, and the lazily created PlanningAgent
    """

    DB = "products_vectorstore"
    MEMORY_FILENAME = "memory.json"

    def __init__(self):
        """
        Set up logging and environment variables, open the 'products'
        collection and load the saved memory. Agents are created later.
        """
        init_logging()
        load_dotenv()
        client = chromadb.PersistentClient(path=self.DB)
        self.memory = self.read_memory()
        self.collection = client.get_or_create_collection('products')
        self.planner = None

    def init_agents_as_needed(self):
        """
        Create the PlanningAgent on first use; do nothing if it already exists
        """
        if self.planner:
            return
        self.log("Initializing Agent Framework")
        self.planner = PlanningAgent(self.collection)
        self.log("Agent Framework is ready")

    def read_memory(self) -> List[Opportunity]:
        """
        Load the saved opportunities from MEMORY_FILENAME, or return an
        empty list when the file does not exist
        """
        if not os.path.exists(self.MEMORY_FILENAME):
            return []
        with open(self.MEMORY_FILENAME, "r") as file:
            records = json.load(file)
        return [Opportunity(**record) for record in records]

    def write_memory(self) -> None:
        """
        Save the current memory to MEMORY_FILENAME as indented JSON
        """
        serialized = [opportunity.dict() for opportunity in self.memory]
        with open(self.MEMORY_FILENAME, "w") as file:
            json.dump(serialized, file, indent=2)

    def log(self, message: str):
        """
        Log a message at INFO level, tagged and colored for the framework
        """
        logging.info(f"{BG_BLUE}{WHITE}[Agent Framework] {message}{RESET}")

    def run(self) -> List[Opportunity]:
        """
        Run the planner once; if it surfaces an opportunity, add it to the
        memory and persist it
        :return: the full memory after the run
        """
        self.init_agents_as_needed()
        logging.info("Kicking off Planning Agent")
        result = self.planner.plan(memory=self.memory)
        logging.info(f"Planning Agent has completed and returned: {result}")
        if result:
            self.memory.append(result)
            self.write_memory()
        return self.memory

    @classmethod
    def get_plot_data(cls, max_datapoints=10000):
        """
        Fetch up to max_datapoints items from the 'products' collection and
        reduce their embeddings to 3 dimensions with t-SNE
        :return: a tuple of (documents, reduced vectors, per-point colors)
        """
        collection = chromadb.PersistentClient(path=cls.DB).get_or_create_collection('products')
        result = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=max_datapoints)
        vectors = np.array(result['embeddings'])
        documents = result['documents']
        # Color each point by the position of its category in CATEGORIES
        colors = [COLORS[CATEGORIES.index(metadata['category'])] for metadata in result['metadatas']]
        reduced_vectors = TSNE(n_components=3, random_state=42, n_jobs=-1).fit_transform(vectors)
        return documents, reduced_vectors, colors
if __name__ == "__main__":
    # Script entry point: perform a single framework run
    DealAgentFramework().run()

View File

@@ -0,0 +1,101 @@
from typing import Optional
from transformers import AutoTokenizer
import re
# Model whose tokenizer Item uses to count and truncate tokens
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
# Item.parse only includes items whose text has more than MIN_TOKENS tokens,
# and truncates the text to MAX_TOKENS tokens
MIN_TOKENS = 150
MAX_TOKENS = 160
# Item.parse skips items whose combined contents are not longer than MIN_CHARS,
# and cuts contents to CEILING_CHARS characters before tokenizing
MIN_CHARS = 300
CEILING_CHARS = MAX_TOKENS * 7
class Item:
    """
    An Item is a cleaned, curated datapoint of a Product with a Price
    """

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    PREFIX = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    # Boilerplate fragments stripped from the details text by scrub_details
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', "By Manufacturer", "Item", "Date First", "Package", ":", "Number of", "Best Sellers", "Number", "Product "]

    title: str
    price: float
    category: str
    token_count: int = 0
    details: Optional[str]
    prompt: Optional[str] = None
    # Set to True by parse() when the item has enough text to be used
    include = False

    def __init__(self, data, price):
        """
        Build the item from a raw datapoint and its price
        :param data: mapping providing 'title', 'description', 'features' and 'details'
        :param price: the price of the product
        """
        self.title = data['title']
        self.price = price
        self.parse(data)

    def scrub_details(self):
        """
        Clean up the details string by removing common text that doesn't add value
        """
        details = self.details
        for remove in self.REMOVALS:
            details = details.replace(remove, "")
        return details

    def scrub(self, stuff):
        """
        Clean up the provided text by removing unnecessary characters and whitespace
        Also remove words that are 7+ chars and contain numbers, as these are likely irrelevant product numbers
        """
        stuff = re.sub(r'[:\[\]"{}【】\s]+', ' ', stuff).strip()
        stuff = stuff.replace(" ,", ",").replace(",,,",",").replace(",,",",")
        words = stuff.split(' ')
        select = [word for word in words if len(word)<7 or not any(char.isdigit() for char in word)]
        return " ".join(select)

    def parse(self, data):
        """
        Parse this datapoint and if it fits within the allowed Token range,
        then set include to True
        """
        contents = '\n'.join(data['description'])
        if contents:
            contents += '\n'
        features = '\n'.join(data['features'])
        if features:
            contents += features + '\n'
        self.details = data['details']
        if self.details:
            contents += self.scrub_details() + '\n'
        if len(contents) > MIN_CHARS:
            contents = contents[:CEILING_CHARS]
            text = f"{self.scrub(self.title)}\n{self.scrub(contents)}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) > MIN_TOKENS:
                tokens = tokens[:MAX_TOKENS]
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)
                self.include = True

    def make_prompt(self, text):
        """
        Set the prompt instance variable to be a prompt appropriate for training
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PREFIX}{str(round(self.price))}.00"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """
        Return a prompt suitable for testing, with the actual price removed
        """
        # Cut at the LAST prefix: the product text itself may contain
        # "Price is $", and splitting on the first one would truncate it
        return self.prompt.rsplit(self.PREFIX, 1)[0] + self.PREFIX

    def __repr__(self):
        """
        Return a String version of this Item
        """
        return f"<{self.title} = ${self.price}>"

View File

@@ -0,0 +1,35 @@
# Foreground colors
RED = '\033[31m'
GREEN = '\033[32m'
YELLOW = '\033[33m'
BLUE = '\033[34m'
MAGENTA = '\033[35m'
CYAN = '\033[36m'
WHITE = '\033[37m'
# Background color
BG_BLACK = '\033[40m'
BG_BLUE = '\033[44m'
# Reset code to return to default color
RESET = '\033[0m'
# Maps each ANSI background+foreground prefix to the hex color that
# reformat() uses for the corresponding HTML <span>
mapper = {
BG_BLACK+RED: "#dd0000",
BG_BLACK+GREEN: "#00dd00",
BG_BLACK+YELLOW: "#dddd00",
BG_BLACK+BLUE: "#0000ee",
BG_BLACK+MAGENTA: "#aa00dd",
BG_BLACK+CYAN: "#00dddd",
BG_BLACK+WHITE: "#87CEEB",
BG_BLUE+WHITE: "#ff7800"
}
def reformat(message):
    """
    Convert ANSI color codes in a log message to HTML: every color prefix
    listed in mapper becomes an opening <span> with the mapped color, and
    every reset code becomes a closing </span>
    """
    for ansi_prefix in mapper:
        opening_tag = f'<span style="color: {mapper[ansi_prefix]}">'
        message = message.replace(ansi_prefix, opening_tag)
    return message.replace(RESET, '</span>')

View File

@@ -0,0 +1,62 @@
import gradio as gr
from deal_agent_framework import DealAgentFramework
from agents.deals import Opportunity, Deal
class App:
    """
    Minimal Gradio front end: a table of the deals surfaced so far that is
    refreshed by running the agent framework on a timer
    """

    def __init__(self):
        # The framework is created by the page-load callback, not here
        self.agent_framework = None

    def run(self):
        """
        Build the Gradio UI, wire up its events and launch it in the browser
        """
        with gr.Blocks(title="Deal Intel", fill_width=True) as ui:

            def table_for(opps):
                # One row per opportunity, in the column order of the Dataframe headers
                rows = []
                for opp in opps:
                    rows.append([
                        opp.deal.product_description,
                        f"${opp.deal.price:.2f}",
                        f"${opp.estimate:.2f}",
                        f"${opp.discount:.2f}",
                        opp.deal.url,
                    ])
                return rows

            def start():
                # Page load: create the framework and show what is already in memory
                self.agent_framework = DealAgentFramework()
                self.agent_framework.init_agents_as_needed()
                return table_for(self.agent_framework.memory)

            def go():
                # Timer tick: run the agents once, then show the updated memory
                self.agent_framework.run()
                return table_for(self.agent_framework.memory)

            def do_select(selected_index: gr.SelectData):
                # Clicking a row sends an alert for that opportunity
                row_number = selected_index.index[0]
                chosen = self.agent_framework.memory[row_number]
                self.agent_framework.planner.messenger.alert(chosen)

            with gr.Row():
                gr.Markdown('<div style="text-align: center;font-size:24px">"Deal Intel" - Deal Hunting Agentic AI</div>')
            with gr.Row():
                gr.Markdown('<div style="text-align: center;font-size:14px">Autonomous agent framework that finds online deals, collaborating with a proprietary fine-tuned LLM deployed on Modal, and a RAG pipeline with a frontier model and Chroma.</div>')
            with gr.Row():
                gr.Markdown('<div style="text-align: center;font-size:14px">Deals surfaced so far:</div>')
            with gr.Row():
                opportunities_dataframe = gr.Dataframe(
                    headers=["Description", "Price", "Estimate", "Discount", "URL"],
                    wrap=True,
                    column_widths=[4, 1, 1, 1, 2],
                    row_count=10,
                    col_count=5,
                    max_height=400,
                )

            ui.load(start, inputs=[], outputs=[opportunities_dataframe])

            timer = gr.Timer(value=60)
            timer.tick(go, inputs=[], outputs=[opportunities_dataframe])

            opportunities_dataframe.select(do_select)

        ui.launch(share=False, inbrowser=True)
if __name__ == "__main__":
    # Script entry point: build and launch the UI
    App().run()

View File

@@ -0,0 +1,166 @@
import logging
import queue
import threading
import time
import gradio as gr
from deal_agent_framework import DealAgentFramework
from agents.deals import Opportunity, Deal
from log_utils import reformat
import plotly.graph_objects as go
class QueueHandler(logging.Handler):
    """
    Logging handler that puts each formatted record onto a queue so another
    thread can pick the messages up
    """

    def __init__(self, log_queue):
        super().__init__()
        self.log_queue = log_queue

    def emit(self, record):
        # Format with this handler's formatter, then hand the text to the consumer
        formatted = self.format(record)
        self.log_queue.put(formatted)
def html_for(log_data):
    """
    Render the most recent 18 log lines as a scrollable HTML panel, with
    the lines separated by <br>
    """
    recent_lines = log_data[-18:]
    output = '<br>'.join(recent_lines)
    return f"""
<div id="scrollContent" style="height: 400px; overflow-y: auto; border: 1px solid #ccc; background-color: #222229; padding: 10px;">
{output}
</div>
"""
def setup_logging(log_queue):
    """
    Attach a QueueHandler feeding log_queue to the root logger and set the
    root logger's level to INFO
    """
    queue_handler = QueueHandler(log_queue)
    queue_handler.setFormatter(logging.Formatter(
        "[%(asctime)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S %z",
    ))
    root_logger = logging.getLogger()
    root_logger.addHandler(queue_handler)
    root_logger.setLevel(logging.INFO)
class App:
    """
    Full Gradio front end: a table of deals, a live log panel and a 3D plot
    of the vector store, refreshed by running the agent framework on a timer
    """

    def __init__(self):
        # Created lazily by get_agent_framework
        self.agent_framework = None

    def get_agent_framework(self):
        """
        Return the shared DealAgentFramework, creating it and its agents on first use
        """
        if not self.agent_framework:
            self.agent_framework = DealAgentFramework()
            self.agent_framework.init_agents_as_needed()
        return self.agent_framework

    def run(self):
        """
        Build the Gradio UI, wire up its events and launch it in the browser
        """
        with gr.Blocks(title="Deal Intel", fill_width=True) as ui:

            log_data = gr.State([])

            def table_for(opps):
                # One row per opportunity, in the column order of the Dataframe headers
                return [[opp.deal.product_description, f"${opp.deal.price:.2f}", f"${opp.estimate:.2f}", f"${opp.discount:.2f}", opp.deal.url] for opp in opps]

            def update_output(log_data, log_queue, result_queue):
                # Stream (log lines, log HTML, table) updates until the worker
                # has delivered its result and both queues are drained
                initial_result = table_for(self.get_agent_framework().memory)
                final_result = None
                while True:
                    try:
                        message = log_queue.get_nowait()
                        log_data.append(reformat(message))
                        yield log_data, html_for(log_data), final_result or initial_result
                    except queue.Empty:
                        try:
                            final_result = result_queue.get_nowait()
                            yield log_data, html_for(log_data), final_result or initial_result
                        except queue.Empty:
                            if final_result is not None:
                                break
                            time.sleep(0.1)

            def get_initial_plot():
                # Placeholder figure; not currently wired to any component
                fig = go.Figure()
                fig.update_layout(
                    title='Loading vector DB...',
                    height=400,
                )
                return fig

            def get_plot():
                documents, vectors, colors = DealAgentFramework.get_plot_data(max_datapoints=1000)
                # Create the 3D scatter plot
                fig = go.Figure(data=[go.Scatter3d(
                    x=vectors[:, 0],
                    y=vectors[:, 1],
                    z=vectors[:, 2],
                    mode='markers',
                    marker=dict(size=2, color=colors, opacity=0.7),
                )])

                fig.update_layout(
                    scene=dict(xaxis_title='x',
                               yaxis_title='y',
                               zaxis_title='z',
                               aspectmode='manual',
                               aspectratio=dict(x=2.2, y=2.2, z=1),  # x and y axes drawn 2.2 times as long as z
                               camera=dict(
                                   eye=dict(x=1.6, y=1.6, z=0.8)  # Adjust camera position
                               )),
                    height=400,
                    margin=dict(r=5, b=1, l=5, t=2)
                )
                return fig

            def do_run():
                new_opportunities = self.get_agent_framework().run()
                table = table_for(new_opportunities)
                return table

            def run_with_logging(initial_log_data):
                log_queue = queue.Queue()
                result_queue = queue.Queue()
                setup_logging(log_queue)

                def worker():
                    try:
                        result = do_run()
                    except Exception:
                        # Always deliver a result: otherwise update_output
                        # would keep polling forever after a failed run
                        logging.exception("Agent run failed")
                        result = []
                    result_queue.put(result)

                thread = threading.Thread(target=worker)
                thread.start()

                try:
                    for log_data, output, final_result in update_output(initial_log_data, log_queue, result_queue):
                        yield log_data, output, final_result
                finally:
                    # Detach this run's handler; without this a new handler
                    # (and its never-drained queue) piles up on the root
                    # logger on every page load and timer tick
                    root_logger = logging.getLogger()
                    for handler in list(root_logger.handlers):
                        if getattr(handler, "log_queue", None) is log_queue:
                            root_logger.removeHandler(handler)

            def do_select(selected_index: gr.SelectData):
                # Clicking a row sends an alert for that opportunity
                opportunities = self.get_agent_framework().memory
                row = selected_index.index[0]
                opportunity = opportunities[row]
                self.get_agent_framework().planner.messenger.alert(opportunity)

            with gr.Row():
                gr.Markdown('<div style="text-align: center;font-size:24px"><strong>Deal Intel</strong> - Autonomous Agent Framework that hunts for deals</div>')
            with gr.Row():
                gr.Markdown('<div style="text-align: center;font-size:14px">A proprietary fine-tuned LLM deployed on Modal and a RAG pipeline with a frontier model collaborate to send push notifications with great online deals.</div>')
            with gr.Row():
                opportunities_dataframe = gr.Dataframe(
                    headers=["Deals found so far", "Price", "Estimate", "Discount", "URL"],
                    wrap=True,
                    column_widths=[6, 1, 1, 1, 3],
                    row_count=10,
                    col_count=5,
                    max_height=400,
                )
            with gr.Row():
                with gr.Column(scale=1):
                    logs = gr.HTML()
                with gr.Column(scale=1):
                    plot = gr.Plot(value=get_plot(), show_label=False)

            ui.load(run_with_logging, inputs=[log_data], outputs=[log_data, logs, opportunities_dataframe])

            timer = gr.Timer(value=300, active=True)
            timer.tick(run_with_logging, inputs=[log_data], outputs=[log_data, logs, opportunities_dataframe])

            opportunities_dataframe.select(do_select)

        ui.launch(share=False, inbrowser=True)
if __name__ == "__main__":
    # Executed as a script: build the application and start its UI.
    application = App()
    application.run()

View File

@@ -0,0 +1,66 @@
import modal
from modal import App, Image
# Setup: the Modal app, the container image it runs in, and its secrets
app = modal.App("pricer")
image = Image.debian_slim().pip_install(
    "torch",
    "transformers",
    "bitsandbytes",
    "accelerate",
    "peft",
)
secrets = [modal.Secret.from_name("hf-secret")]

# Constants
GPU = "T4"
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "ed-donner"  # your HF name here! Or use mine if you just want to reproduce my results.
RUN_NAME = "2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"

# Identifiers derived from the constants above
PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
FINETUNED_MODEL = "/".join((HF_USER, PROJECT_RUN_NAME))
@app.function(image=image, secrets=secrets, gpu=GPU, timeout=1800)
def price(description: str) -> float:
    """Estimate the price of a product from its text description.

    Loads the 4-bit quantized base model and the fine-tuned adapter on every
    call, generates up to 5 new tokens after the "Price is $" prefix, and
    parses the first number in the generated text.

    Args:
        description: Free-text description of the product.

    Returns:
        The parsed price, or 0.0 when the generated text contains no number.
    """
    import re
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
    from peft import PeftModel

    QUESTION = "How much does this cost to the nearest dollar?"
    PREFIX = "Price is $"
    # NOTE(review): a single newline separates the sections of this prompt -
    # confirm this matches the format the adapter was fine-tuned on.
    prompt = f"{QUESTION}\n{description}\n{PREFIX}"

    # Quant Config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=quant_config,
        device_map="auto"
    )
    fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, revision=REVISION)

    set_seed(42)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)

    # Decode only the tokens generated after the prompt. Splitting the full
    # text on the prefix would pick the wrong segment whenever the
    # description itself contains "Price is $".
    generated_tokens = outputs[0][inputs.shape[1]:]
    contents = tokenizer.decode(generated_tokens)
    contents = contents.replace(',', '')
    match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(match.group()) if match else 0.0

View File

@@ -0,0 +1,89 @@
import modal
from modal import App, Volume, Image
# Setup - define our infrastructure with code!
app = modal.App("pricer-service")
# huggingface_hub is the package that provides snapshot_download, which
# Pricer.download_model_to_folder imports; "huggingface" is a different PyPI package.
image = Image.debian_slim().pip_install("huggingface_hub", "torch", "transformers", "bitsandbytes", "accelerate", "peft")
secrets = [modal.Secret.from_name("hf-secret")]

# Constants
GPU = "T4"
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "ed-donner"  # your HF name here! Or use mine if you just want to reproduce my results.
RUN_NAME = "2024-09-13_13.04.39"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Local folders the model snapshots are downloaded to and loaded from
MODEL_DIR = "hf-cache/"
BASE_DIR = MODEL_DIR + BASE_MODEL
FINETUNED_DIR = MODEL_DIR + FINETUNED_MODEL

# Prompt pieces shared by every pricing request
QUESTION = "How much does this cost to the nearest dollar?"
PREFIX = "Price is $"
@app.cls(image=image, secrets=secrets, gpu=GPU, timeout=1800)
class Pricer:
    """Modal class that serves price estimates from the fine-tuned model.

    The model snapshots are downloaded by ``download_model_to_folder``, the
    tokenizer and model are loaded once by ``setup``, and ``price`` reuses
    them for each request.
    """

    @modal.build()
    def download_model_to_folder(self):
        """Download the base model and the fine-tuned adapter into MODEL_DIR."""
        from huggingface_hub import snapshot_download
        import os

        os.makedirs(MODEL_DIR, exist_ok=True)
        snapshot_download(BASE_MODEL, local_dir=BASE_DIR)
        snapshot_download(FINETUNED_MODEL, revision=REVISION, local_dir=FINETUNED_DIR)

    @modal.enter()
    def setup(self):
        """Load the tokenizer, the 4-bit quantized base model and the adapter."""
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        from peft import PeftModel

        # Quant Config
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        self.base_model = AutoModelForCausalLM.from_pretrained(
            BASE_DIR,
            quantization_config=quant_config,
            device_map="auto"
        )
        self.fine_tuned_model = PeftModel.from_pretrained(self.base_model, FINETUNED_DIR, revision=REVISION)

    @modal.method()
    def price(self, description: str) -> float:
        """Estimate the price of a product from its text description.

        Args:
            description: Free-text description of the product.

        Returns:
            The first number found in the generated text, or 0.0 when the
            text contains no number.
        """
        import re
        import torch
        from transformers import set_seed

        set_seed(42)
        prompt = f"{QUESTION}\n\n{description}\n\n{PREFIX}"
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        attention_mask = torch.ones(inputs.shape, device="cuda")
        outputs = self.fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=5, num_return_sequences=1)

        # Decode only the tokens generated after the prompt. Splitting the
        # full text on the prefix would pick the wrong segment whenever the
        # description itself contains "Price is $".
        generated_tokens = outputs[0][inputs.shape[1]:]
        contents = self.tokenizer.decode(generated_tokens)
        contents = contents.replace(',', '')
        match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
        return float(match.group()) if match else 0.0

    @modal.method()
    def wake_up(self) -> str:
        """Return "ok"; gives callers a cheap method to invoke on the service."""
        return "ok"

View File

@@ -0,0 +1,75 @@
import math
import matplotlib.pyplot as plt
# ANSI escape sequences used to colour the per-datapoint console output
RESET = "\033[0m"
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"

# Maps a colour name to its escape sequence ("orange" is rendered as yellow)
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)
class Tester:
    """Evaluate a price predictor against datapoints with known prices.

    For each of the first ``size`` datapoints the predictor's guess is
    compared with ``datapoint.price``. Each result is printed in colour, and
    a summary scatter chart is shown at the end.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the predictor and data and initialise the result lists.

        Args:
            predictor: Callable taking a datapoint and returning a price guess.
            data: Indexable collection of datapoints with ``price`` and
                ``title`` attributes.
            title: Chart title; when falsy, one is derived from the
                predictor's name.
            size: Number of datapoints to evaluate, starting at index 0.
        """
        self.predictor = predictor
        self.data = data
        # Callables such as functools.partial objects or instances with
        # __call__ have no __name__, so fall back to their type's name.
        predictor_name = getattr(predictor, "__name__", type(predictor).__name__)
        self.title = title or predictor_name.replace("_", " ").title()
        self.size = size
        self._reset()

    def _reset(self):
        """Clear the per-datapoint result lists."""
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify an absolute error as "green", "orange" or "red".

        Green means the error is under 40 or under 20% of the truth; orange
        means under 80 or under 40%; anything else is red.
        """
        # A zero truth has no meaningful relative error; treating it as
        # infinite leaves the decision to the absolute thresholds instead of
        # raising ZeroDivisionError.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint ``i``, record the results and print one line.

        Raises:
            ValueError: from ``math.log`` if the guess or the truth is <= -1.
        """
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # Truncate long item titles so each result fits on one line.
        title = datapoint.title if len(datapoint.title) <= 40 else datapoint.title[:40]+"..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Show a scatter plot of guesses against truths with a y = x line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute the summary metrics and show them as the chart title.

        Raises:
            ZeroDivisionError: if no datapoints have been evaluated.
        """
        # Average over the datapoints actually recorded rather than the
        # requested size, so the metrics stay correct if the two differ.
        count = len(self.errors)
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``size`` datapoints, then show the report."""
        self.error = 0
        # Start from empty results so that calling run() again does not mix
        # in the previous run's datapoints.
        self._reset()
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Build a tester with default settings for ``function`` and run it."""
        cls(function, data).run()