Submission for Week 8

week8/community_contributions/tochi/agents/deals.py

import json
import os
import time
from typing import Dict, List, Optional, Self  # typing.Self requires Python 3.11+

import feedparser
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm

load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "your-key-if-not-using-env")

openai = OpenAI()

# RSS feeds to scan for deals
feeds = [
    "https://www.dealnews.com/c142/Electronics/?rss=1",
    "https://www.dealnews.com/c39/Computers/?rss=1",
    "https://www.dealnews.com/c238/Automotive/?rss=1",
    "https://www.dealnews.com/f1912/Smart-Home/?rss=1",
    "https://www.dealnews.com/c196/Home-Garden/?rss=1",
    "https://www.reddit.com/r/buildapcsales.rss",
    "https://www.reddit.com/r/deals.rss",
]
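
# For reference, a sketch of what feedparser returns for one of these feeds:
# parse() yields an object whose .entries behave like dicts with keys such as
# "title", "summary", "link", and "links". Which fields are populated depends
# on the publisher.
#
#   feed = feedparser.parse(feeds[0])
#   for entry in feed.entries[:3]:
#       print(entry["title"], entry.get("link"))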

SYSTEM_PROMPT = """
You are an RSS feed parser specializing in extracting deal information. Your task is to analyze content and extract structured data.

# INPUT TYPES
You will receive one of two input types:

**TYPE 1: RSS Feed Entry Data**
- May contain fields like: title, summary, description, link
- Summary/description often contains HTML with deal details
- Multiple URL fields may exist (link, links array, etc.)

**TYPE 2: HTML Page Content**
- Raw HTML from a deal webpage
- Contains product information, pricing, and purchase links

# TASK
Extract and structure the following information:
1. **title**: The deal's headline or main title
   - For RSS entries: Use the entry's title field directly
   - For HTML: Extract the main product/deal title

2. **summary**: A concise summary of the deal (2-3 sentences max), focusing on:
   - What is being offered (product name, specs)
   - Key terms (price, discount percentage, original price)
   - Important conditions (promo codes, shipping, availability, refurb/new condition)
   - Strip ALL HTML tags and formatting

3. **url**: The primary link where users can access the deal
   - Prioritize direct product/deal purchase links
   - Avoid tracking links, RSS links with "?rss=1" or "?iref=rss"
   - For RSS entries, use the "link" field or first link in "links" array

# EXTRACTION RULES
- **From RSS entries**: Parse the 'summary' or 'description' HTML to extract deal details
- **Clean all HTML**: Remove <img>, <div>, <p>, <ul>, <li>, and all other tags
- **Extract pricing**: Include specific dollar amounts, percentages, and comparisons
- **Extract conditions**: Note promo codes, refurb status, warranty info, shipping details
- **URL priority**: Direct deal link > product page > category page
- **Handle missing data**: Use null for any truly missing required field

# OUTPUT FORMAT
Return ONLY valid JSON with this exact structure:
{
  "title": "string",
  "summary": "string",
  "url": "string"
}

Do not include any additional text, explanations, or markdown formatting - only the JSON object.

# EXAMPLES
**Input (RSS Entry)**:
```
title: "Sony Headphones for $99 + free shipping"
summary: "<p>Was $199, now $99. Use code SAVE50.</p>"
link: "https://example.com/deal?iref=rss-c142"
```

**Output**:
```json
{
  "title": "Sony Headphones for $99 + free shipping",
  "summary": "Sony Headphones originally priced at $199, now available for $99 with free shipping. Use promo code SAVE50 at checkout.",
  "url": "https://example.com/deal"
}
```
"""

def gpt_parse(soup: str) -> Optional[Dict[str, str]]:
    """
    Parse RSS feed content using GPT to extract title, summary, and URL.

    Args:
        soup: Raw RSS feed content (HTML/text)

    Returns:
        Dictionary with title, summary, url keys, or None if parsing fails
    """
    if not soup:
        return None

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0.2,
            # Constrain the model to emit a JSON object so json.loads below
            # doesn't trip over markdown fences or extra commentary
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": soup},
            ],
        )
        res_text = response.choices[0].message.content
        parsed_data = json.loads(res_text)

        # Accept the result only if all three required fields are present and non-empty
        if all(
            key in parsed_data and parsed_data[key]
            for key in ["title", "summary", "url"]
        ):
            return {
                "title": parsed_data["title"],
                "summary": parsed_data["summary"],
                "url": parsed_data["url"],
            }
        else:
            print(f"Missing or empty required fields in response: {parsed_data}")
            return None

    except json.JSONDecodeError as e:
        print(f"Error parsing JSON from OpenAI response: {e}")
        return None
    except Exception as e:
        print(f"Error calling OpenAI: {e}")
        return None
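
# Hedged usage sketch: fetch() below serializes each feedparser entry to JSON
# before calling gpt_parse. The sample entry here is illustrative, not a real
# feed item.
#
#   sample = json.dumps({
#       "title": "Sony Headphones for $99",
#       "summary": "<p>Was $199, now $99.</p>",
#       "link": "https://example.com/deal",
#   })
#   gpt_parse(sample)  # -> {"title": ..., "summary": ..., "url": ...} or None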

class ScrapedDeal:
    """
    A class to represent a Deal retrieved from an RSS feed
    """

    category: str
    title: str
    summary: str
    url: str
    details: str
    features: str

    def __init__(self, entry: Dict[str, str]):
        """
        Populate this instance based on the provided dict
        """
        self.title = entry["title"]
        self.summary = entry["summary"]
        self.url = entry["url"]
        self.details = self.summary
        self.features = ""

    def __repr__(self):
        """
        Return a string to describe this deal
        """
        return f"<{self.title}>"

    def describe(self):
        """
        Return a longer string to describe this deal for use in calling a model
        """
        return f"Title: {self.title}\nDetails: {self.details.strip()}\nFeatures: {self.features.strip()}\nURL: {self.url}"

    @classmethod
    def fetch(cls, show_progress: bool = False) -> List[Self]:
        """
        Retrieve all deals from the selected RSS feeds
        """
        deals = []
        skipped = 0

        feed_iter = tqdm(feeds) if show_progress else feeds
        for feed_url in feed_iter:
            feed = feedparser.parse(feed_url)
            for entry in feed.entries[:10]:
                try:
                    # default=str guards against entry fields that json.dumps
                    # can't serialize directly
                    parsed_deal = gpt_parse(json.dumps(entry, default=str))
                    if parsed_deal is None:
                        skipped += 1
                        continue
                    deals.append(cls(parsed_deal))
                    time.sleep(0.5)  # stay polite to API rate limits
                except Exception as e:
                    skipped += 1
                    print(f"Skipping deal: {str(e)}")
                    continue

        print(f"Fetched {len(deals)} deals successfully, skipped {skipped}")
        return deals
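
# Note on cost: fetch() makes one OpenAI call per feed entry (up to 10 per
# feed), so a full run is at most len(feeds) * 10 calls. A hedged usage sketch:
#
#   deals = ScrapedDeal.fetch(show_progress=True)
#   if deals:
#       print(deals[0].describe())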

class Deal(BaseModel):
    """
    A class to represent a Deal with a summary description
    """

    product_description: str
    price: float
    url: str


class DealSelection(BaseModel):
    """
    A class to represent a list of Deals
    """

    deals: List[Deal]

class Opportunity(BaseModel):
    """
    A class to represent a possible opportunity: a Deal whose estimated
    value is higher than the price it's being offered at
    """

    deal: Deal
    estimate: float
    discount: float
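
# A minimal end-to-end sketch, guarded so it only runs when this file is
# executed directly. The selection call assumes the structured-outputs parse()
# helper in recent openai-python releases (beta.chat.completions.parse with a
# Pydantic response_format); the prompt here is illustrative, not part of this
# module's pipeline.
if __name__ == "__main__":
    scraped = ScrapedDeal.fetch(show_progress=True)
    if scraped:
        completion = openai.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": "Select the most promising deals from:\n\n"
                    + "\n\n".join(deal.describe() for deal in scraped[:5]),
                }
            ],
            response_format=DealSelection,
        )
        selection = completion.choices[0].message.parsed
        print(f"Selected {len(selection.deals)} deals")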