import os from dotenv import load_dotenv from pydantic import BaseModel from typing import List, Dict, Self import feedparser from tqdm import tqdm import time from openai import OpenAI from typing import Optional import json load_dotenv(override=True) os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "your-key-if-not-using-env") openai = OpenAI() feeds = [ "https://www.dealnews.com/c142/Electronics/?rss=1", "https://www.dealnews.com/c39/Computers/?rss=1", "https://www.dealnews.com/c238/Automotive/?rss=1", "https://www.dealnews.com/f1912/Smart-Home/?rss=1", "https://www.dealnews.com/c196/Home-Garden/?rss=1", "https://www.reddit.com/r/buildapcsales.rss", "https://www.reddit.com/r/deals.rss", ] SYSTEM_PROMPT = """ You are an RSS feed parser specializing in extracting deal information. Your task is to analyze content and extract structured data. # INPUT TYPES You will receive one of two input types: **TYPE 1: RSS Feed Entry Data** - May contain fields like: title, summary, description, link - Summary/description often contains HTML with deal details - Multiple URL fields may exist (link, links array, etc.) **TYPE 2: HTML Page Content** - Raw HTML from a deal webpage - Contains product information, pricing, and purchase links # TASK Extract and structure the following information: 1. **title**: The deal's headline or main title - For RSS entries: Use the entry's title field directly - For HTML: Extract the main product/deal title 2. **summary**: A concise summary of the deal (2-3 sentences max), focusing on: - What is being offered (product name, specs) - Key terms (price, discount percentage, original price) - Important conditions (promo codes, shipping, availability, refurb/new condition) - Strip ALL HTML tags and formatting 3. **url**: The primary link where users can access the deal - Prioritize direct product/deal purchase links - Avoid tracking links, RSS links with "?rss=1" or "?iref=rss" - For RSS entries, use the "link" field or first link in "links" array # EXTRACTION RULES - **From RSS entries**: Parse the 'summary' or 'description' HTML to extract deal details - **Clean all HTML**: Remove ,
,

,