# openai_scraper_playwright.py
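"""Scrape a page with Playwright using light anti-automation evasion, cache
the extracted text on disk, and summarize it with the OpenAI Chat Completions
API. Prometheus metrics track scrape attempts and duration."""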
import asyncio
import logging
import os
import random

from diskcache import Cache
from dotenv import load_dotenv
from openai import OpenAI
from playwright.async_api import async_playwright
from prometheus_client import Counter, Histogram, start_http_server

load_dotenv()

# Prometheus metrics for basic observability.
SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts')
SCRAPE_DURATION = Histogram('scrape_duration', 'Scraping duration distribution')

# On-disk cache so repeat requests for a URL skip the browser entirely.
cache = Cache('./scraper_cache')


class ScrapingError(Exception):
    """Raised when a page cannot be loaded or scraped."""


class ContentAnalysisError(Exception):
    """Raised when extraction or analysis produces no usable output."""


class EnhancedOpenAIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    BROWSER_EXECUTABLE = os.getenv("BROWSER_PATH", "/usr/bin/chromium-browser")
    MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        # Rotated per browser context to vary the fingerprint; the full UA
        # strings are truncated in the original source.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..."
        ]
        self.timeout = 45000  # navigation timeout in milliseconds
        self.retry_count = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        # Parsed from the environment but not yet wired into browser launch.
        self.proxy_servers = [x.strip() for x in os.getenv("PROXY_SERVERS", "").split(',') if x.strip()]

    async def human_interaction(self, page):
        """Simulate human-like mouse movement, keystrokes, and scrolling."""
        for _ in range(random.randint(2, 5)):
            x, y = random.randint(0, 1366), random.randint(0, 768)
            await page.mouse.move(x, y, steps=random.randint(5, 20))
            await page.wait_for_timeout(random.randint(50, 200))

        if random.random() < 0.3:
            await page.keyboard.press('Tab')
            await page.keyboard.type(' ', delay=random.randint(50, 200))

        await page.mouse.wheel(0, random.choice([300, 600, 900]))
        await page.wait_for_timeout(random.randint(500, 2000))

    async def load_page(self, page, url):
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout)
            # Probe for a main-content container; 'body' is the catch-all and
            # always matches once the DOM has loaded.
            selectors = ['main article', '#main-content', 'section:first-of-type', 'div[class*="content"]', 'body']
            for selector in selectors:
                if await page.query_selector(selector):
                    return True
            await page.wait_for_timeout(5000)
            return True
        except Exception as e:
            logging.error(f"Error loading page {url}: {e}")
            return False

    @SCRAPE_DURATION.time()
    async def scrape_with_retry(self, url):
        SCRAPE_ATTEMPTS.inc()
        last_error = None
        try:
            async with async_playwright() as p:
                launch_args = {
                    "headless": self.headless,
                    "args": ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
                    "executable_path": self.BROWSER_EXECUTABLE
                }
                browser = await p.chromium.launch(**launch_args)
                try:
                    context = await browser.new_context(user_agent=random.choice(self.user_agents))
                    page = await context.new_page()
                    # Mask the most common automation signal before any page
                    # script runs.
                    await page.add_init_script(
                        "Object.defineProperty(navigator, 'webdriver', { get: () => false });"
                    )

                    for attempt in range(self.retry_count):
                        try:
                            if not await self.load_page(page, url):
                                raise ScrapingError("Failed to load page")
                            await self.human_interaction(page)
                            content = await page.evaluate("() => document.body.innerText")
                            if not content.strip():
                                raise ContentAnalysisError("No content extracted")
                            return content[:self.MAX_CONTENT_LENGTH]
                        except Exception as e:
                            last_error = e
                            if attempt < self.retry_count - 1:
                                await asyncio.sleep(5)
                            else:
                                raise
                finally:
                    # Ensure the browser is closed on success and on every
                    # failure path.
                    await browser.close()
        except Exception as e:
            raise last_error or e

    async def get_cached_content(self, url):
        # Derive a filesystem-safe cache key from the URL.
        key = 'cache_' + url.replace('https://', '').replace('/', '_')
        content = cache.get(key)
        if content is None:
            content = await self.scrape_with_retry(url)
            cache.set(key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600)))
        return content


async def analyze_content(url="https://openai.com", headless=True):
    scraper = EnhancedOpenAIScraper(headless=headless)
    content = await scraper.get_cached_content(url)
    # Check the key before constructing the client: OpenAI() raises its own
    # error on a missing key, so a check on the client after construction
    # would never be reached.
    if not EnhancedOpenAIScraper.API_KEY:
        raise ContentAnalysisError("OpenAI API key not configured")
    client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)

    prompt = f"""
Analyze this page:

{content}
"""
    model = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
    temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
    max_tokens = int(os.getenv("MAX_TOKENS", 1500))
    top_p = float(os.getenv("MODEL_TOP_P", 0.9))

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a content analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p
    )

    if not response.choices:
        raise ContentAnalysisError("Empty response from GPT")

    return response.choices[0].message.content
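

# Minimal usage sketch (an assumption, not part of the original module): run
# the scraper as a script, expose the Prometheus metrics defined above, and
# print the analysis for one URL. METRICS_PORT is a hypothetical variable.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Serves SCRAPE_ATTEMPTS / SCRAPE_DURATION at /metrics on this port.
    start_http_server(int(os.getenv("METRICS_PORT", 8000)))
    print(asyncio.run(analyze_content()))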