- Translated Croatian comments to American English in notebook_generator.py and playwright_ai_scraper.py.
- Added scraper_cache/ to .gitignore to exclude the cache directory from the repository.
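For reference, the ignore rule is a single entry; assuming the cache directory sits at the repository root (the script creates it as ./scraper_cache), the addition to .gitignore is:

# .gitignore
scraper_cache/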
playwright_ai_scraper.py (246 lines, 8.9 KiB, Python):
# playwright_ai_scraper.py

import asyncio
import logging
import random
import time
import os

from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache
from dotenv import load_dotenv

# Loading .env variables
load_dotenv()

# Setting up logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Setting up Prometheus metrics
SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
)

# Setting up cache
cache = Cache("./scraper_cache")


# Custom exceptions
class ScrapingError(Exception):
    pass


class AnalysisError(Exception):
    pass


class AIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
        self.timeout = 60000  # 60 seconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.delays = {
            "scroll": (500, 2000),
            "click": (100, 300),
            "move": (50, 200)
        }

    async def human_interaction(self, page):
        """Simulates human behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)
                await page.mouse.move(x, y, steps=random.randint(5, 20))
                await page.wait_for_timeout(
                    random.randint(*self.delays["move"])
                )
                scroll = random.choice([300, 600, 900])
                await page.mouse.wheel(0, scroll)
                await page.wait_for_timeout(
                    random.randint(*self.delays["scroll"])
                )
        except Exception as e:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
        """Loads the page with dynamic waiting."""
        start_time = time.time()
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.timeout
            )
            selectors = [
                "main article",
                "#main-content",
                "section:first-of-type",
                'div[class*="content"]',
                "body"
            ]
            for selector in selectors:
                element = await page.query_selector(selector)
                if element:
                    return True
            # No expected selector found yet; wait out the remainder of a
            # 30-second grace period (wait_for_timeout takes milliseconds).
            elapsed = time.time() - start_time
            if elapsed < 30:
                await page.wait_for_timeout(int((30 - elapsed) * 1000))
            return True
        except Exception as e:
            logging.error(f"Error loading {url}: {e}")
            return False

    async def scrape_with_retry(self, url):
        """Scrapes the page with retries."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=self.headless)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768}
                )
                page = await context.new_page()
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => false
                    });
                """)
                for attempt in range(self.retries):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Scraping {url}")
                        if not await self.load_page(page, url):
                            raise ScrapingError(f"Failed to load {url}")
                        await self.human_interaction(page)
                        content = await page.evaluate(
                            """() => {
                                const s = [
                                    'main article',
                                    '#main-content',
                                    'section:first-of-type',
                                    'div[class*="content"]'
                                ];
                                let c = '';
                                for (const x of s) {
                                    const e = document.querySelector(x);
                                    if (e) c += e.innerText + '\\n';
                                }
                                return c.trim() || document.body.innerText;
                            }"""
                        )
                        if not content.strip():
                            raise ScrapingError("No content")
                        SCRAPE_DURATION.observe(time.time() - start_time)
                        return content[:self.MAX_CONTENT]
                    except ScrapingError as e:
                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
                        if attempt < self.retries - 1:
                            await asyncio.sleep(5)
                        else:
                            raise
            except Exception as e:
                logging.error(f"Error in scrape: {e}")
                raise
            finally:
                # Only close the browser if it was actually launched.
                if browser:
                    await browser.close()
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
        """Retrieves content from cache or scrapes."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
            try:
                content = await self.scrape_with_retry(url)
                cache.set(
                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
                )
            except Exception as e:
                logging.error(f"Failed to fetch content for {url}: {e}")
                raise
        return content


async def analyze_content(url, headless=True):
    """Analyzes the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        content = await scraper.get_cached_content(url)
        # Check the key before constructing the client so a missing key
        # surfaces as an AnalysisError.
        if not scraper.API_KEY:
            raise AnalysisError("OpenAI API key not configured")
        client = OpenAI(api_key=scraper.API_KEY)
        prompt = """
        Analyze the website content and extract:
        1. **Summary**: Overview of the website's purpose.
        2. **Entities**: Prominent individuals or organizations.
        3. **Updates**: Recent announcements or news.
        4. **Topics**: Primary subjects or themes.
        5. **Features**: Noteworthy projects or initiatives.
        Format output under these headings. Note if info is missing.
        Content: {content}
        """.format(content=content)
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
        )
        if not response.choices:
            raise AnalysisError("Empty response from OpenAI")
        return response.choices[0].message.content
    except (ScrapingError, AnalysisError) as e:
        logging.error(f"Analysis failed: {e}")
        return f"Error: {e}"
    except Exception as e:
        logging.exception(f"Error in analyze: {e}")
        return f"Unexpected error: {e}"


async def main():
    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)
        logging.info(f"Prometheus server started on port {port}")
    except Exception as e:
        logging.warning(f"Prometheus server failed: {e}")
    urls = [
        "https://www.anthropic.com",
        "https://deepmind.google",
        "https://huggingface.co",
        "https://runwayml.com"
    ]
    for url in urls:
        start_time = time.time()
        result = await analyze_content(url, headless=True)
        end_time = time.time()
        print(
            f"\nAnalysis of {url} completed in "
            f"{end_time - start_time:.2f} seconds\n"
        )
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
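All runtime tunables above are read with os.getenv, so a local .env is the natural place to set them. As a rough sketch, such a file could look like the following; the variable names are the ones the script reads, the values are simply the code's own defaults, and the API key is a placeholder:

# Example .env (illustrative values; only OPENAI_API_KEY has no default)
OPENAI_API_KEY=your-openai-key-here
OPENAI_MODEL=gpt-4-turbo
MODEL_TEMPERATURE=0.3
MODEL_TOP_P=0.9
MAX_TOKENS=1500
MAX_CONTENT_LENGTH=30000
RETRY_COUNT=2
CACHE_EXPIRY=3600
LOG_LEVEL=INFO
PROMETHEUS_PORT=8000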