Add Bojan's Playwright asynchronous scraper project
This contribution adds a fully asynchronous scraper built on Playwright and the OpenAI API, with Python scripts, Jupyter notebooks (outputs cleared), Markdown summaries, and a README. It is organized under community-contributions/bojan-playwright-scraper/. Limited content retrieval from Huggingface.co is documented in the README.
@@ -0,0 +1,245 @@
# playwright_ai_scraper.py

import asyncio
import logging
import random
import time
import os

from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()
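
# Illustrative .env contents (placeholder values, not part of the original
# contribution). Every variable below is read with os.getenv() later in this
# file; apart from OPENAI_API_KEY, the value shown is the built-in default.
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_MODEL=gpt-4-turbo
#   MODEL_TEMPERATURE=0.3
#   MODEL_TOP_P=0.9
#   MAX_TOKENS=1500
#   MAX_CONTENT_LENGTH=30000
#   RETRY_COUNT=2
#   CACHE_EXPIRY=3600
#   LOG_LEVEL=INFO
#   PROMETHEUS_PORT=8000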

# Configure logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Prometheus metrics
SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
)
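
# These metrics are exposed over HTTP by the start_http_server() call in
# main(), so a Prometheus server can scrape them on PROMETHEUS_PORT
# (default 8000).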

# Set up the on-disk cache
cache = Cache("./scraper_cache")


# Custom exceptions
class ScrapingError(Exception):
    pass


class AnalysisError(Exception):
    pass


class AIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
        self.timeout = 60000  # 60 seconds, in milliseconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        # Randomized delay ranges in milliseconds, used to pace interactions
        self.delays = {
            "scroll": (500, 2000),
            "click": (100, 300),
            "move": (50, 200)
        }

    async def human_interaction(self, page):
        """Simulate human-like behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)
                await page.mouse.move(x, y, steps=random.randint(5, 20))
                await page.wait_for_timeout(
                    random.randint(*self.delays["move"])
                )
                scroll = random.choice([300, 600, 900])
                await page.mouse.wheel(0, scroll)
                await page.wait_for_timeout(
                    random.randint(*self.delays["scroll"])
                )
        except Exception as e:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
        """Load the page, waiting dynamically for content."""
        start_time = time.time()
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.timeout
            )
            # Consider the page loaded as soon as any likely content
            # container is present.
            selectors = [
                "main article",
                "#main-content",
                "section:first-of-type",
                'div[class*="content"]',
                "body"
            ]
            for selector in selectors:
                element = await page.query_selector(selector)
                if element:
                    return True
            # Otherwise wait out the remainder of a 30-second budget
            # (wait_for_timeout expects milliseconds).
            elapsed = time.time() - start_time
            if elapsed < 30:
                await page.wait_for_timeout(int((30 - elapsed) * 1000))
            return True
        except Exception as e:
            logging.error(f"Error loading {url}: {e}")
            return False

    async def scrape_with_retry(self, url):
        """Scrape the page, retrying on failure."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=self.headless)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768}
                )
                page = await context.new_page()
                # Mask the webdriver flag that automated browsers expose.
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => false
                    });
                """)
                for attempt in range(self.retries):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Scraping {url}")
                        if not await self.load_page(page, url):
                            raise ScrapingError(f"Failed to load {url}")
                        await self.human_interaction(page)
                        # Prefer the main content containers; fall back to
                        # the whole body text.
                        content = await page.evaluate(
                            """() => {
                                const s = [
                                    'main article',
                                    '#main-content',
                                    'section:first-of-type',
                                    'div[class*="content"]'
                                ];
                                let c = '';
                                for (const x of s) {
                                    const e = document.querySelector(x);
                                    if (e) c += e.innerText + '\\n';
                                }
                                return c.trim() || document.body.innerText;
                            }"""
                        )
                        if not content.strip():
                            raise ScrapingError("No content")
                        SCRAPE_DURATION.observe(time.time() - start_time)
                        return content[:self.MAX_CONTENT]
                    except ScrapingError as e:
                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
                        if attempt < self.retries - 1:
                            await asyncio.sleep(5)
                        else:
                            raise
            except Exception as e:
                logging.error(f"Error in scrape: {e}")
                raise
            finally:
                if browser:
                    await browser.close()
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
        """Return content from the cache, scraping it on a miss."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
            try:
                content = await self.scrape_with_retry(url)
                cache.set(
                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
                )
            except Exception as e:
                logging.error(f"Scraping {url} failed: {e}")
                raise
        return content
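
    # For example, "https://huggingface.co" is cached under the key
    # "content_https:__huggingface.co" for CACHE_EXPIRY seconds; deleting
    # ./scraper_cache (or calling cache.clear()) forces a fresh scrape.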


async def analyze_content(url, headless=True):
    """Analyze the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        content = await scraper.get_cached_content(url)
        if not scraper.API_KEY:
            raise AnalysisError("OpenAI API key not configured")
        client = OpenAI(api_key=scraper.API_KEY)
        prompt = """
        Analyze the website content and extract:
        1. **Summary**: Overview of the website's purpose.
        2. **Entities**: Prominent individuals or organizations.
        3. **Updates**: Recent announcements or news.
        4. **Topics**: Primary subjects or themes.
        5. **Features**: Noteworthy projects or initiatives.
        Format output under these headings. Note if info is missing.
        Content: {content}
        """.format(content=content)
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
        )
        if not response.choices:
            raise AnalysisError("Empty response from OpenAI")
        return response.choices[0].message.content
    except (ScrapingError, AnalysisError) as e:
        logging.error(f"Analysis failed: {e}")
        return f"Error: {e}"
    except Exception as e:
        logging.exception(f"Error in analyze: {e}")
        return f"Unexpected error: {e}"


async def main():
    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)
        logging.info(f"Prometheus server started on port {port}")
    except Exception as e:
        logging.warning(f"Prometheus server failed: {e}")
    urls = [
        "https://www.anthropic.com",
        "https://deepmind.google",
        "https://huggingface.co",
        "https://runwayml.com"
    ]
    for url in urls:
        start_time = time.time()
        result = await analyze_content(url, headless=True)
        end_time = time.time()
        print(
            f"\nAnalysis of {url} completed in "
            f"{end_time - start_time:.2f} seconds\n"
        )
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
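
# Illustrative usage from another script or notebook (not part of the original
# file): analyze_content() can be awaited directly, or driven with
# asyncio.run() from synchronous code. The URL is just an example.
#
#     import asyncio
#     from playwright_ai_scraper import analyze_content
#
#     report = asyncio.run(analyze_content("https://huggingface.co"))
#     print(report)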