Final adjustments and preparation for Ed's review

lakovicb
2025-04-24 15:37:07 +02:00
parent 1a7f4e86b0
commit 6ea90801bd
2 changed files with 234 additions and 220 deletions

@@ -1,3 +1,5 @@
+# openai_scraper_playwright.py
 import asyncio
 from playwright.async_api import async_playwright
 from openai import OpenAI
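The next hunk starts at old line 11, so the imports in between are unchanged and not shown. Judging by the names used later (os, time, random, logging, Counter, Histogram, start_http_server, Cache), the elided block presumably looks something like this sketch:

import os
import time
import random
import logging
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache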
@@ -11,290 +13,129 @@ from dotenv import load_dotenv
 load_dotenv()
 # Setting up Prometheus metrics
 SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts')
-SCRAPE_DURATION = Histogram(
-    'scrape_duration', 'Scraping duration distribution')
-# Setting up cache
+SCRAPE_DURATION = Histogram('scrape_duration', 'Scraping duration distribution')
 cache = Cache('./scraper_cache')
-class ScrapingError(Exception):
-    pass
-class ContentAnalysisError(Exception):
-    pass
+class ScrapingError(Exception): pass
+class ContentAnalysisError(Exception): pass
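For reference, a minimal sketch of how the two libraries configured above behave; the demo_* names and the expensive_lookup stand-in are illustrative, not part of this file:

from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache

REQUESTS = Counter('demo_requests', 'Total demo requests')        # monotonically increasing count
LATENCY = Histogram('demo_latency', 'Demo latency distribution')  # distribution of observed durations
demo_cache = Cache('./demo_cache')                                # persistent on-disk key-value store

def expensive_lookup(key):
    return key.upper()  # stand-in for real work

@LATENCY.time()  # records how long each call takes
def fetch(key):
    REQUESTS.inc()  # one increment per attempt
    value = demo_cache.get(key)  # returns None on a miss
    if value is None:
        value = expensive_lookup(key)
        demo_cache.set(key, value, expire=3600)  # seconds until the entry is evicted
    return value

start_http_server(8000)  # metrics are then scrapable at http://localhost:8000/metrics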
 class EnhancedOpenAIScraper:
     API_KEY = os.getenv("OPENAI_API_KEY")
-    BROWSER_EXECUTABLE = os.getenv(
-        "BROWSER_PATH", "/usr/bin/chromium-browser")
+    BROWSER_EXECUTABLE = os.getenv("BROWSER_PATH", "/usr/bin/chromium-browser")
     MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000))
     def __init__(self, headless=True):
         self.user_agents = [
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..."
         ]
-        self.timeout = 45000  # 45 seconds
+        self.timeout = 45000
         self.retry_count = int(os.getenv("RETRY_COUNT", 2))
         self.headless = headless
-        self.mouse_velocity_range = (100, 500)  # px/ms
-        self.interaction_delays = {
-            'scroll': (int(os.getenv("SCROLL_DELAY_MIN", 500)), int(os.getenv("SCROLL_DELAY_MAX", 2000))),
-            'click': (int(os.getenv("CLICK_DELAY_MIN", 100)), int(os.getenv("CLICK_DELAY_MAX", 300))),
-            'movement': (int(os.getenv("MOVEMENT_DELAY_MIN", 50)), int(os.getenv("MOVEMENT_DELAY_MAX", 200)))
-        }
-        self.proxy_servers = [server.strip() for server in os.getenv(
-            "PROXY_SERVERS", "").split(',') if server.strip()]
+        self.proxy_servers = [x.strip() for x in os.getenv("PROXY_SERVERS", "").split(',') if x.strip()]
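Everything tunable here is read from environment variables via load_dotenv(), so a matching .env for the new version might look like the following sketch (every value is either the code's own default or an obvious placeholder):

OPENAI_API_KEY=sk-...
BROWSER_PATH=/usr/bin/chromium-browser
MAX_CONTENT_LENGTH=30000
RETRY_COUNT=2
PROXY_SERVERS=
CACHE_EXPIRY=3600
OPENAI_MODEL=gpt-4-turbo
MODEL_TEMPERATURE=0.3
MAX_TOKENS=1500
MODEL_TOP_P=0.9
LOG_LEVEL=INFO
PROMETHEUS_PORT=8000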
     async def human_interaction(self, page):
         """Advanced simulation of user behavior"""
-        # Random mouse movement path
         for _ in range(random.randint(2, 5)):
-            x = random.randint(0, 1366)
-            y = random.randint(0, 768)
+            x, y = random.randint(0, 1366), random.randint(0, 768)
             await page.mouse.move(x, y, steps=random.randint(5, 20))
-            await page.wait_for_timeout(random.randint(*self.interaction_delays['movement']))
+            await page.wait_for_timeout(random.randint(50, 200))
         # Simulating typing
         if random.random() < 0.3:
             await page.keyboard.press('Tab')
             await page.keyboard.type(' ', delay=random.randint(50, 200))
-        # More realistic scrolling
-        scroll_distance = random.choice([300, 600, 900])
-        await page.mouse.wheel(0, scroll_distance)
-        await page.wait_for_timeout(random.randint(*self.interaction_delays['scroll']))
+        await page.mouse.wheel(0, random.choice([300, 600, 900]))
+        await page.wait_for_timeout(random.randint(500, 2000))
     async def load_page(self, page, url):
         """Smarter page loading with dynamic waiting"""
         start_time = time.time()
         try:
             await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout)
-            # Smarter content extraction selectors
-            selectors = [
-                'main article',
-                '#main-content',
-                'section:first-of-type',
-                'div[class*="content"]',
-                'body'  # Fallback
-            ]
+            selectors = ['main article', '#main-content', 'section:first-of-type', 'div[class*="content"]', 'body']
             for selector in selectors:
-                try:
-                    element = await page.query_selector(selector)
-                    if element:
-                        return True
-                except Exception:
-                    continue
-            # Fallback if no selector is found within a certain time
-            if time.time() - start_time < 30:  # If we haven't used the full timeout
-                await page.wait_for_timeout(30000 - int(time.time() - start_time))
-            return True  # Page likely loaded
+                if await page.query_selector(selector):
+                    return True
+            await page.wait_for_timeout(5000)
+            return True
         except Exception as e:
             logging.error(f"Error loading page {url}: {e}")
             return False
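Note that query_selector only checks what is already attached to the DOM; if the intent is to wait for late-rendering content, Playwright's wait_for_selector is the usual tool. A minimal sketch of that variant, not part of the commit:

async def wait_for_any(page, selectors, per_selector_timeout=5000):
    # Give each selector a short window to appear before moving on.
    for selector in selectors:
        try:
            await page.wait_for_selector(selector, timeout=per_selector_timeout)
            return True
        except Exception:
            continue  # this selector never showed up; try the next one
    return False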
     @SCRAPE_DURATION.time()
-    async def scrape_with_retry(self):
-        """Main function with retry mechanism and browser reuse"""
+    async def scrape_with_retry(self, url):
         SCRAPE_ATTEMPTS.inc()
         last_error = None
         browser = None
-        context = None
-        page = None
         try:
             async with async_playwright() as p:
-                launch_args = {
+                args = {
                     "headless": self.headless,
-                    "args": [
-                        "--disable-blink-features=AutomationControlled",
-                        "--single-process",
-                        "--no-sandbox",
-                        f"--user-agent={random.choice(self.user_agents)}"
-                    ],
+                    "args": ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
                     "executable_path": self.BROWSER_EXECUTABLE
                 }
-                if self.proxy_servers:
-                    proxy_url = random.choice(self.proxy_servers)
-                    proxy_config = {"server": proxy_url}
-                    proxy_username = os.getenv('PROXY_USER')
-                    proxy_password = os.getenv('PROXY_PASS')
-                    if proxy_username and proxy_password:
-                        proxy_config['username'] = proxy_username
-                        proxy_config['password'] = proxy_password
-                    launch_args['proxy'] = proxy_config
-                browser = await p.chromium.launch(**launch_args)
-                context = await browser.new_context(
-                    user_agent=random.choice(self.user_agents),
-                    viewport={"width": 1366, "height": 768},
-                    locale=os.getenv("BROWSER_LOCALE", "en-US")
-                )
-                await context.route("**/*", lambda route: route.continue_())
+                browser = await p.chromium.launch(**args)
+                context = await browser.new_context(user_agent=random.choice(self.user_agents))
                 page = await context.new_page()
                 await page.add_init_script("""
                     Object.defineProperty(navigator, 'webdriver', { get: () => false });
                     window.navigator.chrome = { runtime: {}, app: { isInstalled: false } };
                 """)
                 for attempt in range(self.retry_count):
                     try:
-                        logging.info(
-                            f"Attempt {attempt + 1}: Loading OpenAI...")
-                        if not await self.load_page(page, "https://openai.com"):
-                            raise ScrapingError(
-                                "Failed to load key content on OpenAI website.")
+                        if not await self.load_page(page, url):
+                            raise ScrapingError("Failed to load page")
                         await self.human_interaction(page)
-                        await page.screenshot(path=f"openai_debug_{attempt}.png")
-                        content = await page.evaluate("""() => {
-                            const selectors = [
-                                'main article',
-                                '#main-content',
-                                'section:first-of-type',
-                                'div[class*="content"]'
-                            ];
-                            let content = '';
-                            for (const selector of selectors) {
-                                const element = document.querySelector(selector);
-                                if (element) {
-                                    content += element.innerText + '\\n\\n';
-                                }
-                            }
-                            return content.trim() || document.body.innerText;
-                        }""")
+                        content = await page.evaluate("""() => document.body.innerText""")
                         if not content.strip():
-                            raise ContentAnalysisError(
-                                "No content extracted from the page.")
+                            raise ContentAnalysisError("No content extracted")
                         await browser.close()
                         return content[:self.MAX_CONTENT_LENGTH]
                     except (ScrapingError, ContentAnalysisError) as e:
                         last_error = e
                         logging.warning(
                             f"Attempt {attempt + 1} failed: {str(e)}")
                         if attempt < self.retry_count - 1:
                             await asyncio.sleep(5)
                         else:
-                            if browser:
-                                await browser.close()
-                                browser = None
+                            await browser.close()
                             raise
-                    except Exception as e:
-                        last_error = e
-                        logging.exception(f"Unexpected error on attempt {attempt + 1}: {str(e)}")
-                        if attempt < self.retry_count - 1:
-                            await asyncio.sleep(5)
-                        else:
-                            if browser:
-                                await browser.close()
-                                browser = None
         except Exception as e:
             last_error = e
         finally:
             if browser:
                 await browser.close()
-        raise last_error if last_error else Exception(
-            "All scraping attempts failed.")
+        raise last_error or ScrapingError("All scraping attempts failed")
-    async def get_cached_content(self):
-        key = 'openai_content_cache_key'
+    async def get_cached_content(self, url):
+        key = 'cache_' + url.replace('https://', '').replace('/', '_')
         content = cache.get(key)
         if content is None:
-            content = await self.scrape_with_retry()
-            cache.set(key, content, expire=int(
-                os.getenv("CACHE_EXPIRY", 3600)))
+            content = await self.scrape_with_retry(url)
+            cache.set(key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600)))
         return content
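The string-munged key above leaves http:// URLs, query strings, and other punctuation intact, so distinct URLs can still produce awkward or colliding keys; hashing sidesteps that. A sketch, not part of the commit:

import hashlib

def cache_key(url: str) -> str:
    # Stable, collision-resistant key regardless of URL shape.
    return 'cache_' + hashlib.sha256(url.encode('utf-8')).hexdigest()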
-async def analyze_content(headless=True):
-    try:
-        scraper = EnhancedOpenAIScraper(headless=headless)
-        content = await scraper.get_cached_content()
-        client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
-        if not client.api_key:
-            raise ContentAnalysisError(
-                "OpenAI API key not configured (check environment variables).")
-        prompt_template = """
-        Analyze the following website content and extract the following information if present:
-        1. **Overall Summary of the Website:** Provide a concise overview of the website's purpose and the main topics discussed.
-        2. **Key Individuals or Entities:** Identify and briefly describe any prominent individuals, companies, or organizations mentioned.
-        3. **Recent Announcements or Updates:** List any recent announcements, news, or updates found on the website, including dates if available.
-        4. **Main Topics or Themes:** Identify the primary subjects or themes explored on the website.
-        5. **Any Noteworthy Features or Projects:** Highlight any significant features, projects, or initiatives mentioned.
-        Format the output clearly under each of these headings. If a particular piece of information is not found, indicate that it is not present.
-        Content:
-        {content}
-        """
-        formatted_prompt = prompt_template.format(content=content)
-        model_name = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
-        temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
-        max_tokens = int(os.getenv("MAX_TOKENS", 1500))
-        top_p = float(os.getenv("MODEL_TOP_P", 0.9))
-        response = client.chat.completions.create(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant that analyzes website content and extracts key information in a structured format."},
-                {"role": "user", "content": formatted_prompt}
-            ],
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p
-        )
-        if not response.choices:
-            raise ContentAnalysisError("Empty response from GPT.")
-        return response.choices[0].message.content
-    except (ScrapingError, ContentAnalysisError) as e:
-        logging.error(f"Analysis failed: {str(e)}")
-        return f"Critical analysis error: {str(e)}"
-    except Exception as e:
-        logging.exception("Unexpected error during analysis.")
-        return f"Unexpected analysis error: {str(e)}"
+async def analyze_content(url="https://openai.com", headless=True):
+    scraper = EnhancedOpenAIScraper(headless=headless)
+    content = await scraper.get_cached_content(url)
+    client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
+    if not client.api_key:
+        raise ContentAnalysisError("OpenAI API key not configured")
+    prompt = f"""
+    Analyze this page:
+    {content}
+    """
+    model = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
+    temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
+    max_tokens = int(os.getenv("MAX_TOKENS", 1500))
+    top_p = float(os.getenv("MODEL_TOP_P", 0.9))
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": "You are a content analyst."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=temperature,
+        max_tokens=max_tokens,
+        top_p=top_p
+    )
+    if not response.choices:
+        raise ContentAnalysisError("Empty response from GPT")
+    return response.choices[0].message.content
 async def main():
     logging.basicConfig(
         level=os.getenv("LOG_LEVEL", "INFO").upper(),
         format='%(asctime)s - %(levelname)s - %(message)s'
     )
     # Start Prometheus HTTP server for exposing metrics
     try:
         prometheus_port = int(os.getenv("PROMETHEUS_PORT", 8000))
         start_http_server(prometheus_port)
         logging.info(f"Prometheus metrics server started on port {prometheus_port}")
     except Exception as e:
         logging.warning(f"Failed to start Prometheus metrics server: {e}")
     start_time = time.time()
     result = await analyze_content(headless=True)
     end_time = time.time()
     print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds\n")
     print(result)
 if __name__ == "__main__":
     asyncio.run(main())
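With the new url parameter threaded all the way through, analyzing a different site no longer requires editing the module. Hypothetical usage from another script, not part of the commit:

import asyncio
from openai_scraper_playwright import analyze_content

result = asyncio.run(analyze_content(url="https://example.com", headless=True))
print(result)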