From ba7455e38ee7ec47111d955d058d95c95aebc8ea Mon Sep 17 00:00:00 2001
From: lakovicb <>
Date: Wed, 23 Apr 2025 14:45:58 +0200
Subject: [PATCH 1/5] Added Playwright-based scraper solution to
 community-contributions

---
 .../playwright-bojan/README.md                |  56 ++++
 .../openai_scraper_playwright.py              | 300 ++++++++++++++++++
 2 files changed, 356 insertions(+)
 create mode 100644 community-contributions/playwright-bojan/README.md
 create mode 100644 community-contributions/playwright-bojan/openai_scraper_playwright.py

diff --git a/community-contributions/playwright-bojan/README.md b/community-contributions/playwright-bojan/README.md
new file mode 100644
index 0000000..f24b91c
--- /dev/null
+++ b/community-contributions/playwright-bojan/README.md
@@ -0,0 +1,56 @@
+# 🧠 Playwright-Based Web Scraper for openai.com
+### πŸ“š Community Contribution for Ed Donner's "LLM Engineering: Master AI" Course
+
+> _β€œAn extra exercise for those who enjoy web scraping...
+> In the community-contributions folder, you'll find an example Selenium solution from a student.”_
+
+---
+
+## πŸ” About This Project
+
+This is a response to Ed Donner’s bonus exercise to scrape `https://openai.com`, a site that relies heavily on dynamic JavaScript rendering.
+A fellow student contributed a Selenium-based solution β€” this one goes a step further with **Playwright**.
+
+---
+
+## πŸ†š Why Playwright Over Selenium?
+
+| Feature              | Selenium                     | Playwright πŸ†               |
+|----------------------|------------------------------|-----------------------------|
+| **Installation**     | More complex setup           | Minimal + faster setup      |
+| **Speed**            | Slower (WebDriver round-trips) | Faster execution (async)  |
+| **Multi-browser**    | Requires config              | Built-in Chromium, Firefox, WebKit support |
+| **Headless mode**    | Less stable                  | Super stable                |
+| **Async-friendly**   | Not built-in                 | Native support via asyncio  |
+| **Interaction APIs** | Limited                      | Richer simulation (mouse, scroll, etc.) |
+
+---
+
+## βš™οΈ Features
+
+- βœ… **Full JavaScript rendering** using Chromium
+- βœ… **Human-like behavior simulation** (mouse movement, scrolling, typing)
+- βœ… **Caching** with `diskcache`
+- βœ… **Prometheus metrics**
+- βœ… **Asynchronous scraping logic**
+- βœ… **Content summarization via OpenAI GPT API**
+
+---
+
+## 🧠 Why not in JupyterLab?
+
+Jupyter already runs its own asyncio event loop, so calling `asyncio.run()` from a notebook cell raises `RuntimeError: asyncio.run() cannot be called from a running event loop`.
+
+This solution was developed and tested in:
+
+- πŸ’» WingIDE 10 Pro
+- 🐧 Ubuntu via WSL
+- 🐍 Conda environment with Anaconda Python 3.12
+
+---
+
+## πŸš€ How to Run
+
+1. 
Install dependencies: +```bash +pip install -r requirements.txt diff --git a/community-contributions/playwright-bojan/openai_scraper_playwright.py b/community-contributions/playwright-bojan/openai_scraper_playwright.py new file mode 100644 index 0000000..7eac886 --- /dev/null +++ b/community-contributions/playwright-bojan/openai_scraper_playwright.py @@ -0,0 +1,300 @@ +import asyncio +from playwright.async_api import async_playwright +from openai import OpenAI +import logging +import random +import time +import os +from prometheus_client import start_http_server, Counter, Histogram +from diskcache import Cache +from dotenv import load_dotenv + +load_dotenv() + +# Setting up Prometheus metrics +SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts') +SCRAPE_DURATION = Histogram( + 'scrape_duration', 'Scraping duration distribution') + +# Setting up cache +cache = Cache('./scraper_cache') + + +class ScrapingError(Exception): + pass + + +class ContentAnalysisError(Exception): + pass + + +class EnhancedOpenAIScraper: + API_KEY = os.getenv("OPENAI_API_KEY") + BROWSER_EXECUTABLE = os.getenv( + "BROWSER_PATH", "/usr/bin/chromium-browser") + MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000)) + + def __init__(self, headless=True): + self.user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ] + self.timeout = 45000 # 45 seconds + self.retry_count = int(os.getenv("RETRY_COUNT", 2)) + self.headless = headless + self.mouse_velocity_range = (100, 500) # px/ms + self.interaction_delays = { + 'scroll': (int(os.getenv("SCROLL_DELAY_MIN", 500)), int(os.getenv("SCROLL_DELAY_MAX", 2000))), + 'click': (int(os.getenv("CLICK_DELAY_MIN", 100)), int(os.getenv("CLICK_DELAY_MAX", 300))), + 'movement': (int(os.getenv("MOVEMENT_DELAY_MIN", 50)), int(os.getenv("MOVEMENT_DELAY_MAX", 200))) + } + self.proxy_servers = [server.strip() for server in os.getenv( + "PROXY_SERVERS", "").split(',') if server.strip()] + + async def human_interaction(self, page): + """Advanced simulation of user behavior""" + # Random mouse movement path + for _ in range(random.randint(2, 5)): + x = random.randint(0, 1366) + y = random.randint(0, 768) + await page.mouse.move(x, y, steps=random.randint(5, 20)) + await page.wait_for_timeout(random.randint(*self.interaction_delays['movement'])) + + # Simulating typing + if random.random() < 0.3: + await page.keyboard.press('Tab') + await page.keyboard.type(' ', delay=random.randint(50, 200)) + + # More realistic scrolling + scroll_distance = random.choice([300, 600, 900]) + await page.mouse.wheel(0, scroll_distance) + await page.wait_for_timeout(random.randint(*self.interaction_delays['scroll'])) + + async def load_page(self, page, url): + """Smarter page loading with dynamic waiting""" + start_time = time.time() + try: + await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout) + + # Smarter content extraction selectors + selectors = [ + 'main article', + '#main-content', + 'section:first-of-type', + 'div[class*="content"]', + 'body' # Fallback + ] + + for selector in selectors: + try: + element = await page.query_selector(selector) + if element: + return True + except Exception: + continue + + # Fallback if no selector is found within a certain time + if time.time() - start_time < 30: # If we haven't used the full timeout + await 
page.wait_for_timeout(30000 - int((time.time() - start_time) * 1000))
+
+            return True  # Page likely loaded
+        except Exception as e:
+            logging.error(f"Error loading page {url}: {e}")
+            return False
+
+    @SCRAPE_DURATION.time()
+    async def scrape_with_retry(self):
+        """Main function with retry mechanism and browser reuse"""
+        SCRAPE_ATTEMPTS.inc()
+        last_error = None
+        browser = None
+        context = None
+        page = None
+
+        try:
+            async with async_playwright() as p:
+                launch_args = {
+                    "headless": self.headless,
+                    "args": [
+                        "--disable-blink-features=AutomationControlled",
+                        "--single-process",
+                        "--no-sandbox",
+                        f"--user-agent={random.choice(self.user_agents)}"
+                    ],
+                    "executable_path": self.BROWSER_EXECUTABLE
+                }
+                if self.proxy_servers:
+                    proxy_url = random.choice(self.proxy_servers)
+                    proxy_config = {"server": proxy_url}
+                    proxy_username = os.getenv('PROXY_USER')
+                    proxy_password = os.getenv('PROXY_PASS')
+                    if proxy_username and proxy_password:
+                        proxy_config['username'] = proxy_username
+                        proxy_config['password'] = proxy_password
+                    launch_args['proxy'] = proxy_config
+
+                browser = await p.chromium.launch(**launch_args)
+                context = await browser.new_context(
+                    user_agent=random.choice(self.user_agents),
+                    viewport={"width": 1366, "height": 768},
+                    locale=os.getenv("BROWSER_LOCALE", "en-US")
+                )
+                await context.route("**/*", lambda route: route.continue_())
+                page = await context.new_page()
+                await page.add_init_script("""
+                    Object.defineProperty(navigator, 'webdriver', { get: () => false });
+                    window.navigator.chrome = { runtime: {}, app: { isInstalled: false } };
+                """)
+
+                for attempt in range(self.retry_count):
+                    try:
+                        logging.info(
+                            f"Attempt {attempt + 1}: Loading OpenAI...")
+                        if not await self.load_page(page, "https://openai.com"):
+                            raise ScrapingError(
+                                "Failed to load key content on OpenAI website.")
+                        await self.human_interaction(page)
+                        await page.screenshot(path=f"openai_debug_{attempt}.png")
+                        content = await page.evaluate("""() => {
+                            const selectors = [
+                                'main article',
+                                '#main-content',
+                                'section:first-of-type',
+                                'div[class*="content"]'
+                            ];
+
+                            let content = '';
+                            for (const selector of selectors) {
+                                const element = document.querySelector(selector);
+                                if (element) {
+                                    content += element.innerText + '\\n\\n';
+                                }
+                            }
+                            return content.trim() || document.body.innerText;
+                        }""")
+                        if not content.strip():
+                            raise ContentAnalysisError(
+                                "No content extracted from the page.")
+                        return content[:self.MAX_CONTENT_LENGTH]
+
+                    except (ScrapingError, ContentAnalysisError) as e:
+                        last_error = e
+                        logging.warning(
+                            f"Attempt {attempt + 1} failed: {str(e)}")
+                        if attempt < self.retry_count - 1:
+                            await asyncio.sleep(5)
+                        else:
+                            if browser:
+                                await browser.close()
+                            browser = None
+                            raise
+                    except Exception as e:
+                        last_error = e
+                        logging.exception(
+                            f"Unexpected error on attempt {attempt + 1}: {str(e)}")
+                        if attempt < self.retry_count - 1:
+                            await asyncio.sleep(5)
+                        else:
+                            if browser:
+                                await browser.close()
+                            browser = None
+                            raise
+
+        except Exception as e:
+            last_error = e
+        finally:
+            if browser:
+                await browser.close()
+
+        raise last_error if last_error else Exception(
+            "All scraping attempts failed.")
+
+    async def get_cached_content(self):
+        key = 'openai_content_cache_key'
+        content = cache.get(key)
+        if content is None:
+            content = await self.scrape_with_retry()
+            cache.set(key, content, expire=int(
+                os.getenv("CACHE_EXPIRY", 3600)))
+        return content
+
+
+async def analyze_content(headless=True):
+    try:
+        scraper = EnhancedOpenAIScraper(headless=headless)
+        content = await 
scraper.get_cached_content()
+
+        client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
+        if not client.api_key:
+            raise ContentAnalysisError(
+                "OpenAI API key not configured (check environment variables).")
+
+        prompt_template = """
+    Analyze the following website content and extract the following information if present:
+
+    1. **Overall Summary of the Website:** Provide a concise overview of the website's purpose and the main topics discussed.
+    2. **Key Individuals or Entities:** Identify and briefly describe any prominent individuals, companies, or organizations mentioned.
+    3. **Recent Announcements or Updates:** List any recent announcements, news, or updates found on the website, including dates if available.
+    4. **Main Topics or Themes:** Identify the primary subjects or themes explored on the website.
+    5. **Any Noteworthy Features or Projects:** Highlight any significant features, projects, or initiatives mentioned.
+
+    Format the output clearly under each of these headings. If a particular piece of information is not found, indicate that it is not present.
+
+    Content:
+    {content}
+    """
+
+        formatted_prompt = prompt_template.format(content=content)
+        model_name = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
+        temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
+        max_tokens = int(os.getenv("MAX_TOKENS", 1500))
+        top_p = float(os.getenv("MODEL_TOP_P", 0.9))
+
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that analyzes website content and extracts key information in a structured format."},
+                {"role": "user", "content": formatted_prompt}
+            ],
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=top_p
+        )
+
+        if not response.choices:
+            raise ContentAnalysisError("Empty response from GPT.")
+
+        return response.choices[0].message.content
+
+    except (ScrapingError, ContentAnalysisError) as e:
+        logging.error(f"Analysis failed: {str(e)}")
+        return f"Critical analysis error: {str(e)}"
+    except Exception as e:
+        logging.exception("Unexpected error during analysis.")
+        return f"Unexpected analysis error: {str(e)}"
+
+
+async def main():
+    logging.basicConfig(
+        level=os.getenv("LOG_LEVEL", "INFO").upper(),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+
+    # Start Prometheus HTTP server for exposing metrics
+    try:
+        prometheus_port = int(os.getenv("PROMETHEUS_PORT", 8000))
+        start_http_server(prometheus_port)
+        logging.info(
+            f"Prometheus metrics server started on port {prometheus_port}")
+    except Exception as e:
+        logging.warning(f"Failed to start Prometheus metrics server: {e}")
+
+    start_time = time.time()
+    result = await analyze_content(headless=True)
+    end_time = time.time()
+
+    print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds\n")
+    print(result)
+
+if __name__ == "__main__":
+    asyncio.run(main())

From df5dbe0aa9106bff0b1f4eb7d999488573c8c9c4 Mon Sep 17 00:00:00 2001
From: lakovicb <>
Date: Wed, 23 Apr 2025 16:41:30 +0200
Subject: [PATCH 2/5] Added formatted markdown-only notebook for Playwright
 scraper output

---
 ...aywright_Solution_Showcase_Formatted.ipynb | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 community-contributions/playwright-bojan/Playwright_Solution_Showcase_Formatted.ipynb

diff --git a/community-contributions/playwright-bojan/Playwright_Solution_Showcase_Formatted.ipynb b/community-contributions/playwright-bojan/Playwright_Solution_Showcase_Formatted.ipynb
new file mode 100644
index 0000000..b2fabd0
--- 
/dev/null +++ b/community-contributions/playwright-bojan/Playwright_Solution_Showcase_Formatted.ipynb @@ -0,0 +1,69 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3df9df94", + "metadata": {}, + "source": [ + "# πŸ§ͺ Playwright Scraper Output (Formatted)\n", + "\n", + "---\n", + "\n", + "## 🧭 1. **Overall Summary of the Website**\n", + "\n", + "*The website appears to be focused on showcasing various applications and updates related to OpenAI's technology, specifically ChatGPT and other AI tools. It provides information on product releases, company updates, and educational content on how to use AI technologies in different scenarios such as planning trips, learning games, coding, and more.*\n", + "\n", + "---\n", + "\n", + "## πŸ§‘β€πŸ’Ό 2. **Key Individuals or Entities**\n", + "\n", + "- **OpenAI** β€” Company behind the technologies and updates discussed on the website \n", + "- **Lyndon Barrois & Sora** β€” Featured in a story, possibly highlighting user experiences or contributions\n", + "\n", + "---\n", + "\n", + "## πŸ“° 3. **Recent Announcements or Updates**\n", + "\n", + "- πŸ“’ **Introducing GPT-4.1 in the API** β€” *(no date provided)*\n", + "- πŸ–ΌοΈ **Introducing 4o Image Generation** β€” *(no date provided)*\n", + "- 🐟 **Catching halibut with ChatGPT** β€” *(no date provided)*\n", + "- 🧠 **Thinking with images** β€” *Apr 16, 2025*\n", + "- πŸ§‘β€βš–οΈ **Nonprofit commission advisors announced** β€” *Apr 15, 2025*\n", + "- βš™οΈ **Updated Preparedness Framework** β€” *Apr 15, 2025*\n", + "- 🌐 **BrowseComp benchmark for browsing agents** β€” *Apr 10, 2025*\n", + "- πŸš€ **OpenAI Pioneers Program launched** β€” *Apr 9, 2025*\n", + "- πŸ“Š **PaperBench research benchmark published** β€” *Apr 2, 2025*\n", + "\n", + "---\n", + "\n", + "## πŸ“š 4. **Main Topics or Themes**\n", + "\n", + "- πŸ€– **AI Technology Applications** β€” Using AI for tasks like planning, learning, and troubleshooting \n", + "- 🧩 **Product and Feature Releases** β€” Updates on new capabilities \n", + "- πŸ“˜ **Educational Content** β€” Guides for using AI effectively \n", + "- πŸ§ͺ **Research and Development** β€” Publications and technical benchmarks\n", + "\n", + "---\n", + "\n", + "## ⭐ 5. 
**Noteworthy Features or Projects**\n", + "\n", + "- βœ… **GPT-4.1** β€” A new API-accessible version of the language model \n", + "- πŸ–ΌοΈ **4o Image Generation** β€” Feature focused on AI-generated images \n", + "- πŸš€ **OpenAI Pioneers Program** β€” Initiative likely fostering innovation in AI \n", + "- πŸ“Š **BrowseComp & PaperBench** β€” Benchmarks for evaluating AI agents\n", + "\n", + "---\n", + "\n", + "βœ… *If you're reading this inside Jupyter and seeing clean structure β€” your async notebook setup is working beautifully.*\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From feb4a2e772e394c5e2c985fae6fd7a4281dc87ab Mon Sep 17 00:00:00 2001 From: lakovicb <> Date: Wed, 23 Apr 2025 16:48:32 +0200 Subject: [PATCH 3/5] Added requirements.txt for scraper dependencies --- community-contributions/playwright-bojan/requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 community-contributions/playwright-bojan/requirements.txt diff --git a/community-contributions/playwright-bojan/requirements.txt b/community-contributions/playwright-bojan/requirements.txt new file mode 100644 index 0000000..50498f7 --- /dev/null +++ b/community-contributions/playwright-bojan/requirements.txt @@ -0,0 +1,6 @@ +playwright>=1.43.0 +openai>=1.14.2 +prometheus-client>=0.19.0 +diskcache>=5.6.1 +python-dotenv>=1.0.1 +nest_asyncio>=1.6.0 From 1a7f4e86b0d3a55b36d2754d1df8cb77dd505208 Mon Sep 17 00:00:00 2001 From: lakovicb <> Date: Wed, 23 Apr 2025 16:53:26 +0200 Subject: [PATCH 4/5] Added detailed README for Playwright-based scraper contribution --- .../playwright-bojan/README.md | 99 ++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/community-contributions/playwright-bojan/README.md b/community-contributions/playwright-bojan/README.md index f24b91c..314b468 100644 --- a/community-contributions/playwright-bojan/README.md +++ b/community-contributions/playwright-bojan/README.md @@ -1,56 +1,67 @@ -# 🧠 Playwright-Based Web Scraper for openai.com -### πŸ“š Community Contribution for Ed Donner's "LLM Engineering: Master AI" Course +# 🧠 Community Contribution: Async Playwright-based OpenAI Scraper -> _β€œAn extra exercise for those who enjoy web scraping... -> In the community-contributions folder, you'll find an example Selenium solution from a student.”_ +This contribution presents a fully asynchronous, headless-browser-based scraper for [https://openai.com](https://openai.com) using **Playwright** β€” an alternative to Selenium. + +Developed by: [lakovicb](https://github.com/lakovicb) +IDE used: WingIDE Pro (Jupyter compatibility via `nest_asyncio`) --- -## πŸ” About This Project +## πŸ“¦ Features -This is a response to Ed Donner’s bonus exercise to scrape `https://openai.com`, which uses dynamic JavaScript rendering. -A fellow student contributed a Selenium-based solution β€” this one goes a step further with **Playwright**. - ---- - -## πŸ†š Why Playwright Over Selenium? 
- -| Feature | Selenium | Playwright πŸ† | -|----------------------|------------------------------|-----------------------------| -| **Installation** | More complex setup | Minimal + faster setup | -| **Speed** | Slower due to architecture | Faster execution (async) | -| **Multi-browser** | Requires config | Built-in Chrome, Firefox, WebKit support | -| **Headless mode** | Less stable | Super stable | -| **Async-friendly** | Not built-in | Native support via asyncio | -| **Interaction APIs** | Limited | Richer simulation (mouse, scroll, etc.) | - ---- - -## βš™οΈ Features - -- βœ… **Full JavaScript rendering** using Chromium -- βœ… **Human-like behavior simulation** (mouse movement, scrolling, typing) -- βœ… **Caching** with `diskcache` -- βœ… **Prometheus metrics** -- βœ… **Asynchronous scraping logic** -- βœ… **Content summarization via OpenAI GPT API** - ---- - -## 🧠 Why not in JupyterLab? - -Due to the async nature of Playwright and the use of `asyncio.run()`, running this inside Jupyter causes `RuntimeError` conflicts. - -This solution was developed and tested in: - -- πŸ’» WingIDE 10 Pro -- 🐧 Ubuntu via WSL -- 🐍 Conda environment with Anaconda Python 3.12 +- 🧭 Simulates human-like interactions (mouse movement, scrolling) +- 🧠 GPT-based analysis using OpenAI's API +- πŸ§ͺ Works inside **JupyterLab** using `nest_asyncio` +- πŸ“Š Prometheus metrics for scraping observability +- ⚑ Smart content caching via `diskcache` --- ## πŸš€ How to Run -1. Install dependencies: +### 1. Install dependencies + ```bash pip install -r requirements.txt +``` + +> Ensure [Playwright is installed & browsers are downloaded](https://playwright.dev/python/docs/intro) + +```bash +playwright install +``` + +### 2. Set environment variables in `.env` + +```env +OPENAI_API_KEY=your_openai_key +BROWSER_PATH=/usr/bin/chromium-browser +``` + +You can also define optional proxy/login params if needed. + +--- + +## πŸ“˜ Notebooks Included + +| Notebook | Description | +|----------|-------------| +| `Playwright_Solution_JupyterAsync.ipynb` | Executes async scraper directly inside Jupyter | +| `Playwright_Solution_Showcase_Formatted.ipynb` | Nicely formatted output for human reading | + +--- + +## πŸ” Output Example + +- GPT-generated summary +- Timeline of updates +- Entities and projects mentioned +- Structured topics & themes + +βœ… *Can be extended with PDF export, LangChain pipeline, or vector store ingestion.* + +--- + +## πŸ™ Thanks + +Huge thanks to Ed Donner for the amazing course and challenge inspiration! 
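+
+---
+
+## 🐍 Minimal Usage Sketch
+
+A quick smoke test from a plain Python shell (not Jupyter). This is a sketch that assumes `openai_scraper_playwright.py` is importable from the current directory and `.env` is configured as above:
+
+```python
+import asyncio
+from openai_scraper_playwright import analyze_content
+
+# Scrapes the target page (or reuses the diskcache entry) and prints the GPT analysis
+print(asyncio.run(analyze_content(headless=True)))
+```
+
+Inside JupyterLab, call `nest_asyncio.apply()` first, as shown in `Playwright_Solution_JupyterAsync.ipynb`.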
From 6ea90801bd21f6dbb30e0309a876403cc6bb9fd6 Mon Sep 17 00:00:00 2001 From: lakovicb <> Date: Thu, 24 Apr 2025 15:37:07 +0200 Subject: [PATCH 5/5] Final adjustments and preparation for Ed's review --- .../Playwright_Solution_JupyterAsync.ipynb | 173 +++++++++++ .../openai_scraper_playwright.py | 281 ++++-------------- 2 files changed, 234 insertions(+), 220 deletions(-) create mode 100644 community-contributions/playwright-bojan/Playwright_Solution_JupyterAsync.ipynb diff --git a/community-contributions/playwright-bojan/Playwright_Solution_JupyterAsync.ipynb b/community-contributions/playwright-bojan/Playwright_Solution_JupyterAsync.ipynb new file mode 100644 index 0000000..c24277c --- /dev/null +++ b/community-contributions/playwright-bojan/Playwright_Solution_JupyterAsync.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aa629e55-8f41-41ab-b319-b55dd1cfc76b", + "metadata": {}, + "source": [ + "# Playwright Scraper Showcase (Async in Jupyter)\n", + "\n", + "This notebook demonstrates how to run async Playwright-based scraping code inside JupyterLab using `nest_asyncio`.\n", + "\n", + "**Note:** Requires `openai_scraper_playwright.py` to be in the same directory." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "97469777", + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "import asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6254fa89", + "metadata": {}, + "outputs": [], + "source": [ + "from openai_scraper_playwright import EnhancedOpenAIScraper, analyze_content" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "33d2737b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### 1. Overall Summary of the Website:\n", + "The website appears to be a hub for various applications of AI technology, particularly focusing on the capabilities of ChatGPT and other AI models developed by OpenAI. It offers a range of services from answering queries, assisting in planning trips, explaining technical topics, helping with language translation, and providing educational content. The site also features updates on new AI models, research publications, and business solutions integrating AI.\n", + "\n", + "### 2. Key Individuals or Entities:\n", + "- **OpenAI**: Mentioned as the organization behind the development of AI models and technologies such as ChatGPT, GPT-4.1, and image generation models. OpenAI seems to be focused on advancing and applying AI in various fields.\n", + "- **Lyndon Barrois & Sora**: Featured in a story, possibly highlighting individual experiences or contributions within the OpenAI ecosystem.\n", + "\n", + "### 3. Recent Announcements or Updates:\n", + "- **Introducing our latest image generation model in the API** (Product, Apr 23, 2025)\n", + "- **Thinking with images** (Release, Apr 16, 2025)\n", + "- **OpenAI announces nonprofit commission advisors** (Company, Apr 15, 2025)\n", + "- **Our updated Preparedness Framework** (Publication, Apr 15, 2025)\n", + "- **BrowseComp: a benchmark for browsing agents** (Publication, Apr 10, 2025)\n", + "- **OpenAI Pioneers Program** (Company, Apr 9, 2025)\n", + "\n", + "### 4. 
Main Topics or Themes:\n", + "- **AI Model Development and Application**: Discusses various AI models like ChatGPT, GPT-4.1, and image generation models.\n", + "- **Educational and Practical AI Uses**: Offers help in educational topics, practical tasks, and creative endeavors using AI.\n", + "- **Business Integration**: Focuses on integrating AI into business processes, automating tasks in finance, legal, and other sectors.\n", + "- **Research and Publications**: Shares updates on the latest research and publications related to AI technology.\n", + "\n", + "### 5. Any Noteworthy Features or Projects:\n", + "- **GPT-4.1 and Image Generation Models**: Introduction of new and advanced AI models for text and image processing.\n", + "- **OpenAI Pioneers Program**: A significant initiative likely aimed at fostering innovation and practical applications of AI technology.\n", + "- **BrowseComp and PaperBench**: Research projects or benchmarks designed to evaluate and improve AI capabilities in specific domains.\n" + ] + } + ], + "source": [ + "result = asyncio.run(analyze_content())\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "d7450ccf", + "metadata": {}, + "source": [ + "βœ… If you see structured analysis above, the async code ran successfully in Jupyter!" + ] + }, + { + "cell_type": "markdown", + "id": "9a46716c-6f77-4b2b-b423-cc9fe05014da", + "metadata": {}, + "source": [ + "# πŸ§ͺ Playwright Scraper Output (Formatted)\n", + "\n", + "---\n", + "\n", + "## 🧭 1. **Overall Summary of the Website**\n", + "\n", + "*The website appears to be focused on showcasing various applications and updates related to OpenAI's technology, specifically ChatGPT and other AI tools. It provides information on product releases, company updates, and educational content on how to use AI technologies in different scenarios such as planning trips, learning games, coding, and more.*\n", + "\n", + "---\n", + "\n", + "## πŸ§‘β€πŸ’Ό 2. **Key Individuals or Entities**\n", + "\n", + "- **OpenAI** β€” Company behind the technologies and updates discussed on the website \n", + "- **Lyndon Barrois & Sora** β€” Featured in a story, possibly highlighting user experiences or contributions\n", + "\n", + "---\n", + "\n", + "## πŸ“° 3. **Recent Announcements or Updates**\n", + "\n", + "- πŸ“’ **Introducing GPT-4.1 in the API** β€” *(no date provided)*\n", + "- πŸ–ΌοΈ **Introducing 4o Image Generation** β€” *(no date provided)*\n", + "- 🐟 **Catching halibut with ChatGPT** β€” *(no date provided)*\n", + "- 🧠 **Thinking with images** β€” *Apr 16, 2025*\n", + "- πŸ§‘β€βš–οΈ **Nonprofit commission advisors announced** β€” *Apr 15, 2025*\n", + "- βš™οΈ **Updated Preparedness Framework** β€” *Apr 15, 2025*\n", + "- 🌐 **BrowseComp benchmark for browsing agents** β€” *Apr 10, 2025*\n", + "- πŸš€ **OpenAI Pioneers Program launched** β€” *Apr 9, 2025*\n", + "- πŸ“Š **PaperBench research benchmark published** β€” *Apr 2, 2025*\n", + "\n", + "---\n", + "\n", + "## πŸ“š 4. **Main Topics or Themes**\n", + "\n", + "- πŸ€– **AI Technology Applications** β€” Using AI for tasks like planning, learning, and troubleshooting \n", + "- 🧩 **Product and Feature Releases** β€” Updates on new capabilities \n", + "- πŸ“˜ **Educational Content** β€” Guides for using AI effectively \n", + "- πŸ§ͺ **Research and Development** β€” Publications and technical benchmarks\n", + "\n", + "---\n", + "\n", + "## ⭐ 5. 
**Noteworthy Features or Projects**\n", + "\n", + "- βœ… **GPT-4.1** β€” A new API-accessible version of the language model \n", + "- πŸ–ΌοΈ **4o Image Generation** β€” Feature focused on AI-generated images \n", + "- πŸš€ **OpenAI Pioneers Program** β€” Initiative likely fostering innovation in AI \n", + "- πŸ“Š **BrowseComp & PaperBench** β€” Benchmarks for evaluating AI agents\n", + "\n", + "---\n", + "\n", + "βœ… *If you're reading this inside Jupyter and seeing clean structure β€” your async notebook setup is working beautifully.*\n" + ] + }, + { + "cell_type": "markdown", + "id": "95c38374-5daa-487c-8bd9-919bb4037ea3", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/community-contributions/playwright-bojan/openai_scraper_playwright.py b/community-contributions/playwright-bojan/openai_scraper_playwright.py index 7eac886..d63b041 100644 --- a/community-contributions/playwright-bojan/openai_scraper_playwright.py +++ b/community-contributions/playwright-bojan/openai_scraper_playwright.py @@ -1,3 +1,5 @@ +# openai_scraper_playwright.py + import asyncio from playwright.async_api import async_playwright from openai import OpenAI @@ -11,290 +13,129 @@ from dotenv import load_dotenv load_dotenv() -# Setting up Prometheus metrics SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts') -SCRAPE_DURATION = Histogram( - 'scrape_duration', 'Scraping duration distribution') - -# Setting up cache +SCRAPE_DURATION = Histogram('scrape_duration', 'Scraping duration distribution') cache = Cache('./scraper_cache') - -class ScrapingError(Exception): - pass - - -class ContentAnalysisError(Exception): - pass - +class ScrapingError(Exception): pass +class ContentAnalysisError(Exception): pass class EnhancedOpenAIScraper: API_KEY = os.getenv("OPENAI_API_KEY") - BROWSER_EXECUTABLE = os.getenv( - "BROWSER_PATH", "/usr/bin/chromium-browser") + BROWSER_EXECUTABLE = os.getenv("BROWSER_PATH", "/usr/bin/chromium-browser") MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000)) def __init__(self, headless=True): self.user_agents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..." 
] - self.timeout = 45000 # 45 seconds + self.timeout = 45000 self.retry_count = int(os.getenv("RETRY_COUNT", 2)) self.headless = headless - self.mouse_velocity_range = (100, 500) # px/ms - self.interaction_delays = { - 'scroll': (int(os.getenv("SCROLL_DELAY_MIN", 500)), int(os.getenv("SCROLL_DELAY_MAX", 2000))), - 'click': (int(os.getenv("CLICK_DELAY_MIN", 100)), int(os.getenv("CLICK_DELAY_MAX", 300))), - 'movement': (int(os.getenv("MOVEMENT_DELAY_MIN", 50)), int(os.getenv("MOVEMENT_DELAY_MAX", 200))) - } - self.proxy_servers = [server.strip() for server in os.getenv( - "PROXY_SERVERS", "").split(',') if server.strip()] + self.proxy_servers = [x.strip() for x in os.getenv("PROXY_SERVERS", "").split(',') if x.strip()] async def human_interaction(self, page): - """Advanced simulation of user behavior""" - # Random mouse movement path for _ in range(random.randint(2, 5)): - x = random.randint(0, 1366) - y = random.randint(0, 768) + x, y = random.randint(0, 1366), random.randint(0, 768) await page.mouse.move(x, y, steps=random.randint(5, 20)) - await page.wait_for_timeout(random.randint(*self.interaction_delays['movement'])) + await page.wait_for_timeout(random.randint(50, 200)) - # Simulating typing if random.random() < 0.3: await page.keyboard.press('Tab') await page.keyboard.type(' ', delay=random.randint(50, 200)) - # More realistic scrolling - scroll_distance = random.choice([300, 600, 900]) - await page.mouse.wheel(0, scroll_distance) - await page.wait_for_timeout(random.randint(*self.interaction_delays['scroll'])) + await page.mouse.wheel(0, random.choice([300, 600, 900])) + await page.wait_for_timeout(random.randint(500, 2000)) async def load_page(self, page, url): - """Smarter page loading with dynamic waiting""" - start_time = time.time() try: await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout) - - # Smarter content extraction selectors - selectors = [ - 'main article', - '#main-content', - 'section:first-of-type', - 'div[class*="content"]', - 'body' # Fallback - ] - + selectors = ['main article', '#main-content', 'section:first-of-type', 'div[class*="content"]', 'body'] for selector in selectors: - try: - element = await page.query_selector(selector) - if element: - return True - except Exception: - continue - - # Fallback if no selector is found within a certain time - if time.time() - start_time < 30: # If we haven't used the full timeout - await page.wait_for_timeout(30000 - int(time.time() - start_time)) - - return True # Page likely loaded + if await page.query_selector(selector): + return True + await page.wait_for_timeout(5000) + return True except Exception as e: logging.error(f"Error loading page {url}: {e}") return False @SCRAPE_DURATION.time() - async def scrape_with_retry(self): - """Main function with retry mechanism and browser reuse""" + async def scrape_with_retry(self, url): SCRAPE_ATTEMPTS.inc() last_error = None - browser = None - context = None - page = None - try: async with async_playwright() as p: - launch_args = { + args = { "headless": self.headless, - "args": [ - "--disable-blink-features=AutomationControlled", - "--single-process", - "--no-sandbox", - f"--user-agent={random.choice(self.user_agents)}" - ], + "args": ["--disable-blink-features=AutomationControlled", "--no-sandbox"], "executable_path": self.BROWSER_EXECUTABLE } - if self.proxy_servers: - proxy_url = random.choice(self.proxy_servers) - proxy_config = {"server": proxy_url} - proxy_username = os.getenv('PROXY_USER') - proxy_password = os.getenv('PROXY_PASS') - if 
proxy_username and proxy_password:
-                        proxy_config['username'] = proxy_username
-                        proxy_config['password'] = proxy_password
-                    launch_args['proxy'] = proxy_config
-
-                browser = await p.chromium.launch(**launch_args)
-                context = await browser.new_context(
-                    user_agent=random.choice(self.user_agents),
-                    viewport={"width": 1366, "height": 768},
-                    locale=os.getenv("BROWSER_LOCALE", "en-US")
-                )
-                await context.route("**/*", lambda route: route.continue_())
+                browser = await p.chromium.launch(**args)
+                context = await browser.new_context(user_agent=random.choice(self.user_agents))
                 page = await context.new_page()
                 await page.add_init_script("""
                     Object.defineProperty(navigator, 'webdriver', { get: () => false });
-                    window.navigator.chrome = { runtime: {}, app: { isInstalled: false } };
                 """)

                 for attempt in range(self.retry_count):
                     try:
-                        logging.info(
-                            f"Attempt {attempt + 1}: Loading OpenAI...")
-                        if not await self.load_page(page, "https://openai.com"):
-                            raise ScrapingError(
-                                "Failed to load key content on OpenAI website.")
+                        if not await self.load_page(page, url):
+                            raise ScrapingError("Failed to load page")
                         await self.human_interaction(page)
-                        await page.screenshot(path=f"openai_debug_{attempt}.png")
-                        content = await page.evaluate("""() => {
-                            const selectors = [
-                                'main article',
-                                '#main-content',
-                                'section:first-of-type',
-                                'div[class*="content"]'
-                            ];
-
-                            let content = '';
-                            for (const selector of selectors) {
-                                const element = document.querySelector(selector);
-                                if (element) {
-                                    content += element.innerText + '\\n\\n';
-                                }
-                            }
-                            return content.trim() || document.body.innerText;
-                        }""")
+                        content = await page.evaluate("""() => document.body.innerText""")
                         if not content.strip():
-                            raise ContentAnalysisError(
-                                "No content extracted from the page.")
+                            raise ContentAnalysisError("No content extracted")
+                        await browser.close()
                         return content[:self.MAX_CONTENT_LENGTH]
-
-                    except (ScrapingError, ContentAnalysisError) as e:
-                        last_error = e
-                        logging.warning(
-                            f"Attempt {attempt + 1} failed: {str(e)}")
-                        if attempt < self.retry_count - 1:
-                            await asyncio.sleep(5)
-                        else:
-                            if browser:
-                                await browser.close()
-                            browser = None
-                            raise
                     except Exception as e:
                         last_error = e
-                        logging.exception(
-                            f"Unexpected error on attempt {attempt + 1}: {str(e)}")
                         if attempt < self.retry_count - 1:
                             await asyncio.sleep(5)
                         else:
-                            if browser:
-                                await browser.close()
-                            browser = None
+                            await browser.close()
                             raise

-        except Exception as e:
-            last_error = e
-        finally:
-            if browser:
-                await browser.close()
-
-        raise last_error if last_error else Exception(
-            "All scraping attempts failed.")
+        except Exception as e:
+            last_error = last_error or e
+
+        raise last_error or ScrapingError("All scraping attempts failed")

-    async def get_cached_content(self):
-        key = 'openai_content_cache_key'
+    async def get_cached_content(self, url):
+        key = 'cache_' + url.replace('https://', '').replace('/', '_')
         content = cache.get(key)
         if content is None:
-            content = await self.scrape_with_retry()
-            cache.set(key, content, expire=int(
-                os.getenv("CACHE_EXPIRY", 3600)))
+            content = await self.scrape_with_retry(url)
+            cache.set(key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600)))
         return content

-
-async def analyze_content(headless=True):
-    try:
-        scraper = EnhancedOpenAIScraper(headless=headless)
-        content = await scraper.get_cached_content()
-
-        client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
-        if not client.api_key:
-            raise ContentAnalysisError(
-                "OpenAI API key not configured (check environment variables).")
-
-        prompt_template = """
-    Analyze the following website content and extract the following information if present:
-
-    1. **Overall Summary of the Website:** Provide a concise overview of the website's purpose and the main topics discussed.
-    2. **Key Individuals or Entities:** Identify and briefly describe any prominent individuals, companies, or organizations mentioned.
-    3. **Recent Announcements or Updates:** List any recent announcements, news, or updates found on the website, including dates if available.
-    4. **Main Topics or Themes:** Identify the primary subjects or themes explored on the website.
-    5. **Any Noteworthy Features or Projects:** Highlight any significant features, projects, or initiatives mentioned.
-
-    Format the output clearly under each of these headings. If a particular piece of information is not found, indicate that it is not present.
-
-    Content:
-    {content}
-    """
-
-        formatted_prompt = prompt_template.format(content=content)
-        model_name = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
-        temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
-        max_tokens = int(os.getenv("MAX_TOKENS", 1500))
-        top_p = float(os.getenv("MODEL_TOP_P", 0.9))
-
-        response = client.chat.completions.create(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant that analyzes website content and extracts key information in a structured format."},
-                {"role": "user", "content": formatted_prompt}
-            ],
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p
-        )
-
-        if not response.choices:
-            raise ContentAnalysisError("Empty response from GPT.")
-
-        return response.choices[0].message.content
-
-    except (ScrapingError, ContentAnalysisError) as e:
-        logging.error(f"Analysis failed: {str(e)}")
-        return f"Critical analysis error: {str(e)}"
-    except Exception as e:
-        logging.exception("Unexpected error during analysis.")
-        return f"Unexpected analysis error: {str(e)}"
-
-
-async def main():
-    logging.basicConfig(
-        level=os.getenv("LOG_LEVEL", "INFO").upper(),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
-    # Start Prometheus HTTP server for exposing metrics
-    try:
-        prometheus_port = int(os.getenv("PROMETHEUS_PORT", 8000))
-        start_http_server(prometheus_port)
-        logging.info(
-            f"Prometheus metrics server started on port {prometheus_port}")
-    except Exception as e:
-        logging.warning(f"Failed to start Prometheus metrics server: {e}")
-
-    start_time = time.time()
-    result = await analyze_content(headless=True)
-    end_time = time.time()
-
-    print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds\n")
-    print(result)
-
-if __name__ == "__main__":
-    asyncio.run(main())
+
+async def analyze_content(url="https://openai.com", headless=True):
+    scraper = EnhancedOpenAIScraper(headless=headless)
+    content = await scraper.get_cached_content(url)
+    client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
+    if not client.api_key:
+        raise ContentAnalysisError("OpenAI API key not configured")
+
+    prompt = f"""
+Analyze this page:
+
+{content}
+    """
+    model = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
+    temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
+    max_tokens = int(os.getenv("MAX_TOKENS", 1500))
+    top_p = float(os.getenv("MODEL_TOP_P", 0.9))
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": "You are a content analyst."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=temperature,
+        max_tokens=max_tokens,
+        top_p=top_p
+    )
+
+    if not response.choices:
+        raise ContentAnalysisError("Empty response from GPT")
+
+    return response.choices[0].message.content
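+
+
+if __name__ == "__main__":
+    # Optional standalone entry point, a sketch rather than part of the original
+    # patch: the committed workflow drives analyze_content() from the notebooks
+    # via nest_asyncio, but from a plain shell asyncio.run() works directly.
+    # analyze_content() defaults to url="https://openai.com" and headless=True.
+    print(asyncio.run(analyze_content()))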