Added Playwright-based scraper solution to community-contributions
community-contributions/playwright-bojan/README.md (new file, 56 lines)
@@ -0,0 +1,56 @@

# 🧠 Playwright-Based Web Scraper for openai.com

### 📚 Community Contribution for Ed Donner's "LLM Engineering: Master AI" Course

> _“An extra exercise for those who enjoy web scraping... In the community-contributions folder, you'll find an example Selenium solution from a student.”_

---

## 🔍 About This Project

This is a response to Ed Donner's bonus exercise to scrape `https://openai.com`, which uses dynamic JavaScript rendering.

A fellow student contributed a Selenium-based solution — this one goes a step further with **Playwright**.

---
## 🆚 Why Playwright Over Selenium?

| Feature              | Selenium                     | Playwright 🏆                              |
|----------------------|------------------------------|--------------------------------------------|
| **Installation**     | More complex setup           | Minimal, faster setup                      |
| **Speed**            | Slower due to architecture   | Faster execution (async)                   |
| **Multi-browser**    | Requires extra configuration | Built-in Chromium, Firefox, WebKit support |
| **Headless mode**    | Less stable                  | Very stable                                |
| **Async-friendly**   | Not built-in                 | Native support via asyncio                 |
| **Interaction APIs** | Limited                      | Richer simulation (mouse, scroll, etc.)    |

---
## ⚙️ Features

- ✅ **Full JavaScript rendering** using Chromium
- ✅ **Human-like behavior simulation** (mouse movement, scrolling, typing)
- ✅ **Caching** with `diskcache`
- ✅ **Prometheus metrics**
- ✅ **Asynchronous scraping logic**
- ✅ **Content summarization via OpenAI GPT API**

---
## 🧠 Why not in JupyterLab?

Due to the async nature of Playwright and the use of `asyncio.run()`, running this inside Jupyter raises a `RuntimeError`, because the notebook already has its own asyncio event loop running (a possible workaround is sketched after the list below).

This solution was developed and tested in:

- 💻 WingIDE 10 Pro
- 🐧 Ubuntu via WSL
- 🐍 Conda environment with Anaconda Python 3.12
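
If you still want to try it from a notebook, here is a minimal sketch (not part of this contribution; it assumes the third-party `nest_asyncio` package is installed and uses the `analyze_content` coroutine defined in the script further down):

```python
# Hypothetical notebook cell. Jupyter already runs an asyncio event loop, so a
# plain asyncio.run() fails with "RuntimeError: asyncio.run() cannot be called
# from a running event loop".
import asyncio

import nest_asyncio
nest_asyncio.apply()  # allow re-entrant use of the already-running event loop

result = asyncio.run(analyze_content(headless=True))  # coroutine from the script below
print(result)
```

Alternatively, IPython supports top-level `await`, so `result = await analyze_content(headless=True)` in a cell sidesteps `asyncio.run()` entirely.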

---

## 🚀 How to Run

1. Install dependencies:

```bash
pip install -r requirements.txt
```
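
All tunables are read from environment variables (loaded via `python-dotenv`), so a `.env` file next to the script is the easiest way to configure it. A minimal sketch with placeholder values (variable names are taken from the script below; only `OPENAI_API_KEY` is strictly required, the rest fall back to built-in defaults):

```bash
# Example .env (placeholder values)
OPENAI_API_KEY=sk-...                    # required for the GPT summarization step
BROWSER_PATH=/usr/bin/chromium-browser   # Chromium binary Playwright should launch
OPENAI_MODEL=gpt-4-turbo
MAX_CONTENT_LENGTH=30000
RETRY_COUNT=2
CACHE_EXPIRY=3600
PROMETHEUS_PORT=8000
# Optional proxy settings (comma-separated servers plus credentials)
# PROXY_SERVERS=http://proxy-a:8080,http://proxy-b:8080
# PROXY_USER=...
# PROXY_PASS=...
```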
@@ -0,0 +1,300 @@
import asyncio
from playwright.async_api import async_playwright
from openai import OpenAI
import logging
import random
import time
import os
from prometheus_client import start_http_server, Counter, Histogram
from diskcache import Cache
from dotenv import load_dotenv

load_dotenv()

# Setting up Prometheus metrics
SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts')
SCRAPE_DURATION = Histogram(
    'scrape_duration', 'Scraping duration distribution')

# Setting up cache
cache = Cache('./scraper_cache')


class ScrapingError(Exception):
    pass


class ContentAnalysisError(Exception):
    pass

class EnhancedOpenAIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    BROWSER_EXECUTABLE = os.getenv(
        "BROWSER_PATH", "/usr/bin/chromium-browser")
    MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        ]
        self.timeout = 45000  # 45 seconds
        self.retry_count = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.mouse_velocity_range = (100, 500)  # px/ms
        self.interaction_delays = {
            'scroll': (int(os.getenv("SCROLL_DELAY_MIN", 500)), int(os.getenv("SCROLL_DELAY_MAX", 2000))),
            'click': (int(os.getenv("CLICK_DELAY_MIN", 100)), int(os.getenv("CLICK_DELAY_MAX", 300))),
            'movement': (int(os.getenv("MOVEMENT_DELAY_MIN", 50)), int(os.getenv("MOVEMENT_DELAY_MAX", 200)))
        }
        self.proxy_servers = [server.strip() for server in os.getenv(
            "PROXY_SERVERS", "").split(',') if server.strip()]
    async def human_interaction(self, page):
        """Advanced simulation of user behavior"""
        # Random mouse movement path
        for _ in range(random.randint(2, 5)):
            x = random.randint(0, 1366)
            y = random.randint(0, 768)
            await page.mouse.move(x, y, steps=random.randint(5, 20))
            await page.wait_for_timeout(random.randint(*self.interaction_delays['movement']))

        # Simulating typing
        if random.random() < 0.3:
            await page.keyboard.press('Tab')
            await page.keyboard.type(' ', delay=random.randint(50, 200))

        # More realistic scrolling
        scroll_distance = random.choice([300, 600, 900])
        await page.mouse.wheel(0, scroll_distance)
        await page.wait_for_timeout(random.randint(*self.interaction_delays['scroll']))
    async def load_page(self, page, url):
        """Smarter page loading with dynamic waiting"""
        start_time = time.time()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout)

            # Smarter content extraction selectors
            selectors = [
                'main article',
                '#main-content',
                'section:first-of-type',
                'div[class*="content"]',
                'body'  # Fallback
            ]

            for selector in selectors:
                try:
                    element = await page.query_selector(selector)
                    if element:
                        return True
                except Exception:
                    continue

            # Fallback if no selector is found: wait out the remainder of a
            # 30-second budget (wait_for_timeout expects milliseconds)
            elapsed_ms = int((time.time() - start_time) * 1000)
            if elapsed_ms < 30000:  # If we haven't used the full 30 seconds yet
                await page.wait_for_timeout(30000 - elapsed_ms)

            return True  # Page likely loaded
        except Exception as e:
            logging.error(f"Error loading page {url}: {e}")
            return False
    @SCRAPE_DURATION.time()
    async def scrape_with_retry(self):
        """Main function with retry mechanism and browser reuse"""
        SCRAPE_ATTEMPTS.inc()
        last_error = None
        browser = None
        context = None
        page = None

        try:
            async with async_playwright() as p:
                launch_args = {
                    "headless": self.headless,
                    "args": [
                        "--disable-blink-features=AutomationControlled",
                        "--single-process",
                        "--no-sandbox",
                        f"--user-agent={random.choice(self.user_agents)}"
                    ],
                    "executable_path": self.BROWSER_EXECUTABLE
                }
                if self.proxy_servers:
                    proxy_url = random.choice(self.proxy_servers)
                    proxy_config = {"server": proxy_url}
                    proxy_username = os.getenv('PROXY_USER')
                    proxy_password = os.getenv('PROXY_PASS')
                    if proxy_username and proxy_password:
                        proxy_config['username'] = proxy_username
                        proxy_config['password'] = proxy_password
                    launch_args['proxy'] = proxy_config

                browser = await p.chromium.launch(**launch_args)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768},
                    locale=os.getenv("BROWSER_LOCALE", "en-US")
                )
                await context.route("**/*", lambda route: route.continue_())
                page = await context.new_page()
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => false });
                    window.navigator.chrome = { runtime: {}, app: { isInstalled: false } };
                """)

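                # Retry loop: each attempt reloads the page, simulates user
                # behavior, captures a debug screenshot, and extracts text from
                # the first matching content selector.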
                for attempt in range(self.retry_count):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Loading OpenAI...")
                        if not await self.load_page(page, "https://openai.com"):
                            raise ScrapingError(
                                "Failed to load key content on OpenAI website.")
                        await self.human_interaction(page)
                        await page.screenshot(path=f"openai_debug_{attempt}.png")
                        content = await page.evaluate("""() => {
                            const selectors = [
                                'main article',
                                '#main-content',
                                'section:first-of-type',
                                'div[class*="content"]'
                            ];

                            let content = '';
                            for (const selector of selectors) {
                                const element = document.querySelector(selector);
                                if (element) {
                                    content += element.innerText + '\\n\\n';
                                }
                            }
                            return content.trim() || document.body.innerText;
                        }""")
                        if not content.strip():
                            raise ContentAnalysisError(
                                "No content extracted from the page.")
                        return content[:self.MAX_CONTENT_LENGTH]
                    except (ScrapingError, ContentAnalysisError) as e:
                        last_error = e
                        logging.warning(
                            f"Attempt {attempt + 1} failed: {str(e)}")
                        if attempt < self.retry_count - 1:
                            await asyncio.sleep(5)
                        else:
                            if browser:
                                await browser.close()
                                browser = None
                            raise
                    except Exception as e:
                        last_error = e
                        logging.exception(
                            f"Unexpected error on attempt {attempt + 1}: {str(e)}")
                        if attempt < self.retry_count - 1:
                            await asyncio.sleep(5)
                        else:
                            if browser:
                                await browser.close()
                                browser = None
                            raise
        except Exception as e:
            last_error = e
        finally:
            if browser:
                await browser.close()

        raise last_error if last_error else Exception(
            "All scraping attempts failed.")
    async def get_cached_content(self):
        """Return cached page content if present; otherwise scrape and cache it."""
        key = 'openai_content_cache_key'
        content = cache.get(key)
        if content is None:
            content = await self.scrape_with_retry()
            cache.set(key, content, expire=int(
                os.getenv("CACHE_EXPIRY", 3600)))
        return content

async def analyze_content(headless=True):
    try:
        scraper = EnhancedOpenAIScraper(headless=headless)
        content = await scraper.get_cached_content()

        # Validate the API key before constructing the client, since OpenAI()
        # itself raises when no key is available.
        if not EnhancedOpenAIScraper.API_KEY:
            raise ContentAnalysisError(
                "OpenAI API key not configured (check environment variables).")
        client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
prompt_template = """
|
||||
Analyze the following website content and extract the following information if present:
|
||||
|
||||
1. **Overall Summary of the Website:** Provide a concise overview of the website's purpose and the main topics discussed.
|
||||
2. **Key Individuals or Entities:** Identify and briefly describe any prominent individuals, companies, or organizations mentioned.
|
||||
3. **Recent Announcements or Updates:** List any recent announcements, news, or updates found on the website, including dates if available.
|
||||
4. **Main Topics or Themes:** Identify the primary subjects or themes explored on the website.
|
||||
5. **Any Noteworthy Features or Projects:** Highlight any significant features, projects, or initiatives mentioned.
|
||||
|
||||
Format the output clearly under each of these headings. If a particular piece of information is not found, indicate that it is not present.
|
||||
|
||||
Content:
|
||||
{content}
|
||||
"""
|
||||
|
||||
        formatted_prompt = prompt_template.format(content=content)
        model_name = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
        temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
        max_tokens = int(os.getenv("MAX_TOKENS", 1500))
        top_p = float(os.getenv("MODEL_TOP_P", 0.9))

        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that analyzes website content and extracts key information in a structured format."},
                {"role": "user", "content": formatted_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p
        )

        if not response.choices:
            raise ContentAnalysisError("Empty response from GPT.")

        return response.choices[0].message.content
    except (ScrapingError, ContentAnalysisError) as e:
        logging.error(f"Analysis failed: {str(e)}")
        return f"Critical analysis error: {str(e)}"
    except Exception as e:
        logging.exception("Unexpected error during analysis.")
        return f"Unexpected analysis error: {str(e)}"

async def main():
    logging.basicConfig(
        level=os.getenv("LOG_LEVEL", "INFO").upper(),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Start Prometheus HTTP server for exposing metrics
    try:
        prometheus_port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(prometheus_port)
        logging.info(
            f"Prometheus metrics server started on port {prometheus_port}")
    except Exception as e:
        logging.warning(f"Failed to start Prometheus metrics server: {e}")
    start_time = time.time()
    result = await analyze_content(headless=True)
    end_time = time.time()

    print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds\n")
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
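
While the script is running, the Prometheus metrics declared at the top of the file (`scrape_attempts`, `scrape_duration`) are served by `start_http_server` on port 8000 unless `PROMETHEUS_PORT` overrides it. Assuming a local run, they can be checked with:

```bash
curl -s http://localhost:8000/metrics | grep scrape
```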