# playwright_ai_scraper.py
import asyncio
import logging
import random
import time
import os
from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache
from dotenv import load_dotenv
# Loading .env variables
load_dotenv()
# Setting up logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Setting up Prometheus metrics
SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
)
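# Once start_http_server() runs in main(), these metrics are exposed at
# http://localhost:<PROMETHEUS_PORT>/metrics for Prometheus to scrape.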
# Setting up cache
cache = Cache("./scraper_cache")
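# Illustrative diskcache behavior (hypothetical key/value, matching the key
# scheme used in get_cached_content below):
#   cache.set("content_https:__example.com", "page text", expire=3600)
#   cache.get("content_https:__example.com")  # "page text" until expiry,
#                                             # then None (triggers a re-scrape)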
# Custom exceptions
class ScrapingError(Exception):
    pass


class AnalysisError(Exception):
    pass


class AIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
        self.timeout = 60000  # 60 seconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.delays = {
            "scroll": (500, 2000),
            "click": (100, 300),
            "move": (50, 200)
        }
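        # Note: each delay range is a (min_ms, max_ms) tuple; callers draw a
        # jittered wait via random.randint(*self.delays["move"]) etc., so the
        # timing of simulated interactions varies between runs.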

    async def human_interaction(self, page):
        """Simulates human behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)
                await page.mouse.move(x, y, steps=random.randint(5, 20))
                await page.wait_for_timeout(
                    random.randint(*self.delays["move"])
                )
                scroll = random.choice([300, 600, 900])
                await page.mouse.wheel(0, scroll)
                await page.wait_for_timeout(
                    random.randint(*self.delays["scroll"])
                )
        except Exception as e:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
        """Loads the page with dynamic waiting."""
        start_time = time.time()
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.timeout
            )
            selectors = [
                "main article",
                "#main-content",
                "section:first-of-type",
                'div[class*="content"]',
                "body"  # always present after DOMContentLoaded; final fallback
            ]
            for selector in selectors:
                element = await page.query_selector(selector)
                if element:
                    return True
            # Defensive path: give the page up to 30 seconds in total to
            # render. Elapsed time is converted to milliseconds, since
            # wait_for_timeout expects ms.
            if time.time() - start_time < 30:
                await page.wait_for_timeout(
                    30000 - int((time.time() - start_time) * 1000)
                )
            return True
        except Exception as e:
            logging.error(f"Error loading {url}: {e}")
            return False

    async def scrape_with_retry(self, url):
        """Scrapes the page with retries."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=self.headless)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768}
                )
                page = await context.new_page()
                # Mask the automation flag that headless Chromium exposes.
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => false
                    });
                """)
                for attempt in range(self.retries):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Scraping {url}")
                        if not await self.load_page(page, url):
                            raise ScrapingError(f"Failed to load {url}")
                        await self.human_interaction(page)
                        content = await page.evaluate(
                            """() => {
                                const s = [
                                    'main article',
                                    '#main-content',
                                    'section:first-of-type',
                                    'div[class*="content"]'
                                ];
                                let c = '';
                                for (const x of s) {
                                    const e = document.querySelector(x);
                                    if (e) c += e.innerText + '\\n';
                                }
                                return c.trim() || document.body.innerText;
                            }"""
                        )
                        if not content.strip():
                            raise ScrapingError("No content")
                        SCRAPE_DURATION.observe(time.time() - start_time)
                        return content[:self.MAX_CONTENT]
                    except ScrapingError as e:
                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
                        if attempt < self.retries - 1:
                            await asyncio.sleep(5)
                        else:
                            raise
            except Exception as e:
                logging.error(f"Error in scrape: {e}")
                raise
            finally:
                # Guard: p.chromium.launch() may fail before `browser` is
                # assigned, so only close it if it exists.
                if browser:
                    await browser.close()
        # Defensive fallback; the last failed attempt re-raises above, so
        # this line is not normally reached.
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
        """Retrieves content from cache or scrapes."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
            try:
                content = await self.scrape_with_retry(url)
                cache.set(
                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
                )
            except Exception as e:
                logging.error(f"Cache refresh for {url} failed: {e}")
                raise
        return content
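
# Note: the cache key above only swaps '/' for '_', so URLs that differ only
# in those characters (e.g. ".../a/b" vs ".../a_b") share a key. Fine for a
# demo; hashing the full URL would be safer in production.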

async def analyze_content(url, headless=True):
    """Analyzes the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        # Validate the API key up front: the OpenAI client raises if the key
        # is missing, so check before doing any expensive scraping.
        if not scraper.API_KEY:
            raise AnalysisError("OpenAI API key not configured")
        content = await scraper.get_cached_content(url)
        client = OpenAI(api_key=scraper.API_KEY)
        # Build the prompt with an f-string rather than str.format(), so any
        # literal braces in the scraped content cannot break formatting.
        prompt = f"""
        Analyze the website content and extract:
        1. **Summary**: Overview of the website's purpose.
        2. **Entities**: Prominent individuals or organizations.
        3. **Updates**: Recent announcements or news.
        4. **Topics**: Primary subjects or themes.
        5. **Features**: Noteworthy projects or initiatives.
        Format output under these headings. Note if info is missing.
        Content: {content}
        """
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
        )
        if not response.choices:
            raise AnalysisError("Empty response from OpenAI")
        return response.choices[0].message.content
    except (ScrapingError, AnalysisError) as e:
        logging.error(f"Analysis failed: {e}")
        return f"Error: {e}"
    except Exception as e:
        logging.exception(f"Error in analyze: {e}")
        return f"Unexpected error: {e}"

async def main():
    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)
        logging.info(f"Prometheus server started on port {port}")
    except Exception as e:
        logging.warning(f"Prometheus server failed: {e}")
    urls = [
        "https://www.anthropic.com",
        "https://deepmind.google",
        "https://huggingface.co",
        "https://runwayml.com"
    ]
    for url in urls:
        start_time = time.time()
        result = await analyze_content(url, headless=True)
        end_time = time.time()
        print(
            f"\nAnalysis of {url} completed in "
            f"{end_time - start_time:.2f} seconds\n"
        )
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
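
# Typical invocation (assumes Playwright browsers are installed, e.g. via
# `playwright install chromium`, and OPENAI_API_KEY is set in .env):
#
#   python playwright_ai_scraper.py
#
# Optional environment overrides read above: LOG_LEVEL, RETRY_COUNT,
# CACHE_EXPIRY, MAX_CONTENT_LENGTH, OPENAI_MODEL, MODEL_TEMPERATURE,
# MAX_TOKENS, MODEL_TOP_P, PROMETHEUS_PORT.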