Add Bojan's Playwright asynchronous scraper project
This contribution adds a fully asynchronous scraper built on Playwright and the OpenAI API, with Python scripts, Jupyter notebooks (outputs cleared), Markdown summaries, and a README. It is organized under community-contributions/bojan-playwright-scraper/. Limited content retrieval from Huggingface.co is documented in the README.
@@ -0,0 +1,245 @@
# playwright_ai_scraper.py

import asyncio
import logging
import random
import time
import os

from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()
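
# Illustrative .env contents (placeholder values, not part of the original
# contribution). Every variable below is read with os.getenv() later in this
# file; apart from OPENAI_API_KEY, the value shown is the built-in default.
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_MODEL=gpt-4-turbo
#   MODEL_TEMPERATURE=0.3
#   MODEL_TOP_P=0.9
#   MAX_TOKENS=1500
#   MAX_CONTENT_LENGTH=30000
#   RETRY_COUNT=2
#   CACHE_EXPIRY=3600
#   LOG_LEVEL=INFO
#   PROMETHEUS_PORT=8000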

# Configure logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Prometheus metrics
SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
)
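
# These metrics are exposed over HTTP by the start_http_server() call in
# main(), so a Prometheus server can scrape them on PROMETHEUS_PORT
# (default 8000).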

# Set up the on-disk cache
cache = Cache("./scraper_cache")


# Custom exceptions
class ScrapingError(Exception):
    pass


class AnalysisError(Exception):
    pass


class AIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
        self.timeout = 60000  # 60 seconds, in milliseconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        # Randomized delay ranges in milliseconds, used to pace interactions
        self.delays = {
            "scroll": (500, 2000),
            "click": (100, 300),
            "move": (50, 200)
        }

    async def human_interaction(self, page):
        """Simulate human-like behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)
                await page.mouse.move(x, y, steps=random.randint(5, 20))
                await page.wait_for_timeout(
                    random.randint(*self.delays["move"])
                )
                scroll = random.choice([300, 600, 900])
                await page.mouse.wheel(0, scroll)
                await page.wait_for_timeout(
                    random.randint(*self.delays["scroll"])
                )
        except Exception as e:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
        """Load the page, waiting dynamically for content."""
        start_time = time.time()
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.timeout
            )
            # Consider the page loaded as soon as any likely content
            # container is present.
            selectors = [
                "main article",
                "#main-content",
                "section:first-of-type",
                'div[class*="content"]',
                "body"
            ]
            for selector in selectors:
                element = await page.query_selector(selector)
                if element:
                    return True
            # Otherwise wait out the remainder of a 30-second budget
            # (wait_for_timeout expects milliseconds).
            elapsed = time.time() - start_time
            if elapsed < 30:
                await page.wait_for_timeout(int((30 - elapsed) * 1000))
            return True
        except Exception as e:
            logging.error(f"Error loading {url}: {e}")
            return False

    async def scrape_with_retry(self, url):
        """Scrape the page, retrying on failure."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=self.headless)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768}
                )
                page = await context.new_page()
                # Mask the webdriver flag that automated browsers expose.
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => false
                    });
                """)
                for attempt in range(self.retries):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Scraping {url}")
                        if not await self.load_page(page, url):
                            raise ScrapingError(f"Failed to load {url}")
                        await self.human_interaction(page)
                        # Prefer the main content containers; fall back to
                        # the whole body text.
                        content = await page.evaluate(
                            """() => {
                                const s = [
                                    'main article',
                                    '#main-content',
                                    'section:first-of-type',
                                    'div[class*="content"]'
                                ];
                                let c = '';
                                for (const x of s) {
                                    const e = document.querySelector(x);
                                    if (e) c += e.innerText + '\\n';
                                }
                                return c.trim() || document.body.innerText;
                            }"""
                        )
                        if not content.strip():
                            raise ScrapingError("No content")
                        SCRAPE_DURATION.observe(time.time() - start_time)
                        return content[:self.MAX_CONTENT]
                    except ScrapingError as e:
                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
                        if attempt < self.retries - 1:
                            await asyncio.sleep(5)
                        else:
                            raise
            except Exception as e:
                logging.error(f"Error in scrape: {e}")
                raise
            finally:
                if browser:
                    await browser.close()
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
        """Return content from the cache, scraping it on a miss."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
            try:
                content = await self.scrape_with_retry(url)
                cache.set(
                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
                )
            except Exception as e:
                logging.error(f"Scraping {url} failed: {e}")
                raise
        return content
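
    # For example, "https://huggingface.co" is cached under the key
    # "content_https:__huggingface.co" for CACHE_EXPIRY seconds; deleting
    # ./scraper_cache (or calling cache.clear()) forces a fresh scrape.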


async def analyze_content(url, headless=True):
    """Analyze the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        content = await scraper.get_cached_content(url)
        if not scraper.API_KEY:
            raise AnalysisError("OpenAI API key not configured")
        client = OpenAI(api_key=scraper.API_KEY)
        prompt = """
        Analyze the website content and extract:
        1. **Summary**: Overview of the website's purpose.
        2. **Entities**: Prominent individuals or organizations.
        3. **Updates**: Recent announcements or news.
        4. **Topics**: Primary subjects or themes.
        5. **Features**: Noteworthy projects or initiatives.
        Format output under these headings. Note if info is missing.
        Content: {content}
        """.format(content=content)
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
        )
        if not response.choices:
            raise AnalysisError("Empty response from OpenAI")
        return response.choices[0].message.content
    except (ScrapingError, AnalysisError) as e:
        logging.error(f"Analysis failed: {e}")
        return f"Error: {e}"
    except Exception as e:
        logging.exception(f"Error in analyze: {e}")
        return f"Unexpected error: {e}"


async def main():
    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)
        logging.info(f"Prometheus server started on port {port}")
    except Exception as e:
        logging.warning(f"Prometheus server failed: {e}")
    urls = [
        "https://www.anthropic.com",
        "https://deepmind.google",
        "https://huggingface.co",
        "https://runwayml.com"
    ]
    for url in urls:
        start_time = time.time()
        result = await analyze_content(url, headless=True)
        end_time = time.time()
        print(
            f"\nAnalysis of {url} completed in "
            f"{end_time - start_time:.2f} seconds\n"
        )
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
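
# Illustrative usage from another script or notebook (not part of the original
# file): analyze_content() can be awaited directly, or driven with
# asyncio.run() from synchronous code. The URL is just an example.
#
#     import asyncio
#     from playwright_ai_scraper import analyze_content
#
#     report = asyncio.run(analyze_content("https://huggingface.co"))
#     print(report)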