- Translated Croatian comments to American English in notebook_generator.py and playwright_ai_scraper.py.
- Added scraper_cache/ to .gitignore to exclude the cache directory from the repository.
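For reference, the ignore rule is a single entry; assuming the cache directory sits at the repository root (the script creates it as ./scraper_cache), the addition to .gitignore is:

# .gitignore
scraper_cache/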
playwright_ai_scraper.py (246 lines, 8.9 KiB, Python):
# playwright_ai_scraper.py

import asyncio
import logging
import random
import time
import os

from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import Counter, Histogram, start_http_server
from diskcache import Cache
from dotenv import load_dotenv

# Loading .env variables
load_dotenv()

# Setting up logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Setting up Prometheus metrics
SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
)

# Setting up cache
cache = Cache("./scraper_cache")


# Custom exceptions
class ScrapingError(Exception):
    pass


class AnalysisError(Exception):
    pass


class AIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
        self.timeout = 60000  # 60 seconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.delays = {
            "scroll": (500, 2000),
            "click": (100, 300),
            "move": (50, 200)
        }

    async def human_interaction(self, page):
        """Simulates human behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)
                await page.mouse.move(x, y, steps=random.randint(5, 20))
                await page.wait_for_timeout(
                    random.randint(*self.delays["move"])
                )
                scroll = random.choice([300, 600, 900])
                await page.mouse.wheel(0, scroll)
                await page.wait_for_timeout(
                    random.randint(*self.delays["scroll"])
                )
        except Exception as e:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
        """Loads the page with dynamic waiting."""
        start_time = time.time()
        try:
            await page.goto(
                url, wait_until="domcontentloaded", timeout=self.timeout
            )
            selectors = [
                "main article",
                "#main-content",
                "section:first-of-type",
                'div[class*="content"]',
                "body"
            ]
            for selector in selectors:
                element = await page.query_selector(selector)
                if element:
                    return True
            # No expected selector found yet; wait out the remainder of a
            # 30-second grace period (wait_for_timeout takes milliseconds).
            elapsed = time.time() - start_time
            if elapsed < 30:
                await page.wait_for_timeout(int((30 - elapsed) * 1000))
            return True
        except Exception as e:
            logging.error(f"Error loading {url}: {e}")
            return False

    async def scrape_with_retry(self, url):
        """Scrapes the page with retries."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(headless=self.headless)
                context = await browser.new_context(
                    user_agent=random.choice(self.user_agents),
                    viewport={"width": 1366, "height": 768}
                )
                page = await context.new_page()
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => false
                    });
                """)
                for attempt in range(self.retries):
                    try:
                        logging.info(
                            f"Attempt {attempt + 1}: Scraping {url}")
                        if not await self.load_page(page, url):
                            raise ScrapingError(f"Failed to load {url}")
                        await self.human_interaction(page)
                        content = await page.evaluate(
                            """() => {
                                const s = [
                                    'main article',
                                    '#main-content',
                                    'section:first-of-type',
                                    'div[class*="content"]'
                                ];
                                let c = '';
                                for (const x of s) {
                                    const e = document.querySelector(x);
                                    if (e) c += e.innerText + '\\n';
                                }
                                return c.trim() || document.body.innerText;
                            }"""
                        )
                        if not content.strip():
                            raise ScrapingError("No content")
                        SCRAPE_DURATION.observe(time.time() - start_time)
                        return content[:self.MAX_CONTENT]
                    except ScrapingError as e:
                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
                        if attempt < self.retries - 1:
                            await asyncio.sleep(5)
                        else:
                            raise
            except Exception as e:
                logging.error(f"Error in scrape: {e}")
                raise
            finally:
                # Only close the browser if it was actually launched.
                if browser:
                    await browser.close()
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
        """Retrieves content from cache or scrapes."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
            try:
                content = await self.scrape_with_retry(url)
                cache.set(
                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
                )
            except Exception as e:
                logging.error(f"Failed to fetch content for {url}: {e}")
                raise
        return content


async def analyze_content(url, headless=True):
    """Analyzes the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        content = await scraper.get_cached_content(url)
        # Check the key before constructing the client so a missing key
        # surfaces as an AnalysisError.
        if not scraper.API_KEY:
            raise AnalysisError("OpenAI API key not configured")
        client = OpenAI(api_key=scraper.API_KEY)
        prompt = """
        Analyze the website content and extract:
        1. **Summary**: Overview of the website's purpose.
        2. **Entities**: Prominent individuals or organizations.
        3. **Updates**: Recent announcements or news.
        4. **Topics**: Primary subjects or themes.
        5. **Features**: Noteworthy projects or initiatives.
        Format output under these headings. Note if info is missing.
        Content: {content}
        """.format(content=content)
        response = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
        )
        if not response.choices:
            raise AnalysisError("Empty response from OpenAI")
        return response.choices[0].message.content
    except (ScrapingError, AnalysisError) as e:
        logging.error(f"Analysis failed: {e}")
        return f"Error: {e}"
    except Exception as e:
        logging.exception(f"Error in analyze: {e}")
        return f"Unexpected error: {e}"


async def main():
    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)
        logging.info(f"Prometheus server started on port {port}")
    except Exception as e:
        logging.warning(f"Prometheus server failed: {e}")
    urls = [
        "https://www.anthropic.com",
        "https://deepmind.google",
        "https://huggingface.co",
        "https://runwayml.com"
    ]
    for url in urls:
        start_time = time.time()
        result = await analyze_content(url, headless=True)
        end_time = time.time()
        print(
            f"\nAnalysis of {url} completed in "
            f"{end_time - start_time:.2f} seconds\n"
        )
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
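All runtime tunables above are read with os.getenv, so a local .env is the natural place to set them. As a rough sketch, such a file could look like the following; the variable names are the ones the script reads, the values are simply the code's own defaults, and the API key is a placeholder:

# Example .env (illustrative values; only OPENAI_API_KEY has no default)
OPENAI_API_KEY=your-openai-key-here
OPENAI_MODEL=gpt-4-turbo
MODEL_TEMPERATURE=0.3
MODEL_TOP_P=0.9
MAX_TOKENS=1500
MAX_CONTENT_LENGTH=30000
RETRY_COUNT=2
CACHE_EXPIRY=3600
LOG_LEVEL=INFO
PROMETHEUS_PORT=8000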