Add Bojan's Playwright asynchronous scraper project
This contribution includes a fully asynchronous scraper using Playwright and the OpenAI API, with Python scripts, Jupyter notebooks (outputs cleared), Markdown summaries, and a README. It is organized under community-contributions/bojan-playwright-scraper/. Limited content retrieval from Huggingface.co is documented in the README.
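The notebook-generator script in the diff below imports analyze_content from playwright_ai_scraper.py, which is part of this contribution but not shown in this hunk. As a rough sketch of what an asynchronous Playwright-plus-OpenAI coroutine with that signature might look like (the function body, model name, prompt, and truncation limit here are assumptions, not taken from the actual module):

# Hypothetical sketch only: the real playwright_ai_scraper.analyze_content may differ.
# Only the name and the (url, headless=...) call signature are visible in the diff below.
from openai import AsyncOpenAI
from playwright.async_api import async_playwright


async def analyze_content(url: str, headless: bool = True) -> str:
    # Fetch the rendered page text with a headless Chromium browser
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        page = await browser.new_page()
        await page.goto(url)
        text = await page.inner_text("body")
        await browser.close()

    # Ask the model to summarize the extracted text
    # (model choice and prompt are assumptions; the client reads OPENAI_API_KEY from the environment)
    client = AsyncOpenAI()
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Summarize the following web page."},
            {"role": "user", "content": text[:8000]},  # truncate to keep the prompt small
        ],
    )
    return response.choices[0].message.content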
@@ -0,0 +1,79 @@
import sys
import os
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell
import asyncio
from dotenv import load_dotenv
import logging

# Load environment variables from .env
load_dotenv()

# Configure logging
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Add the project directory to sys.path
project_dir = os.path.join(
    "/home/lakov/projects/llm_engineering",
    "community-contributions/playwright-bojan"
)
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

# Import analyze_content from playwright_ai_scraper.py
try:
    from playwright_ai_scraper import analyze_content
except ModuleNotFoundError as e:
    logging.error(f"Error importing module: {e}")
    sys.exit(1)


# Save the AI-generated summary as a Jupyter notebook and return its path
def save_notebook(url, content):
    output_dir = os.path.join(project_dir, "notebooks")
    os.makedirs(output_dir, exist_ok=True)

    # Derive a filename from the URL's domain
    domain = url.split("//")[-1].split("/")[0].replace(".", "_")
    filename = f"{domain}_Summary.ipynb"
    path = os.path.join(output_dir, filename)

    nb = new_notebook()
    intro = f"""
# Summary for {url}

This notebook contains an AI-generated summary of the website content.

**URL**: `{url}`

---
**Analysis**:
{content}
"""
    nb.cells.append(new_markdown_cell(intro))

    with open(path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    logging.info(f"Notebook saved to: {path}")
    return path


# Main entry point: prompt for a URL, analyze it, and save the summary notebook
async def main():
    url = input("Enter URL to scrape: ")
    try:
        result = await analyze_content(url, headless=True)
        save_notebook(url, result)
        print(f"Summary for {url}:\n{result}")
    except Exception as e:
        logging.error(f"Failed to process {url}: {e}")
        print(f"Error: {e}")


if __name__ == "__main__":
    asyncio.run(main())
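As a quick check of the filename logic in save_notebook (the example URL is illustrative only):

# Worked example of save_notebook's filename derivation
url = "https://huggingface.co/docs"
domain = url.split("//")[-1].split("/")[0].replace(".", "_")
print(domain)                     # huggingface_co
print(f"{domain}_Summary.ipynb")  # huggingface_co_Summary.ipynb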