diff --git a/.gitignore b/.gitignore
index 053b56b..018b2d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -182,3 +182,9 @@ products_vectorstore/
 # ignore optimized C++ code from being checked into repo
 week4/optimized
 week4/simple
+*.env.save
+.virtual_documents/
+WingIDE_Projekti/
+nohup.out
+*.png
+
diff --git a/community-contributions/bojan-playwright-scraper/README.md b/community-contributions/bojan-playwright-scraper/README.md
new file mode 100644
index 0000000..1a69c53
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/README.md
@@ -0,0 +1,144 @@
+
+# 🧠 Community Contribution: Async Playwright-based AI Scraper
+
+## Overview
+This project is a fully asynchronous, headless-browser-based scraper built using Playwright and the OpenAI API.
+It scrapes and analyzes content from four AI-related websites, producing structured summaries in Markdown and Jupyter notebook formats.
+Playwright was chosen over Selenium for its speed and efficiency, making it ideal for modern web scraping tasks.
+
+**Developed by:** lakovicb
+**IDE used:** WingIDE Pro 10 (Jupyter compatibility via nest_asyncio)
+**Python version:** 3.12.9 (developed and tested with Anaconda)
+
+---
+
+## 📦 Features
+- 🧭 Simulates human-like interactions (mouse movement, scrolling)
+- 🧠 GPT-based analysis using OpenAI's API
+- 🧪 Works inside JupyterLab using nest_asyncio
+- 📊 Prometheus metrics for scraping observability
+- ⚡ Smart content caching via diskcache
+- 📝 Generates structured Markdown summaries and Jupyter notebooks
+
+---
+
+## 🚀 How to Run
+
+### 1. Install dependencies
+Run these commands in your terminal:
+```bash
+conda install python-dotenv prometheus_client diskcache nbformat
+pip install playwright openai
+playwright install
+```
+> Note: Ensure your environment supports Python 3.12 for optimal performance.
+
+---
+
+### 2. Set environment variables
+Create a `.env` file in the project root (the `llm_engineering/` directory) with:
+```env
+OPENAI_API_KEY=your_openai_key
+```
+(Optional) Define proxy/login parameters if needed.
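+
+The scripts also read several optional tuning variables via `os.getenv`; the values below simply restate the in-code defaults, so set them only when you want to override something:
+```env
+LOG_LEVEL=INFO
+OPENAI_MODEL=gpt-4-turbo
+MODEL_TEMPERATURE=0.3
+MODEL_TOP_P=0.9
+MAX_TOKENS=1500
+MAX_CONTENT_LENGTH=30000
+RETRY_COUNT=2
+CACHE_EXPIRY=3600
+PROMETHEUS_PORT=8000
+```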
+
+---
+
+### 3. Run the scraper
+```bash
+python playwright_ai_scraper.py
+```
+This scrapes and analyzes the following URLs:
+- https://www.anthropic.com
+- https://deepmind.google
+- https://huggingface.co
+- https://runwayml.com
+
+---
+
+### 4. Generate notebooks
+```bash
+python notebook_generator.py
+```
+Enter a URL when prompted to generate a Jupyter notebook in the `notebooks/` directory.
+
+---
+
+## 📊 Results
+
+### Python Files for Developers
+- `playwright_ai_scraper.py`: Core async scraper and analyzer.
+- `notebook_generator.py`: Creates Jupyter notebooks for given URLs.
+
+These files enable transparency, reproducibility, and extensibility.
+
+---
+
+### Markdown Summaries
+Saved in `outputs/`:
+- Structured analyses with sections for Summary, Entities, Updates, Topics, and Features.
+- Readable and portable format.
+
+---
+
+### Jupyter Notebooks
+Available in `notebooks/`:
+- `Playwright_AI_Scraper_JupyterAsync.ipynb`
+- `Playwright_AI_Scraper_Showcase_Formatted.ipynb`
+
+---
+
+## 🔍 Playwright vs. Selenium
+
+| Criteria            | Selenium                          | Playwright                        |
+|---------------------|-----------------------------------|-----------------------------------|
+| Release Year        | 2004                              | 2020                              |
+| Supported Browsers  | Chrome, Firefox, Safari, Edge, IE | Chromium, Firefox, WebKit         |
+| Supported Languages | Many                              | Python, JS/TS, Java, C#           |
+| Setup               | Complex (WebDrivers)              | Simple (auto-installs binaries)   |
+| Execution Speed     | Slower                            | Faster (WebSocket-based protocol) |
+| Dynamic Content     | Good (requires explicit waits)    | Excellent (auto-waits)            |
+| Community Support   | Large, mature                     | Growing, modern, Microsoft-backed |
+
+> **Playwright** was chosen for its speed, simplicity, and modern feature set.
+
+---
+
+## ⚙️ Asynchronous Code and WingIDE Pro 10
+
+- Fully async scraping with `asyncio`.
+- Developed using WingIDE Pro 10 for:
+  - Robust async support
+  - Full Python 3.12 compatibility
+  - Integration with JupyterLab via `nest_asyncio` (see the sketch below)
+  - Stability and efficient debugging
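+
+A minimal sketch of that Jupyter integration, assuming the notebook kernel runs with this contribution's folder as its working directory (so the import resolves) and that your `.env` is configured:
+
+```python
+import asyncio
+
+import nest_asyncio
+nest_asyncio.apply()  # lets asyncio.run() work inside Jupyter's already-running event loop
+
+from playwright_ai_scraper import analyze_content
+
+result = asyncio.run(analyze_content("https://www.anthropic.com", headless=True))
+print(result)
+```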
+
+---
+
+## 📁 Directory Structure
+
+```bash
+playwright_ai_scraper.py   # Main scraper script
+notebook_generator.py      # Notebook generator script
+outputs/                   # Markdown summaries
+notebooks/                 # Generated Jupyter notebooks
+requirements.txt           # List of dependencies
+scraper_cache/             # Cache directory
+```
+
+---
+
+## 📝 Notes
+
+- Uses Prometheus metrics and diskcache.
+- Ensure a valid OpenAI API key is set in `.env`.
+- Potential extensions: PDF export, LangChain pipeline, vector store ingestion.
+
+- **Note:** Due to the dynamic nature and limited static text on the huggingface.co homepage, the scraper retrieved only minimal information, which resulted in a limited AI-generated summary. This behavior reflects a realistic limitation of scraping dynamic websites without interaction-based extraction.
+
+
+---
+
+## 🙏 Thanks
+
+Special thanks to **Ed Donner** for the amazing course and project challenge inspiration!
diff --git a/community-contributions/bojan-playwright-scraper/notebook_generator.py b/community-contributions/bojan-playwright-scraper/notebook_generator.py
new file mode 100644
index 0000000..cf9699e
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/notebook_generator.py
@@ -0,0 +1,79 @@
+import sys
+import os
+import nbformat
+from nbformat.v4 import new_notebook, new_markdown_cell
+import asyncio
+from dotenv import load_dotenv
+import logging
+
+# Load .env variables
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(
+    level=os.getenv("LOG_LEVEL", "INFO").upper(),
+    format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Add this contribution's directory to sys.path so playwright_ai_scraper
+# can be imported regardless of the current working directory
+project_dir = os.path.dirname(os.path.abspath(__file__))
+if project_dir not in sys.path:
+    sys.path.insert(0, project_dir)
+
+# Import analyze_content from playwright_ai_scraper.py
+try:
+    from playwright_ai_scraper import analyze_content
+except ModuleNotFoundError as e:
+    logging.error(f"Error importing module: {e}")
+    sys.exit(1)
+
+# Function for saving the notebook
+
+
+def save_notebook(url, content):
+    output_dir = os.path.join(project_dir, "notebooks")
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Extract the domain from the URL
+    domain = url.split("//")[-1].split("/")[0].replace(".", "_")
+    filename = f"{domain}_Summary.ipynb"
+    path = os.path.join(output_dir, filename)
+
+    nb = new_notebook()
+    intro = f"""
+# Summary for {url}
+
+This notebook contains an AI-generated summary of the website content.
+
+**URL**: `{url}`
+
+---
+**Analysis**:
+{content}
+"""
+    nb.cells.append(new_markdown_cell(intro))
+
+    with open(path, 'w', encoding='utf-8') as f:
+        nbformat.write(nb, f)
+
+    logging.info(f"Notebook saved to: {path}")
+    return path
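+
+# Example (illustrative): save_notebook("https://huggingface.co", ...) writes
+# notebooks/huggingface_co_Summary.ipynb (dots in the domain become underscores).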
"\n", + "This notebook contains an AI-generated summary of the website content.\n", + "\n", + "**URL**: `https://huggingface.co`\n", + "\n", + "---\n", + "**Analysis**:\n", + "Based on the provided content snippet, here is an analysis structured under the requested headings:\n", + "\n", + "### Summary\n", + "The information provided is insufficient to determine the exact purpose of the website. However, the name \"Dia-1.6B\" suggests it might be related to a project or software version.\n", + "\n", + "### Entities\n", + "No specific individuals or organizations are mentioned in the provided content.\n", + "\n", + "### Updates\n", + "The content was updated 1 day ago, indicating recent activity or changes. However, the nature of these updates is not specified.\n", + "\n", + "### Topics\n", + "The snippet does not provide enough information to identify primary subjects or themes.\n", + "\n", + "### Features\n", + "The content does not detail any specific projects or initiatives.\n", + "\n", + "**Note:** The analysis is limited due to the lack of detailed information in the provided content snippet. More comprehensive content would be required for a complete analysis.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (WSL-Lakov)", + "language": "python", + "name": "lakov-wsl" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb b/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb new file mode 100644 index 0000000..08957f3 --- /dev/null +++ b/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb @@ -0,0 +1,62 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d2eeed62", + "metadata": {}, + "source": [ + "\n", + "# Summary for https://runwayml.com\n", + "\n", + "This notebook contains an AI-generated summary of the website content.\n", + "\n", + "**URL**: `https://runwayml.com`\n", + "\n", + "---\n", + "**Analysis**:\n", + "### Summary\n", + "The website promotes a series of short films created using \"Gen-4,\" which is described as the next-generation series of AI models designed for media generation and ensuring world consistency. 
diff --git a/community-contributions/bojan-playwright-scraper/notebooks/deepmind_google_Summary.ipynb b/community-contributions/bojan-playwright-scraper/notebooks/deepmind_google_Summary.ipynb
new file mode 100644
index 0000000..e419aab
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/notebooks/deepmind_google_Summary.ipynb
@@ -0,0 +1,60 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "144bdfa2",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Summary for https://deepmind.google\n",
+    "\n",
+    "This notebook contains an AI-generated summary of the website content.\n",
+    "\n",
+    "**URL**: `https://deepmind.google`\n",
+    "\n",
+    "---\n",
+    "**Analysis**:\n",
+    "### Summary\n",
+    "The website introduces \"Gemini 2.5,\" which appears to be the latest version of an AI model designed for the \"agentic era.\" The site likely focuses on promoting and explaining the capabilities and applications of this AI technology.\n",
+    "\n",
+    "### Entities\n",
+    "- **Gemini 2.5**: This is the primary entity mentioned, referring to the AI model.\n",
+    "- No specific individuals or organizations are named in the provided content.\n",
+    "\n",
+    "### Updates\n",
+    "- The introduction of \"Gemini 2.5\" is a recent update, indicating a new or significantly updated version of the AI model.\n",
+    "\n",
+    "### Topics\n",
+    "- **AI Models**: The site focuses on artificial intelligence technologies.\n",
+    "- **Agentic Era**: This suggests a theme of AI models being used in ways that are proactive or autonomous.\n",
+    "\n",
+    "### Features\n",
+    "- **Chat with Gemini**: This feature allows users to interact directly with the Gemini 2.5 AI, presumably to demonstrate its capabilities or to provide user support.\n",
+    "- Detailed descriptions of other projects or initiatives are not provided in the content.\n",
+    "\n",
+    "**Note**: The content provided is limited, and additional information might be available on the actual website to provide a more comprehensive analysis.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (WSL-Lakov)",
+   "language": "python",
+   "name": "lakov-wsl"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/community-contributions/bojan-playwright-scraper/notebooks/huggingface_co_Summary.ipynb b/community-contributions/bojan-playwright-scraper/notebooks/huggingface_co_Summary.ipynb
new file mode 100644
index 0000000..ef30e75
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/notebooks/huggingface_co_Summary.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3069b0e8",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Summary for https://huggingface.co\n",
+    "\n",
+    "This notebook contains an AI-generated summary of the website content.\n",
+    "\n",
+    "**URL**: `https://huggingface.co`\n",
+    "\n",
+    "---\n",
+    "**Analysis**:\n",
+    "Based on the provided content snippet, here is an analysis structured under the requested headings:\n",
+    "\n",
+    "### Summary\n",
+    "The information provided is insufficient to determine the exact purpose of the website. However, the name \"Dia-1.6B\" suggests it might be related to a project or software version.\n",
+    "\n",
+    "### Entities\n",
+    "No specific individuals or organizations are mentioned in the provided content.\n",
+    "\n",
+    "### Updates\n",
+    "The content was updated 1 day ago, indicating recent activity or changes. However, the nature of these updates is not specified.\n",
+    "\n",
+    "### Topics\n",
+    "The snippet does not provide enough information to identify primary subjects or themes.\n",
+    "\n",
+    "### Features\n",
+    "The content does not detail any specific projects or initiatives.\n",
+    "\n",
+    "**Note:** The analysis is limited due to the lack of detailed information in the provided content snippet. More comprehensive content would be required for a complete analysis.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (WSL-Lakov)",
+   "language": "python",
+   "name": "lakov-wsl"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb b/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb
new file mode 100644
index 0000000..08957f3
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/notebooks/runwayml_com_Summary.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d2eeed62",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Summary for https://runwayml.com\n",
+    "\n",
+    "This notebook contains an AI-generated summary of the website content.\n",
+    "\n",
+    "**URL**: `https://runwayml.com`\n",
+    "\n",
+    "---\n",
+    "**Analysis**:\n",
+    "### Summary\n",
+    "The website promotes a series of short films created using \"Gen-4,\" which is described as the next-generation series of AI models designed for media generation and ensuring world consistency. The site appears to focus on showcasing the capabilities of these AI models in filmmaking.\n",
+    "\n",
+    "### Entities\n",
+    "- **Gen-4**: The AI model series used for creating the films.\n",
+    "- No specific individuals or organizations are mentioned beyond the reference to the AI technology.\n",
+    "\n",
+    "### Updates\n",
+    "- There are no specific recent announcements or news updates provided in the content.\n",
+    "\n",
+    "### Topics\n",
+    "- **AI in Filmmaking**: The use of advanced AI models in the creation of films.\n",
+    "- **Short Films**: Mention of specific titles like \"The Lonely Little Flame,\" \"NYC is a Zoo,\" and \"The Herd\" suggests a focus on narrative short films.\n",
+    "- **Technology in Media Production**: Emphasis on the role of Gen-4 AI technology in media production.\n",
+    "\n",
+    "### Features\n",
+    "- **Gen-4 AI Models**: Highlighted as a significant innovation in media generation.\n",
+    "- **Short Films**: The films listed (\"The Lonely Little Flame,\" \"NYC is a Zoo,\" \"The Herd\") are examples of projects created using the Gen-4 technology.\n",
+    "- **Interactive Elements**: Options to \"Try Runway Now\" and \"Learn More About Gen-4\" suggest interactive features for visitors to engage with the technology or learn more about it.\n",
+    "\n",
+    "Additional information about the specific functionality of the Gen-4 AI models, the background of the organization, or detailed descriptions of the films would be needed for a more comprehensive analysis.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (WSL-Lakov)",
+   "language": "python",
+   "name": "lakov-wsl"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/community-contributions/bojan-playwright-scraper/notebooks/www_anthropic_com_Summary.ipynb b/community-contributions/bojan-playwright-scraper/notebooks/www_anthropic_com_Summary.ipynb
new file mode 100644
index 0000000..a037389
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/notebooks/www_anthropic_com_Summary.ipynb
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cccf3fd8",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Summary for https://www.anthropic.com\n",
+    "\n",
+    "This notebook contains an AI-generated summary of the website content.\n",
+    "\n",
+    "**URL**: `https://www.anthropic.com`\n",
+    "\n",
+    "---\n",
+    "**Analysis**:\n",
+    "### Summary\n",
+    "The website is dedicated to showcasing AI research and products with a strong emphasis on safety. It introduces \"Claude 3.7 Sonnet,\" described as their most intelligent AI model, and highlights the organization's commitment to building AI that serves humanity's long-term well-being. The site also offers resources and tools for building AI-powered applications and emphasizes responsible AI development.\n",
+    "\n",
+    "### Entities\n",
+    "- **Anthropic**: The organization behind the website, focused on developing AI technologies with an emphasis on safety and human benefit.\n",
+    "- **Claude 3.7 Sonnet**: The latest AI model featured prominently on the site.\n",
+    "\n",
+    "### Updates\n",
+    "Recent announcements or news include:\n",
+    "- **Mar 27, 2025**: Articles on \"Tracing the thoughts of a large language model\" and \"Anthropic Economic Index.\"\n",
+    "- **Feb 24, 2025**: Releases of \"Claude 3.7 Sonnet and Claude Code\" and \"Claude's extended thinking.\"\n",
+    "- **Dec 18, 2024**: Discussion on \"Alignment faking in large language models.\"\n",
+    "- **Nov 25, 2024**: Introduction of the \"Model Context Protocol.\"\n",
+    "\n",
+    "### Topics\n",
+    "Primary subjects or themes covered on the website include:\n",
+    "- AI Safety and Ethics\n",
+    "- AI-powered Applications Development\n",
+    "- Responsible AI Development\n",
+    "- AI Research and Policy Work\n",
+    "\n",
+    "### Features\n",
+    "Noteworthy projects or initiatives mentioned:\n",
+    "- **Claude 3.7 Sonnet**: The latest AI model available for use.\n",
+    "- **Anthropic Academy**: An educational initiative to teach users how to build with Claude.\n",
+    "- **Anthropic’s Responsible Scaling Policy**: A policy framework guiding the responsible development of AI technologies.\n",
+    "- **Model Context Protocol**: A new product initiative aimed at enhancing AI model understanding and safety.\n",
+    "\n",
+    "These sections collectively provide a comprehensive view of the website's focus on advancing AI technology with a foundational commitment to safety and ethical considerations.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (WSL-Lakov)",
+   "language": "python",
+   "name": "lakov-wsl"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/community-contributions/bojan-playwright-scraper/playwright_ai_scraper.py b/community-contributions/bojan-playwright-scraper/playwright_ai_scraper.py
new file mode 100644
index 0000000..d84d0f3
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/playwright_ai_scraper.py
@@ -0,0 +1,245 @@
+# playwright_ai_scraper.py
+import asyncio
+import logging
+import random
+import time
+import os
+from playwright.async_api import async_playwright
+from openai import OpenAI
+from prometheus_client import Counter, Histogram, start_http_server
+from diskcache import Cache
+from dotenv import load_dotenv
+
+# Load .env variables
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(
+    level=os.getenv("LOG_LEVEL", "INFO").upper(),
+    format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Prometheus metrics setup
+SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
+SCRAPE_DURATION = Histogram(
+    "scrape_duration", "Scraping duration distribution"
+)
+
+# Cache setup
+cache = Cache("./scraper_cache")
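+# Note: once start_http_server() runs in main(), prometheus_client serves these
+# metrics at http://localhost:<PROMETHEUS_PORT>/metrics (the Counter shows up as
+# scrape_attempts_total); deleting ./scraper_cache forces a fresh scrape.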
+
+# Custom exceptions
+
+
+class ScrapingError(Exception):
+    pass
+
+
+class AnalysisError(Exception):
+    pass
+
+
+class AIScraper:
+    API_KEY = os.getenv("OPENAI_API_KEY")
+    MAX_CONTENT = int(os.getenv("MAX_CONTENT_LENGTH", 30000))
+
+    def __init__(self, headless=True):
+        self.user_agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
+            "Safari/537.36"
+        ]
+        self.timeout = 60000  # 60 seconds
+        self.retries = int(os.getenv("RETRY_COUNT", 2))
+        self.headless = headless
+        self.delays = {
+            "scroll": (500, 2000),
+            "click": (100, 300),
+            "move": (50, 200)
+        }
+
+    async def human_interaction(self, page):
+        """Simulates human-like behavior on the page."""
+        try:
+            for _ in range(random.randint(2, 5)):
+                x = random.randint(0, 1366)
+                y = random.randint(0, 768)
+                await page.mouse.move(x, y, steps=random.randint(5, 20))
+                await page.wait_for_timeout(
+                    random.randint(*self.delays["move"])
+                )
+                scroll = random.choice([300, 600, 900])
+                await page.mouse.wheel(0, scroll)
+                await page.wait_for_timeout(
+                    random.randint(*self.delays["scroll"])
+                )
+        except Exception as e:
+            logging.warning(f"Human interaction failed: {e}")
+
+    async def load_page(self, page, url):
+        """Loads the page with a dynamic wait."""
+        start_time = time.time()
+        try:
+            await page.goto(
+                url, wait_until="domcontentloaded", timeout=self.timeout
+            )
+            selectors = [
+                "main article",
+                "#main-content",
+                "section:first-of-type",
+                'div[class*="content"]',
+                "body"
+            ]
+            for selector in selectors:
+                element = await page.query_selector(selector)
+                if element:
+                    return True
+            # No selector matched yet: top up the wait so dynamic pages get
+            # up to ~30 seconds in total to render
+            # (wait_for_timeout expects milliseconds, so convert the elapsed time)
+            if time.time() - start_time < 30:
+                await page.wait_for_timeout(
+                    30000 - int((time.time() - start_time) * 1000)
+                )
+            return True
+        except Exception as e:
+            logging.error(f"Error loading {url}: {e}")
+            return False
+
+    async def scrape_with_retry(self, url):
+        """Scrapes the page with retries."""
+        SCRAPE_ATTEMPTS.inc()
+        start_time = time.time()
+        async with async_playwright() as p:
+            browser = None
+            try:
+                browser = await p.chromium.launch(headless=self.headless)
+                context = await browser.new_context(
+                    user_agent=random.choice(self.user_agents),
+                    viewport={"width": 1366, "height": 768}
+                )
+                page = await context.new_page()
+                # Mask the headless browser by overriding navigator.webdriver
+                await page.add_init_script("""
+                    Object.defineProperty(navigator, 'webdriver', {
+                        get: () => false
+                    });
+                """)
+                for attempt in range(self.retries):
+                    try:
+                        logging.info(
+                            f"Attempt {attempt + 1}: Scraping {url}")
+                        if not await self.load_page(page, url):
+                            raise ScrapingError(f"Failed to load {url}")
+                        await self.human_interaction(page)
+                        content = await page.evaluate(
+                            """() => {
+                                const s = [
+                                    'main article',
+                                    '#main-content',
+                                    'section:first-of-type',
+                                    'div[class*="content"]'
+                                ];
+                                let c = '';
+                                for (const x of s) {
+                                    const e = document.querySelector(x);
+                                    if (e) c += e.innerText + '\\n';
+                                }
+                                return c.trim() || document.body.innerText;
+                            }"""
+                        )
+                        if not content.strip():
+                            raise ScrapingError("No content")
+                        SCRAPE_DURATION.observe(time.time() - start_time)
+                        return content[:self.MAX_CONTENT]
+                    except ScrapingError as e:
+                        logging.warning(f"Attempt {attempt + 1} failed: {e}")
+                        if attempt < self.retries - 1:
+                            await asyncio.sleep(5)
+                        else:
+                            raise
+            except Exception as e:
+                logging.error(f"Error in scrape: {e}")
+                raise
+            finally:
+                if browser:
+                    await browser.close()
+        raise ScrapingError(f"All attempts to scrape {url} failed")
+
+    async def get_cached_content(self, url):
+        """Fetches content from the cache, scraping on a cache miss."""
+        key = f"content_{url.replace('/', '_')}"
+        content = cache.get(key)
+        if content is None:
+            try:
+                content = await self.scrape_with_retry(url)
+                cache.set(
+                    key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600))
+                )
+            except Exception as e:
+                logging.error(f"Cache fetch failed: {e}")
+                raise
+        return content
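+
+# Example (illustrative): fetch raw page text without the OpenAI analysis step.
+#   scraper = AIScraper(headless=True)
+#   text = asyncio.run(scraper.get_cached_content("https://deepmind.google"))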
+
+
+async def analyze_content(url, headless=True):
+    """Analyzes page content using the OpenAI API."""
+    try:
+        scraper = AIScraper(headless=headless)
+        # Check the key up front: OpenAI() raises if it is missing, so a
+        # post-construction check would never fire
+        if not scraper.API_KEY:
+            raise AnalysisError("OpenAI API key not configured")
+        content = await scraper.get_cached_content(url)
+        client = OpenAI(api_key=scraper.API_KEY)
+        prompt = """
+        Analyze the website content and extract:
+        1. **Summary**: Overview of the website's purpose.
+        2. **Entities**: Prominent individuals or organizations.
+        3. **Updates**: Recent announcements or news.
+        4. **Topics**: Primary subjects or themes.
+        5. **Features**: Noteworthy projects or initiatives.
+        Format output under these headings. Note if info is missing.
+        Content: {content}
+        """.format(content=content)
+        response = client.chat.completions.create(
+            model=os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=float(os.getenv("MODEL_TEMPERATURE", 0.3)),
+            max_tokens=int(os.getenv("MAX_TOKENS", 1500)),
+            top_p=float(os.getenv("MODEL_TOP_P", 0.9))
+        )
+        if not response.choices:
+            raise AnalysisError("Empty response from OpenAI")
+        return response.choices[0].message.content
+    except (ScrapingError, AnalysisError) as e:
+        logging.error(f"Analysis failed: {e}")
+        return f"Error: {e}"
+    except Exception as e:
+        logging.exception(f"Error in analyze: {e}")
+        return f"Unexpected error: {e}"
+
+
+async def main():
+    """Main function for scraping and analysis."""
+    try:
+        port = int(os.getenv("PROMETHEUS_PORT", 8000))
+        start_http_server(port)
+        logging.info(f"Prometheus server started on port {port}")
+    except Exception as e:
+        logging.warning(f"Prometheus server failed: {e}")
+    urls = [
+        "https://www.anthropic.com",
+        "https://deepmind.google",
+        "https://huggingface.co",
+        "https://runwayml.com"
+    ]
+    for url in urls:
+        start_time = time.time()
+        result = await analyze_content(url, headless=True)
+        end_time = time.time()
+        print(
+            f"\nAnalysis of {url} completed in "
+            f"{end_time - start_time:.2f} seconds\n"
+        )
+        print(result)
+
+if __name__ == "__main__":
+    asyncio.run(main())
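+
+# Quick observability check while main() is running (assumes the default
+# PROMETHEUS_PORT of 8000):
+#   $ curl -s http://localhost:8000/metrics | grep scrape_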
diff --git a/community-contributions/bojan-playwright-scraper/requirements.txt b/community-contributions/bojan-playwright-scraper/requirements.txt
new file mode 100644
index 0000000..50498f7
--- /dev/null
+++ b/community-contributions/bojan-playwright-scraper/requirements.txt
@@ -0,0 +1,6 @@
+playwright>=1.43.0
+openai>=1.14.2
+prometheus-client>=0.19.0
+diskcache>=5.6.1
+python-dotenv>=1.0.1
+nest_asyncio>=1.6.0