Update comments to American English and ignore scraper_cache

- Translated Croatian comments to American English in notebook_generator.py and playwright_ai_scraper.py.
- Added scraper_cache/ to .gitignore to exclude the cache directory from the repository.
2025-04-29 12:03:16 +02:00
parent 1a626abba0
commit 5684963b77
3 changed files with 20 additions and 19 deletions
--- a/community-contributions/bojan-playwright-scraper/notebook_generator.py
+++ b/community-contributions/bojan-playwright-scraper/notebook_generator.py
@@ -6,16 +6,16 @@ import asyncio
 from dotenv import load_dotenv
 import logging

-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()

-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
 )

-# Dodavanje direktorija projekta u sys.path
+# Adding project directory to sys.path
 project_dir = os.path.join(
    "/home/lakov/projects/llm_engineering",
    "community-contributions/playwright-bojan"
@@ -23,21 +23,21 @@ project_dir = os.path.join(
 if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

-# Uvoz analyze_content iz playwright_ai_scraper.py
+# Importing analyze_content from playwright_ai_scraper.py
 try:
    from playwright_ai_scraper import analyze_content
 except ModuleNotFoundError as e:
    logging.error(f"Error importing module: {e}")
    sys.exit(1)

-# Funkcija za spremanje notebooka
+# Function to save the notebook


 def save_notebook(url, content):
    output_dir = os.path.join(project_dir, "notebooks")
    os.makedirs(output_dir, exist_ok=True)

-    # Izvlačenje domene iz URL-a
+    # Extracting the domain from the URL
    domain = url.split("//")[-1].split("/")[0].replace(".", "_")
    filename = f"{domain}_Summary.ipynb"
    path = os.path.join(output_dir, filename)
@@ -62,7 +62,7 @@ This notebook contains an AI-generated summary of the website content.
    logging.info(f"Notebook saved to: {path}")
    return path

-# Glavna funkcija
+# Main function


 async def main():
--- a/community-contributions/bojan-playwright-scraper/playwright_ai_scraper.py
+++ b/community-contributions/bojan-playwright-scraper/playwright_ai_scraper.py
@@ -10,25 +10,25 @@ from prometheus_client import Counter, Histogram, start_http_server
 from diskcache import Cache
 from dotenv import load_dotenv

-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()

-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s - %(levelname)s - %(message)s"
 )

-# Postavljanje Prometheus metrika
+# Setting up Prometheus metrics
 SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
 SCRAPE_DURATION = Histogram(
    "scrape_duration", "Scraping duration distribution"
 )

-# Postavljanje cachea
+# Setting up cache
 cache = Cache("./scraper_cache")

-# Prilagođene iznimke
+# Custom exceptions


 class ScrapingError(Exception):
@@ -51,7 +51,7 @@ class AIScraper:
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
            "Safari/537.36"
        ]
-        self.timeout = 60000  # 60 sekundi
+        self.timeout = 60000  # 60 seconds
        self.retries = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.delays = {
@@ -61,7 +61,7 @@ class AIScraper:
        }

    async def human_interaction(self, page):
-        """Simulira ljudsko ponašanje na stranici."""
+        """Simulates human behavior on the page."""
        try:
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
@@ -79,7 +79,7 @@ class AIScraper:
            logging.warning(f"Human interaction failed: {e}")

    async def load_page(self, page, url):
-        """Učitava stranicu s dinamičkim čekanjem."""
+        """Loads the page with dynamic waiting."""
        start_time = time.time()
        try:
            await page.goto(
@@ -106,7 +106,7 @@ class AIScraper:
            return False

    async def scrape_with_retry(self, url):
-        """Scrapa stranicu s ponovnim pokušajima."""
+        """Scrapes the page with retries."""
        SCRAPE_ATTEMPTS.inc()
        start_time = time.time()
        async with async_playwright() as p:
@@ -163,7 +163,7 @@ class AIScraper:
        raise ScrapingError(f"All attempts to scrape {url} failed")

    async def get_cached_content(self, url):
-        """Dohvaća sadržaj iz cachea ili scrapa."""
+        """Retrieves content from cache or scrapes."""
        key = f"content_{url.replace('/', '_')}"
        content = cache.get(key)
        if content is None:
@@ -179,7 +179,7 @@ class AIScraper:


 async def analyze_content(url, headless=True):
-    """Analizira sadržaj stranice koristeći OpenAI API."""
+    """Analyzes the page content using the OpenAI API."""
    try:
        scraper = AIScraper(headless=headless)
        content = await scraper.get_cached_content(url)
@@ -218,7 +218,7 @@ async def analyze_content(url, headless=True):


 async def main():
-    """Glavna funkcija za scraping i analizu."""
+    """Main function for scraping and analysis."""
    try:
        port = int(os.getenv("PROMETHEUS_PORT", 8000))
        start_http_server(port)