Update comments to American English and ignore scraper_cache

- Translated Croatian comments to American English in notebook_generator.py and playwright_ai_scraper.py.
- Added scraper_cache/ to .gitignore to exclude the cache directory from the repository.
lakovicb
2025-04-29 12:03:16 +02:00
parent 1a626abba0
commit 5684963b77
3 changed files with 20 additions and 19 deletions

.gitignore

@@ -188,3 +188,4 @@ WingIDE_Projekti/
 nohup.out
 *.png
+scraper_cache/
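
For context, the scraper_cache/ directory is runtime state, not source: diskcache creates it automatically the first time the cache is opened (see the Cache("./scraper_cache") call in playwright_ai_scraper.py below), so it belongs in .gitignore. A minimal sketch of that behavior:

from diskcache import Cache

# Opening the cache creates ./scraper_cache/ on disk if it does not exist.
cache = Cache("./scraper_cache")
cache.set("example_key", "example_value")  # persisted under scraper_cache/
print(cache.get("example_key"))            # -> "example_value"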

notebook_generator.py

@@ -6,16 +6,16 @@ import asyncio
 from dotenv import load_dotenv
 import logging
-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()
-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO").upper(),
     format="%(asctime)s - %(levelname)s - %(message)s"
 )
-# Dodavanje direktorija projekta u sys.path
+# Adding project directory to sys.path
 project_dir = os.path.join(
     "/home/lakov/projects/llm_engineering",
     "community-contributions/playwright-bojan"
@@ -23,21 +23,21 @@ project_dir = os.path.join(
 if project_dir not in sys.path:
     sys.path.insert(0, project_dir)
-# Uvoz analyze_content iz playwright_ai_scraper.py
+# Importing analyze_content from playwright_ai_scraper.py
 try:
     from playwright_ai_scraper import analyze_content
 except ModuleNotFoundError as e:
     logging.error(f"Error importing module: {e}")
     sys.exit(1)
-# Funkcija za spremanje notebooka
+# Function to save the notebook
 def save_notebook(url, content):
     output_dir = os.path.join(project_dir, "notebooks")
     os.makedirs(output_dir, exist_ok=True)
-    # Izvlačenje domene iz URL-a
+    # Extracting the domain from the URL
     domain = url.split("//")[-1].split("/")[0].replace(".", "_")
     filename = f"{domain}_Summary.ipynb"
     path = os.path.join(output_dir, filename)
@@ -62,7 +62,7 @@ This notebook contains an AI-generated summary of the website content.
     logging.info(f"Notebook saved to: {path}")
     return path
-# Glavna funkcija
+# Main function
 async def main():
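
As a quick illustration of the filename logic in save_notebook above, using a hypothetical URL:

url = "https://example.com/some/page"
domain = url.split("//")[-1].split("/")[0].replace(".", "_")
print(domain)                        # -> example_com
print(f"{domain}_Summary.ipynb")     # -> example_com_Summary.ipynb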

playwright_ai_scraper.py

@@ -10,25 +10,25 @@ from prometheus_client import Counter, Histogram, start_http_server
 from diskcache import Cache
 from dotenv import load_dotenv
-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()
-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO").upper(),
     format="%(asctime)s - %(levelname)s - %(message)s"
 )
-# Postavljanje Prometheus metrika
+# Setting up Prometheus metrics
 SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
 SCRAPE_DURATION = Histogram(
     "scrape_duration", "Scraping duration distribution"
 )
-# Postavljanje cachea
+# Setting up cache
 cache = Cache("./scraper_cache")
-# Prilagođene iznimke
+# Custom exceptions
 class ScrapingError(Exception):
@@ -51,7 +51,7 @@ class AIScraper:
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
"Safari/537.36" "Safari/537.36"
] ]
self.timeout = 60000 # 60 sekundi self.timeout = 60000 # 60 seconds
self.retries = int(os.getenv("RETRY_COUNT", 2)) self.retries = int(os.getenv("RETRY_COUNT", 2))
self.headless = headless self.headless = headless
self.delays = { self.delays = {
@@ -61,7 +61,7 @@ class AIScraper:
         }
     async def human_interaction(self, page):
-        """Simulira ljudsko ponašanje na stranici."""
+        """Simulates human behavior on the page."""
         try:
             for _ in range(random.randint(2, 5)):
                 x = random.randint(0, 1366)
@@ -79,7 +79,7 @@ class AIScraper:
             logging.warning(f"Human interaction failed: {e}")
     async def load_page(self, page, url):
-        """Učitava stranicu s dinamičkim čekanjem."""
+        """Loads the page with dynamic waiting."""
         start_time = time.time()
         try:
             await page.goto(
@@ -106,7 +106,7 @@ class AIScraper:
             return False
     async def scrape_with_retry(self, url):
-        """Scrapa stranicu s ponovnim pokušajima."""
+        """Scrapes the page with retries."""
         SCRAPE_ATTEMPTS.inc()
         start_time = time.time()
         async with async_playwright() as p:
@@ -163,7 +163,7 @@ class AIScraper:
         raise ScrapingError(f"All attempts to scrape {url} failed")
     async def get_cached_content(self, url):
-        """Dohvaća sadržaj iz cachea ili scrapa."""
+        """Retrieves content from cache or scrapes."""
         key = f"content_{url.replace('/', '_')}"
         content = cache.get(key)
         if content is None:
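
The hunk above shows the lookup half of get_cached_content; a minimal standalone sketch of the full cache-or-scrape pattern, assuming a one-hour expiry (the expire value is not visible in this diff):

from diskcache import Cache

cache = Cache("./scraper_cache")

async def get_cached_content_sketch(url, scrape):
    # Same key scheme as in the diff: slashes replaced by underscores.
    key = f"content_{url.replace('/', '_')}"
    content = cache.get(key)
    if content is None:                       # cache miss -> scrape live
        content = await scrape(url)
        cache.set(key, content, expire=3600)  # hypothetical 1-hour TTL
    return content
    # usage: asyncio.run(get_cached_content_sketch(url, my_scraper))
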
@@ -179,7 +179,7 @@ class AIScraper:
 async def analyze_content(url, headless=True):
-    """Analizira sadržaj stranice koristeći OpenAI API."""
+    """Analyzes the page content using the OpenAI API."""
     try:
         scraper = AIScraper(headless=headless)
         content = await scraper.get_cached_content(url)
@@ -218,7 +218,7 @@ async def analyze_content(url, headless=True):
 async def main():
-    """Glavna funkcija za scraping i analizu."""
+    """Main function for scraping and analysis."""
     try:
         port = int(os.getenv("PROMETHEUS_PORT", 8000))
         start_http_server(port)
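
Finally, the SCRAPE_ATTEMPTS / SCRAPE_DURATION metrics and the start_http_server(port) call seen in this file follow the standard prometheus_client pattern; a minimal self-contained sketch (port 8000 matches the PROMETHEUS_PORT default above):

import time
from prometheus_client import Counter, Histogram, start_http_server

ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
DURATION = Histogram("scrape_duration", "Scraping duration distribution")

start_http_server(8000)  # metrics exposed at http://localhost:8000/metrics

ATTEMPTS.inc()                         # count one scraping attempt
start = time.time()
time.sleep(0.1)                        # stand-in for real scraping work
DURATION.observe(time.time() - start)  # record how long it took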