Update comments to American English and ignore scraper_cache
- Translated Croatian comments to American English in notebook_generator.py and playwright_ai_scraper.py.
- Added scraper_cache/ to .gitignore to exclude the cache directory from the repository.
.gitignore
@@ -188,3 +188,4 @@ WingIDE_Projekti/
 nohup.out
 *.png
 
+scraper_cache/
notebook_generator.py
@@ -6,16 +6,16 @@ import asyncio
 from dotenv import load_dotenv
 import logging
 
-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()
 
-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO").upper(),
     format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
-# Dodavanje direktorija projekta u sys.path
+# Adding project directory to sys.path
 project_dir = os.path.join(
     "/home/lakov/projects/llm_engineering",
     "community-contributions/playwright-bojan"
@@ -23,21 +23,21 @@ project_dir = os.path.join(
 if project_dir not in sys.path:
     sys.path.insert(0, project_dir)
 
-# Uvoz analyze_content iz playwright_ai_scraper.py
+# Importing analyze_content from playwright_ai_scraper.py
 try:
     from playwright_ai_scraper import analyze_content
 except ModuleNotFoundError as e:
     logging.error(f"Error importing module: {e}")
     sys.exit(1)
 
-# Funkcija za spremanje notebooka
+# Function to save the notebook
 
 
 def save_notebook(url, content):
     output_dir = os.path.join(project_dir, "notebooks")
     os.makedirs(output_dir, exist_ok=True)
 
-    # Izvlačenje domene iz URL-a
+    # Extracting the domain from the URL
     domain = url.split("//")[-1].split("/")[0].replace(".", "_")
     filename = f"{domain}_Summary.ipynb"
     path = os.path.join(output_dir, filename)
@@ -62,7 +62,7 @@ This notebook contains an AI-generated summary of the website content.
     logging.info(f"Notebook saved to: {path}")
     return path
 
-# Glavna funkcija
+# Main function
 
 
 async def main():
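Note: the hunk header above shows save_notebook writing a notebook whose body includes "This notebook contains an AI-generated summary of the website content." As a hedged sketch, a minimal nbformat-4 document of that shape could be built like this (the exact cell layout is an assumption, not the committed code):

import json

def build_notebook_json(url, content):
    # Minimal nbformat-4 document: one markdown cell holding the summary.
    return json.dumps({
        "cells": [{
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                f"# Summary of {url}\n",
                "This notebook contains an AI-generated summary of the website content.\n",
                content,
            ],
        }],
        "metadata": {},
        "nbformat": 4,
        "nbformat_minor": 5,
    }, indent=1)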
playwright_ai_scraper.py
@@ -10,25 +10,25 @@ from prometheus_client import Counter, Histogram, start_http_server
 from diskcache import Cache
 from dotenv import load_dotenv
 
-# Učitavanje .env varijabli
+# Loading .env variables
 load_dotenv()
 
-# Postavljanje logginga
+# Setting up logging
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO").upper(),
     format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
-# Postavljanje Prometheus metrika
+# Setting up Prometheus metrics
 SCRAPE_ATTEMPTS = Counter("scrape_attempts", "Total scraping attempts")
 SCRAPE_DURATION = Histogram(
     "scrape_duration", "Scraping duration distribution"
 )
 
-# Postavljanje cachea
+# Setting up cache
 cache = Cache("./scraper_cache")
 
-# Prilagođene iznimke
+# Custom exceptions
 
 
 class ScrapingError(Exception):
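Note: these metrics follow the standard prometheus_client idioms; an illustrative sketch of how they are typically exercised (the real .inc() call appears later in scrape_with_retry):

SCRAPE_ATTEMPTS.inc()          # count one scraping attempt
with SCRAPE_DURATION.time():   # record elapsed seconds into the histogram
    ...                        # perform the scrape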
@@ -51,7 +51,7 @@ class AIScraper:
             "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
             "Safari/537.36"
         ]
-        self.timeout = 60000  # 60 sekundi
+        self.timeout = 60000  # 60 seconds
         self.retries = int(os.getenv("RETRY_COUNT", 2))
         self.headless = headless
         self.delays = {
@@ -61,7 +61,7 @@ class AIScraper:
         }
 
     async def human_interaction(self, page):
-        """Simulira ljudsko ponašanje na stranici."""
+        """Simulates human behavior on the page."""
         try:
             for _ in range(random.randint(2, 5)):
                 x = random.randint(0, 1366)
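Note: the loop above is cut off by the hunk; a hedged completion using Playwright's real async mouse API (the y-range and pause values are assumptions):

            for _ in range(random.randint(2, 5)):
                x = random.randint(0, 1366)
                y = random.randint(0, 768)   # assumed viewport height
                await page.mouse.move(x, y)  # Playwright async mouse API
                await page.wait_for_timeout(random.randint(100, 500))  # pause in ms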
@@ -79,7 +79,7 @@ class AIScraper:
             logging.warning(f"Human interaction failed: {e}")
 
     async def load_page(self, page, url):
-        """Učitava stranicu s dinamičkim čekanjem."""
+        """Loads the page with dynamic waiting."""
         start_time = time.time()
         try:
             await page.goto(
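Note: the page.goto call is truncated by the hunk; a hedged completion with real Playwright keyword arguments (the wait_until choice is an assumption):

            await page.goto(
                url,
                wait_until="domcontentloaded",  # "networkidle" waits out quieter pages
                timeout=self.timeout,           # 60000 ms, set in __init__
            )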
@@ -106,7 +106,7 @@ class AIScraper:
             return False
 
     async def scrape_with_retry(self, url):
-        """Scrapa stranicu s ponovnim pokušajima."""
+        """Scrapes the page with retries."""
         SCRAPE_ATTEMPTS.inc()
         start_time = time.time()
         async with async_playwright() as p:
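Note: a hedged sketch of the retry shape this method implies, built on self.retries from __init__; load_and_extract is a hypothetical stand-in for the browser work inside the async_playwright block, and the backoff is an assumption (requires asyncio to be imported):

        for attempt in range(self.retries + 1):
            try:
                return await self.load_and_extract(url)  # hypothetical helper
            except Exception as e:
                logging.warning(f"Attempt {attempt + 1} failed: {e}")
                await asyncio.sleep(2 ** attempt)  # assumed exponential backoff
        raise ScrapingError(f"All attempts to scrape {url} failed")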
@@ -163,7 +163,7 @@ class AIScraper:
             raise ScrapingError(f"All attempts to scrape {url} failed")
 
     async def get_cached_content(self, url):
-        """Dohvaća sadržaj iz cachea ili scrapa."""
+        """Retrieves content from the cache, or scrapes the page."""
         key = f"content_{url.replace('/', '_')}"
         content = cache.get(key)
         if content is None:
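Note: get_cached_content is a classic cache-aside pattern over diskcache; a hedged completion of the branch shown (cache.set with expire is the real diskcache API, but the TTL value is an assumption):

        if content is None:
            content = await self.scrape_with_retry(url)
            cache.set(key, content, expire=3600)  # assumed 1-hour TTL
        return content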
@@ -179,7 +179,7 @@ class AIScraper:
 
 
 async def analyze_content(url, headless=True):
-    """Analizira sadržaj stranice koristeći OpenAI API."""
+    """Analyzes the page content using the OpenAI API."""
     try:
         scraper = AIScraper(headless=headless)
         content = await scraper.get_cached_content(url)
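Note: a quick way to exercise analyze_content from a separate script (the URL is illustrative; returning the summary text is implied by how notebook_generator.py consumes it):

import asyncio
from playwright_ai_scraper import analyze_content

summary = asyncio.run(analyze_content("https://example.com", headless=True))
print(summary)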
@@ -218,7 +218,7 @@ async def analyze_content(url, headless=True):
 
 
 async def main():
-    """Glavna funkcija za scraping i analizu."""
+    """Main function for scraping and analysis."""
     try:
         port = int(os.getenv("PROMETHEUS_PORT", 8000))
         start_http_server(port)