Add ReputationRadar community contribution (demo replaced by link)
@@ -0,0 +1,138 @@
"""Trustpilot scraping service with polite crawling safeguards."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    random_user_agent,
    sanitize_text,
)
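
# NOTE: NormalizedItem and the helpers above come from .utils, which is not part of
# this diff. Judging from how they are used below (keyword construction plus
# item["id"] indexing), NormalizedItem could be a TypedDict roughly along these
# lines (a sketch only, not the actual definition):
#
#     class NormalizedItem(TypedDict):
#         source: str
#         id: str
#         url: str
#         author: str | None
#         timestamp: datetime
#         text: str
#         meta: Dict[str, str]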

BASE_URL = "https://www.trustpilot.com"
SEARCH_PATH = "/search"


class BlockedError(ServiceWarning):
    """Raised when Trustpilot blocks the scraping attempt."""


def _check_robots(user_agent: str) -> None:
    """Raise a ServiceWarning if Trustpilot's robots.txt disallows the search endpoint."""
    parser = RobotFileParser()
    parser.set_url(f"{BASE_URL}/robots.txt")
    parser.read()
    if not parser.can_fetch(user_agent, SEARCH_PATH):
        raise ServiceWarning(
            "Trustpilot robots.txt disallows scraping the search endpoint. "
            "Please use the official API or upload data manually."
        )


@retry(
    reraise=True,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((requests.RequestException, BlockedError)),
)
def _fetch_page(session: requests.Session, user_agent: str, page: int, brand: str, language: str) -> str:
    """Fetch one page of Trustpilot search results, treating 401/403 responses as blocks."""
    params = {"query": brand, "page": page}
    if language:
        params["languages"] = language
    url = f"{BASE_URL}{SEARCH_PATH}?{urlencode(params)}"
    response = session.get(
        url,
        headers={"User-Agent": user_agent, "Accept-Language": language or "en"},
        timeout=20,
    )
    if response.status_code in (401, 403):
        raise BlockedError(f"Trustpilot denied access (HTTP {response.status_code}).")
    response.raise_for_status()
    return response.text
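

# The CSS selectors in _parse_reviews below target Trustpilot's rendered search markup
# as it looked when this contribution was written. Purely as an illustration (not
# actual Trustpilot HTML), each review card is assumed to resemble:
#
#     <article data-service-review-card-layout data-review-id="...">
#         <a class="link_internal__YpiJI" href="/reviews/...">...</a>
#         <h2>Review title</h2>
#         <p data-review-description-typography>Review body text</p>
#         <img alt="Rated 4 out of 5 stars" src="...">
#         <span class="styles_consumerDetails__ZF4I6">Reviewer name</span>
#         <time datetime="2024-01-01T12:00:00.000Z">Jan 1, 2024</time>
#     </article>
#
# The hashed class names (link_internal__YpiJI, styles_consumerDetails__ZF4I6) change
# whenever Trustpilot ships a new frontend build, so they are the selectors most likely
# to break.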


def _parse_reviews(html: str, user_agent: str) -> List[NormalizedItem]:
    """Extract normalized review items from a Trustpilot search results page."""
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("article[data-service-review-card-layout]")
    items: List[NormalizedItem] = []
    now = datetime.now(timezone.utc)
    for card in cards:
        link = card.select_one("a.link_internal__YpiJI")
        url = f"{BASE_URL}{link['href']}" if link and link.get("href") else ""
        title_el = card.select_one("h2")
        title = title_el.get_text(strip=True) if title_el else ""
        text_el = card.select_one("[data-review-description-typography]")
        text = text_el.get_text(separator=" ", strip=True) if text_el else ""
        rating_el = card.select_one("img[alt*='stars']")
        rating = rating_el["alt"] if rating_el and rating_el.get("alt") else ""
        author_el = card.select_one("span.styles_consumerDetails__ZF4I6")
        author = author_el.get_text(strip=True) if author_el else None
        date_el = card.select_one("time")
        timestamp = now
        if date_el and date_el.get("datetime"):
            try:
                timestamp = datetime.fromisoformat(date_el["datetime"].replace("Z", "+00:00"))
            except ValueError:
                timestamp = now

        body = sanitize_text(f"{title}\n\n{text}")
        if len(body) < 15:
            continue  # skip cards with no meaningful review text
        items.append(
            NormalizedItem(
                source="trustpilot",
                id=card.get("data-review-id", str(hash(body))),  # fall back to a content hash
                url=url,
                author=author,
                timestamp=ensure_timezone(timestamp),
                text=body,
                meta={
                    "rating": rating,
                    "user_agent": user_agent,
                },
            )
        )
    return items


def fetch_reviews(brand: str, language: str = "en", pages: int = 2) -> List[NormalizedItem]:
    """Scrape Trustpilot search results for recent reviews."""
    if not brand:
        raise ServiceWarning("Brand name is required for Trustpilot scraping.")

    session = requests.Session()
    user_agent = random_user_agent()
    _check_robots(user_agent)

    aggregated: List[NormalizedItem] = []
    seen_ids: set[str] = set()

    for page in range(1, pages + 1):
        try:
            html = _fetch_page(session, user_agent=user_agent, page=page, brand=brand, language=language)
        except BlockedError as exc:
            raise ServiceWarning(
                "Trustpilot blocked the scraping attempt. Consider using their official API or providing CSV uploads."
            ) from exc
        except requests.RequestException as exc:
            raise ServiceError(f"Trustpilot request failed: {exc}") from exc
        page_items = _parse_reviews(html, user_agent)
        for item in page_items:
            if item["id"] in seen_ids:
                continue
            aggregated.append(item)
            seen_ids.add(item["id"])
        time.sleep(1.5)  # gentle crawl delay between pages

    return aggregated
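
For quick manual testing, a call site might look like the sketch below. The reputationradar.services package path is a placeholder (this diff does not show where the module is mounted in the ReputationRadar tree), and ServiceWarning / ServiceError are the same classes imported from .utils above.

from reputationradar.services import trustpilot
from reputationradar.services.utils import ServiceError, ServiceWarning

try:
    reviews = trustpilot.fetch_reviews("Acme Widgets", language="en", pages=2)
except (ServiceWarning, ServiceError) as exc:
    print(f"Trustpilot scrape skipped: {exc}")
else:
    for review in reviews:
        print(review["timestamp"], review["meta"]["rating"], review["text"][:80])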