Add ReputationRadar community contribution (demo replaced by link)
@@ -0,0 +1,138 @@
"""Trustpilot scraping service with polite crawling safeguards."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    random_user_agent,
    sanitize_text,
)
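
# NOTE: NormalizedItem and the helpers above come from .utils, which is not part of
# this diff. Judging from how they are used below (keyword construction plus
# item["id"] indexing), NormalizedItem could be a TypedDict roughly along these
# lines (a sketch only, not the actual definition):
#
#     class NormalizedItem(TypedDict):
#         source: str
#         id: str
#         url: str
#         author: str | None
#         timestamp: datetime
#         text: str
#         meta: Dict[str, str]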

BASE_URL = "https://www.trustpilot.com"
SEARCH_PATH = "/search"


class BlockedError(ServiceWarning):
    """Raised when Trustpilot blocks the scraping attempt."""


def _check_robots(user_agent: str) -> None:
    """Raise a ServiceWarning if Trustpilot's robots.txt disallows the search endpoint."""
    parser = RobotFileParser()
    parser.set_url(f"{BASE_URL}/robots.txt")
    parser.read()
    if not parser.can_fetch(user_agent, SEARCH_PATH):
        raise ServiceWarning(
            "Trustpilot robots.txt disallows scraping the search endpoint. "
            "Please use the official API or upload data manually."
        )


@retry(
    reraise=True,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((requests.RequestException, BlockedError)),
)
def _fetch_page(session: requests.Session, user_agent: str, page: int, brand: str, language: str) -> str:
    """Fetch one page of Trustpilot search results, treating 401/403 responses as blocks."""
    params = {"query": brand, "page": page}
    if language:
        params["languages"] = language
    url = f"{BASE_URL}{SEARCH_PATH}?{urlencode(params)}"
    response = session.get(
        url,
        headers={"User-Agent": user_agent, "Accept-Language": language or "en"},
        timeout=20,
    )
    if response.status_code in (401, 403):
        raise BlockedError(f"Trustpilot denied access (HTTP {response.status_code}).")
    response.raise_for_status()
    return response.text
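

# The CSS selectors in _parse_reviews below target Trustpilot's rendered search markup
# as it looked when this contribution was written. Purely as an illustration (not
# actual Trustpilot HTML), each review card is assumed to resemble:
#
#     <article data-service-review-card-layout data-review-id="...">
#         <a class="link_internal__YpiJI" href="/reviews/...">...</a>
#         <h2>Review title</h2>
#         <p data-review-description-typography>Review body text</p>
#         <img alt="Rated 4 out of 5 stars" src="...">
#         <span class="styles_consumerDetails__ZF4I6">Reviewer name</span>
#         <time datetime="2024-01-01T12:00:00.000Z">Jan 1, 2024</time>
#     </article>
#
# The hashed class names (link_internal__YpiJI, styles_consumerDetails__ZF4I6) change
# whenever Trustpilot ships a new frontend build, so they are the selectors most likely
# to break.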


def _parse_reviews(html: str, user_agent: str) -> List[NormalizedItem]:
    """Extract normalized review items from a Trustpilot search results page."""
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("article[data-service-review-card-layout]")
    items: List[NormalizedItem] = []
    now = datetime.now(timezone.utc)
    for card in cards:
        link = card.select_one("a.link_internal__YpiJI")
        url = f"{BASE_URL}{link['href']}" if link and link.get("href") else ""
        title_el = card.select_one("h2")
        title = title_el.get_text(strip=True) if title_el else ""
        text_el = card.select_one("[data-review-description-typography]")
        text = text_el.get_text(separator=" ", strip=True) if text_el else ""
        rating_el = card.select_one("img[alt*='stars']")
        rating = rating_el["alt"] if rating_el and rating_el.get("alt") else ""
        author_el = card.select_one("span.styles_consumerDetails__ZF4I6")
        author = author_el.get_text(strip=True) if author_el else None
        date_el = card.select_one("time")
        timestamp = now
        if date_el and date_el.get("datetime"):
            try:
                timestamp = datetime.fromisoformat(date_el["datetime"].replace("Z", "+00:00"))
            except ValueError:
                timestamp = now

        body = sanitize_text(f"{title}\n\n{text}")
        if len(body) < 15:
            continue  # skip cards with no meaningful review text
        items.append(
            NormalizedItem(
                source="trustpilot",
                id=card.get("data-review-id", str(hash(body))),  # fall back to a content hash
                url=url,
                author=author,
                timestamp=ensure_timezone(timestamp),
                text=body,
                meta={
                    "rating": rating,
                    "user_agent": user_agent,
                },
            )
        )
    return items


def fetch_reviews(brand: str, language: str = "en", pages: int = 2) -> List[NormalizedItem]:
    """Scrape Trustpilot search results for recent reviews."""
    if not brand:
        raise ServiceWarning("Brand name is required for Trustpilot scraping.")

    session = requests.Session()
    user_agent = random_user_agent()
    _check_robots(user_agent)

    aggregated: List[NormalizedItem] = []
    seen_ids: set[str] = set()

    for page in range(1, pages + 1):
        try:
            html = _fetch_page(session, user_agent=user_agent, page=page, brand=brand, language=language)
        except BlockedError as exc:
            raise ServiceWarning(
                "Trustpilot blocked the scraping attempt. Consider using their official API or providing CSV uploads."
            ) from exc
        except requests.RequestException as exc:
            raise ServiceError(f"Trustpilot request failed: {exc}") from exc
        page_items = _parse_reviews(html, user_agent)
        for item in page_items:
            if item["id"] in seen_ids:
                continue
            aggregated.append(item)
            seen_ids.add(item["id"])
        time.sleep(1.5)  # gentle crawl delay between pages

    return aggregated
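
For quick manual testing, a call site might look like the sketch below. The reputationradar.services package path is a placeholder (this diff does not show where the module is mounted in the ReputationRadar tree), and ServiceWarning / ServiceError are the same classes imported from .utils above.

from reputationradar.services import trustpilot
from reputationradar.services.utils import ServiceError, ServiceWarning

try:
    reviews = trustpilot.fetch_reviews("Acme Widgets", language="en", pages=2)
except (ServiceWarning, ServiceError) as exc:
    print(f"Trustpilot scrape skipped: {exc}")
else:
    for review in reviews:
        print(review["timestamp"], review["meta"]["rating"], review["text"][:80])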