Add ReputationRadar community contribution (demo replaced by link)
community-contributions/Reputation_Radar/services/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""Service layer exports for ReputationRadar."""

from . import llm, reddit_client, trustpilot_scraper, twitter_client, utils

__all__ = [
    "llm",
    "reddit_client",
    "trustpilot_scraper",
    "twitter_client",
    "utils",
]
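For orientation, the package surface above is meant to be consumed from the app layer roughly as follows. This is an illustrative sketch, not part of the commit, and it assumes the app runs from the Reputation_Radar directory so that `services` is importable.

# Illustrative usage sketch (not part of the commit).
from services import llm, utils

logger = utils.initialize_logger()
analyzer = llm.LLMService(api_key=None)  # no key configured -> VADER-only mode
logger.info("LLM features available: %s", analyzer.available())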
community-contributions/Reputation_Radar/services/llm.py (new file, 147 lines)
@@ -0,0 +1,147 @@
"""LLM sentiment analysis and summarization utilities."""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Sequence

try:  # pragma: no cover - optional dependency
    from openai import OpenAI
except ModuleNotFoundError:  # pragma: no cover
    OpenAI = None  # type: ignore[assignment]

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from .utils import ServiceWarning, chunked

CLASSIFICATION_SYSTEM_PROMPT = "You are a precise brand-sentiment classifier. Output JSON only."
SUMMARY_SYSTEM_PROMPT = "You analyze brand chatter and produce concise, executive-ready summaries."


@dataclass
class SentimentResult:
    """Structured sentiment output."""

    label: str
    confidence: float


class LLMService:
    """Wrapper around OpenAI with VADER fallback."""

    def __init__(self, api_key: Optional[str], model: str = "gpt-4o-mini", batch_size: int = 20):
        self.batch_size = max(1, batch_size)
        self.model = model
        self.logger = logging.getLogger("services.llm")
        self._client: Optional[Any] = None
        self._analyzer = SentimentIntensityAnalyzer()
        if api_key and OpenAI is not None:
            try:
                self._client = OpenAI(api_key=api_key)
            except Exception as exc:  # noqa: BLE001
                self.logger.warning("Failed to initialize OpenAI client, using VADER fallback: %s", exc)
                self._client = None
        elif api_key and OpenAI is None:
            self.logger.warning("openai package not installed; falling back to VADER despite API key.")

    def available(self) -> bool:
        """Return whether OpenAI-backed features are available."""
        return self._client is not None

    def classify_sentiment_batch(self, texts: Sequence[str]) -> List[SentimentResult]:
        """Classify multiple texts, chunking if necessary."""
        if not texts:
            return []
        if not self.available():
            return [self._vader_sentiment(text) for text in texts]

        results: List[SentimentResult] = []
        for chunk in chunked(list(texts), self.batch_size):
            prompt_lines = ["Classify each item as \"positive\", \"neutral\", or \"negative\".", "Also output a confidence score between 0 and 1.", "Return an array of objects: [{\"label\": \"...\", \"confidence\": 0.0}].", "Items:"]
            prompt_lines.extend([f"{idx + 1}) {text}" for idx, text in enumerate(chunk)])
            prompt = "\n".join(prompt_lines)
            try:
                response = self._client.responses.create(  # type: ignore[union-attr]
                    model=self.model,
                    input=[
                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0,
                    max_output_tokens=500,
                )
                output_text = self._extract_text(response)
                parsed = json.loads(output_text)
                for item in parsed:
                    results.append(
                        SentimentResult(
                            label=item.get("label", "neutral"),
                            confidence=float(item.get("confidence", 0.5)),
                        )
                    )
            except Exception as exc:  # noqa: BLE001
                self.logger.warning("Classification fallback to VADER due to error: %s", exc)
                for text in chunk:
                    results.append(self._vader_sentiment(text))
        # Ensure the output length matches input
        if len(results) != len(texts):
            # align by padding with neutral
            results.extend([SentimentResult(label="neutral", confidence=0.33)] * (len(texts) - len(results)))
        return results

    def summarize_overall(self, findings: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Create an executive summary using OpenAI."""
        if not self.available():
            raise ServiceWarning("OpenAI API key missing. Summary unavailable.")
        prompt_lines = [
            "Given these labeled items and their short rationales, write:",
            "- 5 bullet \"Highlights\"",
            "- 5 bullet \"Risks & Concerns\"",
            "- One-line \"Overall Tone\" (Positive/Neutral/Negative with brief justification)",
            "- 3 \"Recommended Actions\"",
            "Keep it under 180 words total. Be specific but neutral in tone.",
            "Items:",
        ]
        for idx, item in enumerate(findings, start=1):
            prompt_lines.append(
                f"{idx}) [{item.get('label','neutral').upper()}] {item.get('text','')}"
            )
        prompt = "\n".join(prompt_lines)
        try:
            response = self._client.responses.create(  # type: ignore[union-attr]
                model=self.model,
                input=[
                    {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                max_output_tokens=800,
            )
            output_text = self._extract_text(response)
            return {"raw": output_text}
        except Exception as exc:  # noqa: BLE001
            self.logger.error("Failed to generate summary: %s", exc)
            raise ServiceWarning("Unable to generate executive summary at this time.") from exc

    def _vader_sentiment(self, text: str) -> SentimentResult:
        scores = self._analyzer.polarity_scores(text)
        compound = scores["compound"]
        if compound >= 0.2:
            label = "positive"
        elif compound <= -0.2:
            label = "negative"
        else:
            label = "neutral"
        confidence = min(1.0, max(0.0, abs(compound)))
        return SentimentResult(label=label, confidence=confidence)

    def _extract_text(self, response: Any) -> str:
        """Support multiple OpenAI client response shapes."""
        if hasattr(response, "output") and response.output:
            content = response.output[0].content[0]
            return getattr(content, "text", str(content))
        if hasattr(response, "choices"):
            return response.choices[0].message.content  # type: ignore[return-value]
        raise ValueError("Unknown response structure from OpenAI client.")
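A minimal sketch of how `LLMService` degrades when no API key is configured: with `api_key=None` classification routes through VADER, mapping compound scores of 0.2 or above to positive, -0.2 or below to negative, and everything else to neutral, with `|compound|` as the confidence. This is illustrative only and not part of the commit; the sample texts are invented.

# Illustrative usage sketch (not part of the commit); sample texts are invented.
from services.llm import LLMService

service = LLMService(api_key=None)  # no OpenAI client -> VADER fallback path
results = service.classify_sentiment_batch([
    "Absolutely love this product, support was fantastic!",
    "The app crashed twice today and nobody answered my ticket.",
])
for result in results:
    print(result.label, round(result.confidence, 2))
# Under VADER: compound >= 0.2 -> positive, <= -0.2 -> negative, else neutral.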
community-contributions/Reputation_Radar/services/reddit_client.py (new file, 141 lines)
@@ -0,0 +1,141 @@
"""Reddit data collection service using PRAW."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, Iterable, List, Optional

import praw
from praw.models import Comment, Submission

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    sanitize_text,
)


TIME_FILTER_MAP = {
    "24h": "day",
    "7d": "week",
    "30d": "month",
}


def _iter_submissions(subreddit: praw.models.Subreddit, query: str, limit: int, time_filter: str) -> Iterable[Submission]:
    return subreddit.search(query=query, sort="new", time_filter=time_filter, limit=limit * 3)


def _iter_comments(submission: Submission) -> Iterable[Comment]:
    submission.comments.replace_more(limit=0)
    return submission.comments.list()


def _normalize_submission(submission: Submission) -> NormalizedItem:
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
    return NormalizedItem(
        source="reddit",
        id=submission.id,
        url=f"https://www.reddit.com{submission.permalink}",
        author=str(submission.author) if submission.author else None,
        timestamp=ensure_timezone(created),
        text=f"{submission.title}\n\n{submission.selftext or ''}",
        meta={
            "score": submission.score,
            "num_comments": submission.num_comments,
            "subreddit": submission.subreddit.display_name,
            "type": "submission",
        },
    )


def _normalize_comment(comment: Comment, submission: Submission) -> NormalizedItem:
    created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
    return NormalizedItem(
        source="reddit",
        id=comment.id,
        url=f"https://www.reddit.com{comment.permalink}",
        author=str(comment.author) if comment.author else None,
        timestamp=ensure_timezone(created),
        text=comment.body,
        meta={
            "score": comment.score,
            "subreddit": submission.subreddit.display_name,
            "type": "comment",
            "submission_title": submission.title,
        },
    )


def fetch_mentions(
    brand: str,
    credentials: Dict[str, str],
    limit: int = 25,
    date_filter: str = "7d",
    min_upvotes: int = 0,
) -> List[NormalizedItem]:
    """Fetch recent Reddit submissions/comments mentioning the brand."""
    client_id = credentials.get("client_id")
    client_secret = credentials.get("client_secret")
    user_agent = credentials.get("user_agent")

    if not all([client_id, client_secret, user_agent]):
        raise ServiceWarning("Reddit credentials are missing. Provide them in the sidebar to enable this source.")

    try:
        reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
        )
        reddit.read_only = True
    except Exception as exc:  # noqa: BLE001
        raise ServiceError(f"Failed to initialize Reddit client: {exc}") from exc

    time_filter = TIME_FILTER_MAP.get(date_filter.lower(), "week")
    subreddit = reddit.subreddit("all")
    results: List[NormalizedItem] = []
    seen_ids: set[str] = set()
    try:
        for submission in _iter_submissions(subreddit, query=brand, limit=limit, time_filter=time_filter):
            if submission.id in seen_ids:
                continue
            if submission.score < min_upvotes:
                continue
            normalized_submission = _normalize_submission(submission)
            normalized_submission["text"] = sanitize_text(normalized_submission["text"])
            if normalized_submission["text"]:
                results.append(normalized_submission)
                seen_ids.add(submission.id)
            if len(results) >= limit:
                break

            # Fetch comments mentioning the brand
            match_count = 0
            for comment in _iter_comments(submission):
                if brand.lower() not in (comment.body or "").lower():
                    continue
                if comment.score < min_upvotes:
                    continue
                normalized_comment = _normalize_comment(comment, submission)
                normalized_comment["text"] = sanitize_text(normalized_comment["text"])
                if not normalized_comment["text"]:
                    continue
                if normalized_comment["id"] in seen_ids:
                    continue
                results.append(normalized_comment)
                seen_ids.add(normalized_comment["id"])
                match_count += 1
                if len(results) >= limit:
                    break
            if len(results) >= limit:
                break
            # Respect rate limits
            if match_count:
                time.sleep(1)
    except Exception as exc:  # noqa: BLE001
        raise ServiceError(f"Error while fetching Reddit data: {exc}") from exc
    return results
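A hedged sketch of calling the Reddit collector; the credential values below are placeholders, and real values would come from the app's sidebar or environment. Note the error contract: `ServiceWarning` signals a recoverable, user-facing condition (missing credentials), while `ServiceError` means the fetch itself failed.

# Illustrative usage sketch (not part of the commit); credentials are placeholders.
from services import reddit_client
from services.utils import ServiceError, ServiceWarning

credentials = {
    "client_id": "YOUR_CLIENT_ID",
    "client_secret": "YOUR_CLIENT_SECRET",
    "user_agent": "ReputationRadar/1.0 by u/your_username",
}
try:
    items = reddit_client.fetch_mentions("Acme", credentials, limit=10, date_filter="7d", min_upvotes=2)
except ServiceWarning as warning:
    print(f"Skipping Reddit source: {warning}")
except ServiceError as error:
    print(f"Reddit fetch failed: {error}")
else:
    print(f"Fetched {len(items)} Reddit mentions")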
community-contributions/Reputation_Radar/services/trustpilot_scraper.py (new file, 138 lines)
@@ -0,0 +1,138 @@
"""Trustpilot scraping service with polite crawling safeguards."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    random_user_agent,
    sanitize_text,
)

BASE_URL = "https://www.trustpilot.com"
SEARCH_PATH = "/search"


class BlockedError(ServiceWarning):
    """Raised when Trustpilot blocks the scraping attempt."""


def _check_robots(user_agent: str) -> None:
    parser = RobotFileParser()
    parser.set_url(f"{BASE_URL}/robots.txt")
    parser.read()
    if not parser.can_fetch(user_agent, SEARCH_PATH):
        raise ServiceWarning(
            "Trustpilot robots.txt disallows scraping the search endpoint. "
            "Please use the official API or upload data manually."
        )


@retry(
    reraise=True,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((requests.RequestException, BlockedError)),
)
def _fetch_page(session: requests.Session, user_agent: str, page: int, brand: str, language: str) -> str:
    params = {"query": brand, "page": page}
    if language:
        params["languages"] = language
    url = f"{BASE_URL}{SEARCH_PATH}?{urlencode(params)}"
    response = session.get(
        url,
        headers={"User-Agent": user_agent, "Accept-Language": language or "en"},
        timeout=20,
    )
    if response.status_code in (401, 403):
        raise BlockedError("Trustpilot denied access (HTTP 403).")
    response.raise_for_status()
    return response.text


def _parse_reviews(html: str, user_agent: str) -> List[NormalizedItem]:
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("article[data-service-review-card-layout]")
    items: List[NormalizedItem] = []
    now = datetime.now(timezone.utc)
    for card in cards:
        link = card.select_one("a.link_internal__YpiJI")
        url = f"{BASE_URL}{link['href']}" if link and link.get("href") else ""
        title_el = card.select_one("h2")
        title = title_el.get_text(strip=True) if title_el else ""
        text_el = card.select_one("[data-review-description-typography]")
        text = text_el.get_text(separator=" ", strip=True) if text_el else ""
        rating_el = card.select_one("img[alt*='stars']")
        rating = rating_el["alt"] if rating_el and rating_el.get("alt") else ""
        author_el = card.select_one("span.styles_consumerDetails__ZF4I6")
        author = author_el.get_text(strip=True) if author_el else None
        date_el = card.select_one("time")
        timestamp = now
        if date_el and date_el.get("datetime"):
            try:
                timestamp = datetime.fromisoformat(date_el["datetime"].replace("Z", "+00:00"))
            except ValueError:
                timestamp = now

        body = sanitize_text(f"{title}\n\n{text}")
        if len(body) < 15:
            continue
        items.append(
            NormalizedItem(
                source="trustpilot",
                id=card.get("data-review-id", str(hash(body))),
                url=url,
                author=author,
                timestamp=ensure_timezone(timestamp),
                text=body,
                meta={
                    "rating": rating,
                    "user_agent": user_agent,
                },
            )
        )
    return items


def fetch_reviews(brand: str, language: str = "en", pages: int = 2) -> List[NormalizedItem]:
    """Scrape Trustpilot search results for recent reviews."""
    if not brand:
        raise ServiceWarning("Brand name is required for Trustpilot scraping.")

    session = requests.Session()
    user_agent = random_user_agent()
    _check_robots(user_agent)

    aggregated: List[NormalizedItem] = []
    seen_ids: set[str] = set()

    for page in range(1, pages + 1):
        try:
            html = _fetch_page(session, user_agent=user_agent, page=page, brand=brand, language=language)
        except BlockedError as exc:
            raise ServiceWarning(
                "Trustpilot blocked the scraping attempt. Consider using their official API or providing CSV uploads."
            ) from exc
        except requests.RequestException as exc:  # noqa: BLE001
            raise ServiceError(f"Trustpilot request failed: {exc}") from exc
        page_items = _parse_reviews(html, user_agent)
        for item in page_items:
            if item["id"] in seen_ids:
                continue
            aggregated.append(item)
            seen_ids.add(item["id"])
        time.sleep(1.5)  # gentle crawl delay

    return aggregated
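The scraper is deliberately cautious: it checks robots.txt before any request and converts HTTP 401/403 responses into a `ServiceWarning` that points users at the official API. A minimal sketch of calling it (the brand name is a placeholder, and a real run depends on Trustpilot's current markup and robots rules):

# Illustrative usage sketch (not part of the commit); "Acme" is a placeholder brand.
from services import trustpilot_scraper
from services.utils import ServiceWarning

try:
    reviews = trustpilot_scraper.fetch_reviews("Acme", language="en", pages=1)
except ServiceWarning as warning:
    print(f"Trustpilot unavailable: {warning}")  # robots.txt disallow or access block
else:
    for review in reviews[:3]:
        print(review["timestamp"], review["meta"].get("rating"), review["text"][:80])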
community-contributions/Reputation_Radar/services/twitter_client.py (new file, 98 lines)
@@ -0,0 +1,98 @@
"""Twitter (X) data collection using the v2 recent search API."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List, Optional

import requests

from .utils import NormalizedItem, ServiceError, ServiceWarning, ensure_timezone, sanitize_text

SEARCH_URL = "https://api.twitter.com/2/tweets/search/recent"


def _build_query(brand: str, language: str) -> str:
    terms = [brand]
    if language:
        terms.append(f"lang:{language}")
    return " ".join(terms)


def fetch_mentions(
    brand: str,
    bearer_token: Optional[str],
    limit: int = 25,
    min_likes: int = 0,
    language: str = "en",
) -> List[NormalizedItem]:
    """Fetch recent tweets mentioning the brand."""
    if not bearer_token:
        raise ServiceWarning(
            "Twitter bearer token not provided. Add it in the sidebar to enable Twitter ingestion."
        )

    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "ReputationRadar/1.0",
    }
    params = {
        "query": _build_query(brand, language),
        "max_results": min(100, limit),
        "tweet.fields": "author_id,created_at,lang,public_metrics",
        "expansions": "author_id",
        "user.fields": "name,username",
    }

    collected: List[NormalizedItem] = []
    next_token: Optional[str] = None

    while len(collected) < limit:
        if next_token:
            params["next_token"] = next_token
        response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=15)
        if response.status_code == 401:
            raise ServiceWarning("Twitter API authentication failed. Please verify the bearer token.")
        if response.status_code == 429:
            time.sleep(5)
            continue
        if response.status_code >= 400:
            raise ServiceError(f"Twitter API error {response.status_code}: {response.text}")

        payload = response.json()
        data = payload.get("data", [])
        includes = payload.get("includes", {})
        users_index = {user["id"]: user for user in includes.get("users", [])}

        for tweet in data:
            created_at = datetime.fromisoformat(tweet["created_at"].replace("Z", "+00:00"))
            author_info = users_index.get(tweet["author_id"], {})
            item = NormalizedItem(
                source="twitter",
                id=tweet["id"],
                url=f"https://twitter.com/{author_info.get('username','')}/status/{tweet['id']}",
                author=author_info.get("username"),
                timestamp=ensure_timezone(created_at),
                text=sanitize_text(tweet["text"]),
                meta={
                    "likes": tweet.get("public_metrics", {}).get("like_count", 0),
                    "retweets": tweet.get("public_metrics", {}).get("retweet_count", 0),
                    "replies": tweet.get("public_metrics", {}).get("reply_count", 0),
                    "quote_count": tweet.get("public_metrics", {}).get("quote_count", 0),
                },
            )
            if not item["text"]:
                continue
            if item["meta"]["likes"] < min_likes:
                continue
            collected.append(item)
            if len(collected) >= limit:
                break

        next_token = payload.get("meta", {}).get("next_token")
        if not next_token:
            break
        time.sleep(1)  # stay friendly to rate limits

    return collected[:limit]
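A sketch of the Twitter collector; the bearer token is a placeholder read from an assumed TWITTER_BEARER_TOKEN environment variable rather than the app sidebar. Pagination follows `next_token` until `limit` items survive the `min_likes` filter, with each request capped at 100 results.

# Illustrative usage sketch (not part of the commit); the token source is an assumption.
import os

from services import twitter_client
from services.utils import ServiceWarning

token = os.getenv("TWITTER_BEARER_TOKEN")
try:
    tweets = twitter_client.fetch_mentions("Acme", bearer_token=token, limit=20, min_likes=5)
except ServiceWarning as warning:
    print(f"Skipping Twitter source: {warning}")  # raised when the token is missing or invalid
else:
    print(f"Collected {len(tweets)} tweets")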
community-contributions/Reputation_Radar/services/utils.py (new file, 217 lines)
@@ -0,0 +1,217 @@
"""Utility helpers for ReputationRadar services."""

from __future__ import annotations

import json
import logging
import os
import random
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz


LOG_FILE = Path(__file__).resolve().parents[1] / "logs" / "app.log"
MIN_TEXT_LENGTH = 15
SIMILARITY_THRESHOLD = 90


class NormalizedItem(TypedDict):
    """Canonical representation of a fetched mention."""

    source: str
    id: str
    url: str
    author: Optional[str]
    timestamp: datetime
    text: str
    meta: Dict[str, object]


class ServiceError(RuntimeError):
    """Raised when a service hard fails."""


class ServiceWarning(RuntimeError):
    """Raised for recoverable issues that should surface to the UI."""


def initialize_logger(name: str = "reputation_radar") -> logging.Logger:
    """Configure and return a module-level logger."""
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        handlers=[
            logging.FileHandler(LOG_FILE, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    return logger


def load_sample_items(name: str) -> List[NormalizedItem]:
    """Load demo data from the samples directory."""
    samples_dir = Path(__file__).resolve().parents[1] / "samples"
    sample_path = samples_dir / f"{name}.json"
    if not sample_path.exists():
        return []
    with sample_path.open("r", encoding="utf-8") as handle:
        raw_items = json.load(handle)
    cleaned: List[NormalizedItem] = []
    for item in raw_items:
        try:
            cleaned.append(
                NormalizedItem(
                    source=item["source"],
                    id=str(item["id"]),
                    url=item.get("url", ""),
                    author=item.get("author"),
                    timestamp=datetime.fromisoformat(item["timestamp"]),
                    text=item["text"],
                    meta=item.get("meta", {}),
                )
            )
        except (KeyError, ValueError):
            continue
    return cleaned


def strip_html(value: str) -> str:
    """Remove HTML tags and normalize whitespace."""
    if not value:
        return ""
    soup = BeautifulSoup(value, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
    return text.strip()


def sanitize_text(value: str) -> str:
    """Clean text and remove excessive noise."""
    text = strip_html(value)
    text = re.sub(r"http\S+", "", text)  # drop inline URLs
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


def drop_short_items(items: Iterable[NormalizedItem], minimum_length: int = MIN_TEXT_LENGTH) -> List[NormalizedItem]:
    """Filter out items that are too short to analyze."""
    return [
        item
        for item in items
        if len(item["text"]) >= minimum_length
    ]


def fuzzy_deduplicate(items: Sequence[NormalizedItem], threshold: int = SIMILARITY_THRESHOLD) -> List[NormalizedItem]:
    """Remove duplicates based on URL or fuzzy text similarity."""
    seen_urls: set[str] = set()
    deduped: List[NormalizedItem] = []
    for item in items:
        url = item.get("url") or ""
        text = item.get("text") or ""
        if url and url in seen_urls:
            continue
        duplicate_found = False
        for existing in deduped:
            if not text or not existing.get("text"):
                continue
            if fuzz.token_set_ratio(text, existing["text"]) >= threshold:
                duplicate_found = True
                break
        if not duplicate_found:
            deduped.append(item)
            if url:
                seen_urls.add(url)
    return deduped


def normalize_items(items: Sequence[NormalizedItem]) -> List[NormalizedItem]:
    """Apply sanitization, deduplication, and drop noisy entries."""
    sanitized: List[NormalizedItem] = []
    for item in items:
        cleaned_text = sanitize_text(item.get("text", ""))
        if len(cleaned_text) < MIN_TEXT_LENGTH:
            continue
        sanitized.append(
            NormalizedItem(
                source=item["source"],
                id=item["id"],
                url=item.get("url", ""),
                author=item.get("author"),
                timestamp=item["timestamp"],
                text=cleaned_text,
                meta=item.get("meta", {}),
            )
        )
    return fuzzy_deduplicate(sanitized)


def parse_date_range(option: str) -> datetime:
    """Return a UTC timestamp threshold for the given range identifier."""
    now = datetime.now(timezone.utc)
    option = option.lower()
    delta = {
        "24h": timedelta(days=1),
        "7d": timedelta(days=7),
        "30d": timedelta(days=30),
    }.get(option, timedelta(days=7))
    return now - delta


def random_user_agent() -> str:
    """Return a random user agent string for polite scraping."""
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
    ]
    return random.choice(user_agents)


def chunked(iterable: Sequence[str], size: int) -> Iterator[Sequence[str]]:
    """Yield successive chunks from iterable."""
    for start in range(0, len(iterable), size):
        yield iterable[start : start + size]


def validate_openai_key(api_key: Optional[str]) -> Tuple[Optional[str], List[str]]:
    """Validate an OpenAI key following the guidance from day1 notebook."""
    warnings: List[str] = []
    if not api_key:
        warnings.append("No OpenAI API key detected. VADER fallback will be used.")
        return None, warnings
    if not api_key.startswith("sk-"):
        warnings.append(
            "Provided OpenAI API key does not start with the expected prefix (sk-)."
        )
    if api_key.strip() != api_key:
        warnings.append("OpenAI API key looks like it has leading or trailing whitespace.")
        api_key = api_key.strip()
    return api_key, warnings


def ensure_timezone(ts: datetime) -> datetime:
    """Guarantee timestamps are timezone-aware in UTC."""
    if ts.tzinfo is None:
        return ts.replace(tzinfo=timezone.utc)
    return ts.astimezone(timezone.utc)


def safe_int(value: Optional[object], default: int = 0) -> int:
    """Convert a value to int with a fallback."""
    try:
        return int(value)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return default
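Taken together, these helpers form the cleanup pipeline the collectors feed into: HTML stripping, inline-URL removal, a minimum-length filter of 15 characters, and fuzzy deduplication at a token_set_ratio of 90. A minimal sketch wiring a few of them together (the item below is fabricated for illustration):

# Illustrative usage sketch (not part of the commit); the item below is fabricated.
from datetime import datetime, timezone

from services.utils import NormalizedItem, normalize_items, validate_openai_key

raw = NormalizedItem(
    source="twitter",
    id="123",
    url="https://twitter.com/example/status/123",
    author="example",
    timestamp=datetime.now(timezone.utc),
    text="<p>Acme support was great!   Visit https://acme.example for more.</p>",
    meta={"likes": 3},
)
clean = normalize_items([raw, raw])  # the duplicate collapses to a single item
print(clean[0]["text"])              # "Acme support was great! Visit for more."

key, warnings = validate_openai_key("sk-demo ")
print(key, warnings)                 # stripped key plus a whitespace warning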