Add ReputationRadar community contribution (demo replaced by link)
community-contributions/Reputation_Radar/services/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
"""Service layer exports for ReputationRadar."""

from . import llm, reddit_client, trustpilot_scraper, twitter_client, utils

__all__ = [
    "llm",
    "reddit_client",
    "trustpilot_scraper",
    "twitter_client",
    "utils",
]
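For orientation, the package surface above is meant to be consumed from the app layer roughly as follows. This is an illustrative sketch, not part of the commit, and it assumes the app runs from the Reputation_Radar directory so that `services` is importable.

# Illustrative usage sketch (not part of the commit).
from services import llm, utils

logger = utils.initialize_logger()
analyzer = llm.LLMService(api_key=None)  # no key configured -> VADER-only mode
logger.info("LLM features available: %s", analyzer.available())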
community-contributions/Reputation_Radar/services/llm.py (new file, 147 lines)
@@ -0,0 +1,147 @@
"""LLM sentiment analysis and summarization utilities."""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Sequence

try:  # pragma: no cover - optional dependency
    from openai import OpenAI
except ModuleNotFoundError:  # pragma: no cover
    OpenAI = None  # type: ignore[assignment]

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from .utils import ServiceWarning, chunked

CLASSIFICATION_SYSTEM_PROMPT = "You are a precise brand-sentiment classifier. Output JSON only."
SUMMARY_SYSTEM_PROMPT = "You analyze brand chatter and produce concise, executive-ready summaries."


@dataclass
class SentimentResult:
    """Structured sentiment output."""

    label: str
    confidence: float


class LLMService:
    """Wrapper around OpenAI with VADER fallback."""

    def __init__(self, api_key: Optional[str], model: str = "gpt-4o-mini", batch_size: int = 20):
        self.batch_size = max(1, batch_size)
        self.model = model
        self.logger = logging.getLogger("services.llm")
        self._client: Optional[Any] = None
        self._analyzer = SentimentIntensityAnalyzer()
        if api_key and OpenAI is not None:
            try:
                self._client = OpenAI(api_key=api_key)
            except Exception as exc:  # noqa: BLE001
                self.logger.warning("Failed to initialize OpenAI client, using VADER fallback: %s", exc)
                self._client = None
        elif api_key and OpenAI is None:
            self.logger.warning("openai package not installed; falling back to VADER despite API key.")

    def available(self) -> bool:
        """Return whether OpenAI-backed features are available."""
        return self._client is not None

    def classify_sentiment_batch(self, texts: Sequence[str]) -> List[SentimentResult]:
        """Classify multiple texts, chunking if necessary."""
        if not texts:
            return []
        if not self.available():
            return [self._vader_sentiment(text) for text in texts]

        results: List[SentimentResult] = []
        for chunk in chunked(list(texts), self.batch_size):
            prompt_lines = ["Classify each item as \"positive\", \"neutral\", or \"negative\".", "Also output a confidence score between 0 and 1.", "Return an array of objects: [{\"label\": \"...\", \"confidence\": 0.0}].", "Items:"]
            prompt_lines.extend([f"{idx + 1}) {text}" for idx, text in enumerate(chunk)])
            prompt = "\n".join(prompt_lines)
            try:
                response = self._client.responses.create(  # type: ignore[union-attr]
                    model=self.model,
                    input=[
                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0,
                    max_output_tokens=500,
                )
                output_text = self._extract_text(response)
                parsed = json.loads(output_text)
                for item in parsed:
                    results.append(
                        SentimentResult(
                            label=item.get("label", "neutral"),
                            confidence=float(item.get("confidence", 0.5)),
                        )
                    )
            except Exception as exc:  # noqa: BLE001
                self.logger.warning("Classification fallback to VADER due to error: %s", exc)
                for text in chunk:
                    results.append(self._vader_sentiment(text))
        # Ensure the output length matches input
        if len(results) != len(texts):
            # align by padding with neutral
            results.extend([SentimentResult(label="neutral", confidence=0.33)] * (len(texts) - len(results)))
        return results

    def summarize_overall(self, findings: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Create an executive summary using OpenAI."""
        if not self.available():
            raise ServiceWarning("OpenAI API key missing. Summary unavailable.")
        prompt_lines = [
            "Given these labeled items and their short rationales, write:",
            "- 5 bullet \"Highlights\"",
            "- 5 bullet \"Risks & Concerns\"",
            "- One-line \"Overall Tone\" (Positive/Neutral/Negative with brief justification)",
            "- 3 \"Recommended Actions\"",
            "Keep it under 180 words total. Be specific but neutral in tone.",
            "Items:",
        ]
        for idx, item in enumerate(findings, start=1):
            prompt_lines.append(
                f"{idx}) [{item.get('label','neutral').upper()}] {item.get('text','')}"
            )
        prompt = "\n".join(prompt_lines)
        try:
            response = self._client.responses.create(  # type: ignore[union-attr]
                model=self.model,
                input=[
                    {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
                max_output_tokens=800,
            )
            output_text = self._extract_text(response)
            return {"raw": output_text}
        except Exception as exc:  # noqa: BLE001
            self.logger.error("Failed to generate summary: %s", exc)
            raise ServiceWarning("Unable to generate executive summary at this time.") from exc

    def _vader_sentiment(self, text: str) -> SentimentResult:
        scores = self._analyzer.polarity_scores(text)
        compound = scores["compound"]
        if compound >= 0.2:
            label = "positive"
        elif compound <= -0.2:
            label = "negative"
        else:
            label = "neutral"
        confidence = min(1.0, max(0.0, abs(compound)))
        return SentimentResult(label=label, confidence=confidence)

    def _extract_text(self, response: Any) -> str:
        """Support multiple OpenAI client response shapes."""
        if hasattr(response, "output") and response.output:
            content = response.output[0].content[0]
            return getattr(content, "text", str(content))
        if hasattr(response, "choices"):
            return response.choices[0].message.content  # type: ignore[return-value]
        raise ValueError("Unknown response structure from OpenAI client.")
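A minimal sketch of how `LLMService` degrades when no API key is configured: with `api_key=None` classification routes through VADER, mapping compound scores of 0.2 or above to positive, -0.2 or below to negative, and everything else to neutral, with `|compound|` as the confidence. This is illustrative only and not part of the commit; the sample texts are invented.

# Illustrative usage sketch (not part of the commit); sample texts are invented.
from services.llm import LLMService

service = LLMService(api_key=None)  # no OpenAI client -> VADER fallback path
results = service.classify_sentiment_batch([
    "Absolutely love this product, support was fantastic!",
    "The app crashed twice today and nobody answered my ticket.",
])
for result in results:
    print(result.label, round(result.confidence, 2))
# Under VADER: compound >= 0.2 -> positive, <= -0.2 -> negative, else neutral.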
community-contributions/Reputation_Radar/services/reddit_client.py (new file, 141 lines)
@@ -0,0 +1,141 @@
"""Reddit data collection service using PRAW."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, Iterable, List, Optional

import praw
from praw.models import Comment, Submission

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    sanitize_text,
)


TIME_FILTER_MAP = {
    "24h": "day",
    "7d": "week",
    "30d": "month",
}


def _iter_submissions(subreddit: praw.models.Subreddit, query: str, limit: int, time_filter: str) -> Iterable[Submission]:
    return subreddit.search(query=query, sort="new", time_filter=time_filter, limit=limit * 3)


def _iter_comments(submission: Submission) -> Iterable[Comment]:
    submission.comments.replace_more(limit=0)
    return submission.comments.list()


def _normalize_submission(submission: Submission) -> NormalizedItem:
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
    return NormalizedItem(
        source="reddit",
        id=submission.id,
        url=f"https://www.reddit.com{submission.permalink}",
        author=str(submission.author) if submission.author else None,
        timestamp=ensure_timezone(created),
        text=f"{submission.title}\n\n{submission.selftext or ''}",
        meta={
            "score": submission.score,
            "num_comments": submission.num_comments,
            "subreddit": submission.subreddit.display_name,
            "type": "submission",
        },
    )


def _normalize_comment(comment: Comment, submission: Submission) -> NormalizedItem:
    created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
    return NormalizedItem(
        source="reddit",
        id=comment.id,
        url=f"https://www.reddit.com{comment.permalink}",
        author=str(comment.author) if comment.author else None,
        timestamp=ensure_timezone(created),
        text=comment.body,
        meta={
            "score": comment.score,
            "subreddit": submission.subreddit.display_name,
            "type": "comment",
            "submission_title": submission.title,
        },
    )


def fetch_mentions(
    brand: str,
    credentials: Dict[str, str],
    limit: int = 25,
    date_filter: str = "7d",
    min_upvotes: int = 0,
) -> List[NormalizedItem]:
    """Fetch recent Reddit submissions/comments mentioning the brand."""
    client_id = credentials.get("client_id")
    client_secret = credentials.get("client_secret")
    user_agent = credentials.get("user_agent")

    if not all([client_id, client_secret, user_agent]):
        raise ServiceWarning("Reddit credentials are missing. Provide them in the sidebar to enable this source.")

    try:
        reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
        )
        reddit.read_only = True
    except Exception as exc:  # noqa: BLE001
        raise ServiceError(f"Failed to initialize Reddit client: {exc}") from exc

    time_filter = TIME_FILTER_MAP.get(date_filter.lower(), "week")
    subreddit = reddit.subreddit("all")
    results: List[NormalizedItem] = []
    seen_ids: set[str] = set()
    try:
        for submission in _iter_submissions(subreddit, query=brand, limit=limit, time_filter=time_filter):
            if submission.id in seen_ids:
                continue
            if submission.score < min_upvotes:
                continue
            normalized_submission = _normalize_submission(submission)
            normalized_submission["text"] = sanitize_text(normalized_submission["text"])
            if normalized_submission["text"]:
                results.append(normalized_submission)
                seen_ids.add(submission.id)
            if len(results) >= limit:
                break

            # Fetch comments mentioning the brand
            match_count = 0
            for comment in _iter_comments(submission):
                if brand.lower() not in (comment.body or "").lower():
                    continue
                if comment.score < min_upvotes:
                    continue
                normalized_comment = _normalize_comment(comment, submission)
                normalized_comment["text"] = sanitize_text(normalized_comment["text"])
                if not normalized_comment["text"]:
                    continue
                if normalized_comment["id"] in seen_ids:
                    continue
                results.append(normalized_comment)
                seen_ids.add(normalized_comment["id"])
                match_count += 1
                if len(results) >= limit:
                    break
            if len(results) >= limit:
                break
            # Respect rate limits
            if match_count:
                time.sleep(1)
    except Exception as exc:  # noqa: BLE001
        raise ServiceError(f"Error while fetching Reddit data: {exc}") from exc
    return results
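A hedged sketch of calling the Reddit collector; the credential values below are placeholders, and real values would come from the app's sidebar or environment. Note the error contract: `ServiceWarning` signals a recoverable, user-facing condition (missing credentials), while `ServiceError` means the fetch itself failed.

# Illustrative usage sketch (not part of the commit); credentials are placeholders.
from services import reddit_client
from services.utils import ServiceError, ServiceWarning

credentials = {
    "client_id": "YOUR_CLIENT_ID",
    "client_secret": "YOUR_CLIENT_SECRET",
    "user_agent": "ReputationRadar/1.0 by u/your_username",
}
try:
    items = reddit_client.fetch_mentions("Acme", credentials, limit=10, date_filter="7d", min_upvotes=2)
except ServiceWarning as warning:
    print(f"Skipping Reddit source: {warning}")
except ServiceError as error:
    print(f"Reddit fetch failed: {error}")
else:
    print(f"Fetched {len(items)} Reddit mentions")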
community-contributions/Reputation_Radar/services/trustpilot_scraper.py (new file, 138 lines)
@@ -0,0 +1,138 @@
"""Trustpilot scraping service with polite crawling safeguards."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from .utils import (
    NormalizedItem,
    ServiceError,
    ServiceWarning,
    ensure_timezone,
    random_user_agent,
    sanitize_text,
)

BASE_URL = "https://www.trustpilot.com"
SEARCH_PATH = "/search"


class BlockedError(ServiceWarning):
    """Raised when Trustpilot blocks the scraping attempt."""


def _check_robots(user_agent: str) -> None:
    parser = RobotFileParser()
    parser.set_url(f"{BASE_URL}/robots.txt")
    parser.read()
    if not parser.can_fetch(user_agent, SEARCH_PATH):
        raise ServiceWarning(
            "Trustpilot robots.txt disallows scraping the search endpoint. "
            "Please use the official API or upload data manually."
        )


@retry(
    reraise=True,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=8),
    retry=retry_if_exception_type((requests.RequestException, BlockedError)),
)
def _fetch_page(session: requests.Session, user_agent: str, page: int, brand: str, language: str) -> str:
    params = {"query": brand, "page": page}
    if language:
        params["languages"] = language
    url = f"{BASE_URL}{SEARCH_PATH}?{urlencode(params)}"
    response = session.get(
        url,
        headers={"User-Agent": user_agent, "Accept-Language": language or "en"},
        timeout=20,
    )
    if response.status_code in (401, 403):
        raise BlockedError("Trustpilot denied access (HTTP 403).")
    response.raise_for_status()
    return response.text


def _parse_reviews(html: str, user_agent: str) -> List[NormalizedItem]:
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("article[data-service-review-card-layout]")
    items: List[NormalizedItem] = []
    now = datetime.now(timezone.utc)
    for card in cards:
        link = card.select_one("a.link_internal__YpiJI")
        url = f"{BASE_URL}{link['href']}" if link and link.get("href") else ""
        title_el = card.select_one("h2")
        title = title_el.get_text(strip=True) if title_el else ""
        text_el = card.select_one("[data-review-description-typography]")
        text = text_el.get_text(separator=" ", strip=True) if text_el else ""
        rating_el = card.select_one("img[alt*='stars']")
        rating = rating_el["alt"] if rating_el and rating_el.get("alt") else ""
        author_el = card.select_one("span.styles_consumerDetails__ZF4I6")
        author = author_el.get_text(strip=True) if author_el else None
        date_el = card.select_one("time")
        timestamp = now
        if date_el and date_el.get("datetime"):
            try:
                timestamp = datetime.fromisoformat(date_el["datetime"].replace("Z", "+00:00"))
            except ValueError:
                timestamp = now

        body = sanitize_text(f"{title}\n\n{text}")
        if len(body) < 15:
            continue
        items.append(
            NormalizedItem(
                source="trustpilot",
                id=card.get("data-review-id", str(hash(body))),
                url=url,
                author=author,
                timestamp=ensure_timezone(timestamp),
                text=body,
                meta={
                    "rating": rating,
                    "user_agent": user_agent,
                },
            )
        )
    return items


def fetch_reviews(brand: str, language: str = "en", pages: int = 2) -> List[NormalizedItem]:
    """Scrape Trustpilot search results for recent reviews."""
    if not brand:
        raise ServiceWarning("Brand name is required for Trustpilot scraping.")

    session = requests.Session()
    user_agent = random_user_agent()
    _check_robots(user_agent)

    aggregated: List[NormalizedItem] = []
    seen_ids: set[str] = set()

    for page in range(1, pages + 1):
        try:
            html = _fetch_page(session, user_agent=user_agent, page=page, brand=brand, language=language)
        except BlockedError as exc:
            raise ServiceWarning(
                "Trustpilot blocked the scraping attempt. Consider using their official API or providing CSV uploads."
            ) from exc
        except requests.RequestException as exc:  # noqa: BLE001
            raise ServiceError(f"Trustpilot request failed: {exc}") from exc
        page_items = _parse_reviews(html, user_agent)
        for item in page_items:
            if item["id"] in seen_ids:
                continue
            aggregated.append(item)
            seen_ids.add(item["id"])
        time.sleep(1.5)  # gentle crawl delay

    return aggregated
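The scraper is deliberately cautious: it checks robots.txt before any request and converts HTTP 401/403 responses into a `ServiceWarning` that points users at the official API. A minimal sketch of calling it (the brand name is a placeholder, and a real run depends on Trustpilot's current markup and robots rules):

# Illustrative usage sketch (not part of the commit); "Acme" is a placeholder brand.
from services import trustpilot_scraper
from services.utils import ServiceWarning

try:
    reviews = trustpilot_scraper.fetch_reviews("Acme", language="en", pages=1)
except ServiceWarning as warning:
    print(f"Trustpilot unavailable: {warning}")  # robots.txt disallow or access block
else:
    for review in reviews[:3]:
        print(review["timestamp"], review["meta"].get("rating"), review["text"][:80])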
community-contributions/Reputation_Radar/services/twitter_client.py (new file, 98 lines)
@@ -0,0 +1,98 @@
"""Twitter (X) data collection using the v2 recent search API."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from typing import Dict, List, Optional

import requests

from .utils import NormalizedItem, ServiceError, ServiceWarning, ensure_timezone, sanitize_text

SEARCH_URL = "https://api.twitter.com/2/tweets/search/recent"


def _build_query(brand: str, language: str) -> str:
    terms = [brand]
    if language:
        terms.append(f"lang:{language}")
    return " ".join(terms)


def fetch_mentions(
    brand: str,
    bearer_token: Optional[str],
    limit: int = 25,
    min_likes: int = 0,
    language: str = "en",
) -> List[NormalizedItem]:
    """Fetch recent tweets mentioning the brand."""
    if not bearer_token:
        raise ServiceWarning(
            "Twitter bearer token not provided. Add it in the sidebar to enable Twitter ingestion."
        )

    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "ReputationRadar/1.0",
    }
    params = {
        "query": _build_query(brand, language),
        "max_results": min(100, limit),
        "tweet.fields": "author_id,created_at,lang,public_metrics",
        "expansions": "author_id",
        "user.fields": "name,username",
    }

    collected: List[NormalizedItem] = []
    next_token: Optional[str] = None

    while len(collected) < limit:
        if next_token:
            params["next_token"] = next_token
        response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=15)
        if response.status_code == 401:
            raise ServiceWarning("Twitter API authentication failed. Please verify the bearer token.")
        if response.status_code == 429:
            time.sleep(5)
            continue
        if response.status_code >= 400:
            raise ServiceError(f"Twitter API error {response.status_code}: {response.text}")

        payload = response.json()
        data = payload.get("data", [])
        includes = payload.get("includes", {})
        users_index = {user["id"]: user for user in includes.get("users", [])}

        for tweet in data:
            created_at = datetime.fromisoformat(tweet["created_at"].replace("Z", "+00:00"))
            author_info = users_index.get(tweet["author_id"], {})
            item = NormalizedItem(
                source="twitter",
                id=tweet["id"],
                url=f"https://twitter.com/{author_info.get('username','')}/status/{tweet['id']}",
                author=author_info.get("username"),
                timestamp=ensure_timezone(created_at),
                text=sanitize_text(tweet["text"]),
                meta={
                    "likes": tweet.get("public_metrics", {}).get("like_count", 0),
                    "retweets": tweet.get("public_metrics", {}).get("retweet_count", 0),
                    "replies": tweet.get("public_metrics", {}).get("reply_count", 0),
                    "quote_count": tweet.get("public_metrics", {}).get("quote_count", 0),
                },
            )
            if not item["text"]:
                continue
            if item["meta"]["likes"] < min_likes:
                continue
            collected.append(item)
            if len(collected) >= limit:
                break

        next_token = payload.get("meta", {}).get("next_token")
        if not next_token:
            break
        time.sleep(1)  # stay friendly to rate limits

    return collected[:limit]
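A sketch of the Twitter collector; the bearer token is a placeholder read from an assumed TWITTER_BEARER_TOKEN environment variable rather than the app sidebar. Pagination follows `next_token` until `limit` items survive the `min_likes` filter, with each request capped at 100 results.

# Illustrative usage sketch (not part of the commit); the token source is an assumption.
import os

from services import twitter_client
from services.utils import ServiceWarning

token = os.getenv("TWITTER_BEARER_TOKEN")
try:
    tweets = twitter_client.fetch_mentions("Acme", bearer_token=token, limit=20, min_likes=5)
except ServiceWarning as warning:
    print(f"Skipping Twitter source: {warning}")  # raised when the token is missing or invalid
else:
    print(f"Collected {len(tweets)} tweets")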
community-contributions/Reputation_Radar/services/utils.py (new file, 217 lines)
@@ -0,0 +1,217 @@
"""Utility helpers for ReputationRadar services."""

from __future__ import annotations

import json
import logging
import os
import random
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz


LOG_FILE = Path(__file__).resolve().parents[1] / "logs" / "app.log"
MIN_TEXT_LENGTH = 15
SIMILARITY_THRESHOLD = 90


class NormalizedItem(TypedDict):
    """Canonical representation of a fetched mention."""

    source: str
    id: str
    url: str
    author: Optional[str]
    timestamp: datetime
    text: str
    meta: Dict[str, object]


class ServiceError(RuntimeError):
    """Raised when a service hard fails."""


class ServiceWarning(RuntimeError):
    """Raised for recoverable issues that should surface to the UI."""


def initialize_logger(name: str = "reputation_radar") -> logging.Logger:
    """Configure and return a module-level logger."""
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        handlers=[
            logging.FileHandler(LOG_FILE, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    return logger


def load_sample_items(name: str) -> List[NormalizedItem]:
    """Load demo data from the samples directory."""
    samples_dir = Path(__file__).resolve().parents[1] / "samples"
    sample_path = samples_dir / f"{name}.json"
    if not sample_path.exists():
        return []
    with sample_path.open("r", encoding="utf-8") as handle:
        raw_items = json.load(handle)
    cleaned: List[NormalizedItem] = []
    for item in raw_items:
        try:
            cleaned.append(
                NormalizedItem(
                    source=item["source"],
                    id=str(item["id"]),
                    url=item.get("url", ""),
                    author=item.get("author"),
                    timestamp=datetime.fromisoformat(item["timestamp"]),
                    text=item["text"],
                    meta=item.get("meta", {}),
                )
            )
        except (KeyError, ValueError):
            continue
    return cleaned


def strip_html(value: str) -> str:
    """Remove HTML tags and normalize whitespace."""
    if not value:
        return ""
    soup = BeautifulSoup(value, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
    return text.strip()


def sanitize_text(value: str) -> str:
    """Clean text and remove excessive noise."""
    text = strip_html(value)
    text = re.sub(r"http\S+", "", text)  # drop inline URLs
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


def drop_short_items(items: Iterable[NormalizedItem], minimum_length: int = MIN_TEXT_LENGTH) -> List[NormalizedItem]:
    """Filter out items that are too short to analyze."""
    return [
        item
        for item in items
        if len(item["text"]) >= minimum_length
    ]


def fuzzy_deduplicate(items: Sequence[NormalizedItem], threshold: int = SIMILARITY_THRESHOLD) -> List[NormalizedItem]:
    """Remove duplicates based on URL or fuzzy text similarity."""
    seen_urls: set[str] = set()
    deduped: List[NormalizedItem] = []
    for item in items:
        url = item.get("url") or ""
        text = item.get("text") or ""
        if url and url in seen_urls:
            continue
        duplicate_found = False
        for existing in deduped:
            if not text or not existing.get("text"):
                continue
            if fuzz.token_set_ratio(text, existing["text"]) >= threshold:
                duplicate_found = True
                break
        if not duplicate_found:
            deduped.append(item)
            if url:
                seen_urls.add(url)
    return deduped


def normalize_items(items: Sequence[NormalizedItem]) -> List[NormalizedItem]:
    """Apply sanitization, deduplication, and drop noisy entries."""
    sanitized: List[NormalizedItem] = []
    for item in items:
        cleaned_text = sanitize_text(item.get("text", ""))
        if len(cleaned_text) < MIN_TEXT_LENGTH:
            continue
        sanitized.append(
            NormalizedItem(
                source=item["source"],
                id=item["id"],
                url=item.get("url", ""),
                author=item.get("author"),
                timestamp=item["timestamp"],
                text=cleaned_text,
                meta=item.get("meta", {}),
            )
        )
    return fuzzy_deduplicate(sanitized)


def parse_date_range(option: str) -> datetime:
    """Return a UTC timestamp threshold for the given range identifier."""
    now = datetime.now(timezone.utc)
    option = option.lower()
    delta = {
        "24h": timedelta(days=1),
        "7d": timedelta(days=7),
        "30d": timedelta(days=30),
    }.get(option, timedelta(days=7))
    return now - delta


def random_user_agent() -> str:
    """Return a random user agent string for polite scraping."""
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
    ]
    return random.choice(user_agents)


def chunked(iterable: Sequence[str], size: int) -> Iterator[Sequence[str]]:
    """Yield successive chunks from iterable."""
    for start in range(0, len(iterable), size):
        yield iterable[start : start + size]


def validate_openai_key(api_key: Optional[str]) -> Tuple[Optional[str], List[str]]:
    """Validate an OpenAI key following the guidance from day1 notebook."""
    warnings: List[str] = []
    if not api_key:
        warnings.append("No OpenAI API key detected. VADER fallback will be used.")
        return None, warnings
    if not api_key.startswith("sk-"):
        warnings.append(
            "Provided OpenAI API key does not start with the expected prefix (sk-)."
        )
    if api_key.strip() != api_key:
        warnings.append("OpenAI API key looks like it has leading or trailing whitespace.")
        api_key = api_key.strip()
    return api_key, warnings


def ensure_timezone(ts: datetime) -> datetime:
    """Guarantee timestamps are timezone-aware in UTC."""
    if ts.tzinfo is None:
        return ts.replace(tzinfo=timezone.utc)
    return ts.astimezone(timezone.utc)


def safe_int(value: Optional[object], default: int = 0) -> int:
    """Convert a value to int with a fallback."""
    try:
        return int(value)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return default
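Taken together, these helpers form the cleanup pipeline the collectors feed into: HTML stripping, inline-URL removal, a minimum-length filter of 15 characters, and fuzzy deduplication at a token_set_ratio of 90. A minimal sketch wiring a few of them together (the item below is fabricated for illustration):

# Illustrative usage sketch (not part of the commit); the item below is fabricated.
from datetime import datetime, timezone

from services.utils import NormalizedItem, normalize_items, validate_openai_key

raw = NormalizedItem(
    source="twitter",
    id="123",
    url="https://twitter.com/example/status/123",
    author="example",
    timestamp=datetime.now(timezone.utc),
    text="<p>Acme support was great!   Visit https://acme.example for more.</p>",
    meta={"likes": 3},
)
clean = normalize_items([raw, raw])  # the duplicate collapses to a single item
print(clean[0]["text"])              # "Acme support was great! Visit for more."

key, warnings = validate_openai_key("sk-demo ")
print(key, warnings)                 # stripped key plus a whitespace warning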