Add ReputationRadar community contribution (demo replaced by link)

Parth Verma
2025-10-22 15:30:00 +05:30
parent 9b84cc62c0
commit a3ee215468
22 changed files with 1794 additions and 0 deletions


@@ -0,0 +1,11 @@
"""Service layer exports for ReputationRadar."""
from . import llm, reddit_client, trustpilot_scraper, twitter_client, utils
__all__ = [
"llm",
"reddit_client",
"trustpilot_scraper",
"twitter_client",
"utils",
]


@@ -0,0 +1,147 @@
"""LLM sentiment analysis and summarization utilities."""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence
try: # pragma: no cover - optional dependency
from openai import OpenAI
except ModuleNotFoundError: # pragma: no cover
OpenAI = None # type: ignore[assignment]
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from .utils import ServiceWarning, chunked
CLASSIFICATION_SYSTEM_PROMPT = "You are a precise brand-sentiment classifier. Output JSON only."
SUMMARY_SYSTEM_PROMPT = "You analyze brand chatter and produce concise, executive-ready summaries."
@dataclass
class SentimentResult:
"""Structured sentiment output."""
label: str
confidence: float
class LLMService:
"""Wrapper around OpenAI with VADER fallback."""
def __init__(self, api_key: Optional[str], model: str = "gpt-4o-mini", batch_size: int = 20):
self.batch_size = max(1, batch_size)
self.model = model
self.logger = logging.getLogger("services.llm")
self._client: Optional[Any] = None
self._analyzer = SentimentIntensityAnalyzer()
if api_key and OpenAI is not None:
try:
self._client = OpenAI(api_key=api_key)
except Exception as exc: # noqa: BLE001
self.logger.warning("Failed to initialize OpenAI client, using VADER fallback: %s", exc)
self._client = None
elif api_key and OpenAI is None:
self.logger.warning("openai package not installed; falling back to VADER despite API key.")
def available(self) -> bool:
"""Return whether OpenAI-backed features are available."""
return self._client is not None
def classify_sentiment_batch(self, texts: Sequence[str]) -> List[SentimentResult]:
"""Classify multiple texts, chunking if necessary."""
if not texts:
return []
if not self.available():
return [self._vader_sentiment(text) for text in texts]
results: List[SentimentResult] = []
for chunk in chunked(list(texts), self.batch_size):
prompt_lines = ["Classify each item as \"positive\", \"neutral\", or \"negative\".", "Also output a confidence score between 0 and 1.", "Return an array of objects: [{\"label\": \"...\", \"confidence\": 0.0}].", "Items:"]
prompt_lines.extend([f"{idx + 1}) {text}" for idx, text in enumerate(chunk)])
prompt = "\n".join(prompt_lines)
try:
response = self._client.responses.create( # type: ignore[union-attr]
model=self.model,
input=[
{"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
temperature=0,
max_output_tokens=500,
)
output_text = self._extract_text(response)
parsed = json.loads(output_text)
for item in parsed:
results.append(
SentimentResult(
label=item.get("label", "neutral"),
confidence=float(item.get("confidence", 0.5)),
)
)
except Exception as exc: # noqa: BLE001
self.logger.warning("Classification fallback to VADER due to error: %s", exc)
for text in chunk:
results.append(self._vader_sentiment(text))
        # Ensure the output length matches the number of inputs: pad short
        # responses with neutral placeholders and trim any overflow.
        if len(results) < len(texts):
            results.extend(
                [SentimentResult(label="neutral", confidence=0.33)] * (len(texts) - len(results))
            )
        return results[: len(texts)]
def summarize_overall(self, findings: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Create an executive summary using OpenAI."""
if not self.available():
raise ServiceWarning("OpenAI API key missing. Summary unavailable.")
prompt_lines = [
"Given these labeled items and their short rationales, write:",
"- 5 bullet \"Highlights\"",
"- 5 bullet \"Risks & Concerns\"",
"- One-line \"Overall Tone\" (Positive/Neutral/Negative with brief justification)",
"- 3 \"Recommended Actions\"",
"Keep it under 180 words total. Be specific but neutral in tone.",
"Items:",
]
for idx, item in enumerate(findings, start=1):
prompt_lines.append(
f"{idx}) [{item.get('label','neutral').upper()}] {item.get('text','')}"
)
prompt = "\n".join(prompt_lines)
try:
response = self._client.responses.create( # type: ignore[union-attr]
model=self.model,
input=[
{"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
temperature=0.2,
max_output_tokens=800,
)
output_text = self._extract_text(response)
return {"raw": output_text}
except Exception as exc: # noqa: BLE001
self.logger.error("Failed to generate summary: %s", exc)
raise ServiceWarning("Unable to generate executive summary at this time.") from exc
def _vader_sentiment(self, text: str) -> SentimentResult:
scores = self._analyzer.polarity_scores(text)
compound = scores["compound"]
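        # VADER's compound score spans -1 (most negative) to +1 (most positive); the
        # ±0.2 neutral band used here is stricter than the ±0.05 cutoffs the VADER docs suggest.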
if compound >= 0.2:
label = "positive"
elif compound <= -0.2:
label = "negative"
else:
label = "neutral"
confidence = min(1.0, max(0.0, abs(compound)))
return SentimentResult(label=label, confidence=confidence)
def _extract_text(self, response: Any) -> str:
"""Support multiple OpenAI client response shapes."""
if hasattr(response, "output") and response.output:
content = response.output[0].content[0]
return getattr(content, "text", str(content))
if hasattr(response, "choices"):
return response.choices[0].message.content # type: ignore[return-value]
raise ValueError("Unknown response structure from OpenAI client.")
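
For orientation, a minimal usage sketch, assuming the package is importable as services and that the OPENAI_API_KEY environment variable may be unset (the service then silently falls back to VADER):

import os
from services.llm import LLMService

svc = LLMService(api_key=os.getenv("OPENAI_API_KEY"))
for result in svc.classify_sentiment_batch(["Love this product!", "Support never replied."]):
    print(result.label, round(result.confidence, 2))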


@@ -0,0 +1,141 @@
"""Reddit data collection service using PRAW."""
from __future__ import annotations
import time
from datetime import datetime, timezone
from typing import Dict, Iterable, List
import praw
from praw.models import Comment, Submission
from .utils import (
NormalizedItem,
ServiceError,
ServiceWarning,
ensure_timezone,
sanitize_text,
)
TIME_FILTER_MAP = {
"24h": "day",
"7d": "week",
"30d": "month",
}
def _iter_submissions(subreddit: praw.models.Subreddit, query: str, limit: int, time_filter: str) -> Iterable[Submission]:
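    # Over-fetch (three times the requested limit) so the score and length filters
    # applied later still leave enough usable items.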
return subreddit.search(query=query, sort="new", time_filter=time_filter, limit=limit * 3)
def _iter_comments(submission: Submission) -> Iterable[Comment]:
submission.comments.replace_more(limit=0)
return submission.comments.list()
def _normalize_submission(submission: Submission) -> NormalizedItem:
created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
return NormalizedItem(
source="reddit",
id=submission.id,
url=f"https://www.reddit.com{submission.permalink}",
author=str(submission.author) if submission.author else None,
timestamp=ensure_timezone(created),
text=f"{submission.title}\n\n{submission.selftext or ''}",
meta={
"score": submission.score,
"num_comments": submission.num_comments,
"subreddit": submission.subreddit.display_name,
"type": "submission",
},
)
def _normalize_comment(comment: Comment, submission: Submission) -> NormalizedItem:
created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)
return NormalizedItem(
source="reddit",
id=comment.id,
url=f"https://www.reddit.com{comment.permalink}",
author=str(comment.author) if comment.author else None,
timestamp=ensure_timezone(created),
text=comment.body,
meta={
"score": comment.score,
"subreddit": submission.subreddit.display_name,
"type": "comment",
"submission_title": submission.title,
},
)
def fetch_mentions(
brand: str,
credentials: Dict[str, str],
limit: int = 25,
date_filter: str = "7d",
min_upvotes: int = 0,
) -> List[NormalizedItem]:
"""Fetch recent Reddit submissions/comments mentioning the brand."""
client_id = credentials.get("client_id")
client_secret = credentials.get("client_secret")
user_agent = credentials.get("user_agent")
if not all([client_id, client_secret, user_agent]):
raise ServiceWarning("Reddit credentials are missing. Provide them in the sidebar to enable this source.")
try:
reddit = praw.Reddit(
client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
)
reddit.read_only = True
except Exception as exc: # noqa: BLE001
raise ServiceError(f"Failed to initialize Reddit client: {exc}") from exc
time_filter = TIME_FILTER_MAP.get(date_filter.lower(), "week")
subreddit = reddit.subreddit("all")
results: List[NormalizedItem] = []
seen_ids: set[str] = set()
try:
for submission in _iter_submissions(subreddit, query=brand, limit=limit, time_filter=time_filter):
if submission.id in seen_ids:
continue
if submission.score < min_upvotes:
continue
normalized_submission = _normalize_submission(submission)
normalized_submission["text"] = sanitize_text(normalized_submission["text"])
if normalized_submission["text"]:
results.append(normalized_submission)
seen_ids.add(submission.id)
if len(results) >= limit:
break
# Fetch comments mentioning the brand
match_count = 0
for comment in _iter_comments(submission):
if brand.lower() not in (comment.body or "").lower():
continue
if comment.score < min_upvotes:
continue
normalized_comment = _normalize_comment(comment, submission)
normalized_comment["text"] = sanitize_text(normalized_comment["text"])
if not normalized_comment["text"]:
continue
if normalized_comment["id"] in seen_ids:
continue
results.append(normalized_comment)
seen_ids.add(normalized_comment["id"])
match_count += 1
if len(results) >= limit:
break
if len(results) >= limit:
break
# Respect rate limits
if match_count:
time.sleep(1)
except Exception as exc: # noqa: BLE001
raise ServiceError(f"Error while fetching Reddit data: {exc}") from exc
return results
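
A hedged usage sketch; the credential keys are the ones this module reads, while the values are placeholders you would supply from your own Reddit app registration:

from services import reddit_client

creds = {
    "client_id": "YOUR_CLIENT_ID",          # placeholder
    "client_secret": "YOUR_CLIENT_SECRET",  # placeholder
    "user_agent": "ReputationRadar/1.0 by u/your_username",
}
mentions = reddit_client.fetch_mentions("Acme", creds, limit=10, date_filter="7d", min_upvotes=2)
print(len(mentions), "Reddit items")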


@@ -0,0 +1,138 @@
"""Trustpilot scraping service with polite crawling safeguards."""
from __future__ import annotations
import time
from datetime import datetime, timezone
from typing import List
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser
import requests
from bs4 import BeautifulSoup
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
from .utils import (
NormalizedItem,
ServiceError,
ServiceWarning,
ensure_timezone,
random_user_agent,
sanitize_text,
)
BASE_URL = "https://www.trustpilot.com"
SEARCH_PATH = "/search"
class BlockedError(ServiceWarning):
"""Raised when Trustpilot blocks the scraping attempt."""
def _check_robots(user_agent: str) -> None:
parser = RobotFileParser()
parser.set_url(f"{BASE_URL}/robots.txt")
parser.read()
if not parser.can_fetch(user_agent, SEARCH_PATH):
raise ServiceWarning(
"Trustpilot robots.txt disallows scraping the search endpoint. "
"Please use the official API or upload data manually."
)
@retry(
reraise=True,
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=1, max=8),
retry=retry_if_exception_type((requests.RequestException, BlockedError)),
)
def _fetch_page(session: requests.Session, user_agent: str, page: int, brand: str, language: str) -> str:
params = {"query": brand, "page": page}
if language:
params["languages"] = language
url = f"{BASE_URL}{SEARCH_PATH}?{urlencode(params)}"
response = session.get(
url,
headers={"User-Agent": user_agent, "Accept-Language": language or "en"},
timeout=20,
)
if response.status_code in (401, 403):
raise BlockedError("Trustpilot denied access (HTTP 403).")
response.raise_for_status()
return response.text
def _parse_reviews(html: str, user_agent: str) -> List[NormalizedItem]:
soup = BeautifulSoup(html, "html.parser")
cards = soup.select("article[data-service-review-card-layout]")
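    # NOTE: the attribute and hashed class selectors below are tied to Trustpilot's
    # current markup and will need updating whenever the site layout changes.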
items: List[NormalizedItem] = []
now = datetime.now(timezone.utc)
for card in cards:
link = card.select_one("a.link_internal__YpiJI")
url = f"{BASE_URL}{link['href']}" if link and link.get("href") else ""
title_el = card.select_one("h2")
title = title_el.get_text(strip=True) if title_el else ""
text_el = card.select_one("[data-review-description-typography]")
text = text_el.get_text(separator=" ", strip=True) if text_el else ""
rating_el = card.select_one("img[alt*='stars']")
rating = rating_el["alt"] if rating_el and rating_el.get("alt") else ""
author_el = card.select_one("span.styles_consumerDetails__ZF4I6")
author = author_el.get_text(strip=True) if author_el else None
date_el = card.select_one("time")
timestamp = now
if date_el and date_el.get("datetime"):
try:
timestamp = datetime.fromisoformat(date_el["datetime"].replace("Z", "+00:00"))
except ValueError:
timestamp = now
body = sanitize_text(f"{title}\n\n{text}")
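        # Skip near-empty cards; the 15-character floor mirrors MIN_TEXT_LENGTH in utils.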
if len(body) < 15:
continue
items.append(
NormalizedItem(
source="trustpilot",
id=card.get("data-review-id", str(hash(body))),
url=url,
author=author,
timestamp=ensure_timezone(timestamp),
text=body,
meta={
"rating": rating,
"user_agent": user_agent,
},
)
)
return items
def fetch_reviews(brand: str, language: str = "en", pages: int = 2) -> List[NormalizedItem]:
"""Scrape Trustpilot search results for recent reviews."""
if not brand:
raise ServiceWarning("Brand name is required for Trustpilot scraping.")
session = requests.Session()
user_agent = random_user_agent()
_check_robots(user_agent)
aggregated: List[NormalizedItem] = []
seen_ids: set[str] = set()
for page in range(1, pages + 1):
try:
html = _fetch_page(session, user_agent=user_agent, page=page, brand=brand, language=language)
except BlockedError as exc:
raise ServiceWarning(
"Trustpilot blocked the scraping attempt. Consider using their official API or providing CSV uploads."
) from exc
except requests.RequestException as exc: # noqa: BLE001
raise ServiceError(f"Trustpilot request failed: {exc}") from exc
page_items = _parse_reviews(html, user_agent)
for item in page_items:
if item["id"] in seen_ids:
continue
aggregated.append(item)
seen_ids.add(item["id"])
time.sleep(1.5) # gentle crawl delay
return aggregated
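
A short usage sketch, assuming the package is importable as services; robots.txt refusals and outright blocks both surface as ServiceWarning, while network failures raise ServiceError:

from services import trustpilot_scraper
from services.utils import ServiceWarning

try:
    reviews = trustpilot_scraper.fetch_reviews("Acme", language="en", pages=1)
except ServiceWarning as warning:
    print(f"Trustpilot skipped: {warning}")
else:
    print(len(reviews), "reviews scraped")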


@@ -0,0 +1,98 @@
"""Twitter (X) data collection using the v2 recent search API."""
from __future__ import annotations
import time
from datetime import datetime, timezone
from typing import List, Optional
import requests
from .utils import NormalizedItem, ServiceError, ServiceWarning, ensure_timezone, sanitize_text
SEARCH_URL = "https://api.twitter.com/2/tweets/search/recent"
def _build_query(brand: str, language: str) -> str:
terms = [brand]
if language:
terms.append(f"lang:{language}")
return " ".join(terms)
def fetch_mentions(
brand: str,
bearer_token: Optional[str],
limit: int = 25,
min_likes: int = 0,
language: str = "en",
) -> List[NormalizedItem]:
"""Fetch recent tweets mentioning the brand."""
if not bearer_token:
raise ServiceWarning(
"Twitter bearer token not provided. Add it in the sidebar to enable Twitter ingestion."
)
headers = {
"Authorization": f"Bearer {bearer_token}",
"User-Agent": "ReputationRadar/1.0",
}
params = {
"query": _build_query(brand, language),
"max_results": min(100, limit),
"tweet.fields": "author_id,created_at,lang,public_metrics",
"expansions": "author_id",
"user.fields": "name,username",
}
collected: List[NormalizedItem] = []
next_token: Optional[str] = None
while len(collected) < limit:
if next_token:
params["next_token"] = next_token
response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=15)
if response.status_code == 401:
raise ServiceWarning("Twitter API authentication failed. Please verify the bearer token.")
if response.status_code == 429:
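            # Simple backoff on rate limiting; note this keeps retrying until the window resets.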
time.sleep(5)
continue
if response.status_code >= 400:
raise ServiceError(f"Twitter API error {response.status_code}: {response.text}")
payload = response.json()
data = payload.get("data", [])
includes = payload.get("includes", {})
users_index = {user["id"]: user for user in includes.get("users", [])}
for tweet in data:
created_at = datetime.fromisoformat(tweet["created_at"].replace("Z", "+00:00"))
author_info = users_index.get(tweet["author_id"], {})
item = NormalizedItem(
source="twitter",
id=tweet["id"],
url=f"https://twitter.com/{author_info.get('username','')}/status/{tweet['id']}",
author=author_info.get("username"),
timestamp=ensure_timezone(created_at),
text=sanitize_text(tweet["text"]),
meta={
"likes": tweet.get("public_metrics", {}).get("like_count", 0),
"retweets": tweet.get("public_metrics", {}).get("retweet_count", 0),
"replies": tweet.get("public_metrics", {}).get("reply_count", 0),
"quote_count": tweet.get("public_metrics", {}).get("quote_count", 0),
},
)
if not item["text"]:
continue
if item["meta"]["likes"] < min_likes:
continue
collected.append(item)
if len(collected) >= limit:
break
next_token = payload.get("meta", {}).get("next_token")
if not next_token:
break
time.sleep(1) # stay friendly to rate limits
return collected[:limit]
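
A minimal usage sketch, assuming the package is importable as services and that a bearer token is supplied via the TWITTER_BEARER_TOKEN environment variable (a missing token raises ServiceWarning):

import os
from services import twitter_client

tweets = twitter_client.fetch_mentions(
    "Acme",
    bearer_token=os.getenv("TWITTER_BEARER_TOKEN"),
    limit=20,
    min_likes=1,
)
print(len(tweets), "tweets collected")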


@@ -0,0 +1,217 @@
"""Utility helpers for ReputationRadar services."""
from __future__ import annotations
import json
import logging
import random
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, TypedDict
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
LOG_FILE = Path(__file__).resolve().parents[1] / "logs" / "app.log"
MIN_TEXT_LENGTH = 15
SIMILARITY_THRESHOLD = 90
class NormalizedItem(TypedDict):
"""Canonical representation of a fetched mention."""
source: str
id: str
url: str
author: Optional[str]
timestamp: datetime
text: str
meta: Dict[str, object]
class ServiceError(RuntimeError):
"""Raised when a service hard fails."""
class ServiceWarning(RuntimeError):
"""Raised for recoverable issues that should surface to the UI."""
def initialize_logger(name: str = "reputation_radar") -> logging.Logger:
"""Configure and return a module-level logger."""
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
handlers=[
logging.FileHandler(LOG_FILE, encoding="utf-8"),
logging.StreamHandler(),
],
)
logger = logging.getLogger(name)
logger.setLevel(logging.INFO)
return logger
def load_sample_items(name: str) -> List[NormalizedItem]:
"""Load demo data from the samples directory."""
samples_dir = Path(__file__).resolve().parents[1] / "samples"
sample_path = samples_dir / f"{name}.json"
if not sample_path.exists():
return []
with sample_path.open("r", encoding="utf-8") as handle:
raw_items = json.load(handle)
cleaned: List[NormalizedItem] = []
for item in raw_items:
try:
cleaned.append(
NormalizedItem(
source=item["source"],
id=str(item["id"]),
url=item.get("url", ""),
author=item.get("author"),
timestamp=datetime.fromisoformat(item["timestamp"]),
text=item["text"],
meta=item.get("meta", {}),
)
)
except (KeyError, ValueError):
continue
return cleaned
def strip_html(value: str) -> str:
"""Remove HTML tags and normalize whitespace."""
if not value:
return ""
soup = BeautifulSoup(value, "html.parser")
text = soup.get_text(separator=" ", strip=True)
text = re.sub(r"\s+", " ", text)
text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
return text.strip()
def sanitize_text(value: str) -> str:
"""Clean text and remove excessive noise."""
text = strip_html(value)
text = re.sub(r"http\S+", "", text) # drop inline URLs
text = re.sub(r"\s{2,}", " ", text)
return text.strip()
def drop_short_items(items: Iterable[NormalizedItem], minimum_length: int = MIN_TEXT_LENGTH) -> List[NormalizedItem]:
"""Filter out items that are too short to analyze."""
return [
item
for item in items
if len(item["text"]) >= minimum_length
]
def fuzzy_deduplicate(items: Sequence[NormalizedItem], threshold: int = SIMILARITY_THRESHOLD) -> List[NormalizedItem]:
"""Remove duplicates based on URL or fuzzy text similarity."""
seen_urls: set[str] = set()
deduped: List[NormalizedItem] = []
for item in items:
url = item.get("url") or ""
text = item.get("text") or ""
if url and url in seen_urls:
continue
duplicate_found = False
for existing in deduped:
if not text or not existing.get("text"):
continue
if fuzz.token_set_ratio(text, existing["text"]) >= threshold:
duplicate_found = True
break
if not duplicate_found:
deduped.append(item)
if url:
seen_urls.add(url)
return deduped
def normalize_items(items: Sequence[NormalizedItem]) -> List[NormalizedItem]:
"""Apply sanitization, deduplication, and drop noisy entries."""
sanitized: List[NormalizedItem] = []
for item in items:
cleaned_text = sanitize_text(item.get("text", ""))
if len(cleaned_text) < MIN_TEXT_LENGTH:
continue
sanitized.append(
NormalizedItem(
source=item["source"],
id=item["id"],
url=item.get("url", ""),
author=item.get("author"),
timestamp=item["timestamp"],
text=cleaned_text,
meta=item.get("meta", {}),
)
)
return fuzzy_deduplicate(sanitized)
def parse_date_range(option: str) -> datetime:
"""Return a UTC timestamp threshold for the given range identifier."""
now = datetime.now(timezone.utc)
option = option.lower()
delta = {
"24h": timedelta(days=1),
"7d": timedelta(days=7),
"30d": timedelta(days=30),
}.get(option, timedelta(days=7))
return now - delta
def random_user_agent() -> str:
"""Return a random user agent string for polite scraping."""
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
]
return random.choice(user_agents)
def chunked(iterable: Sequence[str], size: int) -> Iterator[Sequence[str]]:
"""Yield successive chunks from iterable."""
for start in range(0, len(iterable), size):
yield iterable[start : start + size]
def validate_openai_key(api_key: Optional[str]) -> Tuple[Optional[str], List[str]]:
    """Validate an OpenAI key following the guidance from the day1 notebook."""
    warnings: List[str] = []
    if not api_key:
        warnings.append("No OpenAI API key detected. VADER fallback will be used.")
        return None, warnings
    # Strip stray whitespace first so the prefix check is not tripped by padding.
    if api_key.strip() != api_key:
        warnings.append("OpenAI API key had leading or trailing whitespace; it has been stripped.")
        api_key = api_key.strip()
    if not api_key.startswith("sk-"):
        warnings.append(
            "Provided OpenAI API key does not start with the expected prefix (sk-)."
        )
    return api_key, warnings
def ensure_timezone(ts: datetime) -> datetime:
"""Guarantee timestamps are timezone-aware in UTC."""
if ts.tzinfo is None:
return ts.replace(tzinfo=timezone.utc)
return ts.astimezone(timezone.utc)
def safe_int(value: Optional[object], default: int = 0) -> int:
"""Convert a value to int with a fallback."""
try:
return int(value) # type: ignore[arg-type]
except (TypeError, ValueError):
return default
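
A short sketch of the cleaning pipeline, assuming the package is importable as services; the near-duplicate pair below is purely illustrative:

from datetime import datetime, timezone
from services.utils import NormalizedItem, normalize_items

now = datetime.now(timezone.utc)
raw = [
    NormalizedItem(source="demo", id="1", url="https://example.com/a", author=None,
                   timestamp=now, text="<p>Great support team, resolved my issue fast!</p>", meta={}),
    NormalizedItem(source="demo", id="2", url="https://example.com/b", author=None,
                   timestamp=now, text="Great support team resolved my issue fast", meta={}),
]
print(len(normalize_items(raw)))  # HTML is stripped and fuzzy dedup collapses the pair -> 1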