AI Brochure Creator, powered by GPT-5 Nano and partially using the Responses API
from ipaddress import ip_address, IPv4Address, IPv6Address
from urllib.parse import ParseResult, urlparse

from bs4 import BeautifulSoup, Tag
from requests import get, RequestException, Session

class Extractor:
    """
    Extracts and processes content from HTML response text using BeautifulSoup.
    """

    __soup: BeautifulSoup

    __extracted_title: str = ""

    @property
    def extracted_title(self) -> str:
        """
        Returns the extracted title from the HTML content.
        """
        if not self.__extracted_title:
            self.__extracted_title = self.get_title()
        return self.__extracted_title

    __extracted_text: str = ""

    @property
    def extracted_text(self) -> str:
        """
        Returns the extracted main text content from the HTML, excluding irrelevant tags.
        """
        if not self.__extracted_text:
            self.__extracted_text = self.get_text()
        return self.__extracted_text

    __extracted_links_on_page: list[str] | None = None

    @property
    def extracted_links_on_page(self) -> list[str]:
        """
        Returns all href values found on the page.

        Notes:
            - Only anchor tags with an href are included.
            - Values are returned as-is (they may be relative or absolute).
        """
        if self.__extracted_links_on_page is None:
            self.__extracted_links_on_page = [
                str(a.get("href"))
                for a in self._soup.find_all("a", href=True)
                if isinstance(a, Tag)
            ]
        return self.__extracted_links_on_page

    @property
    def _soup(self) -> BeautifulSoup:
        """
        Returns the BeautifulSoup object for the HTML content.
        """
        return self.__soup

    def __init__(self, response_text_content: str) -> None:
        """
        Initializes the Extractor with HTML response text.

        Parameters:
            response_text_content (str): The HTML response text to be processed.
        """
        self.__soup = BeautifulSoup(response_text_content, "html.parser")
        self.__extracted_links_on_page = None

    def get_title(self) -> str:
        """
        Extracts the title from the HTML content.
        """
        return self._soup.title.get_text() if self._soup.title is not None else "No title"

    def get_text(self) -> str:
        """
        Extracts and cleans the main text content from the HTML, removing irrelevant tags.

        Note: this mutates the underlying soup by decomposing the removed tags.
        """
        # Strip tags that carry no useful text content before extracting.
        for irrelevant in self._soup.find_all([
            "script", "style", "img", "figure", "video", "audio",
            "button", "svg", "canvas", "input", "form", "meta",
        ]):
            irrelevant.decompose()
        raw_text: str = self._soup.get_text(separator="\n")
        # Collapse all whitespace runs (including the inserted newlines) to single spaces.
        cleaned_text: str = " ".join(raw_text.split())
        return cleaned_text if cleaned_text else "No content"
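
# A minimal usage sketch (hypothetical HTML; not part of the module's API):
#
#     html = ("<html><head><title>Acme</title></head>"
#             "<body><p>Hello world</p><a href='/about'>About</a></body></html>")
#     extractor = Extractor(html)
#     extractor.extracted_title          # "Acme"
#     extractor.extracted_links_on_page  # ["/about"]
#     extractor.extracted_text           # "Acme Hello world About"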


class Website:
    """
    A class to represent a website.
    """

    __DEFAULT_ALLOWED_DOMAINS: list[str] = [".com", ".org", ".net"]

    __title: str = ""
    __website_url: str = ""
    __text: str = ""
    __allowed_domains: list[str] = []
    __links_on_page: list[str] | None = None

    @property
    def title(self) -> str:
        """
        Returns the title of the website.
        """
        return self.__title

    @property
    def text(self) -> str:
        """
        Returns the main text content of the website.
        """
        return self.__text

    @property
    def website_url(self) -> str:
        """
        Returns the URL of the website.
        """
        return self.__website_url

    @property
    def links_on_page(self) -> list[str] | None:
        """
        Returns the list of links extracted from the website.
        """
        return self.__links_on_page

    @property
    def _allowed_domains(self) -> list[str]:
        """
        Returns the list of allowed domain suffixes.
        """
        return self.__allowed_domains

    @_allowed_domains.setter
    def _allowed_domains(self, value: list[str] | str) -> None:
        """
        Sets the list of allowed domain suffixes.
        Filters out empty items and ensures each suffix starts with a dot.
        """
        if isinstance(value, str):
            # Accept a comma-separated string such as ".com, org" and normalize it.
            value = [
                item.strip() if item.strip().startswith(".") else f".{item.strip()}"
                for item in value.split(",")
                if item.strip()
            ]
        else:
            value = [
                item if item.startswith(".") else f".{item}"
                for item in value
                if item
            ]
        self.__allowed_domains = value
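
    # For illustration, both input forms normalize to the same list:
    #     self._allowed_domains = ".com, org"      ->  [".com", ".org"]
    #     self._allowed_domains = ["com", ".org"]  ->  [".com", ".org"]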

    def _set_website_url(self, value: str) -> None:
        """
        Protected: validate and set the website URL, then fetch the website data.
        Use this from inside the class to initialize or change the URL.
        """
        if not value:
            raise ValueError("Website URL must be provided")

        parsed_url: ParseResult = urlparse(value)

        self._validate(parsed_url)

        self.__website_url = value
        self.__fetch_website_data()

    @property
    def fetch_failed(self) -> bool:
        """
        Returns whether the website data fetch failed.
        """
        return self.__fetch_failed

    def _validate(self, parsed_url: ParseResult) -> None:
        """
        Validate the parsed URL.

        Parameters:
            parsed_url: The parsed URL to validate.

        Raises:
            ValueError: If the URL is missing parts, uses an invalid scheme,
                points to a local/private address, or is not in the allowed domains.
        """
        if not parsed_url.netloc or parsed_url.scheme not in ("http", "https"):
            raise ValueError("Website URL must be a valid URL")

        if not parsed_url.hostname:
            raise ValueError("Website URL must contain a valid hostname")

        if self.__is_local_address(parsed_url.hostname):
            raise ValueError("Website URL must not be a local address")

        if not self.__is_allowed_domain(parsed_url.hostname):
            raise ValueError("Website URL must be an allowed domain")

    def __is_local_address(self, hostname: str) -> bool:
        """
        Check if the given hostname is a local address.

        Parameters:
            hostname (str): The hostname to check.

        Returns:
            bool: True if the hostname is a local address, False otherwise.
        """
        if hostname in ("localhost", "127.0.0.1", "::1"):
            return True

        try:
            ip: IPv4Address | IPv6Address = ip_address(hostname)
            if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
                return True
        except ValueError:
            # Hostname is not an IP literal; treat it as a public name.
            return False

        return False
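
    # Illustrative checks (hypothetical inputs):
    #     __is_local_address("10.0.0.1")     -> True   (private range)
    #     __is_local_address("169.254.1.1")  -> True   (link-local)
    #     __is_local_address("example.com")  -> False  (not an IP literal)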

    def __is_allowed_domain(self, hostname: str) -> bool:
        """
        Check if the given hostname is an allowed domain.

        Parameters:
            hostname (str): The hostname to check.

        Returns:
            bool: True if the hostname is an allowed domain, False otherwise.
        """
        # Compare against the configured suffixes (see _allowed_domains), so the
        # allowed_domains constructor argument is honored.
        return any(hostname.endswith(domain) for domain in self._allowed_domains)
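
    # For example, with _allowed_domains == [".com", ".org", ".net"]:
    #     __is_allowed_domain("example.com")  -> True
    #     __is_allowed_domain("example.dev")  -> False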

    def __fetch_website_data(self) -> None:
        """
        Fetch website content and populate title, text, and links.

        Side effects:
            - Sets internal state: __title, __text, __links_on_page, __fetch_failed.
            - Performs an HTTP GET with a browser-like User-Agent.
        """
        try:
            # Prefer the injected session (connection pooling); fall back to requests.get.
            get_fn = self.__session.get if self.__session else get
            response = get_fn(
                self.website_url,
                timeout=10,
                verify=True,
                headers={
                    "User-Agent": (
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/115.0.0.0 Safari/537.36"
                    )
                },
            )
        except RequestException as e:
            self.__title = "Error"
            self.__text = str(e)
            self.__fetch_failed = True
            return

        if response.ok:
            extractor: Extractor = Extractor(response.text)
            self.__title = extractor.extracted_title
            self.__text = extractor.extracted_text
            self.__links_on_page = extractor.extracted_links_on_page
        else:
            if response.status_code == 404:
                self.__title = "Not Found"
                self.__text = "The requested page was not found (404)."
            else:
                self.__title = "Error"
                self.__text = f"Error: {response.status_code} - {response.reason}"
            self.__fetch_failed = True

    def __init__(
        self,
        website_url: str,
        allowed_domains: list[str] | str | None = None,
        session: Session | None = None,
    ) -> None:
        """
        Initializes the Website object and fetches its data.

        Parameters:
            website_url (str): The URL of the website to fetch.
            allowed_domains (list[str] | str, optional): A list of allowed domain suffixes.
                If a string is provided, it should be a comma-separated list of domain
                suffixes (e.g., ".com,.org,.net").
            session (requests.Session | None, optional): Reused HTTP session for
                connection pooling.
        """
        self.__fetch_failed: bool = False
        self.__session: Session | None = session
        if allowed_domains is None:
            self._allowed_domains = self.__DEFAULT_ALLOWED_DOMAINS.copy()
        else:
            self._allowed_domains = allowed_domains
        # Use the protected setter internally so the public API exposes only the getter.
        self._set_website_url(website_url)

    def __str__(self) -> str:
        """
        Returns a string representation of the Website object.
        """
        return f"Website(title={self.title}, url={self.website_url})"