From 10b42a739db33444590228250903129b7c260919 Mon Sep 17 00:00:00 2001 From: Execotryx Date: Sun, 17 Aug 2025 20:17:31 +0500 Subject: [PATCH 1/2] AI Brochure Creator, powered with GPT-5 Nano and partially utilizing Responses API --- .../ai-powered-marketing-brochures/.gitignore | 210 +++++++++++++ .../ai-brochure-creator.py | 207 +++++++++++++ .../ai_brochure_config.py | 59 ++++ .../ai-powered-marketing-brochures/ai_core.py | 181 +++++++++++ .../extractor_of_relevant_links.py | 91 ++++++ .../requirements.txt | 5 + .../ai-powered-marketing-brochures/website.py | 286 ++++++++++++++++++ 7 files changed, 1039 insertions(+) create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/.gitignore create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/ai_core.py create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/requirements.txt create mode 100644 week1/community-contributions/ai-powered-marketing-brochures/website.py diff --git a/week1/community-contributions/ai-powered-marketing-brochures/.gitignore b/week1/community-contributions/ai-powered-marketing-brochures/.gitignore new file mode 100644 index 0000000..290698f --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/.gitignore @@ -0,0 +1,210 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Cursor +# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to +# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data +# refer to https://docs.cursor.com/context/ignore-files +.cursorignore +.cursorindexingignore + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + + +.*-env \ No newline at end of file diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py b/week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py new file mode 100644 index 0000000..79f3246 --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py @@ -0,0 +1,207 @@ +from ai_core import AICore +from ai_brochure_config import AIBrochureConfig +from extractor_of_relevant_links import ExtractorOfRelevantLinks +from website import Website +from openai.types.responses import Response +from rich.console import Console +from rich.markdown import Markdown +from requests import Session +from concurrent.futures import ThreadPoolExecutor, as_completed +from json import loads + +class BrochureCreator(AICore[str]): + """ + Builds a short Markdown brochure for a company or individual by: + - extracting relevant links from the website, + - inferring the entity name and status, + - and prompting the model using the collected page content. + """ + + @property + def _website(self) -> Website: + """Return the main Website instance to analyze.""" + return self.__website + + @property + def _extractor(self) -> ExtractorOfRelevantLinks: + """Return the helper responsible for extracting relevant links.""" + return self.__extractor + + def __init__(self, config: AIBrochureConfig, website: Website) -> None: + """ + Initialize the brochure creator with configuration and target website. + + Parameters: + config: AI and runtime configuration. + website: The root website to analyze and summarize. + """ + system_behavior: str = ("You are an assistant that analyzes the contents of several relevant pages from a company website " + "and creates a short brochure about the company for prospective customers, investors and recruits. " + "Include details of company culture, customers and careers/jobs if information is available. ") + super().__init__(config, system_behavior) + self.__website: Website = website + self.__extractor: ExtractorOfRelevantLinks = ExtractorOfRelevantLinks(config, website) + + def create_brochure(self) -> str: + """ + Create a short Markdown brochure based on the website's content. + + Returns: + A Markdown string with the brochure, or a fallback message if no relevant pages were found. + """ + relevant_pages: list[dict[str, str | Website]] = self._get_relevant_pages() + if not relevant_pages: + return "No relevant pages found to create a brochure." + + brochure_prompt_part: str = self._form_brochure_prompt(relevant_pages) + inferred_company_name, inferred_status = self._infer_entity(brochure_prompt_part) + + full_brochure_prompt: str = self._form_full_prompt(inferred_company_name, inferred_status) + response: str = self.ask(full_brochure_prompt) + return response + + def _get_relevant_pages(self) -> list[dict[str, str | Website]]: + """ + Resolve relevant links into Website objects using a shared session and concurrency. + """ + relevant_pages: list[dict[str, str | Website]] = [] + relevant_links: list[dict[str, str]] = self._extractor.extract_relevant_links()["links"] + # Limit the number of pages to fetch to keep latency and token usage reasonable. + MAX_PAGES: int = 6 + links_subset = relevant_links[:MAX_PAGES] + + def build_page(item: dict[str, str], session: Session) -> dict[str, str | Website] | None: + try: + url = str(item["url"]) + page_type = str(item["type"]) + return {"type": page_type, "page": Website(url, session=session)} + except Exception: + return None + + with Session() as session, ThreadPoolExecutor(max_workers=4) as executor: + futures = [executor.submit(build_page, link, session) for link in links_subset] + for fut in as_completed(futures): + res = fut.result() + if res: + relevant_pages.append(res) + + return relevant_pages + + def _truncate_text(self, text: str, limit: int) -> str: + """ + Truncate text to 'limit' characters to reduce tokens and latency. + """ + if len(text) <= limit: + return text + return text[: max(0, limit - 20)] + "... [truncated]" + + def _form_brochure_prompt(self, relevant_pages: list[dict[str, str | Website]]) -> str: + """ + Assemble a prompt that includes the main page and relevant pages' titles and text. + + Parameters: + relevant_pages: List of page descriptors returned by _get_relevant_pages. + + Returns: + A prompt string containing quoted sections per page. + """ + QUOTE_DELIMITER: str = "\n\"\"\"\n" + MAX_MAIN_CHARS = 6000 + MAX_PAGE_CHARS = 3000 + prompt: str = ( + f"Main page:{QUOTE_DELIMITER}" + f"Title: {self._website.title}\n" + f"Text:\n{self._truncate_text(self._website.text, MAX_MAIN_CHARS)}{QUOTE_DELIMITER}\n" + ) + + for page in relevant_pages: + if isinstance(page['page'], Website) and not page['page'].fetch_failed: + prompt += ( + f"{page['type']}:{QUOTE_DELIMITER}" + f"Title: {page['page'].title}\n" + f"Text:\n{self._truncate_text(page['page'].text, MAX_PAGE_CHARS)}{QUOTE_DELIMITER}\n" + ) + + return prompt + + def _infer_entity(self, brochure_prompt_part: str) -> tuple[str, str]: + """ + Infer both the entity name and status in a single model call to reduce latency. + Returns: + (name, status) where status is 'company' or 'individual'. + """ + prompt = ( + "From the following website excerpts, infer the entity name and whether it is a company or an individual. " + "Respond strictly as JSON with keys 'name' and 'status' (status must be 'company' or 'individual').\n" + f"{brochure_prompt_part}" + ) + raw = self.ask(prompt) + try: + data: dict[str, str] = loads(raw) + name: str = str(data.get("name", "")).strip() or "Unknown" + status: str = str(data.get("status", "")).strip().lower() + if status not in ("company", "individual"): + status = "company" + return name, status + except Exception: + # Fallback: use entire output as name, assume company + return raw.strip() or "Unknown", "company" + + def _form_full_prompt(self, inferred_company_name: str, inferred_status: str) -> str: + """ + Build the final brochure-generation prompt using the inferred entity and prior history. + + Parameters: + inferred_company_name: The inferred entity name. + inferred_status: Either 'company' or 'individual'. + + Returns: + A final prompt instructing the model to produce a Markdown brochure. + """ + full_prompt: str = (f"You are looking at a {inferred_status} called {inferred_company_name}, to whom website {self._website.website_url} belongs.\n" + f"Build a short brochure about the {inferred_status}. Use the information from the website that is already stored in the history.\n" + "Your response must be in a Markdown format.") + return full_prompt + + def ask(self, question: str) -> str: + """ + Send a question to the model, update chat history, and return the text output. + + Parameters: + question: The user prompt. + + Returns: + The model output text. + """ + self.history_manager.add_user_message(question) + response: Response = self._ai_api.responses.create( + model=self.config.model_name, + instructions=self.history_manager.system_behavior, + input=self.history_manager.chat_history, + reasoning={ "effort": "low" } + ) + self.history_manager.add_assistant_message(response) + return response.output_text + +console: Console = Console() + +def display_markdown(content: str) -> None: + """ + Render Markdown content to the console using rich. + """ + console.print(Markdown(content)) + +def show_summary(summary: str) -> None: + """ + Print a Markdown summary if provided; otherwise print a fallback message. + """ + if summary: + display_markdown(summary) + else: + console.print("No summary found.") + +if __name__ == "__main__": + website: Website = Website("") + brochure_creator: BrochureCreator = BrochureCreator(AIBrochureConfig(), website) + brochure: str = brochure_creator.create_brochure() + display_markdown(brochure) \ No newline at end of file diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py b/week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py new file mode 100644 index 0000000..9a0e2bd --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py @@ -0,0 +1,59 @@ +import os +from dotenv import load_dotenv + +class AIBrochureConfig: + """ + Configuration class to load environment variables. + """ + + def __get_config_value(self, key: str): + """ + Get the value of an environment variable. + """ + if not key: + raise ValueError("Key must be provided") + + value: str | None = os.getenv(key) + if not value: + raise ValueError(f"Environment variable '{key}' not found") + + return value + + def _get_str(self, key: str) -> str: + """ + Get a string value from the environment variables. + """ + return self.__get_config_value(key) + + def _get_int(self, key: str) -> int: + """ + Get an integer value from the environment variables. + """ + value = self.__get_config_value(key) + try: + return int(value) + except ValueError: + raise ValueError(f"Environment variable '{key}' must be an integer") + + @property + def openai_api_key(self) -> str: + """ + Get the OpenAI API key from the environment variables. + """ + if self.__openai_api_key == "": + self.__openai_api_key = self._get_str("OPENAI_API_KEY") + return self.__openai_api_key + + @property + def model_name(self) -> str: + """ + Get the model name from the environment variables. + """ + if self.__model_name == "": + self.__model_name = self._get_str("MODEL_NAME") + return self.__model_name + + def __init__(self) -> None: + load_dotenv(dotenv_path=".env") + self.__openai_api_key: str = "" + self.__model_name: str = "" diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai_core.py b/week1/community-contributions/ai-powered-marketing-brochures/ai_core.py new file mode 100644 index 0000000..e517f9d --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/ai_core.py @@ -0,0 +1,181 @@ +import openai +from abc import ABC, abstractmethod +from ai_brochure_config import AIBrochureConfig +from typing import Any, cast, Generic, TypeVar +from openai.types.responses import ResponseInputItemParam, Response, ResponseOutputMessage + +TAiResponse = TypeVar('TAiResponse', default=Any) + +class HistoryManager: + """ + Manage chat history and system behavior for a conversation with the model. + """ + @property + def chat_history(self) -> list[ResponseInputItemParam]: + """ + Return the accumulated conversation as a list of response input items. + """ + return self.__chat_history + + @property + def system_behavior(self) -> str: + """ + Return the system behavior (instructions) used for this conversation. + """ + return self.__system_behavior + + def __init__(self, system_behavior: str) -> None: + """ + Initialize the history manager. + + Parameters: + system_behavior: The system instruction string for the conversation. + """ + self.__chat_history: list[ResponseInputItemParam] = [] + self.__system_behavior: str = system_behavior + + def add_user_message(self, message: str) -> None: + """ + Append a user message to the chat history. + + Parameters: + message: The user text to add. + """ + self.__chat_history.append({ + "role": "user", + "content": [{"type": "input_text", "text": message}], + }) + + def add_assistant_message(self, output_message: Response) -> None: + """ + Append the assistant's output to the chat history. + + Parameters: + output_message: The model response to convert and store. + """ + for out in output_message.output: + # Convert the Pydantic output model to an input item shape + self.__chat_history.append( + cast(ResponseInputItemParam, out.model_dump(exclude_unset=True)) + ) + + +class AICore(ABC, Generic[TAiResponse]): + """ + Abstract base class for AI core functionalities. + """ + @property + def config(self) -> AIBrochureConfig: + """ + Return the stored AIBrochureConfig for this instance. + + Returns: + AIBrochureConfig: The current configuration used by this object. + + Notes: + - This accessor returns the internal configuration reference. Mutating the returned + object may affect the internal state of this instance. + - To change the configuration, use the appropriate setter or factory method rather + than modifying the returned value in-place. + """ + return self.__config + + @config.setter + def config(self, config: AIBrochureConfig | None) -> None: + """ + Set the instance configuration for the AI brochure generator. + + Parameters + ---------- + config : AIBrochureConfig | None + The configuration to assign to the instance. If None, the instance's + configuration will be reset to a newly created default AIBrochureConfig. + + Returns + ------- + None + + Notes + ----- + This method stores the provided configuration on a private attribute + """ + if config is None: + self.__config = AIBrochureConfig() + else: + self.__config = config + + @property + def _ai_api(self) -> openai.OpenAI: + """ + Return the cached OpenAI API client, initializing it on first access. + + This private helper lazily constructs and caches an openai.OpenAI client using + the API key found on self.config.openai_api_key. On the first call, if the + client has not yet been created, the method verifies that self.config is set, + creates the client with openai.OpenAI(api_key=...), stores it on + self.__ai_api, and returns it. Subsequent calls return the same cached + instance. + + Returns: + openai.OpenAI: A configured OpenAI API client. + + Raises: + ValueError: If self.config is None when attempting to initialize the client. + + Notes: + - The method mutates self.__ai_api as a side effect (caching). + - The caller should treat this as a private implementation detail. + - Thread safety is not guaranteed; concurrent initialization may result in + multiple client instances if invoked from multiple threads simultaneously. + """ + if self.__ai_api is None: + if self.config is None: + raise ValueError("Configuration must be set before accessing AI API") + self.__ai_api = openai.OpenAI(api_key=self.config.openai_api_key) + return self.__ai_api + + @property + def history_manager(self) -> HistoryManager: + """ + Return the history manager for this AI core instance. + + This property provides access to the HistoryManager that tracks the chat + history and system behavior. + + Returns: + HistoryManager: The current history manager. This property always returns + a HistoryManager instance and never None. + """ + return self.__history_manager + + def __init__(self, config: AIBrochureConfig, system_behavior: str) -> None: + """ + Initializes the AI core with the provided configuration. + + Parameters: + config (AIBrochureConfig): The configuration object for the AI core. + system_behavior (str): The behavior of the system. + """ + # Initialize all instance-level attributes here + self.__config: AIBrochureConfig = config + self.__history_manager: HistoryManager = HistoryManager(system_behavior) + self.__ai_api: openai.OpenAI | None = None + + if __debug__: + # Sanity check: confirm attributes are initialized + assert hasattr(self, "_AICore__config") + assert hasattr(self, "_AICore__history_manager") + assert hasattr(self, "_AICore__ai_api") + + @abstractmethod + def ask(self, question: str) -> TAiResponse: + """ + Ask a question to the AI model. + + Parameters: + question: The question to ask. + + Returns: + TAiResponse: The model's response type defined by the subclass. + """ + pass \ No newline at end of file diff --git a/week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py b/week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py new file mode 100644 index 0000000..e94fa38 --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py @@ -0,0 +1,91 @@ +from ai_brochure_config import AIBrochureConfig +from website import Website +from ai_core import AICore +from openai.types.responses import Response +from json import loads + +RelevantLinksDict = dict[str, list[dict[str, str]]] + +class ExtractorOfRelevantLinks(AICore[RelevantLinksDict]): + """ + Extractor for relevant links from a website. + """ + + @property + def website(self) -> Website: + """Return the root Website whose links are being analyzed.""" + return self.__website + + def __init__(self, config: AIBrochureConfig, website: Website) -> None: + """ + Initialize the extractor with configuration and target website. + + Parameters: + config: AI and runtime configuration. + website: The Website from which links were collected. + """ + system_behavior: str = ("You are an expert in creation of online advertisement materials." + "You are going to be provided with a list of links found on a website." + "You are able to decide which of the links would be most relevant to include in a brochure about the company," + "such as links to an About page or a Company page or Careers/Jobs pages.\n" + "You should respond in JSON as in this example:") + system_behavior += """ + { + "links": [ + {"type": "about page", "url": "https://www.example.com/about"}, + {"type": "company page", "url": "https://www.another_example.net/company"}, + {"type": "careers page", "url": "https://ex.one_more_example.org/careers"} + ] + } + """ + super().__init__(config, system_behavior) + self.__website: Website = website + + def get_links_user_prompt(self) -> str: + """ + Build a user prompt listing discovered links and instructions for relevance filtering. + + Returns: + A string to send to the model listing links and guidance. + """ + starter_part: str = (f"Here is a list of links found on the website of {self.website.website_url} - " + "please decide which of these links are relevant web links for a brochure about company." + "Respond with full HTTPS URLs. Avoid including Terms of Service, Privacy, email links.\n" + "Links (some might be relative links):\n") + + links_part: str = "\n".join(f"- {link}" for link in self.website.links_on_page) if self.website.links_on_page else "No links found." + + return starter_part + links_part + + def extract_relevant_links(self) -> RelevantLinksDict: + """ + Request the model to select relevant links for brochure creation. + + Returns: + A dictionary with a 'links' array containing objects with 'type' and 'url'. + """ + user_prompt = self.get_links_user_prompt() + response = self.ask(user_prompt) + return response + + def ask(self, question: str) -> RelevantLinksDict: + """ + Send a question to the model and parse the JSON response. + + Parameters: + question: The prompt to submit. + + Returns: + RelevantLinksDict: Parsed JSON containing selected links. + """ + self.history_manager.add_user_message(question) + + response: Response = self._ai_api.responses.create( + model=self.config.model_name, + instructions=self.history_manager.system_behavior, + reasoning={ "effort": "low" }, + input=self.history_manager.chat_history + ) + + self.history_manager.add_assistant_message(response) + return loads(response.output_text) \ No newline at end of file diff --git a/week1/community-contributions/ai-powered-marketing-brochures/requirements.txt b/week1/community-contributions/ai-powered-marketing-brochures/requirements.txt new file mode 100644 index 0000000..9747210 --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/requirements.txt @@ -0,0 +1,5 @@ +python-dotenv +openai +bs4 +requests +rich \ No newline at end of file diff --git a/week1/community-contributions/ai-powered-marketing-brochures/website.py b/week1/community-contributions/ai-powered-marketing-brochures/website.py new file mode 100644 index 0000000..ac9bb9d --- /dev/null +++ b/week1/community-contributions/ai-powered-marketing-brochures/website.py @@ -0,0 +1,286 @@ +from ipaddress import ip_address, IPv4Address, IPv6Address +from urllib.parse import ParseResult, urlparse +from bs4 import BeautifulSoup, Tag +from requests import get, RequestException, Session + +class Extractor: + """ + Extracts and processes content from HTML response text using BeautifulSoup. + """ + __soup: BeautifulSoup + + __extracted_title: str = "" + @property + def extracted_title(self) -> str: + """ + Returns the extracted title from the HTML content. + """ + if not self.__extracted_title: + self.__extracted_title = self.get_title() + return self.__extracted_title + + __extracted_text: str = "" + @property + def extracted_text(self) -> str: + """ + Returns the extracted main text content from the HTML, excluding irrelevant tags. + """ + if not self.__extracted_text: + self.__extracted_text = self.get_text() + return self.__extracted_text + + __extracted_links_on_page: list[str] | None = None + @property + def extracted_links_on_page(self) -> list[str]: + """ + Return all href values found on the page. + + Notes: + - Only anchor tags with an href are included. + - Values are returned as-is (may be relative or absolute). + """ + if self.__extracted_links_on_page is None: + self.__extracted_links_on_page = [str(a.get("href")) for a in self._soup.find_all('a', href=True) if isinstance(a, Tag)] + return self.__extracted_links_on_page + + @property + def _soup(self) -> BeautifulSoup: + """ + Returns the BeautifulSoup object for the HTML content. + """ + return self.__soup + + def __init__(self, response_text_content: str) -> None: + """ + Initializes the Extractor with HTML response text. + + Parameters: + response_text_content (str): The HTML response text to be processed. + """ + self.__soup = BeautifulSoup(response_text_content, "html.parser") + self.__extracted_links_on_page = None + + def get_title(self) -> str: + """ + Extracts the title from the HTML content. + """ + return self._soup.title.get_text() if self._soup.title is not None else "No title" + + def get_text(self) -> str: + """ + Extracts and cleans the main text content from the HTML, removing irrelevant tags. + """ + for irrelevant in self._soup.find_all(["script", "style", "img", "figure", "video", "audio", "button", "svg", "canvas", "input", "form", "meta"]): + irrelevant.decompose() + raw_text: str = self._soup.get_text(separator="\n") + cleaned_text: str = " ".join(raw_text.split()) + return cleaned_text if cleaned_text else "No content" + +class Website: + """ + A class to represent a website. + """ + + __DEFAULT_ALLOWED_DOMAINS: list[str] = [".com", ".org", ".net"] + + __title: str = "" + __website_url: str = "" + __text: str = "" + __allowed_domains: list[str] = [] + __links_on_page: list[str] | None = None + + @property + def title(self) -> str: + """ + Returns the title of the website. + """ + return self.__title + + @property + def text(self) -> str: + """ + Returns the main text content of the website. + """ + return self.__text + + @property + def website_url(self) -> str: + """ + Returns the URL of the website. + """ + return self.__website_url + + @property + def links_on_page(self) -> list[str] | None: + """ + Returns the list of links extracted from the website. + """ + return self.__links_on_page + + @property + def _allowed_domains(self) -> list[str]: + """ + Returns the list of allowed domain suffixes. + """ + return self.__allowed_domains + + @_allowed_domains.setter + def _allowed_domains(self, value: list[str] | str) -> None: + """ + Sets the list of allowed domain suffixes. + Filters out empty strings and ensures each suffix starts with a dot. + """ + if isinstance(value, str): + value = [ + item.strip() if item.strip().startswith(".") else f".{item.strip()}" + for item in value.split(",") + if item.strip() + ] + else: + value = [ + item if item.startswith(".") else f".{item}" + for item in value + if item + ] + self.__allowed_domains = value + + def _set_website_url(self, value: str) -> None: + """ + Protected: set the website URL after validating and fetch website data. + Use this from inside the class to initialize or change the URL. + """ + if not value: + raise ValueError("Website URL must be provided") + + parsed_url: ParseResult = urlparse(value) + + self._validate(parsed_url) + + self.__website_url = value + self.__fetch_website_data() + + @property + def fetch_failed(self) -> bool: + """ + Returns whether the website data fetch failed. + """ + return self.__fetch_failed + + def _validate(self, parsed_url: ParseResult) -> None: + """ + Validate the parsed URL. + + Parameters: + parsed_url: The parsed URL to validate. + + Raises: + ValueError: If the URL is missing parts, uses an invalid scheme, + points to a local/private address, or is not in allowed domains. + """ + if not parsed_url.netloc or parsed_url.scheme not in ("http", "https"): + raise ValueError("Website URL must be a valid URL") + + if not parsed_url.hostname: + raise ValueError("Website URL must contain a valid hostname") + + if self.__is_local_address(parsed_url.hostname): + raise ValueError("Website URL must not be a local address") + + if not self.__is_allowed_domain(parsed_url.hostname): + raise ValueError("Website URL must be an allowed domain") + + def __is_local_address(self, hostname: str) -> bool: + """ + Check if the given hostname is a local address. + + Parameters: + hostname (str): The hostname to check. + + Returns: + bool: True if the hostname is a local address, False otherwise. + """ + if hostname in ("localhost", "127.0.0.1", "::1"): + return True + + try: + ip: IPv4Address | IPv6Address = ip_address(hostname) + if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved: + return True + except ValueError: + return False + + return False + + def __is_allowed_domain(self, hostname: str) -> bool: + """ + Check if the given hostname is an allowed domain. + + Parameters: + hostname (str): The hostname to check. + + Returns: + bool: True if the hostname is an allowed domain, False otherwise. + """ + allowed_domains = [".com", ".org", ".net", ".io"] + return any(hostname.endswith(domain) for domain in allowed_domains) + + def __fetch_website_data(self) -> None: + """ + Fetch website content and populate title, text, and links. + + Side effects: + - Sets internal state: __title, __text, __links_on_page, __fetch_failed. + - Performs an HTTP GET with a browser-like User-Agent. + """ + try: + get_fn = self.__session.get if self.__session else get + response = get_fn( + self.website_url, + timeout=10, + verify=True, + headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"} + ) + except RequestException as e: + self.__title = "Error" + self.__text = str(e) + self.__fetch_failed = True + return + + if response.ok: + extractor: Extractor = Extractor(response.text) + self.__title = extractor.extracted_title + self.__text = extractor.extracted_text + self.__links_on_page = extractor.extracted_links_on_page + else: + if response.status_code == 404: + self.__title = "Not Found" + self.__text = "The requested page was not found (404)." + else: + self.__title = "Error" + self.__text = f"Error: {response.status_code} - {response.reason}" + self.__fetch_failed = True + + def __init__(self, website_url: str, allowed_domains: list[str] | str | None = None, session: Session | None = None) -> None: + """ + Initializes the Website object and fetches its data. + + Parameters: + website_url (str): The URL of the website to fetch. + allowed_domains (list[str] | str, optional): A list of allowed domain suffixes. + If a string is provided, it should be a comma-separated list of domain suffixes (e.g., ".com,.org,.net"). + session (requests.Session | None, optional): Reused HTTP session for connection pooling. + """ + self.__fetch_failed: bool = False + self.__session: Session | None = session + if allowed_domains is None: + self._allowed_domains = self.__DEFAULT_ALLOWED_DOMAINS.copy() + else: + self._allowed_domains = allowed_domains + # Use protected setter internally so the public API exposes only the getter. + self._set_website_url(website_url) + + def __str__(self) -> str: + """ + Returns a string representation of the Website object. + """ + return f"Website(title={self.title}, url={self.website_url})" \ No newline at end of file From b495be8c2861f172237c02318c99f2013b830594 Mon Sep 17 00:00:00 2001 From: Execotryx Date: Sun, 17 Aug 2025 23:44:24 +0500 Subject: [PATCH 2/2] AI Brochure Creator, powered with GPT-5 Nano and partially utilizing Responses API --- .../.gitignore | 0 .../ai-brochure-creator.py | 0 .../ai_brochure_config.py | 0 .../ai_core.py | 0 .../extractor_of_relevant_links.py | 0 .../requirements.txt | 0 .../website.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/.gitignore (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/ai-brochure-creator.py (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/ai_brochure_config.py (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/ai_core.py (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/extractor_of_relevant_links.py (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/requirements.txt (100%) rename week1/community-contributions/{ai-powered-marketing-brochures => ai-powered-marketing-brochures-gpt-5}/website.py (100%) diff --git a/week1/community-contributions/ai-powered-marketing-brochures/.gitignore b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/.gitignore similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/.gitignore rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/.gitignore diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai-brochure-creator.py similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/ai-brochure-creator.py rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai-brochure-creator.py diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_brochure_config.py similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/ai_brochure_config.py rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_brochure_config.py diff --git a/week1/community-contributions/ai-powered-marketing-brochures/ai_core.py b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_core.py similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/ai_core.py rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_core.py diff --git a/week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/extractor_of_relevant_links.py similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/extractor_of_relevant_links.py rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/extractor_of_relevant_links.py diff --git a/week1/community-contributions/ai-powered-marketing-brochures/requirements.txt b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/requirements.txt similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/requirements.txt rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/requirements.txt diff --git a/week1/community-contributions/ai-powered-marketing-brochures/website.py b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/website.py similarity index 100% rename from week1/community-contributions/ai-powered-marketing-brochures/website.py rename to week1/community-contributions/ai-powered-marketing-brochures-gpt-5/website.py