Merge pull request #604 from Execotryx/ai-brochure-on-gpt-5-nano

AI Brochure Creator, powered with GPT-5 Nano and partially utilizing Responses API
2025-08-23 10:08:36 +01:00
parent 910a4da24e b495be8c28
commit 7212c12a5a
7 changed files with 1039 additions and 0 deletions
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/.gitignore
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/.gitignore
@@ -0,0 +1,210 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+
+.*-env
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai-brochure-creator.py
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai-brochure-creator.py
@@ -0,0 +1,207 @@
+from ai_core import AICore
+from ai_brochure_config import AIBrochureConfig
+from extractor_of_relevant_links import ExtractorOfRelevantLinks
+from website import Website
+from openai.types.responses import Response
+from rich.console import Console
+from rich.markdown import Markdown
+from requests import Session
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from json import loads
+
+class BrochureCreator(AICore[str]):
+    """
+    Builds a short Markdown brochure for a company or individual by:
+    - extracting relevant links from the website,
+    - inferring the entity name and status,
+    - and prompting the model using the collected page content.
+    """
+
+    @property
+    def _website(self) -> Website:
+        """Return the main Website instance to analyze."""
+        return self.__website
+
+    @property
+    def _extractor(self) -> ExtractorOfRelevantLinks:
+        """Return the helper responsible for extracting relevant links."""
+        return self.__extractor
+
+    def __init__(self, config: AIBrochureConfig, website: Website) -> None:
+        """
+        Initialize the brochure creator with configuration and target website.
+
+        Parameters:
+            config: AI and runtime configuration.
+            website: The root website to analyze and summarize.
+        """
+        system_behavior: str = ("You are an assistant that analyzes the contents of several relevant pages from a company website "
+                                "and creates a short brochure about the company for prospective customers, investors and recruits. "
+                                "Include details of company culture, customers and careers/jobs if information is available. ")
+        super().__init__(config, system_behavior)
+        self.__website: Website = website
+        self.__extractor: ExtractorOfRelevantLinks = ExtractorOfRelevantLinks(config, website)
+
+    def create_brochure(self) -> str:
+        """
+        Create a short Markdown brochure based on the website's content.
+
+        The page excerpts are sent to the model as part of the entity-inference
+        question, so they are already in the chat history when the final
+        brochure prompt is asked.
+
+        Returns:
+            A Markdown string with the brochure, or a fallback message if no relevant pages were found.
+        """
+        relevant_pages: list[dict[str, str | Website]] = self._get_relevant_pages()
+        if not relevant_pages:
+            return "No relevant pages found to create a brochure."
+
+        brochure_prompt_part: str = self._form_brochure_prompt(relevant_pages)
+        inferred_company_name, inferred_status = self._infer_entity(brochure_prompt_part)
+
+        full_brochure_prompt: str = self._form_full_prompt(inferred_company_name, inferred_status)
+        return self.ask(full_brochure_prompt)
+
+    def _get_relevant_pages(self) -> list[dict[str, str | Website]]:
+        """
+        Resolve relevant links into Website objects using a shared session and concurrency.
+
+        Returns:
+            Page descriptors ({"type": ..., "page": Website}) in the order the
+            extractor listed the links; an empty list when the extractor output
+            contains no usable 'links' list.
+        """
+        # Limit the number of pages to fetch to keep latency and token usage reasonable.
+        MAX_PAGES: int = 6
+
+        extracted = self._extractor.extract_relevant_links()
+        # The extractor returns parsed model output, so its shape is not guaranteed.
+        links = extracted.get("links") if isinstance(extracted, dict) else None
+        if not isinstance(links, list):
+            return []
+
+        links_subset: list[dict[str, str]] = links[:MAX_PAGES]
+        if not links_subset:
+            return []
+
+        def build_page(item: dict[str, str], session: Session) -> dict[str, str | Website] | None:
+            try:
+                url = str(item["url"])
+                page_type = str(item["type"])
+                return {"type": page_type, "page": Website(url, session=session)}
+            except Exception:
+                # Best effort: one malformed entry or failing page must not abort the brochure.
+                return None
+
+        with Session() as session, ThreadPoolExecutor(max_workers=4) as executor:
+            # Executor.map yields results in submission order, so the prompt built
+            # from these pages is the same from run to run.
+            results = executor.map(lambda link: build_page(link, session), links_subset)
+            return [page for page in results if page]
+
+    def _truncate_text(self, text: str, limit: int) -> str:
+        """
+        Truncate text to at most 'limit' characters to reduce tokens and latency.
+
+        Parameters:
+            text: The text to shorten.
+            limit: Maximum length of the returned string.
+
+        Returns:
+            The text unchanged when it fits; otherwise a prefix followed by a
+            truncation marker. When 'limit' is too small to hold the marker, a
+            bare prefix of 'limit' characters is returned instead.
+        """
+        suffix: str = "... [truncated]"
+        if len(text) <= limit:
+            return text
+        if limit <= len(suffix):
+            return text[: max(0, limit)]
+        return text[: limit - len(suffix)] + suffix
+
+    def _form_brochure_prompt(self, relevant_pages: list[dict[str, str | Website]]) -> str:
+        """
+        Assemble a prompt that includes the main page and relevant pages' titles and text.
+
+        Parameters:
+            relevant_pages: List of page descriptors returned by _get_relevant_pages.
+
+        Returns:
+            A prompt string containing quoted sections per page.
+        """
+        QUOTE_DELIMITER: str = "\n\"\"\"\n"
+        MAX_MAIN_CHARS = 6000
+        MAX_PAGE_CHARS = 3000
+        sections: list[str] = [
+            f"Main page:{QUOTE_DELIMITER}"
+            f"Title: {self._website.title}\n"
+            f"Text:\n{self._truncate_text(self._website.text, MAX_MAIN_CHARS)}{QUOTE_DELIMITER}\n"
+        ]
+
+        for page in relevant_pages:
+            site = page["page"]
+            # Skip entries that are not pages or whose fetch failed.
+            if isinstance(site, Website) and not site.fetch_failed:
+                sections.append(
+                    f"{page['type']}:{QUOTE_DELIMITER}"
+                    f"Title: {site.title}\n"
+                    f"Text:\n{self._truncate_text(site.text, MAX_PAGE_CHARS)}{QUOTE_DELIMITER}\n"
+                )
+
+        return "".join(sections)
+
+    @staticmethod
+    def _strip_code_fence(text: str) -> str:
+        """
+        Remove a surrounding Markdown code fence (``` or ```json) from text, if present.
+        """
+        stripped = text.strip()
+        if not stripped.startswith("```"):
+            return stripped
+        # Drop the opening fence line (it may carry a language tag), then the closing fence.
+        _, _, body = stripped.partition("\n")
+        body = body.rstrip()
+        if body.endswith("```"):
+            body = body[:-3]
+        return body.strip()
+
+    def _infer_entity(self, brochure_prompt_part: str) -> tuple[str, str]:
+        """
+        Infer both the entity name and status in a single model call to reduce latency.
+
+        Parameters:
+            brochure_prompt_part: The page excerpts produced by _form_brochure_prompt.
+
+        Returns:
+            (name, status) where status is 'company' or 'individual'.
+        """
+        prompt = (
+            "From the following website excerpts, infer the entity name and whether it is a company or an individual. "
+            "Respond strictly as JSON with keys 'name' and 'status' (status must be 'company' or 'individual').\n"
+            f"{brochure_prompt_part}"
+        )
+        raw = self.ask(prompt)
+        fallback: tuple[str, str] = (raw.strip() or "Unknown", "company")
+        try:
+            data = loads(self._strip_code_fence(raw))
+        except ValueError:
+            # json.JSONDecodeError is a ValueError: the reply was not JSON, so
+            # use the entire output as the name and assume a company.
+            return fallback
+        if not isinstance(data, dict):
+            return fallback
+
+        # `or ""` keeps a JSON null from turning into the literal string "None".
+        name: str = str(data.get("name") or "").strip() or "Unknown"
+        status: str = str(data.get("status") or "").strip().lower()
+        if status not in ("company", "individual"):
+            status = "company"
+        return name, status
+
+    def _form_full_prompt(self, inferred_company_name: str, inferred_status: str) -> str:
+        """
+        Build the final brochure-generation prompt using the inferred entity and prior history.
+
+        Parameters:
+            inferred_company_name: The inferred entity name.
+            inferred_status: Either 'company' or 'individual'.
+
+        Returns:
+            A final prompt instructing the model to produce a Markdown brochure.
+        """
+        full_prompt: str = (f"You are looking at a {inferred_status} called {inferred_company_name}, to whom website {self._website.website_url} belongs.\n"
+                            f"Build a short brochure about the {inferred_status}. Use the information from the website that is already stored in the history.\n"
+                            "Your response must be in a Markdown format.")
+        return full_prompt
+
+    def ask(self, question: str) -> str:
+        """
+        Send a question to the model, update chat history, and return the text output.
+
+        Parameters:
+            question: The user prompt.
+
+        Returns:
+            The model output text.
+        """
+        self.history_manager.add_user_message(question)
+        response: Response = self._ai_api.responses.create(
+            model=self.config.model_name,
+            instructions=self.history_manager.system_behavior,
+            input=self.history_manager.chat_history,
+            reasoning={ "effort": "low" }
+        )
+        self.history_manager.add_assistant_message(response)
+        return response.output_text
+
+console: Console = Console()
+
+def display_markdown(content: str) -> None:
+    """
+    Parse the given text as Markdown and print it through the shared rich console.
+    """
+    rendered: Markdown = Markdown(content)
+    console.print(rendered)
+
+def show_summary(summary: str) -> None:
+    """
+    Render the summary as Markdown, or print a fallback line when it is empty.
+    """
+    if not summary:
+        console.print("No summary found.")
+        return
+    display_markdown(summary)
+
+if __name__ == "__main__":
+    # The target URL may be given as the first command-line argument. Without
+    # one, the original placeholder is used and must be edited before running.
+    from sys import argv
+
+    site_url: str = argv[1] if len(argv) > 1 else "<put your site address here>"
+    website: Website = Website(site_url)
+    brochure_creator: BrochureCreator = BrochureCreator(AIBrochureConfig(), website)
+    brochure: str = brochure_creator.create_brochure()
+    display_markdown(brochure)
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_brochure_config.py
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_brochure_config.py
@@ -0,0 +1,59 @@
+import os
+from dotenv import load_dotenv
+
+class AIBrochureConfig:
+    """
+    Configuration class to load environment variables.
+
+    The .env file is loaded on construction; individual values are read from
+    the environment on first access and cached on the instance afterwards.
+    """
+
+    def __init__(self) -> None:
+        load_dotenv(dotenv_path=".env")
+        # An empty string marks a value that has not been read yet; a real value
+        # can never be empty because empty variables are rejected on read.
+        self.__openai_api_key: str = ""
+        self.__model_name: str = ""
+
+    def __get_config_value(self, key: str) -> str:
+        """
+        Get the value of an environment variable.
+
+        Raises:
+            ValueError: If the key is empty, or the variable is unset or empty.
+        """
+        if not key:
+            raise ValueError("Key must be provided")
+
+        value: str | None = os.getenv(key)
+        if not value:
+            # Covers both a missing variable and one defined as an empty string.
+            raise ValueError(f"Environment variable '{key}' is not set or is empty")
+
+        return value
+
+    def _get_str(self, key: str) -> str:
+        """
+        Get a string value from the environment variables.
+        """
+        return self.__get_config_value(key)
+
+    def _get_int(self, key: str) -> int:
+        """
+        Get an integer value from the environment variables.
+
+        Raises:
+            ValueError: If the variable is unset, empty, or not a valid integer.
+        """
+        value = self.__get_config_value(key)
+        try:
+            return int(value)
+        except ValueError as err:
+            # Chain the cause so the offending raw value stays visible in the traceback.
+            raise ValueError(f"Environment variable '{key}' must be an integer") from err
+
+    @property
+    def openai_api_key(self) -> str:
+        """
+        Get the OpenAI API key from the environment variables.
+        """
+        if self.__openai_api_key == "":
+            self.__openai_api_key = self._get_str("OPENAI_API_KEY")
+        return self.__openai_api_key
+
+    @property
+    def model_name(self) -> str:
+        """
+        Get the model name from the environment variables.
+        """
+        if self.__model_name == "":
+            self.__model_name = self._get_str("MODEL_NAME")
+        return self.__model_name
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_core.py
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/ai_core.py
@@ -0,0 +1,181 @@
+import openai
+from abc import ABC, abstractmethod
+from ai_brochure_config import AIBrochureConfig
+from typing import Any, cast, Generic, TypeVar
+from openai.types.responses import ResponseInputItemParam, Response, ResponseOutputMessage
+
+# No `default=` here: that keyword needs Python 3.13+ (PEP 696) and raises
+# TypeError on older interpreters. Type checkers already treat an
+# unparameterized TypeVar as Any.
+TAiResponse = TypeVar('TAiResponse')
+
+class HistoryManager:
+    """
+    Hold the system instructions and the running message list of one model conversation.
+    """
+
+    def __init__(self, system_behavior: str) -> None:
+        """
+        Start an empty conversation.
+
+        Parameters:
+            system_behavior: The system instruction string for the conversation.
+        """
+        self.__system_behavior: str = system_behavior
+        self.__chat_history: list[ResponseInputItemParam] = []
+
+    @property
+    def system_behavior(self) -> str:
+        """
+        The system instruction string given at construction.
+        """
+        return self.__system_behavior
+
+    @property
+    def chat_history(self) -> list[ResponseInputItemParam]:
+        """
+        The messages recorded so far, as response input items (the internal list itself).
+        """
+        return self.__chat_history
+
+    def add_user_message(self, message: str) -> None:
+        """
+        Record a user turn.
+
+        Parameters:
+            message: The user text to add.
+        """
+        entry: ResponseInputItemParam = {
+            "role": "user",
+            "content": [{"type": "input_text", "text": message}],
+        }
+        self.__chat_history.append(entry)
+
+    def add_assistant_message(self, output_message: Response) -> None:
+        """
+        Record every output item of a model response.
+
+        Parameters:
+            output_message: The model response whose output items are stored.
+        """
+        # Each Pydantic output item is dumped to a plain dict (explicitly set
+        # fields only) so it can be sent back as an input item.
+        self.__chat_history.extend(
+            cast(ResponseInputItemParam, item.model_dump(exclude_unset=True))
+            for item in output_message.output
+        )
+
+
+class AICore(ABC, Generic[TAiResponse]):
+    """
+    Abstract base for model-backed helpers: keeps the configuration, a lazily
+    created OpenAI client and the conversation history; subclasses supply ask().
+    """
+
+    def __init__(self, config: AIBrochureConfig, system_behavior: str) -> None:
+        """
+        Store the configuration and start a fresh conversation history.
+
+        Parameters:
+            config (AIBrochureConfig): The configuration object for the AI core.
+            system_behavior (str): The system instructions for the conversation.
+        """
+        self.__config: AIBrochureConfig = config
+        self.__history_manager: HistoryManager = HistoryManager(system_behavior)
+        # The client is built on first use by the _ai_api property.
+        self.__ai_api: openai.OpenAI | None = None
+
+    @property
+    def config(self) -> AIBrochureConfig:
+        """
+        The configuration object currently held by this instance.
+
+        The internal reference itself is returned, not a copy.
+        """
+        return self.__config
+
+    @config.setter
+    def config(self, config: AIBrochureConfig | None) -> None:
+        """
+        Replace the configuration.
+
+        Parameters:
+            config: The new configuration; None installs a freshly constructed
+                AIBrochureConfig instead.
+        """
+        self.__config = AIBrochureConfig() if config is None else config
+
+    @property
+    def _ai_api(self) -> openai.OpenAI:
+        """
+        The OpenAI client, created on first access and reused afterwards.
+
+        The client is built with the API key from self.config.openai_api_key.
+
+        Returns:
+            openai.OpenAI: The cached client.
+
+        Raises:
+            ValueError: If self.config is None when the client has to be created.
+        """
+        client = self.__ai_api
+        if client is not None:
+            return client
+        if self.config is None:
+            raise ValueError("Configuration must be set before accessing AI API")
+        client = openai.OpenAI(api_key=self.config.openai_api_key)
+        self.__ai_api = client
+        return client
+
+    @property
+    def history_manager(self) -> HistoryManager:
+        """
+        The HistoryManager created in __init__ that holds the chat history and
+        system behavior for this instance.
+        """
+        return self.__history_manager
+
+    @abstractmethod
+    def ask(self, question: str) -> TAiResponse:
+        """
+        Ask a question to the AI model.
+
+        Parameters:
+            question: The question to ask.
+
+        Returns:
+            TAiResponse: The model's response type defined by the subclass.
+        """
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/extractor_of_relevant_links.py
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/extractor_of_relevant_links.py
@@ -0,0 +1,91 @@
+from ai_brochure_config import AIBrochureConfig
+from website import Website
+from ai_core import AICore
+from openai.types.responses import Response
+from json import loads
+
+# Shape of the model's answer: {"links": [{"type": ..., "url": ...}, ...]}.
+RelevantLinksDict = dict[str, list[dict[str, str]]]
+
+class ExtractorOfRelevantLinks(AICore[RelevantLinksDict]):
+    """
+    Extractor for relevant links from a website.
+    """
+
+    @property
+    def website(self) -> Website:
+        """Return the root Website whose links are being analyzed."""
+        return self.__website
+
+    def __init__(self, config: AIBrochureConfig, website: Website) -> None:
+        """
+        Initialize the extractor with configuration and target website.
+
+        Parameters:
+            config: AI and runtime configuration.
+            website: The Website from which links were collected.
+        """
+        # Adjacent literals are joined verbatim, so each sentence carries its own
+        # trailing space to keep the sentences from running together.
+        system_behavior: str = ("You are an expert in creation of online advertisement materials. "
+                                  "You are going to be provided with a list of links found on a website. "
+                                  "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
+                                  "such as links to an About page or a Company page or Careers/Jobs pages.\n"
+                                  "You should respond in JSON as in this example:")
+        system_behavior += """
+        {
+            "links": [
+                {"type": "about page", "url": "https://www.example.com/about"},
+                {"type": "company page", "url": "https://www.another_example.net/company"},
+                {"type": "careers page", "url": "https://ex.one_more_example.org/careers"}
+            ]
+        }
+        """
+        super().__init__(config, system_behavior)
+        self.__website: Website = website
+
+    def get_links_user_prompt(self) -> str:
+        """
+        Build a user prompt listing discovered links and instructions for relevance filtering.
+
+        Returns:
+            A string to send to the model listing links and guidance.
+        """
+        starter_part: str = (f"Here is a list of links found on the website of {self.website.website_url} - "
+                             "please decide which of these links are relevant web links for a brochure about company. "
+                             "Respond with full HTTPS URLs. Avoid including Terms of Service, Privacy, email links.\n"
+                             "Links (some might be relative links):\n")
+
+        links_part: str = "\n".join(f"- {link}" for link in self.website.links_on_page) if self.website.links_on_page else "No links found."
+
+        return starter_part + links_part
+
+    def extract_relevant_links(self) -> RelevantLinksDict:
+        """
+        Request the model to select relevant links for brochure creation.
+
+        Returns:
+            A dictionary with a 'links' array containing objects with 'type' and 'url'.
+        """
+        return self.ask(self.get_links_user_prompt())
+
+    @staticmethod
+    def _parse_links(raw: str) -> RelevantLinksDict:
+        """
+        Parse the model's text output into a dictionary that always has a 'links' list.
+
+        A surrounding Markdown code fence (``` or ```json) is removed before
+        parsing. Valid JSON of an unexpected shape yields {"links": []}.
+
+        Raises:
+            json.JSONDecodeError: If the text is not valid JSON.
+        """
+        text = raw.strip()
+        if text.startswith("```"):
+            # Drop the opening fence line (it may carry a language tag), then the closing fence.
+            _, _, text = text.partition("\n")
+            text = text.rstrip().removesuffix("```").strip()
+
+        parsed = loads(text)
+        if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
+            return {"links": []}
+        return parsed
+
+    def ask(self, question: str) -> RelevantLinksDict:
+        """
+        Send a question to the model and parse the JSON response.
+
+        Parameters:
+            question: The prompt to submit.
+
+        Returns:
+            RelevantLinksDict: Parsed JSON containing selected links; the 'links'
+            key is always present and holds a list.
+
+        Raises:
+            json.JSONDecodeError: If the model output is not valid JSON.
+        """
+        self.history_manager.add_user_message(question)
+
+        response: Response = self._ai_api.responses.create(
+            model=self.config.model_name,
+            instructions=self.history_manager.system_behavior,
+            reasoning={ "effort": "low" },
+            input=self.history_manager.chat_history
+        )
+
+        self.history_manager.add_assistant_message(response)
+        return self._parse_links(response.output_text)
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/requirements.txt
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/requirements.txt
@@ -0,0 +1,5 @@
+python-dotenv
+openai
+bs4
+requests
+rich
--- a/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/website.py
+++ b/week1/community-contributions/ai-powered-marketing-brochures-gpt-5/website.py
@@ -0,0 +1,286 @@
+from ipaddress import ip_address, IPv4Address, IPv6Address
+from urllib.parse import ParseResult, urlparse
+from bs4 import BeautifulSoup, Tag
+from requests import get, RequestException, Session
+
+class Extractor:
+    """
+    Extracts and processes content from HTML response text using BeautifulSoup.
+    """
+    __soup: BeautifulSoup
+
+    __extracted_title: str = ""
+    @property
+    def extracted_title(self) -> str:
+        """
+        Returns the extracted title from the HTML content.
+        """
+        if not self.__extracted_title:
+            self.__extracted_title = self.get_title()
+        return self.__extracted_title
+
+    __extracted_text: str = ""
+    @property
+    def extracted_text(self) -> str:
+        """
+        Returns the extracted main text content from the HTML, excluding irrelevant tags.
+        """
+        if not self.__extracted_text:
+            self.__extracted_text = self.get_text()
+        return self.__extracted_text
+
+    __extracted_links_on_page: list[str] | None = None
+    @property
+    def extracted_links_on_page(self) -> list[str]:
+        """
+        Return all href values found on the page.
+
+        Notes:
+            - Only anchor tags with an href are included.
+            - Values are returned as-is (may be relative or absolute).
+        """
+        if self.__extracted_links_on_page is None:
+            self.__extracted_links_on_page = [str(a.get("href")) for a in self._soup.find_all('a', href=True) if isinstance(a, Tag)]
+        return self.__extracted_links_on_page
+
+    @property
+    def _soup(self) -> BeautifulSoup:
+        """
+        Returns the BeautifulSoup object for the HTML content.
+        """
+        return self.__soup
+    
+    def __init__(self, response_text_content: str) -> None:
+        """
+        Initializes the Extractor with HTML response text.
+
+        Parameters:
+            response_text_content (str): The HTML response text to be processed.
+        """
+        self.__soup = BeautifulSoup(response_text_content, "html.parser")
+        self.__extracted_links_on_page = None
+
+    def get_title(self) -> str:
+        """
+        Extracts the title from the HTML content.
+        """
+        return self._soup.title.get_text() if self._soup.title is not None else "No title"
+
+    def get_text(self) -> str:
+        """
+        Extracts and cleans the main text content from the HTML, removing irrelevant tags.
+        """
+        for irrelevant in self._soup.find_all(["script", "style", "img", "figure", "video", "audio", "button", "svg", "canvas", "input", "form", "meta"]):
+            irrelevant.decompose()
+        raw_text: str = self._soup.get_text(separator="\n")
+        cleaned_text: str = " ".join(raw_text.split())
+        return cleaned_text if cleaned_text else "No content"
+
+class Website:
+    """
+    A class to represent a website.
+    """
+
+    __DEFAULT_ALLOWED_DOMAINS: list[str] = [".com", ".org", ".net"]
+
+    __title: str = ""
+    __website_url: str = ""
+    __text: str = ""
+    __allowed_domains: list[str] = []
+    __links_on_page: list[str] | None = None
+
+    @property
+    def title(self) -> str:
+        """
+        Returns the title of the website.
+        """
+        return self.__title
+
+    @property
+    def text(self) -> str:
+        """
+        Returns the main text content of the website.
+        """
+        return self.__text
+
+    @property
+    def website_url(self) -> str:
+        """
+        Returns the URL of the website.
+        """
+        return self.__website_url
+
+    @property
+    def links_on_page(self) -> list[str] | None:
+        """
+        Returns the list of links extracted from the website.
+        """
+        return self.__links_on_page
+
+    @property
+    def _allowed_domains(self) -> list[str]:
+        """
+        Returns the list of allowed domain suffixes.
+        """
+        return self.__allowed_domains
+
+    @_allowed_domains.setter
+    def _allowed_domains(self, value: list[str] | str) -> None:
+        """
+        Sets the list of allowed domain suffixes.
+        Filters out empty strings and ensures each suffix starts with a dot.
+        """
+        if isinstance(value, str):
+            value = [
+                item.strip() if item.strip().startswith(".") else f".{item.strip()}"
+                for item in value.split(",")
+                if item.strip()
+            ]
+        else:
+            value = [
+                item if item.startswith(".") else f".{item}"
+                for item in value
+                if item
+            ]
+        self.__allowed_domains = value
+
+    def _set_website_url(self, value: str) -> None:
+        """
+        Protected: set the website URL after validating and fetch website data.
+        Use this from inside the class to initialize or change the URL.
+        """
+        if not value:
+            raise ValueError("Website URL must be provided")
+
+        parsed_url: ParseResult = urlparse(value)
+
+        self._validate(parsed_url)
+
+        self.__website_url = value
+        self.__fetch_website_data()
+
+    @property
+    def fetch_failed(self) -> bool:
+        """
+        Returns whether the website data fetch failed.
+        """
+        return self.__fetch_failed
+
+    def _validate(self, parsed_url: ParseResult) -> None:
+        """
+        Validate the parsed URL.
+
+        Parameters:
+            parsed_url: The parsed URL to validate.
+
+        Raises:
+            ValueError: If the URL is missing parts, uses an invalid scheme,
+                        points to a local/private address, or is not in allowed domains.
+        """
+        if not parsed_url.netloc or parsed_url.scheme not in ("http", "https"):
+            raise ValueError("Website URL must be a valid URL")
+
+        if not parsed_url.hostname:
+            raise ValueError("Website URL must contain a valid hostname")
+
+        if self.__is_local_address(parsed_url.hostname):
+            raise ValueError("Website URL must not be a local address")
+
+        if not self.__is_allowed_domain(parsed_url.hostname):
+            raise ValueError("Website URL must be an allowed domain")
+
+    def __is_local_address(self, hostname: str) -> bool:
+        """
+        Check if the given hostname is a local address.
+
+        Parameters:
+            hostname (str): The hostname to check.
+
+        Returns:
+            bool: True if the hostname is a local address, False otherwise.
+        """
+        if hostname in ("localhost", "127.0.0.1", "::1"):
+            return True
+
+        try:
+            ip: IPv4Address | IPv6Address = ip_address(hostname)
+            if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
+                return True
+        except ValueError:
+            return False
+
+        return False
+
+    def __is_allowed_domain(self, hostname: str) -> bool:
+        """
+        Check if the given hostname is an allowed domain.
+
+        Parameters:
+            hostname (str): The hostname to check.
+
+        Returns:
+            bool: True if the hostname is an allowed domain, False otherwise.
+        """
+        allowed_domains = [".com", ".org", ".net", ".io"]
+        return any(hostname.endswith(domain) for domain in allowed_domains)
+
+    def __fetch_website_data(self) -> None:
+        """
+        Fetch website content and populate title, text, and links.
+
+        Side effects:
+            - Sets internal state: __title, __text, __links_on_page, __fetch_failed.
+            - Performs an HTTP GET with a browser-like User-Agent.
+        """
+        try:
+            get_fn = self.__session.get if self.__session else get
+            response = get_fn(
+                self.website_url,
+                timeout=10,
+                verify=True,
+                headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"}
+            )
+        except RequestException as e:
+            self.__title = "Error"
+            self.__text = str(e)
+            self.__fetch_failed = True
+            return
+        
+        if response.ok:
+            extractor: Extractor = Extractor(response.text)
+            self.__title = extractor.extracted_title
+            self.__text = extractor.extracted_text
+            self.__links_on_page = extractor.extracted_links_on_page
+        else:
+            if response.status_code == 404:
+                self.__title = "Not Found"
+                self.__text = "The requested page was not found (404)."
+            else:
+                self.__title = "Error"
+                self.__text = f"Error: {response.status_code} - {response.reason}"
+            self.__fetch_failed = True
+
+    def __init__(self, website_url: str, allowed_domains: list[str] | str | None = None, session: Session | None = None) -> None:
+        """
+        Initializes the Website object and fetches its data.
+
+        Parameters:
+            website_url (str): The URL of the website to fetch.
+            allowed_domains (list[str] | str, optional): A list of allowed domain suffixes.
+                If a string is provided, it should be a comma-separated list of domain suffixes (e.g., ".com,.org,.net").
+            session (requests.Session | None, optional): Reused HTTP session for connection pooling.
+        """
+        self.__fetch_failed: bool = False
+        self.__session: Session | None = session
+        if allowed_domains is None:
+            self._allowed_domains = self.__DEFAULT_ALLOWED_DOMAINS.copy()
+        else:
+            self._allowed_domains = allowed_domains
+        # Use protected setter internally so the public API exposes only the getter.
+        self._set_website_url(website_url)
+
+    def __str__(self) -> str:
+        """
+        Returns a string representation of the Website object.
+        """
+        return f"Website(title={self.title}, url={self.website_url})"