AI Brochure Creator, powered with GPT-5 Nano and partially utilizing Responses API
This commit is contained in:
210
week1/community-contributions/ai-powered-marketing-brochures/.gitignore
vendored
Normal file
210
week1/community-contributions/ai-powered-marketing-brochures/.gitignore
vendored
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[codz]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
#uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
#poetry.toml
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||||
|
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||||
|
#pdm.lock
|
||||||
|
#pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# pixi
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||||
|
#pixi.lock
|
||||||
|
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||||
|
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||||
|
.pixi
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.envrc
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
# Abstra
|
||||||
|
# Abstra is an AI-powered process automation framework.
|
||||||
|
# Ignore directories containing user credentials, local state, and settings.
|
||||||
|
# Learn more at https://abstra.io/docs
|
||||||
|
.abstra/
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||||
|
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||||
|
# you could uncomment the following to ignore the entire vscode folder
|
||||||
|
# .vscode/
|
||||||
|
|
||||||
|
# Ruff stuff:
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
|
||||||
|
# Cursor
|
||||||
|
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
||||||
|
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
||||||
|
# refer to https://docs.cursor.com/context/ignore-files
|
||||||
|
.cursorignore
|
||||||
|
.cursorindexingignore
|
||||||
|
|
||||||
|
# Marimo
|
||||||
|
marimo/_static/
|
||||||
|
marimo/_lsp/
|
||||||
|
__marimo__/
|
||||||
|
|
||||||
|
|
||||||
|
.*-env
|
||||||
@@ -0,0 +1,207 @@
|
|||||||
|
from ai_core import AICore
|
||||||
|
from ai_brochure_config import AIBrochureConfig
|
||||||
|
from extractor_of_relevant_links import ExtractorOfRelevantLinks
|
||||||
|
from website import Website
|
||||||
|
from openai.types.responses import Response
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.markdown import Markdown
|
||||||
|
from requests import Session
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from json import loads
|
||||||
|
|
||||||
|
class BrochureCreator(AICore[str]):
    """
    Build a short Markdown brochure for a company or individual.

    The workflow is:
    - ask the link extractor which links of the main page are relevant,
    - fetch those pages,
    - ask the model for the entity name and status (this call also places the
      collected page content into the chat history),
    - ask the model to write the brochure from that history.
    """

    def __init__(self, config: AIBrochureConfig, website: Website) -> None:
        """
        Initialize the brochure creator with configuration and target website.

        Parameters:
            config: AI and runtime configuration.
            website: The root website to analyze and summarize.
        """
        system_behavior: str = ("You are an assistant that analyzes the contents of several relevant pages from a company website "
                                "and creates a short brochure about the company for prospective customers, investors and recruits. "
                                "Include details of company culture, customers and careers/jobs if information is available. ")
        super().__init__(config, system_behavior)
        self.__website: Website = website
        self.__extractor: ExtractorOfRelevantLinks = ExtractorOfRelevantLinks(config, website)

    @property
    def _website(self) -> Website:
        """Return the main Website instance to analyze."""
        return self.__website

    @property
    def _extractor(self) -> ExtractorOfRelevantLinks:
        """Return the helper responsible for extracting relevant links."""
        return self.__extractor

    def create_brochure(self) -> str:
        """
        Create a short Markdown brochure based on the website's content.

        Returns:
            A Markdown string with the brochure, or a fallback message if no relevant pages were found.
        """
        relevant_pages: list[dict[str, str | Website]] = self._get_relevant_pages()
        if not relevant_pages:
            return "No relevant pages found to create a brochure."

        brochure_prompt_part: str = self._form_brochure_prompt(relevant_pages)
        # _infer_entity sends the page excerpts through ask(), which stores them
        # in the chat history; the final prompt below relies on that history.
        inferred_company_name, inferred_status = self._infer_entity(brochure_prompt_part)

        full_brochure_prompt: str = self._form_full_prompt(inferred_company_name, inferred_status)
        return self.ask(full_brochure_prompt)

    def _get_relevant_pages(self) -> list[dict[str, str | Website]]:
        """
        Resolve relevant links into Website objects using a shared session and concurrency.

        Returns:
            A list of {"type": ..., "page": Website} dicts in the order the links
            were returned by the extractor. Links that could not be turned into a
            page are left out. An empty list is returned when the extractor reply
            carries no usable "links" list.
        """
        # Limit the number of pages to fetch to keep latency and token usage reasonable.
        MAX_PAGES: int = 6

        extracted = self._extractor.extract_relevant_links()
        relevant_links = extracted.get("links") if isinstance(extracted, dict) else None
        if not isinstance(relevant_links, list):
            return []

        links_subset = relevant_links[:MAX_PAGES]
        if not links_subset:
            return []

        def build_page(item: dict[str, str], session: Session) -> dict[str, str | Website] | None:
            # Best effort: a malformed entry or a failing page must not abort the
            # whole brochure, so any error just drops this one link.
            try:
                url = str(item["url"])
                page_type = str(item["type"])
                return {"type": page_type, "page": Website(url, session=session)}
            except Exception:
                return None

        relevant_pages: list[dict[str, str | Website]] = []
        with Session() as session, ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(build_page, link, session) for link in links_subset]
            # Collect in submission order so the resulting prompt is the same
            # from run to run regardless of which download finishes first.
            for fut in futures:
                res = fut.result()
                if res:
                    relevant_pages.append(res)

        return relevant_pages

    def _truncate_text(self, text: str, limit: int) -> str:
        """
        Truncate text to 'limit' characters to reduce tokens and latency.

        Parameters:
            text: The text to shorten.
            limit: Maximum number of characters in the result.

        Returns:
            The text itself when it fits, otherwise a shortened text ending in
            "... [truncated]". When the limit is too small to hold that marker
            the text is simply cut at 'limit' characters.
        """
        if len(text) <= limit:
            return text

        marker = "... [truncated]"
        if limit < len(marker):
            # The marker alone would exceed the limit; fall back to a hard cut.
            return text[: max(0, limit)]
        return text[: max(0, limit - 20)] + marker

    def _form_brochure_prompt(self, relevant_pages: list[dict[str, str | Website]]) -> str:
        """
        Assemble a prompt that includes the main page and relevant pages' titles and text.

        Parameters:
            relevant_pages: List of page descriptors returned by _get_relevant_pages.

        Returns:
            A prompt string containing quoted sections per page.
        """
        QUOTE_DELIMITER: str = "\n\"\"\"\n"
        MAX_MAIN_CHARS = 6000
        MAX_PAGE_CHARS = 3000

        sections: list[str] = [
            f"Main page:{QUOTE_DELIMITER}"
            f"Title: {self._website.title}\n"
            f"Text:\n{self._truncate_text(self._website.text, MAX_MAIN_CHARS)}{QUOTE_DELIMITER}\n"
        ]

        for page in relevant_pages:
            site = page['page']
            # Skip descriptors without a Website and pages flagged as failed.
            if isinstance(site, Website) and not site.fetch_failed:
                sections.append(
                    f"{page['type']}:{QUOTE_DELIMITER}"
                    f"Title: {site.title}\n"
                    f"Text:\n{self._truncate_text(site.text, MAX_PAGE_CHARS)}{QUOTE_DELIMITER}\n"
                )

        return "".join(sections)

    @staticmethod
    def _parse_json_object(raw: str) -> dict[str, object] | None:
        """Return the JSON object contained in 'raw', or None if there is none."""
        text = raw.strip()
        try:
            parsed = loads(text)
        except ValueError:
            # The reply may wrap the object in code fences or prose; retry on the
            # outermost {...} span.
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                return None
            try:
                parsed = loads(text[start:end + 1])
            except ValueError:
                return None
        return parsed if isinstance(parsed, dict) else None

    def _infer_entity(self, brochure_prompt_part: str) -> tuple[str, str]:
        """
        Infer both the entity name and status in a single model call to reduce latency.

        Parameters:
            brochure_prompt_part: The page excerpts produced by _form_brochure_prompt.

        Returns:
            (name, status) where status is 'company' or 'individual'.
        """
        prompt = (
            "From the following website excerpts, infer the entity name and whether it is a company or an individual. "
            "Respond strictly as JSON with keys 'name' and 'status' (status must be 'company' or 'individual').\n"
            f"{brochure_prompt_part}"
        )
        raw = self.ask(prompt)

        data = self._parse_json_object(raw)
        if data is None:
            # Fallback: use entire output as name, assume company
            return raw.strip() or "Unknown", "company"

        # "or ''" maps a JSON null to the "Unknown" default instead of "None".
        name: str = str(data.get("name") or "").strip() or "Unknown"
        status: str = str(data.get("status") or "").strip().lower()
        if status not in ("company", "individual"):
            status = "company"
        return name, status

    def _form_full_prompt(self, inferred_company_name: str, inferred_status: str) -> str:
        """
        Build the final brochure-generation prompt using the inferred entity and prior history.

        Parameters:
            inferred_company_name: The inferred entity name.
            inferred_status: Either 'company' or 'individual'.

        Returns:
            A final prompt instructing the model to produce a Markdown brochure.
        """
        full_prompt: str = (f"You are looking at a {inferred_status} called {inferred_company_name}, to whom website {self._website.website_url} belongs.\n"
                            f"Build a short brochure about the {inferred_status}. Use the information from the website that is already stored in the history.\n"
                            "Your response must be in a Markdown format.")
        return full_prompt

    def ask(self, question: str) -> str:
        """
        Send a question to the model, update chat history, and return the text output.

        Parameters:
            question: The user prompt.

        Returns:
            The model output text.
        """
        self.history_manager.add_user_message(question)
        response: Response = self._ai_api.responses.create(
            model=self.config.model_name,
            instructions=self.history_manager.system_behavior,
            input=self.history_manager.chat_history,
            reasoning={ "effort": "low" }
        )
        self.history_manager.add_assistant_message(response)
        return response.output_text
|
||||||
|
|
||||||
|
# Shared rich console used by display_markdown and show_summary below.
console: Console = Console()
|
||||||
|
|
||||||
|
def display_markdown(content: str) -> None:
    """
    Print the given Markdown text to the module-level rich console.

    Parameters:
        content: Markdown source to render.
    """
    rendered = Markdown(content)
    console.print(rendered)
|
||||||
|
|
||||||
|
def show_summary(summary: str) -> None:
    """
    Show a summary as Markdown, or a fallback line when the summary is empty.

    Parameters:
        summary: Markdown text to display; a falsy value triggers the fallback.
    """
    if not summary:
        console.print("No summary found.")
        return
    display_markdown(summary)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: build a brochure for one site and print it.
    # The URL below is a placeholder and has to be replaced before running.
    website: Website = Website("<put your site address here>")
    brochure_creator: BrochureCreator = BrochureCreator(config=AIBrochureConfig(), website=website)
    brochure: str = brochure_creator.create_brochure()
    display_markdown(content=brochure)
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
class AIBrochureConfig:
    """
    Configuration class to load environment variables.

    Values are read from the process environment on first access of the
    corresponding property and cached on the instance afterwards.
    """

    def __init__(self) -> None:
        """Call load_dotenv for the ".env" path and reset the cached values."""
        load_dotenv(dotenv_path=".env")
        # Empty string means "not read yet"; see the properties below.
        self.__openai_api_key: str = ""
        self.__model_name: str = ""

    def __get_config_value(self, key: str) -> str:
        """
        Get the value of an environment variable.

        Parameters:
            key: Name of the environment variable.

        Returns:
            The non-empty value of the variable.

        Raises:
            ValueError: If 'key' is empty, or the variable is unset or empty.
        """
        if not key:
            raise ValueError("Key must be provided")

        value: str | None = os.getenv(key)
        if not value:
            raise ValueError(f"Environment variable '{key}' not found")

        return value

    def _get_str(self, key: str) -> str:
        """
        Get a string value from the environment variables.

        Raises:
            ValueError: If the variable is unset or empty.
        """
        return self.__get_config_value(key)

    def _get_int(self, key: str) -> int:
        """
        Get an integer value from the environment variables.

        Raises:
            ValueError: If the variable is unset, empty, or not an integer.
        """
        value = self.__get_config_value(key)
        try:
            return int(value)
        except ValueError as err:
            # Chain the original error so the offending value stays visible.
            raise ValueError(f"Environment variable '{key}' must be an integer") from err

    @property
    def openai_api_key(self) -> str:
        """
        Get the OpenAI API key from the environment variables.

        Raises:
            ValueError: If OPENAI_API_KEY is unset or empty.
        """
        if self.__openai_api_key == "":
            self.__openai_api_key = self._get_str("OPENAI_API_KEY")
        return self.__openai_api_key

    @property
    def model_name(self) -> str:
        """
        Get the model name from the environment variables.

        Raises:
            ValueError: If MODEL_NAME is unset or empty.
        """
        if self.__model_name == "":
            self.__model_name = self._get_str("MODEL_NAME")
        return self.__model_name
|
||||||
@@ -0,0 +1,181 @@
|
|||||||
|
import openai
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from ai_brochure_config import AIBrochureConfig
|
||||||
|
from typing import Any, cast, Generic, TypeVar
|
||||||
|
from openai.types.responses import ResponseInputItemParam, Response, ResponseOutputMessage
|
||||||
|
|
||||||
|
# Type of the value returned by AICore.ask in a concrete subclass.
# typing.TypeVar only accepts ``default=`` (PEP 696) on Python 3.13+; older
# interpreters raise TypeError for the keyword, so fall back to a plain TypeVar.
try:
    TAiResponse = TypeVar('TAiResponse', default=Any)
except TypeError:
    TAiResponse = TypeVar('TAiResponse')
|
||||||
|
|
||||||
|
class HistoryManager:
    """
    Keep the system instructions and the running message list of one conversation.
    """

    def __init__(self, system_behavior: str) -> None:
        """
        Create an empty history bound to the given system instructions.

        Parameters:
            system_behavior: The system instruction string for the conversation.
        """
        self.__system_behavior: str = system_behavior
        self.__chat_history: list[ResponseInputItemParam] = []

    @property
    def system_behavior(self) -> str:
        """
        The system instruction string given at construction time.
        """
        return self.__system_behavior

    @property
    def chat_history(self) -> list[ResponseInputItemParam]:
        """
        The stored conversation items (the internal list itself, not a copy).
        """
        return self.__chat_history

    def add_user_message(self, message: str) -> None:
        """
        Store a user turn at the end of the history.

        Parameters:
            message: The user text to add.
        """
        text_part = {"type": "input_text", "text": message}
        user_item = {"role": "user", "content": [text_part]}
        self.__chat_history.append(user_item)

    def add_assistant_message(self, output_message: Response) -> None:
        """
        Store every output item of a model response at the end of the history.

        Parameters:
            output_message: The model response to convert and store.
        """
        # Each output item is dumped to a plain dict (explicitly set fields only)
        # and stored as an input item.
        self.__chat_history.extend(
            cast(ResponseInputItemParam, item.model_dump(exclude_unset=True))
            for item in output_message.output
        )
|
||||||
|
|
||||||
|
|
||||||
|
class AICore(ABC, Generic[TAiResponse]):
    """
    Abstract base for objects that talk to the model.

    Holds the configuration, a HistoryManager and a lazily created OpenAI
    client; subclasses implement ask().
    """

    def __init__(self, config: AIBrochureConfig, system_behavior: str) -> None:
        """
        Store the configuration and create a fresh history.

        Parameters:
            config (AIBrochureConfig): The configuration object for the AI core.
            system_behavior (str): System instructions handed to the HistoryManager.
        """
        # The client stays None until _ai_api is first read.
        self.__ai_api: openai.OpenAI | None = None
        self.__history_manager: HistoryManager = HistoryManager(system_behavior)
        self.__config: AIBrochureConfig = config

        if __debug__:
            # Sanity check: the name-mangled attributes exist on the instance.
            for attribute in ("_AICore__config", "_AICore__history_manager", "_AICore__ai_api"):
                assert hasattr(self, attribute)

    @property
    def config(self) -> AIBrochureConfig:
        """
        The configuration object currently stored on this instance.

        Returns:
            AIBrochureConfig: The stored object itself (not a copy).
        """
        return self.__config

    @config.setter
    def config(self, config: AIBrochureConfig | None) -> None:
        """
        Replace the stored configuration.

        Parameters:
            config: The new configuration; None stores a newly constructed
                AIBrochureConfig instead.
        """
        self.__config = AIBrochureConfig() if config is None else config

    @property
    def _ai_api(self) -> openai.OpenAI:
        """
        The OpenAI client, created on first access and reused afterwards.

        The client is built with self.config.openai_api_key. No locking is
        performed around the creation.

        Returns:
            openai.OpenAI: The cached client.

        Raises:
            ValueError: If self.config is None when the client has to be created.
        """
        client = self.__ai_api
        if client is not None:
            return client

        if self.config is None:
            raise ValueError("Configuration must be set before accessing AI API")

        client = openai.OpenAI(api_key=self.config.openai_api_key)
        self.__ai_api = client
        return client

    @property
    def history_manager(self) -> HistoryManager:
        """
        The HistoryManager created in __init__ for this instance.

        Returns:
            HistoryManager: The stored history manager.
        """
        return self.__history_manager

    @abstractmethod
    def ask(self, question: str) -> TAiResponse:
        """
        Send a question to the model; implemented by subclasses.

        Parameters:
            question: The question to ask.

        Returns:
            TAiResponse: The response in the type chosen by the subclass.
        """
        ...
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
from ai_brochure_config import AIBrochureConfig
|
||||||
|
from website import Website
|
||||||
|
from ai_core import AICore
|
||||||
|
from openai.types.responses import Response
|
||||||
|
from json import loads
|
||||||
|
|
||||||
|
# Type of the parsed model reply: a dict whose "links" entry is a list of
# {"type": ..., "url": ...} string dicts (see the example in the system prompt).
RelevantLinksDict = dict[str, list[dict[str, str]]]
|
||||||
|
|
||||||
|
class ExtractorOfRelevantLinks(AICore[RelevantLinksDict]):
    """
    Extractor for relevant links from a website.

    Sends the links found on a Website to the model and parses the JSON reply
    that names the links worth including in a brochure.
    """

    def __init__(self, config: AIBrochureConfig, website: Website) -> None:
        """
        Initialize the extractor with configuration and target website.

        Parameters:
            config: AI and runtime configuration.
            website: The Website from which links were collected.
        """
        # Each literal ends with a space/newline so the sentences do not run
        # together after implicit concatenation.
        system_behavior: str = ("You are an expert in creation of online advertisement materials. "
                                "You are going to be provided with a list of links found on a website. "
                                "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
                                "such as links to an About page or a Company page or Careers/Jobs pages.\n"
                                "You should respond in JSON as in this example:")
        system_behavior += """
{
    "links": [
        {"type": "about page", "url": "https://www.example.com/about"},
        {"type": "company page", "url": "https://www.another_example.net/company"},
        {"type": "careers page", "url": "https://ex.one_more_example.org/careers"}
    ]
}
"""
        super().__init__(config, system_behavior)
        self.__website: Website = website

    @property
    def website(self) -> Website:
        """Return the root Website whose links are being analyzed."""
        return self.__website

    def get_links_user_prompt(self) -> str:
        """
        Build a user prompt listing discovered links and instructions for relevance filtering.

        Returns:
            A string to send to the model listing links and guidance.
        """
        starter_part: str = (f"Here is a list of links found on the website of {self.website.website_url} - "
                             "please decide which of these links are relevant web links for a brochure about company. "
                             "Respond with full HTTPS URLs. Avoid including Terms of Service, Privacy, email links.\n"
                             "Links (some might be relative links):\n")

        links = self.website.links_on_page
        links_part: str = "\n".join(f"- {link}" for link in links) if links else "No links found."

        return starter_part + links_part

    def extract_relevant_links(self) -> RelevantLinksDict:
        """
        Request the model to select relevant links for brochure creation.

        Returns:
            A dictionary with a 'links' array containing objects with 'type' and 'url'.
        """
        return self.ask(self.get_links_user_prompt())

    @staticmethod
    def _parse_links(output_text: str) -> RelevantLinksDict:
        """
        Parse the model reply into a dict that always has a list under 'links'.

        Raises:
            ValueError: If no JSON can be decoded from the reply
                (json.JSONDecodeError is a ValueError subclass).
        """
        text = output_text.strip()
        try:
            parsed = loads(text)
        except ValueError:
            # The reply may be wrapped in code fences or prose; retry on the
            # outermost {...} span and let a second failure propagate.
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = loads(text[start:end + 1])

        if isinstance(parsed, list):
            # A bare array is taken to be the list of links itself.
            parsed = {"links": parsed}
        if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
            return {"links": []}
        return parsed

    def ask(self, question: str) -> RelevantLinksDict:
        """
        Send a question to the model and parse the JSON response.

        Parameters:
            question: The prompt to submit.

        Returns:
            RelevantLinksDict: Parsed JSON containing selected links; 'links' is
            an empty list when the reply is valid JSON without a links array.

        Raises:
            ValueError: If the reply contains no decodable JSON.
        """
        self.history_manager.add_user_message(question)

        response: Response = self._ai_api.responses.create(
            model=self.config.model_name,
            instructions=self.history_manager.system_behavior,
            reasoning={ "effort": "low" },
            input=self.history_manager.chat_history
        )

        self.history_manager.add_assistant_message(response)
        return self._parse_links(response.output_text)
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
python-dotenv
|
||||||
|
openai
|
||||||
|
bs4
|
||||||
|
requests
|
||||||
|
rich
|
||||||
@@ -0,0 +1,286 @@
|
|||||||
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
||||||
|
from urllib.parse import ParseResult, urlparse
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
from requests import get, RequestException, Session
|
||||||
|
|
||||||
|
class Extractor:
    """
    Extracts and processes content from HTML response text using BeautifulSoup.

    Title, text and links are computed on first access and cached.
    """
    __soup: BeautifulSoup

    # Class-level defaults for the cached values; instances overwrite them.
    __extracted_title: str = ""
    __extracted_text: str = ""
    __extracted_links_on_page: list[str] | None = None

    def __init__(self, response_text_content: str) -> None:
        """
        Initializes the Extractor with HTML response text.

        Parameters:
            response_text_content (str): The HTML response text to be processed.
        """
        self.__soup = BeautifulSoup(response_text_content, "html.parser")
        self.__extracted_title = ""
        self.__extracted_text = ""
        self.__extracted_links_on_page = None

    @property
    def _soup(self) -> BeautifulSoup:
        """
        Returns the BeautifulSoup object for the HTML content.
        """
        return self.__soup

    @property
    def extracted_title(self) -> str:
        """
        Returns the extracted title from the HTML content.
        """
        if not self.__extracted_title:
            self.__extracted_title = self.get_title()
        return self.__extracted_title

    @property
    def extracted_text(self) -> str:
        """
        Returns the extracted main text content from the HTML, excluding irrelevant tags.
        """
        if not self.__extracted_text:
            self.__extracted_text = self.get_text()
        return self.__extracted_text

    @property
    def extracted_links_on_page(self) -> list[str]:
        """
        Return all href values found on the page.

        Notes:
            - Only anchor tags with an href are included.
            - Values are returned as-is (may be relative or absolute).
        """
        if self.__extracted_links_on_page is None:
            self.__extracted_links_on_page = [
                str(a.get("href"))
                for a in self._soup.find_all('a', href=True)
                if isinstance(a, Tag)
            ]
        return self.__extracted_links_on_page

    def get_title(self) -> str:
        """
        Extracts the title from the HTML content.

        Returns:
            The text of the <title> tag, or "No title" when there is none.
        """
        title_tag = self._soup.title
        return title_tag.get_text() if title_tag is not None else "No title"

    def get_text(self) -> str:
        """
        Extracts and cleans the main text content from the HTML, removing irrelevant tags.

        Returns:
            The page text with all whitespace runs collapsed to single spaces,
            or "No content" when nothing is left.
        """
        from copy import copy  # function-scope import keeps this block self-contained

        # Strip tags from a copy: decompose() is destructive, and doing it on the
        # shared soup would drop anchors nested in e.g. <form>/<figure>/<button>
        # from a later extracted_links_on_page call.
        soup = copy(self._soup)
        for irrelevant in soup.find_all(["script", "style", "img", "figure", "video", "audio", "button", "svg", "canvas", "input", "form", "meta"]):
            irrelevant.decompose()
        raw_text: str = soup.get_text(separator="\n")
        cleaned_text: str = " ".join(raw_text.split())
        return cleaned_text if cleaned_text else "No content"
|
||||||
|
|
||||||
|
class Website:
|
||||||
|
"""
|
||||||
|
A class to represent a website.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__DEFAULT_ALLOWED_DOMAINS: list[str] = [".com", ".org", ".net"]
|
||||||
|
|
||||||
|
__title: str = ""
|
||||||
|
__website_url: str = ""
|
||||||
|
__text: str = ""
|
||||||
|
__allowed_domains: list[str] = []
|
||||||
|
__links_on_page: list[str] | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def title(self) -> str:
|
||||||
|
"""
|
||||||
|
Returns the title of the website.
|
||||||
|
"""
|
||||||
|
return self.__title
|
||||||
|
|
||||||
|
@property
|
||||||
|
def text(self) -> str:
|
||||||
|
"""
|
||||||
|
Returns the main text content of the website.
|
||||||
|
"""
|
||||||
|
return self.__text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def website_url(self) -> str:
|
||||||
|
"""
|
||||||
|
Returns the URL of the website.
|
||||||
|
"""
|
||||||
|
return self.__website_url
|
||||||
|
|
||||||
|
@property
|
||||||
|
def links_on_page(self) -> list[str] | None:
|
||||||
|
"""
|
||||||
|
Returns the list of links extracted from the website.
|
||||||
|
"""
|
||||||
|
return self.__links_on_page
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _allowed_domains(self) -> list[str]:
|
||||||
|
"""
|
||||||
|
Returns the list of allowed domain suffixes.
|
||||||
|
"""
|
||||||
|
return self.__allowed_domains
|
||||||
|
|
||||||
|
@_allowed_domains.setter
|
||||||
|
def _allowed_domains(self, value: list[str] | str) -> None:
|
||||||
|
"""
|
||||||
|
Sets the list of allowed domain suffixes.
|
||||||
|
Filters out empty strings and ensures each suffix starts with a dot.
|
||||||
|
"""
|
||||||
|
if isinstance(value, str):
|
||||||
|
value = [
|
||||||
|
item.strip() if item.strip().startswith(".") else f".{item.strip()}"
|
||||||
|
for item in value.split(",")
|
||||||
|
if item.strip()
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
value = [
|
||||||
|
item if item.startswith(".") else f".{item}"
|
||||||
|
for item in value
|
||||||
|
if item
|
||||||
|
]
|
||||||
|
self.__allowed_domains = value
|
||||||
|
|
||||||
|
def _set_website_url(self, value: str) -> None:
|
||||||
|
"""
|
||||||
|
Protected: set the website URL after validating and fetch website data.
|
||||||
|
Use this from inside the class to initialize or change the URL.
|
||||||
|
"""
|
||||||
|
if not value:
|
||||||
|
raise ValueError("Website URL must be provided")
|
||||||
|
|
||||||
|
parsed_url: ParseResult = urlparse(value)
|
||||||
|
|
||||||
|
self._validate(parsed_url)
|
||||||
|
|
||||||
|
self.__website_url = value
|
||||||
|
self.__fetch_website_data()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def fetch_failed(self) -> bool:
|
||||||
|
"""
|
||||||
|
Returns whether the website data fetch failed.
|
||||||
|
"""
|
||||||
|
return self.__fetch_failed
|
||||||
|
|
||||||
|
def _validate(self, parsed_url: ParseResult) -> None:
|
||||||
|
"""
|
||||||
|
Validate the parsed URL.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
parsed_url: The parsed URL to validate.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the URL is missing parts, uses an invalid scheme,
|
||||||
|
points to a local/private address, or is not in allowed domains.
|
||||||
|
"""
|
||||||
|
if not parsed_url.netloc or parsed_url.scheme not in ("http", "https"):
|
||||||
|
raise ValueError("Website URL must be a valid URL")
|
||||||
|
|
||||||
|
if not parsed_url.hostname:
|
||||||
|
raise ValueError("Website URL must contain a valid hostname")
|
||||||
|
|
||||||
|
if self.__is_local_address(parsed_url.hostname):
|
||||||
|
raise ValueError("Website URL must not be a local address")
|
||||||
|
|
||||||
|
if not self.__is_allowed_domain(parsed_url.hostname):
|
||||||
|
raise ValueError("Website URL must be an allowed domain")
|
||||||
|
|
||||||
|
def __is_local_address(self, hostname: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if the given hostname is a local address.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
hostname (str): The hostname to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the hostname is a local address, False otherwise.
|
||||||
|
"""
|
||||||
|
if hostname in ("localhost", "127.0.0.1", "::1"):
|
||||||
|
return True
|
||||||
|
|
||||||
|
try:
|
||||||
|
ip: IPv4Address | IPv6Address = ip_address(hostname)
|
||||||
|
if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
|
||||||
|
return True
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __is_allowed_domain(self, hostname: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if the given hostname is an allowed domain.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
hostname (str): The hostname to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the hostname is an allowed domain, False otherwise.
|
||||||
|
"""
|
||||||
|
allowed_domains = [".com", ".org", ".net", ".io"]
|
||||||
|
return any(hostname.endswith(domain) for domain in allowed_domains)
|
||||||
|
|
||||||
|
def __fetch_website_data(self) -> None:
|
||||||
|
"""
|
||||||
|
Fetch website content and populate title, text, and links.
|
||||||
|
|
||||||
|
Side effects:
|
||||||
|
- Sets internal state: __title, __text, __links_on_page, __fetch_failed.
|
||||||
|
- Performs an HTTP GET with a browser-like User-Agent.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
get_fn = self.__session.get if self.__session else get
|
||||||
|
response = get_fn(
|
||||||
|
self.website_url,
|
||||||
|
timeout=10,
|
||||||
|
verify=True,
|
||||||
|
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"}
|
||||||
|
)
|
||||||
|
except RequestException as e:
|
||||||
|
self.__title = "Error"
|
||||||
|
self.__text = str(e)
|
||||||
|
self.__fetch_failed = True
|
||||||
|
return
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
extractor: Extractor = Extractor(response.text)
|
||||||
|
self.__title = extractor.extracted_title
|
||||||
|
self.__text = extractor.extracted_text
|
||||||
|
self.__links_on_page = extractor.extracted_links_on_page
|
||||||
|
else:
|
||||||
|
if response.status_code == 404:
|
||||||
|
self.__title = "Not Found"
|
||||||
|
self.__text = "The requested page was not found (404)."
|
||||||
|
else:
|
||||||
|
self.__title = "Error"
|
||||||
|
self.__text = f"Error: {response.status_code} - {response.reason}"
|
||||||
|
self.__fetch_failed = True
|
||||||
|
|
||||||
|
def __init__(self, website_url: str, allowed_domains: list[str] | str | None = None, session: Session | None = None) -> None:
|
||||||
|
"""
|
||||||
|
Initializes the Website object and fetches its data.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
website_url (str): The URL of the website to fetch.
|
||||||
|
allowed_domains (list[str] | str, optional): A list of allowed domain suffixes.
|
||||||
|
If a string is provided, it should be a comma-separated list of domain suffixes (e.g., ".com,.org,.net").
|
||||||
|
session (requests.Session | None, optional): Reused HTTP session for connection pooling.
|
||||||
|
"""
|
||||||
|
self.__fetch_failed: bool = False
|
||||||
|
self.__session: Session | None = session
|
||||||
|
if allowed_domains is None:
|
||||||
|
self._allowed_domains = self.__DEFAULT_ALLOWED_DOMAINS.copy()
|
||||||
|
else:
|
||||||
|
self._allowed_domains = allowed_domains
|
||||||
|
# Use protected setter internally so the public API exposes only the getter.
|
||||||
|
self._set_website_url(website_url)
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
"""
|
||||||
|
Returns a string representation of the Website object.
|
||||||
|
"""
|
||||||
|
return f"Website(title={self.title}, url={self.website_url})"
|
||||||
Reference in New Issue
Block a user