207 lines
8.6 KiB
Python
207 lines
8.6 KiB
Python
from ai_core import AICore
|
|
from ai_brochure_config import AIBrochureConfig
|
|
from extractor_of_relevant_links import ExtractorOfRelevantLinks
|
|
from website import Website
|
|
from openai.types.responses import Response
|
|
from rich.console import Console
|
|
from rich.markdown import Markdown
|
|
from requests import Session
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from json import loads
|
|
|
|
class BrochureCreator(AICore[str]):
    """
    Builds a short Markdown brochure for a company or individual by:
    - extracting relevant links from the website,
    - inferring the entity name and status,
    - and prompting the model using the collected page content.

    The two model calls (entity inference, then brochure generation) both go
    through `ask`, which appends every question to the chat history before
    calling the model. The final brochure prompt relies on that: it refers to
    the page excerpts that the inference call already placed in the history.
    """

    @property
    def _website(self) -> Website:
        """Return the main Website instance to analyze."""
        return self.__website

    @property
    def _extractor(self) -> ExtractorOfRelevantLinks:
        """Return the helper responsible for extracting relevant links."""
        return self.__extractor

    def __init__(self, config: AIBrochureConfig, website: Website) -> None:
        """
        Initialize the brochure creator with configuration and target website.

        Parameters:
            config: AI and runtime configuration.
            website: The root website to analyze and summarize.
        """
        system_behavior: str = (
            "You are an assistant that analyzes the contents of several relevant pages from a company website "
            "and creates a short brochure about the company for prospective customers, investors and recruits. "
            "Include details of company culture, customers and careers/jobs if information is available. "
        )
        super().__init__(config, system_behavior)
        self.__website: Website = website
        # The extractor receives the same config and root website as this object.
        self.__extractor: ExtractorOfRelevantLinks = ExtractorOfRelevantLinks(config, website)

    def create_brochure(self) -> str:
        """
        Create a short Markdown brochure based on the website's content.

        Returns:
            A Markdown string with the brochure, or a fallback message if no
            relevant pages were found.
        """
        relevant_pages: list[dict[str, str | Website]] = self._get_relevant_pages()
        if not relevant_pages:
            return "No relevant pages found to create a brochure."

        # The excerpts are sent to the model as part of the inference question,
        # which also stores them in the chat history for the final prompt.
        brochure_prompt_part: str = self._form_brochure_prompt(relevant_pages)
        inferred_company_name, inferred_status = self._infer_entity(brochure_prompt_part)

        full_brochure_prompt: str = self._form_full_prompt(inferred_company_name, inferred_status)
        return self.ask(full_brochure_prompt)

    def _get_relevant_pages(self) -> list[dict[str, str | Website]]:
        """
        Resolve relevant links into Website objects using a shared session and concurrency.

        At most MAX_PAGES links are fetched. Pages are returned in the same
        order as the extractor listed their links, so the resulting prompt
        does not depend on which download happened to finish first. Links
        that cannot be turned into a page are skipped.

        Returns:
            A list of {"type": <link type>, "page": <Website>} descriptors.
        """
        relevant_links: list[dict[str, str]] = self._extractor.extract_relevant_links()["links"]
        # Limit the number of pages to fetch to keep latency and token usage reasonable.
        MAX_PAGES: int = 6
        links_subset = relevant_links[:MAX_PAGES]
        if not links_subset:
            # Nothing to fetch: avoid creating a session and a thread pool.
            return []

        def build_page(item: dict[str, str], session: Session) -> dict[str, str | Website] | None:
            """Build one page descriptor, or return None when the link is unusable."""
            try:
                url = str(item["url"])
                page_type = str(item["type"])
                return {"type": page_type, "page": Website(url, session=session)}
            except Exception:
                # Deliberate best-effort: one malformed link or failed page
                # must not abort the whole brochure.
                return None

        # NOTE(review): one requests Session is shared by all worker threads;
        # confirm that this is acceptable for how Website uses the session.
        with Session() as session, ThreadPoolExecutor(max_workers=4) as executor:
            # executor.map yields results in input order (unlike as_completed).
            # build_page never raises, so consuming the iterator cannot raise either.
            fetched = list(executor.map(lambda link: build_page(link, session), links_subset))

        return [page for page in fetched if page is not None]

    def _truncate_text(self, text: str, limit: int) -> str:
        """
        Truncate text to at most 'limit' characters to reduce tokens and latency.

        Text that already fits is returned unchanged. Longer text is cut and
        suffixed with "... [truncated]". When 'limit' is too small to hold the
        marker, the text is cut hard at 'limit' characters instead, so the
        result never exceeds the limit. A limit of zero or less gives "".
        """
        TRUNCATION_MARKER: str = "... [truncated]"
        # Characters reserved at the end of the budget for the marker.
        RESERVED_CHARS: int = 20

        if len(text) <= limit:
            return text
        if limit < len(TRUNCATION_MARKER):
            # The marker alone would overflow the budget.
            return text[: max(0, limit)]
        return text[: max(0, limit - RESERVED_CHARS)] + TRUNCATION_MARKER

    def _form_brochure_prompt(self, relevant_pages: list[dict[str, str | Website]]) -> str:
        """
        Assemble a prompt that includes the main page and relevant pages' titles and text.

        Entries whose "page" is not a Website, or whose fetch failed, are left out.

        Parameters:
            relevant_pages: List of page descriptors returned by _get_relevant_pages.

        Returns:
            A prompt string containing quoted sections per page.
        """
        QUOTE_DELIMITER: str = "\n\"\"\"\n"
        MAX_MAIN_CHARS = 6000
        MAX_PAGE_CHARS = 3000

        sections: list[str] = [
            f"Main page:{QUOTE_DELIMITER}"
            f"Title: {self._website.title}\n"
            f"Text:\n{self._truncate_text(self._website.text, MAX_MAIN_CHARS)}{QUOTE_DELIMITER}\n"
        ]

        for entry in relevant_pages:
            page = entry["page"]
            if not isinstance(page, Website) or page.fetch_failed:
                continue
            sections.append(
                f"{entry['type']}:{QUOTE_DELIMITER}"
                f"Title: {page.title}\n"
                f"Text:\n{self._truncate_text(page.text, MAX_PAGE_CHARS)}{QUOTE_DELIMITER}\n"
            )

        return "".join(sections)

    def _infer_entity(self, brochure_prompt_part: str) -> tuple[str, str]:
        """
        Infer both the entity name and status in a single model call to reduce latency.

        The reply is expected to be a JSON object. Because a model may wrap the
        object in Markdown code fences or add surrounding prose, the text from
        the first '{' to the last '}' is parsed when such a span exists.

        Returns:
            (name, status) where status is 'company' or 'individual'. A missing
            or empty name becomes "Unknown"; an unrecognized status becomes
            "company". If no JSON object can be parsed, the stripped reply is
            used as the name and the status is "company".
        """
        prompt = (
            "From the following website excerpts, infer the entity name and whether it is a company or an individual. "
            "Respond strictly as JSON with keys 'name' and 'status' (status must be 'company' or 'individual').\n"
            f"{brochure_prompt_part}"
        )
        raw = self.ask(prompt)
        reply = raw.strip()

        # Isolate the JSON object from any code fences or extra wording.
        start, end = reply.find("{"), reply.rfind("}")
        candidate = reply[start:end + 1] if 0 <= start < end else reply

        try:
            data = loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a subclass of ValueError.
            data = None

        if not isinstance(data, dict):
            # Fallback: use entire output as name, assume company
            return reply or "Unknown", "company"

        # 'or ""' keeps a JSON null from turning into the literal string "None".
        name: str = str(data.get("name") or "").strip() or "Unknown"
        status: str = str(data.get("status") or "").strip().lower()
        if status not in ("company", "individual"):
            status = "company"
        return name, status

    def _form_full_prompt(self, inferred_company_name: str, inferred_status: str) -> str:
        """
        Build the final brochure-generation prompt using the inferred entity and prior history.

        The prompt itself carries no page content; it points the model at the
        excerpts that an earlier `ask` call stored in the chat history.

        Parameters:
            inferred_company_name: The inferred entity name.
            inferred_status: Either 'company' or 'individual'.

        Returns:
            A final prompt instructing the model to produce a Markdown brochure.
        """
        full_prompt: str = (
            f"You are looking at a {inferred_status} called {inferred_company_name}, to whom website {self._website.website_url} belongs.\n"
            f"Build a short brochure about the {inferred_status}. Use the information from the website that is already stored in the history.\n"
            "Your response must be in a Markdown format."
        )
        return full_prompt

    def ask(self, question: str) -> str:
        """
        Send a question to the model, update chat history, and return the text output.

        The question is added to the history first, so the request sent to the
        model contains the whole conversation so far, including this question.

        Parameters:
            question: The user prompt.

        Returns:
            The model output text.
        """
        self.history_manager.add_user_message(question)
        response: Response = self._ai_api.responses.create(
            model=self.config.model_name,
            instructions=self.history_manager.system_behavior,
            input=self.history_manager.chat_history,
            reasoning={"effort": "low"},
        )
        # The full response object (not only its text) is handed to the history manager.
        self.history_manager.add_assistant_message(response)
        return response.output_text
|
|
|
|
# Module-level rich console used by display_markdown and show_summary for all output.
console: Console = Console()
|
|
|
|
def display_markdown(content: str) -> None:
    """
    Print the given text to the module-level console, rendered as Markdown by rich.

    Parameters:
        content: The Markdown source to render.
    """
    rendered: Markdown = Markdown(content)
    console.print(rendered)
|
|
|
|
def show_summary(summary: str) -> None:
    """
    Show a summary on the console.

    A non-empty summary is rendered as Markdown; an empty one produces a
    plain fallback message instead.

    Parameters:
        summary: The Markdown summary text, possibly empty.
    """
    if not summary:
        console.print("No summary found.")
        return
    display_markdown(summary)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build a brochure for one site and print it.
    # Replace the placeholder address below before running.
    target_site: Website = Website("<put your site address here>")
    creator: BrochureCreator = BrochureCreator(AIBrochureConfig(), target_site)
    display_markdown(creator.create_brochure())