164 lines
5.9 KiB
Plaintext
164 lines
5.9 KiB
Plaintext
# imports
|
|
|
|
import os
|
|
import requests
|
|
import json
|
|
from typing import List
|
|
from dotenv import load_dotenv
|
|
from bs4 import BeautifulSoup
|
|
from IPython.display import Markdown, display, update_display
|
|
from openai import OpenAI
|
|
|
|
|
|
# Initialize and constants
|
|
|
|
# Module-level setup: environment first, then the shared model name and client.
# load_dotenv() runs before the key lookup so that anything it loads is visible
# to the lookup below (presumably values from a local .env file - confirm).
load_dotenv()

# Keep an already-configured key untouched; only when the variable is absent
# does the placeholder get installed.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Chat model used by every completion request in this file.
MODEL = 'gpt-4o-mini'

# Shared client instance used by get_links() and stream_brochure().
openai = OpenAI()
|
|
|
|
|
|
# A class to represent a Webpage
|
|
|
|
class Website:
    """A fetched web page reduced to its title, visible text and links.

    The page is downloaded and parsed once, in ``__init__``.

    Attributes:
        url: The address that was requested.
        title: Text of the ``<title>`` element, or ``"No title found"``.
        body: Raw response payload exactly as returned by ``requests``.
        text: Text of ``<body>`` with script/style/img/input elements removed,
            one fragment per line; empty when the page has no ``<body>``.
        links: Every non-empty ``href`` found on ``<a>`` elements, unmodified
            (so relative links stay relative).
    """

    url: str
    title: str
    body: bytes
    text: str
    links: List[str]

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract title, text and links.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before ``requests`` gives
                up. Without it a stalled server would block the call forever.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``soup.title.string`` is None when <title> is empty or wraps other
        # tags; fall back so the literal word "None" never reaches a prompt.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one prompt-ready section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
|
|
|
|
# System prompt for the link-selection request made by get_links().
# It asks the model to categorise every link and then to pick the ones that
# belong in a brochure. Code downstream reads the "brochure_links" list and
# the "type" / "url" keys of its entries, so the example must keep those names.
# The links themselves arrive in the user message, so no placeholder for them
# appears here.
link_system_prompt = """
You are provided with a list of links found on a webpage. Your task is to first categorize each link into one of the following categories:
- about page
- careers page
- terms of service
- privacy policy
- contact page
- other (please specify).

Once the links are categorized, please choose which links are most relevant to include in a brochure about the company.
The brochure should only include links such as About pages, Careers pages, or Company Overview pages. Exclude any links related to Terms of Service, Privacy Policy, or email addresses.

Respond in the following JSON format:
{
    "categorized_links": [
        {"category": "about page", "url": "https://full.url/about"},
        {"category": "careers page", "url": "https://full.url/careers"},
        {"category": "terms of service", "url": "https://full.url/terms"},
        {"category": "privacy policy", "url": "https://full.url/privacy"},
        {"category": "contact page", "url": "https://full.url/contact"},
        {"category": "other", "specify": "blog", "url": "https://full.url/blog"}
    ],
    "brochure_links": [
        {"type": "about page", "url": "https://full.url/about"},
        {"type": "careers page", "url": "https://full.url/careers"}
    ]
}

The list of links (some may be relative links) is provided in the user message.
"""
|
|
|
|
def get_links_user_prompt(website):
    """Build the user message that asks the model to pick brochure links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of
            str), e.g. a ``Website`` instance.

    Returns:
        The instruction text followed by the links, one per line.
    """
    # Adjacent string literals are used instead of a backslash continuation
    # inside one literal, so the text no longer depends on how the next
    # source line happens to be indented.
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    return instructions + "\n".join(website.links)
|
|
|
|
def get_links(url):
    """Ask the model which links on ``url`` belong in a company brochure.

    Fetches the page, sends its links to the chat model together with the
    link-selection system prompt, and requests a JSON-object response.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply decoded with ``json.loads``.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    reply = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(reply.choices[0].message.content)
|
|
|
|
|
|
from urllib.parse import urljoin
|
|
|
|
def get_all_details(url):
    """Collect the text of the landing page and of every brochure link.

    Args:
        url: Landing-page address; also the base against which relative
            brochure links are resolved.

    Returns:
        One string: a "Landing page:" section followed by one labelled
        section per brochure link that carries a usable URL.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]

    links = get_links(url)  # parsed JSON produced by the model

    # The list is model output: tolerate a missing key or an explicit null.
    brochure_links = links.get('brochure_links') or []
    print("Found Brochure links:", brochure_links)  # debug trace of what was selected

    for link in brochure_links:
        # Skip malformed entries instead of aborting the whole brochure with
        # a KeyError on model-generated data.
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue

        # The prompt's examples use "type" for brochure links and "category"
        # for categorised links; accept either, then fall back to a generic
        # label.
        label = link.get("type") or link.get("category") or "relevant page"
        sections.append(f"\n\n{label}:\n")

        # Resolve relative links against the landing page before fetching.
        full_url = urljoin(url, target)
        sections.append(Website(full_url).get_contents())

    return "".join(sections)
|
|
|
|
|
|
# System prompt for the brochure-writing request made by stream_brochure().
# It is assembled from adjacent literals with explicit "\n" separators.
# A backslash continuation inside a single literal joins lines with nothing
# in between, which would fuse the section names into one unreadable run
# ("About UsWhat we doHow We Do It...").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown.\n"
    "Include details of company culture, customers and careers/jobs if you have the information.\n"
    "Structure the brochure to include specific sections as follows:\n"
    "- About Us\n"
    "- What we do\n"
    "- How We Do It\n"
    "- Where We Do It\n"
    "- Our People\n"
    "- Our Culture\n"
    "- Connect with Us\n"
    "Please provide two versions of the brochure, the first in English, the second in Spanish. "
    "The contents of the brochure are to be the same for both languages."
)
|
|
|
|
def get_brochure_user_prompt(company_name, url):
    """Build the user message for the brochure request.

    Args:
        company_name: Name of the company, quoted in the first line.
        url: Landing-page address passed on to ``get_all_details``.

    Returns:
        The introduction followed by the collected page contents, cut to at
        most 20,000 characters.
    """
    introduction = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    prompt = introduction + get_all_details(url)
    # Cap the prompt length; anything past 20,000 characters is dropped.
    return prompt[:20_000]
|
|
|
|
def stream_brochure(company_name, url):
    """Generate a brochure and render it live as the model streams it.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing-page address used to gather the source material.

    Returns:
        None. The brochure is shown through an IPython display handle that is
        updated after every received chunk.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )

    raw = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # NOTE(review): a streamed chunk may carry no choices (the API
        # reference describes this for usage-only chunks); skip it rather
        # than fail on choices[0].
        if not chunk.choices:
            continue
        raw += chunk.choices[0].delta.content or ''

        # Strip only the code-fence markers the model may wrap its answer in.
        # Cleaning is applied to a copy of the full raw text, so a fence split
        # across chunks is still removed, and the word "markdown" is left
        # alone when it is ordinary brochure text.
        cleaned = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)
|
|
|
|
# Example run: stream a brochure for Anthropic built from its public website.
stream_brochure(company_name="Anthropic", url="https://anthropic.com")
|