This script prompts the user for a URL, scrapes the page with the Playwright framework, and summarizes its visible text using the OpenAI GPT-4o API. The summary is saved as a Markdown (.md) file, providing a clean and accessible format.

Key features:
- Prompts the user for a URL at runtime
- Uses Playwright to scrape the page content
- Extracts visible text with BeautifulSoup
- Summarizes the content using OpenAI's chat model
- Saves the output to a user-friendly Markdown file

This contribution supports browser-based content summarization and expands the repo’s AI toolset for web interaction tasks.
56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
import os
|
|
import openai
|
|
from IPython.display import Markdown, display
|
|
from dotenv import load_dotenv
|
|
from playwright.sync_api import sync_playwright
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Run load_dotenv() before reading the environment so that any values it
# loads are visible, then hand the API key (None when unset) to openai.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
|
|
def scrape_website(url):
    """Return the HTML of *url* as loaded by headless Chromium.

    Args:
        url: Address handed to Playwright's ``page.goto``.

    Returns:
        The string returned by ``page.content()`` after navigation.

    Raises:
        Any error raised by Playwright while launching the browser or
        navigating propagates to the caller unchanged.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            return page.content()
        finally:
            # Close the browser even when navigation or content retrieval
            # fails, so a bad URL does not skip the cleanup step.
            browser.close()
|
|
|
|
def summarize_content(html_content):
    """Summarize the text of an HTML document as Markdown using GPT-4o.

    Args:
        html_content: HTML source of the page to summarize.

    Returns:
        The model's reply as a string; an empty string when the reply
        carries no text content.
    """
    # Reduce the markup to its text so the model is not fed raw HTML.
    soup = BeautifulSoup(html_content, 'html.parser')
    page_text = soup.get_text(separator=' ', strip=True)

    system_prompt = "You summarize html content as markdown."
    user_prompt = (
        "You are a helpful assistant. Summarize the following HTML webpage content in markdown with simple terms:\n\n"
        + page_text
    )

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            # The system prompt was previously built but never sent.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # message.content may be None; always hand callers a string.
    return response.choices[0].message.content or ""
|
|
|
|
def save_markdown(summary, filename="summary.md", url=None):
    """Write *summary* to *filename* as a small Markdown document.

    The file starts with a level-1 heading: a link to *url* when one is
    given, otherwise a plain "Summary" title. The summary follows with
    surrounding whitespace stripped and a single trailing newline.

    Args:
        summary: Summary text; ``None`` or blank text yields a file that
            contains only the heading.
        filename: Path of the file to create or overwrite (UTF-8).
        url: Optional source URL shown (and linked) in the heading.
    """
    heading = f"# Summary of [{url}]({url})\n\n" if url else "# Summary\n\n"
    # Tolerate a missing summary instead of failing on None.strip().
    body = (summary or "").strip()

    with open(filename, "w", encoding="utf-8") as f:
        f.write(heading)
        if body:
            # End the file with a newline so it is a well-formed text file.
            f.write(body + "\n")
|
|
|
|
# 4. Main Logic
|
|
def main():
    """Prompt for a URL, summarize that page, and save it to summary.md.

    Raises:
        SystemExit: If the user enters nothing at the prompt.
    """
    url = input("Enter the URL to summarize: ").strip()
    if not url:
        raise SystemExit("No URL entered; nothing to summarize.")

    # Playwright's page.goto expects a URL with a scheme, so treat bare
    # input such as "example.com" as an https address.
    if "://" not in url and not url.startswith(("about:", "data:")):
        url = "https://" + url

    html = scrape_website(url)
    summary = summarize_content(html)
    save_markdown(summary, filename="summary.md", url=url)
    print("✅ Summary saved to summary.md")


# Entry point
if __name__ == "__main__":
    main()