Files
David Zywiec afb94de271 Add web scraping and summarization script using Playwright and OpenAI
This script allows users to input a URL, scrape the visible content using the Playwright framework, and summarize it using the OpenAI GPT-4o API. The summarized output is saved as a Markdown (.md) file, providing a clean and accessible format.

Key features:
- Prompts user for a URL at runtime
- Uses Playwright to scrape the page content
- Extracts visible text with BeautifulSoup
- Summarizes content using OpenAI's chat model
- Saves output to a user-friendly Markdown file

This contribution supports browser-based content summarization and expands the repo’s AI toolset for web interaction tasks.
2025-07-18 15:04:38 -05:00

56 lines
1.8 KiB
Python

import os
import openai
from IPython.display import Markdown, display
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
# Load variables from a local .env file (if one exists) into the process
# environment. This has to run before the key is read on the next line.
load_dotenv()
# Take the OpenAI API key from the OPENAI_API_KEY environment variable;
# os.getenv yields None when it is unset. Keep the key in the environment or
# the .env file rather than hard-coding it in this source file.
openai.api_key = os.getenv("OPENAI_API_KEY")
def scrape_website(url):
    """Return the HTML of *url* as loaded by a headless Chromium browser.

    A headless Chromium instance is launched through Playwright, a new page
    is navigated to ``url`` and the page's HTML (``page.content()``) is
    returned.

    Args:
        url: Address to load; passed unchanged to ``page.goto``.

    Returns:
        The HTML string reported by ``page.content()``.

    Raises:
        Whatever Playwright raises on launch or navigation failure; the
        browser is closed before the exception propagates.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            return page.content()
        finally:
            # Close the browser even when navigation or content retrieval
            # fails, so a bad URL does not skip the explicit shutdown.
            browser.close()
def summarize_content(html_content):
    """Summarize the text of an HTML document as Markdown using gpt-4o.

    The HTML is parsed with BeautifulSoup's ``html.parser``, its text is
    collapsed into a single space-separated string, and that string is sent
    to the OpenAI chat completions endpoint together with a system prompt.

    Args:
        html_content: HTML markup of the page to summarize.

    Returns:
        The text of the first completion choice, or an empty string when the
        API returns no text content.
    """
    # Keep only the text parts of the webpage.
    soup = BeautifulSoup(html_content, 'html.parser')
    page_text = soup.get_text(separator=' ', strip=True)

    system_prompt = "You summarize html content as markdown."
    user_prompt = (
        "You are a helpful assistant. Summarize the following HTML webpage content in markdown with simple terms:\n\n"
        + page_text
    )
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            # The system prompt was previously built but never sent.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # message.content may be None; return "" so callers can treat the result
    # as a string unconditionally.
    return response.choices[0].message.content or ""
def save_markdown(summary, filename="summary.md", url=None):
    """Write *summary* to *filename* as a Markdown document with a heading.

    The file is overwritten. When ``url`` is truthy the heading links to it
    (``# Summary of [url](url)``); otherwise the heading is ``# Summary``.
    The summary is stripped of surrounding whitespace and, when non-empty,
    written followed by a single trailing newline.

    Args:
        summary: Markdown text to save; ``None`` or an empty string produces
            a file containing only the heading.
        filename: Destination path, written as UTF-8.
        url: Optional source URL to show in the heading.
    """
    # Prepare all text before opening the file, so a bad ``summary`` cannot
    # leave a truncated, heading-only file behind.
    body = (summary or "").strip()
    heading = f"# Summary of [{url}]({url})\n\n" if url else "# Summary\n\n"

    with open(filename, "w", encoding="utf-8") as f:
        f.write(heading)
        if body:
            # End the file with a newline, as text tools expect.
            f.write(body + "\n")
# Main logic
def main():
    """Prompt for a URL, scrape it, summarize it and save ``summary.md``.

    The entered URL is stripped of surrounding whitespace. An empty entry
    aborts with an error message, and an entry without a ``scheme://`` prefix
    is assumed to be HTTPS, because Playwright's ``page.goto`` needs a scheme.

    Raises:
        SystemExit: If no URL was entered.
    """
    url = input("Enter the URL to summarize: ").strip()
    if not url:
        raise SystemExit("No URL entered; nothing to summarize.")
    if "://" not in url:
        # Accept bare hosts such as "example.com".
        url = f"https://{url}"

    html = scrape_website(url)
    summary = summarize_content(html)
    save_markdown(summary, filename="summary.md", url=url)
    print("✅ Summary saved to summary.md")
# Entry point: run the interactive workflow only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()