Add web scraping and summarization script using Playwright and OpenAI
This script prompts the user for a URL, scrapes the page content using the Playwright framework, and summarizes it using the OpenAI GPT-4o API. The summarized output is saved as a Markdown (.md) file, providing a clean and accessible format.

Key features:
- Prompts the user for a URL at runtime
- Uses Playwright to scrape the page content
- Extracts visible text with BeautifulSoup
- Summarizes content using OpenAI's chat model
- Saves output to a user-friendly Markdown file

This contribution supports browser-based content summarization and expands the repo's AI toolset for web interaction tasks.
This commit is contained in:
@@ -0,0 +1,56 @@
|
||||
import os
|
||||
import openai
|
||||
from IPython.display import Markdown, display
|
||||
from dotenv import load_dotenv
|
||||
from playwright.sync_api import sync_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Load variables from a local .env file (if one exists) before the key is
# read from the environment below.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset. Keep the key in the
# environment or .env file rather than hard-coding it in source control.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
def scrape_website(url):
    """Return the HTML of ``url`` as loaded in headless Chromium.

    Args:
        url: Address to navigate to; passed unchanged to ``page.goto``.

    Returns:
        The page markup as returned by ``page.content()``.

    Raises:
        Any exception Playwright raises while launching the browser or
        navigating is propagated; the browser is still closed first.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            return page.content()
        finally:
            # Close the browser on every path, including when new_page(),
            # goto() or content() raises.
            browser.close()
|
||||
|
||||
def summarize_content(html_content):
    """Summarize a web page's text as Markdown via the OpenAI chat API.

    Args:
        html_content: HTML markup of the page.

    Returns:
        The message content of the first choice in the chat completion
        response.
    """
    # Reduce the markup to its text, with fragments joined by single spaces.
    soup = BeautifulSoup(html_content, 'html.parser')
    summary_text = soup.get_text(separator=' ', strip=True)

    system_prompt = "You summarize html content as markdown."
    user_prompt = (
        "You are a helpful assistant. Summarize the following HTML webpage content in markdown with simple terms:\n\n"
        + summary_text
    )
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            # The system prompt used to be built but never sent; pass it
            # along so the model actually receives the instruction.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return response.choices[0].message.content
|
||||
|
||||
def save_markdown(summary, filename="summary.md", url=None):
    """Write a summary to a Markdown file under a level-1 heading.

    Args:
        summary: Summary text; leading and trailing whitespace is stripped.
            ``None`` is treated as an empty summary, so only the heading is
            written.
        filename: Path of the file to create or overwrite (UTF-8).
        url: Optional source URL. When truthy, the heading links to it;
            otherwise a plain "Summary" heading is used.
    """
    heading = f"# Summary of [{url}]({url})\n\n" if url else "# Summary\n\n"
    # Normalise the text before opening the file: a None summary used to
    # raise AttributeError after the target file had already been truncated.
    body = (summary or "").strip()
    with open(filename, "w", encoding="utf-8") as f:
        f.write(heading + body)
|
||||
|
||||
# 4. Main Logic
|
||||
def main():
    """Prompt for a URL, scrape it, summarize it, and save the summary.

    Reads the URL from standard input, writes the result to ``summary.md``
    in the current working directory, and prints a confirmation message.
    Returns without doing any work when no URL is entered.
    """
    url = input("Enter the URL to summarize: ").strip()
    if not url:
        # Nothing to navigate to; stop before launching a browser.
        print("No URL entered; nothing to summarize.")
        return
    if "://" not in url:
        # Navigation needs an absolute URL; assume HTTPS when the user
        # types a bare host such as "example.com".
        url = "https://" + url

    html = scrape_website(url)
    summary = summarize_content(html)
    save_markdown(summary, filename="summary.md", url=url)
    print("✅ Summary saved to summary.md")
|
||||
|
||||
# 5. Entry Point
# Run the interactive workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user