Merge pull request #531 from davidzywiec/PlayWright_Contribution

Add web scraping and summarization script using Playwright and OpenAI
This commit is contained in:
Ed Donner
2025-07-25 10:06:13 -04:00
committed by GitHub
2 changed files with 90 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
import os
import openai
from IPython.display import Markdown, display
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
# Load environment variables through python-dotenv (typically from a .env
# file), then pass the OpenAI key from the environment to the client library.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")  # Or set it directly
def scrape_website(url):
    """Fetch the HTML of *url* with a headless Chromium browser.

    Args:
        url: Address to navigate to; passed unchanged to ``page.goto``.

    Returns:
        The page markup as returned by ``page.content()``.

    Raises:
        Whatever Playwright raises when launching the browser, navigating,
        or reading the page content; the browser is closed before the
        exception propagates.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            return page.content()
        finally:
            # Close the browser even when navigation or content retrieval
            # fails, so a failed scrape does not skip the cleanup.
            browser.close()
def summarize_content(html_content):
    """Summarize a web page's text as markdown using the OpenAI chat API.

    Args:
        html_content: HTML markup of the page to summarize.

    Returns:
        The model's reply text, or an empty string when the reply carries
        no content.
    """
    # Reduce the markup to the text BeautifulSoup extracts from it, joined
    # with single spaces, so the prompt is not filled with tags.
    soup = BeautifulSoup(html_content, 'html.parser')
    page_text = soup.get_text(separator=' ', strip=True)

    system_prompt = "You summarize html content as markdown."
    user_prompt = (
        "You are a helpful assistant. Summarize the following HTML webpage content in markdown with simple terms:\n\n"
        + page_text
    )

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            # The system prompt was previously built but never sent.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # message.content may be None; callers expect a string they can strip().
    return response.choices[0].message.content or ""
def save_markdown(summary, filename="summary.md", url=None):
    """Write *summary* to *filename* as UTF-8 markdown under a heading.

    Args:
        summary: Summary text; surrounding whitespace is stripped before
            it is written.
        filename: Path of the file to create or overwrite.
        url: Optional source URL. When truthy, the heading links to it;
            otherwise a plain "Summary" heading is written.
    """
    with open(filename, "w", encoding="utf-8") as out:
        heading = f"# Summary of [{url}]({url})\n\n" if url else "# Summary\n\n"
        out.write(heading)
        out.write(summary.strip())
# Main logic
def main():
    """Prompt for a URL, scrape it, summarize it, and save the summary.

    Reads the URL from standard input. When nothing is entered, prints a
    notice and returns without launching a browser or calling the API.
    Otherwise the summary is written to ``summary.md``.
    """
    url = input("Enter the URL to summarize: ").strip()
    if not url:
        # An empty address would only fail later inside the browser
        # navigation; stop early with a clear message instead.
        print("No URL entered; nothing to summarize.")
        return

    output_file = "summary.md"
    html = scrape_website(url)
    summary = summarize_content(html)
    save_markdown(summary, filename=output_file, url=url)
    print(f"✅ Summary saved to {output_file}")
# Entry point: run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,34 @@
# Summary of [https://www.willwight.com/](https://www.willwight.com/)
# Will Wight - New York Times Best-Selling Author
### Overview
Will Wight is a renowned author known for the "Cradle" series, alongside other works like "The Last Horizon" and "The Traveler's Gate Trilogy." He combines humor and storytelling in his blog and engages actively with his readers.
### Books
- **The Last Horizon**: Currently ongoing series.
- **Cradle**: A 12-book series, now complete.
- **The Traveler's Gate Trilogy**: Completed series.
- **The Elder Empire**: Consists of two trilogies with stories happening simultaneously, totaling 6 books.
### Recent Highlights
- **The Pilot Release**: The fourth book in "The Last Horizon" series, whose release was celebrated on July 4th, 2025. It is Will's 26th book, a milestone he marks by noting that his next book will be his 27th.
- **Barnes & Noble Success**: Will's books are now stocked nationwide in Barnes & Noble, a significant achievement and a breakthrough for indie publishing.
### Blog Highlights
- Will shares personal anecdotes and behind-the-scenes insights into his creative process.
- A humorous tone is used, including whimsical stories about his life and writing challenges.
- He recounts recent experiences at the Epic Universe theme park, with thoughts on its design and offerings.
### Connect
- **Mailing List**: Over 15,000 fans subscribe to receive updates on new stories and releases.
- **Hidden Gnome Publishing**: The entity behind Will's publications, working to bring his books to wider audiences.
### Extras
- **Merch**: Available for fans wanting to support and connect with Will's universe.
- **Podcast**: Offers sneak peeks, discussions, and insights into Will's works.
### Humorous Note
Will humorously describes himself transforming into a "monstrous mongoose" during a full moon, adding a quirky touch to his persona.
For more detailed information on books, blogs, and extras, visit Will's website and explore his engaging world of storytelling!