{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "# beautifulsoup4 provides the `bs4` module imported below.\n",
    "%pip install selenium beautifulsoup4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "935fe7b1-1807-4f75-863d-4c118e425a19",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip show selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eabbbc62-1de1-4883-9b3e-9c90145ea6c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.edge.options import Options as EdgeOptions  # Import EdgeOptions\n",
    "from selenium.webdriver.edge.service import Service as EdgeService  # Import EdgeService\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "import os\n",
    "\n",
    "\n",
    "class Website:\n",
    "    \"\"\"Load a page in Microsoft Edge via Selenium and extract its title and text.\n",
    "\n",
    "    Attributes set by the constructor:\n",
    "        url: the URL that was requested.\n",
    "        wait_time: seconds slept after navigation before reading the page source.\n",
    "        title: string content of the page's <title>, or \"No title found\".\n",
    "        text: newline-separated text of <body> after removing script, style,\n",
    "            img and input tags; empty string when the document has no <body>.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url, driver_path=None, wait_time=3, headless=False):\n",
    "        \"\"\"Open `url` in Edge, wait, then parse the rendered HTML.\n",
    "\n",
    "        Args:\n",
    "            url: page to load.\n",
    "            driver_path: optional path to the msedgedriver executable. When\n",
    "                falsy, EdgeService() is created without an explicit path.\n",
    "            wait_time: seconds to sleep after navigation so the page can\n",
    "                finish loading before its source is read.\n",
    "            headless: when True, pass --headless to Edge. Defaults to False,\n",
    "                which keeps the previous behaviour (headless was disabled).\n",
    "        \"\"\"\n",
    "        self.url = url\n",
    "        self.wait_time = wait_time\n",
    "\n",
    "        # Edge settings\n",
    "        options = EdgeOptions()  # Use EdgeOptions\n",
    "        if headless:\n",
    "            options.add_argument(\"--headless\")\n",
    "        options.add_argument(\"--disable-gpu\")\n",
    "        options.add_argument(\"--no-sandbox\")\n",
    "        # Chromium-based browsers take the size as \"width,height\".\n",
    "        options.add_argument(\"--window-size=1920,1080\")\n",
    "\n",
    "        # Driver path\n",
    "        if driver_path:\n",
    "            # For Edge, you might need to specify the path to msedgedriver\n",
    "            # For driver download, https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads\n",
    "            service = EdgeService(executable_path=driver_path)  # Use EdgeService\n",
    "        else:\n",
    "            # If msedgedriver is in your system's PATH, you can omit executable_path\n",
    "            service = EdgeService()\n",
    "\n",
    "        # Start browser\n",
    "        # Use webdriver.Edge() for Microsoft Edge\n",
    "        driver = webdriver.Edge(service=service, options=options)\n",
    "        try:\n",
    "            driver.get(url)\n",
    "\n",
    "            # Wait for the page to load\n",
    "            time.sleep(self.wait_time)\n",
    "\n",
    "            # Take page source\n",
    "            html = driver.page_source\n",
    "        finally:\n",
    "            # Always close the browser, even if navigation fails, so no\n",
    "            # Edge/msedgedriver process is left running.\n",
    "            driver.quit()\n",
    "\n",
    "        # Analysis with BeautifulSoup\n",
    "        soup = BeautifulSoup(html, 'html.parser')\n",
    "        # soup.title.string is None for an empty <title> or one with nested tags.\n",
    "        has_title = soup.title is not None and soup.title.string\n",
    "        self.title = soup.title.string if has_title else \"No title found\"\n",
    "\n",
    "        body = soup.body\n",
    "        if body is None:\n",
    "            # No <body> in the parsed document: nothing to extract.\n",
    "            self.text = \"\"\n",
    "            return\n",
    "\n",
    "        # Clean irrelevant tags\n",
    "        for irrelevant in body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "            irrelevant.decompose()\n",
    "\n",
    "        self.text = body.get_text(separator=\"\\n\", strip=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the EDGE_DRIVER_PATH environment variable to the msedgedriver executable.\n",
    "# When it is unset, driver_path is None and Website creates EdgeService()\n",
    "# without an explicit path.\n",
    "driver_path = os.environ.get(\"EDGE_DRIVER_PATH\")\n",
    "\n",
    "site = Website(\"https://openai.com\", driver_path=driver_path)\n",
    "print(\"Title:\", site.title)\n",
    "print(\"\\nFirst 500 character:\\n\", site.text[:500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}