# Python source of the notebook
# week1/community-contributions/day1_selenium_microsoftedgedriver.ipynb
# (kernel: "Python 3 (ipykernel)", Python 3.11.13).
# Each "# %%" marker below corresponds to one code cell of that notebook.

# %%
# Notebook shell command (run once to install the browser-automation package):
# !pip install selenium

# %%
# Notebook command (shows the installed selenium version and location):
# pip show selenium

# %%
from selenium import webdriver
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeService
from bs4 import BeautifulSoup
import time
import os


class Website:
    """Snapshot of a web page rendered by Microsoft Edge through Selenium.

    Creating an instance starts an Edge browser, loads ``url``, sleeps for
    ``wait_time`` seconds so the page can finish loading, reads the page
    source and closes the browser again. The captured HTML is then parsed
    with BeautifulSoup.

    Attributes:
        url: The address that was loaded.
        wait_time: Seconds slept between navigation and reading the page.
        title: Text of the ``<title>`` element, or ``"No title found"`` when
            the page has no title or the title is empty.
        text: Text of the ``<body>`` element, one fragment per line, with
            ``script``, ``style``, ``img`` and ``input`` elements removed.
            Empty string when the document has no ``<body>``.
    """

    _NO_TITLE = "No title found"
    _IRRELEVANT_TAGS = ("script", "style", "img", "input")

    def __init__(
        self,
        url: str,
        driver_path: str | None = None,
        wait_time: float = 3,
        *,
        headless: bool = False,
    ) -> None:
        """Load ``url`` in Edge and extract its title and body text.

        Args:
            url: Page to load.
            driver_path: Path to the ``msedgedriver`` executable. When
                omitted, ``EdgeService()`` is created without a path and
                the driver has to be discoverable by Selenium (for example
                through the system ``PATH``).
            wait_time: Seconds to sleep after navigation before the page
                source is read.
            headless: When true, pass ``--headless`` to Edge so no browser
                window is opened. Defaults to ``False``, i.e. a visible
                browser window.

        Raises:
            selenium.common.exceptions.WebDriverException: Presumably raised
                by Selenium when the browser cannot be started or the page
                cannot be loaded; the browser is closed before the
                exception propagates.
        """
        self.url = url
        self.wait_time = wait_time

        html = self._fetch_html(driver_path, headless)
        self.title, self.text = self._parse(html)

    def _fetch_html(self, driver_path: str | None, headless: bool) -> str:
        """Return the page source of ``self.url`` as rendered by Edge."""
        options = EdgeOptions()
        if headless:
            options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Chromium-based browsers take the size as "width,height"; the
        # "1920x1080" spelling is not the documented form of this switch.
        options.add_argument("--window-size=1920,1080")

        if driver_path:
            # Explicit msedgedriver location. Driver downloads:
            # https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads
            service = EdgeService(executable_path=driver_path)
        else:
            # No explicit path: the driver must be locatable without one,
            # e.g. msedgedriver being on the system PATH.
            service = EdgeService()

        driver = webdriver.Edge(service=service, options=options)
        try:
            driver.get(self.url)
            # Fixed pause that gives the page time to finish loading.
            time.sleep(self.wait_time)
            return driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails,
            # so no Edge / msedgedriver process is left behind.
            driver.quit()

    @classmethod
    def _parse(cls, html: str) -> tuple[str, str]:
        """Return ``(title, text)`` extracted from ``html``."""
        soup = BeautifulSoup(html, "html.parser")

        title_tag = soup.title
        # ``.string`` is None for an empty <title>, so fall back in that
        # case as well as when the element is missing altogether.
        if title_tag is not None and title_tag.string:
            title = title_tag.string
        else:
            title = cls._NO_TITLE

        body = soup.body
        if body is None:
            # Documents without a <body> have no text to extract.
            return title, ""

        # Drop elements that carry no readable text.
        for irrelevant in body.find_all(list(cls._IRRELEVANT_TAGS)):
            irrelevant.decompose()

        return title, body.get_text(separator="\n", strip=True)


# %%
# Demo cell. The guard keeps an ``import`` of this code from launching a
# browser; in a notebook or when run as a script the cell still executes.
if __name__ == "__main__":
    site = Website(
        "https://openai.com",
        driver_path="/Users/klee/Documents/edgedriver_mac64_m1/msedgedriver",
    )
    print("Title:", site.title)
    print("\nFirst 500 character:\n", site.text[:500])