diff --git a/week1/community-contributions/domienbakker/day1.selenium.scraper.ipynb b/week1/community-contributions/domienbakker/day1.selenium.scraper.ipynb
new file mode 100644
index 0000000..40d87e9
--- /dev/null
+++ b/week1/community-contributions/domienbakker/day1.selenium.scraper.ipynb
@@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0bb7f4e9",
+   "metadata": {},
+   "source": [
+    "### Prepare dependencies\n",
+    "I had issues with selenium and my chromedriver; I had to install the exact dependencies below to make it work.\n",
+    "First, add the selenium dependency by executing the following commands:\n",
+    "``` bash\n",
+    "uv pip install selenium==4.11.2\n",
+    "uv pip install urllib3==1.26.16\n",
+    "```\n",
+    "***Do not forget to restart the Jupyter kernel to make the package available.***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7a116541",
+   "metadata": {},
+   "source": [
+    "### Preferred Web Browser\n",
+    "This script will use Safari on macOS; please install the Safari driver when required: [Safari driver](https://webkit.org/blog/6900/webdriver-support-in-safari-10).\n",
+    "It will assume that Chrome is used on Windows systems; install the Chrome driver from: [ChromeDriver](https://googlechromelabs.github.io/chrome-for-testing/#stable) \n",
+    "Feel free to add other browser support when required.\n",
+    "\n",
+    "I am on Windows and I extracted the ChromeDriver and put it in my %USERPROFILE%\\AppData\\Local\\Microsoft\\WindowsApps folder to ensure that it is available via the PATH. 
Any folder available in the PATH environment setting will work.\n",
+    "Start cmd and execute set to see all folders in the PATH environment setting.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1353f8ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import sys\n",
+    "from dotenv import load_dotenv\n",
+    "from scraper import fetch_website_contents\n",
+    "from IPython.display import Markdown, display\n",
+    "from openai import OpenAI\n",
+    "from selenium import webdriver"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24d86842",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def verify_openai_api_key():\n",
+    "    \"\"\"Load .env, check that OPENAI_API_KEY is set and accepted by OpenAI, and return it.\"\"\"\n",
+    "    load_dotenv(override=True)\n",
+    "    api_key = os.getenv('OPENAI_API_KEY')\n",
+    "\n",
+    "    if not api_key:\n",
+    "        raise ValueError(\"OPENAI_API_KEY is not set in environment variables.\")\n",
+    "\n",
+    "    # Dry run with a simple request to verify the key.\n",
+    "    try:\n",
+    "        client = OpenAI(api_key=api_key)\n",
+    "        client.models.list()\n",
+    "    except Exception as exc:  # not a bare except: let Ctrl-C / SystemExit propagate\n",
+    "        raise ValueError(\"Invalid OPENAI_API_KEY.\") from exc\n",
+    "\n",
+    "    return api_key\n",
+    "\n",
+    "def get_webdriver():\n",
+    "    \"\"\"Return a Safari WebDriver on macOS and a Chrome WebDriver on any other OS.\"\"\"\n",
+    "    # os.name is 'posix' on Linux as well, where Safari does not exist; sys.platform singles out macOS.\n",
+    "    if sys.platform == 'darwin':\n",
+    "        return webdriver.Safari()\n",
+    "    return webdriver.Chrome()\n",
+    "\n",
+    "def fetch_website_contents_selenium(url: str) -> str:\n",
+    "    \"\"\"Open url in a WebDriver session and return the page source as a string.\"\"\"\n",
+    "    driver = get_webdriver()\n",
+    "    try:\n",
+    "        driver.get(url)\n",
+    "        return driver.page_source\n",
+    "    finally:\n",
+    "        driver.quit()  # always close the browser, even when navigation fails\n",
+    "\n",
+    "def messages_for(website: str) -> list[dict]:\n",
+    "    \"\"\"Build the system and user chat messages asking for a summary of the given website content.\"\"\"\n",
+    "    return [\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": \"\"\"You are a helpful assistant that summarizes website content.\n",
+    " You can perfectly understand HTML structure and extract meaningful information from it.\"\"\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": f\"Summarize the following website content:\\n\\n{website}\"\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "def summarize_website(url: str, api_key: str) -> str:\n",
+    "    \"\"\"Fetch url with Selenium and return the gpt-4.1-mini summary of its page source.\"\"\"\n",
+    "    content = fetch_website_contents_selenium(url)\n",
+    "    openai_client = OpenAI(api_key=api_key)\n",
+    "    response = openai_client.chat.completions.create(\n",
+    "        model=\"gpt-4.1-mini\",\n",
+    "        messages=messages_for(content),\n",
+    "    )\n",
+    "    return response.choices[0].message.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a7e7f0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    api_key = verify_openai_api_key()\n",
+    "    print(summarize_website(\"https://www.forbes.com/\", api_key))\n",
+    "except Exception as e:\n",
+    "    print(f\"Error: {e}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llm-engineering",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}