{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display\n",
    "from openai import OpenAI\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "# Check the key\n",
    "# The whitespace check runs before the prefix check, so a key with a stray leading\n",
    "# space is reported as a whitespace problem rather than as a wrong prefix.\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "elif not api_key.startswith(\"sk-proj-\"):\n",
    "    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model used for every summary; change it here to try another one\n",
    "MODEL = \"gpt-4o-mini\"\n",
    "\n",
    "openai = OpenAI()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5e793b2-6775-426a-a139-4848291d0463",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A class to represent a Webpage\n",
    "# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
    "\n",
    "# Some websites need you to use proper headers when fetching them:\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "}\n",
    "\n",
    "# Without a timeout, requests.get can wait indefinitely on an unresponsive server\n",
    "REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A fetched web page reduced to its title and its text.\n",
    "\n",
    "    Attributes:\n",
    "        url: the address the page was fetched from\n",
    "        title: the text of the title tag, or \"No title found\"\n",
    "        text: the page text with script, style, img and input elements removed\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url):\n",
    "        \"\"\"\n",
    "        Create this Website object from the given url using the BeautifulSoup library\n",
    "\n",
    "        Raises requests.HTTPError when the server answers with an error status,\n",
    "        so that an error page is not summarized as if it were the site itself.\n",
    "        \"\"\"\n",
    "        self.url = url\n",
    "        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "        response.raise_for_status()\n",
    "        soup = BeautifulSoup(response.content, 'html.parser')\n",
    "        # .string is None when the title tag is empty or wraps other tags\n",
    "        has_title = soup.title is not None and soup.title.string is not None\n",
    "        self.title = soup.title.string if has_title else \"No title found\"\n",
    "        # soup.body is None for documents without a body tag; use the whole document then\n",
    "        content = soup.body if soup.body is not None else soup\n",
    "        for irrelevant in content([\"script\", \"style\", \"img\", \"input\"]):\n",
    "            irrelevant.decompose()\n",
    "        self.text = content.get_text(separator=\"\\n\", strip=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a9cc69e-dd0f-4c48-86a2-c0c13eeac18f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the system prompt\n",
    "# Asking AI to be wrong\n",
    "\n",
    "system_prompt = (\n",
    "    \"You are an improper assistant who analyses websites \"\n",
    "    \"and provides a short summary, ignoring text that might be navigation related. \"\n",
    "    \"your summaries will be untrue and contain hoaxes based on the current news. \"\n",
    "    \"If the website is not in English, please state what the original language is, and then translate it to English.\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that writes a User Prompt that asks for summaries of websites:\n",
    "\n",
    "def user_prompt_for(website):\n",
    "    \"\"\"\n",
    "    Build the user prompt: the page title, the summary request, then the page text.\n",
    "    \"\"\"\n",
    "    intro = f\"You are looking at a website titled {website.title}\"\n",
    "    request = (\n",
    "        \"\\nThe contents of this website is as follows; \"\n",
    "        \"please provide a short summary of this website in markdown. \"\n",
    "        \"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
    "    )\n",
    "    return intro + request + website.text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that writes the message to GPT according to the standard format.\n",
    "\n",
    "def messages_for(website):\n",
    "    \"\"\"\n",
    "    Return the chat messages for this website: the system prompt, then the user prompt.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": system_prompt},\n",
    "        {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
    "    ]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# And now: call the OpenAI API. You will get very familiar with this!\n",
    "\n",
    "def summarize(url, model=MODEL):\n",
    "    \"\"\"\n",
    "    Fetch the page at url and return the text of the model's reply.\n",
    "\n",
    "    model defaults to MODEL; pass another model name to override it for one call.\n",
    "    \"\"\"\n",
    "    website = Website(url)\n",
    "    response = openai.chat.completions.create(\n",
    "        model=model,\n",
    "        messages=messages_for(website)\n",
    "    )\n",
    "    return response.choices[0].message.content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d926d59-450e-4609-92ba-2d6f244f1342",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function to display this nicely in the Jupyter output, using markdown\n",
    "\n",
    "def display_summary(url):\n",
    "    \"\"\"\n",
    "    Summarize the page at url and render the result as markdown in the cell output.\n",
    "    \"\"\"\n",
    "    summary = summarize(url)\n",
    "    display(Markdown(summary))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3018853a-445f-41ff-9560-d925d1774b2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "display_summary(\"https://detik.com\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a430d86e-01db-4ad5-a2f9-ac85e37fe9c1",
   "metadata": {},
   "source": [
    "# Please don't take this hoax creator seriously :)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df8c4a6d-c370-4fe1-9d13-32db78bcbfda",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}