diff --git a/week1/community-contributions/summarizer_using_llama3.2.ipynb b/week1/community-contributions/summarizer_using_llama3.2.ipynb new file mode 100644 index 0000000..8d1d681 --- /dev/null +++ b/week1/community-contributions/summarizer_using_llama3.2.ipynb @@ -0,0 +1,454 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a92df66b-68c9-4288-b881-45d1fd948c18", + "metadata": {}, + "source": [ + "### Week 1 Contribution: Selenium-enhanced Website Summarizer\n", + "This notebook attempts to summarize content from any website using a BeautifulSoup-first strategy with a Selenium fallback for JavaScript-heavy pages. Llama 3.2 is used to generate a markdown-formatted summary.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "407ea4b4-7c1b-4f94-a48d-f3ee3273bc61", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown,display\n", + "from openai import OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "040e97a8-9a5f-4903-9d0e-fa19bb719b4f", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL=\"llama3.2\"\n", + "openai=OpenAI(base_url=\"http://localhost:11434/v1\",api_key=\"ollama\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cac3c9ae-31ce-45b1-bbc1-70577a198e84", + "metadata": {}, + "outputs": [], + "source": [ + "message=\"Hi, write a snarky poem for me.\" \n", + "response=openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[{\n", + " \"role\":\"user\",\n", + " \"content\":message\n", + " }]\n", + ")\n", + "print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "a27514f6-d7a5-4292-b98b-dc166416a2fc", + "metadata": {}, + "source": [ + "### Beautiful Soup Version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678901b6-5da1-4df7-8b73-a1c69dc758b0", + "metadata": {}, + "outputs": [], + "source": [ + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "} # to make sure we're not blocked as bots from websites\n", + "\n", + "class bsWebsite:\n", + " \"\"\"\n", + " Attributes:\n", + " url (str): The URL of the page\n", + " title (str): The title of the page\n", + " text (str): The readable text from the page\n", + " \"\"\"\n", + "\n", + " def __init__(self,url):\n", + " self.url=url\n", + " response=requests.get(url,headers=headers) # gets the content of the page in response variable\n", + "\n", + " soup=BeautifulSoup(response.content,'html.parser') # content of response is accessed using html parser for structure\n", + " self.title=soup.title.string if soup.title else \"No title\"\n", + "\n", + " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", + " irrelevant.decompose()\n", + "\n", + " self.text=soup.body.get_text(separator='\\n',strip=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a1a5ddd-7907-46fd-a1b7-ceeb876262f7", + "metadata": {}, + "outputs": [], + "source": [ + "ed = bsWebsite(\"https://edwarddonner.com\")\n", + "\n", + "print(ed.url)\n", + "print(ed.text)\n", + "print(ed.title)" + ] + }, + { + "cell_type": "markdown", + "id": "b7e965e4-7d20-4980-8cb2-871b8ca63c45", + "metadata": {}, + "source": [ + "#### Now, let's create a detailed summary for how selenium works using what we just made" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b71a05c6-669b-4632-aeb9-b51daa4429a1", + "metadata": {}, + "outputs": [], + "source": [ + "sel=bsWebsite(\"https://www.geeksforgeeks.org/software-engineering/selenium-webdriver-tutorial/\")\n", + "print(sel.url)\n", + "print(sel.title)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c463c67-2a9c-4fcd-99aa-cab0e2cdf936", + "metadata": {}, + "outputs": [], + "source": [ + "def user_prompt_for(web):\n", + " user_prompt=f\"\"\"You are looking at a website called {web.title}. \n", + " Provide a detailed summary of the given content and the concepts in markdown:\\n[{web.text}]\"\"\"\n", + "\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2118ac4-3355-4f90-b799-ba375ceeafc1", + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt=\"\"\"You are an assistant that analyses the contents of a website based on request of user, \n", + "while ignoring text that is navigation related. Respond in markdown.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "716b3772-3c73-4010-b089-8bc374cab9de", + "metadata": {}, + "outputs": [], + "source": [ + "print(user_prompt_for(ed))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b23b39b4-78a3-4694-8c89-f2ce56b628f2", + "metadata": {}, + "outputs": [], + "source": [ + "user_prompt=user_prompt_for(sel)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce29c83c-7b47-43a8-8f92-c2a1aa36f8f5", + "metadata": {}, + "outputs": [], + "source": [ + "messages=[\n", + " { \"role\":\"system\", \"content\":system_prompt},\n", + " { \"role\":\"user\", \"content\":user_prompt}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f120702-029e-4c1a-8ffb-2c4944110aa8", + "metadata": {}, + "outputs": [], + "source": [ + "response=openai.chat.completions.create(model=MODEL,messages=messages)\n", + "\n", + "print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "e9326415-6d35-4750-b9b1-1ae83a86d6f7", + "metadata": {}, + "source": [ + "### Selenium Version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba86d4cc-cf4c-4f75-aa57-4126b15463b7", + "metadata": {}, + "outputs": [], + "source": [ + "# making sure we're in the virtual environment\n", + "import sys\n", + "print(sys.executable)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba86dfa-1e91-4535-9c93-3838c46aee52", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install selenium" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01771002-b10f-4681-8710-0f1515866c92", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install webdriver-manager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c19b582d-a355-4c20-8028-42a802e7dca5", + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.edge.service import Service\n", + "# for edge only:\n", + "from webdriver_manager.microsoft import EdgeChromiumDriverManager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "978ab0b9-b42b-4136-8383-79b3f84e084b", + "metadata": {}, + "outputs": [], + "source": [ + "# works for edge only. Do not close the window that pops up as t will be used to open sites given.\n", + "driver=webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dfdeb48-562e-44d3-9044-157d616835fd", + "metadata": {}, + "outputs": [], + "source": [ + "# creating a similar class as bsWebsie but using selenium\n", + "class SelWebsite:\n", + "\n", + " def __init__(self,url,driver):\n", + " self.driver=driver\n", + " self.driver.get(url)\n", + " \n", + " self.url=self.driver.current_url\n", + " self.title=self.driver.title\n", + " self.text=self.driver.find_element(By.TAG_NAME,\"body\").text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6174105d-c123-4032-afa8-75588c0f1133", + "metadata": {}, + "outputs": [], + "source": [ + "# testing it on OpenAI website\n", + "gpt=SelWebsite(\"https://openai.com\",driver)\n", + "print(gpt.url)\n", + "print(gpt.driver)\n", + "print(gpt.title)\n", + "print(gpt.text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bde84abf-09dd-4a56-b6a7-4e5a34c1098e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "b7208f3f-6245-48a4-a5ae-d0b59550ee28", + "metadata": {}, + "source": [ + "##### Troubleshooting in case of errors:\n", + "1. Make sure the window popped up wasn't closed.\n", + "2. If the below cell results in any text except an error - driver ID is valid. In this case, quit and restart the driver again.\n", + "3. If driver ID is invalid, activate driver again using below cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30afa4d1-1ce6-4bad-820e-b72cf3eef959", + "metadata": {}, + "outputs": [], + "source": [ + "# use the following code to check for valid session ID for driver if error occurs:\n", + "print(driver.session_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "154ace93-47b2-40ea-9d49-c6c598a67144", + "metadata": {}, + "outputs": [], + "source": [ + "# if above is valid but still results in trouble, run both; otherwise run only the second part:\n", + "# driver.quit()\n", + "# driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07e74ec5-fda6-462f-b929-7d173b0bdb31", + "metadata": {}, + "outputs": [], + "source": [ + "print(user_prompt_for(gpt))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5d0fd2e-949a-4358-b963-1395157618d2", + "metadata": {}, + "outputs": [], + "source": [ + "messages2=[\n", + " {\"role\":\"system\",\"content\":system_prompt},\n", + " {\"role\":\"user\",\"content\":user_prompt_for(gpt)}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db457f5c-e1be-4087-932d-25ba4880b3ac", + "metadata": {}, + "outputs": [], + "source": [ + "response=openai.chat.completions.create(model=MODEL,messages=messages2)\n", + "\n", + "print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "d448018f-f363-4af9-8ae3-88cc4408da91", + "metadata": {}, + "source": [ + "### Now let's build a summarize function which can be called directly to summarize any site." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "690ca16b-4b9c-4ddc-b21e-1e69b1d3135a", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize(site_url):\n", + " \"\"\"\n", + " Summarizes the visible content of a website.\n", + " - Tries BeautifulSoup parsing first (bsWebsite)\n", + " - Falls back to Selenium parsing (SelWebsite) if BS4 fails\n", + " - Uses llama3.2 to generate a summary in Markdown\n", + " \"\"\"\n", + " try:\n", + " site=bsWebsite(site_url)\n", + " except Exception as e:\n", + " print(f\"BS4 failed: {e}\\nTrying Selenium...\\n\")\n", + " site=SelWebsite(site_url,driver)\n", + "\n", + " messages3=[\n", + " {\"role\":\"system\",\"content\":system_prompt},\n", + " {\"role\":\"user\",\"content\":user_prompt_for(site)}\n", + " ]\n", + "\n", + " print(f\"\\nSummarizing: {site.title}\\nURL: {site.url}\\n\")\n", + "\n", + " response=openai.chat.completions.create(model=MODEL,messages=messages3)\n", + "\n", + " print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2744296c-ebbd-4696-8517-d14234af9a65", + "metadata": {}, + "outputs": [], + "source": [ + "summarize(\"https://www.udemy.com\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0d2379-c8b3-4900-8671-179303c00929", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}