Files
LLM_Engineering_OLD/week1/community-contributions/summarizer_using_llama3.2.ipynb
2025-06-20 19:06:39 +05:30

455 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "a92df66b-68c9-4288-b881-45d1fd948c18",
"metadata": {},
"source": [
"### Week 1 Contribution: Selenium-enhanced Website Summarizer\n",
"This notebook attempts to summarize content from any website using a BeautifulSoup-first strategy with a Selenium fallback for JavaScript-heavy pages. Llama 3.2 is used to generate a markdown-formatted summary.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "407ea4b4-7c1b-4f94-a48d-f3ee3273bc61",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown,display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "040e97a8-9a5f-4903-9d0e-fa19bb719b4f",
"metadata": {},
"outputs": [],
"source": [
"MODEL=\"llama3.2\"\n",
"openai=OpenAI(base_url=\"http://localhost:11434/v1\",api_key=\"ollama\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cac3c9ae-31ce-45b1-bbc1-70577a198e84",
"metadata": {},
"outputs": [],
"source": [
"# Smoke test: send one trivial chat request to confirm the model endpoint responds.\n",
"message = \"Hi, write a snarky poem for me.\"\n",
"response = openai.chat.completions.create(\n",
"    model=MODEL,\n",
"    messages=[{\"role\": \"user\", \"content\": message}],\n",
")\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"id": "a27514f6-d7a5-4292-b98b-dc166416a2fc",
"metadata": {},
"source": [
"### Beautiful Soup Version"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "678901b6-5da1-4df7-8b73-a1c69dc758b0",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"} # to make sure we're not blocked as bots from websites\n",
"\n",
"class bsWebsite:\n",
"    \"\"\"\n",
"    A web page fetched with requests and parsed with BeautifulSoup.\n",
"\n",
"    Attributes:\n",
"        url (str): The URL of the page\n",
"        title (str): The title of the page, or \"No title\" when it has none\n",
"        text (str): The readable text from the page\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url, timeout=30):\n",
"        \"\"\"\n",
"        Download `url` and extract its title and readable text.\n",
"\n",
"        Args:\n",
"            url (str): Address of the page to fetch.\n",
"            timeout (float): Seconds to wait for the server before giving up.\n",
"\n",
"        Raises:\n",
"            requests.RequestException: on network errors, timeouts, or a 4xx/5xx response.\n",
"        \"\"\"\n",
"        self.url = url\n",
"        # Without a timeout, requests can wait indefinitely on an unresponsive server.\n",
"        response = requests.get(url, headers=headers, timeout=timeout)\n",
"        # Fail loudly on 4xx/5xx instead of quietly parsing an error page.\n",
"        response.raise_for_status()\n",
"\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        # soup.title.string is None for an empty or nested <title>, so guard both levels.\n",
"        raw_title = soup.title.string if soup.title is not None else None\n",
"        self.title = (raw_title or \"\").strip() or \"No title\"\n",
"\n",
"        # Documents without a <body> tag would otherwise crash on soup.body(...).\n",
"        root = soup.body if soup.body is not None else soup\n",
"        for irrelevant in root([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"\n",
"        self.text = root.get_text(separator='\\n', strip=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a1a5ddd-7907-46fd-a1b7-ceeb876262f7",
"metadata": {},
"outputs": [],
"source": [
"ed = bsWebsite(\"https://edwarddonner.com\")\n",
"\n",
"print(ed.url)\n",
"print(ed.text)\n",
"print(ed.title)"
]
},
{
"cell_type": "markdown",
"id": "b7e965e4-7d20-4980-8cb2-871b8ca63c45",
"metadata": {},
"source": [
"#### Now, let's create a detailed summary of how Selenium works using the class we just made"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b71a05c6-669b-4632-aeb9-b51daa4429a1",
"metadata": {},
"outputs": [],
"source": [
"sel=bsWebsite(\"https://www.geeksforgeeks.org/software-engineering/selenium-webdriver-tutorial/\")\n",
"print(sel.url)\n",
"print(sel.title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c463c67-2a9c-4fcd-99aa-cab0e2cdf936",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(web):\n",
" user_prompt=f\"\"\"You are looking at a website called {web.title}. \n",
" Provide a detailed summary of the given content and the concepts in markdown:\\n[{web.text}]\"\"\"\n",
"\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2118ac4-3355-4f90-b799-ba375ceeafc1",
"metadata": {},
"outputs": [],
"source": [
"system_prompt=\"\"\"You are an assistant that analyses the contents of a website based on request of user, \n",
"while ignoring text that is navigation related. Respond in markdown.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "716b3772-3c73-4010-b089-8bc374cab9de",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(ed))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b23b39b4-78a3-4694-8c89-f2ce56b628f2",
"metadata": {},
"outputs": [],
"source": [
"user_prompt=user_prompt_for(sel)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce29c83c-7b47-43a8-8f92-c2a1aa36f8f5",
"metadata": {},
"outputs": [],
"source": [
"messages=[\n",
" { \"role\":\"system\", \"content\":system_prompt},\n",
" { \"role\":\"user\", \"content\":user_prompt}\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f120702-029e-4c1a-8ffb-2c4944110aa8",
"metadata": {},
"outputs": [],
"source": [
"response=openai.chat.completions.create(model=MODEL,messages=messages)\n",
"\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"id": "e9326415-6d35-4750-b9b1-1ae83a86d6f7",
"metadata": {},
"source": [
"### Selenium Version"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba86d4cc-cf4c-4f75-aa57-4126b15463b7",
"metadata": {},
"outputs": [],
"source": [
"# making sure we're in the virtual environment\n",
"import sys\n",
"print(sys.executable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ba86dfa-1e91-4535-9c93-3838c46aee52",
"metadata": {},
"outputs": [],
"source": [
"# !pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01771002-b10f-4681-8710-0f1515866c92",
"metadata": {},
"outputs": [],
"source": [
"# !pip install webdriver-manager"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c19b582d-a355-4c20-8028-42a802e7dca5",
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.edge.service import Service\n",
"# for edge only:\n",
"from webdriver_manager.microsoft import EdgeChromiumDriverManager"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "978ab0b9-b42b-4136-8383-79b3f84e084b",
"metadata": {},
"outputs": [],
"source": [
"# Works for Edge only. Do not close the browser window that pops up, as it will be used to open the given sites.\n",
"driver=webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7dfdeb48-562e-44d3-9044-157d616835fd",
"metadata": {},
"outputs": [],
"source": [
"# creating a similar class as bsWebsite but using selenium\n",
"class SelWebsite:\n",
"    \"\"\"\n",
"    A web page loaded through a Selenium WebDriver session.\n",
"\n",
"    Attributes:\n",
"        driver: The Selenium WebDriver used to load the page\n",
"        url (str): The URL reported by the driver after loading\n",
"        title (str): The title of the page, or \"No title\" when it is empty\n",
"        text (str): The text of the page's <body> element\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url, driver):\n",
"        \"\"\"\n",
"        Load `url` in `driver` and capture the resulting URL, title and body text.\n",
"\n",
"        Args:\n",
"            url (str): Address of the page to open.\n",
"            driver: An already started Selenium WebDriver; it is reused, not closed.\n",
"        \"\"\"\n",
"        self.driver = driver\n",
"        self.driver.get(url)\n",
"\n",
"        # Read the URL back from the driver rather than echoing the argument.\n",
"        self.url = self.driver.current_url\n",
"        # Same placeholder as bsWebsite so prompts never show a blank title.\n",
"        self.title = self.driver.title or \"No title\"\n",
"        # By is provided by selenium.webdriver.common.by.\n",
"        self.text = self.driver.find_element(By.TAG_NAME, \"body\").text"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6174105d-c123-4032-afa8-75588c0f1133",
"metadata": {},
"outputs": [],
"source": [
"# testing it on OpenAI website\n",
"gpt=SelWebsite(\"https://openai.com\",driver)\n",
"print(gpt.url)\n",
"print(gpt.driver)\n",
"print(gpt.title)\n",
"print(gpt.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bde84abf-09dd-4a56-b6a7-4e5a34c1098e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b7208f3f-6245-48a4-a5ae-d0b59550ee28",
"metadata": {},
"source": [
"##### Troubleshooting in case of errors:\n",
"1. Make sure the browser window that popped up wasn't closed.\n",
"2. If the cell below prints any text other than an error, the driver session ID is valid. If problems persist anyway, quit and restart the driver.\n",
"3. If the driver session ID is invalid, start the driver again using the cells below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30afa4d1-1ce6-4bad-820e-b72cf3eef959",
"metadata": {},
"outputs": [],
"source": [
"# use the following code to check for valid session ID for driver if error occurs:\n",
"print(driver.session_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "154ace93-47b2-40ea-9d49-c6c598a67144",
"metadata": {},
"outputs": [],
"source": [
"# if above is valid but still results in trouble, run both; otherwise run only the second part:\n",
"# driver.quit()\n",
"# driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07e74ec5-fda6-462f-b929-7d173b0bdb31",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(gpt))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5d0fd2e-949a-4358-b963-1395157618d2",
"metadata": {},
"outputs": [],
"source": [
"messages2=[\n",
" {\"role\":\"system\",\"content\":system_prompt},\n",
" {\"role\":\"user\",\"content\":user_prompt_for(gpt)}\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db457f5c-e1be-4087-932d-25ba4880b3ac",
"metadata": {},
"outputs": [],
"source": [
"response=openai.chat.completions.create(model=MODEL,messages=messages2)\n",
"\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"id": "d448018f-f363-4af9-8ae3-88cc4408da91",
"metadata": {},
"source": [
"### Now let's build a summarize function which can be called directly to summarize any site."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "690ca16b-4b9c-4ddc-b21e-1e69b1d3135a",
"metadata": {},
"outputs": [],
"source": [
"def summarize(site_url):\n",
"    \"\"\"\n",
"    Summarizes the visible content of a website.\n",
"    - Tries BeautifulSoup parsing first (bsWebsite)\n",
"    - Falls back to Selenium parsing (SelWebsite) if BS4 fails or finds no text\n",
"    - Uses the model named by MODEL to generate a summary, rendered as Markdown\n",
"\n",
"    Args:\n",
"        site_url (str): Address of the page to summarize.\n",
"\n",
"    Relies on the notebook-level `driver`, `openai`, `MODEL` and `system_prompt`.\n",
"    Any exception raised by the Selenium fallback or the model call propagates.\n",
"    \"\"\"\n",
"    try:\n",
"        site = bsWebsite(site_url)\n",
"        # A page can parse without error yet yield no text (e.g. content rendered\n",
"        # by JavaScript); treat that as a failure so the Selenium fallback runs.\n",
"        if not (site.text or \"\").strip():\n",
"            raise ValueError(\"no readable text found in the static HTML\")\n",
"    except Exception as e:\n",
"        print(f\"BS4 failed: {e}\\nTrying Selenium...\\n\")\n",
"        site = SelWebsite(site_url, driver)\n",
"\n",
"    chat_messages = [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt_for(site)}\n",
"    ]\n",
"\n",
"    print(f\"\\nSummarizing: {site.title}\\nURL: {site.url}\\n\")\n",
"\n",
"    response = openai.chat.completions.create(model=MODEL, messages=chat_messages)\n",
"\n",
"    # The system prompt asks for markdown, so render it instead of printing raw markup.\n",
"    display(Markdown(response.choices[0].message.content))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2744296c-ebbd-4696-8517-d14234af9a65",
"metadata": {},
"outputs": [],
"source": [
"summarize(\"https://www.udemy.com\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d0d2379-c8b3-4900-8671-179303c00929",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}