feat: week 1 addon italian restaurant menu parser
This commit is contained in:
32
week1/website.py
Normal file
32
week1/website.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# A class to represent a Webpage
|
||||
|
||||
# Some websites need you to use proper headers when fetching them:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
||||
}
|
||||
|
||||
class Website:
|
||||
"""
|
||||
A utility class to represent a Website that we have scraped, now with links.
|
||||
"""
|
||||
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
response = requests.get(url, headers=headers)
|
||||
self.body = response.content
|
||||
soup = BeautifulSoup(self.body, 'html.parser')
|
||||
self.title = soup.title.string if soup.title else "No title found"
|
||||
if soup.body:
|
||||
for irrelevant in soup.body(["script", "style", "img", "input"]):
|
||||
irrelevant.decompose()
|
||||
self.text = soup.body.get_text(separator="\n", strip=True)
|
||||
else:
|
||||
self.text = ""
|
||||
links = [link.get('href') for link in soup.find_all('a')]
|
||||
self.links = [link for link in links if link]
|
||||
|
||||
def get_contents(self):
|
||||
return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
|
||||
Reference in New Issue
Block a user