feat: week 1 addon italian restaurant menu parser

2025-10-03 07:16:17 +02:00
parent 3286cfb395
commit cb992c9d91
2 changed files with 291 additions and 0 deletions
--- a/week1/website.py
+++ b/week1/website.py
@@ -0,0 +1,32 @@
+import requests
+from bs4 import BeautifulSoup
+
+# Request configuration shared by the Website class defined below.
+
+# Some websites need you to use proper headers when fetching them, so every
+# request made by Website sends this browser-style User-Agent string.
+headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
+}
+
+class Website:
+    """
+    A utility class to represent a Website that we have scraped, now with links.
+    """
+
+    def __init__(self, url):
+        self.url = url
+        response = requests.get(url, headers=headers)
+        self.body = response.content
+        soup = BeautifulSoup(self.body, 'html.parser')
+        self.title = soup.title.string if soup.title else "No title found"
+        if soup.body:
+            for irrelevant in soup.body(["script", "style", "img", "input"]):
+                irrelevant.decompose()
+            self.text = soup.body.get_text(separator="\n", strip=True)
+        else:
+            self.text = ""
+        links = [link.get('href') for link in soup.find_all('a')]
+        self.links = [link for link in links if link]
+
+    def get_contents(self):
+        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"