Launching refreshed version of LLM Engineering weeks 1-4 - see README
week1/scraper.py (new file, 37 lines)

@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
import requests


# Standard headers to fetch a website
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string if soup.title else "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url):
    """
    Return the links on the website at the given url.
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]
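
A quick usage sketch of the two helpers above — the URL is just a placeholder, and the printed output notes describe the expected shape, not lab-specific results:

# Hypothetical usage; any reachable URL works
contents = fetch_website_contents("https://example.com")
print(contents)  # title, a blank line, then body text, capped at 2,000 characters
links = fetch_website_links("https://example.com")
print(links)     # raw href values: relative paths and absolute URLs mixed together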
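As the docstring invites, one way to avoid fetching and parsing twice is to wrap the page in a class that parses once and serves both queries. A minimal sketch follows — the `Website` name and its method names are one possible design, not part of the lab code; it reuses the imports and `headers` defined above:

# Hypothetical single-parse refactor of the two functions above
class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.soup = BeautifulSoup(response.content, "html.parser")  # parse once, reuse below

    def contents(self):
        title = self.soup.title.string if self.soup.title else "No title found"
        body = self.soup.body
        if body:
            for irrelevant in body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            text = body.get_text(separator="\n", strip=True)
        else:
            text = ""
        return (title + "\n\n" + text)[:2_000]

    def links(self):
        # a tags survive decompose(), which only removes script/style/img/input
        return [link.get("href") for link in self.soup.find_all("a") if link.get("href")]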