Launching refreshed version of LLM Engineering weeks 1-4 - see README
week1/scraper.py (new file, 37 lines)

@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
import requests


# Standard headers to fetch a website
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string if soup.title else "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url):
    """
    Return the links on the website at the given url.
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]
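
A quick usage sketch of the two helpers above — the URL is just a placeholder, and the printed output notes describe the expected shape, not lab-specific results:

# Hypothetical usage; any reachable URL works
contents = fetch_website_contents("https://example.com")
print(contents)  # title, a blank line, then body text, capped at 2,000 characters
links = fetch_website_links("https://example.com")
print(links)     # raw href values: relative paths and absolute URLs mixed together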
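As the docstring invites, one way to avoid fetching and parsing twice is to wrap the page in a class that parses once and serves both queries. A minimal sketch follows — the `Website` name and its method names are one possible design, not part of the lab code; it reuses the imports and `headers` defined above:

# Hypothetical single-parse refactor of the two functions above
class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.soup = BeautifulSoup(response.content, "html.parser")  # parse once, reuse below

    def contents(self):
        title = self.soup.title.string if self.soup.title else "No title found"
        body = self.soup.body
        if body:
            for irrelevant in body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            text = body.get_text(separator="\n", strip=True)
        else:
            text = ""
        return (title + "\n\n" + text)[:2_000]

    def links(self):
        # a tags survive decompose(), which only removes script/style/img/input
        return [link.get("href") for link in self.soup.find_all("a") if link.get("href")]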