Minor refinements
@@ -23,7 +23,7 @@ langchain[docarray]
 datasets
 sentencepiece
 matplotlib
-google.generativeai
+google-generativeai
 anthropic
 scikit-learn
 unstructured
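The hunk above fixes the one entry in the requirements file whose PyPI name was wrong: the distribution pip installs is google-generativeai (hyphen), even though the module is imported as google.generativeai (dot). A minimal sketch of the installed package in use — the GOOGLE_API_KEY environment variable name is an assumption for illustration, not something shown in this commit:

# Installed by: pip install google-generativeai
import os
import google.generativeai as genai  # hyphen in the package name, dot in the import

# Assumption: the API key is supplied via an environment variable
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])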
@@ -184,6 +184,11 @@
 "# A class to represent a Webpage\n",
 "# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
 "\n",
+"# Some websites need you to use proper headers when fetching them:\n",
+"headers = {\n",
+" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
+"}\n",
+"\n",
 "class Website:\n",
 "\n",
 "    def __init__(self, url):\n",
@@ -191,7 +196,7 @@
 "        Create this Website object from the given url using the BeautifulSoup library\n",
 "        \"\"\"\n",
 "        self.url = url\n",
-"        response = requests.get(url)\n",
+"        response = requests.get(url, headers=headers)\n",
 "        soup = BeautifulSoup(response.content, 'html.parser')\n",
 "        self.title = soup.title.string if soup.title else \"No title found\"\n",
 "        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
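Unescaped from notebook JSON, the cell touched by the two hunks above reads as the Python below. The lines that finish the for loop fall outside the hunk, so the decompose() and get_text() calls here are an assumed completion based on the surrounding code, not part of this diff:

import requests
from bs4 import BeautifulSoup

# Some websites need you to use proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:

    def __init__(self, url):
        """
        Create this Website object from the given url using the BeautifulSoup library
        """
        self.url = url
        response = requests.get(url, headers=headers)  # the header fix from this commit
        soup = BeautifulSoup(response.content, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()  # assumed completion: drop non-text elements
        self.text = soup.body.get_text(separator="\n", strip=True)  # assumed completion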
@@ -70,6 +70,11 @@
 "source": [
 "# A class to represent a Webpage\n",
 "\n",
+"# Some websites need you to use proper headers when fetching them:\n",
+"headers = {\n",
+" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
+"}\n",
+"\n",
 "class Website:\n",
 "    \"\"\"\n",
 "    A utility class to represent a Website that we have scraped, now with links\n",
@@ -77,7 +82,7 @@
 "\n",
 "    def __init__(self, url):\n",
 "        self.url = url\n",
-"        response = requests.get(url)\n",
+"        response = requests.get(url, headers=headers)\n",
 "        self.body = response.content\n",
 "        soup = BeautifulSoup(self.body, 'html.parser')\n",
 "        self.title = soup.title.string if soup.title else \"No title found\"\n",
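The second notebook gets the identical one-line fix. A quick way to sanity-check why the header matters is to compare a bare request against one sent with the browser-style User-Agent — hedged, because which sites actually reject the default python-requests agent varies, and the URL below is a placeholder:

import requests

url = "https://example.com"  # placeholder; substitute a site that blocks bare requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

bare = requests.get(url)                     # sent with User-Agent: python-requests/x.y.z
polite = requests.get(url, headers=headers)  # sent with the browser-style User-Agent
# Sites that gate on User-Agent typically return 403 for the first call and 200 for the second
print(bare.status_code, polite.status_code)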
@@ -893,7 +893,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
@@ -547,7 +547,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,