{
"cells": [
{
"cell_type": "raw",
"id": "c6227d68-b1f4-4f71-9cc6-18aa3ce54209",
"metadata": {},
"source": [
"# FirstPage URL Summarizer (OpenAI)\n",
"\n",
"#This notebook does not crawl a whole site. It only fetches the first page for each provided URL and asks OpenAI to summarize it.\n",
"\n",
"### What it does\n",
"Loads a list of URLs (provided inline or from a file)\n",
"Fetches each page with `aiohttp` (HTML only)\n",
"Extracts text via BeautifulSoup (basic)\n",
"Calls OpenAI to produce a structured JSON summary\n",
"Exports a CSV with: url, http_status, title, meta_description, summary, category, key_entities\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b0fe0e9-228e-461b-9a3e-f4392974c974",
"metadata": {},
"outputs": [],
"source": [
"# (Optional) If running locally, install deps here\n",
"import sys, subprocess\n",
"def pip_install(pkgs):\n",
" subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n",
"\n",
"pkgs = [\n",
" \"aiohttp>=3.10\",\n",
" \"beautifulsoup4>=4.12\",\n",
" \"lxml>=5.2\",\n",
" \"pandas>=2.2\",\n",
" \"python-dotenv>=1.0\",\n",
" \"openai>=1.51\",\n",
"]\n",
"try:\n",
" import aiohttp, bs4, lxml, pandas, dotenv, openai\n",
"except Exception:\n",
" pip_install(pkgs)\n",
"print(\"Ready ✔\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86134741-0f8c-4049-894c-f31b27701da8",
"metadata": {},
"outputs": [],
"source": [
"import os, asyncio, aiohttp, pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv() # reads .env if present\n",
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"if not OPENAI_API_KEY:\n",
" print(\"Set OPENAI_API_KEY in .env or environment.\")\n",
"client = OpenAI(api_key=OPENAI_API_KEY)\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com\"}"
]
},
{
"cell_type": "raw",
"id": "b96c4ed0-4c50-4347-8cc4-22ea21e7e483",
"metadata": {},
"source": [
"## 1) Provide URLs\n",
"You can paste a small list below, or set `URLS_FILE` to a text/CSV file containing URLs (one per line or in a column named `url`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce4aef5-8df8-4f47-91b3-c3ecc7c4c8be",
"metadata": {},
"outputs": [],
"source": [
"URLS_INLINE = [\n",
" \"https://edwarddonner.com\"\n",
"]\n",
"URLS_FILE = None # e.g., \"urls.txt\" or \"urls.csv\""
]
},
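{
"cell_type": "markdown",
"id": "urls-file-example-note",
"metadata": {},
"source": [
"Optional sketch: if you want to try the file-based path, the next cell writes a tiny sample `urls.csv`. The filename and URLs are placeholders only; point `URLS_FILE` at your own file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "urls-file-example",
"metadata": {},
"outputs": [],
"source": [
"# (Optional, illustrative) Write a tiny sample URL file with a single \"url\" column.\n",
"# \"urls.csv\" is just an example filename; replace it with your own file if you have one.\n",
"import pandas as pd\n",
"pd.DataFrame({\"url\": [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]}).to_csv(\"urls.csv\", index=False)\n",
"# Then set: URLS_FILE = \"urls.csv\"  (and re-run the loading cell below)"
]
},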
{
"cell_type": "code",
"execution_count": null,
"id": "ba9f6f25-a04c-44fe-a16c-f7b5c47ed100",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"def load_urls(urls_inline, urls_file):\n",
" urls = []\n",
" if urls_file and os.path.exists(urls_file):\n",
" if urls_file.endswith(\".csv\"):\n",
" df = pd.read_csv(urls_file)\n",
" if \"url\" in df.columns:\n",
" urls.extend(df[\"url\"].dropna().tolist())\n",
" else:\n",
" with open(urls_file, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line=line.strip()\n",
" if line:\n",
" urls.append(line)\n",
" urls.extend([u for u in urls_inline if u])\n",
" # de-dup while preserving order\n",
" seen=set(); out=[]\n",
" for u in urls:\n",
" if u not in seen:\n",
" seen.add(u); out.append(u)\n",
" return out\n",
"\n",
"URLS = load_urls(URLS_INLINE, URLS_FILE)\n",
"print(f\"Loaded {len(URLS)} URLs\")"
]
},
{
"cell_type": "raw",
"id": "bb3761f0-3684-4f30-92e9-869fd4556529",
"metadata": {},
"source": [
"## 2) Fetch first page HTML only\n",
"This grabs the main HTML and extracts simple metadata and body text. No link following."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a7582b6-8277-4967-9d98-8cceeeab486d",
"metadata": {},
"outputs": [],
"source": [
"from aiohttp import ClientTimeout\n",
"from bs4 import BeautifulSoup\n",
"try:\n",
" from bs4 import FeatureNotFound\n",
"except Exception:\n",
" class FeatureNotFound(Exception):\n",
" ...\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}\n",
"\n",
"async def fetch_one(session, url):\n",
" \"\"\"Fetch just one page (HTML if available).\"\"\"\n",
" try:\n",
" async with session.get(\n",
" url,\n",
" timeout=ClientTimeout(total=20),\n",
" headers=DEFAULT_HEADERS,\n",
" allow_redirects=True\n",
" ) as r:\n",
" ctype = r.headers.get(\"Content-Type\", \"\") or \"\"\n",
" is_html = \"html\" in ctype.lower()\n",
" text = await r.text(errors=\"ignore\") if is_html else \"\"\n",
" return {\n",
" \"url\": str(r.url),\n",
" \"status\": r.status,\n",
" \"content_type\": ctype,\n",
" \"html\": text,\n",
" }\n",
" except Exception as e:\n",
" return {\"url\": url, \"status\": None, \"content_type\": \"\", \"html\": \"\", \"error\": str(e)}\n",
"\n",
"def make_soup(html: str) -> BeautifulSoup:\n",
" \"\"\"Try lxml parser first, fall back to built-in html.parser if missing.\"\"\"\n",
" try:\n",
" return BeautifulSoup(html, \"lxml\")\n",
" except FeatureNotFound:\n",
" return BeautifulSoup(html, \"html.parser\")\n",
"\n",
"def extract_fields(url, html):\n",
" \"\"\"Extract title, meta description, and text from HTML.\"\"\"\n",
" soup = make_soup(html)\n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"\"\n",
"\n",
" meta_desc = \"\"\n",
" m = soup.find(\"meta\", attrs={\"name\": \"description\"})\n",
" if m and m.get(\"content\"):\n",
" meta_desc = m[\"content\"].strip()\n",
"\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
"\n",
" text = soup.get_text(\" \", strip=True)\n",
" text = text[:8000] # truncate to limit token size\n",
" return title, meta_desc, text\n",
"\n",
"async def fetch_all(urls):\n",
" \"\"\"Fetch and extract fields for a list of URLs (first page only).\"\"\"\n",
" import aiohttp\n",
" out = []\n",
" async with aiohttp.ClientSession() as session:\n",
" for u in urls:\n",
" resp = await fetch_one(session, u)\n",
" if resp.get(\"html\"):\n",
" title, meta_desc, text = extract_fields(resp[\"url\"], resp[\"html\"])\n",
" resp.update({\"title\": title, \"meta_description\": meta_desc, \"text\": text})\n",
" out.append(resp)\n",
" return out\n",
"\n",
"# Example usage in notebook (if URLS is defined):\n",
"# results = await fetch_all(URLS)\n",
"# len(results), results[:1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d30a3c6d-b208-4d6b-a5ea-e4276935a629",
"metadata": {},
"outputs": [],
"source": [
"URLS = [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]\n",
"results = await fetch_all(URLS)\n",
"len(results), results[:1]\n"
]
},
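{
"cell_type": "markdown",
"id": "summarize-with-openai-heading",
"metadata": {},
"source": [
"## 3) Summarize each page with OpenAI\n",
"Each successfully fetched HTML page is sent to the model, which returns a JSON object with title, meta_description, summary, category, and key_entities."
]
},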
{
"cell_type": "code",
"execution_count": null,
"id": "b2a53f08-4374-4125-9de8-6e1060e31200",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"\n",
"SYSTEM_PROMPT = \"\"\"\n",
"You summarize a web page for migration planning. \n",
"Return JSON with:\n",
"- title: short page title\n",
"- meta_description: concise (<= 160 chars)\n",
"- summary: 3-5 bullet points as a single string\n",
"- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]\n",
"- key_entities: array of 3-8 important entities/keywords\n",
"\"\"\"\n",
"\n",
"def summarize_page(row):\n",
" user = (\n",
" f\"URL: {row['url']}\\n\"\n",
" f\"<title>{row.get('title','')}</title>\\n\"\n",
" f\"<meta_description>{row.get('meta_description','')}</meta_description>\\n\"\n",
" f\"<text>\\n{row.get('text','')[:6000]}\\n</text>\"\n",
" )\n",
" resp = client.responses.create(\n",
" model=MODEL,\n",
" input=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": user},\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" return json.loads(resp.output[0].content[0].text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59f7d992-e7f0-4287-bd19-f8062fefe8c3",
"metadata": {},
"outputs": [],
"source": [
"enriched = []\n",
"for r in results:\n",
" if r.get(\"status\") and 200 <= r[\"status\"] < 400 and \"html\" in r.get(\"content_type\",\"\").lower():\n",
" try:\n",
" data = summarize_page(r)\n",
" enriched.append({**r, **data})\n",
" except Exception as e:\n",
" enriched.append({**r, \"error\": str(e)})\n",
" else:\n",
" enriched.append({**r, \"error\": \"Non-HTML or bad status\"})\n"
]
},
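{
"cell_type": "markdown",
"id": "export-to-csv-heading",
"metadata": {},
"source": [
"## 4) Export to CSV\n",
"Collect the enriched rows into a DataFrame and write `firstpage_summary.csv`."
]
},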
{
"cell_type": "code",
"execution_count": null,
"id": "822d8108-64c2-4cf1-abc5-1acd288b7574",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(enriched)\n",
"df.to_csv(\"firstpage_summary.csv\", index=False)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f05d05c-bf6d-4236-8767-8695e4d4618f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}