{ "cells": [ { "cell_type": "markdown", "id": "c6227d68-b1f4-4f71-9cc6-18aa3ce54209", "metadata": {}, "source": [ "# First‑Page URL Summarizer (OpenAI)\n", "\n", "This notebook does not crawl a whole site. It only fetches the first page for each provided URL and asks OpenAI to summarize it.\n", "\n", "### What it does\n", "- Loads a list of URLs (provided inline or from a file)\n", "- Fetches each page with `aiohttp` (HTML only)\n", "- Extracts text via BeautifulSoup (basic)\n", "- Calls OpenAI to produce a structured JSON summary\n", "- Exports a CSV with: url, http_status, title, meta_description, summary, category, key_entities\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0b0fe0e9-228e-461b-9a3e-f4392974c974", "metadata": {}, "outputs": [], "source": [ "# (Optional) If running locally, install deps here\n", "import sys, subprocess\n", "def pip_install(pkgs):\n", "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n", "\n", "pkgs = [\n", "    \"aiohttp>=3.10\",\n", "    \"beautifulsoup4>=4.12\",\n", "    \"lxml>=5.2\",\n", "    \"pandas>=2.2\",\n", "    \"python-dotenv>=1.0\",\n", "    \"openai>=1.51\",\n", "]\n", "try:\n", "    import aiohttp, bs4, lxml, pandas, dotenv, openai\n", "except Exception:\n", "    pip_install(pkgs)\n", "print(\"Ready ✔\")" ] }, { "cell_type": "code", "execution_count": null, "id": "86134741-0f8c-4049-894c-f31b27701da8", "metadata": {}, "outputs": [], "source": [ "import os, asyncio, aiohttp, pandas as pd\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urlparse\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "\n", "load_dotenv()  # reads .env if present\n", "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", "MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n", "if not OPENAI_API_KEY:\n", "    print(\"Set OPENAI_API_KEY in .env or environment.\")\n", "client = OpenAI(api_key=OPENAI_API_KEY)\n", "\n", "DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}" ] }, { "cell_type": "markdown", "id": "b96c4ed0-4c50-4347-8cc4-22ea21e7e483", "metadata": {}, "source": [ "## 1) Provide URLs\n", "You can paste a small list below, or set `URLS_FILE` to a text/CSV file containing URLs (one per line or in a column named `url`)."
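, "\n", "\n", "For example, a `urls.txt` would hold one URL per line and a `urls.csv` would have a header row with a `url` column; the file below is just an illustrative placeholder:\n", "\n", "```text\n", "https://example.com\n", "https://example.com/pricing\n", "https://example.com/blog/launch-post\n", "```"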
] }, { "cell_type": "code", "execution_count": null, "id": "7ce4aef5-8df8-4f47-91b3-c3ecc7c4c8be", "metadata": {}, "outputs": [], "source": [ "URLS_INLINE = [\n", "    \"https://edwarddonner.com\"\n", "]\n", "URLS_FILE = None  # e.g., \"urls.txt\" or \"urls.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "ba9f6f25-a04c-44fe-a16c-f7b5c47ed100", "metadata": {}, "outputs": [], "source": [ "def load_urls(urls_inline, urls_file):\n", "    urls = []\n", "    if urls_file and os.path.exists(urls_file):\n", "        if urls_file.endswith(\".csv\"):\n", "            df = pd.read_csv(urls_file)\n", "            if \"url\" in df.columns:\n", "                urls.extend(df[\"url\"].dropna().tolist())\n", "            else:\n", "                print(f\"Warning: no 'url' column found in {urls_file}\")\n", "        else:\n", "            with open(urls_file, \"r\", encoding=\"utf-8\") as f:\n", "                for line in f:\n", "                    line = line.strip()\n", "                    if line:\n", "                        urls.append(line)\n", "    urls.extend([u for u in urls_inline if u])\n", "    # de-dup while preserving order\n", "    seen, out = set(), []\n", "    for u in urls:\n", "        if u not in seen:\n", "            seen.add(u)\n", "            out.append(u)\n", "    return out\n", "\n", "URLS = load_urls(URLS_INLINE, URLS_FILE)\n", "print(f\"Loaded {len(URLS)} URLs\")" ] }, { "cell_type": "markdown", "id": "bb3761f0-3684-4f30-92e9-869fd4556529", "metadata": {}, "source": [ "## 2) Fetch first page HTML only\n", "This grabs the main HTML and extracts simple metadata and body text. No link following." ] }, { "cell_type": "code", "execution_count": null, "id": "7a7582b6-8277-4967-9d98-8cceeeab486d", "metadata": {}, "outputs": [], "source": [ "import aiohttp\n", "from aiohttp import ClientTimeout\n", "from bs4 import BeautifulSoup\n", "try:\n", "    from bs4 import FeatureNotFound\n", "except Exception:\n", "    class FeatureNotFound(Exception):\n", "        ...\n", "\n", "DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}\n", "\n", "async def fetch_one(session, url):\n", "    \"\"\"Fetch just one page (HTML if available).\"\"\"\n", "    try:\n", "        async with session.get(\n", "            url,\n", "            timeout=ClientTimeout(total=20),\n", "            headers=DEFAULT_HEADERS,\n", "            allow_redirects=True\n", "        ) as r:\n", "            ctype = r.headers.get(\"Content-Type\", \"\") or \"\"\n", "            is_html = \"html\" in ctype.lower()\n", "            text = await r.text(errors=\"ignore\") if is_html else \"\"\n", "            return {\n", "                \"url\": str(r.url),\n", "                \"status\": r.status,\n", "                \"content_type\": ctype,\n", "                \"html\": text,\n", "            }\n", "    except Exception as e:\n", "        return {\"url\": url, \"status\": None, \"content_type\": \"\", \"html\": \"\", \"error\": str(e)}\n", "\n", "def make_soup(html: str) -> BeautifulSoup:\n", "    \"\"\"Try lxml parser first, fall back to built-in html.parser if missing.\"\"\"\n", "    try:\n", "        return BeautifulSoup(html, \"lxml\")\n", "    except FeatureNotFound:\n", "        return BeautifulSoup(html, \"html.parser\")\n", "\n", "def extract_fields(url, html):\n", "    \"\"\"Extract title, meta description, and text from HTML.\"\"\"\n", "    soup = make_soup(html)\n", "    title = soup.title.string.strip() if soup.title and soup.title.string else \"\"\n", "\n", "    meta_desc = \"\"\n", "    m = soup.find(\"meta\", attrs={\"name\": \"description\"})\n", "    if m and m.get(\"content\"):\n", "        meta_desc = m[\"content\"].strip()\n", "\n", "    for tag in soup([\"script\", \"style\", \"noscript\"]):\n", "        tag.decompose()\n", "\n", "    text = soup.get_text(\" \", strip=True)\n", "    text = text[:8000]  # truncate to limit token size\n", "    return title, meta_desc, text\n", "\n", "async def fetch_all(urls):\n", "    \"\"\"Fetch and extract fields for a list of URLs (first page only).\"\"\"\n",
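"    # Fetch pages sequentially; per-URL errors are captured by fetch_one (status None, empty html) rather than raised\n",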
"    out = []\n", "    async with aiohttp.ClientSession() as session:\n", "        for u in urls:\n", "            resp = await fetch_one(session, u)\n", "            if resp.get(\"html\"):\n", "                title, meta_desc, text = extract_fields(resp[\"url\"], resp[\"html\"])\n", "                resp.update({\"title\": title, \"meta_description\": meta_desc, \"text\": text})\n", "            out.append(resp)\n", "    return out\n", "\n", "# Example usage in notebook (if URLS is defined):\n", "# results = await fetch_all(URLS)\n", "# len(results), results[:1]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d30a3c6d-b208-4d6b-a5ea-e4276935a629", "metadata": {}, "outputs": [], "source": [ "# Uses the URLS loaded in step 1; uncomment the next line to run a quick demo list instead\n", "# URLS = [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]\n", "results = await fetch_all(URLS)\n", "len(results), results[:1]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b2a53f08-4374-4125-9de8-6e1060e31200", "metadata": {}, "outputs": [], "source": [ "import os, json\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "\n", "load_dotenv()\n", "client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n", "MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n", "\n", "SYSTEM_PROMPT = \"\"\"\n", "You summarize a web page for migration planning.\n", "Return JSON with:\n", "- title: short page title\n", "- meta_description: concise (<= 160 chars)\n", "- summary: 3-5 bullet points as a single string\n", "- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]\n", "- key_entities: array of 3-8 important entities/keywords\n", "\"\"\"\n", "\n", "def summarize_page(row):\n", "    user = (\n", "        f\"URL: {row['url']}\\n\"\n", "        f\"