Merge pull request #666 from vinitparak/community-contributions-branch

integration of OpenAI-based page summarization
This commit is contained in:
Ed Donner
2025-09-19 19:41:42 -04:00
committed by GitHub
3 changed files with 1694 additions and 0 deletions

@@ -0,0 +1,333 @@
{
"cells": [
{
"cell_type": "raw",
"id": "c6227d68-b1f4-4f71-9cc6-18aa3ce54209",
"metadata": {},
"source": [
"# FirstPage URL Summarizer (OpenAI)\n",
"\n",
"#This notebook does not crawl a whole site. It only fetches the first page for each provided URL and asks OpenAI to summarize it.\n",
"\n",
"### What it does\n",
"Loads a list of URLs (provided inline or from a file)\n",
"Fetches each page with `aiohttp` (HTML only)\n",
"Extracts text via BeautifulSoup (basic)\n",
"Calls OpenAI to produce a structured JSON summary\n",
"Exports a CSV with: url, http_status, title, meta_description, summary, category, key_entities\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b0fe0e9-228e-461b-9a3e-f4392974c974",
"metadata": {},
"outputs": [],
"source": [
"# (Optional) If running locally, install deps here\n",
"import sys, subprocess\n",
"def pip_install(pkgs):\n",
" subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n",
"\n",
"pkgs = [\n",
" \"aiohttp>=3.10\",\n",
" \"beautifulsoup4>=4.12\",\n",
" \"lxml>=5.2\",\n",
" \"pandas>=2.2\",\n",
" \"python-dotenv>=1.0\",\n",
" \"openai>=1.51\",\n",
"]\n",
"try:\n",
" import aiohttp, bs4, lxml, pandas, dotenv, openai\n",
"except Exception:\n",
" pip_install(pkgs)\n",
"print(\"Ready ✔\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86134741-0f8c-4049-894c-f31b27701da8",
"metadata": {},
"outputs": [],
"source": [
"import os, asyncio, aiohttp, pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv() # reads .env if present\n",
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"if not OPENAI_API_KEY:\n",
" print(\"Set OPENAI_API_KEY in .env or environment.\")\n",
"client = OpenAI(api_key=OPENAI_API_KEY)\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com\"}"
]
},
{
"cell_type": "raw",
"id": "b96c4ed0-4c50-4347-8cc4-22ea21e7e483",
"metadata": {},
"source": [
"## 1) Provide URLs\n",
"You can paste a small list below, or set `URLS_FILE` to a text/CSV file containing URLs (one per line or in a column named `url`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce4aef5-8df8-4f47-91b3-c3ecc7c4c8be",
"metadata": {},
"outputs": [],
"source": [
"URLS_INLINE = [\n",
" \"https://edwarddonner.com\"\n",
"]\n",
"URLS_FILE = None # e.g., \"urls.txt\" or \"urls.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9f6f25-a04c-44fe-a16c-f7b5c47ed100",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"def load_urls(urls_inline, urls_file):\n",
" urls = []\n",
" if urls_file and os.path.exists(urls_file):\n",
" if urls_file.endswith(\".csv\"):\n",
" df = pd.read_csv(urls_file)\n",
" if \"url\" in df.columns:\n",
" urls.extend(df[\"url\"].dropna().tolist())\n",
" else:\n",
" with open(urls_file, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line=line.strip()\n",
" if line:\n",
" urls.append(line)\n",
" urls.extend([u for u in urls_inline if u])\n",
" # de-dup while preserving order\n",
" seen=set(); out=[]\n",
" for u in urls:\n",
" if u not in seen:\n",
" seen.add(u); out.append(u)\n",
" return out\n",
"\n",
"URLS = load_urls(URLS_INLINE, URLS_FILE)\n",
"print(f\"Loaded {len(URLS)} URLs\")"
]
},
{
"cell_type": "raw",
"id": "bb3761f0-3684-4f30-92e9-869fd4556529",
"metadata": {},
"source": [
"## 2) Fetch first page HTML only\n",
"This grabs the main HTML and extracts simple metadata and body text. No link following."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a7582b6-8277-4967-9d98-8cceeeab486d",
"metadata": {},
"outputs": [],
"source": [
"from aiohttp import ClientTimeout\n",
"from bs4 import BeautifulSoup\n",
"try:\n",
" from bs4 import FeatureNotFound\n",
"except Exception:\n",
" class FeatureNotFound(Exception):\n",
" ...\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}\n",
"\n",
"async def fetch_one(session, url):\n",
" \"\"\"Fetch just one page (HTML if available).\"\"\"\n",
" try:\n",
" async with session.get(\n",
" url,\n",
" timeout=ClientTimeout(total=20),\n",
" headers=DEFAULT_HEADERS,\n",
" allow_redirects=True\n",
" ) as r:\n",
" ctype = r.headers.get(\"Content-Type\", \"\") or \"\"\n",
" is_html = \"html\" in ctype.lower()\n",
" text = await r.text(errors=\"ignore\") if is_html else \"\"\n",
" return {\n",
" \"url\": str(r.url),\n",
" \"status\": r.status,\n",
" \"content_type\": ctype,\n",
" \"html\": text,\n",
" }\n",
" except Exception as e:\n",
" return {\"url\": url, \"status\": None, \"content_type\": \"\", \"html\": \"\", \"error\": str(e)}\n",
"\n",
"def make_soup(html: str) -> BeautifulSoup:\n",
" \"\"\"Try lxml parser first, fall back to built-in html.parser if missing.\"\"\"\n",
" try:\n",
" return BeautifulSoup(html, \"lxml\")\n",
" except FeatureNotFound:\n",
" return BeautifulSoup(html, \"html.parser\")\n",
"\n",
"def extract_fields(url, html):\n",
" \"\"\"Extract title, meta description, and text from HTML.\"\"\"\n",
" soup = make_soup(html)\n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"\"\n",
"\n",
" meta_desc = \"\"\n",
" m = soup.find(\"meta\", attrs={\"name\": \"description\"})\n",
" if m and m.get(\"content\"):\n",
" meta_desc = m[\"content\"].strip()\n",
"\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
"\n",
" text = soup.get_text(\" \", strip=True)\n",
" text = text[:8000] # truncate to limit token size\n",
" return title, meta_desc, text\n",
"\n",
"async def fetch_all(urls):\n",
" \"\"\"Fetch and extract fields for a list of URLs (first page only).\"\"\"\n",
" import aiohttp\n",
" out = []\n",
" async with aiohttp.ClientSession() as session:\n",
" for u in urls:\n",
" resp = await fetch_one(session, u)\n",
" if resp.get(\"html\"):\n",
" title, meta_desc, text = extract_fields(resp[\"url\"], resp[\"html\"])\n",
" resp.update({\"title\": title, \"meta_description\": meta_desc, \"text\": text})\n",
" out.append(resp)\n",
" return out\n",
"\n",
"# Example usage in notebook (if URLS is defined):\n",
"# results = await fetch_all(URLS)\n",
"# len(results), results[:1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d30a3c6d-b208-4d6b-a5ea-e4276935a629",
"metadata": {},
"outputs": [],
"source": [
"URLS = [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]\n",
"results = await fetch_all(URLS)\n",
"len(results), results[:1]\n"
]
},
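{
"cell_type": "markdown",
"id": "c1f2a3b4-7d85-4e2f-9b10-5a6c7d8e9f01",
"metadata": {},
"source": [
"## 3) Summarize with OpenAI\n",
"Each page's title, meta description, and truncated body text are sent to the model, which returns a structured JSON summary (title, meta_description, summary, category, key_entities)."
]
},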
{
"cell_type": "code",
"execution_count": null,
"id": "b2a53f08-4374-4125-9de8-6e1060e31200",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"\n",
"SYSTEM_PROMPT = \"\"\"\n",
"You summarize a web page for migration planning. \n",
"Return JSON with:\n",
"- title: short page title\n",
"- meta_description: concise (<= 160 chars)\n",
"- summary: 3-5 bullet points as a single string\n",
"- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]\n",
"- key_entities: array of 3-8 important entities/keywords\n",
"\"\"\"\n",
"\n",
"def summarize_page(row):\n",
" user = (\n",
" f\"URL: {row['url']}\\n\"\n",
" f\"<title>{row.get('title','')}</title>\\n\"\n",
" f\"<meta_description>{row.get('meta_description','')}</meta_description>\\n\"\n",
" f\"<text>\\n{row.get('text','')[:6000]}\\n</text>\"\n",
" )\n",
" resp = client.responses.create(\n",
" model=MODEL,\n",
" input=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": user},\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" return json.loads(resp.output[0].content[0].text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59f7d992-e7f0-4287-bd19-f8062fefe8c3",
"metadata": {},
"outputs": [],
"source": [
"enriched = []\n",
"for r in results:\n",
" if r.get(\"status\") and 200 <= r[\"status\"] < 400 and \"html\" in r.get(\"content_type\",\"\").lower():\n",
" try:\n",
" data = summarize_page(r)\n",
" enriched.append({**r, **data})\n",
" except Exception as e:\n",
" enriched.append({**r, \"error\": str(e)})\n",
" else:\n",
" enriched.append({**r, \"error\": \"Non-HTML or bad status\"})\n"
]
},
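{
"cell_type": "markdown",
"id": "d2e3f4a5-8b96-4c3d-a021-6b7c8d9e0f12",
"metadata": {},
"source": [
"## 4) Export results to CSV\n",
"All collected fields, including any per-URL errors, are written to `firstpage_summary.csv`."
]
},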
{
"cell_type": "code",
"execution_count": null,
"id": "822d8108-64c2-4cf1-abc5-1acd288b7574",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(enriched)\n",
"df.to_csv(\"firstpage_summary.csv\", index=False)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f05d05c-bf6d-4236-8767-8695e4d4618f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

@@ -0,0 +1,6 @@
aiohttp>=3.10
beautifulsoup4>=4.12
lxml>=5.2
pandas>=2.2
python-dotenv>=1.0
openai>=1.51