Merge pull request #666 from vinitparak/community-contributions-branch

integration of OpenAI-based page summarization
This commit is contained in:
Ed Donner
2025-09-19 19:41:42 -04:00
committed by GitHub
3 changed files with 1694 additions and 0 deletions

@@ -0,0 +1,333 @@
{
"cells": [
{
"cell_type": "raw",
"id": "c6227d68-b1f4-4f71-9cc6-18aa3ce54209",
"metadata": {},
"source": [
"# FirstPage URL Summarizer (OpenAI)\n",
"\n",
"#This notebook does not crawl a whole site. It only fetches the first page for each provided URL and asks OpenAI to summarize it.\n",
"\n",
"### What it does\n",
"Loads a list of URLs (provided inline or from a file)\n",
"Fetches each page with `aiohttp` (HTML only)\n",
"Extracts text via BeautifulSoup (basic)\n",
"Calls OpenAI to produce a structured JSON summary\n",
"Exports a CSV with: url, http_status, title, meta_description, summary, category, key_entities\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b0fe0e9-228e-461b-9a3e-f4392974c974",
"metadata": {},
"outputs": [],
"source": [
"# (Optional) If running locally, install deps here\n",
"import sys, subprocess\n",
"def pip_install(pkgs):\n",
" subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n",
"\n",
"pkgs = [\n",
" \"aiohttp>=3.10\",\n",
" \"beautifulsoup4>=4.12\",\n",
" \"lxml>=5.2\",\n",
" \"pandas>=2.2\",\n",
" \"python-dotenv>=1.0\",\n",
" \"openai>=1.51\",\n",
"]\n",
"try:\n",
" import aiohttp, bs4, lxml, pandas, dotenv, openai\n",
"except Exception:\n",
" pip_install(pkgs)\n",
"print(\"Ready ✔\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86134741-0f8c-4049-894c-f31b27701da8",
"metadata": {},
"outputs": [],
"source": [
"import os, asyncio, aiohttp, pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv() # reads .env if present\n",
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"if not OPENAI_API_KEY:\n",
" print(\"Set OPENAI_API_KEY in .env or environment.\")\n",
"client = OpenAI(api_key=OPENAI_API_KEY)\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com\"}"
]
},
{
"cell_type": "raw",
"id": "b96c4ed0-4c50-4347-8cc4-22ea21e7e483",
"metadata": {},
"source": [
"## 1) Provide URLs\n",
"You can paste a small list below, or set `URLS_FILE` to a text/CSV file containing URLs (one per line or in a column named `url`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce4aef5-8df8-4f47-91b3-c3ecc7c4c8be",
"metadata": {},
"outputs": [],
"source": [
"URLS_INLINE = [\n",
" \"https://edwarddonner.com\"\n",
"]\n",
"URLS_FILE = None # e.g., \"urls.txt\" or \"urls.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9f6f25-a04c-44fe-a16c-f7b5c47ed100",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"def load_urls(urls_inline, urls_file):\n",
" urls = []\n",
" if urls_file and os.path.exists(urls_file):\n",
" if urls_file.endswith(\".csv\"):\n",
" df = pd.read_csv(urls_file)\n",
" if \"url\" in df.columns:\n",
" urls.extend(df[\"url\"].dropna().tolist())\n",
" else:\n",
" with open(urls_file, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line=line.strip()\n",
" if line:\n",
" urls.append(line)\n",
" urls.extend([u for u in urls_inline if u])\n",
" # de-dup while preserving order\n",
" seen=set(); out=[]\n",
" for u in urls:\n",
" if u not in seen:\n",
" seen.add(u); out.append(u)\n",
" return out\n",
"\n",
"URLS = load_urls(URLS_INLINE, URLS_FILE)\n",
"print(f\"Loaded {len(URLS)} URLs\")"
]
},
{
"cell_type": "raw",
"id": "bb3761f0-3684-4f30-92e9-869fd4556529",
"metadata": {},
"source": [
"## 2) Fetch first page HTML only\n",
"This grabs the main HTML and extracts simple metadata and body text. No link following."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a7582b6-8277-4967-9d98-8cceeeab486d",
"metadata": {},
"outputs": [],
"source": [
"from aiohttp import ClientTimeout\n",
"from bs4 import BeautifulSoup\n",
"try:\n",
" from bs4 import FeatureNotFound\n",
"except Exception:\n",
" class FeatureNotFound(Exception):\n",
" ...\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}\n",
"\n",
"async def fetch_one(session, url):\n",
" \"\"\"Fetch just one page (HTML if available).\"\"\"\n",
" try:\n",
" async with session.get(\n",
" url,\n",
" timeout=ClientTimeout(total=20),\n",
" headers=DEFAULT_HEADERS,\n",
" allow_redirects=True\n",
" ) as r:\n",
" ctype = r.headers.get(\"Content-Type\", \"\") or \"\"\n",
" is_html = \"html\" in ctype.lower()\n",
" text = await r.text(errors=\"ignore\") if is_html else \"\"\n",
" return {\n",
" \"url\": str(r.url),\n",
" \"status\": r.status,\n",
" \"content_type\": ctype,\n",
" \"html\": text,\n",
" }\n",
" except Exception as e:\n",
" return {\"url\": url, \"status\": None, \"content_type\": \"\", \"html\": \"\", \"error\": str(e)}\n",
"\n",
"def make_soup(html: str) -> BeautifulSoup:\n",
" \"\"\"Try lxml parser first, fall back to built-in html.parser if missing.\"\"\"\n",
" try:\n",
" return BeautifulSoup(html, \"lxml\")\n",
" except FeatureNotFound:\n",
" return BeautifulSoup(html, \"html.parser\")\n",
"\n",
"def extract_fields(url, html):\n",
" \"\"\"Extract title, meta description, and text from HTML.\"\"\"\n",
" soup = make_soup(html)\n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"\"\n",
"\n",
" meta_desc = \"\"\n",
" m = soup.find(\"meta\", attrs={\"name\": \"description\"})\n",
" if m and m.get(\"content\"):\n",
" meta_desc = m[\"content\"].strip()\n",
"\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
"\n",
" text = soup.get_text(\" \", strip=True)\n",
" text = text[:8000] # truncate to limit token size\n",
" return title, meta_desc, text\n",
"\n",
"async def fetch_all(urls):\n",
" \"\"\"Fetch and extract fields for a list of URLs (first page only).\"\"\"\n",
" import aiohttp\n",
" out = []\n",
" async with aiohttp.ClientSession() as session:\n",
" for u in urls:\n",
" resp = await fetch_one(session, u)\n",
" if resp.get(\"html\"):\n",
" title, meta_desc, text = extract_fields(resp[\"url\"], resp[\"html\"])\n",
" resp.update({\"title\": title, \"meta_description\": meta_desc, \"text\": text})\n",
" out.append(resp)\n",
" return out\n",
"\n",
"# Example usage in notebook (if URLS is defined):\n",
"# results = await fetch_all(URLS)\n",
"# len(results), results[:1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d30a3c6d-b208-4d6b-a5ea-e4276935a629",
"metadata": {},
"outputs": [],
"source": [
"URLS = [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]\n",
"results = await fetch_all(URLS)\n",
"len(results), results[:1]\n"
]
},
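{
"cell_type": "markdown",
"id": "c1f2a3b4-7d85-4e2f-9b10-5a6c7d8e9f01",
"metadata": {},
"source": [
"## 3) Summarize with OpenAI\n",
"Each page's title, meta description, and truncated body text are sent to the model, which returns a structured JSON summary (title, meta_description, summary, category, key_entities)."
]
},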
{
"cell_type": "code",
"execution_count": null,
"id": "b2a53f08-4374-4125-9de8-6e1060e31200",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"\n",
"SYSTEM_PROMPT = \"\"\"\n",
"You summarize a web page for migration planning. \n",
"Return JSON with:\n",
"- title: short page title\n",
"- meta_description: concise (<= 160 chars)\n",
"- summary: 3-5 bullet points as a single string\n",
"- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]\n",
"- key_entities: array of 3-8 important entities/keywords\n",
"\"\"\"\n",
"\n",
"def summarize_page(row):\n",
" user = (\n",
" f\"URL: {row['url']}\\n\"\n",
" f\"<title>{row.get('title','')}</title>\\n\"\n",
" f\"<meta_description>{row.get('meta_description','')}</meta_description>\\n\"\n",
" f\"<text>\\n{row.get('text','')[:6000]}\\n</text>\"\n",
" )\n",
" resp = client.responses.create(\n",
" model=MODEL,\n",
" input=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": user},\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" return json.loads(resp.output[0].content[0].text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59f7d992-e7f0-4287-bd19-f8062fefe8c3",
"metadata": {},
"outputs": [],
"source": [
"enriched = []\n",
"for r in results:\n",
" if r.get(\"status\") and 200 <= r[\"status\"] < 400 and \"html\" in r.get(\"content_type\",\"\").lower():\n",
" try:\n",
" data = summarize_page(r)\n",
" enriched.append({**r, **data})\n",
" except Exception as e:\n",
" enriched.append({**r, \"error\": str(e)})\n",
" else:\n",
" enriched.append({**r, \"error\": \"Non-HTML or bad status\"})\n"
]
},
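{
"cell_type": "markdown",
"id": "d2e3f4a5-8b96-4c3d-a021-6b7c8d9e0f12",
"metadata": {},
"source": [
"## 4) Export results to CSV\n",
"All collected fields, including any per-URL errors, are written to `firstpage_summary.csv`."
]
},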
{
"cell_type": "code",
"execution_count": null,
"id": "822d8108-64c2-4cf1-abc5-1acd288b7574",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(enriched)\n",
"df.to_csv(\"firstpage_summary.csv\", index=False)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f05d05c-bf6d-4236-8767-8695e4d4618f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

@@ -0,0 +1,6 @@
aiohttp>=3.10
beautifulsoup4>=4.12
lxml>=5.2
pandas>=2.2
python-dotenv>=1.0
openai>=1.51