{
"cells": [
{
"cell_type": "raw",
"id": "c6227d68-b1f4-4f71-9cc6-18aa3ce54209",
"metadata": {},
"source": [
"# FirstPage URL Summarizer (OpenAI)\n",
"\n",
"#This notebook does not crawl a whole site. It only fetches the first page for each provided URL and asks OpenAI to summarize it.\n",
"\n",
"### What it does\n",
"Loads a list of URLs (provided inline or from a file)\n",
"Fetches each page with `aiohttp` (HTML only)\n",
"Extracts text via BeautifulSoup (basic)\n",
"Calls OpenAI to produce a structured JSON summary\n",
"Exports a CSV with: url, http_status, title, meta_description, summary, category, key_entities\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b0fe0e9-228e-461b-9a3e-f4392974c974",
"metadata": {},
"outputs": [],
"source": [
"# (Optional) If running locally, install deps here\n",
"import sys, subprocess\n",
"def pip_install(pkgs):\n",
" subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *pkgs])\n",
"\n",
"pkgs = [\n",
" \"aiohttp>=3.10\",\n",
" \"beautifulsoup4>=4.12\",\n",
" \"lxml>=5.2\",\n",
" \"pandas>=2.2\",\n",
" \"python-dotenv>=1.0\",\n",
" \"openai>=1.51\",\n",
"]\n",
"try:\n",
" import aiohttp, bs4, lxml, pandas, dotenv, openai\n",
"except Exception:\n",
" pip_install(pkgs)\n",
"print(\"Ready ✔\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86134741-0f8c-4049-894c-f31b27701da8",
"metadata": {},
"outputs": [],
"source": [
"import os, asyncio, aiohttp, pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv() # reads .env if present\n",
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"if not OPENAI_API_KEY:\n",
" print(\"Set OPENAI_API_KEY in .env or environment.\")\n",
"client = OpenAI(api_key=OPENAI_API_KEY)\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com\"}"
]
},
{
"cell_type": "raw",
"id": "b96c4ed0-4c50-4347-8cc4-22ea21e7e483",
"metadata": {},
"source": [
"## 1) Provide URLs\n",
"You can paste a small list below, or set `URLS_FILE` to a text/CSV file containing URLs (one per line or in a column named `url`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ce4aef5-8df8-4f47-91b3-c3ecc7c4c8be",
"metadata": {},
"outputs": [],
"source": [
"URLS_INLINE = [\n",
" \"https://edwarddonner.com\"\n",
"]\n",
"URLS_FILE = None # e.g., \"urls.txt\" or \"urls.csv\""
]
},
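{
"cell_type": "markdown",
"id": "urls-file-example-note",
"metadata": {},
"source": [
"Optional sketch: if you want to try the file-based path, the next cell writes a tiny sample `urls.csv`. The filename and URLs are placeholders only; point `URLS_FILE` at your own file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "urls-file-example",
"metadata": {},
"outputs": [],
"source": [
"# (Optional, illustrative) Write a tiny sample URL file with a single \"url\" column.\n",
"# \"urls.csv\" is just an example filename; replace it with your own file if you have one.\n",
"import pandas as pd\n",
"pd.DataFrame({\"url\": [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]}).to_csv(\"urls.csv\", index=False)\n",
"# Then set: URLS_FILE = \"urls.csv\"  (and re-run the loading cell below)"
]
},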
{
"cell_type": "code",
"execution_count": null,
"id": "ba9f6f25-a04c-44fe-a16c-f7b5c47ed100",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"def load_urls(urls_inline, urls_file):\n",
" urls = []\n",
" if urls_file and os.path.exists(urls_file):\n",
" if urls_file.endswith(\".csv\"):\n",
" df = pd.read_csv(urls_file)\n",
" if \"url\" in df.columns:\n",
" urls.extend(df[\"url\"].dropna().tolist())\n",
" else:\n",
" with open(urls_file, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" line=line.strip()\n",
" if line:\n",
" urls.append(line)\n",
" urls.extend([u for u in urls_inline if u])\n",
" # de-dup while preserving order\n",
" seen=set(); out=[]\n",
" for u in urls:\n",
" if u not in seen:\n",
" seen.add(u); out.append(u)\n",
" return out\n",
"\n",
"URLS = load_urls(URLS_INLINE, URLS_FILE)\n",
"print(f\"Loaded {len(URLS)} URLs\")"
]
},
{
"cell_type": "raw",
"id": "bb3761f0-3684-4f30-92e9-869fd4556529",
"metadata": {},
"source": [
"## 2) Fetch first page HTML only\n",
"This grabs the main HTML and extracts simple metadata and body text. No link following."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a7582b6-8277-4967-9d98-8cceeeab486d",
"metadata": {},
"outputs": [],
"source": [
"from aiohttp import ClientTimeout\n",
"from bs4 import BeautifulSoup\n",
"try:\n",
" from bs4 import FeatureNotFound\n",
"except Exception:\n",
" class FeatureNotFound(Exception):\n",
" ...\n",
"\n",
"DEFAULT_HEADERS = {\"User-Agent\": \"FirstPageSummarizer/1.0 (+https://edwarddonner.com)\"}\n",
"\n",
"async def fetch_one(session, url):\n",
" \"\"\"Fetch just one page (HTML if available).\"\"\"\n",
" try:\n",
" async with session.get(\n",
" url,\n",
" timeout=ClientTimeout(total=20),\n",
" headers=DEFAULT_HEADERS,\n",
" allow_redirects=True\n",
" ) as r:\n",
" ctype = r.headers.get(\"Content-Type\", \"\") or \"\"\n",
" is_html = \"html\" in ctype.lower()\n",
" text = await r.text(errors=\"ignore\") if is_html else \"\"\n",
" return {\n",
" \"url\": str(r.url),\n",
" \"status\": r.status,\n",
" \"content_type\": ctype,\n",
" \"html\": text,\n",
" }\n",
" except Exception as e:\n",
" return {\"url\": url, \"status\": None, \"content_type\": \"\", \"html\": \"\", \"error\": str(e)}\n",
"\n",
"def make_soup(html: str) -> BeautifulSoup:\n",
" \"\"\"Try lxml parser first, fall back to built-in html.parser if missing.\"\"\"\n",
" try:\n",
" return BeautifulSoup(html, \"lxml\")\n",
" except FeatureNotFound:\n",
" return BeautifulSoup(html, \"html.parser\")\n",
"\n",
"def extract_fields(url, html):\n",
" \"\"\"Extract title, meta description, and text from HTML.\"\"\"\n",
" soup = make_soup(html)\n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"\"\n",
"\n",
" meta_desc = \"\"\n",
" m = soup.find(\"meta\", attrs={\"name\": \"description\"})\n",
" if m and m.get(\"content\"):\n",
" meta_desc = m[\"content\"].strip()\n",
"\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
"\n",
" text = soup.get_text(\" \", strip=True)\n",
" text = text[:8000] # truncate to limit token size\n",
" return title, meta_desc, text\n",
"\n",
"async def fetch_all(urls):\n",
" \"\"\"Fetch and extract fields for a list of URLs (first page only).\"\"\"\n",
" import aiohttp\n",
" out = []\n",
" async with aiohttp.ClientSession() as session:\n",
" for u in urls:\n",
" resp = await fetch_one(session, u)\n",
" if resp.get(\"html\"):\n",
" title, meta_desc, text = extract_fields(resp[\"url\"], resp[\"html\"])\n",
" resp.update({\"title\": title, \"meta_description\": meta_desc, \"text\": text})\n",
" out.append(resp)\n",
" return out\n",
"\n",
"# Example usage in notebook (if URLS is defined):\n",
"# results = await fetch_all(URLS)\n",
"# len(results), results[:1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d30a3c6d-b208-4d6b-a5ea-e4276935a629",
"metadata": {},
"outputs": [],
"source": [
"URLS = [\"https://edwarddonner.com\", \"https://www.wikipedia.org/\"]\n",
"results = await fetch_all(URLS)\n",
"len(results), results[:1]\n"
]
},
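{
"cell_type": "markdown",
"id": "summarize-with-openai-heading",
"metadata": {},
"source": [
"## 3) Summarize each page with OpenAI\n",
"Each successfully fetched HTML page is sent to the model, which returns a JSON object with title, meta_description, summary, category, and key_entities."
]
},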
{
"cell_type": "code",
"execution_count": null,
"id": "b2a53f08-4374-4125-9de8-6e1060e31200",
"metadata": {},
"outputs": [],
"source": [
"import os, json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"MODEL = os.getenv(\"OPENAI_DEFAULT_MODEL\", \"gpt-4.1-mini\")\n",
"\n",
"SYSTEM_PROMPT = \"\"\"\n",
"You summarize a web page for migration planning. \n",
"Return JSON with:\n",
"- title: short page title\n",
"- meta_description: concise (<= 160 chars)\n",
"- summary: 3-5 bullet points as a single string\n",
"- category: one of [blog, docs, product, pricing, careers, marketing, legal, support, account, other]\n",
"- key_entities: array of 3-8 important entities/keywords\n",
"\"\"\"\n",
"\n",
"def summarize_page(row):\n",
" user = (\n",
" f\"URL: {row['url']}\\n\"\n",
" f\"<title>{row.get('title','')}</title>\\n\"\n",
" f\"<meta_description>{row.get('meta_description','')}</meta_description>\\n\"\n",
" f\"<text>\\n{row.get('text','')[:6000]}\\n</text>\"\n",
" )\n",
" resp = client.responses.create(\n",
" model=MODEL,\n",
" input=[\n",
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
" {\"role\": \"user\", \"content\": user},\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" return json.loads(resp.output[0].content[0].text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59f7d992-e7f0-4287-bd19-f8062fefe8c3",
"metadata": {},
"outputs": [],
"source": [
"enriched = []\n",
"for r in results:\n",
" if r.get(\"status\") and 200 <= r[\"status\"] < 400 and \"html\" in r.get(\"content_type\",\"\").lower():\n",
" try:\n",
" data = summarize_page(r)\n",
" enriched.append({**r, **data})\n",
" except Exception as e:\n",
" enriched.append({**r, \"error\": str(e)})\n",
" else:\n",
" enriched.append({**r, \"error\": \"Non-HTML or bad status\"})\n"
]
},
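{
"cell_type": "markdown",
"id": "export-to-csv-heading",
"metadata": {},
"source": [
"## 4) Export to CSV\n",
"Collect the enriched rows into a DataFrame and write `firstpage_summary.csv`."
]
},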
{
"cell_type": "code",
"execution_count": null,
"id": "822d8108-64c2-4cf1-abc5-1acd288b7574",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame(enriched)\n",
"df.to_csv(\"firstpage_summary.csv\", index=False)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f05d05c-bf6d-4236-8767-8695e4d4618f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}