Merge pull request #852 from maherp24/week1_day2

day2_webpage_summarizer
2025-11-01 20:24:53 -04:00
parent 9305c57dbb c5382d871d
commit 1ccbe233a8
1 changed files with 157 additions and 0 deletions
--- a/week1/community-contributions/day2_ollama_webpage_summarizer_bs4.ipynb
+++ b/week1/community-contributions/day2_ollama_webpage_summarizer_bs4.ipynb
@@ -0,0 +1,157 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "33314df1",
   "metadata": {},
   "source": [
    "# Webpage Summarizer with Ollama\n",
    "\n",
    "Scrape any webpage and get a quick summary using the Llama 3.2 model served locally by Ollama.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64917274",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 1: Setup and Dependencies\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import ollama\n",
    "\n",
    "# Check if Ollama is running\n",
    "# ollama.list() is used purely as a connectivity probe; its result is discarded\n",
    "try:\n",
    "    ollama.list()\n",
    "except Exception as e:\n",
    "    print(\"⚠ Warning: Can't connect to Ollama. Make sure it's running!\")\n",
    "    print(\"  Run 'ollama serve' in your terminal if needed.\")\n",
    "    # Surface the underlying error so the cause can be diagnosed\n",
    "    print(f\"  Details: {e}\")\n",
    "else:\n",
    "    # Only announce readiness when the probe succeeded\n",
    "    print(\"✓ Ollama is running!\")\n",
    "    print(\"\\nReady to summarize webpages!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981ff805",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 2: Choose the page to summarize\n",
    "# Edit `url` to point at whichever webpage you want summarized\n",
    "url = \"https://github.com/maherp24\"\n",
    "\n",
    "print(f\"Will summarize: {url}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a45d7c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 3: Fetch the page and extract its paragraph text\n",
    "MAX_CHARS = 4000      # cap on the text passed to the model, to avoid token issues\n",
    "PREVIEW_CHARS = 200   # how much of the extracted text to echo back\n",
    "\n",
    "try:\n",
    "    # Get the webpage; raise_for_status() raises on HTTP 4xx/5xx responses\n",
    "    page_response = requests.get(url, timeout=10)\n",
    "    page_response.raise_for_status()\n",
    "\n",
    "    print(\"✓ Webpage fetched!\")\n",
    "\n",
    "    # Parse HTML and keep only the text inside <p> tags\n",
    "    soup = BeautifulSoup(page_response.content, 'html.parser')\n",
    "    text = ' '.join(p.get_text() for p in soup.find_all('p'))\n",
    "\n",
    "    # Collapse every run of whitespace into a single space\n",
    "    text = ' '.join(text.split())\n",
    "\n",
    "    if not text:\n",
    "        # The fetch succeeded but the page has no paragraph text;\n",
    "        # text stays an empty string so it is distinguishable from a failed scrape (None)\n",
    "        print(\"⚠ No paragraph text found on this page\")\n",
    "    else:\n",
    "        # Remember the real length before truncating so the message is accurate\n",
    "        extracted_chars = len(text)\n",
    "        if extracted_chars > MAX_CHARS:\n",
    "            text = text[:MAX_CHARS]\n",
    "            print(f\"✓ Extracted {extracted_chars} characters (truncated to {MAX_CHARS})\")\n",
    "        else:\n",
    "            print(f\"✓ Extracted {extracted_chars} characters\")\n",
    "\n",
    "        # Only add an ellipsis when the preview actually cuts the text short\n",
    "        suffix = \"...\" if len(text) > PREVIEW_CHARS else \"\"\n",
    "        print(f\"\\nFirst {PREVIEW_CHARS} characters:\\n{text[:PREVIEW_CHARS]}{suffix}\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"❌ Error scraping webpage: {e}\")\n",
    "    # None signals to later cells that the scrape itself failed\n",
    "    text = None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ea2618",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 4: Summarize with Ollama\n",
    "MODEL = 'llama3.2'  # single place to change the model used for the summary\n",
    "\n",
    "if text:\n",
    "    print(\"\\nGenerating summary with Ollama...\")\n",
    "    print(\"This might take a few seconds...\\n\")\n",
    "\n",
    "    try:\n",
    "        # Call Ollama; the reply gets its own name so it is not confused\n",
    "        # with the HTTP response object created by the scraping cell\n",
    "        chat_response = ollama.chat(\n",
    "            model=MODEL,\n",
    "            messages=[{\n",
    "                'role': 'user',\n",
    "                'content': f'Summarize this webpage content in 3-5 sentences:\\n\\n{text}'\n",
    "            }]\n",
    "        )\n",
    "\n",
    "        summary = chat_response['message']['content']\n",
    "\n",
    "        print(\"=\" * 60)\n",
    "        print(\"SUMMARY\")\n",
    "        print(\"=\" * 60)\n",
    "        print(summary)\n",
    "        print(\"=\" * 60)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Error with Ollama: {e}\")\n",
    "        print(\"\\nMake sure:\")\n",
    "        print(\"  1. Ollama is running ('ollama serve')\")\n",
    "        print(f\"  2. {MODEL} model is installed ('ollama pull {MODEL}')\")\n",
    "elif text is None:\n",
    "    # The scraping cell sets text to None when fetching or parsing raised\n",
    "    print(\"No text to summarize (scraping failed)\")\n",
    "else:\n",
    "    # Empty string: the page was fetched but yielded no paragraph text\n",
    "    print(\"No text to summarize (no paragraph text found on the page)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d78d1ee1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }