Merge pull request #852 from maherp24/week1_day2

day2_webpage_summarizer
This commit is contained in:
Ed Donner
2025-11-01 20:24:53 -04:00
committed by GitHub

View File

@@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "33314df1",
"metadata": {},
"source": [
"# Webpage Summarizer with Ollama\n",
"\n",
"Scrape any webpage and get a quick summary using the Llama 3.2 model running locally via Ollama.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64917274",
"metadata": {},
"outputs": [],
"source": [
"# Cell 1: Setup and Dependencies\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import ollama\n",
"\n",
"# Probe the local Ollama server up front so a connection problem is reported\n",
"# here, with its cause, instead of first surfacing when a summary is requested.\n",
"try:\n",
"    ollama.list()\n",
"except Exception as e:\n",
"    # Best-effort check: report the failure but let the notebook continue.\n",
"    print(f\"⚠ Warning: Can't connect to Ollama ({e}). Make sure it's running!\")\n",
"    print(\" Run 'ollama serve' in your terminal if needed.\")\n",
"else:\n",
"    print(\"✓ Ollama is running!\")\n",
"\n",
"print(\"\\nReady to summarize webpages!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "981ff805",
"metadata": {},
"outputs": [],
"source": [
"# Cell 2: Choose the page to summarize\n",
"# `url` is read by the scraping cell; point it at any webpage you like.\n",
"url = \"https://github.com/maherp24\"\n",
"\n",
"print(f\"Will summarize: {url}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a45d7c1",
"metadata": {},
"outputs": [],
"source": [
"# Cell 3: Fetch the page and extract its paragraph text\n",
"MAX_CHARS = 4000  # cap on the text sent to the model, to avoid token issues\n",
"\n",
"try:\n",
"    # Get the webpage; raise_for_status() raises on HTTP 4xx/5xx responses\n",
"    response = requests.get(url, timeout=10)\n",
"    response.raise_for_status()\n",
"\n",
"    print(\"✓ Webpage fetched!\")\n",
"\n",
"    # Parse HTML\n",
"    soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
"    # Extract text from <p> tags only, then collapse every run of whitespace\n",
"    text = ' '.join(p.get_text() for p in soup.find_all('p'))\n",
"    text = ' '.join(text.split())\n",
"\n",
"    if not text:\n",
"        # The fetch worked but there is nothing to summarize. `text` stays\n",
"        # empty (falsy), so the `if text:` guard before summarizing is not taken.\n",
"        print(\"⚠ No paragraph text found on this page\")\n",
"    else:\n",
"        # Remember the full length so the message reports what was really extracted\n",
"        total_chars = len(text)\n",
"        if total_chars > MAX_CHARS:\n",
"            text = text[:MAX_CHARS]\n",
"            print(f\"✓ Extracted {total_chars} characters (truncated to {MAX_CHARS})\")\n",
"        else:\n",
"            print(f\"✓ Extracted {total_chars} characters\")\n",
"\n",
"        print(f\"\\nFirst 200 characters:\\n{text[:200]}...\")\n",
"\n",
"except Exception as e:\n",
"    # Leave `text` as None so later cells can tell that scraping failed\n",
"    print(f\"❌ Error scraping webpage: {e}\")\n",
"    text = None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66ea2618",
"metadata": {},
"outputs": [],
"source": [
"# Cell 4: Summarize with Ollama\n",
"MODEL = 'llama3.2'  # single source of truth for the model name used below\n",
"\n",
"if text:\n",
"    print(\"\\nGenerating summary with Ollama...\")\n",
"    print(\"This might take a few seconds...\\n\")\n",
"\n",
"    try:\n",
"        # Single-turn chat request: the scraped text is sent in one user message\n",
"        response = ollama.chat(\n",
"            model=MODEL,\n",
"            messages=[{\n",
"                'role': 'user',\n",
"                'content': f'Summarize this webpage content in 3-5 sentences:\\n\\n{text}'\n",
"            }]\n",
"        )\n",
"\n",
"        summary = response['message']['content']\n",
"\n",
"        print(\"=\" * 60)\n",
"        print(\"SUMMARY\")\n",
"        print(\"=\" * 60)\n",
"        print(summary)\n",
"        print(\"=\" * 60)\n",
"\n",
"    except Exception as e:\n",
"        # Report the error plus the two most likely causes; do not re-raise\n",
"        print(f\"❌ Error with Ollama: {e}\")\n",
"        print(\"\\nMake sure:\")\n",
"        print(\" 1. Ollama is running ('ollama serve')\")\n",
"        print(f\" 2. {MODEL} model is installed ('ollama pull {MODEL}')\")\n",
"else:\n",
"    # `text` is None or empty, so there is nothing to send to the model\n",
"    print(\"No text to summarize (scraping failed)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d78d1ee1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}