Add DeepSeek exercise notebook for website summarization
week1/community-contributions/day2 EXERCISE_deepseek-r1.ipynb (new file, 213 lines)
@@ -0,0 +1,213 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "bc7d1de3-e2ac-46ff-a302-3b4ba38c4c90",
"metadata": {},
"source": [
"## Also trying the amazing reasoning model DeepSeek\n",
"\n",
"Here we use the version of DeepSeek-reasoner that's been distilled to 1.5B.\n",
"This is actually a 1.5B variant of Qwen that has been fine-tuned using synthetic data generated by DeepSeek-R1.\n",
"\n",
"Other sizes of DeepSeek are [here](https://ollama.com/library/deepseek-r1), all the way up to the full 671B parameter version, which would use up 404GB of your drive and is far too large for most!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf9eb44e-fe5b-47aa-b719-0bb63669ab3d",
"metadata": {},
"outputs": [],
"source": [
"!ollama pull deepseek-r1:1.5b"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bdcd35a",
"metadata": {},
"outputs": [],
"source": [
"!ollama pull deepseek-r1:8b"
]
},
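{
"cell_type": "markdown",
"id": "check-models-md",
"metadata": {},
"source": [
"Optional extra (an addition to the original notebook): confirm the DeepSeek models downloaded correctly by listing what Ollama has available locally."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "check-models",
"metadata": {},
"outputs": [],
"source": [
"# Optional check: list the models Ollama has pulled locally\n",
"!ollama list"
]
},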
{
"cell_type": "markdown",
"id": "1622d9bb-5c68-4d4e-9ca4-b492c751f898",
"metadata": {},
"source": [
"# NOW the exercise for you\n",
"\n",
"Take the code from day1 and incorporate it here, to build a website summarizer that uses DeepSeek-R1 running locally instead of OpenAI; use either of the models pulled above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c106420",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import requests\n",
"import ollama\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22d62f00",
"metadata": {},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
"MODEL = \"deepseek-r1:8b\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
"    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"\n",
"    def __init__(self, url):\n",
"        \"\"\"\n",
"        Create this Website object from the given url using the BeautifulSoup library\n",
"        \"\"\"\n",
"        self.url = url\n",
"        response = requests.get(url, headers=headers)\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        self.title = soup.title.string if soup.title else \"No title found\"\n",
"        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"        self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
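{
"cell_type": "markdown",
"id": "website-check-md",
"metadata": {},
"source": [
"A quick optional sanity check (an addition, not part of the original exercise): build a `Website` for the page we summarize at the end and peek at the extracted title and text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "website-check",
"metadata": {},
"outputs": [],
"source": [
"# Optional check: scrape one page and inspect what the Website class extracted\n",
"ed = Website(\"https://edwarddonner.com\")\n",
"print(ed.title)\n",
"print(ed.text[:500])"
]
},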
{
"cell_type": "code",
"execution_count": null,
"id": "4449b7dc",
"metadata": {},
"outputs": [],
"source": [
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.'\n",
"\n",
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
"and provides a short summary, ignoring text that might be navigation related. \\\n",
"Respond in markdown.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "daca9448",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website):\n",
"    user_prompt = f\"You are looking at a website titled {website.title}\"\n",
"    user_prompt += \"\\nThe contents of this website are as follows; \\\n",
"please provide a short summary of this website in markdown. \\\n",
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
"    user_prompt += website.text\n",
"    return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ec9d5d2",
"metadata": {},
"outputs": [],
"source": [
"# This function builds the messages list in the system/user format the chat API expects\n",
"\n",
"def messages_for(website):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e1ab04a",
"metadata": {},
"outputs": [],
"source": [
"# And now: call DeepSeek-R1 locally via the Ollama Python package. You will get very familiar with this!\n",
"\n",
"def summarize(url):\n",
"    website = Website(url)\n",
"    response = ollama.chat(\n",
"        model=MODEL,\n",
"        messages=messages_for(website)\n",
"    )\n",
"    return response['message']['content']"
]
},
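{
"cell_type": "markdown",
"id": "requests-alternative-md",
"metadata": {},
"source": [
"The constants cell defines `OLLAMA_API` and `HEADERS`, which the `ollama` package approach above doesn't need. As a sketch of the other approach from day2, here is a version that posts to Ollama's HTTP chat endpoint directly with `requests` (the name `summarize_via_api` is just an illustrative choice)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "requests-alternative",
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch: call Ollama's HTTP endpoint directly with requests,\n",
"# using the OLLAMA_API and HEADERS constants defined earlier\n",
"\n",
"def summarize_via_api(url):\n",
"    website = Website(url)\n",
"    payload = {\n",
"        \"model\": MODEL,\n",
"        \"messages\": messages_for(website),\n",
"        \"stream\": False\n",
"    }\n",
"    response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)\n",
"    return response.json()[\"message\"][\"content\"]"
]
},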
{
"cell_type": "code",
"execution_count": null,
"id": "0d3b5628",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
"    summary = summarize(url)\n",
"    display(Markdown(summary))"
]
},
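{
"cell_type": "markdown",
"id": "strip-thinking-md",
"metadata": {},
"source": [
"One more optional helper (an addition, assuming the distilled DeepSeek-R1 models wrap their reasoning in `<think>...</think>` tags before the final answer): strip that section so only the summary itself is rendered."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "strip-thinking",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Remove the <think>...</think> reasoning block (if present) before rendering\n",
"def display_summary_clean(url):\n",
"    summary = summarize(url)\n",
"    cleaned = re.sub(r\"<think>.*?</think>\", \"\", summary, flags=re.DOTALL).strip()\n",
"    display(Markdown(cleaned))"
]
},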
{
"cell_type": "code",
"execution_count": null,
"id": "938e5633",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://edwarddonner.com\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llms",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}