{ "cells": [ { "cell_type": "code", "execution_count": 9, "id": "fc57c47f-31fc-4527-af71-ce117d35c480", "metadata": {}, "outputs": [], "source": [ "# imports\n", "# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt\n", "\n", "import os\n", "import requests\n", "import json\n", "from typing import List\n", "from dotenv import load_dotenv\n", "from bs4 import BeautifulSoup\n", "from IPython.display import Markdown, display, update_display\n", "from openai import OpenAI\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "d74ea4e7-7d4a-4c85-92d3-8cdb231bc261", "metadata": {}, "outputs": [], "source": [ "import pandas as pd " ] }, { "cell_type": "code", "execution_count": 11, "id": "3eb884ea-02db-4ff8-91f9-c71e40b1cf4a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "API key looks good so far\n" ] } ], "source": [ "# Initialize and constants\n", "\n", "load_dotenv(override=True)\n", "api_key = os.getenv('OPENAI_API_KEY')\n", "\n", "if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n", " print(\"API key looks good so far\")\n", "else:\n", " print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n", " \n", "MODEL = 'gpt-4o-mini'\n", "openai = OpenAI()" ] }, { "cell_type": "code", "execution_count": 12, "id": "d48a7b9b-273d-4bc9-997b-c7112e02528c", "metadata": {}, "outputs": [], "source": [ "# A class to represent a Webpage\n", "\n", "# Some websites need you to use proper headers when fetching them:\n", "headers = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", "}\n", "\n", "class Website:\n", " def __init__(self, url):\n", " self.url = url\n", " response = requests.get(url, headers=headers)\n", " self.body = response.content\n", " soup = BeautifulSoup(self.body, 'html.parser')\n", " self.title = soup.title.string if soup.title else \"No title found\"\n", "\n", " if soup.body:\n", " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", " irrelevant.decompose()\n", " self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n", " else:\n", " self.text = \"\"\n", "\n", " links = [link.get('href') for link in soup.find_all('a')]\n", " self.links = [link for link in links if link]\n", "\n", " def get_contents(self):\n", " return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n", " " ] }, { "cell_type": "code", "execution_count": 15, "id": "bf51ae6e-91ae-46eb-ac39-dc860454ea4a", "metadata": {}, "outputs": [], "source": [ "def get_condition_links_from_topics_page():\n", " topics_url = \"https://www.thuisarts.nl/overzicht/onderwerpen\"\n", " response = requests.get(topics_url, headers=headers)\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " # Find all tags that look like condition pages\n", " links = soup.find_all(\"a\", href=True)\n", " condition_links = []\n", "\n", " for link in links:\n", " href = link['href']\n", " if href.startswith(\"/\"):\n", " href = \"https://www.thuisarts.nl\" + href\n", " if href.startswith(\"https://www.thuisarts.nl/\") and len(href.split(\"/\")) > 3:\n", " condition_links.append(href)\n", "\n", " # Remove duplicates and return\n", " return list(set(condition_links))\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "a246ac9f-73fb-4c2d-ab92-6f3f2bf7afac", "metadata": {}, "outputs": [], "source": [ "link_system_prompt = \"\"\"You are an assistant that filters URLs for patient education content. \n", "\n", "Only return links that lead to pages about symptoms, health conditions, treatments, or diseases — for example: pages on 'headache', 'diarrhea', 'stomach pain', 'asthma', etc.\n", "\n", "DO NOT return:\n", "- contact pages\n", "- overview/video/image/keuzekaart lists unless they directly link to medical complaints\n", "- navigation or privacy/cookie/social media links\n", "\n", "Respond only with full https links in JSON format, like this:\n", "{\n", " \"links\": [\n", " {\"type\": \"symptom or condition page\", \"url\": \"https://www.thuisarts.nl/hoofdpijn\"},\n", " {\"type\": \"symptom or condition page\", \"url\": \"https://www.thuisarts.nl/buikpijn\"}\n", " ]\n", "}\n", "\"\"\"\n" ] }, { "cell_type": "code", "execution_count": 17, "id": "b3ac761e-f583-479e-b8ef-70e70f8f361a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "You are an assistant that filters URLs for patient education content. \n", "\n", "Only return links that lead to pages about symptoms, health conditions, treatments, or diseases — for example: pages on 'headache', 'diarrhea', 'stomach pain', 'asthma', etc.\n", "\n", "DO NOT return:\n", "- contact pages\n", "- overview/video/image/keuzekaart lists unless they directly link to medical complaints\n", "- navigation or privacy/cookie/social media links\n", "\n", "Respond only with full https links in JSON format, like this:\n", "{\n", " \"links\": [\n", " {\"type\": \"symptom or condition page\", \"url\": \"https://www.thuisarts.nl/hoofdpijn\"},\n", " {\"type\": \"symptom or condition page\", \"url\": \"https://www.thuisarts.nl/buikpijn\"}\n", " ]\n", "}\n", "\n" ] } ], "source": [ "print(link_system_prompt)" ] }, { "cell_type": "code", "execution_count": 18, "id": "5548e8d4-2813-40fe-a807-cf3661d3a0a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Found 680 condition pages.\n" ] } ], "source": [ "condition_links = get_condition_links_from_topics_page()\n", "print(f\"✅ Found {len(condition_links)} condition pages.\")\n", "\n", "# Format for summary function\n", "selected_links = [{\"url\": link} for link in condition_links]\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "8d264592-8b77-425a-be4a-73ef7d32d744", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "def load_existing_summaries(filepath=\"brochure_cache.json\"):\n", " if os.path.exists(filepath):\n", " with open(filepath, \"r\", encoding=\"utf-8\") as f:\n", " return json.load(f)\n", " return {}\n", "\n", "def save_summaries_to_cache(summaries, filepath=\"brochure_cache.json\"):\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " json.dump(summaries, f, indent=2, ensure_ascii=False)\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "1cdd9456-1262-40a0-bc3f-28d23010ed7f", "metadata": {}, "outputs": [], "source": [ "selected_links = [{\"url\": link} for link in get_condition_links_from_topics_page()][:10]\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "0c2f24ea-fa6b-4431-849a-e1aeaa936022", "metadata": {}, "outputs": [], "source": [ "summary_cache = {}\n", "\n", "def summarize_for_brochure(url):\n", " if url in summary_cache:\n", " summary = summary_cache[url]\n", " print(f\"✅ [Cached] {url}\")\n", " print(f\"📄 Summary:\\n{summary}\\n\") # 👈 this prints the cached summary too\n", " return summary\n", "\n", " page = Website(url)\n", "\n", " example = \"\"\"\n", "Example:\n", "\n", "Title: Keelpijn \n", "Summary: Sore throat is a common symptom, often caused by a virus. It usually goes away on its own within a few days. Drink warm fluids, rest your voice, and take paracetamol if needed. See a doctor if the pain lasts more than a week or gets worse.\n", "\n", "Title: Hoofdpijn \n", "Summary: Headaches can have many causes like stress, fatigue, or dehydration. Most are harmless and go away with rest and fluids. Painkillers like paracetamol can help. If headaches are severe, frequent, or different than usual, contact your GP.\n", "\"\"\"\n", "\n", " prompt = f\"\"\"\n", "You are a health writer. Based on the Dutch content below, write a clear, short, brochure-style summary in **English** for patients.\n", "\n", "Use the format: \n", "Title: {page.title} \n", "Summary: \n", "\n", "Keep it under 100 words, easy to read, friendly, and medically accurate.\n", "\n", "{example}\n", "\n", "Now use this for:\n", "Title: {page.title}\n", "Content:\n", "{page.text[:3000]}\n", "\"\"\"\n", "\n", " response = openai.chat.completions.create(\n", " model=\"gpt-4\",\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " temperature=0.4\n", " )\n", "\n", " summary = response.choices[0].message.content.strip()\n", " summary_cache[url] = summary\n", " return summary\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "af8f9d81-d848-4fb9-ac79-782b39fed4a2", "metadata": {}, "outputs": [], "source": [ "def build_symptom_brochure(links, cache_file=\"brochure_cache.json\"):\n", " brochure = []\n", " cached = load_existing_summaries(cache_file)\n", " print(\"📄 Building summaries for brochure:\\n\")\n", "\n", " for i, item in enumerate(links, 1):\n", " url = item[\"url\"]\n", " if url in cached:\n", " print(f\"✅ [Cached] {url}\")\n", " brochure.append({\"url\": url, \"summary\": cached[url]})\n", " continue\n", " \n", " print(f\"🔄 [{i}/{len(links)}] Summarizing: {url}\")\n", " try:\n", " summary = summarize_for_brochure(url)\n", " print(f\"✅ Summary:\\n{summary}\\n\")\n", " brochure.append({\"url\": url, \"summary\": summary})\n", " cached[url] = summary # Save new summary\n", " save_summaries_to_cache(cached, cache_file)\n", " except Exception as e:\n", " print(f\"❌ Error summarizing {url}: {e}\\n\")\n", " brochure.append({\"url\": url, \"summary\": \"Error generating summary.\"})\n", "\n", " return brochure\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "e9079d6b-538f-4681-9776-4628a111246a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📄 Building summaries for brochure:\n", "\n", "🔄 [1/10] Summarizing: https://www.thuisarts.nl/sociale-angststoornis\n", "✅ [New] https://www.thuisarts.nl/sociale-angststoornis\n", "📄 Summary:\n", "Title: Social Anxiety Disorder\n", "Summary: Social anxiety disorder, or social phobia, is a fear of what others think of you, often leading to panic attacks. Writing down what happens, your thoughts, and feelings can help manage this fear. Positive thinking can also be beneficial when you're feeling anxious. Discussing your concerns with your GP or practice nurse can be helpful. If there's no improvement or symptoms are severe, treatments such as therapy with a psychologist or anxiety medication may be considered.\n", "\n", "✅ Summary:\n", "Title: Social Anxiety Disorder\n", "Summary: Social anxiety disorder, or social phobia, is a fear of what others think of you, often leading to panic attacks. Writing down what happens, your thoughts, and feelings can help manage this fear. Positive thinking can also be beneficial when you're feeling anxious. Discussing your concerns with your GP or practice nurse can be helpful. If there's no improvement or symptoms are severe, treatments such as therapy with a psychologist or anxiety medication may be considered.\n", "\n", "✅ [Cached] https://www.thuisarts.nl/diabetes-type-2\n", "🔄 [3/10] Summarizing: https://www.thuisarts.nl/morton-neuroom\n", "✅ [New] https://www.thuisarts.nl/morton-neuroom\n", "📄 Summary:\n", "Title: Morton's Neuroma | Thuisarts.nl \n", "Summary: Morton's Neuroma is a pinched nerve in the forefoot, causing burning pain in the forefoot and toes. It often results from wearing too narrow shoes or high heels. Wearing comfortable, roomy shoes can help alleviate symptoms. For severe pain, paracetamol can be taken. Sometimes, a custom shoe insole can also help.\n", "\n", "✅ Summary:\n", "Title: Morton's Neuroma | Thuisarts.nl \n", "Summary: Morton's Neuroma is a pinched nerve in the forefoot, causing burning pain in the forefoot and toes. It often results from wearing too narrow shoes or high heels. Wearing comfortable, roomy shoes can help alleviate symptoms. For severe pain, paracetamol can be taken. Sometimes, a custom shoe insole can also help.\n", "\n", "🔄 [4/10] Summarizing: https://www.thuisarts.nl/borstvergroting\n", "✅ [New] https://www.thuisarts.nl/borstvergroting\n", "📄 Summary:\n", "Title: Breast Augmentation | Thuisarts.nl \n", "Summary: A breast augmentation is a procedure where a plastic surgeon inserts fillings into your breasts, under general anesthesia. The surgery takes about an hour. Consider the pros and cons carefully. Benefits may include a more positive body image and increased self-confidence. Risks may include infection, bleeding, scarring, or hardening of the breasts over time. Often, a follow-up surgery is needed later. If you smoke, it's important to quit three weeks before surgery.\n", "\n", "✅ Summary:\n", "Title: Breast Augmentation | Thuisarts.nl \n", "Summary: A breast augmentation is a procedure where a plastic surgeon inserts fillings into your breasts, under general anesthesia. The surgery takes about an hour. Consider the pros and cons carefully. Benefits may include a more positive body image and increased self-confidence. Risks may include infection, bleeding, scarring, or hardening of the breasts over time. Often, a follow-up surgery is needed later. If you smoke, it's important to quit three weeks before surgery.\n", "\n", "🔄 [5/10] Summarizing: https://www.thuisarts.nl/kijkoperatie-in-buik\n", "✅ [New] https://www.thuisarts.nl/kijkoperatie-in-buik\n", "📄 Summary:\n", "Title: Abdominal Laparoscopy | Thuisarts.nl\n", "Summary: An abdominal laparoscopy allows the doctor to examine or operate in your abdomen. Small tubes with a camera and tools are inserted through tiny incisions. You'll have a pre-operation discussion with your surgeon and anesthesiologist. You will be deeply sedated for the procedure. You cannot drive home post-operation, so arrange for someone to pick you up. Recovery usually requires a week off work, sometimes longer.\n", "\n", "✅ Summary:\n", "Title: Abdominal Laparoscopy | Thuisarts.nl\n", "Summary: An abdominal laparoscopy allows the doctor to examine or operate in your abdomen. Small tubes with a camera and tools are inserted through tiny incisions. You'll have a pre-operation discussion with your surgeon and anesthesiologist. You will be deeply sedated for the procedure. You cannot drive home post-operation, so arrange for someone to pick you up. Recovery usually requires a week off work, sometimes longer.\n", "\n", "🔄 [6/10] Summarizing: https://www.thuisarts.nl/veranderingen-in-zorg-als-je-18-wordt\n", "✅ [New] https://www.thuisarts.nl/veranderingen-in-zorg-als-je-18-wordt\n", "📄 Summary:\n", "Title: Changes in Care When You Turn 18 | Thuisarts.nl\n", "Summary: As you become an adult, usually around 18, you transition from child to adult healthcare. You will start to take more responsibility, such as making appointments and requesting medications, giving you more control over your care. You will create a plan detailing what you need to manage this independently, with support provided to help you. This transition is a gradual process, with preparation beginning before you turn 18.\n", "\n", "✅ Summary:\n", "Title: Changes in Care When You Turn 18 | Thuisarts.nl\n", "Summary: As you become an adult, usually around 18, you transition from child to adult healthcare. You will start to take more responsibility, such as making appointments and requesting medications, giving you more control over your care. You will create a plan detailing what you need to manage this independently, with support provided to help you. This transition is a gradual process, with preparation beginning before you turn 18.\n", "\n", "🔄 [7/10] Summarizing: https://www.thuisarts.nl/zon-en-zonnebrand\n", "✅ [New] https://www.thuisarts.nl/zon-en-zonnebrand\n", "📄 Summary:\n", "Title: Sun and Sunburn | Thuisarts.nl\n", "Summary: Protect your skin from excessive sunlight to avoid sunburn. If you notice your skin burning, immediately move out of the sun. Cool your skin with wet cloths if it hurts and take paracetamol for severe pain. Stay out of the sun for at least three days to allow your skin to recover. If you have symptoms of sunstroke, sun allergy, or eczema, seek medical advice.\n", "\n", "✅ Summary:\n", "Title: Sun and Sunburn | Thuisarts.nl\n", "Summary: Protect your skin from excessive sunlight to avoid sunburn. If you notice your skin burning, immediately move out of the sun. Cool your skin with wet cloths if it hurts and take paracetamol for severe pain. Stay out of the sun for at least three days to allow your skin to recover. If you have symptoms of sunstroke, sun allergy, or eczema, seek medical advice.\n", "\n", "🔄 [8/10] Summarizing: https://www.thuisarts.nl/ganglion\n", "✅ [New] https://www.thuisarts.nl/ganglion\n", "📄 Summary:\n", "Title: Ganglion | Thuisarts.nl \n", "Summary: A ganglion is a small bump that can appear on your wrist, finger, or foot. It is a protrusion from the joint and is harmless. In half of the cases, a ganglion disappears on its own. If you notice such a bump, there is usually no cause for concern.\n", "\n", "✅ Summary:\n", "Title: Ganglion | Thuisarts.nl \n", "Summary: A ganglion is a small bump that can appear on your wrist, finger, or foot. It is a protrusion from the joint and is harmless. In half of the cases, a ganglion disappears on its own. If you notice such a bump, there is usually no cause for concern.\n", "\n", "🔄 [9/10] Summarizing: https://www.thuisarts.nl/kunstheup\n", "✅ [New] https://www.thuisarts.nl/kunstheup\n", "📄 Summary:\n", "Title: Hip Replacement | Thuisarts.nl\n", "Summary: A hip replacement can be an option if you are experiencing severe pain or stiffness in your hip, such as from advanced arthritis or another hip disease. This is usually considered when other treatments like physiotherapy and painkillers have not provided enough relief. You can discuss with your hospital doctor whether a hip replacement is suitable for you. A hip prosthesis typically lasts longer than 20 years.\n", "\n", "✅ Summary:\n", "Title: Hip Replacement | Thuisarts.nl\n", "Summary: A hip replacement can be an option if you are experiencing severe pain or stiffness in your hip, such as from advanced arthritis or another hip disease. This is usually considered when other treatments like physiotherapy and painkillers have not provided enough relief. You can discuss with your hospital doctor whether a hip replacement is suitable for you. A hip prosthesis typically lasts longer than 20 years.\n", "\n", "🔄 [10/10] Summarizing: https://www.thuisarts.nl/gezond-leven\n", "✅ [New] https://www.thuisarts.nl/gezond-leven\n", "📄 Summary:\n", "Title: Healthy Living | Thuisarts.nl\n", "Summary: For good health, it's important to eat, drink, and sleep well, stay active, relax, and maintain social contacts. Avoiding substances like alcohol is also beneficial. If you want to make changes to your lifestyle, take it step by step. Discuss your plans with your GP or practice nurse. Whether it's about healthy eating, exercise, sleep, stress management, social contact, or substance use, they can provide guidance and support.\n", "\n", "✅ Summary:\n", "Title: Healthy Living | Thuisarts.nl\n", "Summary: For good health, it's important to eat, drink, and sleep well, stay active, relax, and maintain social contacts. Avoiding substances like alcohol is also beneficial. If you want to make changes to your lifestyle, take it step by step. Discuss your plans with your GP or practice nurse. Whether it's about healthy eating, exercise, sleep, stress management, social contact, or substance use, they can provide guidance and support.\n", "\n" ] } ], "source": [ "brochure = build_symptom_brochure(selected_links)" ] }, { "cell_type": "code", "execution_count": 23, "id": "e2121c3c-aa6a-4640-8e19-6ca6ccf84783", "metadata": {}, "outputs": [], "source": [ "def export_brochure_to_txt(brochure, filepath=\"brochure_summaries.txt\"):\n", " if not brochure:\n", " print(\"⚠️ No summaries to export.\")\n", " return\n", "\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " for item in brochure:\n", " url = item.get(\"url\", \"Unknown URL\")\n", " summary = item.get(\"summary\", \"No summary available.\")\n", " f.write(f\"URL: {url}\\n\")\n", " f.write(f\"{summary}\\n\\n\")\n", "\n", " print(f\"📁 Exported {len(brochure)} summaries to {filepath}\")\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "f14288f9-4d1c-4a0e-aaf4-9f86324b0602", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📁 Exported 10 summaries to brochure_summaries.txt\n" ] } ], "source": [ "export_brochure_to_txt(brochure)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c23e89db-3ded-4189-a227-6ca6ac2f1332", "metadata": {}, "outputs": [], "source": [ "###---it works---" ] }, { "cell_type": "code", "execution_count": null, "id": "a700e4f3-fb6a-499a-a579-6f9b8ad35c9f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.12" } }, "nbformat": 4, "nbformat_minor": 5 }