Add my contributions to community-contributions

2025-09-29 13:25:48 +01:00
parent e4d751dd2f
commit 4b51a8f9e1
3 changed files with 538 additions and 0 deletions
--- a/community-contributions/biomedical-article-summariser/README.md
+++ b/community-contributions/biomedical-article-summariser/README.md
@@ -0,0 +1,38 @@
 ## Biomedical Article Abstract Summariser using Europe PMC + Ollama
 This is a simple app that demonstrates an article abstract summariser leveraging Europe PMC’s API and Ollama LLMs to generate concise summaries of biomedical literature.
 ## 🔍 About Europe PMC (EPMC)
 Europe PMC is a free, open-access database that provides access to millions of life sciences and biomedical articles, research papers, and preprints. It is part of the PubMed Central International (PMCI) network.
 ## Features
 This solution presents 2 methods: 
 1. A simple demo via a jupyter notebook
 2. An interactive demo via gradio, running on your local computer. 
 **Core Features:** 
 - Fetch an article’s metadata and abstract via Europe PMC’s API (using a provided PMCID).
 - Preprocess and clean the abstract text by removing unnecessary tags, e.g. reference tags or math formulas.
 - Summarise abstracts into bullet points + a short paragraph using Ollama models.
 ## 📌 How to Use
 - Go to [Europe PMC's website](https://europepmc.org/).
 - Use the search bar to find an open-access article by keywords, entity names, journal, or author, e.g. genes, diseases, nutrition, etc.
 - The app currently works only with open-access articles, so restrict your results to them by adding a filter such as `HAS_FT:Y` or `IN_EPMC:Y` to your search syntax, e.g. `"Genes: HAS_FT:Y"`.
 - Select your article of interest and copy its PMCID (e.g., PMC1234567).
 - Run the summariser:
  - via notebook: after running all other cells, pass the `PMCID` as a string to the `display_reponse` function (this is how the function name is spelled in the notebook).
  - via gradio: 
    - run the python script via CLI: 
    ```bash
    python article_summariser-gradio.py
    ```
    - Paste the `PMCID` exactly as you copied it into the `Enter Article's PMCID here` textbox. 
    - click on the `Fetch Article Abstract and generate Summary` button. 
    **N.B**: I've observed that `llama3.2` runs faster on my PC; you may experience some delays with the other models. Also make sure Ollama is already running (via `ollama serve` in your terminal) before running the script. 
--- a/community-contributions/biomedical-article-summariser/article_summariser-gradio.py
+++ b/community-contributions/biomedical-article-summariser/article_summariser-gradio.py
@@ -0,0 +1,202 @@
 import re
 import requests
 import functools
 from typing import List, Tuple, Dict, Any
 from loguru import logger
 from bs4 import BeautifulSoup as bs
 import ollama
 import gradio as gr
 # System prompt used as the default "system" message content in build_message().
 # NOTE(review): the "[ARTICLE]" token below is never substituted in this script; the
 # article title and abstract are supplied through the user message instead. Confirm
 # that the literal placeholder is intended.
 SYS_PROMPT = """
 You are an expert in biomedical text mining and information extraction. 
 You excel at breaking down complex articles into digestible contents for your audience. 
 Your audience can comprise of students, early researchers and professionals in the field.
 Summarize the key findings in the following article [ARTICLE] .
 Your summary should provide crucial points covered in the paper that helps your diverse audience quickly understand the most vital information. 
 Crucial points to include in your summary:
 - Main objectives of the study
 - Key findings and results
 - Methodologies used
 - Implications of the findings(if any)
 - Any limitations or future directions mentioned
 Format: Provide your summary in bullet points highlighting key areas followed with a concise paragraph that encapsulates the results of the paper.
 The tone should be professional and clear.
 """
 def catch_request_error(func):
    """
    Decorator that shields callers from ``requests`` failures.

    The wrapped callable is invoked with the arguments it was given. If it raises
    ``requests.RequestException`` the error is printed and ``None`` is returned;
    any other exception propagates unchanged.
    """
    @functools.wraps(func)
    def guarded(*args, **kwargs):
        try:
            outcome = func(*args, **kwargs)
        except requests.RequestException as exc:
            print(f"Request error in {func.__name__}: {exc}")
            outcome = None
        return outcome
    return guarded
@catch_request_error
@logger.catch
def get_xml_from_url(url: str, timeout: float = 30.0) -> "bs | None":
    """
    Fetches the XML content from Europe PMC website.
    Args:
        url (str): Europe PMC's production url to fetch the XML from.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 30.
    Returns:
        soup (bs4.BeautifulSoup): Parsed XML content, or None when the request
        fails and the error is swallowed by the decorators above.

    NOTE(review): ``logger.catch`` is the inner decorator and, with its default
    settings, is expected to log and swallow every exception (returning None)
    before ``catch_request_error`` can see it -- confirm, and consider keeping
    only one of the two.
    """
    # requests has no default timeout, so a stalled connection would otherwise
    # block the caller (and the UI request that triggered it) indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    return bs(response.content, "lxml-xml")
 def clean_text(text:str) -> str:
    """
    Normalise a piece of abstract text before it is sent to the LLM.

    The following are removed, in this order: brace-delimited groups,
    backslash-prefixed LaTeX command names, and numeric citation markers such as
    [34] or [1,2,3]. Runs of whitespace are then collapsed to a single space and
    the result is trimmed at both ends.
    Args:
    text(str): The text to be cleaned
    Returns:
    str: The cleaned text
    """
    # Order matters: braces go first so that command arguments disappear with them.
    removal_patterns = (
        r"\{.*?\}",  # shortest span enclosed in curly braces (single line only)
        r"\\[a-zA-Z]+",  # a backslash followed by letters, i.e. a LaTeX command name
        r"\[\s*(\d+\s*(,\s*\d+\s*)*)\]",  # bracketed, comma-separated reference numbers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)
    # The removals above can leave gaps behind, so whitespace is normalised last.
    return re.sub(r"\s+", " ", text).strip()
 def fetch_article_abstract(soup: bs) -> Tuple[str, str]:
    """
    Extracts the article title and abstract text from the XML soup.
    Args:
        soup (bs4.BeautifulSoup): Parsed XML content, or None when fetching failed.
    Returns:
        Tuple(article_title (str), abstract_text (str)): A tuple of the article's title and its extracted abstract text.
        The title falls back to a placeholder when no <article-title> exists, and the
        abstract is an empty string when no <abstract> (or no non-empty <p>) is found.
        For a None soup the result is ("No XML found", "").
    """
    if soup is None:
        return "No XML found", ""

    title_tag = soup.find("article-title")
    if title_tag is not None:
        # get_text(strip=True) strips every text node and joins them with no
        # separator, gluing words together around inline markup such as <italic>.
        # Take the raw text and normalise whitespace instead.
        article_title = " ".join(title_tag.get_text().split())
    else:
        article_title = "No Title Found for this article"

    abstract_tag = soup.find("abstract")
    if abstract_tag is None:
        return article_title, ""

    # clean_text collapses whitespace and trims, so the raw text can be passed as is.
    cleaned_paragraphs = (clean_text(p.get_text()) for p in abstract_tag.find_all("p"))
    # Skip paragraphs that are empty once cleaned so the join adds no stray spaces.
    abstract_text = " ".join(paragraph for paragraph in cleaned_paragraphs if paragraph)
    return article_title, abstract_text
 def build_message(article_title: str, abstract_text: str, sys_prompt:str=SYS_PROMPT) -> List[Dict[str, str]]:
    """
    Assemble the two-message chat payload (system, then user) for the LLM.
    Args:
        article_title (str): The title of the article.
        abstract_text (str): The abstract text of the article.
        sys_prompt (str): Content of the system message; defaults to SYS_PROMPT.
    Returns:
        List[Dict[str, str]]: The system message followed by the user message, each
        a dict with "role" and "content" keys.
    """
    user_content = f"""You are looking at an article with title:  {article_title}. 
    The article's abstract is as follows: \n{abstract_text}.
    Summarise the article. Start your summary by providing a short sentence on what the article is about 
    and then a bulleted list of the key points covered in the article.
 """
    system_message = {"role": "system", "content": sys_prompt}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]
 def generate_response(messages:List[Dict[str, str]], model:str) -> str:
    """
    Send the chat messages to an Ollama model and return the text of its reply.
    Args:
        messages (List[Dict[str, str]]): The message payload for the LLM.
        model (str): Name of the model passed to ``ollama.chat``.
    Returns:
        str: The ``content`` field of the ``message`` entry in the chat response.
    """
    reply = ollama.chat(messages=messages, model=model)
    reply_message = reply["message"]
    return reply_message["content"]
 def summariser(article_id: str, model:str) -> str:
    """
    Fetch an article's abstract from Europe PMC and summarise it with an Ollama model.
    Args:
        article_id (str): The article's PMCID, e.g. 'PMC1234567'. Surrounding
            whitespace is ignored.
        model (str): Name of the Ollama model to pull and query.
    Returns:
        str: Markdown containing the article title and the generated summary.
    Raises:
        gr.Error: If the ID is empty or malformed, the article XML could not be
            retrieved, or the article has no abstract.
    """
    # Tolerate whitespace picked up by copy/paste and treat a missing value as empty.
    article_id = (article_id or "").strip()
    # An empty ID must fail validation too; otherwise a malformed URL would be requested.
    if not re.fullmatch(r"PMC\d{5,8}", article_id):
        raise gr.Error("Please check the length/Format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.")
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
    soup = get_xml_from_url(url)
    if soup is None:
        # Report a failed download as such instead of as a missing abstract.
        raise gr.Error(f"Could not retrieve the full-text XML for {article_id} from Europe PMC. Please check that the ID exists and that the article is open access.")
    article_title, abstract_text = fetch_article_abstract(soup)
    if not abstract_text:
         raise gr.Error(f"No abstract found for {article_title}")
    messages = build_message(article_title, abstract_text)
    # Pull the selected model from Ollama before querying it.
    ollama.pull(model)
    summary = generate_response(messages, model)
    return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"
 # Markdown texts rendered at the top of the page by gradio_ui(): an introduction
 # followed by usage instructions.
 INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses PMCID to fetch articles from the Europe PMC(EPMC) Website. It currently only runs on article's abstract. Future improvements would integrate full-text articles"
 INST_TXT = "Enter a **EuropePMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary"
 def gradio_ui():
    """
    Build the Gradio interface for the summariser and return it without launching.

    The layout has the intro and instruction texts on top, then a row with two
    columns: inputs (PMCID textbox, model dropdown, run button) on the left and
    the Markdown output on the right. Clicking the button calls ``summariser``.
    """
    available_models = ["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"]
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(INTRO_TXT)
        gr.Markdown(INST_TXT)
        with gr.Row():
            with gr.Column(scale=1):
                pmcid_box = gr.Textbox(
                    label="Enter Article's PMCID here",
                    placeholder="e.g., PMC1234567",
                )
                model_dropdown = gr.Dropdown(
                    choices=available_models,
                    value="llama3.2",
                    label="Select a model",
                )
                submit_btn = gr.Button(
                    "Fetch Article Abstract and generate Summary",
                    variant='primary',
                )
            with gr.Column(scale=1):
                summary_view = gr.Markdown()
        # Wire the button to the summariser; its return value fills the output pane.
        submit_btn.click(
            fn=summariser,
            inputs=[pmcid_box, model_dropdown],
            outputs=summary_view,
        )
    return demo
 # Script entry point: build the interface and start the Gradio server.
 if __name__ == "__main__":
  app = gradio_ui()
  # NOTE(review): share=True presumably asks Gradio to create a publicly reachable
  # share link; confirm that exposing this locally running demo is intended.
  app.launch(share=True, debug=True)
--- a/community-contributions/biomedical-article-summariser/article_summariser.ipynb
+++ b/community-contributions/biomedical-article-summariser/article_summariser.ipynb
@@ -0,0 +1,298 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eb1f9b94",
   "metadata": {},
   "source": [
    "This Notebook is my exercise1 version. I have adapted the week 1 exercise solution to an article's abstract summariser using Europe PMC's article API. [Europe PMC (EPMC)](https://europepmc.org/) is a free, open-access database that provides access to a wealth of life sciences and biomedical literature. It is an integral part of the PubMed Central International (PMCI) network, aggregating content from multiple sources and offering access to millions of scientific articles, research papers, and preprints, all in one place. \n",
    "\n",
    "My solution uses a provided article's PMCID (obtainable by selecting an article you wish to summarise from EPMC's website). PMCIDs are unique only to open-access articles and you can only use the function below for such articles. \n",
    "To get an article's PMCID and use it:\n",
    "1. Go to [EPMC's Website](https://europepmc.org/)\n",
    "2. Use the search tab and search for articles by keywords, entities, journal or author's name, e.g. genes, diseases, nutrition, etc.\n",
    "3. Search for open-access articles by including the keyword `HAS_FT:Y` or `IN_EPMC:Y`. Example: `\"Genes: HAS_FT:Y\"`\n",
    "4. Then select your article of interest and copy the PMCID. \n",
    "5. Feed the PMCID into the `display_reponse` func to generate the summary from the article's abstract.   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53120ced",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pprint\n",
    "from pprint import pformat\n",
    "import requests\n",
    "import functools\n",
    "from typing import List, Tuple, Dict, Any\n",
    "\n",
    "from tqdm import tqdm\n",
    "from loguru import logger\n",
    "from bs4 import BeautifulSoup as bs\n",
    "\n",
    "from IPython.display import display, HTML, Markdown\n",
    "\n",
    "import ollama"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbabbd46",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6026505",
   "metadata": {},
   "outputs": [],
   "source": [
    "def catch_request_error(func):\n",
    "    \"\"\"\n",
    "    Wrapper func to catch request errors and return None if an error occurs.\n",
    "    Used as a decorator.\n",
    "    \"\"\"\n",
    "    @functools.wraps(func)\n",
    "    def wrapper(*args, **kwargs):\n",
    "        try:\n",
    "            return func(*args, **kwargs)\n",
    "        except requests.RequestException as e:\n",
    "            print(f\"Request error in {func.__name__}: {e}\")\n",
    "            return None\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e9cfff6",
   "metadata": {},
   "outputs": [],
   "source": [
    "@catch_request_error\n",
    "@logger.catch\n",
    "def get_xml_from_url(url: str) -> bs:\n",
    "    \"\"\"\n",
    "    Fetches the XML content from Europe PMC website.\n",
    "\n",
    "    Args:\n",
    "        url (str): Europe PMC's production url to fetch the XML from.\n",
    "\n",
    "    Returns:\n",
    "        soup (bs4.BeautifulSoup): Parsed XML content.\n",
    "    \"\"\"\n",
    "    response = requests.get(url)\n",
    "    response.raise_for_status() #check for request errors\n",
    "    return bs(response.content, \"xml\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ade46c84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(text:str) -> str:\n",
    "    \"\"\"\n",
    "    This function cleans a text by filtering reference patterns in text, \n",
    "    extra whitespaces, escaped latex-style formatting appearing in text body instead of predefined latex tags\n",
    "\n",
    "    Args: \n",
    "    text(str): The text to be cleaned\n",
    "    \n",
    "    Returns: \n",
    "    tex(str): The cleaned text \n",
    "    \n",
    "    \"\"\"\n",
    "   \n",
    "    # Remove LaTeX-style math and formatting tags #already filtered from soup content but some still appear\n",
    "    text = re.sub(r\"\\{.*?\\}\", \"\", text)  # Matches and removes anything inside curly braces {}\n",
    "    text = re.sub(r\"\\\\[a-zA-Z]+\", \"\", text)  # Matches and removes characters that appears with numbers\n",
    "    \n",
    "    # Remove reference tags like [34] or [1,2,3]\n",
    "    text = re.sub(r\"\\[\\s*(\\d+\\s*(,\\s*\\d+\\s*)*)\\]\", \"\", text)\n",
    "    \n",
    "    # Remove extra whitespace\n",
    "    text = re.sub(r\"\\s+\", \" \", text).strip()\n",
    "    \n",
    "    return text\n",
    "\n",
    "\n",
    "def fetch_article_abstract(soup: bs) -> Tuple[str, str]:\n",
    "    \"\"\"\n",
    "    Extracts the abstract text from the XML soup.\n",
    "\n",
    "    Args:\n",
    "        soup (bs4.BeautifulSoup): Parsed XML content.\n",
    "    Returns:\n",
    "        Tuple(article_title (str), abstract_text (str)): A tuple of the article's title and its extracted abstract text.\n",
    "    \"\"\"\n",
    "    if soup is None:\n",
    "        return \"No XML found\", \"\"\n",
    "    article_title = soup.find(\"article-title\").get_text(strip=True) if soup.find(\"article-title\") else \"No Title Found for this article\"\n",
    "\n",
    "    abstract_tag = soup.find(\"abstract\")\n",
    "    if abstract_tag:\n",
    "        abstract_text = ' '.join([clean_text(p.get_text(strip=True)) for p in abstract_tag.find_all(\"p\") if p.get_text(strip=True)])\n",
    "    else:\n",
    "        abstract_text = \"\"\n",
    "    return article_title, abstract_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0686408b",
   "metadata": {},
   "outputs": [],
   "source": [
    "sys_prompt = \"\"\"You are an expert in biomedical text mining and information extraction. \n",
    "You excel at breaking down complex articles into digestible contents for your audience. \n",
    "Your audience can comprise of students, early researchers and professionals in the field.\n",
    "Summarize the key findings in the following article [ARTICLE] .\n",
    "Your summary should provide crucial points covered in the paper that helps your diverse audience quickly understand the most vital information. \n",
    "Crucial points to consider:\n",
    "- Main objectives of the study\n",
    "- Key findings and results\n",
    "- Methodologies used\n",
    "- Implications of the findings(if any)\n",
    "- Any limitations or future directions mentioned\n",
    "\n",
    "Format: Provide your summary in bullet points highlighting key areas followed with a  concise paragraph that encapsulates the results of the paper.\n",
    "\n",
    "The tone should be professional and clear.\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cd5ca3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = \"llama3.2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb1c2ccd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_message(article_title: str, abstract_text: str, sys_prompt:str=sys_prompt) -> List[Dict[str, str]]:\n",
    "    \"\"\"\n",
    "    Constructs the message payload for the LLM.\n",
    "\n",
    "    Args:\n",
    "        article_title (str): The title of the article.\n",
    "        abstract_text (str): The abstract text of the article.\n",
    "\n",
    "    Returns:\n",
    "        List[Dict[str, str]]: A list of message dictionaries for the LLM.\n",
    "    \"\"\"\n",
    "    user_prompt = f\"\"\"You are looking at an article with title:  {article_title}. \n",
    "    The article's abstract is as follows: \\n{abstract_text}.\n",
    "    Summarise the article. Start your summary by providing a short sentence on what the article is about \n",
    "    and then a bulleted list of the key points covered in the article.\n",
    "\"\"\"\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": sys_prompt},\n",
    "        {\"role\": \"user\", \"content\": user_prompt}\n",
    "    ]\n",
    "    return messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80facfc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_response(messages, model=MODEL):\n",
    "    response = ollama.chat(model=model, messages=messages)\n",
    "    return response[\"message\"][\"content\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87fb0621",
   "metadata": {},
   "outputs": [],
   "source": [
    "#combine everything to main function\n",
    "def display_reponse(article_id:str):\n",
    "    if article_id and not re.match(r\"^PMC\\d{5,8}$\", article_id):\n",
    "        raise ValueError(\"Please check the length/Format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.\")\n",
    "    url = f\"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML\"\n",
    "    soup = get_xml_from_url(url)\n",
    "    article_title, abstract_text = fetch_article_abstract(soup)\n",
    "    messages = build_message(article_title, abstract_text)\n",
    "    response = generate_response(messages)\n",
    "\n",
    "    display(Markdown(f\"### Article Title: {article_title}\"))\n",
    "    display(Markdown(f\"### LLM Response: \\n{response}\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3177b43",
   "metadata": {},
   "outputs": [],
   "source": [
    "#add your article's PMCID here to test the function\n",
    "display_reponse(\"PMC7394925\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffa39234",
   "metadata": {},
   "outputs": [],
   "source": [
    "display_reponse(\"PMC12375411\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0532123e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm-course",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }