203 lines
6.7 KiB
Python
203 lines
6.7 KiB
Python
import functools
import re
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import ollama
import requests
from bs4 import BeautifulSoup as bs
from loguru import logger
|
|
|
|
|
|
|
|
SYS_PROMPT = """
|
|
You are an expert in biomedical text mining and information extraction.
|
|
You excel at breaking down complex articles into digestible contents for your audience.
|
|
Your audience can comprise of students, early researchers and professionals in the field.
|
|
Summarize the key findings in the following article [ARTICLE] .
|
|
Your summary should provide crucial points covered in the paper that helps your diverse audience quickly understand the most vital information.
|
|
Crucial points to include in your summary:
|
|
- Main objectives of the study
|
|
- Key findings and results
|
|
- Methodologies used
|
|
- Implications of the findings(if any)
|
|
- Any limitations or future directions mentioned
|
|
|
|
Format: Provide your summary in bullet points highlighting key areas followed with a concise paragraph that encapsulates the results of the paper.
|
|
|
|
The tone should be professional and clear.
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def catch_request_error(func):
    """
    Decorator that converts request failures into a ``None`` result.

    The decorated callable is invoked with its arguments unchanged. If it
    raises ``requests.RequestException``, the error is printed together with
    the callable's name and ``None`` is returned in place of a result. Any
    other exception type propagates to the caller untouched.
    """
    @functools.wraps(func)
    def guarded(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except requests.RequestException as e:
            print(f"Request error in {func.__name__}: {e}")
            result = None
        return result

    return guarded
|
|
|
|
|
|
|
|
@catch_request_error
@logger.catch
def get_xml_from_url(url: str, timeout: float = 30.0) -> Optional[bs]:
    """
    Fetches the XML content from Europe PMC website.

    Args:
        url (str): Europe PMC's production url to fetch the XML from.
        timeout (float): Seconds to wait on the server before the request is
            abandoned. Defaults to 30.

    Returns:
        soup (bs4.BeautifulSoup | None): Parsed XML content. ``None`` is
        returned when the request fails, because the decorators handle the
        error instead of letting it propagate; callers must check for it.
    """
    # requests applies no timeout by default, so without one an unresponsive
    # server would block this call (and the UI request behind it) indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
    return bs(response.content, "lxml-xml")
|
|
|
|
|
|
|
|
|
|
def clean_text(text:str) -> str:
    """
    Strip markup residue from a piece of article text.

    Four substitutions are applied, in this order:
      1. anything wrapped in curly braces, braces included;
      2. backslash commands (a backslash followed by ASCII letters);
      3. numeric citation markers such as [34] or [1,2,3];
      4. every run of whitespace, collapsed to a single space.
    Leading and trailing whitespace is trimmed from the final result.

    Args:
        text(str): The text to be cleaned

    Returns:
        str: The cleaned text
    """
    # (pattern, replacement) pairs; order matters because braces are removed
    # before the backslash commands that may precede them.
    substitutions = (
        (r"\{.*?\}", ""),
        (r"\\[a-zA-Z]+", ""),
        (r"\[\s*(\d+\s*(,\s*\d+\s*)*)\]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
|
|
|
|
|
def fetch_article_abstract(soup: Optional[bs]) -> Tuple[str, str]:
    """
    Extracts the article title and abstract text from the XML soup.

    Args:
        soup (bs4.BeautifulSoup | None): Parsed XML content, or None when the
            XML could not be fetched.

    Returns:
        Tuple(article_title (str), abstract_text (str)): A tuple of the article's
        title and its extracted abstract text. The abstract is an empty string
        when no abstract is present; when ``soup`` is None the title slot holds
        the placeholder "No XML found".
    """
    if soup is None:
        return "No XML found", ""

    # get_text(strip=True) strips every text fragment and joins them with no
    # separator, which glues words together around inline tags such as
    # <italic>. Take the raw text and normalise whitespace instead.
    title_tag = soup.find("article-title")
    article_title = ""
    if title_tag is not None:
        article_title = " ".join(title_tag.get_text().split())
    if not article_title:
        article_title = "No Title Found for this article"

    abstract_tag = soup.find("abstract")
    if abstract_tag is None:
        return article_title, ""

    # clean_text collapses whitespace, so the raw paragraph text is safe to pass.
    paragraphs = (clean_text(p.get_text()) for p in abstract_tag.find_all("p"))
    # Skip paragraphs that end up empty after cleaning to avoid stray spaces.
    abstract_text = " ".join(paragraph for paragraph in paragraphs if paragraph)
    return article_title, abstract_text
|
|
|
|
|
|
|
|
def build_message(article_title: str, abstract_text: str, sys_prompt:str=SYS_PROMPT) -> List[Dict[str, str]]:
    """
    Assemble the chat payload handed to the LLM.

    Args:
        article_title (str): The title of the article.
        abstract_text (str): The abstract text of the article.
        sys_prompt (str): Content of the system message. Defaults to SYS_PROMPT.

    Returns:
        List[Dict[str, str]]: Two message dictionaries, the system message
        followed by the user message that embeds the title and abstract.
    """
    user_content = f"""You are looking at an article with title: {article_title}.
The article's abstract is as follows: \n{abstract_text}.
Summarise the article. Start your summary by providing a short sentence on what the article is about
and then a bulleted list of the key points covered in the article.
"""
    return [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_content},
    ]
|
|
|
|
|
|
|
|
def generate_response(messages:List[Dict[str, str]], model:str) -> str:
    """
    Send the messages to the chosen model and return the text of its reply.

    Args:
        messages (List[Dict[str, str]]): The message payload for the LLM.
        model (str): The model to use for generating the response.

    Returns:
        str: The "content" field of the "message" entry in the chat response.
    """
    reply = ollama.chat(model=model, messages=messages)
    message = reply["message"]
    return message["content"]
|
|
|
|
|
|
def summariser(article_id: str, model:str) -> str:
    """
    Fetch an article's abstract from Europe PMC and summarise it with an LLM.

    Args:
        article_id (str): The article's PMCID, i.e. 'PMC' followed by 5 to 8 digits.
            Surrounding whitespace is ignored.
        model (str): Name of the Ollama model used to generate the summary.

    Returns:
        str: Markdown containing the article title followed by the summary.

    Raises:
        gr.Error: If the ID is missing or malformed, if the article XML could
            not be retrieved, or if the article has no abstract.
    """
    article_id = (article_id or "").strip()
    # fullmatch rejects an empty ID (which previously skipped validation) and
    # a trailing newline (which "$" with re.match would have tolerated).
    if not re.fullmatch(r"PMC\d{5,8}", article_id):
        raise gr.Error("Please check the length/Format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.")

    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
    soup = get_xml_from_url(url)
    # A failed fetch yields None; report it as such instead of letting it
    # surface as a misleading "No abstract found for No XML found".
    if soup is None:
        raise gr.Error(f"Could not retrieve the full-text XML for {article_id} from Europe PMC. Please check the ID and try again.")

    article_title, abstract_text = fetch_article_abstract(soup)
    if not abstract_text:
        raise gr.Error(f"No abstract found for {article_title}")
    messages = build_message(article_title, abstract_text)

    # pull model from ollama before asking it for a summary
    ollama.pull(model)
    summary = generate_response(messages, model)

    return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"
|
|
|
|
# Markdown rendered at the top of the UI by gradio_ui(): a short description of the app.
INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses PMCID to fetch articles from the Europe PMC(EPMC) Website. It currently only runs on article's abstract. Future improvements would integrate full-text articles"
# Markdown rendered directly below the intro: usage instructions for the user.
INST_TXT = "Enter a **EuropePMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary"
|
|
def gradio_ui():
    """
    Assemble the Gradio Blocks interface for the summariser.

    The page shows the intro and instruction Markdown, then a row of two
    equally scaled columns: the left one holds the PMCID textbox, the model
    dropdown and the run button; the right one holds the Markdown output.
    Clicking the button calls ``summariser`` with the textbox and dropdown
    values and renders its return value in the output area.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    model_names = ["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(INTRO_TXT)
        gr.Markdown(INST_TXT)

        with gr.Row():
            with gr.Column(scale=1):
                pmcid_box = gr.Textbox(
                    label="Enter Article's PMCID here",
                    placeholder="e.g., PMC1234567",
                )
                model_picker = gr.Dropdown(
                    choices=model_names,
                    value="llama3.2",
                    label="Select a model",
                )
                submit_btn = gr.Button(
                    "Fetch Article Abstract and generate Summary",
                    variant='primary',
                )
            with gr.Column(scale=1):
                summary_view = gr.Markdown()

        submit_btn.click(
            fn=summariser,
            inputs=[pmcid_box, model_picker],
            outputs=summary_view,
        )

    return demo
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the interface and start serving it when the file is run as a script.
    launch_options = {"share": True, "debug": True}
    app = gradio_ui()
    app.launch(**launch_options)
|
|
|
|
|
|
|