"""Biomedical article summariser.

Fetches an article's abstract from Europe PMC (EPMC) by PMCID, asks a
locally served Ollama model to summarise it, and presents the result in a
Gradio web UI.

NOTE(review): this file arrived wrapped in a ``git format-patch`` email for
``week1_myversion/article_summariser.py``; the patch envelope has been
dropped and only the module content is kept here.
"""

import functools
import re
from typing import Dict, List, Tuple

import gradio as gr
import ollama
import requests
from bs4 import BeautifulSoup as bs
from loguru import logger

SYS_PROMPT = """
You are an expert in biomedical text mining and information extraction.
You excel at breaking down complex articles into digestible contents for your audience.
Your audience can comprise of students, early researchers and professionals in the field.
Summarize the key findings in the following article [ARTICLE] .
Your summary should provide crucial points covered in the paper that helps your diverse audience quickly understand the most vital information.
Crucial points to include in your summary:
- Main objectives of the study
- Key findings and results
- Methodologies used
- Implications of the findings(if any)
- Any limitations or future directions mentioned

Format: Provide your summary in bullet points highlighting key areas followed with a concise paragraph that encapsulates the results of the paper.

The tone should be professional and clear.

"""

# Compiled once at import time: these patterns run on every paragraph of
# every fetched abstract, so hoisting them out of clean_text() avoids
# re-compiling per call.
_LATEX_BRACES_RE = re.compile(r"\{.*?\}")        # anything inside curly braces {}
_LATEX_COMMAND_RE = re.compile(r"\\[a-zA-Z]+")   # escaped LaTeX-style commands, e.g. \textbf
_REFERENCE_RE = re.compile(r"\[\s*(\d+\s*(,\s*\d+\s*)*)\]")  # [34] or [1,2,3]
_WHITESPACE_RE = re.compile(r"\s+")

# A valid Europe PMC identifier: the literal prefix "PMC" plus 5 to 8 digits.
_PMCID_RE = re.compile(r"^PMC\d{5,8}$")


def catch_request_error(func):
    """Decorator: catch `requests` errors and return None instead of raising.

    Keeps the Gradio callback alive when Europe PMC is unreachable; callers
    treat a None result as "no XML found".
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except requests.RequestException as e:
            # Consistency fix: report through loguru like the rest of the
            # module (the original used a bare print()).
            logger.error(f"Request error in {func.__name__}: {e}")
            return None
    return wrapper


@catch_request_error
@logger.catch
def get_xml_from_url(url: str) -> bs:
    """
    Fetches the XML content from Europe PMC website.

    Args:
        url (str): Europe PMC's production url to fetch the XML from.

    Returns:
        soup (bs4.BeautifulSoup): Parsed XML content, or None (via
        @catch_request_error) when the request fails.
    """
    # Robustness fix: requests.get() without a timeout can hang the UI
    # indefinitely if the remote service stalls.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # check for request errors
    return bs(response.content, "lxml-xml")


def clean_text(text: str) -> str:
    """
    Cleans a text by filtering reference patterns, extra whitespace, and
    escaped LaTeX-style formatting appearing in the text body instead of
    predefined LaTeX tags.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # Remove LaTeX-style math and formatting tags (already filtered from the
    # soup content, but some still appear).
    text = _LATEX_BRACES_RE.sub("", text)
    text = _LATEX_COMMAND_RE.sub("", text)

    # Remove reference tags like [34] or [1,2,3]
    text = _REFERENCE_RE.sub("", text)

    # Collapse runs of whitespace and trim the ends
    return _WHITESPACE_RE.sub(" ", text).strip()


def fetch_article_abstract(soup: bs) -> Tuple[str, str]:
    """
    Extracts the abstract text from the XML soup.

    Args:
        soup (bs4.BeautifulSoup): Parsed XML content, or None when the
            fetch failed.

    Returns:
        Tuple[str, str]: The article's title and its extracted abstract
        text (empty string when no abstract tag is present).
    """
    if soup is None:
        return "No XML found", ""

    title_tag = soup.find("article-title")
    article_title = (
        title_tag.get_text(strip=True)
        if title_tag
        else "No Title Found for this article"
    )

    abstract_tag = soup.find("abstract")
    if abstract_tag:
        # Join the cleaned text of every non-empty <p> inside the abstract.
        abstract_text = " ".join(
            clean_text(p.get_text(strip=True))
            for p in abstract_tag.find_all("p")
            if p.get_text(strip=True)
        )
    else:
        abstract_text = ""
    return article_title, abstract_text


def build_message(
    article_title: str, abstract_text: str, sys_prompt: str = SYS_PROMPT
) -> List[Dict[str, str]]:
    """
    Constructs the message payload for the LLM.

    Args:
        article_title (str): The title of the article.
        abstract_text (str): The abstract text of the article.
        sys_prompt (str): System prompt to steer the model; defaults to
            SYS_PROMPT.

    Returns:
        List[Dict[str, str]]: A list of message dictionaries for the LLM.
    """
    user_prompt = f"""You are looking at an article with title: {article_title}.
    The article's abstract is as follows: \n{abstract_text}.
    Summarise the article. Start your summary by providing a short sentence on what the article is about
    and then a bulleted list of the key points covered in the article.
"""
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return messages


def generate_response(messages: List[Dict[str, str]], model: str) -> str:
    """
    Generates a response from the LLM based on the provided messages.

    Args:
        messages (List[Dict[str, str]]): The message payload for the LLM.
        model (str): The model to use for generating the response.

    Returns:
        str: The content of the LLM's response.
    """
    response = ollama.chat(model=model, messages=messages)
    return response["message"]["content"]


def summariser(article_id: str, model: str) -> str:
    """Gradio callback: fetch an article's abstract by PMCID and summarise it.

    Args:
        article_id (str): Europe PMC identifier, e.g. "PMC1234567".
        model (str): Name of the Ollama model to use.

    Returns:
        str: Markdown with the article title and the generated summary.

    Raises:
        gr.Error: If the PMCID is malformed/empty or no abstract is found.
    """
    # Bug fix: the original only validated when article_id was truthy, so an
    # empty input skipped validation entirely and produced a malformed URL.
    # Strip whitespace (tolerates copy-paste padding) and always validate.
    article_id = (article_id or "").strip()
    if not _PMCID_RE.match(article_id):
        raise gr.Error(
            "Please check the length/Format of the provided Article ID. "
            "It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'."
        )

    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
    soup = get_xml_from_url(url)
    article_title, abstract_text = fetch_article_abstract(soup)
    if not abstract_text:
        raise gr.Error(f"No abstract found for {article_title}")
    messages = build_message(article_title, abstract_text)

    # Pull the model from ollama (no-op download if it is already present,
    # but it does round-trip to the daemon on every call).
    ollama.pull(model)
    summary = generate_response(messages, model)

    return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"


INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses PMCID to fetch articles from the Europe PMC(EPMC) Website. It currently only runs on article's abstract. Future improvements would integrate full-text articles"
INST_TXT = "Enter a **EuropePMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary"


def gradio_ui():
    """Build and return the Gradio Blocks app (not launched here)."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(INTRO_TXT)
        gr.Markdown(INST_TXT)

        with gr.Row():
            with gr.Column(scale=1):
                article_id = gr.Textbox(
                    label="Enter Article's PMCID here", placeholder="e.g., PMC1234567"
                )
                model_choice = gr.Dropdown(
                    choices=["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"],
                    value="llama3.2",
                    label="Select a model",
                )
                run_btn = gr.Button(
                    "Fetch Article Abstract and generate Summary", variant="primary"
                )
            with gr.Column(scale=1):
                output_box = gr.Markdown()

        run_btn.click(fn=summariser, inputs=[article_id, model_choice], outputs=output_box)

    return demo


if __name__ == "__main__":
    app = gradio_ui()
    app.launch(share=True, debug=True)