add week1 assignments
202 week1_myversion/article_summariser.py Normal file
@@ -0,0 +1,202 @@
import functools
import re
from typing import Dict, List, Tuple

import requests
from bs4 import BeautifulSoup as bs
from loguru import logger

import ollama
import gradio as gr
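
# Third-party dependencies assumed by this script; note that the "lxml-xml"
# parser used below also requires the lxml package:
#   pip install requests beautifulsoup4 lxml loguru ollama gradio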

SYS_PROMPT = """
You are an expert in biomedical text mining and information extraction.
You excel at breaking down complex articles into digestible content for your audience,
which may comprise students, early-career researchers and professionals in the field.
Summarize the key findings in the following article [ARTICLE].
Your summary should provide the crucial points covered in the paper so that your diverse audience can quickly understand the most vital information.
Crucial points to include in your summary:
- Main objectives of the study
- Key findings and results
- Methodologies used
- Implications of the findings (if any)
- Any limitations or future directions mentioned

Format: Provide your summary as bullet points highlighting key areas, followed by a concise paragraph that encapsulates the results of the paper.

The tone should be professional and clear.
"""


def catch_request_error(func):
    """
    Decorator that catches request errors and returns None if one occurs.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except requests.RequestException as e:
            logger.error(f"Request error in {func.__name__}: {e}")
            return None
    return wrapper
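
# Sketch of the intended use (hypothetical function, for illustration only):
#   @catch_request_error
#   def fetch(url): return requests.get(url, timeout=30)
#   fetch("https://unreachable.invalid")  # -> None instead of raising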


@catch_request_error
@logger.catch
def get_xml_from_url(url: str) -> bs:
    """
    Fetches the XML content of an article from the Europe PMC website.

    Args:
        url (str): Europe PMC REST URL to fetch the XML from.

    Returns:
        soup (bs4.BeautifulSoup): Parsed XML content, or None (via the
        decorator) if the request fails.
    """
    response = requests.get(url, timeout=30)  # timeout so the request cannot hang indefinitely
    response.raise_for_status()  # raise on HTTP errors so the decorator can catch them
    return bs(response.content, "lxml-xml")
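
# Usage sketch (the PMCID below is the UI's placeholder example, not a validated ID):
#   url = "https://www.ebi.ac.uk/europepmc/webservices/rest/PMC1234567/fullTextXML"
#   soup = get_xml_from_url(url)  # BeautifulSoup tree on success, None on request failure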


def clean_text(text: str) -> str:
    """
    Cleans a text by removing reference patterns, extra whitespace, and
    escaped LaTeX-style formatting that appears in the text body instead of
    predefined LaTeX tags.

    Args:
        text (str): The text to be cleaned.

    Returns:
        text (str): The cleaned text.
    """
    # Remove LaTeX-style math and formatting tags (mostly filtered from the soup content, but some still appear)
    text = re.sub(r"\{.*?\}", "", text)  # removes anything inside curly braces {}
    text = re.sub(r"\\[a-zA-Z]+", "", text)  # removes backslash commands such as \textbf

    # Remove reference tags like [34] or [1,2,3]
    text = re.sub(r"\[\s*(\d+\s*(,\s*\d+\s*)*)\]", "", text)

    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text
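
# For instance, given hypothetical leftover markup in an abstract:
#   clean_text(r"The \textbf{BRCA1} gene {bold} was implicated [1,2] in repair.")
#   -> "The gene was implicated in repair."
# Note that text inside curly braces (here "BRCA1") is dropped along with the braces.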


def fetch_article_abstract(soup: bs) -> Tuple[str, str]:
    """
    Extracts the title and abstract text from the XML soup.

    Args:
        soup (bs4.BeautifulSoup): Parsed XML content.

    Returns:
        Tuple[str, str]: The article's title and its extracted abstract text.
    """
    if soup is None:
        return "No XML found", ""

    title_tag = soup.find("article-title")
    article_title = title_tag.get_text(strip=True) if title_tag else "No Title Found for this article"

    abstract_tag = soup.find("abstract")
    if abstract_tag:
        abstract_text = " ".join(clean_text(p.get_text(strip=True)) for p in abstract_tag.find_all("p") if p.get_text(strip=True))
    else:
        abstract_text = ""
    return article_title, abstract_text
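
# Expected shape of the XML (assumed JATS-style markup, simplified):
#   <article-title>Example title</article-title>
#   <abstract><p>First paragraph.</p><p>Second paragraph.</p></abstract>
# would yield ("Example title", "First paragraph. Second paragraph.")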


def build_message(article_title: str, abstract_text: str, sys_prompt: str = SYS_PROMPT) -> List[Dict[str, str]]:
    """
    Constructs the message payload for the LLM.

    Args:
        article_title (str): The title of the article.
        abstract_text (str): The abstract text of the article.
        sys_prompt (str): The system prompt to use; defaults to SYS_PROMPT.

    Returns:
        List[Dict[str, str]]: A list of message dictionaries for the LLM.
    """
    user_prompt = f"""You are looking at an article with title: {article_title}.
    The article's abstract is as follows: \n{abstract_text}.
    Summarise the article. Start your summary by providing a short sentence on what the article is about
    and then a bulleted list of the key points covered in the article.
    """
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return messages
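
# For example, build_message("Example title", "Example abstract") returns roughly:
#   [{"role": "system", "content": SYS_PROMPT},
#    {"role": "user", "content": "You are looking at an article with title: Example title. ..."}]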


def generate_response(messages: List[Dict[str, str]], model: str) -> str:
    """
    Generates a response from the LLM based on the provided messages.

    Args:
        messages (List[Dict[str, str]]): The message payload for the LLM.
        model (str): The model to use for generating the response.

    Returns:
        str: The content of the LLM's response.
    """
    response = ollama.chat(model=model, messages=messages)
    return response["message"]["content"]


def summariser(article_id: str, model: str) -> str:
    """
    Fetches an article's abstract from Europe PMC by PMCID and returns a formatted summary.
    """
    if not article_id or not re.match(r"^PMC\d{5,8}$", article_id):
        raise gr.Error("Please check the length/format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.")
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
    soup = get_xml_from_url(url)
    article_title, abstract_text = fetch_article_abstract(soup)
    if not abstract_text:
        raise gr.Error(f"No abstract found for {article_title}")
    messages = build_message(article_title, abstract_text)

    # pull the model from the Ollama registry so it is available locally
    ollama.pull(model)
    summary = generate_response(messages, model)

    return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"
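
# e.g. summariser("PMC1234567", "llama3.2") returns a Markdown string with the
# article title and the generated summary (the ID here is just the placeholder example).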

INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses a PMCID to fetch articles from the Europe PMC (EPMC) website. It currently runs only on an article's abstract; future improvements will integrate full-text articles."
INST_TXT = "Enter a **Europe PMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary."


def gradio_ui():
    """
    Builds the Gradio interface for the summariser.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(INTRO_TXT)
        gr.Markdown(INST_TXT)

        with gr.Row():
            with gr.Column(scale=1):
                article_id = gr.Textbox(label="Enter Article's PMCID here", placeholder="e.g., PMC1234567")
                model_choice = gr.Dropdown(choices=["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"], value="llama3.2", label="Select a model")
                run_btn = gr.Button("Fetch Article Abstract and generate Summary", variant="primary")
            with gr.Column(scale=1):
                output_box = gr.Markdown()

        run_btn.click(fn=summariser, inputs=[article_id, model_choice], outputs=output_box)

    return demo


if __name__ == "__main__":
    app = gradio_ui()
    app.launch(share=True, debug=True)