Delete week1_myversion directory

2025-10-18 12:03:46 +01:00
parent 4b51a8f9e1
commit 974e11250c
1 changed files with 0 additions and 202 deletions
--- a/week1_myversion/article_summariser.py
+++ b/week1_myversion/article_summariser.py
@@ -1,202 +0,0 @@
 import re
 import requests
 import functools
 from typing import List, Tuple, Dict, Any
 from loguru import logger
 from bs4 import BeautifulSoup as bs
 import ollama
 import gradio as gr
 # System prompt for the summarisation chat: it is the default ``sys_prompt``
 # of build_message, which sends it as the "system" message.
 # NOTE(review): the "[ARTICLE]" token below is literal text -- nothing visible
 # in this file substitutes it; the article reaches the model via the user message.
 SYS_PROMPT = """
 You are an expert in biomedical text mining and information extraction. 
 You excel at breaking down complex articles into digestible contents for your audience. 
 Your audience can comprise of students, early researchers and professionals in the field.
 Summarize the key findings in the following article [ARTICLE] .
 Your summary should provide crucial points covered in the paper that helps your diverse audience quickly understand the most vital information. 
 Crucial points to include in your summary:
 - Main objectives of the study
 - Key findings and results
 - Methodologies used
 - Implications of the findings(if any)
 - Any limitations or future directions mentioned
 Format: Provide your summary in bullet points highlighting key areas followed with a concise paragraph that encapsulates the results of the paper.
 The tone should be professional and clear.
 """
 def catch_request_error(func):
     """
     Decorator that shields callers from ``requests`` failures.

     Any ``requests.RequestException`` raised by the wrapped callable is
     reported on stdout and swallowed; the call then evaluates to None.
     Every other exception type propagates unchanged.
     """
     @functools.wraps(func)
     def guarded(*args, **kwargs):
         outcome = None  # value handed back when the request fails
         try:
             outcome = func(*args, **kwargs)
         except requests.RequestException as exc:
             print(f"Request error in {func.__name__}: {exc}")
         return outcome
     return guarded
@logger.catch
@catch_request_error
def get_xml_from_url(url: str, timeout: float = 30.0) -> bs:
    """
    Fetches the XML content from Europe PMC website.

    Decorator order matters: ``catch_request_error`` is applied closest to
    the function so request failures reach it. ``logger.catch`` swallows every
    exception by default, so when it was the inner decorator the request
    handler could never run. As the outer decorator it now only logs the
    remaining, unexpected errors (e.g. parser failures).

    Args:
        url (str): Europe PMC's production url to fetch the XML from.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.
    Returns:
        soup (bs4.BeautifulSoup): Parsed XML content, or None when the
        request fails or any other error is caught by the decorators.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
    return bs(response.content, "lxml-xml")
 def clean_text(text: str) -> str:
     """
     Strip citation markers, LaTeX residue and surplus whitespace from a text.

     Args:
         text (str): The text to be cleaned.
     Returns:
         str: The cleaned text, with whitespace runs collapsed to single
         spaces and no leading or trailing whitespace.
     """
     # Drop brace groups left over from LaTeX markup. DOTALL lets a group that
     # was wrapped across lines be removed as well ('.' skips newlines otherwise).
     text = re.sub(r"\{.*?\}", "", text, flags=re.DOTALL)
     # Drop LaTeX commands: a backslash followed by letters, e.g. \alpha.
     text = re.sub(r"\\[a-zA-Z]+", "", text)
     # Drop numeric citation markers: [34], [1,2,3] and ranges such as
     # [1-3] or [1–3] (hyphen or en dash).
     text = re.sub(r"\[\s*\d+(?:\s*[,\-\u2013]\s*\d+)*\s*\]", "", text)
     # Collapse whitespace last so the gaps left by the removals are tidied up.
     return re.sub(r"\s+", " ", text).strip()
 def fetch_article_abstract(soup: bs) -> Tuple[str, str]:
     """
     Extracts the article title and abstract text from the XML soup.

     Args:
         soup (bs4.BeautifulSoup): Parsed XML content, or None when the
             fetch failed.
     Returns:
         Tuple(article_title (str), abstract_text (str)): The article's title
         and its cleaned abstract text. The abstract is "" when the document
         has no <abstract> element; the pair is ("No XML found", "") when
         soup is None.
     """
     if soup is None:
         return "No XML found", ""

     title_tag = soup.find("article-title")  # single lookup, reused below
     if title_tag:
         article_title = title_tag.get_text(strip=True)
     else:
         article_title = "No Title Found for this article"

     abstract_tag = soup.find("abstract")
     if not abstract_tag:
         return article_title, ""

     # Clean each paragraph first, then drop the empty ones: a paragraph that
     # holds only removable markup (e.g. "[1]") would otherwise leave a stray
     # double space in the joined abstract.
     cleaned_paragraphs = (
         clean_text(p.get_text(strip=True)) for p in abstract_tag.find_all("p")
     )
     abstract_text = " ".join(part for part in cleaned_paragraphs if part)
     return article_title, abstract_text
 def build_message(article_title: str, abstract_text: str, sys_prompt: str = SYS_PROMPT) -> List[Dict[str, str]]:
     """
     Assemble the two-turn chat payload (system + user) for the LLM.

     Args:
         article_title (str): The title of the article.
         abstract_text (str): The abstract text of the article.
         sys_prompt (str): Content of the system message; defaults to SYS_PROMPT.
     Returns:
         List[Dict[str, str]]: The system message followed by the user message.
     """
     # The user turn embeds the title and abstract in the summarisation request.
     user_content = f"""You are looking at an article with title:  {article_title}. 
    The article's abstract is as follows: \n{abstract_text}.
    Summarise the article. Start your summary by providing a short sentence on what the article is about 
    and then a bulleted list of the key points covered in the article.
 """
     system_turn = {"role": "system", "content": sys_prompt}
     user_turn = {"role": "user", "content": user_content}
     return [system_turn, user_turn]
 def generate_response(messages: List[Dict[str, str]], model: str) -> str:
     """
     Send the chat messages to an Ollama model and return its reply text.

     Args:
         messages (List[Dict[str, str]]): The message payload for the LLM.
         model (str): Name of the Ollama model that should answer.
     Returns:
         str: The "content" field of the reply's "message" entry.
     """
     return ollama.chat(model=model, messages=messages)["message"]["content"]
 def summariser(article_id: str, model: str) -> str:
     """
     Fetch an article's abstract from Europe PMC and summarise it with an LLM.

     Args:
         article_id (str): Europe PMC article ID: 'PMC' followed by 5 to 8
             digits. Surrounding whitespace is ignored.
         model (str): Name of the Ollama model used to write the summary.
     Returns:
         str: Markdown containing the article title and the generated summary.
     Raises:
         gr.Error: If the ID is missing or malformed, the article XML could
             not be retrieved, or the article has no abstract.
     """
     # Normalise first so an empty/None value is rejected instead of slipping
     # past validation; fullmatch also rejects a trailing newline that
     # match() with "$" would let through.
     article_id = (article_id or "").strip()
     if not re.fullmatch(r"PMC\d{5,8}", article_id):
         raise gr.Error("Please check the length/Format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.")

     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
     soup = get_xml_from_url(url)
     if soup is None:
         # The fetch helper returns None on failure; report that directly
         # rather than as a misleading "No abstract found for No XML found".
         raise gr.Error(f"Could not retrieve the article XML for {article_id} from Europe PMC.")

     article_title, abstract_text = fetch_article_abstract(soup)
     if not abstract_text:
         raise gr.Error(f"No abstract found for {article_title}")

     messages = build_message(article_title, abstract_text)
     # Pull the model from Ollama before chatting with it.
     ollama.pull(model)
     summary = generate_response(messages, model)
     return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"
 # Introductory blurb rendered as Markdown at the top of the Gradio page.
 INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses PMCID to fetch articles from the Europe PMC(EPMC) Website. It currently only runs on article's abstract. Future improvements would integrate full-text articles"
 # Usage instructions rendered as Markdown directly below the intro text.
 INST_TXT = "Enter a **EuropePMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary"
 def gradio_ui():
     """
     Build the Gradio interface for the article summariser.

     Layout: intro and instruction text on top, then a row with two columns.
     The left column holds the article-ID textbox, the model dropdown and the
     run button; the right column holds the Markdown output. Clicking the
     button calls ``summariser`` with the ID and the selected model.

     Returns:
         The assembled ``gr.Blocks`` app (not yet launched).
     """
     model_names = ["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"]
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(INTRO_TXT)
         gr.Markdown(INST_TXT)
         with gr.Row():
             # Left column: user inputs and the trigger button.
             with gr.Column(scale=1):
                 article_id = gr.Textbox(
                     label="Enter Article's PMCID here",
                     placeholder="e.g., PMC1234567",
                 )
                 model_choice = gr.Dropdown(
                     choices=model_names,
                     value="llama3.2",
                     label="Select a model",
                 )
                 run_btn = gr.Button(
                     "Fetch Article Abstract and generate Summary",
                     variant='primary',
                 )
             # Right column: rendered summary.
             with gr.Column(scale=1):
                 output_box = gr.Markdown()
         run_btn.click(
             fn=summariser,
             inputs=[article_id, model_choice],
             outputs=output_box,
         )
     return demo
 if __name__ == "__main__":
     # Script entry point: build the interface, then serve it with sharing
     # and debug mode switched on.
     app = gradio_ui()
     app.launch(debug=True, share=True)