add week1 assignments
202 week1_myversion/article_summariser.py Normal file
@@ -0,0 +1,202 @@
import functools
import re
from typing import Dict, List, Tuple

import requests
from bs4 import BeautifulSoup as bs
from loguru import logger

import ollama
import gradio as gr
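
# Third-party dependencies assumed by this script; note that the "lxml-xml"
# parser used below also requires the lxml package:
#   pip install requests beautifulsoup4 lxml loguru ollama gradio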

SYS_PROMPT = """
You are an expert in biomedical text mining and information extraction.
You excel at breaking down complex articles into digestible content for your audience,
which may comprise students, early-career researchers and professionals in the field.
Summarize the key findings in the following article [ARTICLE].
Your summary should provide the crucial points covered in the paper so that your diverse audience can quickly understand the most vital information.
Crucial points to include in your summary:
- Main objectives of the study
- Key findings and results
- Methodologies used
- Implications of the findings (if any)
- Any limitations or future directions mentioned

Format: Provide your summary as bullet points highlighting key areas, followed by a concise paragraph that encapsulates the results of the paper.

The tone should be professional and clear.
"""


def catch_request_error(func):
    """
    Decorator that catches request errors and returns None if one occurs.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except requests.RequestException as e:
            logger.error(f"Request error in {func.__name__}: {e}")
            return None
    return wrapper
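
# Sketch of the intended use (hypothetical function, for illustration only):
#   @catch_request_error
#   def fetch(url): return requests.get(url, timeout=30)
#   fetch("https://unreachable.invalid")  # -> None instead of raising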


@catch_request_error
@logger.catch
def get_xml_from_url(url: str) -> bs:
    """
    Fetches the XML content of an article from the Europe PMC website.

    Args:
        url (str): Europe PMC REST URL to fetch the XML from.

    Returns:
        soup (bs4.BeautifulSoup): Parsed XML content, or None (via the
        decorator) if the request fails.
    """
    response = requests.get(url, timeout=30)  # timeout so the request cannot hang indefinitely
    response.raise_for_status()  # raise on HTTP errors so the decorator can catch them
    return bs(response.content, "lxml-xml")
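
# Usage sketch (the PMCID below is the UI's placeholder example, not a validated ID):
#   url = "https://www.ebi.ac.uk/europepmc/webservices/rest/PMC1234567/fullTextXML"
#   soup = get_xml_from_url(url)  # BeautifulSoup tree on success, None on request failure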


def clean_text(text: str) -> str:
    """
    Cleans a text by removing reference patterns, extra whitespace, and
    escaped LaTeX-style formatting that appears in the text body instead of
    predefined LaTeX tags.

    Args:
        text (str): The text to be cleaned.

    Returns:
        text (str): The cleaned text.
    """
    # Remove LaTeX-style math and formatting tags (mostly filtered from the soup content, but some still appear)
    text = re.sub(r"\{.*?\}", "", text)  # removes anything inside curly braces {}
    text = re.sub(r"\\[a-zA-Z]+", "", text)  # removes backslash commands such as \textbf

    # Remove reference tags like [34] or [1,2,3]
    text = re.sub(r"\[\s*(\d+\s*(,\s*\d+\s*)*)\]", "", text)

    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text
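
# For instance, given hypothetical leftover markup in an abstract:
#   clean_text(r"The \textbf{BRCA1} gene {bold} was implicated [1,2] in repair.")
#   -> "The gene was implicated in repair."
# Note that text inside curly braces (here "BRCA1") is dropped along with the braces.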


def fetch_article_abstract(soup: bs) -> Tuple[str, str]:
    """
    Extracts the title and abstract text from the XML soup.

    Args:
        soup (bs4.BeautifulSoup): Parsed XML content.

    Returns:
        Tuple[str, str]: The article's title and its extracted abstract text.
    """
    if soup is None:
        return "No XML found", ""

    title_tag = soup.find("article-title")
    article_title = title_tag.get_text(strip=True) if title_tag else "No Title Found for this article"

    abstract_tag = soup.find("abstract")
    if abstract_tag:
        abstract_text = " ".join(clean_text(p.get_text(strip=True)) for p in abstract_tag.find_all("p") if p.get_text(strip=True))
    else:
        abstract_text = ""
    return article_title, abstract_text
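
# Expected shape of the XML (assumed JATS-style markup, simplified):
#   <article-title>Example title</article-title>
#   <abstract><p>First paragraph.</p><p>Second paragraph.</p></abstract>
# would yield ("Example title", "First paragraph. Second paragraph.")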


def build_message(article_title: str, abstract_text: str, sys_prompt: str = SYS_PROMPT) -> List[Dict[str, str]]:
    """
    Constructs the message payload for the LLM.

    Args:
        article_title (str): The title of the article.
        abstract_text (str): The abstract text of the article.
        sys_prompt (str): The system prompt to use; defaults to SYS_PROMPT.

    Returns:
        List[Dict[str, str]]: A list of message dictionaries for the LLM.
    """
    user_prompt = f"""You are looking at an article with title: {article_title}.
    The article's abstract is as follows: \n{abstract_text}.
    Summarise the article. Start your summary by providing a short sentence on what the article is about
    and then a bulleted list of the key points covered in the article.
    """
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return messages
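
# For example, build_message("Example title", "Example abstract") returns roughly:
#   [{"role": "system", "content": SYS_PROMPT},
#    {"role": "user", "content": "You are looking at an article with title: Example title. ..."}]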


def generate_response(messages: List[Dict[str, str]], model: str) -> str:
    """
    Generates a response from the LLM based on the provided messages.

    Args:
        messages (List[Dict[str, str]]): The message payload for the LLM.
        model (str): The model to use for generating the response.

    Returns:
        str: The content of the LLM's response.
    """
    response = ollama.chat(model=model, messages=messages)
    return response["message"]["content"]


def summariser(article_id: str, model: str) -> str:
    """
    Fetches an article's abstract from Europe PMC by PMCID and returns a formatted summary.
    """
    if not article_id or not re.match(r"^PMC\d{5,8}$", article_id):
        raise gr.Error("Please check the length/format of the provided Article ID. It should start with 'PMC' followed by 5 to 8 digits, e.g., 'PMC1234567'.")
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{article_id}/fullTextXML"
    soup = get_xml_from_url(url)
    article_title, abstract_text = fetch_article_abstract(soup)
    if not abstract_text:
        raise gr.Error(f"No abstract found for {article_title}")
    messages = build_message(article_title, abstract_text)

    # pull the model from the Ollama registry so it is available locally
    ollama.pull(model)
    summary = generate_response(messages, model)

    return f"## 📝 Article Title: {article_title}\n\n### 📌 Summary:\n{summary}"
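
# e.g. summariser("PMC1234567", "llama3.2") returns a Markdown string with the
# article title and the generated summary (the ID here is just the placeholder example).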

INTRO_TXT = "This is a simple Biomedical Article Summariser. It uses a PMCID to fetch articles from the Europe PMC (EPMC) website. It currently runs only on an article's abstract; future improvements will integrate full-text articles."
INST_TXT = "Enter a **Europe PMC Article ID** (e.g., `PMC1234567`) and select a model from the dropdown menu to generate a structured summary."


def gradio_ui():
    """
    Builds the Gradio interface for the summariser.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(INTRO_TXT)
        gr.Markdown(INST_TXT)

        with gr.Row():
            with gr.Column(scale=1):
                article_id = gr.Textbox(label="Enter Article's PMCID here", placeholder="e.g., PMC1234567")
                model_choice = gr.Dropdown(choices=["llama3.2", "deepseek-r1", "gemma3", "mistral", "gpt-oss"], value="llama3.2", label="Select a model")
                run_btn = gr.Button("Fetch Article Abstract and generate Summary", variant="primary")
            with gr.Column(scale=1):
                output_box = gr.Markdown()

        run_btn.click(fn=summariser, inputs=[article_id, model_choice], outputs=output_box)

    return demo


if __name__ == "__main__":
    app = gradio_ui()
    app.launch(share=True, debug=True)