Fix: Convert protocol_summarizer_webapp from submodule to regular files

- Remove protocol_summarizer_webapp submodule reference
- Add all webapp files as regular files to enable proper PR creation
- Includes Streamlit app, documentation, and configuration files
albertoclemente
2025-07-03 17:23:00 +02:00
parent 581ae06597
commit 99d1d2b4f5
6 changed files with 224 additions and 1 deletion

Submodule community-contributions/protocol_summarizer_webapp deleted from de831a5894

community-contributions/protocol_summarizer_webapp/.github/copilot-instructions.md

@@ -0,0 +1,3 @@
<!-- Use this file to provide workspace-specific custom instructions to Copilot. For more details, visit https://code.visualstudio.com/docs/copilot/copilot-customization#_use-a-githubcopilotinstructionsmd-file -->
This is a Streamlit web application for clinical trial protocol summarization. Use Streamlit best practices for the UI and Python for the backend logic. Integrate with the ClinicalTrials.gov v2 API for study search and with OpenAI for summarization.

community-contributions/protocol_summarizer_webapp/.gitignore

@@ -0,0 +1,30 @@
updates.md
.env
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
venv/
ENV/
.streamlit/
.idea/
.vscode/
*.swp
*.swo
.DS_Store

community-contributions/protocol_summarizer_webapp/README.md

@@ -0,0 +1,66 @@
# Protocol Summarizer Webapp
A Streamlit web application for searching and summarizing clinical trial protocols from ClinicalTrials.gov using Large Language Models. This tool enables researchers and clinical professionals to quickly extract key information from clinical trial protocols.
## Features
- Search for clinical trials by keyword
- Display a list of studies with title and NCT number
- Select a study to summarize
- Fetch the protocol's brief summary from ClinicalTrials.gov API
- Automatically summarize the protocol using OpenAI's LLM
- Extract structured information like study design, population, interventions, and endpoints
## Installation

1. Clone this repository:

   ```sh
   git clone https://github.com/albertoclemente/protocol_summarizer.git
   cd protocol_summarizer/protocol_summarizer_webapp
   ```

2. Install dependencies:

   ```sh
   pip install -r requirements.txt
   ```

3. Create a `.env` file in the project root with your OpenAI API key (the app reads it at startup, as sketched below):

   ```
   OPENAI_API_KEY=your_api_key_here
   ```
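For reference, the app picks this key up at startup via `python-dotenv` and the OpenAI client; a minimal sketch of the pattern used in `app.py` (the variable name `client` is only illustrative):

```python
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()      # reads OPENAI_API_KEY from .env into the environment
client = OpenAI()  # the OpenAI client picks the key up from the environment
```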
## Usage

1. Run the Streamlit app:

   ```sh
   streamlit run app.py
   ```

2. In your browser:
   - Enter a disease, condition, or keyword in the search box
   - Select the number of results to display
   - Click the "Search" button
   - Select a study from the results
   - Click "Summarize Protocol" to generate a structured summary
## Technical Details

- Uses the ClinicalTrials.gov v2 API to retrieve study information (see the sketch below)
- Implements fallback methods to handle API changes or failures
- Extracts protocol brief summaries using reliable JSON parsing
- Generates structured summaries using OpenAI's GPT models
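For reference, a minimal sketch of the v2 search call described above, using the endpoint and response fields that `app.py` relies on (the keyword `melanoma` is only an example; error handling is omitted):

```python
import requests

# Query the ClinicalTrials.gov v2 search endpoint for studies matching a keyword
resp = requests.get(
    "https://clinicaltrials.gov/api/v2/studies",
    params={"query.term": "melanoma", "pageSize": 5, "format": "json"},
)
for study in resp.json().get("studies", []):
    ident = study.get("protocolSection", {}).get("identificationModule", {})
    print(ident.get("nctId"), ident.get("officialTitle"))
```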
## Requirements
- Python 3.7+
- Streamlit
- Requests
- OpenAI Python library
- python-dotenv
## Contribution
Contributions are welcome! Please feel free to submit a Pull Request.
## License
MIT License

community-contributions/protocol_summarizer_webapp/app.py

@@ -0,0 +1,121 @@
import os
from dotenv import load_dotenv
import streamlit as st
import requests
from openai import OpenAI

load_dotenv()

st.title("Protocol Summarizer")
st.markdown("""
Search for clinical trials by keyword, select a study, and generate a protocol summary using an LLM.
""")

# Search input
# Show results only after user presses Enter
with st.form(key="search_form"):
    query = st.text_input("Enter a disease, study title, or keyword:")
    max_results = st.slider("Number of results", 1, 20, 5)
    submitted = st.form_submit_button("Search")

@st.cache_data(show_spinner=False)
def search_clinical_trials(query, max_results=5):
    if not query:
        return []
    url = f"https://clinicaltrials.gov/api/v2/studies?query.term={query}&pageSize={max_results}&format=json"
    resp = requests.get(url)
    studies = []
    if resp.status_code == 200:
        data = resp.json()
        for study in data.get('studies', []):
            nct = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId', 'N/A')
            title = study.get('protocolSection', {}).get('identificationModule', {}).get('officialTitle', 'N/A')
            studies.append({'nct': nct, 'title': title})
    return studies

results = search_clinical_trials(query, max_results) if query else []

if results:
    st.subheader("Search Results")
    for i, study in enumerate(results):
        st.markdown(f"**{i+1}. {study['title']}** (NCT: {study['nct']})")

    selected = st.number_input("Select study number to summarize", min_value=1, max_value=len(results), value=1)
    selected_study = results[selected-1]
    st.markdown(f"### Selected Study\n**{selected_study['title']}** (NCT: {selected_study['nct']})")

    if st.button("Summarize Protocol"):
        # Fetch the brief summary for the selected study
        nct_id = selected_study['nct']
        # Use the V2 API which we know works reliably
        url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?format=json"
        with st.spinner("Fetching study details..."):
            resp = requests.get(url)

        brief = ""
        if resp.status_code == 200:
            try:
                data = resp.json()
                # V2 API has protocolSection at the root level
                if 'protocolSection' in data:
                    desc_mod = data.get('protocolSection', {}).get('descriptionModule', {})
                    brief = desc_mod.get('briefSummary', '')
                    # If briefSummary is empty, try detailedDescription
                    if not brief:
                        brief = desc_mod.get('detailedDescription', '')
            except Exception as e:
                st.error(f"Error parsing study data: {e}")

        # If API fails, try HTML scraping as a fallback
        if not brief and resp.status_code != 200:
            st.warning(f"API returned status code {resp.status_code}. Trying alternative method...")
            html_url = f"https://clinicaltrials.gov/ct2/show/{nct_id}"
            html_resp = requests.get(html_url)
            if "Brief Summary:" in html_resp.text:
                start = html_resp.text.find("Brief Summary:") + 15
                excerpt = html_resp.text[start:start+1000]
                # Clean up HTML
                import re
                excerpt = re.sub('<[^<]+?>', ' ', excerpt)
                excerpt = re.sub('\\s+', ' ', excerpt)
                brief = excerpt.strip()

        if not brief:
            st.error("No brief summary or detailed description found for this study.")
            st.stop()

        # Now we have the brief summary, send it to the LLM
        openai = OpenAI()

        def user_prompt_for_protocol_brief(brief_text):
            return (
                "Extract the following details from the clinical trial brief summary in markdown format with clear section headings (e.g., ## Study Design, ## Population, etc.):\n"
                "- Study design\n"
                "- Population\n"
                "- Interventions\n"
                "- Primary and secondary endpoints\n"
                "- Study duration\n\n"
                f"Brief summary text:\n{brief_text}"
            )

        system_prompt = "You are a clinical research assistant. Extract and list the requested protocol details in markdown format with clear section headings."
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt_for_protocol_brief(brief)}
        ]

        with st.spinner("Summarizing with LLM..."):
            try:
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages
                )
                summary = response.choices[0].message.content
                st.markdown(summary)
            except Exception as e:
                st.error(f"LLM call failed: {e}")
else:
    if query:
        st.info("No results found. Try a different keyword.")

community-contributions/protocol_summarizer_webapp/requirements.txt

@@ -0,0 +1,4 @@
streamlit
openai
requests
python-dotenv