Fix: Convert protocol_summarizer_webapp from submodule to regular files

- Remove protocol_summarizer_webapp submodule reference
- Add all webapp files as regular files to enable proper PR creation
- Includes Streamlit app, documentation, and configuration files
albertoclemente
2025-07-03 17:23:00 +02:00
parent 581ae06597
commit 99d1d2b4f5
6 changed files with 224 additions and 1 deletion

Submodule community-contributions/protocol_summarizer_webapp deleted from de831a5894

community-contributions/protocol_summarizer_webapp/.github/copilot-instructions.md

@@ -0,0 +1,3 @@
<!-- Use this file to provide workspace-specific custom instructions to Copilot. For more details, visit https://code.visualstudio.com/docs/copilot/copilot-customization#_use-a-githubcopilotinstructionsmd-file -->
This is a Streamlit web application for clinical trial protocol summarization. Use Streamlit best practices for the UI and Python for the backend logic. Integrate with the ClinicalTrials.gov v2 API for study search and with OpenAI for summarization.

community-contributions/protocol_summarizer_webapp/.gitignore

@@ -0,0 +1,30 @@
updates.md
.env
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
venv/
ENV/
.streamlit/
.idea/
.vscode/
*.swp
*.swo
.DS_Store

community-contributions/protocol_summarizer_webapp/README.md

@@ -0,0 +1,66 @@
# Protocol Summarizer Webapp
A Streamlit web application for searching and summarizing clinical trial protocols from ClinicalTrials.gov using Large Language Models. This tool enables researchers and clinical professionals to quickly extract key information from clinical trial protocols.
## Features
- Search for clinical trials by keyword
- Display a list of studies with title and NCT number
- Select a study to summarize
- Fetch the protocol's brief summary from ClinicalTrials.gov API
- Automatically summarize the protocol using OpenAI's LLM
- Extract structured information like study design, population, interventions, and endpoints
## Installation

1. Clone this repository:

   ```sh
   git clone https://github.com/albertoclemente/protocol_summarizer.git
   cd protocol_summarizer/protocol_summarizer_webapp
   ```

2. Install dependencies:

   ```sh
   pip install -r requirements.txt
   ```

3. Create a `.env` file in the project root with your OpenAI API key (the app reads it at startup, as sketched below):

   ```
   OPENAI_API_KEY=your_api_key_here
   ```
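For reference, the app picks this key up at startup via `python-dotenv` and the OpenAI client; a minimal sketch of the pattern used in `app.py` (the variable name `client` is only illustrative):

```python
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()      # reads OPENAI_API_KEY from .env into the environment
client = OpenAI()  # the OpenAI client picks the key up from the environment
```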
## Usage

1. Run the Streamlit app:

   ```sh
   streamlit run app.py
   ```

2. In your browser:
   - Enter a disease, condition, or keyword in the search box
   - Select the number of results to display
   - Click the "Search" button
   - Select a study from the results
   - Click "Summarize Protocol" to generate a structured summary
## Technical Details

- Uses the ClinicalTrials.gov v2 API to retrieve study information (see the sketch below)
- Implements fallback methods to handle API changes or failures
- Extracts protocol brief summaries using reliable JSON parsing
- Generates structured summaries using OpenAI's GPT models
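For reference, a minimal sketch of the v2 search call described above, using the endpoint and response fields that `app.py` relies on (the keyword `melanoma` is only an example; error handling is omitted):

```python
import requests

# Query the ClinicalTrials.gov v2 search endpoint for studies matching a keyword
resp = requests.get(
    "https://clinicaltrials.gov/api/v2/studies",
    params={"query.term": "melanoma", "pageSize": 5, "format": "json"},
)
for study in resp.json().get("studies", []):
    ident = study.get("protocolSection", {}).get("identificationModule", {})
    print(ident.get("nctId"), ident.get("officialTitle"))
```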
## Requirements
- Python 3.7+
- Streamlit
- Requests
- OpenAI Python library
- python-dotenv
## Contribution
Contributions are welcome! Please feel free to submit a Pull Request.
## License
MIT License

community-contributions/protocol_summarizer_webapp/app.py

@@ -0,0 +1,121 @@
import os
from dotenv import load_dotenv
import streamlit as st
import requests
from openai import OpenAI

load_dotenv()

st.title("Protocol Summarizer")
st.markdown("""
Search for clinical trials by keyword, select a study, and generate a protocol summary using an LLM.
""")

# Search input
# Show results only after user presses Enter
with st.form(key="search_form"):
    query = st.text_input("Enter a disease, study title, or keyword:")
    max_results = st.slider("Number of results", 1, 20, 5)
    submitted = st.form_submit_button("Search")

@st.cache_data(show_spinner=False)
def search_clinical_trials(query, max_results=5):
    if not query:
        return []
    url = f"https://clinicaltrials.gov/api/v2/studies?query.term={query}&pageSize={max_results}&format=json"
    resp = requests.get(url)
    studies = []
    if resp.status_code == 200:
        data = resp.json()
        for study in data.get('studies', []):
            nct = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId', 'N/A')
            title = study.get('protocolSection', {}).get('identificationModule', {}).get('officialTitle', 'N/A')
            studies.append({'nct': nct, 'title': title})
    return studies

results = search_clinical_trials(query, max_results) if query else []

if results:
    st.subheader("Search Results")
    for i, study in enumerate(results):
        st.markdown(f"**{i+1}. {study['title']}** (NCT: {study['nct']})")

    selected = st.number_input("Select study number to summarize", min_value=1, max_value=len(results), value=1)
    selected_study = results[selected-1]
    st.markdown(f"### Selected Study\n**{selected_study['title']}** (NCT: {selected_study['nct']})")

    if st.button("Summarize Protocol"):
        # Fetch the brief summary for the selected study
        nct_id = selected_study['nct']
        # Use the V2 API which we know works reliably
        url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?format=json"
        with st.spinner("Fetching study details..."):
            resp = requests.get(url)

        brief = ""
        if resp.status_code == 200:
            try:
                data = resp.json()
                # V2 API has protocolSection at the root level
                if 'protocolSection' in data:
                    desc_mod = data.get('protocolSection', {}).get('descriptionModule', {})
                    brief = desc_mod.get('briefSummary', '')
                    # If briefSummary is empty, try detailedDescription
                    if not brief:
                        brief = desc_mod.get('detailedDescription', '')
            except Exception as e:
                st.error(f"Error parsing study data: {e}")

        # If API fails, try HTML scraping as a fallback
        if not brief and resp.status_code != 200:
            st.warning(f"API returned status code {resp.status_code}. Trying alternative method...")
            html_url = f"https://clinicaltrials.gov/ct2/show/{nct_id}"
            html_resp = requests.get(html_url)
            if "Brief Summary:" in html_resp.text:
                start = html_resp.text.find("Brief Summary:") + 15
                excerpt = html_resp.text[start:start+1000]
                # Clean up HTML
                import re
                excerpt = re.sub('<[^<]+?>', ' ', excerpt)
                excerpt = re.sub('\\s+', ' ', excerpt)
                brief = excerpt.strip()

        if not brief:
            st.error("No brief summary or detailed description found for this study.")
            st.stop()

        # Now we have the brief summary, send it to the LLM
        openai = OpenAI()

        def user_prompt_for_protocol_brief(brief_text):
            return (
                "Extract the following details from the clinical trial brief summary in markdown format with clear section headings (e.g., ## Study Design, ## Population, etc.):\n"
                "- Study design\n"
                "- Population\n"
                "- Interventions\n"
                "- Primary and secondary endpoints\n"
                "- Study duration\n\n"
                f"Brief summary text:\n{brief_text}"
            )

        system_prompt = "You are a clinical research assistant. Extract and list the requested protocol details in markdown format with clear section headings."
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt_for_protocol_brief(brief)}
        ]

        with st.spinner("Summarizing with LLM..."):
            try:
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages
                )
                summary = response.choices[0].message.content
                st.markdown(summary)
            except Exception as e:
                st.error(f"LLM call failed: {e}")
else:
    if query:
        st.info("No results found. Try a different keyword.")

community-contributions/protocol_summarizer_webapp/requirements.txt

@@ -0,0 +1,4 @@
streamlit
openai
requests
python-dotenv