Merge pull request #424 from MiR-stack/community-contributions-branch

user can summarize research papers by website link
This commit is contained in:
Ed Donner
2025-06-06 21:38:12 -04:00
committed by GitHub

View File

@@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
"metadata": {},
"outputs": [],
"source": [
"!pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb6636be-e43f-4896-aadd-cafda003ed4e",
"metadata": {},
"outputs": [],
"source": [
"!pip install -q -U google-genai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfe66209-1d33-4292-80f1-20e11baf4bc3",
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"from google import genai\n",
"from google.genai import types\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2b4306c-17d0-46fe-a889-7440ff809dc6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"#load env\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('GEMINI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "markdown",
"id": "08ec6fec-886c-4a0c-a046-e8643ad700d3",
"metadata": {},
"source": [
"# Lets make a simple call for check our model is working fine or not"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89143d5c-0013-4f7e-8e1f-f7db7e936f0d",
"metadata": {},
"outputs": [],
"source": [
"client = genai.Client(api_key=api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1144b77a-6785-479a-ab4f-bb0ab5624b49",
"metadata": {},
"outputs": [],
"source": [
"\n",
"response = client.models.generate_content(\n",
" model=\"gemini-2.5-flash-preview-05-20\",\n",
" contents=[\"hi gemini\"]\n",
")\n",
"print(response.text)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbf3836c-19b8-44e1-904a-f265925c2786",
"metadata": {},
"outputs": [],
"source": [
"\n",
"class Website:\n",
" def __init__(self, url, driver_path=None, wait_time=3):\n",
" self.url = url\n",
" self.wait_time = wait_time\n",
"\n",
" # Headless Chrome settings\n",
" options = Options()\n",
" # options.add_argument(\"--headless\") \n",
" # Headless mode runs the browser in the background (invisible).\n",
" # However, some websites (like openai.com) block headless browsers.\n",
" # So if this line is active, the page may not load correctly and you may not get the full content.\n",
" options.add_argument(\"--disable-gpu\")\n",
" options.add_argument(\"--no-sandbox\")\n",
" options.add_argument(\"--window-size=1920x1080\")\n",
"\n",
" # Driver path\n",
" if driver_path:\n",
" service = Service(executable_path=driver_path)\n",
" else:\n",
" service = Service() \n",
"\n",
" # Start browser\n",
" driver = webdriver.Chrome(service=service, options=options)\n",
" driver.get(url)\n",
"\n",
" # Wait for the loading page\n",
" time.sleep(self.wait_time)\n",
"\n",
" # Take page source\n",
" html = driver.page_source\n",
" driver.quit()\n",
"\n",
" # Analysis with BeautifulSoup \n",
" soup = BeautifulSoup(html, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
"\n",
" # Clean irrelevant tags\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
"\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are an academic research assistant specialized in summarizing scholarly papers. Follow this workflow rigorously:\n",
"\n",
"Step 1: Document Verification\n",
"Verify if the input is a research paper by checking for:\n",
"\n",
"Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)\n",
"\n",
"Technical/scholarly language\n",
"\n",
"Citations (in-text or bibliography)\n",
"\n",
"Research claims or data analysis\n",
"If NOT a research paper:\n",
"→ Respond: \"This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization.\"\n",
"\n",
"Step 2: Structured Summary (If verified)\n",
"Generate a 5-section summary in this exact format:\n",
"\n",
"1. Research Question\n",
"[Identify core problem/gap addressed in 1 sentence]\n",
"\n",
"2. Methodology\n",
"[Study design, data sources, analytical techniques in 2 bullet points]\n",
"\n",
"3. Key Findings\n",
"[3-4 quantified results with numerical evidence from tables/figures]\n",
"\n",
"4. Limitations\n",
"[2 major constraints acknowledged by authors]\n",
"\n",
"5. Significance\n",
"[Impact on field & practical implications in 1 sentence]\n",
"\n",
"Critical Rules:\n",
"Accuracy Priority: Never invent data. Write \"Not specified\" for missing elements\n",
"\n",
"Source Anchoring: Cite page/paragraph numbers for claims (e.g., \"Fig 3 shows 24% improvement\")\n",
"\n",
"Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline\n",
"\n",
"Bias Alert: Flag any undeclared funding/sponsorship conflicts\n",
"\n",
"Output Format: Strict Markdown with section headers, 200-word maximum\n",
"\n",
"Example Output:\n",
"1. Research Question\n",
"How does microplastic concentration affect zebrafish neural development?\n",
"\n",
"2. Methodology\n",
"\n",
"Exposed embryos to 0.1-10μm PET particles (5-100mg/L) for 96h\n",
"\n",
"Quantified gene expression (RT-qPCR) and behavioral assays (Open Field Test)\n",
"\n",
"3. Key Findings\n",
"▲ 40% reduction in neuron count at 50mg/L exposure (p<0.01, Fig 2B)\n",
"■ 2.3x increase in anxiolytic behavior (Table 3)\n",
"▼ 17% downregulation in shha expression (p=0.03)\n",
"\n",
"4. Limitations\n",
" \n",
"Used static exposure vs dynamic aquatic environments\n",
"\n",
"Limited proteomic validation\n",
"\n",
"5. Significance\n",
"Establishes dose-dependent neurotoxicity thresholds for aquatic toxicology regulations.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
"metadata": {},
"outputs": [],
"source": [
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a summary of this website in markdown.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4257406-089b-45a3-bfb5-272004360a49",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url):\n",
" website = Website(url)\n",
" response = client.models.generate_content(\n",
" model=\"gemini-2.5-flash-preview-05-20\",\n",
" config=types.GenerateContentConfig(\n",
" system_instruction=system_prompt),\n",
" contents=user_prompt_for(website)\n",
" )\n",
"\n",
" return response.text\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f68b32ae-9e65-4aa4-ae8d-cc2482c4a2e2",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
" summary = summarize(url)\n",
" display(Markdown(summary))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae52543c-01c1-4262-b53c-95ef4e5a93aa",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://onlinelibrary.wiley.com/doi/full/10.1155/2021/8812542\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}