{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb6636be-e43f-4896-aadd-cafda003ed4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q -U google-genai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfe66209-1d33-4292-80f1-20e11baf4bc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display\n",
    "from google import genai\n",
    "from google.genai import types\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2b4306c-17d0-46fe-a889-7440ff809dc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#load env\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('GEMINI_API_KEY')\n",
    "\n",
    "# Check the key\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08ec6fec-886c-4a0c-a046-e8643ad700d3",
   "metadata": {},
   "source": [
    "# Lets make a simple call for check our model is working fine or not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89143d5c-0013-4f7e-8e1f-f7db7e936f0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = genai.Client(api_key=api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1144b77a-6785-479a-ab4f-bb0ab5624b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "response = client.models.generate_content(\n",
    "    model=\"gemini-2.5-flash-preview-05-20\",\n",
    "    contents=[\"hi gemini\"]\n",
    ")\n",
    "print(response.text)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbf3836c-19b8-44e1-904a-f265925c2786",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "class Website:\n",
    "    def __init__(self, url, driver_path=None, wait_time=3):\n",
    "        self.url = url\n",
    "        self.wait_time = wait_time\n",
    "\n",
    "        # Headless Chrome settings\n",
    "        options = Options()\n",
    "        # options.add_argument(\"--headless\")  \n",
    "        # Headless mode runs the browser in the background (invisible).\n",
    "        # However, some websites (like openai.com) block headless browsers.\n",
    "        # So if this line is active, the page may not load correctly and you may not get the full content.\n",
    "        options.add_argument(\"--disable-gpu\")\n",
    "        options.add_argument(\"--no-sandbox\")\n",
    "        options.add_argument(\"--window-size=1920x1080\")\n",
    "\n",
    "        # Driver path\n",
    "        if driver_path:\n",
    "            service = Service(executable_path=driver_path)\n",
    "        else:\n",
    "            service = Service() \n",
    "\n",
    "        # Start browser\n",
    "        driver = webdriver.Chrome(service=service, options=options)\n",
    "        driver.get(url)\n",
    "\n",
    "        # Wait for the loading page\n",
    "        time.sleep(self.wait_time)\n",
    "\n",
    "        # Take page source\n",
    "        html = driver.page_source\n",
    "        driver.quit()\n",
    "\n",
    "        # Analysis with BeautifulSoup \n",
    "        soup = BeautifulSoup(html, 'html.parser')\n",
    "        self.title = soup.title.string if soup.title else \"No title found\"\n",
    "\n",
    "        # Clean irrelevant tags\n",
    "        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "            irrelevant.decompose()\n",
    "\n",
    "        self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_prompt = \"\"\"You are an academic research assistant specialized in summarizing scholarly papers. Follow this workflow rigorously:\n",
    "\n",
    "Step 1: Document Verification\n",
    "Verify if the input is a research paper by checking for:\n",
    "\n",
    "Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)\n",
    "\n",
    "Technical/scholarly language\n",
    "\n",
    "Citations (in-text or bibliography)\n",
    "\n",
    "Research claims or data analysis\n",
    "If NOT a research paper:\n",
    "→ Respond: \"This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization.\"\n",
    "\n",
    "Step 2: Structured Summary (If verified)\n",
    "Generate a 5-section summary in this exact format:\n",
    "\n",
    "1. Research Question\n",
    "[Identify core problem/gap addressed in 1 sentence]\n",
    "\n",
    "2. Methodology\n",
    "[Study design, data sources, analytical techniques in 2 bullet points]\n",
    "\n",
    "3. Key Findings\n",
    "[3-4 quantified results with numerical evidence from tables/figures]\n",
    "\n",
    "4. Limitations\n",
    "[2 major constraints acknowledged by authors]\n",
    "\n",
    "5. Significance\n",
    "[Impact on field & practical implications in 1 sentence]\n",
    "\n",
    "Critical Rules:\n",
    "Accuracy Priority: Never invent data. Write \"Not specified\" for missing elements\n",
    "\n",
    "Source Anchoring: Cite page/paragraph numbers for claims (e.g., \"Fig 3 shows 24% improvement\")\n",
    "\n",
    "Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline\n",
    "\n",
    "Bias Alert: Flag any undeclared funding/sponsorship conflicts\n",
    "\n",
    "Output Format: Strict Markdown with section headers, 200-word maximum\n",
    "\n",
    "Example Output:\n",
    "1. Research Question\n",
    "How does microplastic concentration affect zebrafish neural development?\n",
    "\n",
    "2. Methodology\n",
    "\n",
    "Exposed embryos to 0.1-10μm PET particles (5-100mg/L) for 96h\n",
    "\n",
    "Quantified gene expression (RT-qPCR) and behavioral assays (Open Field Test)\n",
    "\n",
    "3. Key Findings\n",
    "▲ 40% reduction in neuron count at 50mg/L exposure (p<0.01, Fig 2B)\n",
    "■ 2.3x increase in anxiolytic behavior (Table 3)\n",
    "▼ 17% downregulation in shha expression (p=0.03)\n",
    "\n",
    "4. Limitations\n",
    "    \n",
    "Used static exposure vs dynamic aquatic environments\n",
    "\n",
    "Limited proteomic validation\n",
    "\n",
    "5. Significance\n",
    "Establishes dose-dependent neurotoxicity thresholds for aquatic toxicology regulations.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that writes a User Prompt that asks for summaries of websites:\n",
    "\n",
    "def user_prompt_for(website):\n",
    "    user_prompt = f\"You are looking at a website titled {website.title}\"\n",
    "    user_prompt += \"\\nThe contents of this website is as follows; \\\n",
    "please provide a summary of this website in markdown.\\n\\n\"\n",
    "    user_prompt += website.text\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4257406-089b-45a3-bfb5-272004360a49",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize(url):\n",
    "    website = Website(url)\n",
    "    response = client.models.generate_content(\n",
    "        model=\"gemini-2.5-flash-preview-05-20\",\n",
    "        config=types.GenerateContentConfig(\n",
    "            system_instruction=system_prompt),\n",
    "        contents=user_prompt_for(website)\n",
    "    )\n",
    "\n",
    "    return response.text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f68b32ae-9e65-4aa4-ae8d-cc2482c4a2e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_summary(url):\n",
    "    summary = summarize(url)\n",
    "    display(Markdown(summary))\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae52543c-01c1-4262-b53c-95ef4e5a93aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "display_summary(\"https://onlinelibrary.wiley.com/doi/full/10.1155/2021/8812542\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}