Merge pull request #586 from Oluwaseyi-A/community-contributions-branch

Add notebook: day1-research-paper-summarizer-with-highlighter
This commit is contained in:
Ed Donner
2025-08-16 09:02:32 +01:00
committed by GitHub

View File

@@ -0,0 +1,202 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5c527a13-459e-4a46-b00e-f2c5056de155",
"metadata": {},
"source": [
"# Research Paper Summarizer with Text Highlighting"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "861a0be5-6da7-4f66-8f82-bc083a913f9f",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "74bf6765-53b6-457b-ac2d-0d1afa7fbf8f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key found and looks good so far!\n"
]
}
],
"source": [
"# Load environment variables from a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key and print a hint for the first problem found.\n",
"# Whitespace is tested before the prefix so that a key with a leading space or tab\n",
"# is reported as a whitespace problem instead of as a wrong prefix.\n",
"\n",
"if not api_key:\n",
"    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif api_key.strip() != api_key:\n",
"    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
"    print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"else:\n",
"    print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "227ed7af-d539-4c87-988b-80e6e049c863",
"metadata": {},
"outputs": [],
"source": [
"# Create the client object that summarize() calls for its chat completion request\n",
"openai = OpenAI()\n",
"\n",
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "dcaadf8b-456d-48ca-af9d-9f57d3414308",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
"\n",
"# Request headers sent with every page fetch; some websites need proper headers before they respond:\n",
"headers = {\n",
"    \"User-Agent\": (\n",
"        \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
"        \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
"        \"Chrome/117.0.0.0 Safari/537.36\"\n",
"    ),\n",
"}\n",
"\n",
"class Website:\n",
"    \"\"\"\n",
"    A web page fetched over HTTP and reduced to its title and visible text.\n",
"\n",
"    Attributes:\n",
"        url: the address the page was requested from\n",
"        title: text of the title tag, or \"No title found\" when it is missing or empty\n",
"        text: newline-separated text of the body tag with script, style, img and\n",
"              input elements removed; an empty string when the page has no body\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url, timeout=30):\n",
"        \"\"\"\n",
"        Create this Website object from the given url using the BeautifulSoup library\n",
"\n",
"        Args:\n",
"            url: address of the page to download\n",
"            timeout: seconds to wait for the server before giving up\n",
"\n",
"        Raises:\n",
"            requests.RequestException: if the request fails, times out, or the\n",
"                server answers with an HTTP error status\n",
"        \"\"\"\n",
"        self.url = url\n",
"        # Without a timeout, requests can wait indefinitely on an unresponsive server\n",
"        response = requests.get(url, headers=headers, timeout=timeout)\n",
"        # Fail loudly on 4xx/5xx instead of passing an error page on to be summarized\n",
"        response.raise_for_status()\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        # .string is None when the title tag is empty or wraps nested tags\n",
"        title = soup.title.string if soup.title else None\n",
"        self.title = title if title else \"No title found\"\n",
"        body = soup.body\n",
"        if body is None:\n",
"            # No body tag was parsed, so there is nothing to clean or extract\n",
"            self.text = \"\"\n",
"            return\n",
"        for irrelevant in body([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"        self.text = body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6315093f-be68-408e-a5e1-6a2e4ea675e8",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website, keywords=(\"large circuit model\", \"ChipGPT\")):\n",
"    \"\"\"\n",
"    Build the user prompt asking for a summary of the page plus keyword statements.\n",
"\n",
"    Args:\n",
"        website: object exposing `title` and `text` attributes (e.g. a Website)\n",
"        keywords: phrases whose containing statements should be pointed out;\n",
"            a single string is treated as one keyword, and an empty collection\n",
"            leaves the keyword request out of the prompt\n",
"\n",
"    Returns:\n",
"        The prompt string: the instructions followed by the full page text.\n",
"    \"\"\"\n",
"    if isinstance(keywords, str):\n",
"        # A bare string would otherwise be iterated character by character\n",
"        keywords = (keywords,)\n",
"    user_prompt = f\"You are looking at an article website titled {website.title}\"\n",
"    user_prompt += (\n",
"        \"\\nThe contents of this website is as follows; \"\n",
"        \"please provide a short summary of this website in markdown. \"\n",
"    )\n",
"    if keywords:\n",
"        quoted = \", \".join(f\"'{keyword}'\" for keyword in keywords)\n",
"        user_prompt += (\n",
"            \"I'm also looking for complete statements containing \"\n",
"            f\"the following keywords (if found): {quoted} \"\n",
"        )\n",
"    user_prompt += \"\\n\\n\" + website.text\n",
"    return user_prompt\n",
"\n",
"\n",
"# Fetch the example paper up front; in the visible cells `article` is only referenced by\n",
"# commented-out debug calls, and summarize() downloads the page again itself.\n",
"article = Website(\"https://arxiv.org/html/2401.12224v1\")\n",
"# print(user_prompt_for(article))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ff8a4112-f118-4866-b6cf-82675de0a38d",
"metadata": {},
"outputs": [],
"source": [
"# System prompt: sets the assistant's role (paper summary plus keyword statements, in markdown)\n",
"system_prompt = (\n",
"    \"You are an assistant that analyzes the contents of a scientific \"\n",
"    \"article for a PhD student (who has to read a lot of papers and journals). The \"\n",
"    \"user will provide the article website and keyword(s) they are looking to learn and \"\n",
"    \"cite from. Your job is to summarize the paper and point out all the statements \"\n",
"    \"containing the specific keyword(s) the user typed. \"\n",
"    \"Respond in markdown.\"\n",
")\n",
"\n",
"\n",
"def messages_for(website):\n",
"    \"\"\"\n",
"    Assemble the chat messages for one page: the system prompt first,\n",
"    then the user prompt built from the given website.\n",
"    \"\"\"\n",
"    system_message = {\"role\": \"system\", \"content\": system_prompt}\n",
"    user_message = {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
"    return [system_message, user_message]\n",
"\n",
" \n",
"#messages_for(article)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b5e47bea-403d-48c3-ab9d-4d6adef83241",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url, model=\"gpt-4o-mini\"):\n",
"    \"\"\"\n",
"    Download the page at url and return the model's reply to the summary prompt.\n",
"\n",
"    Args:\n",
"        url: address of the article to summarize\n",
"        model: name of the chat model to call; the default is the model\n",
"            that was previously hard-coded here\n",
"\n",
"    Returns:\n",
"        The message content of the first choice in the chat completion response.\n",
"    \"\"\"\n",
"    website = Website(url)\n",
"    response = openai.chat.completions.create(\n",
"        model=model,\n",
"        messages=messages_for(website)\n",
"    )\n",
"    return response.choices[0].message.content\n",
"\n",
"\n",
"def display_summary(url):\n",
"    \"\"\"Summarize the page at url and render the result as Markdown in the cell output.\"\"\"\n",
"    display(Markdown(summarize(url)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f6ac1bc-5bc8-4daa-8174-d201400e517a",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the example arXiv paper, ask the model for the summary, and render it below\n",
"display_summary(\"https://arxiv.org/html/2401.12224v1\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}