Files
LLM_Engineering_OLD/week1/community-contributions/training-summary-translation-length/jacquieAM/website-summary.ipynb

330 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9ab446e4-219c-4589-aa8f-9386adcf5c60",
"metadata": {},
"outputs": [],
"source": [
"## Project Overview\n",
"This project combines web scraping with OpenAIs GPT models to summarize online training content. It extracts material from Microsofts **Quantum Computing Fundamentals** learning path, cleans it, and generates concise summaries per lesson as well as an overall course summary. \n",
"\n",
"## Key Features\n",
"- Fetches and parses webpages using **requests** and **BeautifulSoup** \n",
"- Produces summaries in multiple languages (e.g., English, Spanish, or any language) and at varying levels of detail (short, medium, detailed) \n",
"- Summarizes individual lessons on demand or processes entire learning paths \n",
"- Presents results as clean, structured **Markdown** directly in the notebook \n",
"\n",
"## Tech Stack\n",
"- **Model**: GPT-4o-mini \n",
"- **Language**: Python \n",
"- **Libraries**: BeautifulSoup, OpenAI \n",
"\n",
"## Purpose\n",
"This project demonstrates how AI can streamline the understanding of technical documentation and online courses by generating multilingual, customizable summaries. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"\n",
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables from .env file (not included)\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()\n",
"\n",
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e793b2-6775-426a-a139-4848291d0463",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object from the given url using the BeautifulSoup library\n",
" \"\"\"\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"training_website = Website(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")\n",
"print(training_website.title)\n",
"print(training_website.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
"metadata": {},
"outputs": [],
"source": [
"# Create a system prompt function that can use different language and length \n",
"\n",
"def build_system_prompt(language=\"Spanish\", length=\"short\"):\n",
" return f\"\"\"You are an assistant that analyzes the contents of a website and provides a {length} summary, ignoring text that might be navigation related.\n",
" Respond in 20 words or less markdown, and respond in {language}.\n",
" \"\"\"\n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "987c95a6-6618-4d22-a2c3-3038a9d3f154",
"metadata": {},
"outputs": [],
"source": [
"# Create a function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a short summary in {language} of this website in markdown. \\\n",
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a846c89-81d8-4f48-9d62-7744d76694e2",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(training_website))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(training_website))"
]
},
{
"cell_type": "markdown",
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
"metadata": {},
"source": [
"## And now let's build useful messages for GPT-4o-mini, using a function"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def messages_for(website, language=\"Spanish\", length=\"short\"):\n",
" return [\n",
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]"
]
},
{
"cell_type": "markdown",
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
"metadata": {},
"source": [
"## Time to bring it together - the API for OpenAI is very simple!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "425214b8-c5c5-4d7a-8b79-f9e151c9d54f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
"metadata": {},
"outputs": [],
"source": [
"#call the OpenAI API. \n",
"\n",
"def summarize(url, language=\"Spanish\", length=\"short\"):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=messages_for(website, language, length)\n",
" )\n",
" return response.choices[0].message.content\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c437357-d004-49f5-95c3-fce38aefcb5c",
"metadata": {},
"outputs": [],
"source": [
"#Summarize all the lessons in microsoft quantum computer training, having the option to summarize by lesson, or the training as a whole\n",
"\n",
"def summarize_training(path_url, language=\"Spanish\", length=\"short\"):\n",
" links = get_links_from_path(path_url)\n",
" print(f\"Found {len(links)} lessons\")\n",
"\n",
" all_summaries = []\n",
"\n",
" for link in links:\n",
" print(f\"Summarizing {link}...\")\n",
" summary = summarize(link, language, length)\n",
" all_summaries.append(f\"### {link}\\n{summary}\\n\")\n",
"\n",
" combined_prompt = \"Here are summaries of each lesson:\\n\\n\" + \"\\n\".join(all_summaries)\n",
" response = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
" {\"role\": \"user\", \"content\": \"Please summarize the entire training path based on these lesson summaries:\\n\\n\" + combined_prompt}\n",
" ]\n",
" )\n",
"\n",
" return \"\\n\".join(all_summaries) + \"\\n\\n## General Course Summary\\n\" + response.choices[0].message.content\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
"metadata": {},
"outputs": [],
"source": [
"summarize(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
"metadata": {},
"outputs": [],
"source": [
"# A function to display this nicely in the Jupyter output, using markdown\n",
"\n",
"def display_summary(url):\n",
" summary = summarize(url)\n",
" display(Markdown(summary))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}