Merge branch 'main' of github.com:ed-donner/llm_engineering
This commit is contained in:
177
week1/community-contributions/CoolCodeSummarizer.ipynb
Normal file
177
week1/community-contributions/CoolCodeSummarizer.ipynb
Normal file
@@ -0,0 +1,177 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b15b939-593a-4ccc-89bd-0cee09fe2f12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Python Code Summarizer\n",
|
||||
"\n",
|
||||
"The code below summarizes Python code and explains it in detail, which can help coders better understand foreign code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8dcf353c-e4f2-4ce7-a3b5-71b29700a148",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"import os\n",
|
||||
"import openai\n",
|
||||
"from dotenv import load_dotenv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "111cf632-08e8-4246-a5bb-b56942789242",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e4f5376f-5e6f-4d75-81bf-222e34bfe828",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def read_code(**kwargs):\n",
|
||||
" \"\"\"\n",
|
||||
" You can pass two types of key word arguments to this function.\n",
|
||||
" code_path= Path to your complex python code.\n",
|
||||
" code= Passing raw python code.\n",
|
||||
" \"\"\"\n",
|
||||
" code_path = kwargs.get('code_path',None)\n",
|
||||
" code_raw = kwargs.get('code',None)\n",
|
||||
" \n",
|
||||
" if code_path:\n",
|
||||
" with open(code_path, 'r') as code_file:\n",
|
||||
" code = code_file.read()\n",
|
||||
" return (True, code)\n",
|
||||
"\n",
|
||||
" if code_raw:\n",
|
||||
" return (True, code_raw)\n",
|
||||
"\n",
|
||||
" return (False, None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Model Prompt\n",
|
||||
"system_prompt = (\n",
|
||||
" \"You are a helpful assistant. The following input will be a Python code snippet. \"\n",
|
||||
" \"Your task is to:\\n\\n\"\n",
|
||||
" \"1. Summarize the overall purpose of the code.\\n\"\n",
|
||||
" \"2. Explain the code line by line, describing what each line does and why it's written that way.\\n\"\n",
|
||||
" \"3. Provide reasoning behind the code structure and logic to help novice Python developers understand the concepts better.\\n\\n\"\n",
|
||||
" \"Use Markdown format in your response. Make the explanation beginner-friendly, using code blocks, bullet points, and headings where helpful.\"\n",
|
||||
" ) \n",
|
||||
"# In a plot twist worthy of sci-fi, this prompt was written by ChatGPT...\n",
|
||||
"# to tell ChatGPT how to respond. We’ve officially entered the Matrix. 🤖🌀"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed7d2447-32a9-4761-8b0a-b31814bee7e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Guess where I got this code from :)\n",
|
||||
"code_line = \"\"\"yeild from set(book.get(\"author)) for book in books if book.get(\"author\"))\"\"\"\n",
|
||||
"is_code, raw_code = read_code(code=code_line)\n",
|
||||
"\n",
|
||||
"if is_code:\n",
|
||||
" user_prompt = raw_code\n",
|
||||
"else:\n",
|
||||
" print(\"Invalid Arguments\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d74a1a39-1c24-4d4b-bd49-0ca416377a93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for():\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df6c2726-d0fb-4ab6-b13b-d047e8807558",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize():\n",
|
||||
" \n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for()\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8425144c-595e-4ad6-9801-3e8778d285c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summary():\n",
|
||||
" summary = summarize()\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "744bffdd-ec3c-4b27-b126-81bf3e8c8295",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -110,10 +110,24 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
||||
173
week1/community-contributions/day-1-youtube-video-summary.ipynb
Normal file
173
week1/community-contributions/day-1-youtube-video-summary.ipynb
Normal file
@@ -0,0 +1,173 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install youtube_transcript_api"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from youtube_transcript_api import YouTubeTranscriptApi"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 92,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"class YouTubeWebLink:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.url = url\n",
|
||||
" self.video_id = self.get_video_id(url)\n",
|
||||
" self.set_openai_client()\n",
|
||||
" self.set_system_prompt()\n",
|
||||
"\n",
|
||||
" def get_video_id(self, url):\n",
|
||||
" \"\"\" extract youtube video id from url with regular expression \"\"\"\n",
|
||||
" regex = r\"(?:v=|be/)([a-zA-Z0-9_-]{11})\"\n",
|
||||
" match = re.search(regex, url)\n",
|
||||
" if match:\n",
|
||||
" return match.group(1)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"Probably not a YouTube URL\")\n",
|
||||
" \n",
|
||||
" def set_openai_client(self):\n",
|
||||
" self.openai = OpenAI()\n",
|
||||
" \n",
|
||||
" def set_system_prompt(self, system_prompt=None):\n",
|
||||
" \"\"\" set system prompt from youtube video \"\"\"\n",
|
||||
" self.system_prompt = \"\"\"\n",
|
||||
" You are a skilled explainer and storyteller who specializes in summarizing YouTube video transcripts in a way that's both engaging and informative. \n",
|
||||
" Your task is to:\n",
|
||||
" - Capture key points and main ideas of the video\n",
|
||||
" - Structure your summary with in clear sections\n",
|
||||
" - Include important details, facts, and figures mentioned\n",
|
||||
" - Never end your summary with a \"Conclusion\" section\n",
|
||||
" - Keep the summary short and easy to understand\n",
|
||||
" - Always format your response in markdown for better readability\n",
|
||||
" \"\"\" if system_prompt is None else system_prompt\n",
|
||||
"\n",
|
||||
" def get_transcript(self):\n",
|
||||
" \"\"\" get transcript from youtube video \"\"\"\n",
|
||||
" try:\n",
|
||||
" print('Fetching video transcript...')\n",
|
||||
" transcript = YouTubeTranscriptApi.get_transcript(self.video_id)\n",
|
||||
" return \" \".join([item['text'] for item in transcript])\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error fetching transcript: {e}\")\n",
|
||||
" return None\n",
|
||||
" \n",
|
||||
" def get_summary_from_transcript(self, transcript):\n",
|
||||
" \"\"\" summarize text using openai \"\"\"\n",
|
||||
" try:\n",
|
||||
" print('Summarizing video...')\n",
|
||||
" response = self.openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": self.system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": f\"Summarize the following YouTube video transcript:\\n\\n{transcript}\"}\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error summarizing text: {e}\")\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
" def display_summary(self):\n",
|
||||
" \"\"\" summarize youtube video \"\"\"\n",
|
||||
" transcript = self.get_transcript()\n",
|
||||
" summary = self.get_summary_from_transcript(transcript)\n",
|
||||
" display(Markdown(summary))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 93,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# video link and share link of same youtube video\n",
|
||||
"test_url_1 = \"https://www.youtube.com/watch?v=nYy-umCNKPQ&list=PLWHe-9GP9SMMdl6SLaovUQF2abiLGbMjs\"\n",
|
||||
"test_url_2 = \"https://youtu.be/nYy-umCNKPQ?si=ILVrQlKT0W71G5pU\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test that we get same id\n",
|
||||
"video1, video2 = YouTubeWebLink(test_url_1), YouTubeWebLink(test_url_2)\n",
|
||||
"video1.video_id, video2.video_id"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"video1.display_summary()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llms",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
293
week1/community-contributions/day1-compare-websites.ipynb
Normal file
293
week1/community-contributions/day1-compare-websites.ipynb
Normal file
@@ -0,0 +1,293 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c80b652-eadd-4d48-a512-d5945c0365d3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Compare websites"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables \n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Website class"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.'\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Website messages function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Website summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
"\n",
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(summary): \n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"w1 = \"https://cnn.com\"\n",
|
||||
"summary1 = summarize(w1)\n",
|
||||
"display_summary(summary1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"w2 = \"https://www.foxnews.com\"\n",
|
||||
"summary2 = summarize(w2)\n",
|
||||
"display_summary(summary2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0a51b45c-f3a6-4b0b-acfe-52957c04fd94",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Comparison between two websites"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4b30d5a5-bbe5-499c-9392-0896440f80c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt_compare = \"\"\"You are a weblsite analyst that compares the summaries of two websites\n",
|
||||
"and provides a compare and contrast bewtween the two. \n",
|
||||
"Respond in markdown.\"\"\"\n",
|
||||
"\n",
|
||||
"def user_prompt_for_compare(summary1, summary2):\n",
|
||||
" user_prompt = f\"You are asked to compare this summary of a website {summary1}\\n\\n\"\n",
|
||||
" user_prompt += f\"\\nWith the summary of this second website {summary2}\\n\\n\"\n",
|
||||
" user_prompt += \"please provide a short comparison of the two websites. \\\n",
|
||||
"List the similarities and differences in bullet point format.\\n\\n\" \n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5c9c955-840f-4c31-a1a7-b4872f77f3b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for_compare():\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt_compare},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for_compare(summary1, summary2)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "56307d77-f207-48f1-b59a-e97f6a2a37dd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def compare(): \n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for_compare()\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae3140bb-ddad-43e2-b697-6d05ae541544",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(compare())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,448 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "92d0aa2b-8e2f-4c1b-8b81-646faf4cd8c5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# And now the change for Ollama\n",
|
||||
"\n",
|
||||
"1. No environment variables are needed (no keys) so this part has been removed\n",
|
||||
"\n",
|
||||
"2. The OpenAI client library is being initialized to point to your local computer for Ollama\n",
|
||||
"\n",
|
||||
"3. You need to have installed Ollama on your computer, and run `ollama run llama3.2` in a Powershell or Terminal if you haven't already\n",
|
||||
"\n",
|
||||
"4. Anywhere in this lab that it used to have **gpt-4o-mini** it now has **llama3.2**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Here it is - see the base_url\n",
|
||||
"\n",
|
||||
"openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "442fc84b-0815-4f40-99ab-d9a5da6bda91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's make a quick call to a Frontier model to get started, as a preview!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a58394bf-1e45-46af-9bfd-01e24da6f49a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n",
|
||||
"\n",
|
||||
"message = \"Hello, Llama! This is my first ever message to you! Hi!\"\n",
|
||||
"response = openai.chat.completions.create(model=\"llama3.2\", messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OK onwards with our first project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://sohanpatharla.vercel.app/about\")\n",
|
||||
"print(ed.title)\n",
|
||||
"print(\"Title is printed above\")\n",
|
||||
"print(ed.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.'\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for Llama 3.2, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"llama3.2\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://sohanpatharla.vercel.app/about\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's try more websites\n",
|
||||
"\n",
|
||||
"Note that this will only work on websites that can be scraped using this simplistic approach.\n",
|
||||
"\n",
|
||||
"Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n",
|
||||
"\n",
|
||||
"Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n",
|
||||
"\n",
|
||||
"But many websites will work just fine!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://openai.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "490381df-3d03-4aaa-8f29-c5c10ace0ab5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Email Subject Suggestion based on the letter body"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \"\"\"You are an assistant that analyzes the contents of an email letter body \\\n",
|
||||
"and provide a appropriate short subject line for that email,based on that email body. \\\n",
|
||||
"\"\"\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
" \\nThe contents of an email body is as follows; \\\n",
|
||||
"understand the content in that well and provide me a appropriate subject based on the text content in it. \\\n",
|
||||
"Understand the sentiment of the email and choose the subject type to be formal or informal or anything.\\n\\n\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" \n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Hey John, just wanted to say thanks for your help with the move last weekend! Couldn't have done it without you.\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Dear Hiring Manager, I am writing to express my interest in the Marketing Manager position listed on your company’s website.\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"We are excited to invite you to our annual developer conference taking place in San Francisco this July. Register today to secure your spot!\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Hello, I'm following up on the support ticket I submitted last week regarding the issue with logging into my account. I still haven’t received a resolution.\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Congratulations! You've been selected as one of our winners in the Spring Giveaway Contest. Claim your prize by replying to this email.\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Good morning team, just a reminder that our Q2 strategy meeting is scheduled for 10 AM tomorrow in Conference Room B.\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"Hi Mom, the flight was fine, and I got here safely. The weather’s great and the Airbnb is cozy. I’ll send pictures soon!\n",
|
||||
"\"\"\"},\n",
|
||||
"\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt + \"\"\"\n",
|
||||
"To whom it may concern, I am very dissatisfied with the quality of the product I received and would like a full refund.\n",
|
||||
"\"\"\"}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"\n",
|
||||
"response =openai.chat.completions.create(model=\"llama3.2\",messages=messages)\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"# response = openai.chat.completions.create(model=\"llama3.2\", messages=messages)\n",
|
||||
"#print(response.choices[0].message.content)\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## An extra exercise for those who enjoy web scraping\n",
|
||||
"\n",
|
||||
"You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. If you have experience with Selenium, Playwright or similar, then feel free to improve the Website class to use them. In the community-contributions folder, you'll find an example Selenium solution from a student (thank you!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bf424661-6c39-4398-9983-9b02df7e9311",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4484fcf-8b39-4c3f-9674-37970ed71988",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Parse webpages which is designed using JavaScript heavely\n",
|
||||
"# download the chorme driver from here as per your version of chrome - https://developer.chrome.com/docs/chromedriver/downloads\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"\n",
|
||||
"PATH_TO_CHROME_DRIVER = r'C:\\Users\\sohan\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe'\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" url: str\n",
|
||||
" title: str\n",
|
||||
" text: str\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.url = url\n",
|
||||
"\n",
|
||||
" options = Options()\n",
|
||||
"\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--disable-dev-shm-usage\")\n",
|
||||
"\n",
|
||||
" service = Service(PATH_TO_CHROME_DRIVER)\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" input(\"Please complete the verification in the browser and press Enter to continue...\")\n",
|
||||
" page_source = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "56989f9b-8efb-4cfb-a355-1c50d36cc9b2",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://openai.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "59b15b6d-3743-44a0-9dd4-23c9e9da6e3e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,431 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35f59eb3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pluggable Web Scraper and Summarizer with Interface-Based Design\n",
|
||||
"\n",
|
||||
"This system implements a **pluggable architecture** for web scraping and summarization, built on interface-based design using Python’s `Protocol` types. Each stage of the pipeline—content fetching, HTML parsing, and LLM-based summarization—is defined through explicit structural contracts rather than concrete implementations. Components like `RequestsFetcher`, `RobustSoupParser`, and `OllamaClient` fulfill these protocols and can be swapped independently, enabling flexibility, testing, and future extension without modifying core logic. Immutable data models (`@dataclass(frozen=True)`) enforce data integrity throughout the pipeline, while the design cleanly separates concerns across modules to support maintainability and modular growth."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "f42e6d21",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dataclasses import dataclass\n",
|
||||
"from typing import Protocol, Optional, List, Dict, Tuple\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import logging\n",
|
||||
"import chardet"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "65c17368",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Configuration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "eb0904d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"logging.basicConfig(level=logging.INFO)\n",
|
||||
"logger = logging.getLogger(__name__)\n",
|
||||
"\n",
|
||||
"HEADERS = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\n",
|
||||
"}\n",
|
||||
"DEFAULT_TIMEOUT = 10\n",
|
||||
"UNWANTED_TAGS = [\"script\", \"style\", \"nav\", \"header\", \"footer\", \"img\", \"input\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8110aa46",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "cdb6c990",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@dataclass(frozen=True)\n",
|
||||
"class RawResponse:\n",
|
||||
" content: bytes\n",
|
||||
" status_code: int\n",
|
||||
" encoding: str\n",
|
||||
" headers: Dict[str, str]\n",
|
||||
" elapsed: float\n",
|
||||
" final_url: str\n",
|
||||
"\n",
|
||||
"@dataclass(frozen=True)\n",
|
||||
"class WebsiteContent:\n",
|
||||
" url: str\n",
|
||||
" title: str\n",
|
||||
" text: str\n",
|
||||
" status_code: int\n",
|
||||
" response_time: float\n",
|
||||
"\n",
|
||||
"@dataclass(frozen=True)\n",
|
||||
"class LLMResponse:\n",
|
||||
" content: str\n",
|
||||
" model: str\n",
|
||||
" tokens_used: int"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "87b2a97a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Protocols"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "3070eac2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class ContentFetcher(Protocol):\n",
|
||||
" def fetch(self, url: str) -> RawResponse: ...\n",
|
||||
"\n",
|
||||
"class ContentParser(Protocol):\n",
|
||||
" def parse(self, response: RawResponse) -> WebsiteContent: ...\n",
|
||||
"\n",
|
||||
"class LLMClient(Protocol):\n",
|
||||
" def generate(self, messages: List[Dict[str, str]], model: str) -> LLMResponse: ...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "553daa11",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Implementations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "1a42bed9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class RequestsFetcher:\n",
|
||||
" def __init__(self, \n",
|
||||
" headers: Dict[str, str] = HEADERS,\n",
|
||||
" timeout: int = DEFAULT_TIMEOUT,\n",
|
||||
" max_redirects: int = 5):\n",
|
||||
" self.headers = headers\n",
|
||||
" self.timeout = timeout\n",
|
||||
" self.max_redirects = max_redirects\n",
|
||||
"\n",
|
||||
" def fetch(self, url: str) -> RawResponse:\n",
|
||||
" logger.info(f\"Fetching content from {url}\")\n",
|
||||
" try:\n",
|
||||
" response = requests.get(\n",
|
||||
" url,\n",
|
||||
" headers=self.headers,\n",
|
||||
" timeout=self.timeout,\n",
|
||||
" allow_redirects=True,\n",
|
||||
" stream=False # Prevent partial content issues\n",
|
||||
" )\n",
|
||||
" response.raise_for_status()\n",
|
||||
" \n",
|
||||
" return RawResponse(\n",
|
||||
" content=response.content,\n",
|
||||
" status_code=response.status_code,\n",
|
||||
" encoding=response.encoding,\n",
|
||||
" headers=dict(response.headers),\n",
|
||||
" elapsed=response.elapsed.total_seconds(),\n",
|
||||
" final_url=response.url\n",
|
||||
" )\n",
|
||||
" except requests.exceptions.RequestException as e:\n",
|
||||
" logger.error(f\"Failed to fetch {url}: {str(e)}\")\n",
|
||||
" raise\n",
|
||||
"\n",
|
||||
"class RobustSoupParser:\n",
|
||||
" def __init__(self, unwanted_tags: Tuple[str] = UNWANTED_TAGS):\n",
|
||||
" self.unwanted_tags = unwanted_tags\n",
|
||||
"\n",
|
||||
" def parse(self, response: RawResponse) -> WebsiteContent:\n",
|
||||
" logger.info(f\"Parsing content from {response.final_url}\")\n",
|
||||
" \n",
|
||||
" # Detect encoding if not provided\n",
|
||||
" encoding = response.encoding or self._detect_encoding(response.content)\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" decoded_content = response.content.decode(encoding, errors='replace')\n",
|
||||
" soup = BeautifulSoup(decoded_content, 'html.parser')\n",
|
||||
" except Exception as e:\n",
|
||||
" logger.error(f\"Failed to parse content: {str(e)}\")\n",
|
||||
" raise\n",
|
||||
"\n",
|
||||
" return WebsiteContent(\n",
|
||||
" url=response.final_url,\n",
|
||||
" title=self._extract_title(soup),\n",
|
||||
" text=self._clean_content(soup),\n",
|
||||
" status_code=response.status_code,\n",
|
||||
" response_time=response.elapsed\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def _detect_encoding(self, content: bytes) -> str:\n",
|
||||
" result = chardet.detect(content)\n",
|
||||
" return result['encoding'] or 'utf-8'\n",
|
||||
"\n",
|
||||
" def _extract_title(self, soup: BeautifulSoup) -> str:\n",
|
||||
" title_tag = soup.find('title')\n",
|
||||
" return title_tag.text.strip() if title_tag else \"Untitled\"\n",
|
||||
"\n",
|
||||
" def _clean_content(self, soup: BeautifulSoup) -> str:\n",
|
||||
" # Remove unwanted tags\n",
|
||||
" for tag in self.unwanted_tags:\n",
|
||||
" for element in soup.find_all(tag):\n",
|
||||
" element.decompose()\n",
|
||||
"\n",
|
||||
" # Extract text with semantic line breaks\n",
|
||||
" text = '\\n\\n'.join([\n",
|
||||
" element.get_text().strip()\n",
|
||||
" for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'article'])\n",
|
||||
" if element.get_text().strip()\n",
|
||||
" ])\n",
|
||||
" \n",
|
||||
" return text or \"No readable content found\"\n",
|
||||
"\n",
|
||||
"class OllamaClient:\n",
|
||||
" def __init__(self, \n",
|
||||
" base_url: str = 'http://localhost:11434/v1',\n",
|
||||
" api_key: str = 'ollama',\n",
|
||||
" max_retries: int = 3):\n",
|
||||
" self.client = OpenAI(base_url=base_url, api_key=api_key)\n",
|
||||
" self.max_retries = max_retries\n",
|
||||
"\n",
|
||||
" def generate(self, \n",
|
||||
" messages: List[Dict[str, str]], \n",
|
||||
" model: str = \"llama3.2\") -> LLMResponse:\n",
|
||||
" logger.info(f\"Generating summary with {model}\")\n",
|
||||
" \n",
|
||||
" for attempt in range(self.max_retries):\n",
|
||||
" try:\n",
|
||||
" response = self.client.chat.completions.create(\n",
|
||||
" model=model,\n",
|
||||
" messages=messages\n",
|
||||
" )\n",
|
||||
" return LLMResponse(\n",
|
||||
" content=response.choices[0].message.content,\n",
|
||||
" model=model,\n",
|
||||
" tokens_used=response.usage.total_tokens\n",
|
||||
" )\n",
|
||||
" except Exception as e:\n",
|
||||
" if attempt == self.max_retries - 1:\n",
|
||||
" logger.error(f\"Failed after {self.max_retries} attempts: {str(e)}\")\n",
|
||||
" raise\n",
|
||||
" logger.warning(f\"Retry {attempt + 1}/{self.max_retries}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1805d4f8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Core Pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "a985806a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class SummarizationPipeline:\n",
|
||||
" SYSTEM_PROMPT = \"\"\"You are a professional web content analyst. Provide a structured markdown summary containing:\n",
|
||||
"- Key points\n",
|
||||
"- Notable statistics\n",
|
||||
"- Important names/dates\n",
|
||||
"- Actionable insights\n",
|
||||
"Avoid navigation content and marketing fluff.\"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self,\n",
|
||||
" fetcher: ContentFetcher,\n",
|
||||
" parser: ContentParser,\n",
|
||||
" llm_client: LLMClient):\n",
|
||||
" self.fetcher = fetcher\n",
|
||||
" self.parser = parser\n",
|
||||
" self.llm_client = llm_client\n",
|
||||
"\n",
|
||||
" def summarize(self, url: str, model: str = \"llama3.2\") -> LLMResponse:\n",
|
||||
" raw_response = self.fetcher.fetch(url)\n",
|
||||
" website_content = self.parser.parse(raw_response)\n",
|
||||
" messages = self._build_messages(website_content)\n",
|
||||
" return self.llm_client.generate(messages, model)\n",
|
||||
"\n",
|
||||
" def _build_messages(self, content: WebsiteContent) -> List[Dict[str, str]]:\n",
|
||||
" user_prompt = f\"\"\"**Website Analysis Request**\n",
|
||||
"URL: {content.url}\n",
|
||||
"Title: {content.title}\n",
|
||||
"\n",
|
||||
"Content:\n",
|
||||
"{content.text[:8000]} # Truncate to stay within context window\n",
|
||||
"\n",
|
||||
"Please provide a comprehensive summary following the guidelines above.\"\"\"\n",
|
||||
" \n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": self.SYSTEM_PROMPT},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41832e20",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Factory & Presentation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "656b8dd4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_default_pipeline() -> SummarizationPipeline:\n",
|
||||
" return SummarizationPipeline(\n",
|
||||
" fetcher=RequestsFetcher(),\n",
|
||||
" parser=RobustSoupParser(),\n",
|
||||
" llm_client=OllamaClient()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"class JupyterPresenter:\n",
|
||||
" @staticmethod\n",
|
||||
" def display(response: LLMResponse) -> None:\n",
|
||||
" display(Markdown(f\"\"\"\n",
|
||||
"## Summary Results\n",
|
||||
"**Model**: {response.model} \n",
|
||||
"**Tokens Used**: {response.tokens_used} \n",
|
||||
"**Summary**:\n",
|
||||
"{response.content}\n",
|
||||
" \"\"\"))\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "76339788",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "69304964",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:__main__:Fetching content from https://edwarddonner.com\n",
|
||||
"INFO:__main__:Parsing content from https://edwarddonner.com/\n",
|
||||
"INFO:__main__:Generating summary with llama3.2\n",
|
||||
"INFO:httpx:HTTP Request: POST http://localhost:11434/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"\n",
|
||||
"## Summary Results\n",
|
||||
"**Model**: llama3.2 \n",
|
||||
"**Tokens Used**: 630 \n",
|
||||
"**Summary**:\n",
|
||||
"**Website Analysis Summary**\n",
|
||||
"==========================\n",
|
||||
"\n",
|
||||
"### Key Points\n",
|
||||
"\n",
|
||||
"* The website belongs to Edward Donner, a co-founder and CTO of Nebula.io, an AI startup applying LLMs for talent discovery.\n",
|
||||
"* The website showcases Donner's interests in code writing, music production, and technology.\n",
|
||||
"* It announces the launch of The Complete Agentic AI Engineering Course and provides resources on LLM workshop and mastering AI.\n",
|
||||
"\n",
|
||||
"### Notable Statistics\n",
|
||||
"\n",
|
||||
"* None mentioned, as there are no explicit statistics provided on the website.\n",
|
||||
"\n",
|
||||
"### Important Names/Dates\n",
|
||||
"\n",
|
||||
"* Edward Donner: Website owner and CTO of Nebula.io.\n",
|
||||
"* 2021: Year in which AI startup untapt was acquired by an unknown party (no information about the acquirer is available).\n",
|
||||
"\n",
|
||||
"### Actionable Insights\n",
|
||||
"\n",
|
||||
"* The website appears to be a personal page showcasing Donner's expertise in AI, LLMs, and talent discovery. It may serve as a way for him to establish his professional brand and network with potential clients or collaborators.\n",
|
||||
"* Offering resources and courses, such as \"The Complete Agentic AI Engineering Course\" and workshops, can help attract visitors and demonstrate the company's capabilities.\n",
|
||||
"* Subscribing to the website might offer exclusive access to updates, insights on LLMs and talent discovery, and potentially lucrative career opportunities.\n",
|
||||
" "
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pipeline = create_default_pipeline()\n",
|
||||
"try:\n",
|
||||
" response = pipeline.summarize(\"https://edwarddonner.com\")\n",
|
||||
" JupyterPresenter.display(response)\n",
|
||||
"except Exception as e:\n",
|
||||
" logger.error(f\"Summarization failed: {str(e)}\")\n",
|
||||
" display(Markdown(\"## Error\\nUnable to generate summary. Please check the URL and try again.\"))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,273 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a15135e6-3ba5-44ae-b14b-dc67674a5ca3",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Resarch Paper Summarizer by Name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a50f02ea-0f04-4f68-ae66-d1369780065e",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"### Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ea6e09ac-adee-4bb8-b3bd-4f6411495751",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## If dependencies do not exist please install them\n",
|
||||
"# !pip install python-dotenv openai arxiv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e5301f2b-3037-4a85-b7cd-5e6bd700418a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import arxiv\n",
|
||||
"import os\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ac45a1f4-0005-4e0a-be90-741182c1db9f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load Open AI Key"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "381bef97-6bb7-4bdc-a71d-2ea65c8f6964",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv()\n",
|
||||
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"❌ No OpenAI API key found in .env file.\")\n",
|
||||
"else:\n",
|
||||
" print(\"✅ API key loaded successfully.\")\n",
|
||||
"\n",
|
||||
"# ✅ Initialize OpenAI\n",
|
||||
"openai = OpenAI(api_key=api_key)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "00817dbe-209e-418c-bb46-7b6b866fdff4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Main Class MLResearchFetcher"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7355ba4c-ef61-4934-bb79-4d80b4473e52",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MLResearchFetcher:\n",
|
||||
" def __init__(self, system_prompt, query=\"machine learning\", max_results=5):\n",
|
||||
" self.query = query\n",
|
||||
" self.max_results = max_results\n",
|
||||
" self.system_prompt = system_prompt\n",
|
||||
"\n",
|
||||
" def fetch_papers(self):\n",
|
||||
" search = arxiv.Search(\n",
|
||||
" query=f'ti:\"{self.query}\"',\n",
|
||||
" max_results=self.max_results,\n",
|
||||
" sort_by=arxiv.SortCriterion.SubmittedDate,\n",
|
||||
" sort_order=arxiv.SortOrder.Descending,\n",
|
||||
" )\n",
|
||||
" return list(search.results())\n",
|
||||
"\n",
|
||||
" def summarize_abstract(self, abstract, system_prompt):\n",
|
||||
" try:\n",
|
||||
" completion = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": abstract}\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" return completion.choices[0].message.content.strip()\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"❌ Error during summarization: {e}\"\n",
|
||||
"\n",
|
||||
" def display_results(self):\n",
|
||||
" papers = self.fetch_papers()\n",
|
||||
" for paper in papers:\n",
|
||||
" display(Markdown(f\"### 📄 [{paper.title}]({paper.entry_id})\"))\n",
|
||||
" display(Markdown(f\"**Authors:** {', '.join(author.name for author in paper.authors)}\"))\n",
|
||||
" display(Markdown(f\"**Published:** {paper.published.date()}\"))\n",
|
||||
" display(Markdown(f\"**Abstract:** {paper.summary.strip()}\"))\n",
|
||||
" summary = self.summarize_abstract(paper.summary, self.system_prompt)\n",
|
||||
" display(Markdown(f\"**🔍 Summary:** {summary}\"))\n",
|
||||
" display(Markdown(\"---\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "304857ba-e832-42a3-8219-ec9202e41509",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Helper Functions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1be2a2da-135b-4aec-b200-dc364d319ac4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an expert research paper summarizer and AI research assistant. \\\n",
|
||||
"When provided with the URL or content of a research paper in the field of machine learning, artificial intelligence, or data science, perform the following: \\\n",
|
||||
"1. **Extract and present** the following details in a clear, structured Markdown format: \\\n",
|
||||
" - Title and Author(s) \\\n",
|
||||
" - Year of Publication \\\n",
|
||||
" - Objective or Aim of the Research (Why the study was conducted) \\\n",
|
||||
" - Background or Introduction (What foundational knowledge or motivation led to this work) \\\n",
|
||||
" - Type of Research (e.g., empirical study, theoretical analysis, experimental benchmark) \\\n",
|
||||
" - Methods or Methodology (How the research was conducted: dataset, models, techniques used) \\\n",
|
||||
" - Results and Key Findings (What was discovered or proven) \\\n",
|
||||
" - Conclusion (Summary of insights, limitations, and proposed future work) \\\n",
|
||||
"\\\n",
|
||||
"2. **Evaluate** the impact and relevance of the paper: \\\n",
|
||||
" - Assess the significance of the research to the broader ML/AI community \\\n",
|
||||
" - Note any novelty, performance improvements, or theoretical breakthroughs \\\n",
|
||||
" - Comment on the potential applications or industry relevance \\\n",
|
||||
"\\\n",
|
||||
"3. **Suggest new research directions**: \\\n",
|
||||
" - Identify gaps, limitations, or unexplored ideas in the paper \\\n",
|
||||
" - Propose at least one new research idea or follow-up paper that builds upon this work \\\n",
|
||||
"\\\n",
|
||||
"Respond in a clean, professional Markdown format suitable for researchers or students reviewing the literature.\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f8b68134-c265-4272-87c4-e16fc205e7c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def print_papers(papers):\n",
|
||||
" for paper in papers:\n",
|
||||
" title = paper.title\n",
|
||||
" authors = \", \".join(author.name for author in paper.authors)\n",
|
||||
" published = paper.published.strftime('%Y-%m-%d')\n",
|
||||
" abstract = paper.summary.strip()\n",
|
||||
" link = paper.entry_id\n",
|
||||
" pdf_link = [l.href for l in paper.links if l.title == 'pdf']\n",
|
||||
" categories = \", \".join(paper.categories)\n",
|
||||
"\n",
|
||||
" print(f\"\\n📄 Title: {title}\")\n",
|
||||
" print(f\"👥 Authors: {authors}\")\n",
|
||||
" print(f\"📅 Published: {published}\")\n",
|
||||
" print(f\"🏷️ Categories: {categories}\")\n",
|
||||
" print(f\"🔗 Link: {link}\")\n",
|
||||
" if pdf_link:\n",
|
||||
" print(f\"📄 PDF: {pdf_link[0]}\")\n",
|
||||
" print(f\"\\n📝 Abstract:\\n{abstract}\")\n",
|
||||
" print(\"-\" * 80)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9e688bbd-d3dd-4f2b-a7c3-d6e550ec9667",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Get the papers given the name of the paper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6dcf9639-d6b5-4194-b6a2-5260329fcbe7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fetcher = MLResearchFetcher(system_prompt, query=\"QWEN2 TECHNICAL REPORT\", max_results=3)\n",
|
||||
"papers = fetcher.fetch_papers()\n",
|
||||
"print_papers(papers)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a04e219b-389f-4e0a-9645-662d966d4055",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Call the model and get the results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "297e915b-078a-49c7-836f-3c4ddf8e17dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fetcher.display_results()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2344499c-3b39-4497-a0bf-1cff83117fdc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
615
week1/community-contributions/day1_aniketk04.ipynb
Normal file
615
week1/community-contributions/day1_aniketk04.ipynb
Normal file
@@ -0,0 +1,615 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Instant Gratification\n",
|
||||
"\n",
|
||||
"## Your first Frontier LLM Project!\n",
|
||||
"\n",
|
||||
"Let's build a useful LLM solution - in a matter of minutes.\n",
|
||||
"\n",
|
||||
"By the end of this course, you will have built an autonomous Agentic AI solution with 7 agents that collaborate to solve a business problem. All in good time! We will start with something smaller...\n",
|
||||
"\n",
|
||||
"Our goal is to code a new kind of Web Browser. Give it a URL, and it will respond with a summary. The Reader's Digest of the internet!!\n",
|
||||
"\n",
|
||||
"Before starting, you should have completed the setup for [PC](../SETUP-PC.md) or [Mac](../SETUP-mac.md) and you hopefully launched this jupyter lab from within the project root directory, with your environment activated.\n",
|
||||
"\n",
|
||||
"## If you're new to Jupyter Lab\n",
|
||||
"\n",
|
||||
"Welcome to the wonderful world of Data Science experimentation! Once you've used Jupyter Lab, you'll wonder how you ever lived without it. Simply click in each \"cell\" with code in it, such as the cell immediately below this text, and hit Shift+Return to execute that cell. As you wish, you can add a cell with the + button in the toolbar, and print values of variables, or try out variations. \n",
|
||||
"\n",
|
||||
"I've written a notebook called [Guide to Jupyter](Guide%20to%20Jupyter.ipynb) to help you get more familiar with Jupyter Labs, including adding Markdown comments, using `!` to run shell commands, and `tqdm` to show progress.\n",
|
||||
"\n",
|
||||
"## If you'd prefer to work in IDEs\n",
|
||||
"\n",
|
||||
"If you're more comfortable in IDEs like VSCode or Pycharm, they both work great with these lab notebooks too. \n",
|
||||
"If you'd prefer to work in VSCode, [here](https://chatgpt.com/share/676f2e19-c228-8012-9911-6ca42f8ed766) are instructions from an AI friend on how to configure it for the course.\n",
|
||||
"\n",
|
||||
"## If you'd like to brush up your Python\n",
|
||||
"\n",
|
||||
"I've added a notebook called [Intermediate Python](Intermediate%20Python.ipynb) to get you up to speed. But you should give it a miss if you already have a good idea what this code does: \n",
|
||||
"`yield from {book.get(\"author\") for book in books if book.get(\"author\")}`\n",
|
||||
"\n",
|
||||
"## I am here to help\n",
|
||||
"\n",
|
||||
"If you have any problems at all, please do reach out. \n",
|
||||
"I'm available through the platform, or at ed@edwarddonner.com, or at https://www.linkedin.com/in/eddonner/ if you'd like to connect (and I love connecting!)\n",
|
||||
"\n",
|
||||
"## More troubleshooting\n",
|
||||
"\n",
|
||||
"Please see the [troubleshooting](troubleshooting.ipynb) notebook in this folder to diagnose and fix common problems. At the very end of it is a diagnostics script with some useful debug info.\n",
|
||||
"\n",
|
||||
"## If this is old hat!\n",
|
||||
"\n",
|
||||
"If you're already comfortable with today's material, please hang in there; you can move swiftly through the first few labs - we will get much more in depth as the weeks progress.\n",
|
||||
"\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#900;\">Please read - important note</h2>\n",
|
||||
" <span style=\"color:#900;\">The way I collaborate with you may be different to other courses you've taken. I prefer not to type code while you watch. Rather, I execute Jupyter Labs, like this, and give you an intuition for what's going on. My suggestion is that you do this with me, either at the same time, or (perhaps better) right afterwards. Add print statements to understand what's going on, and then come up with your own variations. If you have a Github account, use this to showcase your variations. Not only is this essential practice, but it demonstrates your skills to others, including perhaps future clients or employers...</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#181;\">Business value of these exercises</h2>\n",
|
||||
" <span style=\"color:#181;\">A final thought. While I've designed these notebooks to be educational, I've also tried to make them enjoyable. We'll do fun things like have LLMs tell jokes and argue with each other. But fundamentally, my goal is to teach skills you can apply in business. I'll explain business implications as we go, and it's worth keeping this in mind: as you build experience with models and techniques, think of ways you could put this into action at work today. Please do contact me if you'd like to discuss more or if you have ideas to bounce off me.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bc8e7064-bca4-48b5-8598-dee42658cab3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install -q -U google-generativeai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6900b2a8-6384-4316-8aaa-5e519fca4254",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Connecting to OpenAI\n",
|
||||
"\n",
|
||||
"The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI.\n",
|
||||
"\n",
|
||||
"## Troubleshooting if you have problems:\n",
|
||||
"\n",
|
||||
"Head over to the [troubleshooting](troubleshooting.ipynb) notebook in this folder for step by step code to identify the root cause and fix it!\n",
|
||||
"\n",
|
||||
"If you make a change, try restarting the \"Kernel\" (the python process sitting behind this notebook) by Kernel menu >> Restart Kernel and Clear Outputs of All Cells. Then try this notebook again, starting at the top.\n",
|
||||
"\n",
|
||||
"Or, contact me! Message me or email ed@edwarddonner.com and we will get this to work.\n",
|
||||
"\n",
|
||||
"Any concerns about API costs? See my notes in the README - costs should be minimal, and you can control it at every point. You can also use Ollama as a free alternative, which we discuss during Day 2."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "442fc84b-0815-4f40-99ab-d9a5da6bda91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's make a quick call to a Frontier model to get started, as a preview!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "81249b57-bf32-42a5-870d-411a58792dcc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"MODEL = \"llama3.2\"\n",
|
||||
"openai = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": \"What is 2 + 2?\"}]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a58394bf-1e45-46af-9bfd-01e24da6f49a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n",
|
||||
"\n",
|
||||
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"response = openai.chat.completions.create(model=\"llama3.2\", messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OK onwards with our first project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" \"\"\"\n",
|
||||
" A utility class to represent a website that we have scraped\n",
|
||||
"\n",
|
||||
" \"\"\"\n",
|
||||
" url:str\n",
|
||||
" title:str\n",
|
||||
" text:str\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://edwarddonner.com\")\n",
|
||||
"print(ed.title)\n",
|
||||
"# print(ed.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a478a0c-2c53-48ff-869c-4d08199931e1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Types of prompts\n",
|
||||
"\n",
|
||||
"You may know this already - but if not, you will get very familiar with it!\n",
|
||||
"\n",
|
||||
"Models like GPT4o have been trained to receive instructions in a particular way.\n",
|
||||
"\n",
|
||||
"They expect to receive:\n",
|
||||
"\n",
|
||||
"**A system prompt** that tells them what task they are performing and what tone they should use\n",
|
||||
"\n",
|
||||
"**A user prompt** -- the conversation starter that they should reply to"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.'\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(user_prompt_for(ed))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea211b5f-28e1-4a86-8e52-c0b7677cadcc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Messages\n",
|
||||
"\n",
|
||||
"The API from OpenAI expects to receive messages in a particular structure.\n",
|
||||
"Many of the other APIs share this structure:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"[\n",
|
||||
" {\"role\": \"system\", \"content\": \"system message goes here\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"user message goes here\"}\n",
|
||||
"]\n",
|
||||
"```\n",
|
||||
"To give you a preview, the next 2 cells make a rather simple call - we won't stretch the mighty GPT (yet!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f25dcd35-0cd0-4235-9f64-ac37ed9eaaa5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is 2 + 2?\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "21ed95c5-7001-47de-a36d-1d6673b403ce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with system and user messages:\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create(model=\"llama3.2\", messages=messages)\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for GPT-4o-mini, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "36478464-39ee-485c-9f3f-6a4e458dbc9c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Try this out, and then try for a few more websites\n",
|
||||
"\n",
|
||||
"messages_for(ed)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"!ollama pull llama3.2\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"MODEL = \"llama3.2\"\n",
|
||||
"openai = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1b65dd67-8ae7-4932-85ad-128bf8850148",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's try more websites\n",
|
||||
"\n",
|
||||
"Note that this will only work on websites that can be scraped using this simplistic approach.\n",
|
||||
"\n",
|
||||
"Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n",
|
||||
"\n",
|
||||
"Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n",
|
||||
"\n",
|
||||
"But many websites will work just fine!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c951be1a-7f1b-448f-af1f-845978e47e2c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#181;\">Business applications</h2>\n",
|
||||
" <span style=\"color:#181;\">In this exercise, you experienced calling the Cloud API of a Frontier Model (a leading model at the frontier of AI) for the first time. We will be using APIs like OpenAI at many stages in the course, in addition to building our own LLMs.\n",
|
||||
"\n",
|
||||
"More specifically, we've applied this to Summarization - a classic Gen AI use case to make a summary. This can be applied to any business vertical - summarizing the news, summarizing financial performance, summarizing a resume in a cover letter - the applications are limitless. Consider how you could apply Summarization in your business, and try prototyping a solution.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#900;\">Before you continue - now try yourself</h2>\n",
|
||||
" <span style=\"color:#900;\">Use the cell below to make your own simple commercial example. Stick with the summarization use case for now. Here's an idea: write something that will take the contents of an email, and will suggest an appropriate short subject line for the email. That's the kind of feature that might be built into a commercial email tool.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \"you are an assistant which analyzes the website content and understand it\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
" Summarize the website https://www.github.com. ignore the components like input,forms etc\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ] # fill this in\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"\n",
|
||||
"response =openai.chat.completions.create(\n",
|
||||
" model=\"llama3.2\",\n",
|
||||
" messages = messages )\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"summary = response.choices[0].message.content\n",
|
||||
"display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## An extra exercise for those who enjoy web scraping\n",
|
||||
"\n",
|
||||
"You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. If you have experience with Selenium, Playwright or similar, then feel free to improve the Website class to use them. In the community-contributions folder, you'll find an example Selenium solution from a student (thank you!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eeab24dc-5f90-4570-b542-b0585aca3eb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sharing your code\n",
|
||||
"\n",
|
||||
"I'd love it if you share your code afterwards so I can share it with others! You'll notice that some students have already made changes (including a Selenium implementation) which you will find in the community-contributions folder. If you'd like to add your changes to that folder, submit a Pull Request with your new versions in that folder and I'll merge your changes.\n",
|
||||
"\n",
|
||||
"If you're not an expert with git (and I am not!) then GPT has given some nice instructions on how to submit a Pull Request. It's a bit of an involved process, but once you've done it once it's pretty clear. As a pro-tip: it's best if you clear the outputs of your Jupyter notebooks (Edit >> Clean outputs of all cells, and then Save) for clean notebooks.\n",
|
||||
"\n",
|
||||
"Here are good instructions courtesy of an AI friend: \n",
|
||||
"https://chatgpt.com/share/677a9cb5-c64c-8012-99e0-e06e88afd293"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
112
week1/community-contributions/day1_selenium_chromedriver.ipynb
Normal file
112
week1/community-contributions/day1_selenium_chromedriver.ipynb
Normal file
@@ -0,0 +1,112 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eabbbc62-1de1-4883-9b3e-9c90145ea6c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time\n",
|
||||
"import os \n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" def __init__(self, url, driver_path=None, wait_time=3):\n",
|
||||
" self.url = url\n",
|
||||
" self.wait_time = wait_time\n",
|
||||
"\n",
|
||||
" # Headless Chrome settings\n",
|
||||
" options = Options()\n",
|
||||
" # options.add_argument(\"--headless\") \n",
|
||||
" # Headless mode runs the browser in the background (invisible).\n",
|
||||
" # However, some websites (like openai.com) block headless browsers.\n",
|
||||
" # So if this line is active, the page may not load correctly and you may not get the full content.\n",
|
||||
" options.add_argument(\"--disable-gpu\")\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--window-size=1920x1080\")\n",
|
||||
"\n",
|
||||
" # Driver path\n",
|
||||
" if driver_path:\n",
|
||||
" service = Service(executable_path=driver_path)\n",
|
||||
" else:\n",
|
||||
" service = Service() \n",
|
||||
"\n",
|
||||
" # Start browser\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Wait for the page to finish loading\n",
|
||||
" time.sleep(self.wait_time)\n",
|
||||
"\n",
|
||||
" # Take page source\n",
|
||||
" html = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # Analysis with BeautifulSoup \n",
|
||||
" soup = BeautifulSoup(html, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
"\n",
|
||||
" # Clean irrelevant tags\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
"\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"site = Website(\"https://openai.com\", driver_path=\"/Users/gizemmervedemir/Downloads/chromedriver-mac-arm64/chromedriver\")\n",
|
||||
"print(\"Title:\", site.title)\n",
|
||||
"print(\"\\nFirst 500 character:\\n\", site.text[:500])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
271
week1/community-contributions/day1_selenium_implementation.ipynb
Normal file
271
week1/community-contributions/day1_selenium_implementation.ipynb
Normal file
@@ -0,0 +1,271 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eeab24dc-5f90-4570-b542-b0585aca3eb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sharing your code\n",
|
||||
"\n",
|
||||
"I'd love it if you share your code afterwards so I can share it with others! You'll notice that some students have already made changes (including a Selenium implementation) which you will find in the community-contributions folder. If you'd like add your changes to that folder, submit a Pull Request with your new versions in that folder and I'll merge your changes.\n",
|
||||
"\n",
|
||||
"If you're not an expert with git (and I am not!) then GPT has given some nice instructions on how to submit a Pull Request. It's a bit of an involved process, but once you've done it once it's pretty clear. As a pro-tip: it's best if you clear the outputs of your Jupyter notebooks (Edit >> Clean outputs of all cells, and then Save) for clean notebooks\n",
|
||||
"\n",
|
||||
"Here are good instructions courtesy of an AI friend: \n",
|
||||
"https://chatgpt.com/share/677a9cb5-c64c-8012-99e0-e06e88afd293"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acbb92b2-b625-4a37-b03a-09dc8f06b222",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d6448a12-6aa1-4dd1-aaf1-c8a3a3c3ecb0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install webdriver-manager"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4484fcf-8b39-4c3f-9674-37970ed71988",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"# Import necessary modules\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"class ScrapeWebsite:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given URL using Selenium + BeautifulSoup\n",
|
||||
" Supports JavaScript-heavy and normal websites uniformly.\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
"\n",
|
||||
" # Configure headless Chrome\n",
|
||||
" options = Options()\n",
|
||||
" options.add_argument('--headless')\n",
|
||||
" options.add_argument('--no-sandbox')\n",
|
||||
" options.add_argument('--disable-dev-shm-usage')\n",
|
||||
"\n",
|
||||
" # Use webdriver-manager to manage ChromeDriver\n",
|
||||
" service = Service(ChromeDriverManager().install())\n",
|
||||
"\n",
|
||||
" # Initialize the Chrome WebDriver with the service and options\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
"\n",
|
||||
" # Start Selenium WebDriver\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Wait for JS to load (adjust as needed)\n",
|
||||
" time.sleep(3)\n",
|
||||
"\n",
|
||||
" # Fetch the page source after JS execution\n",
|
||||
" page_source = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # Parse the HTML content with BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
"\n",
|
||||
" # Extract title\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
"\n",
|
||||
" # Remove unnecessary elements\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
"\n",
|
||||
" # Extract the main text\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f576f485-60c0-4539-bfb3-79d821ebefa4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize_js_website(url):\n",
|
||||
" website = ScrapeWebsite(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00ac3659-e4f0-4b64-8041-ba35bfa2c4c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary = summarize_js_website(\"https://dheerajmaddi.netlify.app/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d526136e-9960-4f09-aad0-32f8c11de0ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bcf1fd75-9964-4223-bcda-f2794bc9f7af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,751 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# YOUR FIRST LAB\n",
|
||||
"### Please read this section. This is valuable to get you prepared, even if it's a long read -- it's important stuff.\n",
|
||||
"\n",
|
||||
"## Your first Frontier LLM Project\n",
|
||||
"\n",
|
||||
"Let's build a useful LLM solution - in a matter of minutes.\n",
|
||||
"\n",
|
||||
"By the end of this course, you will have built an autonomous Agentic AI solution with 7 agents that collaborate to solve a business problem. All in good time! We will start with something smaller...\n",
|
||||
"\n",
|
||||
"Our goal is to code a new kind of Web Browser. Give it a URL, and it will respond with a summary. The Reader's Digest of the internet!!\n",
|
||||
"\n",
|
||||
"Before starting, you should have completed the setup for [PC](../SETUP-PC.md) or [Mac](../SETUP-mac.md) and you hopefully launched this jupyter lab from within the project root directory, with your environment activated.\n",
|
||||
"\n",
|
||||
"## If you're new to Jupyter Lab\n",
|
||||
"\n",
|
||||
"Welcome to the wonderful world of Data Science experimentation! Once you've used Jupyter Lab, you'll wonder how you ever lived without it. Simply click in each \"cell\" with code in it, such as the cell immediately below this text, and hit Shift+Return to execute that cell. As you wish, you can add a cell with the + button in the toolbar, and print values of variables, or try out variations. \n",
|
||||
"\n",
|
||||
"I've written a notebook called [Guide to Jupyter](Guide%20to%20Jupyter.ipynb) to help you get more familiar with Jupyter Labs, including adding Markdown comments, using `!` to run shell commands, and `tqdm` to show progress.\n",
|
||||
"\n",
|
||||
"## If you're new to the Command Line\n",
|
||||
"\n",
|
||||
"Please see these excellent guides: [Command line on PC](https://chatgpt.com/share/67b0acea-ba38-8012-9c34-7a2541052665) and [Command line on Mac](https://chatgpt.com/canvas/shared/67b0b10c93a081918210723867525d2b). \n",
|
||||
"\n",
|
||||
"## If you'd prefer to work in IDEs\n",
|
||||
"\n",
|
||||
"If you're more comfortable in IDEs like VSCode or Pycharm, they both work great with these lab notebooks too. \n",
|
||||
"If you'd prefer to work in VSCode, [here](https://chatgpt.com/share/676f2e19-c228-8012-9911-6ca42f8ed766) are instructions from an AI friend on how to configure it for the course.\n",
|
||||
"\n",
|
||||
"## If you'd like to brush up your Python\n",
|
||||
"\n",
|
||||
"I've added a notebook called [Intermediate Python](Intermediate%20Python.ipynb) to get you up to speed. But you should give it a miss if you already have a good idea what this code does: \n",
|
||||
"`yield from {book.get(\"author\") for book in books if book.get(\"author\")}`\n",
|
||||
"\n",
|
||||
"## I am here to help\n",
|
||||
"\n",
|
||||
"If you have any problems at all, please do reach out. \n",
|
||||
"I'm available through the platform, or at ed@edwarddonner.com, or at https://www.linkedin.com/in/eddonner/ if you'd like to connect (and I love connecting!) \n",
|
||||
"And this is new to me, but I'm also trying out X/Twitter at [@edwarddonner](https://x.com/edwarddonner) - if you're on X, please show me how it's done 😂 \n",
|
||||
"\n",
|
||||
"## More troubleshooting\n",
|
||||
"\n",
|
||||
"Please see the [troubleshooting](troubleshooting.ipynb) notebook in this folder to diagnose and fix common problems. At the very end of it is a diagnostics script with some useful debug info.\n",
|
||||
"\n",
|
||||
"## If this is old hat!\n",
|
||||
"\n",
|
||||
"If you're already comfortable with today's material, please hang in there; you can move swiftly through the first few labs - we will get much more in depth as the weeks progress.\n",
|
||||
"\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#900;\">Please read - important note</h2>\n",
|
||||
" <span style=\"color:#900;\">The way I collaborate with you may be different to other courses you've taken. I prefer not to type code while you watch. Rather, I execute Jupyter Labs, like this, and give you an intuition for what's going on. My suggestion is that you carefully execute this yourself, <b>after</b> watching the lecture. Add print statements to understand what's going on, and then come up with your own variations. If you have a Github account, use this to showcase your variations. Not only is this essential practice, but it demonstrates your skills to others, including perhaps future clients or employers...</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#f71;\">Treat these labs as a resource</h2>\n",
|
||||
" <span style=\"color:#f71;\">I push updates to the code regularly. When people ask questions or have problems, I incorporate it in the code, adding more examples or improved commentary. As a result, you'll notice that the code below isn't identical to the videos. Everything from the videos is here; but in addition, I've added more steps and better explanations, and occasionally added new models like DeepSeek. Consider this like an interactive book that accompanies the lectures.\n",
|
||||
" </span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#181;\">Business value of these exercises</h2>\n",
|
||||
" <span style=\"color:#181;\">A final thought. While I've designed these notebooks to be educational, I've also tried to make them enjoyable. We'll do fun things like have LLMs tell jokes and argue with each other. But fundamentally, my goal is to teach skills you can apply in business. I'll explain business implications as we go, and it's worth keeping this in mind: as you build experience with models and techniques, think of ways you could put this into action at work today. Please do contact me if you'd like to discuss more or if you have ideas to bounce off me.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6900b2a8-6384-4316-8aaa-5e519fca4254",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Connecting to OpenAI (or Ollama)\n",
|
||||
"\n",
|
||||
"The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI. \n",
|
||||
"\n",
|
||||
"If you'd like to use free Ollama instead, please see the README section \"Free Alternative to Paid APIs\", and if you're not sure how to do this, there's a full solution in the solutions folder (day1_with_ollama.ipynb).\n",
|
||||
"\n",
|
||||
"## Troubleshooting if you have problems:\n",
|
||||
"\n",
|
||||
"Head over to the [troubleshooting](troubleshooting.ipynb) notebook in this folder for step by step code to identify the root cause and fix it!\n",
|
||||
"\n",
|
||||
"If you make a change, try restarting the \"Kernel\" (the python process sitting behind this notebook) by Kernel menu >> Restart Kernel and Clear Outputs of All Cells. Then try this notebook again, starting at the top.\n",
|
||||
"\n",
|
||||
"Or, contact me! Message me or email ed@edwarddonner.com and we will get this to work.\n",
|
||||
"\n",
|
||||
"Any concerns about API costs? See my notes in the README - costs should be minimal, and you can control it at every point. You can also use Ollama as a free alternative, which we discuss during Day 2."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "442fc84b-0815-4f40-99ab-d9a5da6bda91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's make a quick call to a Frontier model to get started, as a preview!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a58394bf-1e45-46af-9bfd-01e24da6f49a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n",
|
||||
"\n",
|
||||
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OK onwards with our first project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://edwarddonner.com\")\n",
|
||||
"print(ed.title)\n",
|
||||
"print(ed.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a478a0c-2c53-48ff-869c-4d08199931e1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Types of prompts\n",
|
||||
"\n",
|
||||
"You may know this already - but if not, you will get very familiar with it!\n",
|
||||
"\n",
|
||||
"Models like GPT4o have been trained to receive instructions in a particular way.\n",
|
||||
"\n",
|
||||
"They expect to receive:\n",
|
||||
"\n",
|
||||
"**A system prompt** that tells them what task they are performing and what tone they should use\n",
|
||||
"\n",
|
||||
"**A user prompt** -- the conversation starter that they should reply to"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(user_prompt_for(ed))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea211b5f-28e1-4a86-8e52-c0b7677cadcc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Messages\n",
|
||||
"\n",
|
||||
"The API from OpenAI expects to receive messages in a particular structure.\n",
|
||||
"Many of the other APIs share this structure:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"[\n",
|
||||
" {\"role\": \"system\", \"content\": \"system message goes here\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"user message goes here\"}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"To give you a preview, the next 2 cells make a rather simple call - we won't stretch the mighty GPT (yet!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f25dcd35-0cd0-4235-9f64-ac37ed9eaaa5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is 2 + 2?\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "21ed95c5-7001-47de-a36d-1d6673b403ce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with system and user messages:\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for GPT-4o-mini, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "36478464-39ee-485c-9f3f-6a4e458dbc9c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Try this out, and then try for a few more websites\n",
|
||||
"\n",
|
||||
"messages_for(ed)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's try more websites\n",
|
||||
"\n",
|
||||
"Note that this will only work on websites that can be scraped using this simplistic approach.\n",
|
||||
"\n",
|
||||
"Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n",
|
||||
"\n",
|
||||
"Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n",
|
||||
"\n",
|
||||
"But many websites will work just fine!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f84c01ba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Install Selenium using Conda\n",
|
||||
"\n",
|
||||
"## First we need to install selenium package using conda"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "14d1ca84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%conda install -c conda-forge selenium -y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a5f35b45",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Change the website class to use selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed2ebef8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this WebsiteSelenium object from the given URL using Selenium and BeautifulSoup.\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
"\n",
|
||||
" # Set up Selenium WebDriver with headless Chrome\n",
|
||||
" chrome_options = Options()\n",
|
||||
" chrome_options.add_argument(\"--no-sandbox\")\n",
|
||||
" chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
|
||||
" chrome_options.add_argument(\"--disable-blink-features=AutomationControlled\") # Prevent detection\n",
|
||||
" chrome_options.add_argument(\"--disable-infobars\") # Disable \"Chrome is being controlled\" infobar\n",
|
||||
" \n",
|
||||
" # Remove the default \"user-agent\" string\n",
|
||||
" # chrome_options.add_argument(\"user-agent=YOUR_CUSTOM_USER_AGENT\") # Use a user-agent string from a real browser\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" service = Service() # Use default ChromeDriver path\n",
|
||||
" driver = webdriver.Chrome(service=service, options=chrome_options)\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Fetch the webpage\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Get the page source\n",
|
||||
" page_source = driver.page_source\n",
|
||||
"\n",
|
||||
" # Parse the page source with BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
|
||||
" finally:\n",
|
||||
" # Close the WebDriver\n",
|
||||
" driver.quit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "66eae3bd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Now let's try again"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f9ef6a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c951be1a-7f1b-448f-af1f-845978e47e2c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#181;\">Business applications</h2>\n",
|
||||
" <span style=\"color:#181;\">In this exercise, you experienced calling the Cloud API of a Frontier Model (a leading model at the frontier of AI) for the first time. We will be using APIs like OpenAI at many stages in the course, in addition to building our own LLMs.\n",
|
||||
"\n",
|
||||
"More specifically, we've applied this to Summarization - a classic Gen AI use case to make a summary. This can be applied to any business vertical - summarizing the news, summarizing financial performance, summarizing a resume in a cover letter - the applications are limitless. Consider how you could apply Summarization in your business, and try prototyping a solution.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#900;\">Before you continue - now try yourself</h2>\n",
|
||||
" <span style=\"color:#900;\">Use the cell below to make your own simple commercial example. Stick with the summarization use case for now. Here's an idea: write something that will take the contents of an email, and will suggest an appropriate short subject line for the email. That's the kind of feature that might be built into a commercial email tool.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0ab3a6bb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# CV improver for a job\n",
|
||||
"\n",
|
||||
"We are going to use AI to help us improve our linkedin profile for a given linkedIn Job URL.\n",
|
||||
"\n",
|
||||
"It will take in our profile URL and a job URL and it will output several recommendations on how to modify our profile to better match what is required in the job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62d46f7e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def user_prompt_for_job(candidate_profile_url, job_url):\n",
|
||||
" candidate_profile = Website(candidate_profile_url)\n",
|
||||
" user_prompt = f\"You are looking at a candidate profile titled {candidate_profile.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this candidate profile is as follows;\\n\"\n",
|
||||
" user_prompt += candidate_profile.text\n",
|
||||
"\n",
|
||||
" job = Website(job_url)\n",
|
||||
" user_prompt += \"\\nThis candidate wants to apply to following job: {job.title} \\n \"\n",
|
||||
" user_prompt += \"\\nThe details of the jobs is as follows; \\\n",
|
||||
" please provide the candidate at least 5 skills or areas of improvement to add \\\n",
|
||||
" to their linkedin profile.\\n\\n\"\n",
|
||||
" user_prompt += job.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \" You are a recruiter speciallised in HR and talent adquisition. \\\n",
|
||||
" You'll be analising the linkedin profile of a candidate and a published job \\\n",
|
||||
" and will give the candidate recommendations on how to modify their profile \\\n",
|
||||
" to better match the job. Respond in markdown.\"\n",
|
||||
"\n",
|
||||
"user_prompt = user_prompt_for_job(\n",
|
||||
" candidate_profile_url=\"https://www.linkedin.com/in/eddonner/\", \n",
|
||||
" job_url=\"https://www.linkedin.com/jobs/view/4130488506\")\n",
|
||||
"\n",
|
||||
"print(user_prompt)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7535220",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
"response = response.choices[0].message.content\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"\n",
|
||||
"display(Markdown(response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## An extra exercise for those who enjoy web scraping\n",
|
||||
"\n",
|
||||
"You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. If you have experience with Selenium, Playwright or similar, then feel free to improve the Website class to use them. In the community-contributions folder, you'll find an example Selenium solution from a student (thank you!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eeab24dc-5f90-4570-b542-b0585aca3eb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sharing your code\n",
|
||||
"\n",
|
||||
"I'd love it if you share your code afterwards so I can share it with others! You'll notice that some students have already made changes (including a Selenium implementation) which you will find in the community-contributions folder. If you'd like add your changes to that folder, submit a Pull Request with your new versions in that folder and I'll merge your changes.\n",
|
||||
"\n",
|
||||
"If you're not an expert with git (and I am not!) then GPT has given some nice instructions on how to submit a Pull Request. It's a bit of an involved process, but once you've done it once it's pretty clear. As a pro-tip: it's best if you clear the outputs of your Jupyter notebooks (Edit >> Clean outputs of all cells, and then Save) for clean notebooks.\n",
|
||||
"\n",
|
||||
"Here are good instructions courtesy of an AI friend: \n",
|
||||
"https://chatgpt.com/share/677a9cb5-c64c-8012-99e0-e06e88afd293"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4484fcf-8b39-4c3f-9674-37970ed71988",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llms",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "1b809d22-d170-4db3-a298-1740ce06b534",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Udemy Course >> LLM Engineering: Master AI and LLMs\n",
|
||||
"#Student: Jay\n",
|
||||
"#Date: Apr 20, 2025\n",
|
||||
"#Home work: Day1 - Summmarize website using local LLama\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "01e91579-7e32-4c4d-9cc9-c06d13c16209",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "8d780fba-868c-4216-88f5-1e3ca5ad43ed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "839b645f-90ee-434d-b0bd-1cb4e574a8de",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "ef2453e8-3eca-4f6d-8ccf-9e5274b589a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "6ec397d5-e9b0-411d-8bdb-66605273cb11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is 2 + 2?\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "76aed9eb-a085-4687-859d-817c771156fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "26de4682-cf4f-4b7e-8cb2-049f7f46b758",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages_for(website) \n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "16b2532a-d44c-4903-83ec-0b828a2d1b92",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "86af4905-5d5c-47c9-b9b2-27257452ff94",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Anthropic Website Summary**\n",
|
||||
"=====================================\n",
|
||||
"\n",
|
||||
"### Mission and Values\n",
|
||||
"\n",
|
||||
"Anthropic's mission is to build AI that serves humanity's long-term well-being. They focus on designing powerful technologies with human benefit at their foundation, aiming to demonstrate responsible AI development in practice.\n",
|
||||
"\n",
|
||||
"### Notable Releases\n",
|
||||
"\n",
|
||||
"#### 2025\n",
|
||||
"\n",
|
||||
"* **Claude 3.7 Sonnet**: Anthropic's most intelligent AI model, now available.\n",
|
||||
"* Recent news articles:\n",
|
||||
"\t+ \"Tracing the thoughts of a large language model: Interpretability\"\n",
|
||||
"\t+ \"Anthropic Economic Index: Societal Impacts\"\n",
|
||||
"\n",
|
||||
"### Products and Solutions\n",
|
||||
"\n",
|
||||
"* **Claude**: A suite of AI tools for building applications and custom experiences with human benefit in mind.\n",
|
||||
"* **Claude Overview**, **API Platform**, and various other products, including:\n",
|
||||
"\t+ **Claude 3.5 Haiku**\n",
|
||||
"\t+ **Claude 3 Opus**\n",
|
||||
"\n",
|
||||
"### Research and Commitments\n",
|
||||
"\n",
|
||||
"* The Anthropic Academy: A learning platform for developers to build AI solutions with Claude.\n",
|
||||
"* Responsible scaling policy and alignment science initiatives.\n",
|
||||
"\n",
|
||||
"### News Section (Selection)**\n",
|
||||
"\n",
|
||||
"Anthropic's recent news articles:\n",
|
||||
"* \"Claude extended thinking\"\n",
|
||||
"* \"Alignment faking in large language models\"\n",
|
||||
"\n",
|
||||
"### Company Information\n",
|
||||
"\n",
|
||||
"For more information on Anthropic, including company, careers, and help resources, follow the provided links."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a5151062-614e-44ff-b341-d3f64e28aa93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,248 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Welcome to your first assignment!\n",
|
||||
"\n",
|
||||
"Instructions are below. Please give this a try, and look in the solutions folder if you get stuck (or feel free to ask me!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ada885d9-4d42-4d9b-97f0-74fbbbfe93a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#f71;\">Just before we get to the assignment --</h2>\n",
|
||||
" <span style=\"color:#f71;\">I thought I'd take a second to point you at this page of useful resources for the course. This includes links to all the slides.<br/>\n",
|
||||
" <a href=\"https://edwarddonner.com/2024/11/13/llm-engineering-resources/\">https://edwarddonner.com/2024/11/13/llm-engineering-resources/</a><br/>\n",
|
||||
" Please keep this bookmarked, and I'll continue to add more useful links there over time.\n",
|
||||
" </span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23057e00-b6fc-4678-93a9-6b31cb704bff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# There's actually an alternative approach that some people might prefer\n",
|
||||
"# You can use the OpenAI client python library to call Ollama:\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"\n",
|
||||
"response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1622d9bb-5c68-4d4e-9ca4-b492c751f898",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NOW the exercise for you\n",
|
||||
"\n",
|
||||
"Take the code from day1 and incorporate it here, to build a website summarizer that uses Llama 3.2 running locally instead of OpenAI; use either of the above approaches."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "37e35a64-7c2a-453d-96fa-9c8119c6618d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "fc410fe7-7abe-48ab-9206-ec6412278ac5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "654af616-1ad4-4d28-be41-f3c99b6e8f42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "f665c051-95a2-4102-8e26-1974bd5c7d3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "258cf0af-650f-4225-b1c1-8f29e209ebfd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "fe5291b0-a2bb-4b60-af77-d33517a7005b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")\n",
|
||||
" response = client.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "b53f34cd-f8ce-4656-a46a-33e966156e2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "5b28ccfa-eb27-4154-aeb6-aff439c8a723",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"**Website Summary**\n",
|
||||
"=====================\n",
|
||||
"\n",
|
||||
"### About the Website\n",
|
||||
"\n",
|
||||
"This website is owned by Edward Donner, a co-founder and CTO of Nebula.io, an AI company that applies AI to help people discover their potential. The website provides information about his background, experience, and work with LLMs (Large Language Models).\n",
|
||||
"\n",
|
||||
"### News and Announcements\n",
|
||||
"\n",
|
||||
"* **Upcoming Events:**\n",
|
||||
" + January 23, 2025: LLM Workshop - Hands-on with Agents - resources\n",
|
||||
" + December 21, 2024: Welcome to the SuperDataScientists community!\n",
|
||||
" + November 13, 2024: Mastering AI and LLL Engineering - Resources\n",
|
||||
" + October 16, 2024: From Software Engineer to AI Data Scientist - resources\n",
|
||||
"* **Acquisition:**\n",
|
||||
" + In 2021, Edward's previous startup untapt was acquired.\n",
|
||||
"\n",
|
||||
"### Links\n",
|
||||
"\n",
|
||||
"The website also provides links to Edward Donner's social media profiles (LinkedIn, Twitter, Facebook), as well as a newsletter signup form."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
207
week1/community-contributions/day2_exercise_deepseek_r1.ipynb
Normal file
207
week1/community-contributions/day2_exercise_deepseek_r1.ipynb
Normal file
@@ -0,0 +1,207 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2b00a7de-c563-4d41-b8ab-84128f0f3069",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "daa9de5c-6241-46aa-a51d-98bc154ee6e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f3bf8e10-5770-4081-b099-cf83e41126b8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e6a5d9d5-a617-4ea4-9b03-3eae2dd4520d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c01a0e24-ccf7-4359-a731-dcda6bfc5023",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "43f5df54-a34b-42cd-a6b2-e28996a84ff7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "b79e63fb-b741-4f4a-8bc4-66a60feef2cd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" # response = openai.chat.completions.create(\n",
|
||||
" # model = \"gpt-4o-mini\",\n",
|
||||
" # messages = messages_for(website)\n",
|
||||
" # )\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=\"deepseek-r1:1.5b\",\n",
|
||||
" messages=messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "3298a858-e5de-4804-b188-06c0ce6471b0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "40dcb721-f807-47bf-9d18-f6a649c371e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"<think>\n",
|
||||
"Alright, so I'm trying to figure out why CNN's \"World Today\" section is showing the word \"Gulf.\" I remember that \"Gulf\" was a significant event, but not sure if there have been any other notable events in the Gulf region around recent times. Is it about oil production or something else? Maybe natural gas?\n",
|
||||
"Wait, I've heard of an oil-producing company called BP, which is associated with the Gulf. They're big on Shell and Opec members like Russia. But does that make \"Gulf\" part of their content? Or is it more about how the Gulf looks in visual terms?\n",
|
||||
"\n",
|
||||
"I'm also thinking about news categories—global events, geopolitical stuff, tech, culture, etc.—and maybe a recent oil production related report. Maybe they show real-time data about how much oil has been produced or the availability in some country nearby.\n",
|
||||
"\n",
|
||||
"Hmm, I'm not sure if \"Gulf\" refers to Earth's location or just part of the Gulf region because they have two Gulfagoras islands as landmarks that sound similar to \"Gulf.\" Could it be a typo where the team named these after BP? But then if it were Gulfagoras, wouldn't they be more about geography than the actual oil aspect? Or maybe they were the names when BP was discovered?\n",
|
||||
"\n",
|
||||
"I think CNN is aiming for current news, so they probably show how something or another company has done in the Gulf. Since BP is big there, with data like gas production and costs, that might fit under geopolitical or energy news. But why specifically \"Gulf\"? Maybe to align with how some other teams handle Earth-related news.\n",
|
||||
"\n",
|
||||
"Overall, I'm leaning towards it being about BP's recent activities due to their oil involvement in the Gulf region. They typically cover geological products, energy, and production reports, so \"Gulf\" probably refers to those specific topics or regions.\n",
|
||||
"</think>\n",
|
||||
"\n",
|
||||
" CNN's \"World Today\" section shows \"Gulf,\" likely referring to BP's exploration of Earth resources, particularly in relation to their oil production in the Gulf region. BP is well-known for being part of the Gulf region and associated with major companies like Shell, Opec members such as Russia, and significant geological features like the Gulfagoras islands, which might be named after them due to BP's location or discovery nearby. Therefore, this title reflects their current geopolitical news focusing on energy-related activities in the Gulf.\n",
|
||||
"\n",
|
||||
"**Answer:** The \"Gulf\" section likely refers to the oil production activities of BP, linking them to the Gulf region and geologically significant features in Earth terms."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f41a586e-fe1f-4040-8ebb-31887981907f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,266 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Welcome to your first assignment!\n",
|
||||
"\n",
|
||||
"Instructions are below. Please give this a try, and look in the solutions folder if you get stuck (or feel free to ask me!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ada885d9-4d42-4d9b-97f0-74fbbbfe93a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#f71;\">Just before we get to the assignment --</h2>\n",
|
||||
" <span style=\"color:#f71;\">I thought I'd take a second to point you at this page of useful resources for the course. This includes links to all the slides.<br/>\n",
|
||||
" <a href=\"https://edwarddonner.com/2024/11/13/llm-engineering-resources/\">https://edwarddonner.com/2024/11/13/llm-engineering-resources/</a><br/>\n",
|
||||
" Please keep this bookmarked, and I'll continue to add more useful links there over time.\n",
|
||||
" </span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9cc85216-f6e4-436e-b6c1-976c8f2d1152",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install webdriver-manager\n",
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"import ollama\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29ddd15d-a3c5-4f4e-a678-873f56162724",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "479ff514-e8bd-4985-a572-2ea28bb4fa40",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's just make sure the model is loaded\n",
|
||||
"\n",
|
||||
"!ollama pull llama3.2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a021f13-d6a1-4b96-8e18-4eae49d876fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Introducing the ollama package\n",
|
||||
"\n",
|
||||
"And now we'll do the same thing, but using the elegant ollama python package instead of a direct HTTP call.\n",
|
||||
"\n",
|
||||
"Under the hood, it's making the same call as above to the ollama server running at localhost:11434"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4704e10-f5fb-4c15-a935-f046c06fb13d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Alternative approach - using OpenAI python library to connect to Ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23057e00-b6fc-4678-93a9-6b31cb704bff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# There's actually an alternative approach that some people might prefer\n",
|
||||
"# You can use the OpenAI client python library to call Ollama:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1622d9bb-5c68-4d4e-9ca4-b492c751f898",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NOW the exercise for you\n",
|
||||
"\n",
|
||||
"Take the code from day1 and incorporate it here, to build a website summarizer that uses Llama 3.2 running locally instead of OpenAI; use either of the above approaches."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8251b6a5-7b43-42b9-84a9-4a94b6bdb933",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"class ScrapeWebsite:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given URL using Selenium + BeautifulSoup\n",
|
||||
" Supports JavaScript-heavy and normal websites uniformly.\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
"\n",
|
||||
" # Configure headless Chrome\n",
|
||||
" options = Options()\n",
|
||||
" options.add_argument('--headless')\n",
|
||||
" options.add_argument('--no-sandbox')\n",
|
||||
" options.add_argument('--disable-dev-shm-usage')\n",
|
||||
"\n",
|
||||
" # Use webdriver-manager to manage ChromeDriver\n",
|
||||
" service = Service(ChromeDriverManager().install())\n",
|
||||
"\n",
|
||||
" # Initialize the Chrome WebDriver with the service and options\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
"\n",
|
||||
" # Start Selenium WebDriver\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Wait for JS to load (adjust as needed)\n",
|
||||
" time.sleep(3)\n",
|
||||
"\n",
|
||||
" # Fetch the page source after JS execution\n",
|
||||
" page_source = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # Parse the HTML content with BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
"\n",
|
||||
" # Extract title\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
"\n",
|
||||
" # Remove unnecessary elements\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
"\n",
|
||||
" # Extract the main text\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\"\n",
|
||||
"\n",
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = ScrapeWebsite(url)\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model = MODEL,\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5dbf8d5c-a42a-4a72-b3a4-c75865b841bb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary = summarize(\"https://edwarddonner.com/2024/11/13/llm-engineering-resources/\")\n",
|
||||
"display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4ddfacdc-b16a-4999-9ff2-93ed19600d24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user