Merge branch 'main' of github.com:ed-donner/llm_engineering

This commit is contained in:
Edward Donner
2025-03-14 09:16:34 -04:00
44 changed files with 3914 additions and 3 deletions

View File

@@ -0,0 +1,273 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fad31e32-2e42-42ae-ae63-c15d90292839",
"metadata": {},
"source": [
"# First Project\n",
"Ollama -> Summary\n",
"huggingface_hub -> \"facebook/m2m100_418M\" for translation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fb79a20-a455-4d27-91a1-91958af786c1",
"metadata": {},
"outputs": [],
"source": [
"!pip install transformers datasets torch\n",
"!pip install huggingface_hub"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e95ac7f2-5192-4f83-acf3-61df30cd3109",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import json\n",
"import ollama"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12276d74-0e79-4e66-9135-1c9d1a80b943",
"metadata": {},
"outputs": [],
"source": [
"class Website:\n",
"    \"\"\"Fetch a web page and keep its title and visible body text.\"\"\"\n",
"\n",
"    def __init__(self, url):\n",
"        \"\"\"Download `url` with requests and parse it with BeautifulSoup.\n",
"\n",
"        Sets `url`, `title` (\"No title found\" when the page has no <title>) and\n",
"        `text` (body text with script/style/img/input elements removed; an empty\n",
"        string when the page has no <body>).\n",
"        \"\"\"\n",
"        self.url = url\n",
"        response = requests.get(url)\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        self.title = soup.title.string if soup.title else \"No title found\"\n",
"        # html.parser does not synthesize a <body>, so soup.body is None for documents\n",
"        # without one; guard here instead of failing on None below.\n",
"        if soup.body is None:\n",
"            self.text = \"\"\n",
"            return\n",
"        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"        self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
"\n",
"huggingface_url = \"https://huggingface.co/learn/ml-for-3d-course\"\n",
"huggingface_website = Website(huggingface_url)\n",
"\n",
"huggingface_data = {\n",
" \"title\": huggingface_website.title,\n",
" \"text\": huggingface_website.text\n",
"}\n",
"print(huggingface_data)\n",
"\n",
"with open('ml_for_3d_course_data.json', 'w') as f:\n",
" json.dump(huggingface_data, f)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d74c85c-3e09-4514-bde4-4cafc4910c52",
"metadata": {},
"outputs": [],
"source": [
"# huggingface_data 'text' value\n",
"huggingface_text = huggingface_data['text']\n",
"\n",
"# Summary\n",
"response_summary = ollama.chat(model=\"llama3.2:latest\", messages=[{\"role\": \"user\", \"content\": f\"Summarize the following text: {huggingface_text}\"}])\n",
"print(response_summary)\n",
"\n",
"# print summary\n",
"summary_huggingface_text = response_summary.message['content']\n",
"print(\"Summary Text:\", summary_huggingface_text)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d13764d5-cb76-46c5-bbe6-d132b31a9ea6",
"metadata": {},
"outputs": [],
"source": [
"# HuggingFace Translation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08405038-4115-487f-9efc-de58572453c1",
"metadata": {},
"outputs": [],
"source": [
"class Website:\n",
"    \"\"\"Fetch a web page and keep its title and visible body text.\"\"\"\n",
"\n",
"    url: str\n",
"    title: str\n",
"    text: str\n",
"\n",
"    def __init__(self, url):\n",
"        \"\"\"Download `url` with requests and parse it with BeautifulSoup.\n",
"\n",
"        Sets `url`, `title` (\"No title found\" when the page has no <title>) and\n",
"        `text` (body text with script/style/img/input elements removed; an empty\n",
"        string when the page has no <body>).\n",
"        \"\"\"\n",
"        self.url = url\n",
"        response = requests.get(url)\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        self.title = soup.title.string if soup.title else \"No title found\"\n",
"        # html.parser does not synthesize a <body>, so soup.body is None for documents\n",
"        # without one; guard here instead of failing on None below.\n",
"        if soup.body is None:\n",
"            self.text = \"\"\n",
"            return\n",
"        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"        self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
"\n",
"url = \"https://huggingface.co/learn/ml-for-3d-course\"\n",
"website = Website(url)\n",
"print(website.title) \n",
"print(website.text[:1000])\n",
"\n",
"data = {\n",
" \"title\": website.title,\n",
" \"text\": website.text\n",
"}\n",
"\n",
"with open('ml_for_3d_course_data.json', 'w') as f:\n",
" json.dump(data, f)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0632352f-4b16-4125-83bf-f3cc3aabd659",
"metadata": {},
"outputs": [],
"source": [
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a85f8625-725d-4d7f-8cb7-8da4276f81cf",
"metadata": {},
"outputs": [],
"source": [
"!pip install sacremoses"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c800cea4-f4a4-4e41-9637-31ff11afb256",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer\n",
"\n",
"# Load the M2M100 model and tokenizer\n",
"model_name = \"facebook/m2m100_418M\"\n",
"model = M2M100ForConditionalGeneration.from_pretrained(model_name)\n",
"tokenizer = M2M100Tokenizer.from_pretrained(model_name)\n",
"\n",
"# Load the saved JSON file\n",
"with open('ml_for_3d_course_data.json', 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"# Extract text from the loaded data\n",
"text = data[\"text\"]\n",
"\n",
"# Set the source language to English and target language to Korean\n",
"source_lang = \"en\"\n",
"target_lang = \"ko\"\n",
"\n",
"# Set the language for tokenizer (important for M2M100)\n",
"tokenizer.src_lang = source_lang\n",
"tokenizer.tgt_lang = target_lang\n",
"\n",
"# Split the text into fixed-size chunks of 512 characters (not tokens).\n",
"# The tokenizer call below still truncates any chunk that exceeds 512 tokens.\n",
"max_input_length = 512\n",
"chunks = [text[i:i+max_input_length] for i in range(0, len(text), max_input_length)]\n",
"\n",
"print(chunks)\n",
"# Initialize a list to hold the translated text\n",
"translated_chunks = []\n",
"\n",
"# Iterate through each chunk and translate it\n",
"for chunk in chunks:\n",
" # Tokenize the chunk\n",
" encoded = tokenizer(chunk, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n",
"\n",
" # Generate translation from the model, forcing the output to be in Korean\n",
" generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(target_lang), max_length=512)\n",
"\n",
" # Decode the translated tokens to text\n",
" translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]\n",
" translated_chunks.append(translated_text)\n",
"\n",
"# Combine all translated chunks back together\n",
"final_translated_text = ' '.join(translated_chunks)\n",
"print(\"Translated Text:\", final_translated_text)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffe0f264-a588-422f-a6e1-b60504d1e02c",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import requests\n",
"\n",
"# Set the Ollama API URL (NOTE(review): Ollama's default port is 11434 - confirm this port and endpoint)\n",
"ollama_url = \"http://localhost:11411/v1/models/facebook/m2m100_418M/generate\"\n",
"\n",
"# Load the saved JSON file\n",
"with open('ml_for_3d_course_data.json', 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"# Extract the text\n",
"course_text = data[\"text\"]\n",
"\n",
"# Set the source and target languages for translation\n",
"source_language = \"en\"\n",
"target_language = \"ko\"\n",
"\n",
"# Prepare the request payload\n",
"payload = {\n",
" \"input_text\": course_text,\n",
" \"src_lang\": source_language,\n",
" \"tgt_lang\": target_language\n",
"}\n",
"\n",
"# Call the API\n",
"response = requests.post(ollama_url, json=payload)\n",
"\n",
"# Check the response\n",
"if response.status_code == 200:\n",
" translated_course_text = response.json().get(\"translated_text\", \"Translation failed\")\n",
" print(\"Translated Course Text:\", translated_course_text)\n",
"else:\n",
" print(f\"Error {response.status_code}: {response.text}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "603cd418-504a-4b4d-b1c3-be04febf3e79",
"metadata": {},
"source": [
"# Article Title Generator\n",
"\n",
"Summarization use-case in which the user provides an article, which the LLM will analyze to suggest an SEO-optimized title.\n",
"\n",
"**NOTES**:\n",
"\n",
"1. This version does NOT support website scraping. You must copy and paste the required article.\n",
"2. The following models were configured:\n",
" a. OpenAI gpt-4o-mini\n",
" b. Llama llama3.2\n",
" c. Deepseek deepseek-r1:1.5b\n",
" It is possible to configure additional models by adding the new model to the MODELS dictionary and its\n",
" initialization to the CLIENTS dictionary. Then, call the model with --> ***answer =\n",
" get_answer('NEW_MODEL')***.\n",
"3. Users are encouraged to assess and rank the suggested titles using any headline analyzer tool online.\n",
" Example: https://www.isitwp.com/headline-analyzer/. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e773daa6-d05e-49bf-ad8e-a8ed4882b77e",
"metadata": {},
"outputs": [],
"source": [
"# Confirming Llama is loaded\n",
"!ollama pull llama3.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "279b0c00-9bb0-4c7f-9c6d-aa0b108274b9",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4730d8d-3e20-4f3c-a4ff-ed2ac0a8aa27",
"metadata": {},
"outputs": [],
"source": [
"# set environment variables for OpenAi\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# validate API Key\n",
"if not api_key:\n",
" raise ValueError(\"No API key was found! Please check the .env file.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1abbb826-de66-498c-94d8-33369ad01885",
"metadata": {},
"outputs": [],
"source": [
"# constants\n",
"MODELS = { 'GPT': 'gpt-4o-mini', \n",
" 'LLAMA': 'llama3.2', \n",
" 'DEEPSEEK': 'deepseek-r1:1.5b'\n",
" }\n",
"\n",
"CLIENTS = { 'GPT': OpenAI(), \n",
" 'LLAMA': OpenAI(base_url='http://localhost:11434/v1', api_key='ollama'),\n",
" 'DEEPSEEK': OpenAI(base_url='http://localhost:11434/v1', api_key='ollama') \n",
" }"
]
},
{
"cell_type": "markdown",
"id": "6f490fe4-32d5-41f3-890d-ecf4e5e01dd4",
"metadata": {},
"source": [
"### Copy & paste your article (without a title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddd76319-13ce-480b-baa7-cab6a5c88168",
"metadata": {},
"outputs": [],
"source": [
"# article - copy & paste your article\n",
"article = \"\"\"\n",
" REPLACE WITH YOUR ARTICLE CONTENT\n",
" \"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1914afad-dbd8-4c1f-8e68-80b0e5d743a9",
"metadata": {},
"outputs": [],
"source": [
"# system prompt\n",
"system_prompt = \"\"\"\n",
" You are an experienced SEO-focused copywriter. The user will provide an article, and your task is to analyze its content and generate the most effective, keyword-optimized title to maximize SEO performance.Respond in Markdown format.\n",
" \"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "176cfac7-5e6d-4d4a-a1c4-1b63b60de1f7",
"metadata": {},
"outputs": [],
"source": [
"# user prompt: the instruction, a blank line, then the pasted article\n",
"user_prompt = f\"Following the article to be analyzed. Respond in Markdown format.\\n\\n{article}\"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c45fc7d7-08c9-4e34-b427-b928a219bb94",
"metadata": {},
"outputs": [],
"source": [
"# message list\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f67b881f-1040-4cf7-82c5-e85f4c0bd252",
"metadata": {},
"outputs": [],
"source": [
"# call model and get answer\n",
"def get_answer(model):\n",
"    \"\"\"Send the notebook-level `messages` to the chosen model and return the reply text.\n",
"\n",
"    `model` is a key shared by the CLIENTS and MODELS dictionaries (e.g. 'GPT').\n",
"    \"\"\"\n",
"    chat_api = CLIENTS[model].chat.completions\n",
"    completion = chat_api.create(model=MODELS[model], messages=messages)\n",
"\n",
"    # the answer is the content of the first returned choice\n",
"    first_choice = completion.choices[0]\n",
"    return first_choice.message.content\n",
" "
]
},
{
"cell_type": "markdown",
"id": "947b42ed-5b43-486d-8af3-e5b671c1fd0e",
"metadata": {},
"source": [
"### Get OpenAI Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb6f66e3-ab99-4f76-9358-896cb43c1fa1",
"metadata": {},
"outputs": [],
"source": [
"# get openAi answer\n",
"answer = get_answer('GPT')\n",
"\n",
"# display openAi answer\n",
"display(Markdown(f\"### {MODELS['GPT']} Answer\\n\\n{answer}\" ))"
]
},
{
"cell_type": "markdown",
"id": "70073ebf-a00a-416b-854d-642d450cd99b",
"metadata": {},
"source": [
"### Get Llama Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caa190bb-de5f-45cc-b671-5d62688f7b25",
"metadata": {},
"outputs": [],
"source": [
"# get Llama answer\n",
"answer = get_answer('LLAMA')\n",
"\n",
"# display Llama answer\n",
"display(Markdown(f\"### {MODELS['LLAMA']} Answer\\n\\n{answer}\" ))"
]
},
{
"cell_type": "markdown",
"id": "811edc4f-20e2-482d-ac89-fae9d1b70bed",
"metadata": {},
"source": [
"### Get Deepseek Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "082628e4-ff4c-46dd-ae5f-76578eb017ad",
"metadata": {},
"outputs": [],
"source": [
"# get Deepseek answer\n",
"answer = get_answer('DEEPSEEK')\n",
"\n",
"# display Deepseek answer\n",
"display(Markdown(f\"### {MODELS['DEEPSEEK']} Answer\\n\\n{answer}\" ))"
]
},
{
"cell_type": "markdown",
"id": "7fc404a6-3a91-4c09-89de-867d3d69b4b2",
"metadata": {},
"source": [
"### Suggested future improvements\n",
"\n",
"1. Add website scraping support to replace copy/pasting of articles.\n",
"2. Improve the system_prompt to provide specific SEO best practices to adopt during the title generation.\n",
"3. Rephrase the system_prompt to ensure the model provides a single Title (not a list of suggestions). \n",
"4. Add the logic that would allow each model to assess the recommendations from the different models and \n",
" select the best among these. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf7403ac-d43b-4493-98bb-6fee94950cb0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,472 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "603cd418-504a-4b4d-b1c3-be04febf3e79",
"metadata": {},
"source": [
"# Article Title Generator (V2)\n",
"\n",
"Summarization use-case in which the user provides an article, which the LLM will analyze to suggest an SEO-optimized title.\n",
"\n",
"**NOTES**:\n",
"\n",
"1. This version supports website scraping using Selenium (based on the code from\n",
"   **/week1/community-contributions/day1-webscraping-selenium-for-javascript.ipynb** - Thanks for the contribution!)\n",
"2. Leverage streaming (OpenAI only).\n",
"3. The following models were configured:\\\n",
" \n",
" a. OpenAI gpt-4o-mini\\\n",
" b. Llama llama3.2\\\n",
" c. Deepseek deepseek-r1:1.5b\\\n",
"\n",
" It is possible to configure additional models by adding the new model to the MODELS dictionary and its\n",
" initialization to the CLIENTS dictionary. Then, call the model with --> ***answer =\n",
" get_answer('NEW_MODEL')***.\n",
"5. Improved system_prompt to provide specific SEO best practices to adopt during the title generation.\n",
"6. Rephrased the system_prompt to ensure the model provides a single Title (not a list of suggestions).\n",
"7. Includes a function to remove the unneeded thinking/reasoning output from the model response (Deepseek).\n",
"8. Users are encouraged to assess and rank the suggested titles using any headline analyzer tool online.\n",
" Example: https://www.isitwp.com/headline-analyzer/. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "115004a8-747a-4954-9580-1ed548f80336",
"metadata": {},
"outputs": [],
"source": [
"# install required libraries if they were not part of the requirements.txt\n",
"!pip install selenium\n",
"!pip install undetected-chromedriver"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e773daa6-d05e-49bf-ad8e-a8ed4882b77e",
"metadata": {},
"outputs": [],
"source": [
"# confirming Llama is loaded\n",
"!ollama pull llama3.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "279b0c00-9bb0-4c7f-9c6d-aa0b108274b9",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"import undetected_chromedriver as uc\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"import time\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4730d8d-3e20-4f3c-a4ff-ed2ac0a8aa27",
"metadata": {},
"outputs": [],
"source": [
"# set environment variables for OpenAi\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# validate API Key\n",
"if not api_key:\n",
" raise ValueError(\"No API key was found! Please check the .env file.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1abbb826-de66-498c-94d8-33369ad01885",
"metadata": {},
"outputs": [],
"source": [
"# constants\n",
"MODELS = { 'GPT': 'gpt-4o-mini', \n",
" 'LLAMA': 'llama3.2', \n",
" 'DEEPSEEK': 'deepseek-r1:1.5b'\n",
" }\n",
"\n",
"CLIENTS = { 'GPT': OpenAI(), \n",
" 'LLAMA': OpenAI(base_url='http://localhost:11434/v1', api_key='ollama'),\n",
" 'DEEPSEEK': OpenAI(base_url='http://localhost:11434/v1', api_key='ollama') \n",
" }\n",
"\n",
"# path to Chrome\n",
"CHROME_PATH = \"C:/Program Files/Google/Chrome/Application/chrome.exe\""
]
},
{
"cell_type": "markdown",
"id": "6f490fe4-32d5-41f3-890d-ecf4e5e01dd4",
"metadata": {},
"source": [
"**Webcrawler** (based on the code from __/week1/community-contributions/day1-webscraping-selenium-for-javascript.ipynb__)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2a1cf7a-044f-4a9c-b76e-8f112d384550",
"metadata": {},
"outputs": [],
"source": [
"class WebsiteCrawler:\n",
"    \"\"\"Scrape the <main> content of a JavaScript-rendered page with Selenium.\n",
"\n",
"    After construction, `title` and `text` hold the page title and the visible text of\n",
"    the <main> element; if loading or parsing fails they are set to \"Error occurred\"\n",
"    and \"\". Formatting the object as a string yields `text`.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url, wait_time=20, chrome_path=None):\n",
"        \"\"\"\n",
"        Initialize the WebsiteCrawler using Selenium to scrape JavaScript-rendered content.\n",
"\n",
"        url: page to load.\n",
"        wait_time: timeout passed to WebDriverWait while waiting for a <main> element.\n",
"        chrome_path: optional path to the Chrome binary to launch.\n",
"        \"\"\"\n",
"        self.url = url\n",
"        self.wait_time = wait_time\n",
"\n",
"        options = uc.ChromeOptions()\n",
"        options.add_argument(\"--disable-gpu\")\n",
"        options.add_argument(\"--no-sandbox\")\n",
"        options.add_argument(\"--disable-dev-shm-usage\")\n",
"        options.add_argument(\"--disable-blink-features=AutomationControlled\")\n",
"        # options.add_argument(\"--headless=new\")  # For Chrome >= 109 - unreliable on my end!\n",
"        options.add_argument(\"start-maximized\")\n",
"        options.add_argument(\n",
"            \"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"        )\n",
"        if chrome_path:\n",
"            options.binary_location = chrome_path\n",
"\n",
"        self.driver = uc.Chrome(options=options)\n",
"\n",
"        try:\n",
"            # Load the URL\n",
"            self.driver.get(url)\n",
"\n",
"            # Wait for Cloudflare or similar checks\n",
"            time.sleep(10)\n",
"\n",
"            # Ensure the main content is loaded\n",
"            WebDriverWait(self.driver, self.wait_time).until(\n",
"                EC.presence_of_element_located((By.TAG_NAME, \"main\"))\n",
"            )\n",
"\n",
"            # Extract the main content\n",
"            main_content = self.driver.find_element(By.CSS_SELECTOR, \"main\").get_attribute(\"outerHTML\")\n",
"\n",
"            # Parse with BeautifulSoup\n",
"            soup = BeautifulSoup(main_content, \"html.parser\")\n",
"            self.title = self.driver.title if self.driver.title else \"No title found\"\n",
"            self.text = soup.get_text(separator=\"\\n\", strip=True)\n",
"\n",
"        except Exception as e:\n",
"            # best effort: report the problem and leave placeholder values behind\n",
"            print(f\"Error occurred: {e}\")\n",
"            self.title = \"Error occurred\"\n",
"            self.text = \"\"\n",
"\n",
"        finally:\n",
"            # always close the browser, whether or not scraping succeeded\n",
"            self.driver.quit()\n",
"\n",
"    def __str__(self):\n",
"        \"\"\"Return the scraped text, so formatting the crawler into a prompt inserts the article.\"\"\"\n",
"        return self.text\n"
]
},
{
"cell_type": "markdown",
"id": "592d8f86-fbf7-4b16-a69d-468030d72dc4",
"metadata": {},
"source": [
"### Prompts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1914afad-dbd8-4c1f-8e68-80b0e5d743a9",
"metadata": {},
"outputs": [],
"source": [
"# system prompt\n",
"system_prompt = \"\"\"\n",
" You are an experienced SEO-focused copywriter. The user will provide an article, and your task is to analyze its content and generate a single, most effective, keyword-optimized title to maximize SEO performance.\n",
"\n",
"Instructions:\n",
"Ignore irrelevant content, such as the current title (if any), navigation menus, advertisements, or unrelated text.\n",
"Prioritize SEO best practices, considering:\n",
"Keyword relevance and search intent (informational, transactional, etc.).\n",
"Readability and engagement.\n",
"Avoiding keyword stuffing.\n",
"Ensure conciseness and clarity, keeping the title under 60 characters when possible for optimal SERP display.\n",
"Use a compelling structure that balances informativeness and engagement, leveraging formats like:\n",
"Listicles (\"10 Best Strategies for…\")\n",
"How-to guides (\"How to Boost…\")\n",
"Questions (\"What Is the Best Way to…\")\n",
"Power words to enhance click-through rates (e.g., \"Proven,\" \"Ultimate,\" \"Essential\").\n",
"Provide only one single, best title—do not suggest multiple options.\n",
"Limit the answer to the following Response Format (Markdown):\n",
"Optimized Title: [Provide only one title here]\n",
"Justification: [Explain why this title is effective for SEO]\n",
"\n",
" \"\"\""
]
},
{
"cell_type": "markdown",
"id": "b0486867-6d38-4cb5-91d4-fb60952c3a9b",
"metadata": {},
"source": [
"**Provide the article URL and get its content for analysis**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddd76319-13ce-480b-baa7-cab6a5c88168",
"metadata": {},
"outputs": [],
"source": [
"# article url - change to any other article URL\n",
"article_url = \"https://searchengineland.com/seo-trends-2025-447745\"\n",
"\n",
"# get article content\n",
"article = WebsiteCrawler(url=article_url, chrome_path=CHROME_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "176cfac7-5e6d-4d4a-a1c4-1b63b60de1f7",
"metadata": {},
"outputs": [],
"source": [
"# user prompt: fixed instructions followed by the scraped article text\n",
"user_prompt_instructions = \"\"\"\n",
"Following the article to be analyzed to suggest a title. Limit the answer to the following Response Format (Markdown): \n",
"Optimized Title: [Provide only one title here]\n",
"Justification: [Explain why this title is effective for SEO].\n",
"\"\"\"\n",
"\n",
"# Use the crawler's extracted text: formatting the WebsiteCrawler object itself would\n",
"# insert its object representation rather than the article. Building from a separately\n",
"# named template keeps this cell idempotent, so re-running it does not append the\n",
"# article a second time.\n",
"user_prompt = f\"{user_prompt_instructions} {article.text}\"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c45fc7d7-08c9-4e34-b427-b928a219bb94",
"metadata": {},
"outputs": [],
"source": [
"# message list\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f67b881f-1040-4cf7-82c5-e85f4c0bd252",
"metadata": {},
"outputs": [],
"source": [
"# get suggested title\n",
"def get_title(model, **kwargs):\n",
"    \"\"\"Request a title suggestion from `model` and return the API response object.\n",
"\n",
"    `model` is a key shared by the CLIENTS and MODELS dictionaries. Only the optional\n",
"    `stream` keyword is forwarded to the API; any other keyword argument is ignored.\n",
"    \"\"\"\n",
"    request_args = {\"model\": MODELS[model], \"messages\": messages}\n",
"\n",
"    # forward `stream` only when the caller supplied it\n",
"    if 'stream' in kwargs:\n",
"        request_args[\"stream\"] = kwargs['stream']\n",
"\n",
"    return CLIENTS[model].chat.completions.create(**request_args)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8988d6ff-076a-4eae-baf4-26a8d6a2bc44",
"metadata": {},
"outputs": [],
"source": [
"# filter response from model verbose - like Deepseek reasoning/thinking verbose\n",
"def filter_response(response):\n",
"    \"\"\"Return the part of `response` that starts at the last 'Optimized Title:' marker.\n",
"\n",
"    Cutting at the last occurrence drops any reasoning text a model emits before its\n",
"    answer. If the marker is absent, the response is returned in full instead of\n",
"    failing. A line break is inserted before a bold '**Justification:**' label so it\n",
"    renders on its own line.\n",
"    \"\"\"\n",
"    marker = 'Optimized Title:'\n",
"    start = response.rfind(marker)\n",
"    filtered_response = response[start:] if start > -1 else response\n",
"\n",
"    # insert line break to preserve format\n",
"    return filtered_response.replace(\"**Justification:**\", \"\\n**Justification:**\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e9e99cf-5e25-4a1f-ab11-a2255e318671",
"metadata": {},
"outputs": [],
"source": [
"# display suggested title\n",
"def display_title(model):\n",
"    \"\"\"Fetch the title suggested by `model` and render it as Markdown.\n",
"\n",
"    `model` is a key of MODELS/CLIENTS. 'GPT' is requested with stream=True and the\n",
"    output is refreshed as chunks arrive; any other model is fetched in a single call\n",
"    and passed through filter_response before display.\n",
"    \"\"\"\n",
"    display(Markdown(f\"### {model} (___{MODELS[model]}___) Answer\\n\\n_______\"))\n",
"\n",
"    if model == 'GPT':\n",
"        display_handle = display(Markdown(\"\"), display_id=True)\n",
"        raw_response = \"\"\n",
"        for chunk in get_title(model=model, stream=True):\n",
"            raw_response += chunk.choices[0].delta.content or ''\n",
"            # Format a copy of the raw text on every refresh. Re-applying the label\n",
"            # replacements to already formatted text would wrap the labels in one more\n",
"            # pair of ** on each chunk.\n",
"            response = (\n",
"                raw_response.replace(\"```\",\"\")\n",
"                .replace(\"markdown\", \"\")\n",
"                .replace(\"Optimized Title:\", \"**Optimized Title:**\")\n",
"                .replace(\"Justification:\", \"**Justification:**\")\n",
"            )\n",
"            update_display(Markdown(response), display_id=display_handle.display_id)\n",
"    else:\n",
"        response = get_title(model=model)\n",
"        response = response.choices[0].message.content\n",
"        response = filter_response(response)\n",
"        response = (\n",
"            response.replace(\"Optimized Title:\", \"**Optimized Title:**\")\n",
"            .replace(\"Justification:\", \"**Justification:**\")\n",
"        )\n",
"        display(Markdown(response))"
]
},
{
"cell_type": "markdown",
"id": "947b42ed-5b43-486d-8af3-e5b671c1fd0e",
"metadata": {},
"source": [
"### Get OpenAI Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb6f66e3-ab99-4f76-9358-896cb43c1fa1",
"metadata": {},
"outputs": [],
"source": [
"# get and display openAi suggested title\n",
"display_title(model='GPT')"
]
},
{
"cell_type": "markdown",
"id": "70073ebf-a00a-416b-854d-642d450cd99b",
"metadata": {},
"source": [
"### Get Llama Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caa190bb-de5f-45cc-b671-5d62688f7b25",
"metadata": {},
"outputs": [],
"source": [
"# get and display Llama suggested title\n",
"display_title(model='LLAMA')"
]
},
{
"cell_type": "markdown",
"id": "811edc4f-20e2-482d-ac89-fae9d1b70bed",
"metadata": {},
"source": [
"### Get Deepseek Suggested Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "082628e4-ff4c-46dd-ae5f-76578eb017ad",
"metadata": {},
"outputs": [],
"source": [
"# get and display Deepseek title\n",
"display_title(model='DEEPSEEK')"
]
},
{
"cell_type": "markdown",
"id": "7fc404a6-3a91-4c09-89de-867d3d69b4b2",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"### Observations\n",
"\n",
"1. **Selenium:** The headless option (__options.add_argument(\"--headless=new\")__), while ideal to speed up the scanning process, presented problems while scanning several websites (including openai.com and canva.com).\n",
"2. **Deepseek challenges:**\\\n",
"    a. It always returns its thinking/reasoning output, which, while helpful to understand how it works, is not always\n",
"    required, such as in this example code. A new function (**filter_response**) was created to remove the extra output.\\\n",
" b. It is unreliable with the response, sometimes returning the required format for the response instead of the\n",
" actual response. For example, for the title, it may sometimes return:\n",
" \n",
" **Optimized Title:** \\[The user wants the suggested title here]\n",
" \n",
"### Suggested future improvements\n",
"\n",
"1. Add the logic that would allow each model to assess the recommendations from the different models and \n",
" select the best among these.\n",
"2. Add the logic to leverage an API (if available) that automatically assesses the suggested titles."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1af8260b-5ba1-4eeb-acd0-02de537b1bf4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -234,7 +234,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "llms",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -252,5 +252,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "29ddd15d-a3c5-4f4e-a678-873f56162724",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"import ollama"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "479ff514-e8bd-4985-a572-2ea28bb4fa40",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠋ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠙ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠹ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠸ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠼ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠴ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠦ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠧ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠇ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest ⠏ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[1Gpulling manifest \u001b[K\n",
"pulling 2bada8a74506... 100% ▕████████████████▏ 4.7 GB \u001b[K\n",
"pulling 66b9ea09bd5b... 100% ▕████████████████▏ 68 B \u001b[K\n",
"pulling eb4402837c78... 100% ▕████████████████▏ 1.5 KB \u001b[K\n",
"pulling 832dd9e00a68... 100% ▕████████████████▏ 11 KB \u001b[K\n",
"pulling 2f15b3218f05... 100% ▕████████████████▏ 487 B \u001b[K\n",
"verifying sha256 digest \u001b[K\n",
"writing manifest \u001b[K\n",
"success \u001b[K\u001b[?25h\u001b[?2026l\n"
]
}
],
"source": [
"# Let's just make sure the model is loaded\n",
"# The model name is defined once so the pull command and the later chat call cannot drift apart\n",
"\n",
"MODEL = \"qwen2.5\"\n",
"!ollama pull {MODEL}"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"    \"\"\"\n",
"    A web page fetched with requests and parsed with BeautifulSoup.\n",
"\n",
"    Attributes:\n",
"        url: the address that was requested\n",
"        title: the page title, or \"No title found\" when none is available\n",
"        text: page text with script, style, img and input elements removed\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url):\n",
"        \"\"\"\n",
"        Create this Website object from the given url using the BeautifulSoup library\n",
"        \"\"\"\n",
"        self.url = url\n",
"        response = requests.get(url, headers=headers)\n",
"        soup = BeautifulSoup(response.content, 'html.parser')\n",
"        # .string is None for an empty <title> or one with nested tags, so fall back then too\n",
"        self.title = (soup.title.string if soup.title else None) or \"No title found\"\n",
"        # Responses without a <body> (fragments, non-HTML payloads) leave soup.body as None;\n",
"        # use the whole document in that case instead of failing on None\n",
"        root = soup.body if soup.body is not None else soup\n",
"        for irrelevant in root([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"        self.text = root.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a531b8f6-d4f8-4140-b54d-bcf280bd7a99",
"metadata": {},
"outputs": [],
"source": [
"# System message sent with every summary request\n",
"system_prompt = (\n",
"    \"You are an assistant that analyzes the contents of a website \"\n",
"    \"and provides a short summary, ignoring text that might be navigation related. \"\n",
"    \"Respond in markdown.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6b46ff43-4817-431e-8335-8d2cc9957910",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website):\n",
"    \"\"\"\n",
"    Build the user prompt for a website: a title line, the summary instructions,\n",
"    then the page text taken from website.text.\n",
"    \"\"\"\n",
"    title_line = f\"You are looking at a website titled {website.title}\"\n",
"    instructions = (\n",
"        \"\\nThe contents of this website is as follows; \"\n",
"        \"please provide a summary of this website in markdown. \"\n",
"        \"If it includes news or announcements, then summarize these too.(only if they are present)\\n\\n\"\n",
"    )\n",
"    return title_line + instructions + website.text"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "13a3a001-5d91-4269-ab60-493bbf35bda4",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(website):\n",
"    \"\"\"Return the system and user chat messages used to summarize the given website.\"\"\"\n",
"    system_message = {\"role\": \"system\", \"content\": system_prompt}\n",
"    user_message = {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
"    return [system_message, user_message]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c61ad738-9395-415d-b88b-d4a70d4331aa",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url):\n",
"    \"\"\"Fetch the page at url, ask MODEL for a summary via ollama.chat and return the reply text.\"\"\"\n",
"    chat_messages = messages_for(Website(url))\n",
"    reply = ollama.chat(model=MODEL, messages=chat_messages)\n",
"    return reply['message']['content']"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "bdbcfa75-980b-4542-872d-af8b20546b5d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'```markdown\\n# Tailwind CSS Cheat Sheet Summary\\n\\nThis website serves as a comprehensive guide for developers using Tailwind CSS, providing quick access to commonly used utility classes and configurations. The content is organized into sections such as typography, layout, colors, shadows, and more, making it easy for users to find specific styles or settings.\\n\\n- **Typography**: Includes various font sizes, weights, line heights, and other typographic utilities.\\n- **Layout**: Features columns, grid, flexbox, spacing, and responsive design utilities.\\n- **Colors**: Lists predefined color palettes and utility classes for color manipulation.\\n- **Shadows**: Provides options to add depth and dimension to elements through shadow effects.\\n- **Other Sections**: Covers forms, animations, and more, with concise descriptions and examples.\\n\\nThe site is designed to be a one-stop reference tool, allowing developers to quickly apply Tailwind CSS styles without having to consult the official documentation every time.\\n```'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summarize(\"https://www.creative-tim.com/twcomponents/cheatsheet/\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "817e6f73-1abe-4f79-9010-f4264e0f324a",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
"    \"\"\"Summarize the page at url and render the summary as Markdown in the cell output.\"\"\"\n",
"    display(Markdown(summarize(url)))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "504c19cf-9add-4a78-a028-fe2710e0604d",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"# Summary\n",
"\n",
"**Home Page:**\n",
"- The website is titled \"Home - Edward Donner\" and introduces Ed, who enjoys coding, experimenting with large language models (LLMs), DJing, and engaging in Hacker News.\n",
"- He co-founded Nebula.io, an AI company focusing on helping people discover their potential. The platform uses proprietary LLMs for talent discovery and has been patented.\n",
"\n",
"**News/Announcements:**\n",
"- **January 23, 2025:** LLM Workshop Hands-on with Agents\n",
"- **December 21, 2024:** Welcome, SuperDataScientists!\n",
"- **November 13, 2024:** Mastering AI and LLM Engineering Resources\n",
"- **October 16, 2024:** From Software Engineer to AI Data Scientist resources\n",
"\n",
"**Connect Section:**\n",
"- Provides ways to get in touch with Ed, including email, LinkedIn, Twitter, Facebook, and a newsletter subscription form.\n",
"\n",
"**Additional Content:**\n",
"- **Connect Four:** Describes it as an arena where LLMs compete against each other.\n",
"- **About Page:** Further details about Ed's background and Nebula.io."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_summary('https://edwarddonner.com')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20d621cb-6bfb-41a6-bd98-a51ef0a8b158",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5",
"metadata": {},
"source": [
"# End of week 1 exercise\n",
"\n",
"To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question, \n",
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1070317-3ed9-4659-abe3-828943230e03",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import os\n",
"import requests\n",
"import json\n",
"from typing import List\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"import ollama"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a456906-915a-4bfd-bb9d-57e505c5093f",
"metadata": {},
"outputs": [],
"source": [
"# constants\n",
"MODEL_GPT = 'gpt-4o-mini'  # model name passed to the OpenAI client in get_answer_gpt\n",
"MODEL_LLAMA = 'llama3.2'  # model name passed to ollama.generate in get_answer_ollama"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8d7923c-5f28-4c30-8556-342d7c8497c1",
"metadata": {},
"outputs": [],
"source": [
"# set up environment\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Sanity check only: the key must be set, start with 'sk-proj-' and be longer than 10 characters\n",
"print(\n",
"    \"API key looks good so far\"\n",
"    if api_key and api_key.startswith('sk-proj-') and len(api_key)>10\n",
"    else \"There might be a problem with your API key? Please visit the troubleshooting notebook!\"\n",
")\n",
"\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f0d0137-52b0-47a8-81a8-11a90a010798",
"metadata": {},
"outputs": [],
"source": [
"# System message used when answering the technical question below\n",
"system_prompt = (\n",
"    \"You are provided with a technical question. \"\n",
"    \"You are answering by providing a quick explanation and giving some examples.\\n\"\n",
")\n",
"\n",
"# here is the question; type over this to ask something new\n",
"question = \"\"\"\n",
"Please explain what this code does and why:\n",
"yield from {book.get(\"author\") for book in books if book.get(\"author\")}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60ce7000-a4a5-4cce-a261-e75ef45063b4",
"metadata": {},
"outputs": [],
"source": [
"# Get gpt-4o-mini to answer, with streaming\n",
"def get_answer_gpt():\n",
" stream = openai.chat.completions.create(\n",
" model=MODEL_GPT,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": question}\n",
" ],\n",
" stream=True\n",
" )\n",
"\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538",
"metadata": {},
"outputs": [],
"source": [
"# Get Llama 3.2 to answer\n",
"def get_answer_ollama():\n",
" stream = ollama.generate(\n",
" MODEL_LLAMA,\n",
" question,\n",
" stream=True\n",
" )\n",
" \n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in stream:\n",
" response += chunk['response'] or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a859eb1-23fa-40dd-ba91-b35084433a00",
"metadata": {},
"outputs": [],
"source": [
"get_answer_gpt()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c73f046-da3a-49a5-8a74-4b8a86a9032a",
"metadata": {},
"outputs": [],
"source": [
"get_answer_ollama()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bea20f33-a710-44ab-9a4d-856db05e4201",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}