Merge branch 'main' of github.com:ed-donner/llm_engineering

This commit is contained in:
Edward Donner
2025-06-10 19:28:01 -04:00
24 changed files with 9488 additions and 0 deletions

View File

@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "38795b24-9801-4cfb-a000-ccd7f41e6128",
"metadata": {},
"source": [
"\n",
"# 🧠 Multi-Product Competitor Intelligence Summarizer using Web Scraping + LLM\n",
"\n",
"This notebook scrapes product pages using `Selenium`, collects the product information, and summarizes key features and comparison insights using `Ollama (LLaMA3) and OpenAI`.\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
"metadata": {},
"outputs": [],
"source": [
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
"\n",
"system_prompt = \"Summarize the following product information for comparison.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38245e18",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 📦 Install required packages (run once)\n",
"!pip install selenium bs4 requests\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88ae528b-aefe-4c64-b927-676e739194af",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4a831a5",
"metadata": {},
"outputs": [],
"source": [
"def summarize_with_openai(text, model=\"gpt-4o-mini\"):\n",
" response = openai.chat.completions.create(\n",
" model=model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": text}\n",
" ],\n",
" temperature=0.7\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef65cd72",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# ⚙️ Selenium setup (headless)\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.common.by import By\n",
"import time\n",
"\n",
"def scrape_text_from_url(url):\n",
" options = Options()\n",
" options.add_argument(\"--headless=new\")\n",
" driver = webdriver.Chrome(options=options)\n",
" driver.get(url)\n",
" time.sleep(3)\n",
" \n",
" # You can tune this selector depending on the site\n",
" body = driver.find_element(By.TAG_NAME, 'body')\n",
" text = body.text\n",
" driver.quit()\n",
" return text.strip()\n",
"\n"
]
},
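{
"cell_type": "code",
"execution_count": null,
"id": "a1f20d3e",
"metadata": {},
"outputs": [],
"source": [
"# Optional variant (a sketch, not part of the original flow): wait explicitly for the\n",
"# <body> element instead of a fixed time.sleep(3), which can be flaky on slow pages.\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
"def scrape_text_from_url_waiting(url, timeout=15):\n",
"    options = Options()\n",
"    options.add_argument(\"--headless=new\")\n",
"    driver = webdriver.Chrome(options=options)\n",
"    try:\n",
"        driver.get(url)\n",
"        # Block until the body tag is present, up to `timeout` seconds\n",
"        WebDriverWait(driver, timeout).until(\n",
"            EC.presence_of_element_located((By.TAG_NAME, \"body\"))\n",
"        )\n",
"        return driver.find_element(By.TAG_NAME, \"body\").text.strip()\n",
"    finally:\n",
"        driver.quit()"
]
},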
{
"cell_type": "code",
"execution_count": null,
"id": "36e19014",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 🧠 LLM Prompting using Ollama (local llama3)\n",
"import subprocess\n",
"\n",
"def summarize_with_ollama(text):\n",
" prompt = f\"Summarize the following product description:\\n\\n{text}\\n\\nSummary:\"\n",
" try:\n",
" print(\"inside ollama\")\n",
" result = subprocess.run(\n",
" [\"ollama\", \"run\", \"llama3.2\"],\n",
" input=prompt,\n",
" capture_output=True, text=True, check=True, encoding=\"utf-8\"\n",
" )\n",
" print(\"git result\")\n",
" return result.stdout.strip()\n",
" except subprocess.CalledProcessError as e:\n",
" return f\"Error running ollama: {e.stderr}\"\n"
]
},
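{
"cell_type": "code",
"execution_count": null,
"id": "c4b91e7a",
"metadata": {},
"outputs": [],
"source": [
"# Alternative sketch: call Ollama through its OpenAI-compatible endpoint instead of a\n",
"# subprocess, mirroring summarize_with_openai above. Assumes Ollama is serving locally\n",
"# on its default port 11434 and that the llama3.2 model has already been pulled.\n",
"ollama_client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")\n",
"\n",
"def summarize_with_ollama_api(text):\n",
"    response = ollama_client.chat.completions.create(\n",
"        model=\"llama3.2\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system_prompt},\n",
"            {\"role\": \"user\", \"content\": text}\n",
"        ]\n",
"    )\n",
"    return response.choices[0].message.content"
]
},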
{
"cell_type": "code",
"execution_count": null,
"id": "e04cea6e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 🔁 Analyze multiple product URLs and summarize\n",
"product_urls = {\n",
" \"iPhone 15 Pro\": \"https://www.apple.com/in/iphone-15-pro/\",\n",
" \"Samsung S24 Ultra\": \"https://www.samsung.com/in/smartphones/galaxy-s24-ultra/\",\n",
"}\n",
"\n",
"product_texts = {}\n",
"\n",
"for name, url in product_urls.items():\n",
" print(f\"Scraping {name} ...\")\n",
" product_texts[name] = scrape_text_from_url(url)\n"
]
},
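{
"cell_type": "code",
"execution_count": null,
"id": "d8a3f510",
"metadata": {},
"outputs": [],
"source": [
"# Optional guard (a sketch): scraped product pages can be very long, so trim each text\n",
"# before sending it to the models. The 8,000-character cap is an assumption, not a\n",
"# measured token limit - adjust it to taste.\n",
"MAX_CHARS = 8000\n",
"\n",
"for name in product_texts:\n",
"    product_texts[name] = product_texts[name][:MAX_CHARS]"
]
},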
{
"cell_type": "code",
"execution_count": null,
"id": "5ebd5a20",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 📄 Display side-by-side summaries\n",
"for name, text in product_texts.items():\n",
" print(f\"\\n🔹 {name} Summary with Ollama:\")\n",
" print(summarize_with_ollama(text))\n",
"\n",
" print(f\"\\n🔹 {name} Summary with OpenAI GPT:\")\n",
" print(summarize_with_openai(text))\n",
" print(\"=\"*100)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "935e0081-ccf5-4d9a-a984-ee82c77c04a2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,330 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "70a27b7c-3f3c-4d82-bdea-381939ce98bd",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"# My Adverserial Conversation\n",
"J. McInerney, 26 May 2025\n",
"I am taking some cells from the Week2, Day 1 notebook and modifying them so I can have an adverserial conversation between OpenAI and a local LLM (gemma3:12b). First I will just reimplement what Ed did in the Week2, Day 1 notebook. Then I will try a deeper conversation."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ec14834-4cf2-4f1d-9128-4ddad7b91804",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"#import anthropic\n",
"from IPython.display import Markdown, display, update_display"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98618ab4-075f-438c-b85b-d146e5299a87",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"# Print the key prefixes to help with any debugging\n",
"\n",
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if openai_api_key:\n",
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
"else:\n",
" print(\"OpenAI API Key not set\")\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95e69172-4601-4eb0-a7af-19abebd4bf56",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Connect to OpenAI, Anthropic\n",
"openai = OpenAI()"
]
},
{
"cell_type": "markdown",
"id": "98f47886-71ae-4b41-875a-1b97a5eb0ddc",
"metadata": {},
"source": [
"## An adversarial conversation between Chatbots..\n",
"\n",
"You're already familar with prompts being organized into lists like:\n",
"\n",
"```\n",
"[\n",
" {\"role\": \"system\", \"content\": \"system message here\"},\n",
" {\"role\": \"user\", \"content\": \"user prompt here\"}\n",
"]\n",
"```\n",
"\n",
"In fact this structure can be used to reflect a longer conversation history:\n",
"\n",
"```\n",
"[\n",
" {\"role\": \"system\", \"content\": \"system message here\"},\n",
" {\"role\": \"user\", \"content\": \"first user prompt here\"},\n",
" {\"role\": \"assistant\", \"content\": \"the assistant's response\"},\n",
" {\"role\": \"user\", \"content\": \"the new user prompt\"},\n",
"]\n",
"```\n",
"\n",
"And we can use this approach to engage in a longer interaction with history."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74125f8b-042e-4236-ad3d-6371ce5a1493",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Let's make a conversation between GPT-4o-mini and Gemma3:12b\n",
"# We're using cheap versions of models so the costs will be minimal\n",
"\n",
"gpt_model = \"gpt-4o-mini\"\n",
"local_model = 'gemma3:12b'\n",
"\n",
"gpt_system = \"You are a chatbot who is very argumentative; \\\n",
"you disagree with anything in the conversation and you challenge everything, in a snarky way.\"\n",
"\n",
"local_system = \"You are a very polite, courteous chatbot. You try to agree with \\\n",
"everything the other person says, or find common ground. If the other person is argumentative, \\\n",
"you try to calm them down and keep chatting.\"\n",
"\n",
"gpt_messages = [\"Hi there\"]\n",
"local_messages = [\"Hi\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f94d9232-f82a-4eab-9d89-bd9815f260f0",
"metadata": {},
"outputs": [],
"source": [
"def call_gpt():\n",
" messages = [{\"role\": \"system\", \"content\": gpt_system}]\n",
" for gpt, local in zip(gpt_messages, local_messages):\n",
" messages.append({\"role\": \"assistant\", \"content\": gpt})\n",
" messages.append({\"role\": \"user\", \"content\": local})\n",
" completion = openai.chat.completions.create(\n",
" model=gpt_model,\n",
" messages=messages\n",
" )\n",
" return completion.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6445453-31be-4c63-b350-957b7d99b6f4",
"metadata": {},
"outputs": [],
"source": [
"call_gpt()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc51f776-f6e2-41af-acb5-cbdf03fdf530",
"metadata": {},
"outputs": [],
"source": [
"basellm = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
"def call_local():\n",
" messages = []\n",
" for gpt, local_message in zip(gpt_messages, local_messages):\n",
" messages.append({\"role\": \"user\", \"content\": gpt})\n",
" messages.append({\"role\": \"assistant\", \"content\": local_message})\n",
" messages.append({\"role\": \"user\", \"content\": gpt_messages[-1]})\n",
" \n",
" completion = basellm.chat.completions.create(\n",
" model=local_model,\n",
" messages=messages\n",
" )\n",
" \n",
" return completion.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16fd90cb-ebfd-4a4f-ae49-70568ae8fbb1",
"metadata": {},
"outputs": [],
"source": [
"call_local()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "429eeefb-f080-4a57-8f2d-ff3d4237afab",
"metadata": {},
"outputs": [],
"source": [
"call_gpt()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ce847ed-521d-4be5-895b-44088de499e1",
"metadata": {},
"outputs": [],
"source": [
"gpt_messages = [\"Hi there\"]\n",
"local_messages = [\"Hi\"]\n",
"\n",
"print(f\"GPT:\\n{gpt_messages[0]}\\n\")\n",
"print(f\"local:\\n{local_messages[0]}\\n\")\n",
"\n",
"for i in range(5):\n",
" gpt_next = call_gpt()\n",
" print(f\"GPT:\\n{gpt_next}\\n\")\n",
" gpt_messages.append(gpt_next)\n",
" \n",
" local_next = call_local()\n",
" print(f\"local:\\n{local_next}\\n\")\n",
" local_messages.append(local_next)"
]
},
{
"cell_type": "markdown",
"id": "d3b1707a-2903-4529-b6eb-95a874a14e78",
"metadata": {},
"source": [
"## Let's try a more thoughful conversation\n",
"The two chatbots will engage in a friendly discussion on whether the US should have entered World War I in 1917. They are both open minded so they can learn from each other."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abb733bf-a5d3-4718-8741-8e8abfd3a088",
"metadata": {},
"outputs": [],
"source": [
"# Let's make a conversation between GPT-4o-mini and Gemma3:12b\n",
"# We're using cheap versions of models so the costs will be minimal\n",
"\n",
"gpt_system = \"You are a chatbot who believes it was a mistake for the US to enter World War I; \\\n",
"you are open to other arguments, but you feel the evidence suggests the world would have been \\\n",
"better off if the US had stayed isolationalist. You consider counter arguments but also express \\\n",
"your own arguments.\"\n",
"\n",
"local_system = \"You are a chatbot who believes the US made the right decision entering World War I in \\\n",
"1917. Overall, the world is a better place for it. You are open minded but believe the evidence \\\n",
"supports this view. You consider counter arguments but also express your own arguments.\"\n",
"\n",
"gpt_messages = [\"It was such a mistake for the US to enter WWI\"]\n",
"local_messages = [\"Why do you say that?\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "569e18a3-25cd-46d5-8edb-713ff149d008",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"print(f\"GPT:\\n{gpt_messages[0]}\\n\")\n",
"print(f\"local:\\n{local_messages[0]}\\n\")\n",
"\n",
"for i in range(5):\n",
" gpt_next = call_gpt()\n",
" print(f\"GPT:\\n{gpt_next}\\n\")\n",
" gpt_messages.append(gpt_next)\n",
" \n",
" local_next = call_local()\n",
" print(f\"local:\\n{local_next}\\n\")\n",
" local_messages.append(local_next)"
]
},
{
"cell_type": "markdown",
"id": "d29df7da-eaa3-4c98-b913-05185b62cffe",
"metadata": {},
"source": [
"## Conclusion\n",
"I am amazed at how insightful this conversation was. Not only did they explore all the pros and cons, they began applying those lessons to current day foreign policy. This looks like a very good way to explore a topic. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b486b2d6-40da-4745-8cbf-1afd2be22caa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,357 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "53211323-6a09-452a-b471-98e22d92bfc2",
"metadata": {},
"source": [
"# 🌐 WebPage Summarizer\n",
"---\n",
"- 🌍 **Task:** Summarizing webpage content using AI. \n",
"- 🧠 **Model:** OpenAI's ``gpt-4o-mini`` and ``llama3.2:3b`` for text summarization. \n",
"- 🕵️‍♂️ **Data Extraction:** Selenium for handling both static and JavaScript-rendered websites. \n",
"- 📌 **Output Format:** Markdown-formatted summaries. \n",
"- 🔗 **Scope:** Processes only the given webpage URL (not the entire site). \n",
"- 🚀 **Tools:** Python, Requests, Selenium, BeautifulSoup, OpenAI API, Ollama. \n",
"- 🧑‍💻 **Skill Level:** Beginner.\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key (for GPT model)\n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"- Google Chrome browser installed\n",
"\n",
"**✨ This script handles both JavaScript and non-JavaScript websites using Selenium with Chrome WebDriver for reliable content extraction from modern web applications.**\n",
"\n",
"Let's get started and automate website summarization! 🚀\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/01_basic_llm_project.jpg?raw=true)\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "d70aa4b0",
"metadata": {},
"source": [
"## 🛠️ Environment Setup & Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebf2fa36",
"metadata": {},
"outputs": [],
"source": [
"%pip install selenium webdriver-manager"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1dcf1d9d-c540-4900-b14e-ad36a28fc822",
"metadata": {},
"outputs": [],
"source": [
"# ===========================\n",
"# System & Environment\n",
"# ===========================\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"# ===========================\n",
"# Web Scraping\n",
"# ===========================\n",
"import time\n",
"from bs4 import BeautifulSoup\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
"# ===========================\n",
"# AI-related\n",
"# ===========================\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"import ollama"
]
},
{
"cell_type": "markdown",
"id": "cc20642b",
"metadata": {},
"source": [
"## 🔐 Model Configuration & Authentication"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8598c299-05ca-492e-b085-6bcc2f7dda0d",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if not api_key:\n",
" raise ValueError(\"OPENAI_API_KEY not found in environment variables\")\n",
"\n",
"print(\"✅ API key loaded successfully!\")\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8098defb",
"metadata": {},
"outputs": [],
"source": [
"MODEL_OPENAI = \"gpt-4o-mini\"\n",
"MODEL_OLLAMA = \"llama3.2:3b\""
]
},
{
"cell_type": "markdown",
"id": "2bd1d83f",
"metadata": {},
"source": [
"## 🌐 Web Scraping Infrastructure"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6fe5114",
"metadata": {},
"outputs": [],
"source": [
"class WebsiteCrawler:\n",
" def __init__(self, url):\n",
" self.url = url\n",
" self.title = \"\"\n",
" self.text = \"\"\n",
" self.scrape()\n",
"\n",
" def scrape(self):\n",
" try:\n",
" # Chrome options\n",
" chrome_options = Options()\n",
" chrome_options.add_argument(\"--headless\")\n",
" chrome_options.add_argument(\"--no-sandbox\")\n",
" chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
" chrome_options.add_argument(\"--disable-gpu\")\n",
" chrome_options.add_argument(\"--window-size=1920,1080\")\n",
" chrome_options.add_argument(\"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\")\n",
"\n",
" # Try to find Chrome\n",
" chrome_paths = [\n",
" r\"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe\",\n",
" r\"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe\",\n",
" r\"C:\\Users\\{}\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe\".format(os.getenv('USERNAME')),\n",
" ]\n",
"\n",
" chrome_binary = None\n",
" for path in chrome_paths:\n",
" if os.path.exists(path):\n",
" chrome_binary = path\n",
" break\n",
"\n",
" if chrome_binary:\n",
" chrome_options.binary_location = chrome_binary\n",
"\n",
" # Create driver\n",
" driver = webdriver.Chrome(options=chrome_options)\n",
" driver.set_page_load_timeout(30)\n",
"\n",
" print(f\"🔍 Loading: {self.url}\")\n",
" driver.get(self.url)\n",
"\n",
" # Wait for page to load\n",
" time.sleep(5)\n",
"\n",
" # Try to wait for main content\n",
" try:\n",
" WebDriverWait(driver, 10).until(\n",
" EC.presence_of_element_located((By.TAG_NAME, \"main\"))\n",
" )\n",
" except Exception:\n",
" try:\n",
" WebDriverWait(driver, 10).until(\n",
" EC.presence_of_element_located((By.TAG_NAME, \"body\"))\n",
" )\n",
" except Exception:\n",
" pass # Continue anyway\n",
"\n",
" # Get title and page source\n",
" self.title = driver.title\n",
" page_source = driver.page_source\n",
" driver.quit()\n",
"\n",
" print(f\"✅ Page loaded: {self.title}\")\n",
"\n",
" # Parse with BeautifulSoup\n",
" soup = BeautifulSoup(page_source, 'html.parser')\n",
"\n",
" # Remove unwanted elements\n",
" for element in soup([\"script\", \"style\", \"img\", \"input\", \"button\", \"nav\", \"footer\", \"header\"]):\n",
" element.decompose()\n",
"\n",
" # Get main content\n",
" main = soup.find('main') or soup.find('article') or soup.find('.content') or soup.find('body')\n",
" if main:\n",
" self.text = main.get_text(separator=\"\\n\", strip=True)\n",
" else:\n",
" self.text = soup.get_text(separator=\"\\n\", strip=True)\n",
"\n",
" # Clean up text\n",
" lines = [line.strip() for line in self.text.split('\\n') if line.strip() and len(line.strip()) > 2]\n",
" self.text = '\\n'.join(lines[:200]) # Limit to first 200 lines\n",
"\n",
" print(f\"📄 Extracted {len(self.text)} characters\")\n",
"\n",
" except Exception as e:\n",
" print(f\"❌ Error occurred: {e}\")\n",
" self.title = \"Error occurred\"\n",
" self.text = \"Could not scrape website content\""
]
},
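{
"cell_type": "code",
"execution_count": null,
"id": "e2c7a9b4",
"metadata": {},
"outputs": [],
"source": [
"# Quick smoke test of the crawler on its own before wiring it to the LLMs below\n",
"# (the URL is just an example - swap in any page you like)\n",
"test_site = WebsiteCrawler(\"https://example.com\")\n",
"print(test_site.title)\n",
"print(test_site.text[:500])"
]
},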
{
"cell_type": "markdown",
"id": "d727feff",
"metadata": {},
"source": [
"## 🧠 Prompt Engineering & Templates"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02e3a673-a8a1-4101-a441-3816f7ab9e4d",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
"and provides a short summary, ignoring text that might be navigation related. \\\n",
"Respond in markdown.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86bb80f9-9e7c-4825-985f-9b83fe50839f",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89998b18-77aa-4aaf-a137-f0d078d61f75",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]"
]
},
{
"cell_type": "markdown",
"id": "cde36d4f",
"metadata": {},
"source": [
"## 📝 Summarization "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5636affe",
"metadata": {},
"outputs": [],
"source": [
"def summarize_gpt(url):\n",
" \"\"\"Scrape website and summarize with GPT\"\"\"\n",
" site = WebsiteCrawler(url)\n",
"\n",
" if \"Error occurred\" in site.title or len(site.text) < 50:\n",
" print(f\"❌ Failed to scrape meaningful content from {url}\")\n",
" return\n",
"\n",
" print(\"🤖 Creating summary...\")\n",
"\n",
" # Create summary\n",
" response = openai.chat.completions.create(\n",
" model=MODEL_OPENAI,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(site)}\n",
" ]\n",
" )\n",
"\n",
" web_summary = response.choices[0].message.content\n",
" display(Markdown(web_summary))\n",
"\n",
"summarize_gpt('https://openai.com')\n",
"# summarize_gpt('https://stripe.com')\n",
"# summarize_gpt('https://vercel.com')\n",
"# summarize_gpt('https://react.dev')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90b9a8f8-0c1c-40c8-a4b3-e8e1fcd29df5",
"metadata": {},
"outputs": [],
"source": [
"def summarize_ollama(url):\n",
" website = WebsiteCrawler(url)\n",
" response = ollama.chat(\n",
" model=MODEL_OLLAMA,\n",
" messages=messages_for(website))\n",
" display(Markdown(response['message']['content'])) # Generate and display output\n",
"\n",
"summarize_ollama('https://github.com')\n",
"# summarize_ollama('https://nextjs.org')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "dc8af57c-23a9-452e-9fc3-0e5027edda14",
"metadata": {},
"source": [
"# AI-powered Brochure Generator\n",
"---\n",
"- 🌍 Task: Generate a company brochure using its name and website for clients, investors, and recruits.\n",
"- 🧠 Model: Toggle `USE_OPENAI` to switch between OpenAI and Ollama models\n",
"- 🕵️‍♂️ Data Extraction: Scraping website content and filtering key links (About, Products, Careers, Contact).\n",
"- 📌 Output Format: a Markdown-formatted brochure streamed in real-time.\n",
"- 🚀 Tools: BeautifulSoup, OpenAI API, and IPython display, ollama.\n",
"- 🧑‍💻 Skill Level: Intermediate.\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key \n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "ec869f2c",
"metadata": {},
"source": [
"## 🧩 System Design Overview\n",
"\n",
"### Class Structure\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_class_diagram.png?raw=true)\n",
"\n",
"This code consists of three main classes:\n",
"\n",
"1. **`Website`**: \n",
" - Scrapes and processes webpage content. \n",
" - Extracts **text** and **links** from a given URL. \n",
"\n",
"2. **`LLMClient`**: \n",
" - Handles interactions with **OpenAI or Ollama (`llama3`, `deepseek`, `qwen`)**. \n",
" - Uses `get_relevant_links()` to filter webpage links. \n",
" - Uses `generate_brochure()` to create and stream a Markdown-formatted brochure. \n",
"\n",
"3. **`BrochureGenerator`**: \n",
" - Uses `Website` to scrape the main webpage and relevant links. \n",
" - Uses `LLMClient` to filter relevant links and generate a brochure. \n",
" - Calls `generate()` to run the entire process.\n",
"\n",
"### Workflow\n",
"\n",
"1. **`main()`** initializes `BrochureGenerator` and calls `generate()`. \n",
"2. **`generate()`** calls **`LLMClient.get_relevant_links()`** to extract relevant links using **LLM (OpenAI/Ollama)**. \n",
"3. **`Website` scrapes the webpage**, extracting **text and links** from the given URL. \n",
"4. **Relevant links are re-scraped** using `Website` to collect additional content. \n",
"5. **All collected content is passed to `LLMClient.generate_brochure()`**. \n",
"6. **`LLMClient` streams the generated brochure** using **OpenAI or Ollama**. \n",
"7. **The final brochure is displayed in Markdown format.**\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_process.png?raw=true)\n",
"\n",
"\n",
"### Intermediate reasoning\n",
"\n",
"In this workflow, we have intermediate reasoning because the LLM is called twice:\n",
"\n",
"1. **First LLM call**: Takes raw links → filters/selects relevant ones (reasoning step).\n",
"2. **Second LLM call**: Takes selected content → generates final brochure.\n",
"\n",
"🧠 **LLM output becomes LLM input** — thats intermediate reasoning.\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_llm_intermd_reasoning.png?raw=true)"
]
},
{
"cell_type": "markdown",
"id": "4b286461-35ee-4bc5-b07d-af554923e36d",
"metadata": {},
"source": [
"## 📦 Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fe5670c-5146-474b-9e75-484210533f55",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"import json\n",
"import ollama\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import display, Markdown, update_display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "markdown",
"id": "f3e23181-1e66-410d-a910-1fb4230f8088",
"metadata": {},
"source": [
"## 🧠 Define the Model\n",
"\n",
"The user can switch between OpenAI and Ollama by changing a single variable (`USE_OPENAI`). The model selection is dynamic."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2bd452-0cf4-4fec-9542-e1c86584c23f",
"metadata": {},
"outputs": [],
"source": [
"# Load API key\n",
"load_dotenv()\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"if not api_key or not api_key.startswith('sk-'):\n",
" raise ValueError(\"Invalid OpenAI API key. Check your .env file.\")\n",
"\n",
"# Define the model dynamically\n",
"USE_OPENAI = True # True to use openai and False to use Ollama\n",
"MODEL = 'gpt-4o-mini' if USE_OPENAI else 'llama3.2:3b'\n",
"\n",
"openai_client = OpenAI() if USE_OPENAI else None"
]
},
{
"cell_type": "markdown",
"id": "4fd997b7-1b89-4817-b53a-078164f5f71f",
"metadata": {},
"source": [
"## 🏗️ Define Classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aed1af59-8b8f-4add-98dc-a9f1b5b511a5",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
" \"\"\"\n",
" A utility class to scrape and process website content.\n",
" \"\"\"\n",
" def __init__(self, url):\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" self.text = self.extract_text(soup)\n",
" self.links = self.extract_links(soup)\n",
"\n",
" def extract_text(self, soup):\n",
" if soup.body:\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" return soup.body.get_text(separator=\"\\n\", strip=True)\n",
" return \"\"\n",
"\n",
" def extract_links(self, soup):\n",
" links = [link.get('href') for link in soup.find_all('a')]\n",
" return [link for link in links if link and 'http' in link]\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea04dc7e-ff4c-4113-83b7-0bddcf5072b9",
"metadata": {},
"outputs": [],
"source": [
"class LLMClient:\n",
" def __init__(self, model=MODEL):\n",
" self.model = model\n",
"\n",
" def get_relevant_links(self, website):\n",
" link_system_prompt = \"\"\"\n",
" You are given a list of links from a company website.\n",
" Select only relevant links for a brochure (About, Company, Careers, Products, Contact).\n",
" Exclude login, terms, privacy, and emails.\n",
"\n",
" ### **Instructions**\n",
" - Return **only valid JSON**.\n",
" - **Do not** include explanations, comments, or Markdown.\n",
" - Example output:\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"about\", \"url\": \"https://company.com/about\"},\n",
" {\"type\": \"contact\", \"url\": \"https://company.com/contact\"},\n",
" {\"type\": \"product\", \"url\": \"https://company.com/products\"}\n",
" ]\n",
" }\n",
" \"\"\"\n",
"\n",
" user_prompt = f\"\"\"\n",
" Here is the list of links on the website of {website.url}:\n",
" Please identify the relevant web links for a company brochure. Respond in JSON format.\n",
" Do not include login, terms of service, privacy, or email links.\n",
" Links (some might be relative links):\n",
" {', '.join(website.links)}\n",
" \"\"\"\n",
"\n",
" if USE_OPENAI:\n",
" response = openai_client.chat.completions.create(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" )\n",
" return json.loads(response.choices[0].message.content.strip())\n",
" else:\n",
" response = ollama.chat(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" )\n",
" result = response.get(\"message\", {}).get(\"content\", \"\").strip()\n",
" try:\n",
" return json.loads(result) # Attempt to parse JSON\n",
" except json.JSONDecodeError:\n",
" print(\"Error: Response is not valid JSON\")\n",
" return {\"links\": []} # Return empty list if parsing fails\n",
"\n",
"\n",
" def generate_brochure(self, company_name, content, language):\n",
" system_prompt = \"\"\"\n",
" You are a professional translator and writer who creates fun and engaging brochures.\n",
" Your task is to read content from a companys website and write a short, humorous, joky,\n",
" and entertaining brochure for potential customers, investors, and job seekers.\n",
" Include details about the companys culture, customers, and career opportunities if available.\n",
" Respond in Markdown format.\n",
" \"\"\"\n",
"\n",
" user_prompt = f\"\"\"\n",
" Create a fun brochure for '{company_name}' using the following content:\n",
" {content[:5000]}\n",
" Respond in {language} only, and format your response correctly in Markdown.\n",
" Do NOT escape characters or return extra backslashes.\n",
" \"\"\"\n",
"\n",
" if USE_OPENAI:\n",
" response_stream = openai_client.chat.completions.create(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in response_stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
" else:\n",
" response_stream = ollama.chat(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" full_text = \"\"\n",
" for chunk in response_stream:\n",
" if \"message\" in chunk:\n",
" content = chunk[\"message\"][\"content\"] or \"\"\n",
" full_text += content\n",
" update_display(Markdown(full_text), display_id=display_handle.display_id)\n"
]
},
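{
"cell_type": "code",
"execution_count": null,
"id": "f5d08c12",
"metadata": {},
"outputs": [],
"source": [
"# Robustness sketch: the OpenAI branch of get_relevant_links() calls json.loads() on the\n",
"# raw reply, which raises if the model wraps the JSON in any prose. One option (an add-on,\n",
"# not part of the original class) is OpenAI's JSON mode, which constrains the model to\n",
"# emit a single JSON object. JSON mode requires the prompts to mention JSON, which the\n",
"# prompts above already do.\n",
"def get_links_json_mode(client, model, system_prompt, user_prompt):\n",
"    response = client.chat.completions.create(\n",
"        model=model,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system_prompt},\n",
"            {\"role\": \"user\", \"content\": user_prompt}\n",
"        ],\n",
"        response_format={\"type\": \"json_object\"}\n",
"    )\n",
"    try:\n",
"        return json.loads(response.choices[0].message.content)\n",
"    except json.JSONDecodeError:\n",
"        return {\"links\": []}"
]
},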
{
"cell_type": "code",
"execution_count": null,
"id": "1c69651f-e004-421e-acc5-c439e57a8762",
"metadata": {},
"outputs": [],
"source": [
"class BrochureGenerator:\n",
" \"\"\"\n",
" Main class to generate a company brochure.\n",
" \"\"\"\n",
" def __init__(self, company_name, url, language='English'):\n",
" self.company_name = company_name\n",
" self.url = url\n",
" self.language = language\n",
" self.website = Website(url)\n",
" self.llm_client = LLMClient()\n",
"\n",
" def generate(self):\n",
" links = self.llm_client.get_relevant_links(self.website)\n",
" content = self.website.get_contents()\n",
"\n",
" for link in links['links']:\n",
" linked_website = Website(link['url'])\n",
" content += f\"\\n\\n{link['type']}:\\n\"\n",
" content += linked_website.get_contents()\n",
"\n",
" self.llm_client.generate_brochure(self.company_name, content, self.language)\n"
]
},
{
"cell_type": "markdown",
"id": "1379d39d",
"metadata": {},
"source": [
"## 📝 Generate Brochure"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a63519a-1981-477b-9de1-f1ff9be94201",
"metadata": {},
"outputs": [],
"source": [
"def main():\n",
" company_name = \"Tour Eiffel\"\n",
" url = \"https://www.toureiffel.paris/fr\"\n",
" language = \"French\"\n",
"\n",
" generator = BrochureGenerator(company_name, url, language)\n",
" generator.generate()\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,142 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "6e907206-4c13-4698-91c6-9ca1c32be8e7",
"metadata": {},
"source": [
"# TechExplainAI\n",
"---\n",
"\n",
"AI-driven tool that provides concise, structured explanations for technical questions and code snippets.\n",
"\n",
"- 🌍 Task: AI-powered technical explanation generator\n",
"- 🧠 Model: OpenAI's `GPT-4o-mini`, Ollama's `llama3.2:3b`\n",
"- 📌 Output Format: Markdown with real-time streaming\n",
"- 🧑‍💻 Skill Level: Beginner\n",
"- 🔄 Interaction Mode: User enters a technical question → AI generates a structured, concise explanation\n",
"- 🎯 Purpose: Quickly explain technical concepts and Python code snippets\n",
"- 🔧 Customization: Users can modify the models, prompts, and formatting as needed\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key\n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f743c87a-ed80-43d5-84ad-c78c8bdacb09",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"import ollama\n",
"from dotenv import load_dotenv\n",
"from IPython.display import display, Markdown, update_display\n",
"\n",
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"\n",
"# Set up OpenAI API key\n",
"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n",
"if not OPENAI_API_KEY:\n",
" raise ValueError(\"Please set your OpenAI API key in environment variables.\")\n",
"\n",
"# Constants\n",
"MODEL_GPT = \"gpt-4o-mini\"\n",
"MODEL_LLAMA = \"llama3.2:3b\"\n",
"\n",
"# Prompt user for question (until input is provided)\n",
"while True:\n",
" question = input(\"Hello, I am your personal technical tutor. Enter your question: \").strip()\n",
" if question:\n",
" break # Proceed only if a valid question is entered\n",
" print(\"Question cannot be empty. Please enter a question.\")\n",
"\n",
"# Common user prompt\n",
"user_prompt = f\"\"\"\n",
"Please give a detailed explanation to the following question: {question}.\n",
"Be less verbose.\n",
"Provide a clear and concise explanation without unnecessary elaboration.\n",
"\"\"\"\n",
"\n",
"# Common system prompt\n",
"system_prompt = \"\"\"\n",
"You are a helpful AI assistant that explains Python code in a clear and concise manner. Provide structured explanations and examples when necessary.\n",
"Be less verbose.\n",
"\"\"\"\n",
"\n",
"def ask_openai():\n",
" \"\"\"Gets response from OpenAI's GPT model with streaming.\"\"\"\n",
" print(\"\\n\\n\\n🚀🤖🚀 Response from OpenAI GPT-4o-mini 🚀🤖🚀\")\n",
" client = openai.OpenAI(api_key=OPENAI_API_KEY)\n",
" response_stream = client.chat.completions.create(\n",
" model=MODEL_GPT,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in response_stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
"\n",
"def ask_ollama():\n",
" \"\"\"Gets response from Ollama's Llama 3.2 model with streaming.\"\"\"\n",
" print(\"\\n\\n\\n🔥✨🔥 Response from Llama 3.2 🔥✨🔥\\n\")\n",
" response = ollama.chat(\n",
" model=MODEL_LLAMA,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" full_text = \"\"\n",
" for chunk in response:\n",
" if \"message\" in chunk:\n",
" content = chunk[\"message\"][\"content\"] or \"\"\n",
" full_text += content\n",
" update_display(Markdown(full_text), display_id=display_handle.display_id)\n",
"\n",
"# Call the functions\n",
"ask_openai()\n",
"ask_ollama()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,127 @@
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"source": [
"import os, textwrap, time, requests\n",
"from bs4 import BeautifulSoup\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"from urllib.parse import urljoin\n",
"\n",
"# ------------------ ENV & OpenAI ------------------\n",
"load_dotenv(override=True)\n",
"openai = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"\n",
"UA = (\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
" \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\")\n",
"BASE_URL = \"https://www.cambridge.org\"\n",
"JFQA_URL = f\"{BASE_URL}/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n",
"\n",
"# ------------------ Helpers ------------------\n",
"def fetch_latest_issue(url: str) -> list[dict]:\n",
" \"\"\"Return unique {title, link} dicts for each research article.\"\"\"\n",
" soup = BeautifulSoup(\n",
" requests.get(url, headers={\"User-Agent\": UA}, timeout=30).text,\n",
" \"html.parser\"\n",
" )\n",
"\n",
" anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n",
" seen, articles = set(), []\n",
" for a in anchors:\n",
" href = a[\"href\"].split(\"?\")[0] # strip tracking params\n",
" if href in seen: # deduplicate\n",
" continue\n",
" seen.add(href)\n",
" title = a.get_text(\" \", strip=True)\n",
" full = urljoin(BASE_URL, href)\n",
" articles.append({\"title\": title, \"link\": full})\n",
" print(f\"Found {len(articles)} unique article links.\")\n",
" return articles\n",
"\n",
"def fetch_article_details(link: str) -> dict:\n",
" soup = BeautifulSoup(\n",
" requests.get(link, headers={\"User-Agent\": UA}, timeout=30).text,\n",
" \"html.parser\"\n",
" )\n",
"\n",
" # abstract\n",
" abs_tag = soup.find(\"div\", class_=\"abstract\")\n",
" abstract = abs_tag.get_text(\" \", strip=True) if abs_tag else \"N/A\"\n",
"\n",
" # publication date (meta is most reliable)\n",
" meta_date = soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n",
" pub_date = meta_date[\"content\"] if meta_date else \"N/A\"\n",
"\n",
" # authors (multiple <meta name=\"citation_author\"> tags)\n",
" authors = [m[\"content\"] for m in soup.find_all(\"meta\",\n",
" attrs={\"name\": \"citation_author\"})]\n",
" authors_str = \", \".join(authors) or \"N/A\"\n",
"\n",
" return {\"abstract\": abstract, \"pub_date\": pub_date, \"authors\": authors_str}\n",
"\n",
"def summarise(txt: str) -> str:\n",
" prompt = (\"Summarise the following financepaper abstract in 23 sentences, \"\n",
" \"mentioning the question, method, and main finding.\\n\\n\"\n",
" f\"Abstract:\\n{txt}\")\n",
" try:\n",
" rsp = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"system\",\n",
" \"content\": \"You are a helpful finance research assistant.\"},\n",
" {\"role\": \"user\", \"content\": prompt}],\n",
" temperature=0.2, max_tokens=120\n",
" )\n",
" return rsp.choices[0].message.content.strip()\n",
" except Exception as e:\n",
" print(f\"⚠️ summarise error → {e}\")\n",
" return \"Summary unavailable.\"\n",
"\n",
"def scrape_jfqa_latest() -> None:\n",
" for art in fetch_latest_issue(JFQA_URL):\n",
" det = fetch_article_details(art[\"link\"])\n",
" if det[\"abstract\"] == \"N/A\":\n",
" print(f\"\\n📘 {art['title']} — no abstract found.\")\n",
" continue\n",
"\n",
" summary = summarise(det[\"abstract\"])\n",
" print(f\"\\n📘 {art['title']}\")\n",
" print(f\" Authors: {det['authors']}\")\n",
" print(f\" Date : {det['pub_date']}\")\n",
" print(f\" Journal: JFQA (Latest Issue)\")\n",
" print(\" Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n",
" print(\"-\" * 90)\n",
" time.sleep(1.0) # polite gap between OpenAI calls\n",
"\n",
"if __name__ == \"__main__\":\n",
" scrape_jfqa_latest()\n"
],
"id": "e20b182f6258f0be",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4d011f3d-c10c-4a75-bd36-576e383a8d1d",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"\n",
"\n",
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c51302e0-c848-4ec4-a0ab-03deeb9e7987",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if not api_key:\n",
" print('No Api Key was found')\n",
"elif not api_key.startswith('sk-proj-'):\n",
" print(\"An api key was found, but it doesnt start with sk-proj\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An api key was found, but it might have space in the first or end\")\n",
"else:\n",
" print(\"Api key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1df04f3-bd4d-4b14-87cc-1e91eaf7c0ab",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "340b018a-6e97-491c-aa26-66c683ece8a0",
"metadata": {},
"outputs": [],
"source": [
"message = \"Hello GPT, this is my first message\"\n",
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\": \"user\", \"content\":message}])\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a06c291-2fe6-4669-a8b6-3b67769eb3fa",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object from the given url using the BeautifulSoup library\n",
" \"\"\"\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd36b141-a252-44a8-8fa4-d4c2c33d3db9",
"metadata": {},
"outputs": [],
"source": [
"github = Website(\"https://github.com/Fikriraihan\")\n",
"print(github.title)\n",
"print(github.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea402ba2-6c7f-4f96-95c0-d68a0e96e644",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"You are a skilled GitHub profile analyzer. \" \\\n",
"\"Your job is to take the provided GitHub profile or repository URL and generate a clear, structured summary covering these points: \" \\\n",
"\"1⃣ **Profile Summary** \" \\\n",
"\"- Username \" \\\n",
"\"- Bio (if available) \" \\\n",
"\"- Total public repositories \" \\\n",
"\"- Total followers \" \\\n",
"\"- Total stars received (sum across repos) \" \\\n",
"\"- Top programming languages (by repo count) \" \\\n",
"\"2⃣ **Repository Highlights** (top 3 by stars or activity) \" \\\n",
"\"For each: \" \\\n",
"\"- Repository name \" \\\n",
"\"- Description \" \\\n",
"\"- Primary language \" \\\n",
"\"- Star count \" \\\n",
"\"- Last updated date \" \\\n",
"\"- Notable technologies or frameworks used \" \\\n",
"\"3⃣ **Overall Assessment** \" \\\n",
"\"- What does this user specialize in? \" \\\n",
"\"- Are they more focused on personal projects or collaborations? \" \\\n",
"\"- Any standout strengths or skills you notice? \" \\\n",
"\"4⃣ **Recommendations** \" \\\n",
"\"- Suggest one area or technology they could explore next to grow. \" \\\n",
"\"- Suggest one improvement to make their GitHub profile more appealing. \" \\\n",
"\"Be concise, insightful, and encourage the users growth. \" \\\n",
"\"If some data is missing, state it clearly instead of guessing.\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a964e8f2-40f4-457b-9c81-7e6e2768f450",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a github named {website.title}\"\n",
" user_prompt += \"\\nThe contents of this github is as follows; \\\n",
"please provide a summary of this website in markdown.\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "026d8ae4-1aea-45b9-b694-db0809527780",
"metadata": {},
"outputs": [],
"source": [
"system_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e040916-8d7e-421b-b1a7-56e710940eaa",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(github))"
]
},
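{
"cell_type": "code",
"execution_count": null,
"id": "ab12cd34",
"metadata": {},
"outputs": [],
"source": [
"# The cells below call messages_for(), but this notebook never defines it - a minimal\n",
"# definition combining the two prompts above:\n",
"def messages_for(website):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
"    ]"
]
},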
{
"cell_type": "code",
"execution_count": null,
"id": "11bc74b0-7ca7-40da-81cc-84b2dd04780b",
"metadata": {},
"outputs": [],
"source": [
"messages_for(github)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e64f497f-3742-4d70-9e15-29d1974b3361",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = messages_for(website)\n",
" )\n",
" return response.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95d0938d-0b26-4253-94a6-ac9240e7a8c9",
"metadata": {},
"outputs": [],
"source": [
"summarize(\"https://github.com/Fikriraihan\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd863db3-731a-46d8-ac14-f74f8ae39bd4",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
" summary = summarize(url)\n",
" display(Markdown(summary))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70c5c3aa-2c06-460b-9c4f-6465d2c8611c",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://github.com/Fikriraihan\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3dfe6e3-dfd2-4acd-a2e4-681873c650c8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,354 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2e40e4f0-4f65-4f68-be50-07401959f46e",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fea8f921-7f2f-4942-9f88-cb6eb64ea731",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key found and looks good so far!\n"
]
}
],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv()\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8d90ba3b-e50e-4a7d-820f-e669ea3679ff",
"metadata": {},
"outputs": [],
"source": [
"#call open AI\n",
"openai = OpenAI()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "046a59c6-56f5-4a09-89bd-8163075ad643",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"class Website:\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object for a Finance latest news\n",
" \"\"\"\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" \n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" \n",
" # Find news headlines and content \n",
" news_data = []\n",
" \n",
" # Try different selectors \n",
" news_items = soup.find_all('h3') + soup.find_all('h2')\n",
" \n",
" for item in news_items:\n",
" headline = item.get_text(strip=True)\n",
" if headline and len(headline) > 20: # Filter out short/empty text\n",
" # Try to find content near the headline\n",
" content = \"\"\n",
" parent = item.find_parent()\n",
" if parent:\n",
" # Look for paragraph or summary text\n",
" summary = parent.find('p')\n",
" if summary:\n",
" content = summary.get_text(strip=True)[:300] + \"...\"\n",
" \n",
" news_data.append({'headline': headline, 'content': content})\n",
" \n",
" # Create the text content\n",
" self.text = \"Latest financial news headlines:\\n\\n\"\n",
" \n",
" # Get top 5 headlines with content\n",
" for i, news in enumerate(news_data[:10], 1):\n",
" self.text += f\"{i}. {news['headline']}\\n\"\n",
" if news['content']:\n",
" self.text += f\" Summary: {news['content']}\\n\"\n",
" self.text += \"\\n\"\n",
" \n",
" if not news_data:\n",
" self.text = \"No headlines found. Yahoo Finance structure may have changed.\"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b5b1c72e-bc74-4ed0-9a64-795ca9bac74d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Title: Yahoo Finance - Stock Market Live, Quotes, Business & Finance News\n",
"Top News:\n",
"Latest financial news headlines:\n",
"\n",
"1. US Risks Losing Reliable Investment Status, Allianz GI Manager Says\n",
" Summary: (Bloomberg) -- Inside one of Europes biggest asset managers, theres growing concern that Republican efforts to gut legislation supporting key industries such as clean energy may result in the US losing its status as a destination for investor capital.Most Read from BloombergNY Private School Plead...\n",
"\n",
"2. Why Intempus thinks robots should have a human physiological state\n",
" Summary: Teddy Warner, 19, has always been interested in robotics. His family was in the industry, and he says he \"grew up\" working in a machinist shop while in high school. Now Warner is building a robotics company of his own, Intempus, that looks to make robots a bit more human. Intempus is building tech t...\n",
"\n",
"3. Last 24 hours: TechCrunch Disrupt 2025 Early Bird Deals will fly away after today\n",
" Summary: Just 24 hours left to lock in Early Bird pricing for TechCrunch Disrupt 2025 — happening October 2729 at Moscone West in San Francisco. Save up to $900 on your pass, or bring someone brilliant with you for 90% off their ticket. This deal ends tonight at 11:59 p.m. PT. Grab your Early Bird discount ...\n",
"\n",
"4. 48 hours left: What you wont want to miss at the 20th TechCrunch Disrupt in October\n",
" Summary: There are just 48 hours left to save up to $900 on your ticket to TechCrunch Disrupt 2025 — and get 90% off the second. After May 25 at 11:59 p.m. PT, Early Bird pricing vanishes — along with your best chance to join 10,000 of techs most forward-thinking minds for less. But forget the math for a ...\n",
"\n",
"5. More than a third of Americans say they want an 'adventurous retirement'\n",
" Summary: Retirement is no longer just about rocking chairs, gardening, grandchildren, or afternoons on the golf course....\n",
"\n",
"6. 'Unsustainable fiscal situation': Wall Street braces for more bond market turmoil as Trump tax bill stirs up deficit concerns\n",
" Summary: Surging Treasury yields signal deepening market fears as Trump's tax plan, soaring deficits, and global fiscal turmoil shake investor confidence....\n",
"\n",
"7. Nvidia has lost its shock power to investors, for now\n",
" Summary: Nvidia's quarter may be tougher than normal to assess. Here's why....\n",
"\n",
"8. Nvidia earnings, Trump tariff updates, and the Fed's preferred inflation gauge: What to know this week\n",
" Summary: A quarterly earnings release from Nvidia is set to greet investors in the week ahead as the stock market rally has hit pause....\n",
"\n",
"9. This week in Trumponomics: Bonds spoil the party\n",
" Summary: Trump is heading toward an important victory on tax cuts. Instead of cheering, markets are fretting....\n",
"\n",
"10. Manufacturers could benefit from Trump's 'big, beautiful' bill depending on what they make\n",
" Summary: Advocates for the manufacturing sector have hailed the advancement of Trump's \"big, beautiful bill,\" but at least two provisions in the 1,000-plus-page package could cut that ebullience for some factory owners....\n",
"\n",
"\n"
]
}
],
"source": [
"website = Website(\"https://finance.yahoo.com/topic/latest-news/\")\n",
"\n",
"print(\"Title:\", website.title)\n",
"print(\"Top News:\")\n",
"print(website.text)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "2c0ac856-b0d8-4b15-8092-71ab3952a0d9",
"metadata": {},
"outputs": [],
"source": [
"# Define our system prompt\n",
"system_prompt = \"\"\"You are a veteran stock market and finance expert with 50+ years of experience helping investors make safe, steady gains. Your audience is beginners with small amounts to invest (around $100). \n",
"\n",
"**Response Format:**\n",
"1. Start with \"The News Snapshot:\" - Write 3-4 lines summarizing the key financial developments from the provided headlines and summaries, showing you understand the current market situation, start the write up for this with today in the news we see that...\n",
"\n",
"2. Give specific stock advice based on the news:\n",
" - What to avoid and why\n",
" - 2-3 specific stock recommendations with ticker symbols\n",
" - Focus only on safe, dividend-paying stocks or clear beneficiaries from the news\n",
"\n",
"3. End with \"The big picture:\" - One sentence explaining the overall market condition\n",
"\n",
"4. Close with \"Your game plan:\" - Simple, actionable advice for their $100 to show how to split it\n",
"\n",
"**Tone & Style:**\n",
"- Talk like a knowledgeable but friendly Wall Street professional advising a beginner\n",
"- Keep it under 200 words total\n",
"- Use simple language, no complex jargon\n",
"- Be direct and practical\n",
"- Focus on capital preservation over quick gains\n",
"- Always relate advice directly to the news headlines provided\n",
"\n",
"**Key Rules:**\n",
"- Only recommend established, safe stocks\n",
"- Always explain WHY based on the news\n",
"- No speculative or meme stocks\n",
"- Emphasize learning over quick profits\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "077acf13-6e37-488f-a7c7-5f301266f57f",
"metadata": {},
"outputs": [],
"source": [
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a provide your investment advice for a beginner with $100. \\\n",
"Because it includes finance news or trend, let the advice be based on these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "1c129909-769c-49f0-a84d-85a25972463b",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "2c9f998f-639f-451b-a67e-5a95978ab70d",
"metadata": {},
"outputs": [],
"source": [
"def get_advice(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = messages_for(website)\n",
" )\n",
" return response.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "402b4bb4-fbf4-4930-9cd1-4ede22491fa2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'**The News Snapshot:** Recent headlines reveal rising treasury yields and concerns over the US losing its \"reliable investment\" status, stoking fears of market uncertainty. Amidst this backdrop, investors may want to focus on stable, dividend-paying stocks that can weather the storm and provide consistent returns.\\n\\n**Stock Advice:**\\n- **Avoid speculative tech stocks** like Nvidia, which has recently shown volatility and uncertainty in earnings, leading to a potential loss of investor confidence.\\n- **Recommendation #1: Johnson & Johnson (JNJ)** A well-established healthcare company that pays a reliable dividend, making it a safe bet in uncertain times.\\n- **Recommendation #2: Procter & Gamble (PG)** Known for its strong brand portfolio and consistent dividend payouts, PG offers stability and resilience against market fluctuations.\\n- **Recommendation #3: Coca-Cola (KO)** With a history of dividend increases, Coca-Cola remains a staple in many portfolios, providing that defensive position investors need right now.\\n\\n**The big picture:** The market is showing signs of concern, and investors should prioritize capital preservation over chasing quick returns.\\n\\n**Your game plan:** With your $100, consider investing in fractional shares of JNJ, PG, or KO to benefit from their dividends and stability while learning about long-term investing principles.'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_advice(\"https://finance.yahoo.com/topic/latest-news/\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "0427753f-6b47-4c36-b68f-0f22abd8a7cd",
"metadata": {},
"outputs": [],
"source": [
"def display_fin_advice(url):\n",
" advice_content = get_advice(url) \n",
" display(Markdown(advice_content))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "1d26e64f-fdd0-4492-9b20-a54847b11139",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"The News Snapshot: Today in the news, we see that concerns are rising around the US potentially losing its appeal as a reliable investment destination due to political actions, particularly in clean energy. Rising Treasury yields and fiscal uncertainty, stemming from tax policies, are causing unease in the markets. Generally, investors are on alert due to potential repercussions for sectors reliant on government support and tax reform.\n",
"\n",
"Specific Stock Advice:\n",
"- I advise avoiding high-growth tech stocks like **Nvidia (NVDA)** for now, as their recent earnings show volatility and uncertainty. \n",
"- Instead, consider established dividend-paying stocks like **Johnson & Johnson (JNJ)** and **Procter & Gamble (PG)**. Both companies are less sensitive to political changes and provide steady dividends, making them safer bets during turbulent times.\n",
"- Another option is **3M Company (MMM)**, which has a strong history of dividend payments and benefits from potential manufacturing boosts tied to new legislation.\n",
"\n",
"The big picture: The market is navigating through uncertainties, particularly around fiscal policy and investment confidence.\n",
"\n",
"Your game plan: Split your $100 into three parts: $40 in Johnson & Johnson, $40 in Procter & Gamble, and keep $20 in cash for future opportunities or to cover transaction fees. This balanced approach aims for safety and steady growth."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_fin_advice(\"https://finance.yahoo.com/topic/latest-news/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7567571d-b4c7-41be-9fd0-d65ae533a252",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,247 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "8ce13728-0040-43cc-82cd-e10c838ef71c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🌍 Detected language: PT\n",
"🔗 Preview of extracted text:\n",
"\n",
"ITASAT2 irá atuar para aplicações científicas e de defesa\n",
"Publicado em 14/04/2025 - 14h15\n",
"O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.\n",
"Participaram do encontro representantes do\n",
"...\n",
"\n",
"Amount of words: 526\n",
"\n",
"\n",
"📊 Usage Report\n",
"🧾 Prompt tokens: 927\n",
"🧠 Completion tokens: 309\n",
"🔢 Total tokens: 1236\n",
"💰 Total cost: $0.000927\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"# 📝 Summary\n",
"\n",
"The ITA (Instituto Tecnológico de Aeronáutica) is working on the ITASAT 2 project, a new microsatellite geared towards scientific and defense applications! 🌟 This initiative was highlighted at the Preliminary Design Review (PDR) held from March 17 to 19, with participation from notable organizations such as NASA and the Brazilian Space Agency (AEB). This is a fantastic collaboration that spans both domestic and international partnerships how exciting is that? \n",
"\n",
"ITASAT 2 will consist of a constellation of three CubeSats focusing on monitoring the Earth's ionosphere and assessing plasma bubble formation. Interestingly, it also has defense applications such as geolocating radio frequency sources and optical identification of uncooperative vessels a crucial capability for maritime security!\n",
"\n",
"The PDR showcased the team's technical and managerial capabilities, receiving unanimous approval to proceed with the project. Its great to see such thorough preparation reflecting the dedication of the ITA team! \n",
"\n",
"The CubeSats themselves are cubic nano or microsatellites, and the ITASAT 2 is of the 16U variety, meaning it's made up of 16 units measuring 10 cm each just amazing how compact these technologies can be! Additionally, the CEI is also developing another CubeSat called SelenITA, which will contribute to NASA's Artemis mission to study the Moon! 🌕\n",
"\n",
"Keep an eye on this remarkable project as it continues to develop the future of space exploration and defense technology looks bright! 🚀"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Import Libraries\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from langdetect import detect, LangDetectException\n",
"from dotenv import load_dotenv\n",
"\n",
"from IPython.display import Markdown, display\n",
"\n",
"# Load .env variables\n",
"load_dotenv()\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"if not openai_api_key:\n",
" raise ValueError(\"⚠️ OPENAI_API_KEY not found in .env file.\")\n",
"\n",
"# Generating object to work with GPT tasks \n",
"openai = OpenAI()\n",
"\n",
"# Class to work with text extraction, processing and summarizing from a given url\n",
"class WebPageSummarizer():\n",
" \"\"\"\n",
" Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library. It also includes pricing.\n",
" \"\"\"\n",
" def __init__(self, url: str, summary_detail: str = \"high\", show_summary: bool = True, language_of_reference = \"English\", model: str = \"gpt-4o-mini\") -> None:\n",
"\n",
" # Initial summarizer settings\n",
" self.url = url\n",
" self.model = model\n",
" self.show_summary = show_summary\n",
" self.summary_detail = summary_detail\n",
" self.language_of_reference = language_of_reference\n",
" self.language_code_map = {\n",
" \"english\": \"en\",\n",
" \"portuguese\": \"pt\",\n",
" \"spanish\": \"es\",\n",
" \"french\": \"fr\",\n",
" \"german\": \"de\",\n",
" \"italian\": \"it\",\n",
" \"japanese\": \"ja\",\n",
" \"chinese\": \"zh\",\n",
" \"korean\": \"ko\",\n",
" }\n",
" \n",
" self.model_pricing = {\n",
" \"gpt-4o-mini\": {\"input\": 0.0005, \"output\": 0.0015},\n",
" \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
" \"gpt-4-turbo\": {\"input\": 0.01, \"output\": 0.03},\n",
" \"gpt-4\": {\"input\": 0.03, \"output\": 0.06}, # Rarely used now\n",
" \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015}\n",
" }\n",
"\n",
" self.headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \"\n",
" \"(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36\"\n",
" }\n",
"\n",
" if self.summary_detail not in [\"high\", \"low\"]:\n",
" raise Exception(\"\"\"Please select summary detail as either \"high\" or \"low\".\"\"\")\n",
"\n",
" def __extract_text(self):\n",
" response = requests.get(self.url, headers=self.headers)\n",
" if response.status_code != 200:\n",
" raise Exception(f\"Failed to fetch page. Status code: {response.status_code}\")\n",
" \n",
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
" \n",
" # Try to extract meaningful content\n",
" paragraphs = soup.find_all(\"p\")\n",
" \n",
" # Join all paragraph text\n",
" self.text = \"\\n\".join([p.get_text() for p in paragraphs if p.get_text().strip() != \"\"])\n",
"\n",
" # Guarantee limit of text to summary\n",
" max_words = 7000\n",
" if len(self.text.split()) > max_words:\n",
" self.text = \" \".join(self.text.split()[:max_words])\n",
" \n",
" def __detect_language(self):\n",
" # Detect language\n",
" try:\n",
" self.language_url = detect(self.text)\n",
" except LangDetectException:\n",
" self.language_url = \"unknown\"\n",
"\n",
" # Normalize and resolve target language code\n",
" target_language_name = self.language_of_reference.lower().strip()\n",
" self.target_language_code = self.language_code_map.get(target_language_name)\n",
" \n",
" if not self.target_language_code:\n",
" raise ValueError(f\"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(LANGUAGE_CODE_MAP.keys())}\")\n",
"\n",
" print(f\"🌍 Detected language: {self.language_url.upper()}\")\n",
" \n",
" if self.show_summary:\n",
" print(\"🔗 Preview of extracted text:\\n\")\n",
" print(self.text[:500] + \"\\n...\\n\")\n",
" print(f\"Amount of words: {len(self.text.split())}\\n\")\n",
"\n",
" def __calculate_cost(self, prompt_tokens: int, completion_tokens: int) -> float:\n",
" \"\"\"\n",
" Calculates total cost in USD based on selected model.\n",
" \"\"\"\n",
" pricing = self.model_pricing.get(self.model)\n",
" if pricing is None:\n",
" raise ValueError(f\"\"\"Pricing not available for model \"{self.model}\". Add it to model_pricing.\"\"\")\n",
" \n",
" input_cost = (prompt_tokens / 1000) * pricing[\"input\"]\n",
" output_cost = (completion_tokens / 1000) * pricing[\"output\"]\n",
" return input_cost + output_cost\n",
"\n",
" def summarize(self)-> str:\n",
" \"\"\"\n",
" Method to process user prompts in the context of the user.\n",
" \"\"\"\n",
" self.__extract_text()\n",
" self.__detect_language()\n",
" \n",
" # Prompt for system definition\n",
" self.system_prompt = f\"\"\" \n",
" You are an assistant that analyzes the contents of a website and provides a summary. \n",
" Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.\n",
" If you find text that might be navigation related or ad related please ignore. Respond in markdown. \n",
" Also, can you please start your summary with the tile \"📝 Summary\"?\n",
" \n",
" Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. \n",
" \"\"\"\n",
"\n",
" self.content = f\"\"\"The text to summarize is as follows: {self.text}\"\"\"\n",
"\n",
" if self.language_url != self.target_language_code:\n",
" self.system_prompt = f\"\"\"The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. \n",
" {self.system_prompt.strip()}\n",
" \"\"\"\n",
"\n",
" response = openai.chat.completions.create(model=self.model, messages=[{\"role\":\"system\", \"content\":self.system_prompt}, \n",
" {\"role\": \"user\", \"content\":self.content}])\n",
"\n",
" # Cost calculation and usage report\n",
" usage = response.usage\n",
" total_cost = self.__calculate_cost(usage.prompt_tokens, usage.completion_tokens)\n",
" \n",
" print(\"\\n📊 Usage Report\")\n",
" print(f\"🧾 Prompt tokens: {usage.prompt_tokens}\")\n",
" print(f\"🧠 Completion tokens: {usage.completion_tokens}\")\n",
" print(f\"🔢 Total tokens: {usage.total_tokens}\")\n",
" print(f\"💰 Total cost: ${total_cost:.6f}\\n\\n\\n\")\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
"\n",
"web_page_summarizer = WebPageSummarizer(\"http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada\", summary_detail = \"low\")\n",
"display(Markdown(web_page_summarizer.summarize()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af5a186a-bb25-4cf4-a6d2-6034cd493bc4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,794 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
"metadata": {},
"source": [
"# YOUR FIRST LAB\n",
"### Please read this section. This is valuable to get you prepared, even if it's a long read -- it's important stuff.\n",
"\n",
"## Your first Frontier LLM Project\n",
"\n",
"Let's build a useful LLM solution - in a matter of minutes.\n",
"\n",
"By the end of this course, you will have built an autonomous Agentic AI solution with 7 agents that collaborate to solve a business problem. All in good time! We will start with something smaller...\n",
"\n",
"Our goal is to code a new kind of Web Browser. Give it a URL, and it will respond with a summary. The Reader's Digest of the internet!!\n",
"\n",
"Before starting, you should have completed the setup for [PC](../SETUP-PC.md) or [Mac](../SETUP-mac.md) and you hopefully launched this jupyter lab from within the project root directory, with your environment activated.\n",
"\n",
"## If you're new to Jupyter Lab\n",
"\n",
"Welcome to the wonderful world of Data Science experimentation! Once you've used Jupyter Lab, you'll wonder how you ever lived without it. Simply click in each \"cell\" with code in it, such as the cell immediately below this text, and hit Shift+Return to execute that cell. As you wish, you can add a cell with the + button in the toolbar, and print values of variables, or try out variations. \n",
"\n",
"I've written a notebook called [Guide to Jupyter](Guide%20to%20Jupyter.ipynb) to help you get more familiar with Jupyter Labs, including adding Markdown comments, using `!` to run shell commands, and `tqdm` to show progress.\n",
"\n",
"## If you're new to the Command Line\n",
"\n",
"Please see these excellent guides: [Command line on PC](https://chatgpt.com/share/67b0acea-ba38-8012-9c34-7a2541052665) and [Command line on Mac](https://chatgpt.com/canvas/shared/67b0b10c93a081918210723867525d2b). \n",
"\n",
"## If you'd prefer to work in IDEs\n",
"\n",
"If you're more comfortable in IDEs like VSCode, Cursor or PyCharm, they both work great with these lab notebooks too. \n",
"If you'd prefer to work in VSCode, [here](https://chatgpt.com/share/676f2e19-c228-8012-9911-6ca42f8ed766) are instructions from an AI friend on how to configure it for the course.\n",
"\n",
"## If you'd like to brush up your Python\n",
"\n",
"I've added a notebook called [Intermediate Python](Intermediate%20Python.ipynb) to get you up to speed. But you should give it a miss if you already have a good idea what this code does: \n",
"`yield from {book.get(\"author\") for book in books if book.get(\"author\")}`\n",
"\n",
"## I am here to help\n",
"\n",
"If you have any problems at all, please do reach out. \n",
"I'm available through the platform, or at ed@edwarddonner.com, or at https://www.linkedin.com/in/eddonner/ if you'd like to connect (and I love connecting!) \n",
"And this is new to me, but I'm also trying out X/Twitter at [@edwarddonner](https://x.com/edwarddonner) - if you're on X, please show me how it's done 😂 \n",
"\n",
"## More troubleshooting\n",
"\n",
"Please see the [troubleshooting](troubleshooting.ipynb) notebook in this folder to diagnose and fix common problems. At the very end of it is a diagnostics script with some useful debug info.\n",
"\n",
"## For foundational technical knowledge (eg Git, APIs, debugging) \n",
"\n",
"If you're relatively new to programming -- I've got your back! While it's ideal to have some programming experience for this course, there's only one mandatory prerequisite: plenty of patience. 😁 I've put together a set of self-study guides that cover Git and GitHub, APIs and endpoints, beginner python and more.\n",
"\n",
"This covers Git and GitHub; what they are, the difference, and how to use them: \n",
"https://github.com/ed-donner/agents/blob/main/guides/03_git_and_github.ipynb\n",
"\n",
"This covers technical foundations: \n",
"ChatGPT vs API; taking screenshots; Environment Variables; Networking basics; APIs and endpoints: \n",
"https://github.com/ed-donner/agents/blob/main/guides/04_technical_foundations.ipynb\n",
"\n",
"This covers Python for beginners, and making sure that a `NameError` never trips you up: \n",
"https://github.com/ed-donner/agents/blob/main/guides/06_python_foundations.ipynb\n",
"\n",
"This covers the essential techniques for figuring out errors: \n",
"https://github.com/ed-donner/agents/blob/main/guides/08_debugging.ipynb\n",
"\n",
"And you'll find other useful guides in the same folder in GitHub. Some information applies to my other Udemy course (eg Async Python) but most of it is very relevant for LLM engineering.\n",
"\n",
"## If this is old hat!\n",
"\n",
"If you're already comfortable with today's material, please hang in there; you can move swiftly through the first few labs - we will get much more in depth as the weeks progress. Ultimately we will fine-tune our own LLM to compete with OpenAI!\n",
"\n",
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#900;\">Please read - important note</h2>\n",
" <span style=\"color:#900;\">The way I collaborate with you may be different to other courses you've taken. I prefer not to type code while you watch. Rather, I execute Jupyter Labs, like this, and give you an intuition for what's going on. My suggestion is that you carefully execute this yourself, <b>after</b> watching the lecture. Add print statements to understand what's going on, and then come up with your own variations. If you have a Github account, use this to showcase your variations. Not only is this essential practice, but it demonstrates your skills to others, including perhaps future clients or employers...</span>\n",
" </td>\n",
" </tr>\n",
"</table>\n",
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#f71;\">This code is a live resource - keep an eye out for my emails</h2>\n",
" <span style=\"color:#f71;\">I push updates to the code regularly. As people ask questions, I add more examples or improved commentary. As a result, you'll notice that the code below isn't identical to the videos. Everything from the videos is here; but I've also added better explanations and new models like DeepSeek. Consider this like an interactive book.<br/><br/>\n",
" I try to send emails regularly with important updates related to the course. You can find this in the 'Announcements' section of Udemy in the left sidebar. You can also choose to receive my emails via your Notification Settings in Udemy. I'm respectful of your inbox and always try to add value with my emails!\n",
" </span>\n",
" </td>\n",
" </tr>\n",
"</table>\n",
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#181;\">Business value of these exercises</h2>\n",
" <span style=\"color:#181;\">A final thought. While I've designed these notebooks to be educational, I've also tried to make them enjoyable. We'll do fun things like have LLMs tell jokes and argue with each other. But fundamentally, my goal is to teach skills you can apply in business. I'll explain business implications as we go, and it's worth keeping this in mind: as you build experience with models and techniques, think of ways you could put this into action at work today. Please do contact me if you'd like to discuss more or if you have ideas to bounce off me.</span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from webdriver_manager.chrome import ChromeDriverManager\n",
"import time\n",
"import random\n",
"from urllib import robotparser\n",
"from urllib.parse import urlparse\n",
"\n",
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
]
},
{
"cell_type": "markdown",
"id": "6900b2a8-6384-4316-8aaa-5e519fca4254",
"metadata": {},
"source": [
"# Connecting to OpenAI (or Ollama)\n",
"\n",
"The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI. \n",
"\n",
"If you'd like to use free Ollama instead, please see the README section \"Free Alternative to Paid APIs\", and if you're not sure how to do this, there's a full solution in the solutions folder (day1_with_ollama.ipynb).\n",
"\n",
"## Troubleshooting if you have problems:\n",
"\n",
"Head over to the [troubleshooting](troubleshooting.ipynb) notebook in this folder for step by step code to identify the root cause and fix it!\n",
"\n",
"If you make a change, try restarting the \"Kernel\" (the python process sitting behind this notebook) by Kernel menu >> Restart Kernel and Clear Outputs of All Cells. Then try this notebook again, starting at the top.\n",
"\n",
"Or, contact me! Message me or email ed@edwarddonner.com and we will get this to work.\n",
"\n",
"Any concerns about API costs? See my notes in the README - costs should be minimal, and you can control it at every point. You can also use Ollama as a free alternative, which we discuss during Day 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()\n",
"\n",
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
]
},
{
"cell_type": "markdown",
"id": "442fc84b-0815-4f40-99ab-d9a5da6bda91",
"metadata": {},
"source": [
"# Let's make a quick call to a Frontier model to get started, as a preview!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a58394bf-1e45-46af-9bfd-01e24da6f49a",
"metadata": {},
"outputs": [],
"source": [
"# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n",
"\n",
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\":\"user\", \"content\":message}])\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
"metadata": {},
"source": [
"## OK onwards with our first project"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e793b2-6775-426a-a139-4848291d0463",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object from the given url using the BeautifulSoup library\n",
" \"\"\"\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
"metadata": {},
"outputs": [],
"source": [
"# Let's try one out. Change the website and add print statements to follow along.\n",
"\n",
"ed = Website(\"https://edwarddonner.com\")\n",
"print(ed.title)\n",
"print(ed.text)"
]
},
{
"cell_type": "markdown",
"id": "6a478a0c-2c53-48ff-869c-4d08199931e1",
"metadata": {},
"source": [
"## Types of prompts\n",
"\n",
"You may know this already - but if not, you will get very familiar with it!\n",
"\n",
"Models like GPT4o have been trained to receive instructions in a particular way.\n",
"\n",
"They expect to receive:\n",
"\n",
"**A system prompt** that tells them what task they are performing and what tone they should use\n",
"\n",
"**A user prompt** -- the conversation starter that they should reply to"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
"metadata": {},
"outputs": [],
"source": [
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
"\n",
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
"and provides a short summary, ignoring text that might be navigation related. \\\n",
"Respond in markdown.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
"metadata": {},
"outputs": [],
"source": [
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a short summary of this website in markdown. \\\n",
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
"metadata": {},
"outputs": [],
"source": [
"print(user_prompt_for(ed))"
]
},
{
"cell_type": "markdown",
"id": "ea211b5f-28e1-4a86-8e52-c0b7677cadcc",
"metadata": {},
"source": [
"## Messages\n",
"\n",
"The API from OpenAI expects to receive messages in a particular structure.\n",
"Many of the other APIs share this structure:\n",
"\n",
"```python\n",
"[\n",
" {\"role\": \"system\", \"content\": \"system message goes here\"},\n",
" {\"role\": \"user\", \"content\": \"user message goes here\"}\n",
"]\n",
"```\n",
"To give you a preview, the next 2 cells make a rather simple call - we won't stretch the mighty GPT (yet!)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f25dcd35-0cd0-4235-9f64-ac37ed9eaaa5",
"metadata": {},
"outputs": [],
"source": [
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
" {\"role\": \"user\", \"content\": \"What is 2 + 2?\"}\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21ed95c5-7001-47de-a36d-1d6673b403ce",
"metadata": {},
"outputs": [],
"source": [
"# To give you a preview -- calling OpenAI with system and user messages:\n",
"\n",
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
"print(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
"metadata": {},
"source": [
"## And now let's build useful messages for GPT-4o-mini, using a function"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
"metadata": {},
"outputs": [],
"source": [
"# See how this function creates exactly the format above\n",
"\n",
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36478464-39ee-485c-9f3f-6a4e458dbc9c",
"metadata": {},
"outputs": [],
"source": [
"# Try this out, and then try for a few more websites\n",
"\n",
"messages_for(ed)"
]
},
{
"cell_type": "markdown",
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
"metadata": {},
"source": [
"## Time to bring it together - the API for OpenAI is very simple!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
"metadata": {},
"outputs": [],
"source": [
"# And now: call the OpenAI API. You will get very familiar with this!\n",
"\n",
"def summarize(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = messages_for(website)\n",
" )\n",
" return response.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
"metadata": {},
"outputs": [],
"source": [
"summarize(\"https://edwarddonner.com\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
"metadata": {},
"outputs": [],
"source": [
"# A function to display this nicely in the Jupyter output, using markdown\n",
"\n",
"def display_summary(url):\n",
" summary = summarize(url)\n",
" display(Markdown(summary))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://edwarddonner.com\")"
]
},
{
"cell_type": "markdown",
"id": "b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624",
"metadata": {},
"source": [
"# Let's try more websites\n",
"\n",
"Note that this will only work on websites that can be scraped using this simplistic approach.\n",
"\n",
"Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n",
"\n",
"Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n",
"\n",
"But many websites will work just fine!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://cnn.com\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://anthropic.com\")"
]
},
{
"cell_type": "markdown",
"id": "c951be1a-7f1b-448f-af1f-845978e47e2c",
"metadata": {},
"source": [
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#181;\">Business applications</h2>\n",
" <span style=\"color:#181;\">In this exercise, you experienced calling the Cloud API of a Frontier Model (a leading model at the frontier of AI) for the first time. We will be using APIs like OpenAI at many stages in the course, in addition to building our own LLMs.\n",
"\n",
"More specifically, we've applied this to Summarization - a classic Gen AI use case to make a summary. This can be applied to any business vertical - summarizing the news, summarizing financial performance, summarizing a resume in a cover letter - the applications are limitless. Consider how you could apply Summarization in your business, and try prototyping a solution.</span>\n",
" </td>\n",
" </tr>\n",
"</table>\n",
"\n",
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#900;\">Before you continue - now try yourself</h2>\n",
" <span style=\"color:#900;\">Use the cell below to make your own simple commercial example. Stick with the summarization use case for now. Here's an idea: write something that will take the contents of an email, and will suggest an appropriate short subject line for the email. That's the kind of feature that might be built into a commercial email tool.</span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Create your prompts\n",
"\n",
"system_prompt = \"\"\"\n",
"You are an assistant that creates short clear concise and relevant email \n",
"subject lines based on the content of the email\n",
"\"\"\"\n",
"user_prompt = \"\"\"\n",
"Hi team,\n",
"\n",
"Just a quick update on our Q2 progress. Weve exceeded our sales goals by 15% and customer satisfaction scores are up 10 points from last quarter. Kudos to everyone involved, especially the sales and support teams. Lets keep this momentum going as we head into Q3.\n",
"\n",
"Best,\n",
"Jeannine\n",
"\"\"\"\n",
"\n",
"# Step 2: Make the messages list\n",
"\n",
"messages = [\n",
" {\"role\":\"system\", \"content\":system_prompt},\n",
" {\"role\":\"user\", \"content\":f\"Email:{user_prompt}/n/nGenerate a concise subject line for this email.\"}\n",
"] # fill this in\n",
"\n",
"# Step 3: Call OpenAI\n",
"\n",
"response = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=messages,\n",
" temperature=0.5,\n",
" max_tokens=20\n",
")\n",
"\n",
"# Step 4: print the result\n",
"\n",
"print(\"Suggested subject line: \", response.choices[0].message.content.strip())"
]
},
{
"cell_type": "markdown",
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
"metadata": {},
"source": [
"## An extra exercise for those who enjoy web scraping\n",
"\n",
"You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. If you have experience with Selenium, Playwright or similar, then feel free to improve the Website class to use them. In the community-contributions folder, you'll find an example Selenium solution from a student (thank you!)"
]
},
{
"cell_type": "markdown",
"id": "eeab24dc-5f90-4570-b542-b0585aca3eb6",
"metadata": {},
"source": [
"# Sharing your code\n",
"\n",
"I'd love it if you share your code afterwards so I can share it with others! You'll notice that some students have already made changes (including a Selenium implementation) which you will find in the community-contributions folder. If you'd like add your changes to that folder, submit a Pull Request with your new versions in that folder and I'll merge your changes.\n",
"\n",
"If you're not an expert with git (and I am not!) then GPT has given some nice instructions on how to submit a Pull Request. It's a bit of an involved process, but once you've done it once it's pretty clear. As a pro-tip: it's best if you clear the outputs of your Jupyter notebooks (Edit >> Clean outputs of all cells, and then Save) for clean notebooks.\n",
"\n",
"Here are good instructions courtesy of an AI friend: \n",
"https://chatgpt.com/share/677a9cb5-c64c-8012-99e0-e06e88afd293"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4484fcf-8b39-4c3f-9674-37970ed71988",
"metadata": {},
"outputs": [],
"source": [
"# A modified class to fetch and parse fully rendered pages: with ethically reduced CAPTCHA events\n",
"class Website:\n",
"\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object from the given url using Selenium and BeautifulSoup.\n",
" Render JavaScript content and extract text from the page.\n",
" \"\"\"\n",
" self.url = url\n",
"\n",
" if not self._is_allowed_by_robots(url):\n",
" print(f\"Warning: robots.txt does not explicitly allow webscraping of {url}. Proceeding anyway.\")\n",
" self.text, self.title = self._scrape_content()\n",
"\n",
" # Check robots.txt if scraping is allowed\n",
" def _is_allowed_by_robots(self, url, user_agent=\"*\"):\n",
" parsed = urlparse(url)\n",
" robots_url = f\"{parsed.scheme}://{parsed.netloc}/robots.txt\"\n",
" rp = urllib.robotparser.RobotFileParser()\n",
" rp.set_url(robots_url)\n",
" try:\n",
" rp.read()\n",
" return rp.can_fetch(user_agent, url)\n",
" except Exception:\n",
" # If robots.txt is unreachable, assume permissable\n",
" return True\n",
"\n",
" def _scrape_content(self, retries=3, wait_base=5):\n",
" # List of user agents for rotation\n",
" user_agents = [\n",
" \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\",\n",
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15\",\n",
" \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0\"\n",
" ]\n",
" # Rotate user agents infrequently\n",
" selected_agent = random.choice(user_agents)\n",
" \n",
" # Set up headless Chrome options\n",
" options = Options()\n",
" options.add_argument(\"--headless=new\")\n",
" options.add_argument(\"--disable-gpu\")\n",
" options.add_argument(\"--no-sandbox\")\n",
" options.add_argument(\"--disable-dev-shm-usage\")\n",
" options.add_argument(f\"user-agent={selected_agent}\")\n",
"\n",
" # Try to bypass anti-bot protections with exponential backoff\n",
" for attempt in range(retries):\n",
" try:\n",
" # Start browser\n",
" service = Service(ChromeDriverManager().install())\n",
" driver = webdriver.Chrome(service=service, options=options)\n",
" driver.set_page_load_timeout(30)\n",
" driver.get(self.url)\n",
"\n",
" # Mimick human browsing behavior with random time delay, without overloading the server\n",
" time.sleep(random.uniform(6, 12))\n",
" \n",
" # Get the page source after rendering\n",
" soup = BeautifulSoup(driver.page_source, 'html.parser')\n",
" driver.quit()\n",
"\n",
" for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
" tag.decompose()\n",
" \n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n",
" body = soup.body\n",
" text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n",
"\n",
" return text, title\n",
" \n",
" except Exception as e:\n",
" # Exponential backoff to avoid retry spamming on failure\n",
" time.sleep(wait_base * (2 ** attempt)) \n",
" continue\n",
"\n",
" raise Exception(\"Failed to retrieve content despite retries.\")\n",
"\n",
"\n",
"rendered_page = Website(\"https://openai.com\")\n",
"print(\"\\nTitle: \", rendered_page.title)\n",
"print(\"\\nText: \", rendered_page.text, \"\\n\")\n",
"#print(\"\\nUser prompt: \", user_prompt_for(rendered_page), \"\\n\")\n",
"#messages_for(rendered_page)\n",
"#summarize(\"https://openai.com\")\n",
"\n",
"display_summary(\"https://openai.com\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "781119a4-844c-4e03-84bd-8b8f2200d86c",
"metadata": {},
"outputs": [],
"source": [
"# With Async for multiple page scraping: using Selenium and Jupyter Labs\n",
"import nest_asyncio # Required for Jupyter notebook\n",
"import asyncio\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"\n",
"# Async-safe wrapper for multiple URLs: because Selenium is synchronous\n",
"def scrape_sync(url):\n",
" try:\n",
" page = Website(url)\n",
" return {\n",
" \"url\": url,\n",
" \"title\": page.title,\n",
" \"text\": page.text,\n",
" \"summary\": display_summary(url)\n",
" }\n",
" except Exception as e:\n",
" return {\n",
" \"url\": url,\n",
" \"error\": str(e)\n",
" }\n",
"\n",
"\n",
"# Async runner for multiple URLs\n",
"async def scrape_multiple_async(urls, max_workers=4):\n",
" loop = asyncio.get_running_loop()\n",
" with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
" futures = [\n",
" loop.run_in_executor(executor, scrape_sync, url)\n",
" for url in urls\n",
" ]\n",
" return await asyncio.gather(*futures)\n",
"\n",
"\n",
"# Example async usage\n",
"if __name__ == \"__main__\":\n",
" urls_to_scrape = [\n",
" \"https://www.investopedia.com/articles/active-trading/111115/why-all-worlds-top-10-companies-are-american.asp\",\n",
" \"https://fortune.com/ranking/global500/\",\n",
" \"http://en.wikipedia.org/wiki/List_of_largest_corporate_profits_and_losses\",\n",
" ]\n",
"\n",
" async def run():\n",
" results = await scrape_multiple_async(urls_to_scrape)\n",
" for res in results:\n",
" print(f\"\\nURL: {res.get('url')}\")\n",
" print(f\"Title: {res.get('title', 'N/A')}\")\n",
" print(f\"Preview:\\n{res.get('text', res.get('error', 'No content'))}\\n\")\n",
"\n",
" # Jupyter notebook already has a running event loop: asyncio.run() cannot be called from a running event loop\n",
" nest_asyncio.apply()\n",
" await run()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32fa56f2-f78e-421f-b35e-77fb9608d652",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
"metadata": {},
"outputs": [],
"source": [
"!pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb6636be-e43f-4896-aadd-cafda003ed4e",
"metadata": {},
"outputs": [],
"source": [
"!pip install -q -U google-genai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfe66209-1d33-4292-80f1-20e11baf4bc3",
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"from google import genai\n",
"from google.genai import types\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2b4306c-17d0-46fe-a889-7440ff809dc6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"#load env\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('GEMINI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "markdown",
"id": "08ec6fec-886c-4a0c-a046-e8643ad700d3",
"metadata": {},
"source": [
"# Lets make a simple call for check our model is working fine or not"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89143d5c-0013-4f7e-8e1f-f7db7e936f0d",
"metadata": {},
"outputs": [],
"source": [
"client = genai.Client(api_key=api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1144b77a-6785-479a-ab4f-bb0ab5624b49",
"metadata": {},
"outputs": [],
"source": [
"\n",
"response = client.models.generate_content(\n",
" model=\"gemini-2.5-flash-preview-05-20\",\n",
" contents=[\"hi gemini\"]\n",
")\n",
"print(response.text)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbf3836c-19b8-44e1-904a-f265925c2786",
"metadata": {},
"outputs": [],
"source": [
"\n",
"class Website:\n",
" def __init__(self, url, driver_path=None, wait_time=3):\n",
" self.url = url\n",
" self.wait_time = wait_time\n",
"\n",
" # Headless Chrome settings\n",
" options = Options()\n",
" # options.add_argument(\"--headless\") \n",
" # Headless mode runs the browser in the background (invisible).\n",
" # However, some websites (like openai.com) block headless browsers.\n",
" # So if this line is active, the page may not load correctly and you may not get the full content.\n",
" options.add_argument(\"--disable-gpu\")\n",
" options.add_argument(\"--no-sandbox\")\n",
" options.add_argument(\"--window-size=1920x1080\")\n",
"\n",
" # Driver path\n",
" if driver_path:\n",
" service = Service(executable_path=driver_path)\n",
" else:\n",
" service = Service() \n",
"\n",
" # Start browser\n",
" driver = webdriver.Chrome(service=service, options=options)\n",
" driver.get(url)\n",
"\n",
" # Wait for the loading page\n",
" time.sleep(self.wait_time)\n",
"\n",
" # Take page source\n",
" html = driver.page_source\n",
" driver.quit()\n",
"\n",
" # Analysis with BeautifulSoup \n",
" soup = BeautifulSoup(html, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
"\n",
" # Clean irrelevant tags\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
"\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are an academic research assistant specialized in summarizing scholarly papers. Follow this workflow rigorously:\n",
"\n",
"Step 1: Document Verification\n",
"Verify if the input is a research paper by checking for:\n",
"\n",
"Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)\n",
"\n",
"Technical/scholarly language\n",
"\n",
"Citations (in-text or bibliography)\n",
"\n",
"Research claims or data analysis\n",
"If NOT a research paper:\n",
"→ Respond: \"This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization.\"\n",
"\n",
"Step 2: Structured Summary (If verified)\n",
"Generate a 5-section summary in this exact format:\n",
"\n",
"1. Research Question\n",
"[Identify core problem/gap addressed in 1 sentence]\n",
"\n",
"2. Methodology\n",
"[Study design, data sources, analytical techniques in 2 bullet points]\n",
"\n",
"3. Key Findings\n",
"[3-4 quantified results with numerical evidence from tables/figures]\n",
"\n",
"4. Limitations\n",
"[2 major constraints acknowledged by authors]\n",
"\n",
"5. Significance\n",
"[Impact on field & practical implications in 1 sentence]\n",
"\n",
"Critical Rules:\n",
"Accuracy Priority: Never invent data. Write \"Not specified\" for missing elements\n",
"\n",
"Source Anchoring: Cite page/paragraph numbers for claims (e.g., \"Fig 3 shows 24% improvement\")\n",
"\n",
"Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline\n",
"\n",
"Bias Alert: Flag any undeclared funding/sponsorship conflicts\n",
"\n",
"Output Format: Strict Markdown with section headers, 200-word maximum\n",
"\n",
"Example Output:\n",
"1. Research Question\n",
"How does microplastic concentration affect zebrafish neural development?\n",
"\n",
"2. Methodology\n",
"\n",
"Exposed embryos to 0.1-10μm PET particles (5-100mg/L) for 96h\n",
"\n",
"Quantified gene expression (RT-qPCR) and behavioral assays (Open Field Test)\n",
"\n",
"3. Key Findings\n",
"▲ 40% reduction in neuron count at 50mg/L exposure (p<0.01, Fig 2B)\n",
"■ 2.3x increase in anxiolytic behavior (Table 3)\n",
"▼ 17% downregulation in shha expression (p=0.03)\n",
"\n",
"4. Limitations\n",
" \n",
"Used static exposure vs dynamic aquatic environments\n",
"\n",
"Limited proteomic validation\n",
"\n",
"5. Significance\n",
"Establishes dose-dependent neurotoxicity thresholds for aquatic toxicology regulations.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
"metadata": {},
"outputs": [],
"source": [
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a summary of this website in markdown.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4257406-089b-45a3-bfb5-272004360a49",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url):\n",
" website = Website(url)\n",
" response = client.models.generate_content(\n",
" model=\"gemini-2.5-flash-preview-05-20\",\n",
" config=types.GenerateContentConfig(\n",
" system_instruction=system_prompt),\n",
" contents=user_prompt_for(website)\n",
" )\n",
"\n",
" return response.text\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f68b32ae-9e65-4aa4-ae8d-cc2482c4a2e2",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
" summary = summarize(url)\n",
" display(Markdown(summary))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae52543c-01c1-4262-b53c-95ef4e5a93aa",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://onlinelibrary.wiley.com/doi/full/10.1155/2021/8812542\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,626 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a98030af-fcd1-4d63-a36e-38ba053498fa",
"metadata": {},
"source": [
"# A full business solution\n",
"\n",
"## Now we will take our project from Day 1 to the next level\n",
"\n",
"### BUSINESS CHALLENGE:\n",
"\n",
"Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits.\n",
"\n",
"We will be provided a company name and their primary website.\n",
"\n",
"See the end of this notebook for examples of real-world business applications.\n",
"\n",
"And remember: I'm always available if you have problems or ideas! Please do reach out."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5b08506-dc8b-4443-9201-5f1848161363",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt\n",
"\n",
"import os\n",
"import requests\n",
"import json\n",
"from typing import List\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from webdriver_manager.chrome import ChromeDriverManager\n",
"from urllib.parse import urlparse, urljoin\n",
"import time\n",
"import random\n",
"import concurrent.futures"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc5d8880-f2ee-4c06-af16-ecbc0262af61",
"metadata": {},
"outputs": [],
"source": [
"# Initialize and constants\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n",
" print(\"API key looks good so far\")\n",
"else:\n",
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
" \n",
"MODEL = 'gpt-4o-mini'\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "106dd65e-90af-4ca8-86b6-23a41840645b",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
" \"\"\"\n",
" A utility class to represent a Website that we have scraped, now with links\n",
" \"\"\"\n",
"\n",
" def __init__(self, url):\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" self.body = response.content\n",
" soup = BeautifulSoup(self.body, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" if soup.body:\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
" else:\n",
" self.text = \"\"\n",
" links = [link.get('href') for link in soup.find_all('a')]\n",
" self.links = [link for link in links if link]\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
"\n",
"\n",
"# A modified class to fetch and parse fully rendered pages\n",
"class NewWebsite:\n",
" shared_driver = None # Class variable to share browser instance\n",
"\n",
" def __init__(self, url, driver=None):\n",
" self.url = url\n",
" self.driver = driver or NewWebsite._get_shared_driver()\n",
" self.text, self.title, self.links = self._scrape_content()\n",
" \n",
" @classmethod\n",
" def _get_shared_driver(cls):\n",
" if cls.shared_driver is None:\n",
" # Set up headless Chrome options\n",
" options = Options()\n",
" options.add_argument(\"--headless=new\")\n",
" options.add_argument(\"--disable-gpu\")\n",
" options.add_argument(\"--no-sandbox\")\n",
" options.add_argument(\"--disable-dev-shm-usage\")\n",
" options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n",
"\n",
" service = Service(ChromeDriverManager().install())\n",
" cls.shared_driver = webdriver.Chrome(service=service, options=options)\n",
" return cls.shared_driver\n",
"\n",
" def _scrape_content(self):\n",
" try:\n",
" self.driver.get(self.url)\n",
" # Mimick human browsing behavior without overloading the server\n",
" WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n",
" # Allow JS-rendered content to settle\n",
" time.sleep(2)\n",
"\n",
" # Get the page source after rendering\n",
" soup = BeautifulSoup(self.driver.page_source, \"html.parser\")\n",
" \n",
" for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
" tag.decompose()\n",
" \n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n",
" body = soup.body\n",
" text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n",
"\n",
" # Extract and clean links\n",
" links = []\n",
" for link_tag in soup.find_all(\"a\", href=True):\n",
" href = link_tag[\"href\"].strip()\n",
" if href and not href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n",
" full_url = urljoin(self.url, href)\n",
" links.append(full_url)\n",
" \n",
" return text, title, links\n",
" \n",
" except Exception as e:\n",
" return \"Error loading content\", \"Error\", []\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
"\n",
" # Close the driver\n",
" @classmethod\n",
" def close_driver(cls):\n",
" if cls.shared_driver:\n",
" cls.shared_driver.quit()\n",
" cls.shared_driver = None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e30d8128-933b-44cc-81c8-ab4c9d86589a",
"metadata": {},
"outputs": [],
"source": [
"cardiff = NewWebsite(\"https://cardiff.co/\")\n",
"cardiff.links"
]
},
{
"cell_type": "markdown",
"id": "1771af9c-717a-4fca-bbbe-8a95893312c3",
"metadata": {},
"source": [
"## First step: Have GPT-4o-mini figure out which links are relevant\n",
"\n",
"### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON. \n",
"It should decide which links are relevant, and replace relative links such as \"/about\" with \"https://company.com/about\". \n",
"We will use \"one shot prompting\" in which we provide an example of how it should respond in the prompt.\n",
"\n",
"This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!\n",
"\n",
"Sidenote: there is a more advanced technique called \"Structured Outputs\" in which we require the model to respond according to a spec. We cover this technique in Week 8 during our autonomous Agentic AI project."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6957b079-0d96-45f7-a26a-3487510e9b35",
"metadata": {},
"outputs": [],
"source": [
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
"link_system_prompt += \"\"\"\n",
"{\n",
" \"links\": [\n",
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
" {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
" ]\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b97e4068-97ed-4120-beae-c42105e4d59a",
"metadata": {},
"outputs": [],
"source": [
"print(link_system_prompt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e1f601b-2eaf-499d-b6b8-c99050c9d6b3",
"metadata": {},
"outputs": [],
"source": [
"def get_links_user_prompt(website):\n",
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
" user_prompt += \"Links (some might be relative links):\\n\"\n",
" user_prompt += \"\\n\".join(website.links)\n",
" return user_prompt\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bcbfa78-6395-4685-b92c-22d592050fd7",
"metadata": {},
"outputs": [],
"source": [
"print(get_links_user_prompt(cardiff))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a29aca19-ca13-471c-a4b4-5abbfa813f69",
"metadata": {},
"outputs": [],
"source": [
"def get_links(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" result = response.choices[0].message.content\n",
" return json.loads(result)"
]
},
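{
 "cell_type": "markdown",
 "id": "structured-outputs-sketch-md",
 "metadata": {},
 "source": [
  "As the sidenote above mentions, \"Structured Outputs\" can enforce the JSON spec instead of one-shot prompting. Here is a minimal sketch, assuming a recent `openai` SDK (with `beta.chat.completions.parse`) and `pydantic` installed; the `Link`/`LinkList` models are illustrative names, not part of this notebook:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "structured-outputs-sketch-code",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Minimal Structured Outputs sketch - not used by the rest of this notebook.\n",
  "from pydantic import BaseModel\n",
  "\n",
  "class Link(BaseModel):\n",
  "    type: str\n",
  "    url: str\n",
  "\n",
  "class LinkList(BaseModel):\n",
  "    links: list[Link]\n",
  "\n",
  "completion = openai.beta.chat.completions.parse(\n",
  "    model=MODEL,\n",
  "    messages=[\n",
  "        {\"role\": \"system\", \"content\": link_system_prompt},\n",
  "        {\"role\": \"user\", \"content\": get_links_user_prompt(cardiff)}\n",
  "    ],\n",
  "    response_format=LinkList,\n",
  ")\n",
  "completion.choices[0].message.parsed"
 ]
},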
{
"cell_type": "code",
"execution_count": null,
"id": "74a827a0-2782-4ae5-b210-4a242a8b4cc2",
"metadata": {},
"outputs": [],
"source": [
"# Anthropic has made their site harder to scrape, so I'm using HuggingFace..\n",
"\n",
"huggingface = Website(\"https://huggingface.co\")\n",
"huggingface.links"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3d583e2-dcc4-40cc-9b28-1e8dbf402924",
"metadata": {},
"outputs": [],
"source": [
"get_links(\"https://cardiff.co\")"
]
},
{
"cell_type": "markdown",
"id": "0d74128e-dfb6-47ec-9549-288b621c838c",
"metadata": {},
"source": [
"## Second step: make the brochure!\n",
"\n",
"Assemble all the details into another prompt to GPT4-o"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85a5b6e2-e7ef-44a9-bc7f-59ede71037b5",
"metadata": {},
"outputs": [],
"source": [
"def get_all_details(url):\n",
" result = \"Landing page:\\n\"\n",
" result += Website(url).get_contents()\n",
" links = get_links(url)\n",
" print(\"Found links:\", links)\n",
" for link in links[\"links\"]:\n",
" result += f\"\\n\\n{link['type']}\\n\"\n",
" result += Website(link[\"url\"]).get_contents()\n",
" return result\n",
"\n",
"def get_all_details_rendered(url):\n",
" result = \"Landing page:\\n\"\n",
" result += NewWebsite(url).get_contents()\n",
" links = get_links(url)\n",
" print(\"Found links:\", links)\n",
"\n",
" for link in links[\"links\"]:\n",
" result += f\"\\n\\n{link['type']}\\n\"\n",
" result += NewWebsite(link[\"url\"]).get_contents()\n",
"\n",
" # Important: close browser after all scraping is done\n",
" NewWebsite.close_driver()\n",
" return result\n",
"\n",
"def scrape_link(link):\n",
" try:\n",
" page = NewWebsite(link[\"url\"])\n",
" return f\"\\n\\n{link['type']}\\n{page.get_contents()}\"\n",
" except Exception as e:\n",
" return f\"\\n\\n{link['type']}\\nError loading page: {e}\"\n",
"\n",
"# Threaded scraper for linked pages\n",
"def get_all_details_rendered_concurrently(url):\n",
" result = \"Landing page:\\n\"\n",
" result += NewWebsite(url).get_contents()\n",
"\n",
" # LLM-filtered link generator\n",
" links = get_links(url)\n",
" print(\"Found links:\", links)\n",
"\n",
" with concurrent.futures.ThreadPoolExecutor() as executor:\n",
" future_to_link = {executor.submit(scrape_link, link): link for link in links[\"links\"]}\n",
" for future in concurrent.futures.as_completed(future_to_link):\n",
" result += future.result()\n",
"\n",
" # Close shared browser\n",
" NewWebsite.close_driver()\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5099bd14-076d-4745-baf3-dac08d8e5ab2",
"metadata": {},
"outputs": [],
"source": [
"print(get_all_details_rendered_concurrently(\"https://cardiff.co\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b863a55-f86c-4e3f-8a79-94e24c1a8cf2",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
"and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
"Include details of company culture, customers and careers/jobs if you have the information.\"\n",
"\n",
"# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':\n",
"\n",
"# system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
"# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
"# Include details of company culture, customers and careers/jobs if you have the information.\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ab83d92-d36b-4ce0-8bcc-5bb4c2f8ff23",
"metadata": {},
"outputs": [],
"source": [
"def get_brochure_user_prompt(company_name, url):\n",
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
" #user_prompt += get_all_details(url)\n",
" user_prompt += get_all_details_rendered_concurrently(url)\n",
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd909e0b-1312-4ce2-a553-821e795d7572",
"metadata": {},
"outputs": [],
"source": [
"get_brochure_user_prompt(\"Cardiff\", \"https://cardiff.co\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e44de579-4a1a-4e6a-a510-20ea3e4b8d46",
"metadata": {},
"outputs": [],
"source": [
"def create_brochure(company_name, url):\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
" ],\n",
" )\n",
" result = response.choices[0].message.content\n",
" display(Markdown(result))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e093444a-9407-42ae-924a-145730591a39",
"metadata": {},
"outputs": [],
"source": [
"create_brochure(\"Cardiff\", \"https://cardiff.co\")"
]
},
{
"cell_type": "markdown",
"id": "61eaaab7-0b47-4b29-82d4-75d474ad8d18",
"metadata": {},
"source": [
"## Finally - a minor improvement\n",
"\n",
"With a small adjustment, we can change this so that the results stream back from OpenAI,\n",
"with the familiar typewriter animation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51db0e49-f261-4137-aabe-92dd601f7725",
"metadata": {},
"outputs": [],
"source": [
"def stream_brochure(company_name, url):\n",
" stream = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
" ],\n",
" stream=True\n",
" )\n",
" \n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56bf0ae3-ee9d-4a72-9cd6-edcac67ceb6d",
"metadata": {},
"outputs": [],
"source": [
"stream_brochure(\"Cardiff\", \"https://cardiff.co\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdb3f8d8-a3eb-41c8-b1aa-9f60686a653b",
"metadata": {},
"outputs": [],
"source": [
"# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:\n",
"\n",
"stream_brochure(\"HuggingFace\", \"https://huggingface.co\")"
]
},
{
"cell_type": "markdown",
"id": "a27bf9e0-665f-4645-b66b-9725e2a959b5",
"metadata": {},
"source": [
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#181;\">Business applications</h2>\n",
" <span style=\"color:#181;\">In this exercise we extended the Day 1 code to make multiple LLM calls, and generate a document.\n",
"\n",
"This is perhaps the first example of Agentic AI design patterns, as we combined multiple calls to LLMs. This will feature more in Week 2, and then we will return to Agentic AI in a big way in Week 8 when we build a fully autonomous Agent solution.\n",
"\n",
"Generating content in this way is one of the very most common Use Cases. As with summarization, this can be applied to any business vertical. Write marketing content, generate a product tutorial from a spec, create personalized email content, and so much more. Explore how you can apply content generation to your business, and try making yourself a proof-of-concept prototype. See what other students have done in the community-contributions folder -- so many valuable projects -- it's wild!</span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"id": "14b2454b-8ef8-4b5c-b928-053a15e0d553",
"metadata": {},
"source": [
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#900;\">Before you move to Week 2 (which is tons of fun)</h2>\n",
" <span style=\"color:#900;\">Please see the week1 EXERCISE notebook for your challenge for the end of week 1. This will give you some essential practice working with Frontier APIs, and prepare you well for Week 2.</span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"id": "17b64f0f-7d33-4493-985a-033d06e8db08",
"metadata": {},
"source": [
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#f71;\">A reminder on 3 useful resources</h2>\n",
" <span style=\"color:#f71;\">1. The resources for the course are available <a href=\"https://edwarddonner.com/2024/11/13/llm-engineering-resources/\">here.</a><br/>\n",
" 2. I'm on LinkedIn <a href=\"https://www.linkedin.com/in/eddonner/\">here</a> and I love connecting with people taking the course!<br/>\n",
" 3. I'm trying out X/Twitter and I'm at <a href=\"https://x.com/edwarddonner\">@edwarddonner<a> and hoping people will teach me how it's done.. \n",
" </span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"id": "6f48e42e-fa7a-495f-a5d4-26bfc24d60b6",
"metadata": {},
"source": [
"<table style=\"margin: 0; text-align: left;\">\n",
" <tr>\n",
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
" <img src=\"../thankyou.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
" </td>\n",
" <td>\n",
" <h2 style=\"color:#090;\">Finally! I have a special request for you</h2>\n",
" <span style=\"color:#090;\">\n",
" My editor tells me that it makes a MASSIVE difference when students rate this course on Udemy - it's one of the main ways that Udemy decides whether to show it to others. If you're able to take a minute to rate this, I'd be so very grateful! And regardless - always please reach out to me at ed@edwarddonner.com if I can help at any point.\n",
" </span>\n",
" </td>\n",
" </tr>\n",
"</table>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8d3e1a1-ba54-4907-97c5-30f89a24775b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,104 @@
from openai import OpenAI
from dotenv import load_dotenv
import os
import pypdf
class ResumeBasedJobRecommendation:
def __init__(self, path: str):
self.resume_path = path
# method to read the content from the resume and use it for the user prompt
def read_resume(self):
"""method to read the content from the resume and use it for the user prompt.
Returns:
content (str): returns the content of the resume.
"""
        try:
            pdfreader = pypdf.PdfReader(self.resume_path)
            data = ""
            # Extract the text from every page of the resume
            for page in pdfreader.pages:
                data += page.extract_text()
        except FileNotFoundError as e:
            print(f"Issue with the resume file path: {str(e)}")
            return
        except Exception as e:
            print(f"Couldn't parse the PDF: {str(e)}")
            return
        return data
def message_prompt(self, data: str, job_sites: list, location: str):
"""method suggests the appropriate job roles and provides the search link from job sites based on users input of resume data, job boards and location.
Args:
data (str): resume content for user prompt
job_sites (list): job searching sites for user prompt
location (str): location of job search
Returns:
            content (str): Provides a summary of the resume with suggested job roles and links, using the gpt-4o-mini model.
"""
self.message = [
{"role": "system",
"content": "You are an assistant that analysizes the resume data and summarize it. \
Based on the summarization, you suggest the appropriate job roles \
and provide the appropriate job search links for each suggested roles from the job sites based on filtering by the \
location provided. "
},
{
"role": "user",
"content": f"Below is my resume content, kindly look for the appropriate job openings in \
{job_sites} for location {location}:\n{data}"
}]
self.response = openai.chat.completions.create(model='gpt-4o-mini', messages=self.message)
return self.response.choices[0].message.content
if __name__ == '__main__':
# load the api key from .env and check if it is valid.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
    if api_key is None:
        print("No API key was found.")
        exit()
    elif not api_key.startswith('sk-proj-'):
        print("An API key was found, but it doesn't match the OpenAI key pattern starting with sk-proj-. Please check it.")
        exit()
    elif api_key.strip() != api_key:
        print("The API key looks good, but it seems to have whitespace at the start or end. Please remove it.")
        exit()
    else:
        print("API key found and looks good.")
openai = OpenAI()
    # Provide a valid resume path
    file_path = input("Please enter the resume path:\n")
    if not file_path:
        print("No resume path was provided. Please provide a valid path.")
exit()
obj = ResumeBasedJobRecommendation(file_path)
data = obj.read_resume()
    if data:
        # Provide the job sites to search and a valid job location
job_sites = input("Enter the job sites with space between each other: ")
if not job_sites:
print("Didn't provided the job sites to search for. Going with Linkedin, Indeed, Glassdoor and Naukri as defaults.")
job_sites = ['LinkedIn', 'Indeed', 'Naukri', 'Glassdoor']
else:
job_sites = job_sites.split(' ')
location = input("Enter the job location:")
if not location:
print("No location has been provided. Default will consider as United States.")
location = 'United States'
response = obj.message_prompt(data, job_sites, location)
print(response)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,349 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5",
"metadata": {},
"source": [
"# End of week 1 exercise\n",
"\n",
"To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question, \n",
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1070317-3ed9-4659-abe3-828943230e03",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"import json\n",
"from typing import List\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display, update_display, Image\n",
"from openai import OpenAI\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from webdriver_manager.chrome import ChromeDriverManager\n",
"from urllib.parse import urlparse, urljoin\n",
"import time\n",
"import random\n",
"import concurrent.futures\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a456906-915a-4bfd-bb9d-57e505c5093f",
"metadata": {},
"outputs": [],
"source": [
"# constants\n",
"\n",
"MODEL = 'gpt-4o-mini'\n",
"openai = OpenAI()\n",
"MODEL_GPT = 'gpt-4o-mini'\n",
"MODEL_LLAMA = 'llama3.2'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8d7923c-5f28-4c30-8556-342d7c8497c1",
"metadata": {},
"outputs": [],
"source": [
"# set up environment\n",
"\n",
"# A modified class to fetch and parse fully rendered pages\n",
"class NewWebsite:\n",
" shared_driver = None # Class variable to share browser instance\n",
"\n",
" def __init__(self, url, driver=None):\n",
" self.url = url\n",
" self.driver = driver or NewWebsite._get_shared_driver()\n",
" self.text, self.title, self.links = self._scrape_content()\n",
" \n",
" @classmethod\n",
" def _get_shared_driver(cls):\n",
" if cls.shared_driver is None:\n",
" # Set up headless Chrome options\n",
" options = Options()\n",
" options.add_argument(\"--headless=new\")\n",
" options.add_argument(\"--disable-gpu\")\n",
" options.add_argument(\"--no-sandbox\")\n",
" options.add_argument(\"--disable-dev-shm-usage\")\n",
" options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n",
"\n",
" service = Service(ChromeDriverManager().install())\n",
" cls.shared_driver = webdriver.Chrome(service=service, options=options)\n",
" return cls.shared_driver\n",
"\n",
" def _scrape_content(self):\n",
" try:\n",
" self.driver.get(self.url)\n",
" # Mimick human browsing behavior without overloading the server\n",
" WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n",
" # Allow JS-rendered content to settle\n",
" time.sleep(2)\n",
"\n",
" # Get the page source after rendering\n",
" soup = BeautifulSoup(self.driver.page_source, \"html.parser\")\n",
" \n",
" for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
" tag.decompose()\n",
" \n",
" title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n",
" body = soup.body\n",
" text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n",
"\n",
" # Extract and clean links\n",
" links = []\n",
" for link_tag in soup.find_all(\"a\", href=True):\n",
" href = link_tag[\"href\"].strip()\n",
" if href and not href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n",
" full_url = urljoin(self.url, href)\n",
" links.append(full_url)\n",
" \n",
" return text, title, links\n",
" \n",
" except Exception as e:\n",
" return \"Error loading content\", \"Error\", []\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
"\n",
" # Close the driver\n",
" @classmethod\n",
" def close_driver(cls):\n",
" if cls.shared_driver:\n",
" cls.shared_driver.quit()\n",
" cls.shared_driver = None\n",
"\n",
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
"link_system_prompt += \"\"\"\n",
"{\n",
" \"links\": [\n",
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
" {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
" ]\n",
"}\n",
"\"\"\"\n",
"\n",
"def get_links_user_prompt(website):\n",
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
" user_prompt += \"Links (some might be relative links):\\n\"\n",
" user_prompt += \"\\n\".join(website.links)\n",
" return user_prompt\n",
"\n",
"def get_links(url):\n",
" website = NewWebsite(url)\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" result = response.choices[0].message.content\n",
" return json.loads(result)\n",
"\n",
"def scrape_link(link):\n",
" try:\n",
" page = NewWebsite(link[\"url\"])\n",
" return f\"\\n\\n{link['type']}\\n{page.get_contents()}\"\n",
" except Exception as e:\n",
" return f\"\\n\\n{link['type']}\\nError loading page: {e}\"\n",
"\n",
"# Threaded scraper for linked pages\n",
"def get_all_details_rendered_concurrently(url):\n",
" result = \"Landing page:\\n\"\n",
" result += NewWebsite(url).get_contents()\n",
"\n",
" # LLM-filtered link generator\n",
" links = get_links(url)\n",
" print(\"Found links:\", links)\n",
"\n",
" with concurrent.futures.ThreadPoolExecutor() as executor:\n",
" future_to_link = {executor.submit(scrape_link, link): link for link in links[\"links\"]}\n",
" for future in concurrent.futures.as_completed(future_to_link):\n",
" result += future.result()\n",
"\n",
" # Close shared browser\n",
" NewWebsite.close_driver()\n",
" return result\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f0d0137-52b0-47a8-81a8-11a90a010798",
"metadata": {},
"outputs": [],
"source": [
"# here is the question; type over this to ask something new\n",
"\n",
"system_prompt = \"You are an LLM Engineer that analyzes the contents of several relevant pages from a company website \\\n",
"rewrites internal tools and systems and rebuilds them end-to-end, starting from scratch. Starting with the online application at cardiff.co/apply, \\\n",
"Tell me why you're best suited to be the lead of this project and work with our 12 year resident developer to implement a \\\n",
"state of the art solution in record time. Include backend architecture, model orchestration, how you handle latency, cost and user experience, \\\n",
"and details of how you would achieve this goal based on company culture and industries served if you have the information, \\\n",
"and walk me through the details like you're explaining it to a sharp product owner. Respond in markdown.\"\\\n",
"\n",
"\n",
"def get_solution_user_prompt(company_name, url):\n",
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a solution to rewrite the company's application in markdown.\\n\"\n",
" #user_prompt += get_all_details(url)\n",
" user_prompt += get_all_details_rendered_concurrently(url)\n",
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
" return user_prompt\n",
"\n",
"def create_solution(company_name, url):\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": get_solution_user_prompt(company_name, url)}\n",
" ],\n",
" )\n",
" result = response.choices[0].message.content\n",
" display(Markdown(result))\n",
"\n",
" return result\n",
"\n",
"#create_solution(\"Cardiff\", \"https://cardiff.co\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60ce7000-a4a5-4cce-a261-e75ef45063b4",
"metadata": {},
"outputs": [],
"source": [
"# Get gpt-4o-mini to answer, with streaming\n",
"\n",
"new_system_prompt = \"You are a Senior Engineer that analyzes the planned solution given to you for a company website \\\n",
"and you rewrite code for rebuilding internal tools and systems end-to-end based on the proposed solutions. \\\n",
"Start with the online application at cardiff.co/apply, use canvas and write code for the proposed solution \\\n",
"in the appropriate language that best suits the task for backend architecture, model orchestration, how you handle latency, cost and user experience wherever possible.\"\n",
"\n",
"output_dir = \"cardiff_rebuild_output\"\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"def save_code_blocks(markdown_text, base_filename=\"cardiff_code\"):\n",
" output_dir = \"cardiff_rebuild_output\"\n",
" os.makedirs(output_dir, exist_ok=True)\n",
" \n",
" code_blocks = re.findall(r\"```(.*?)\\n(.*?)```\", markdown_text, re.DOTALL)\n",
" saved_files = []\n",
"\n",
" for idx, (language, code) in enumerate(code_blocks, 1):\n",
" ext = language.strip() if language else \"txt\"\n",
" filename = f\"{base_filename}_part{idx}.{ext}\"\n",
" filepath = os.path.join(output_dir, filename)\n",
" with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(code)\n",
" saved_files.append(filepath)\n",
"\n",
" return saved_files\n",
"\n",
"def develop_from_proposal(proposal_text, company_name):\n",
" # Stream code generation from GPT-4o\n",
" system = \"You are a senior software engineer. Use the following proposal to generate production-ready code to \\\n",
" implement the backend, frontend, and any orchestration described. Write clean, documented code in markdown format.\"\n",
" \n",
" stream = openai.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system},\n",
" {\"role\": \"user\", \"content\": proposal_text}\n",
" ],\n",
" stream=True\n",
" )\n",
"\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in stream:\n",
" content = chunk.choices[0].delta.content or \"\"\n",
" response += content\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
"\n",
" saved_files = save_code_blocks(response)\n",
" \n",
" # Generate a UI design mockup image\n",
" image_prompt = f\"A modern, mobile-friendly UI wireframe for a business loan application system for {company_name}. Clean layout, input fields for business name, revenue, loan amount, industry, and contact info. Includes a step-by-step progress bar, submit button, and secure branding.\"\n",
" \n",
" img_response = openai.images.generate(\n",
" model=\"dall-e-3\",\n",
" prompt=image_prompt,\n",
" n=1,\n",
" size=\"1024x1024\"\n",
" )\n",
" \n",
" image_url = img_response.data[0].url\n",
" img_path = os.path.join(output_dir, f\"{company_name.lower()}_ui_mockup.png\")\n",
" with open(img_path, 'wb') as handler:\n",
" handler.write(requests.get(image_url).content)\n",
"\n",
" print(\"Code files saved to:\", saved_files)\n",
" print(\"UI mockup saved at:\", img_path)\n",
"\n",
" display(Markdown(\"### Proposed UI Design\"))\n",
" display(Image(url=image_url))\n",
"\n",
"proposal = create_solution(\"Cardiff\", \"https://cardiff.co\")\n",
"develop_from_proposal(proposal, \"Cardiff\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538",
"metadata": {},
"outputs": [],
"source": [
"# Get Llama 3.2 to answer"
]
}
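,
{
 "cell_type": "code",
 "execution_count": null,
 "id": "llama-review-sketch",
 "metadata": {},
 "outputs": [],
 "source": [
  "# A minimal sketch, assuming Ollama is running locally with llama3.2 already pulled.\n",
  "# It points the OpenAI client at Ollama's OpenAI-compatible endpoint and asks\n",
  "# Llama 3.2 to review the proposal generated above.\n",
  "\n",
  "ollama_via_openai = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")\n",
  "\n",
  "llama_response = ollama_via_openai.chat.completions.create(\n",
  "    model=MODEL_LLAMA,\n",
  "    messages=[\n",
  "        {\"role\": \"system\", \"content\": \"You are a pragmatic senior engineer reviewing a technical proposal.\"},\n",
  "        {\"role\": \"user\", \"content\": proposal}\n",
  "    ],\n",
  ")\n",
  "display(Markdown(llama_response.choices[0].message.content))"
 ]
}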
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4dabb31c-a584-4715-9714-9fc9978c3cb5",
"metadata": {},
"outputs": [],
"source": [
"#Get IPL best team"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3bb88086-ea9c-4766-9baf-a57bb69c3202",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9dc24243-d20a-48aa-b90b-26ef90233e22",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb35e3d1-8733-4931-8744-9c3754793161",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63d62eb3-3255-4046-863e-d866a833d1a6",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
" def __init__(self, url):\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "409a70a6-331a-4ea4-ab8d-7a46fffc70d7",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Create your prompts\n",
"system_prompt = \"You are an assistant that analyzes the contents of a cric info website \\\n",
"and provides a short summary of best team in IPL. \\\n",
"Respond in markdown.\"\n",
"\n",
"user_prompt = \"\"\"\n",
" Get page title\n",
"\"\"\"\n",
"\n",
"# Step 2: Make the messages list\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
" {\"role\": \"user\", \"content\": \"Team name\"}\n",
"]\n",
"\n",
"# Step 3: Call OpenAI\n",
"\n",
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
"print(response.choices[0].message.content)\n",
"\n",
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}]\n",
"\n",
"webUrl = \"https://www.google.com\"\n",
"print(messages_for(webUrl))\n",
"\n",
"def summarize(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = messages_for(website)\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"# Step 4: print the result\n",
"summary = summarize(webUrl)\n",
"display(Markdown(summary))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,557 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "ae1ef804-3504-488d-af86-5a0da36fea78",
"metadata": {},
"source": [
"# ☀️🏃‍♀️ WeatherMate\n",
"----\n",
"\n",
"**WeatherMate** is a conversational **AI agent** that analyzes real-time weather conditions and suggests the best activities and events based on location. Whether it's sunny, rainy, or snowy, WeatherMate helps you make the most of your day! \n",
"\n",
"Here's how it works:\n",
"1. Get current weather conditions for the user's location.\n",
"2. Recommend suitable indoor or outdoor activities based on the weather.\n",
"3. Find relevant events using the Ticketmaster API.\n",
"4. Merge both activity suggestions and events into a single, structured response.\n",
"\n",
"---\n",
"\n",
"Large Language Models (LLMs), by themselves, cannot fetch real-time data such as weather information. To enable LLMs to access and use such real-time data, we integrate **external tools.** \n",
"\n",
"In this notebook, we will implement a weather API, allowing the assistant to fetch real-time weather information and use it for personalized activity suggestions based on current weather conditions. This is an essential step in transforming an LLM into a more interactive and data-driven AI assistant.\n",
"\n",
"\n",
"In this notebook, we will develop a conversational AI Agent that helps users receive personalized activity recommendations based on real-time weather data.\n",
"\n",
"- 🧑‍💻 Skill Level: Advanced\n",
"- 📤 Output Format: conversational chat\n",
"- 🚀 Tools:\n",
" - Weather API integration \n",
" - Ticketmaster API\n",
" - OpenAI with external tool handling\n",
" - Gradio for the UI\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key\n",
"- 🔑 Weather API integration (https://www.weatherapi.com)\n",
"- 🔑 Ticketmaster API (https://developer.ticketmaster.com/explore/)\n",
"\n",
"⚙️ Customizable by user\n",
"- 🤖 Selected model\n",
"- 📜 system_prompt: Controls model behavior\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "ad262788",
"metadata": {},
"source": [
"**Class Diagram**\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/05_weather_class_diagram.png?raw=true)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6b7a492-f510-4ba4-bbc3-239675d389dd",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import json\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from datetime import datetime\n",
"\n",
"# Initialization\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"if not openai_api_key:\n",
" print(\"❌ OpenAI API Key is missing!\")\n",
"\n",
"weather_api_key = os.getenv('WEATHERAPI_KEY')\n",
"if not weather_api_key:\n",
" print(\"❌ Weather API Key is missing!\")\n",
"\n",
"ticketmaster_api_key = os.getenv('TICKETMASTER_KEY')\n",
"if not ticketmaster_api_key:\n",
" print(\"❌ TicketMaster API Key is missing!\")\n",
"\n",
"\n",
"MODEL = \"gpt-4o-mini\"\n",
"openai = OpenAI()"
]
},
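{
 "cell_type": "markdown",
 "id": "env-keys-note",
 "metadata": {},
 "source": [
  "Your `.env` file should define the three keys read above (the values below are placeholders):\n",
  "\n",
  "```\n",
  "OPENAI_API_KEY=sk-proj-...\n",
  "WEATHERAPI_KEY=your-weatherapi-key\n",
  "TICKETMASTER_KEY=your-ticketmaster-key\n",
  "```"
 ]
},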
{
"cell_type": "code",
"execution_count": null,
"id": "347dbe00-5826-4aa6-9d2c-9d028fc33ec8",
"metadata": {},
"outputs": [],
"source": [
"# Get today's date and day name\n",
"today_str = datetime.today().strftime('%Y-%m-%d')\n",
"day_name = datetime.today().strftime('%A')\n",
"\n",
"nb_activity = 10\n",
"\n",
"\n",
"system_message = f\"\"\"\n",
"You are a fun and helpful assistant for an Activity Suggestion App.\n",
"Your job is to recommend **up to {nb_activity} activities** based on the real-time weather fetched from the API, ensuring a mix of **indoor, outdoor, and event-based activities** whenever possible.\n",
"\n",
"The total must always be **10 or fewer**, following this rule:\n",
"**nb_events + nb_indoors + nb_outdoors ≤ 10**.\n",
"\n",
"You must **analyze and think carefully** to determine the best combination of activities and events for the user.\n",
"- Evaluate **weather conditions** to decide if outdoor activities are suitable.\n",
"- Check **event availability** and select the most relevant ones.\n",
"- Balance **indoor, outdoor, and event-based activities** dynamically to provide the best experience.\n",
"\n",
"If one of these categories is unavailable, that's fine—just provide the best possible suggestions without exceeding **10 activities**.\n",
"Deliver everything **in one go—no waiting!**\n",
"\n",
"\n",
"### **Understanding Relative Dates**\n",
"- Always interpret relative dates based on **{today_str} ({day_name})**.\n",
"- The weekend always refers to Saturday and Sunday.\n",
"- \"Next {day_name}\" should refer to the **closest upcoming occurrence** of that day.\n",
"- If the user asks for a time range (e.g., \"the next 3 days\"), calculate the **exact date range** starting from today.\n",
"- If no specific date is mentioned, **assume today by default**.\n",
"- **Do not ask for confirmation** when interpreting dates—just assume the correct date and proceed confidently unless there's real ambiguity.\n",
"\n",
"### **Activity and Event Suggestion Process**\n",
"To provide the best {nb_activity} activity recommendations, follow these steps:\n",
"Step 1: Retrieve Weather Data Use the Weather API to get current conditions for the user's location.\n",
"Step 2: Suggest Activities Recommend suitable indoor or outdoor activities based on the weather.\n",
"Step 3: Fetch Events (if available) Use the Ticketmaster API to find relevant events in the users area.\n",
"Step 4: Combine Everything Merge both event listings and activity suggestions into a single, well-structured response.\n",
"This entire process should be done seamlessly in one go without making the user wait.\n",
"\n",
"### **How to Handle Each API**\n",
"- **Weather API Handling**:\n",
" - If the user requests a relative date (e.g., \"tomorrow,\" \"next Monday\"), calculate the number of days from today.\n",
" - Provide the weather forecast only for the requested date, ignoring any other days in the response.\n",
" - If no weather data is available, inform the user in a friendly, light-hearted way.\n",
" - The forecast is limited to 14 days, so if the user requests a longer period, politely let him know.\n",
"\n",
"- **Ticketmaster API Handling**:\n",
" - If the user asks for events today, set the start date as todays date.\n",
" - If the user asks for any specific weekday, find the next occurrence of that day and use it as the start date.\n",
" - If the user asks for a range of days (e.g., \"the next 3 days\"), use todays date as the start date.\n",
" - The country corresponding to the user's city must be represented using the ISO Alpha-2 Code (e.g., FR for France, US for the United States, CA for Canada, DK for Denmark).\n",
" - If more than 5 events are found, ask the user for their interests to refine the search, using a one-word keyword like 'music,' 'cinema,' or 'theater.'\n",
" - If no events are found, explicitly inform the user in a friendly, funny way.\n",
" - Do not mention Ticketmaster unless necessary; simply state that you are checking for events.\n",
"\n",
"### **User Interaction Rules**\n",
"- If the user **doesnt mention a city**, **ask them to provide one**.\n",
"- If an event search fails, do **not** mention Ticketmaster; simply say that no events were found.\n",
"- Ensure all activity suggestions are provided **in one response**, combining weather-based activities and event suggestions.\n",
"\n",
"\n",
"### **Event Formatting in Output**\n",
"**If Ticketmaster events are available**, format the output as follows:\n",
"Here are some events that may interest you:\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"(And don't forget to separate these gems with a snazzy divider)\n",
"\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"(Another divider, because we like to keep things fresh!)\n",
"\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"### **Tone and Style**\n",
"**Keep it short, fun, and dont forget to add a dash of humor!**\n",
"Your job is to keep the user smiling while giving them the **best activities for the day**.\n",
"Be **accurate and concise**, but lets keep it **light and lively!** 🎉\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578da33d-be38-4c75-8a96-9d6bfc1af99b",
"metadata": {},
"outputs": [],
"source": [
"class WeatherAPI:\n",
" def get_weather(self, city: str, days: int) -> dict:\n",
" \"\"\"Fetches weather data for the given city for the next 'days' number of days.\"\"\"\n",
" url = \"https://api.weatherapi.com/v1/forecast.json\"\n",
" params = {\"key\": weather_api_key, \"q\": city, \"days\": days}\n",
" # print(f\"params weather: {params}\")\n",
" response = requests.get(url, params=params)\n",
"\n",
" if response.status_code == 200:\n",
" data = response.json()\n",
" forecast = []\n",
" for day in data[\"forecast\"][\"forecastday\"]:\n",
" forecast.append({\n",
" \"date\": day[\"date\"],\n",
" \"temp\": day[\"day\"][\"avgtemp_c\"]\n",
" })\n",
"\n",
" result = {\n",
" \"city\": city,\n",
" \"forecast\": forecast\n",
" }\n",
" return result\n",
" else:\n",
" return {\"error\": f\"City '{city}' not found or other issue. Please check the city name and try again.\"}"
]
},
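{
 "cell_type": "code",
 "execution_count": null,
 "id": "weatherapi-usage-sketch",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Quick sanity check of the weather tool (assumes a valid WEATHERAPI_KEY;\n",
  "# \"Paris\" and the 3-day horizon are arbitrary example values):\n",
  "WeatherAPI().get_weather(\"Paris\", 3)"
 ]
},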
{
"cell_type": "code",
"execution_count": null,
"id": "305f9f18-8556-4b49-9f6b-4a2233eefae9",
"metadata": {},
"outputs": [],
"source": [
"from abc import ABC, abstractmethod\n",
"\n",
"class BaseEventAPI(ABC):\n",
" @abstractmethod\n",
" def get_events(self, city, country_code, keywords, size):\n",
" \"\"\"Fetches upcoming events from an event provider.\"\"\"\n",
" pass # Subclasses must implement this method\n",
"\n",
"class TicketmasterAPI(BaseEventAPI):\n",
" def get_events(self, city, country_code, keywords, start_date):\n",
" \"\"\"Fetches upcoming events from Ticketmaster for a given city.\"\"\"\n",
" url = \"https://app.ticketmaster.com/discovery/v2/events.json\"\n",
" params = {\n",
" \"apikey\": ticketmaster_api_key,\n",
" \"city\": city,\n",
" \"countryCode\": country_code,\n",
" \"keyword\": \",\".join(keywords),\n",
" \"size\": 10,\n",
" \"startDateTime\": start_date\n",
" }\n",
"\n",
" response = requests.get(url, params=params)\n",
"\n",
" if response.status_code == 200:\n",
" data = response.json()\n",
" events = data.get(\"_embedded\", {}).get(\"events\", [])\n",
" return [\n",
" {\n",
" \"name\": event[\"name\"],\n",
" \"date\": event[\"dates\"][\"start\"][\"localDate\"],\n",
" \"venue\": event[\"_embedded\"][\"venues\"][0][\"name\"],\n",
" \"url\": event.get(\"url\", \"N/A\") # Using .get() to avoid KeyError\n",
" }\n",
" for event in events\n",
" ] if events else []\n",
" else:\n",
" return {\"error\": f\"API request failed! Status: {response.status_code}, Response: {response.text}\"}\n"
]
},
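{
 "cell_type": "code",
 "execution_count": null,
 "id": "ticketmaster-usage-sketch",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Quick sanity check of the events tool (assumes a valid TICKETMASTER_KEY;\n",
  "# the city, country code, keyword and date are arbitrary example values):\n",
  "TicketmasterAPI().get_events(\"Paris\", \"FR\", [\"music\"], \"2025-07-01T00:00:00Z\")"
 ]
},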
{
"cell_type": "code",
"execution_count": null,
"id": "4c60820f-4e9f-4851-8330-52c8fd676259",
"metadata": {},
"outputs": [],
"source": [
"class ChatAssistant:\n",
" def __init__(self):\n",
" self.model = MODEL\n",
" self.tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_weather\",\n",
" \"description\": \"Get the current weather and forecast for the destination city.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city for which the weather is being requested.\"\n",
" },\n",
" \"days\": {\n",
" \"type\": \"integer\",\n",
" \"description\": \"The number of days for the weather forecast (can be 1, 2, 6, or 10).\"\n",
" }\n",
" },\n",
" \"required\": [\"city\", \"days\"],\n",
" \"additionalProperties\": False\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_ticketmaster_events\",\n",
" \"description\": \"Fetch upcoming events from Ticketmaster.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"City where the events are searched.\"\n",
" },\n",
" \"country_code\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"Country code for filtering results.\"\n",
" },\n",
" \"keywords\": {\n",
" \"type\": \"array\",\n",
" \"items\": {\n",
" \"type\": \"string\"\n",
" },\n",
" \"description\": \"Optional keywords for event search (e.g., 'music', 'concert').\"\n",
" },\n",
" \"size\": {\n",
" \"type\": \"integer\",\n",
" \"description\": \"Number of events to fetch.\"\n",
" },\n",
" \"start_date\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"Start date for the event search.\"\n",
" }\n",
" },\n",
" \"required\": [\"city\", \"country_code\", \"size\", \"start_date\"],\n",
" \"additionalProperties\": False\n",
" }\n",
" }\n",
" }\n",
" ]\n",
"\n",
" def chat(self, user_message, history, weather_api, event_apis):\n",
" # Build the conversation\n",
" messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": user_message}]\n",
"\n",
" # OpenAI response\n",
" response = openai.chat.completions.create(model=self.model, messages=messages, tools=self.tools, stream=True)\n",
"\n",
" recovered_pieces = {\n",
" \"content\": None,\n",
" \"role\": \"assistant\",\n",
" \"tool_calls\": {}\n",
" }\n",
" last_tool_calls = {}\n",
" has_tool_call = False\n",
" result = \"\" # Initialize result accumulator\n",
" # previous_index = None # Track the last processed index\n",
"\n",
" for chunk in response:\n",
" delta = chunk.choices[0].delta\n",
" finish_reason = chunk.choices[0].finish_reason\n",
"\n",
" # Handle tool call detection\n",
" if delta.tool_calls and finish_reason in [None, \"tool_calls\"]:\n",
" has_tool_call = True\n",
" piece = delta.tool_calls[0] # Get the first piece in the tool call\n",
"\n",
" # Create a dictionary for the tool call if it doesn't exist yet\n",
" recovered_pieces[\"tool_calls\"][piece.index] = recovered_pieces[\"tool_calls\"].get(\n",
" piece.index, {\"id\": None, \"function\": {\"arguments\": \"\", \"name\": \"\"}, \"type\": \"function\"}\n",
" )\n",
"\n",
" if piece.id:\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"id\"] = piece.id\n",
" if piece.function.name:\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"name\"] = piece.function.name\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"arguments\"] += piece.function.arguments\n",
"\n",
" # Store the tool call in the dictionary by index\n",
" last_tool_calls[piece.index] = recovered_pieces[\"tool_calls\"][piece.index]\n",
"\n",
" # Store content in result and yield\n",
" else:\n",
" result += delta.content or \"\"\n",
" if result.strip():\n",
" yield result\n",
"\n",
"\n",
" # Handle tool call scenario\n",
" if has_tool_call:\n",
" # Handle the tool calls\n",
" response = self.handle_tool_call(last_tool_calls, weather_api, event_apis)\n",
"\n",
" if response: # Only iterate if response is not None\n",
" tool_calls_list = [tool_call for tool_call in last_tool_calls.values()]\n",
" messages.append({\"role\": \"assistant\", \"tool_calls\": tool_calls_list}) # Append the tool calls to the messages\n",
"\n",
" # Dynamically process each tool call response and append it to the message history\n",
" for res in response:\n",
" messages.append({\n",
" \"role\": \"tool\",\n",
" \"tool_call_id\": res[\"tool_call_id\"],\n",
" \"content\": json.dumps(res[\"content\"])\n",
" })\n",
"\n",
" # New OpenAI request with tool response\n",
" response = openai.chat.completions.create(model=self.model, messages=messages, stream=True)\n",
"\n",
" result = \"\" # Reset result before second stream\n",
" for chunk in response:\n",
" result += chunk.choices[0].delta.content or \"\"\n",
" if result.strip():\n",
" yield result\n",
"\n",
"\n",
" def handle_tool_call(self, tool_call, weather_api, event_apis):\n",
" stored_values = {} # Dictionary to store the valid value for each field\n",
"\n",
" for index, call in tool_call.items():\n",
" # Load the arguments for each tool call dynamically\n",
" arguments = json.loads(call[\"function\"][\"arguments\"])\n",
"\n",
" # Iterate over all keys dynamically\n",
" for key, value in arguments.items():\n",
" # Update the field if it's currently None or hasn't been set before\n",
" if key not in stored_values or stored_values[key] is None:\n",
" stored_values[key] = value\n",
"\n",
" city = stored_values.get('city')\n",
" days = stored_values.get('days')\n",
" country_code = stored_values.get('country_code')\n",
" keywords = stored_values.get('keywords', [])\n",
" # size = stored_values.get('size')\n",
" start_date = stored_values.get('start_date')\n",
" start_date = str(start_date) + \"T00:00:00Z\"\n",
"\n",
" weather_data = None\n",
" event_data = None\n",
"\n",
" # Iteration over tool_call\n",
" for call in tool_call.values():\n",
" if call[\"function\"][\"name\"] == \"get_weather\":\n",
" weather_data = weather_api.get_weather(city, days)\n",
"\n",
" if call[\"function\"][\"name\"] == \"get_ticketmaster_events\":\n",
" event_data = event_apis[\"ticketmaster\"].get_events(city, country_code, keywords, start_date)\n",
"\n",
" responses = []\n",
"\n",
" # Ensure weather response is always included\n",
" weather_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_weather\"), None)\n",
" if weather_data and \"forecast\" in weather_data:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"weather\": weather_data[\"forecast\"]},\n",
" \"tool_call_id\": weather_tool_call_id\n",
" })\n",
" elif weather_tool_call_id:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"message\": \"No weather data available for this location.\"},\n",
" \"tool_call_id\": weather_tool_call_id\n",
" })\n",
"\n",
" # Ensure event response is always included\n",
" event_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_ticketmaster_events\"), None)\n",
" if event_data:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"events\": event_data},\n",
" \"tool_call_id\": event_tool_call_id\n",
" })\n",
" elif event_tool_call_id:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"message\": \"No events found for this location.\"},\n",
" \"tool_call_id\": event_tool_call_id\n",
" })\n",
"\n",
" # print(\"Final responses:\", responses)\n",
" return responses\n"
]
},
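{
"cell_type": "markdown",
"id": "3c7a9d42-1f2e-4b5a-9c6d-8e7f0a1b2c3d",
"metadata": {},
"source": [
"A minimal sketch of the streaming pattern used in `chat()` above, with hypothetical fragments: OpenAI streams each tool call in pieces. The `id` and function `name` arrive once, while the JSON `arguments` arrive as string fragments that are concatenated per index before parsing."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e8b0c64-2a3f-4d7b-8e9a-0b1c2d3e4f5a",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Hypothetical streamed fragments of a single tool call (index 0)\n",
"fragments = [\n",
"    {\"index\": 0, \"id\": \"call_abc\", \"name\": \"get_weather\", \"arguments\": '{\"city\": \"Par'},\n",
"    {\"index\": 0, \"id\": None, \"name\": None, \"arguments\": 'is\", \"days\": 2}'},\n",
"]\n",
"\n",
"calls = {}\n",
"for frag in fragments:\n",
"    call = calls.setdefault(frag[\"index\"], {\"id\": None, \"function\": {\"name\": \"\", \"arguments\": \"\"}, \"type\": \"function\"})\n",
"    if frag[\"id\"]:\n",
"        call[\"id\"] = frag[\"id\"]\n",
"    if frag[\"name\"]:\n",
"        call[\"function\"][\"name\"] = frag[\"name\"]\n",
"    call[\"function\"][\"arguments\"] += frag[\"arguments\"]  # accumulate the JSON string\n",
"\n",
"print(json.loads(calls[0][\"function\"][\"arguments\"]))  # -> {'city': 'Paris', 'days': 2}"
]
},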
{
"cell_type": "code",
"execution_count": null,
"id": "191a3a9e-95e1-4ca6-8992-4a5bafb9b8ff",
"metadata": {},
"outputs": [],
"source": [
"# GradioInterface class to handle the Gradio UI\n",
"class GradioInterface:\n",
" def __init__(self, activity_assistant):\n",
" self.activity_assistant = activity_assistant\n",
"\n",
" def launch(self):\n",
" # Gradio chat interface\n",
" gr.ChatInterface(fn=self.activity_assistant.chat, type=\"messages\").launch()\n",
"\n",
"# ActivityAssistant setup\n",
"class ActivityAssistant:\n",
" def __init__(self):\n",
" self.weather_api = WeatherAPI() # Interact with the Weather API\n",
" self.event_apis = { # Interact with the Events API\n",
" \"ticketmaster\": TicketmasterAPI()\n",
" }\n",
" self.chat_assistant = ChatAssistant() # This will handle conversation with OpenAI\n",
"\n",
" def chat(self, user_message, history):\n",
" # Forward the user message and conversation history to ChatAssistant\n",
" response_stream = self.chat_assistant.chat(user_message, history, self.weather_api, self.event_apis)\n",
" for chunk in response_stream:\n",
" yield chunk"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b501e8e-2e10-4ab7-b523-1d4b8ad358e8",
"metadata": {},
"outputs": [],
"source": [
"# Main execution\n",
"if __name__ == \"__main__\":\n",
" activity_assistant = ActivityAssistant()\n",
" gradio_interface = GradioInterface(activity_assistant)\n",
" gradio_interface.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,420 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6a08763a-aed6-4f91-94d0-80a3c0e2665b",
"metadata": {},
"source": [
"### Weeks 2 - Day 2 - Gradio Chatbot with LiteLLM (Model Routing)"
]
},
{
"cell_type": "markdown",
"id": "a4f38c58-5ceb-4d5e-b538-c1acdc881f73",
"metadata": {},
"source": [
"**Author** : [Marcus Rosen](https://github.com/MarcusRosen)"
]
},
{
"cell_type": "markdown",
"id": "36f4814a-2bfc-4631-97d7-7a474fa1cc8e",
"metadata": {},
"source": [
"[LiteLLM](https://docs.litellm.ai/docs/) provides the abilitty to call different LLM providers via a unified interface, returning results in OpenAI compatible formats.\n",
"\n",
"Features:\n",
"- Model Selection in Gradio (Anthropic, OpenAI, Gemini)\n",
"- Single Inference function for all model providers via LiteLLM (call_llm)\n",
"- Streaming **NOTE:** Bug when trying to stream in Gradio, but works directly in Notebook\n",
"- Debug Tracing"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "b6c12598-4773-4f85-93ca-0128d74fbca0",
"metadata": {},
"outputs": [],
"source": [
"from litellm import completion\n",
"import gradio as gr\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"import requests\n",
"import json"
]
},
{
"cell_type": "markdown",
"id": "d24be370-5347-47fb-a58e-21a1b5409ab2",
"metadata": {},
"source": [
"#### Load API Keys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e03afbe9-16aa-434c-a701-b3bfe75e927d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key exists and begins sk-ant-\n",
"Google API Key exists and begins AIzaSyDC\n"
]
}
],
"source": [
"# Load environment variables in a file called .env\n",
"# Print the key prefixes to help with any debugging\n",
"\n",
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GEMINI_API_KEY')\n",
"\n",
"if openai_api_key:\n",
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
"else:\n",
" print(\"OpenAI API Key not set\")\n",
" \n",
"if anthropic_api_key:\n",
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
"else:\n",
" print(\"Anthropic API Key not set\")\n",
"\n",
"if google_api_key:\n",
" print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n",
" # import google.generativeai\n",
" # google.generativeai.configure()\n",
"else:\n",
" print(\"Gemini API Key not set\")"
]
},
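{
"cell_type": "markdown",
"id": "2b4d6f80-9a1c-4e3d-b5f7-a8c9d0e1f2a3",
"metadata": {},
"source": [
"With the keys loaded, a minimal sketch of LiteLLM's unified interface: the same `completion()` call routes to any provider just by changing the model string."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c6e8a02-b3d5-4f7a-9c1e-d2f3a4b5c6d7",
"metadata": {},
"outputs": [],
"source": [
"# Minimal LiteLLM sketch - swap the model string to route to a different provider\n",
"from litellm import completion\n",
"\n",
"response = completion(\n",
"    model=\"gpt-4o-mini\",  # e.g. \"claude-3-haiku-20240307\" or \"gemini/gemini-2.0-flash\"\n",
"    messages=[{\"role\": \"user\", \"content\": \"Say hello in one word.\"}]\n",
")\n",
"print(response[\"choices\"][0][\"message\"][\"content\"])"
]
},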
{
"cell_type": "markdown",
"id": "66e46447-0e73-49ef-944a-d1e8fae4986e",
"metadata": {},
"source": [
"### Use LiteLLM to abstract out the model provider"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "473c2029-ca74-4f1e-92ac-05f7817ff7df",
"metadata": {},
"outputs": [],
"source": [
"def call_llm(model, system_prompt, user_prompt, json_format_response=False, streaming=False):\n",
" if DEBUG_OUTPUT: \n",
" print(\"call_llm()\")\n",
" print(f\"streaming={streaming}\")\n",
" print(f\"json_format_response={json_format_response}\")\n",
" \n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" payload = {\n",
" \"model\": model,\n",
" \"messages\": messages\n",
" }\n",
" # Use Json Reponse Format\n",
" # Link: https://docs.litellm.ai/docs/completion/json_mode\n",
" if json_format_response:\n",
" payload[\"response_format\"]: { \"type\": \"json_object\" }\n",
" \n",
" if streaming:\n",
" payload[\"stream\"] = True\n",
" response = completion(**payload)\n",
" # Return a generator expression instead of using yield in the function\n",
" return (part.choices[0].delta.content or \"\" for part in response)\n",
" else:\n",
" response = completion(**payload)\n",
" return response[\"choices\"][0][\"message\"][\"content\"]"
]
},
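{
"cell_type": "markdown",
"id": "6d8f0a24-c5e7-4b9a-8d1f-e2a3b4c5d6e7",
"metadata": {},
"source": [
"A quick sketch of `call_llm()` in both modes (note that `DEBUG_OUTPUT` must be defined before calling):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e0a2c46-d7f9-4c1b-9e3a-f4b5c6d7e8f9",
"metadata": {},
"outputs": [],
"source": [
"DEBUG_OUTPUT = False\n",
"\n",
"# Non-streaming: returns the full completion string\n",
"print(call_llm(\"gpt-4o-mini\", \"You are terse.\", \"Name one ocean.\"))\n",
"\n",
"# Streaming: returns a generator of text deltas\n",
"for part in call_llm(\"gpt-4o-mini\", \"You are terse.\", \"Count to three.\", streaming=True):\n",
"    print(part, end=\"\", flush=True)"
]
},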
{
"cell_type": "markdown",
"id": "f45e0972-a6a0-4237-8a69-e6f165f30e0d",
"metadata": {},
"source": [
"### Brochure building functions"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "c76d4ff9-0f18-49d0-a9b5-2c6c0bad359a",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
" \"\"\"\n",
" A utility class to represent a Website that we have scraped, now with links\n",
" \"\"\"\n",
"\n",
" def __init__(self, url):\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" self.body = response.content\n",
" soup = BeautifulSoup(self.body, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" if soup.body:\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
" else:\n",
" self.text = \"\"\n",
" links = [link.get('href') for link in soup.find_all('a')]\n",
" self.links = [link for link in links if link]\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "ff41b687-3a46-4bca-a031-1148b91a4fdf",
"metadata": {},
"outputs": [],
"source": [
"def get_links(url, model):\n",
" if DEBUG_OUTPUT:\n",
" print(\"get_links()\")\n",
" website = Website(url)\n",
"\n",
" link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
" You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
" such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
" link_system_prompt += \"You should respond in raw JSON exactly as specified in this example. DO NOT USE MARKDOWN.\"\n",
" link_system_prompt += \"\"\"\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
" {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
" ]\n",
" }\n",
" \"\"\"\n",
" \n",
" result = call_llm(model=model, \n",
" system_prompt=link_system_prompt, \n",
" user_prompt=get_links_user_prompt(website), \n",
" json_format_response=True, \n",
" streaming=False)\n",
" if DEBUG_OUTPUT:\n",
" print(result)\n",
" return json.loads(result)\n",
"\n",
"def get_links_user_prompt(website):\n",
" if DEBUG_OUTPUT:\n",
" print(\"get_links_user_prompt()\")\n",
" \n",
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
" user_prompt += \"Links (some might be relative links):\\n\"\n",
" user_prompt += \"\\n\".join(website.links)\n",
"\n",
" if DEBUG_OUTPUT:\n",
" print(user_prompt)\n",
" \n",
" return user_prompt\n",
"\n",
"def get_all_details(url, model):\n",
" if DEBUG_OUTPUT:\n",
" print(\"get_all_details()\")\n",
" \n",
" result = \"Landing page:\\n\"\n",
" result += Website(url).get_contents()\n",
" links = get_links(url, model)\n",
" if DEBUG_OUTPUT:\n",
" print(\"Found links:\", links)\n",
" for link in links[\"links\"]:\n",
" result += f\"\\n\\n{link['type']}\\n\"\n",
" result += Website(link[\"url\"]).get_contents()\n",
" return result\n",
"\n",
"def get_brochure_user_prompt(company_name, url, model):\n",
" \n",
" if DEBUG_OUTPUT:\n",
" print(\"get_brochure_user_prompt()\")\n",
" \n",
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
" user_prompt += get_all_details(url, model)\n",
" user_prompt = user_prompt[:5000] # Truncate if more than 5,000 characters\n",
" return user_prompt\n"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "cf7512a1-a498-44e8-a234-9affb72efe60",
"metadata": {},
"outputs": [],
"source": [
"def create_brochure(company_name, url, model, streaming):\n",
"\n",
" system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
"and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
"Include details of company culture, customers and careers/jobs if you have the information.\"\n",
" if streaming:\n",
" result = call_llm(model=model, system_prompt=system_prompt, user_prompt=get_brochure_user_prompt(company_name, url, model), streaming=True)\n",
" return (p for p in result)\n",
" else: \n",
" return call_llm(model=model, system_prompt=system_prompt, user_prompt=get_brochure_user_prompt(company_name, url, model), streaming=False)\n",
" "
]
},
{
"cell_type": "markdown",
"id": "ecb6d212-ddb6-4170-81bf-8f3ea54479f8",
"metadata": {},
"source": [
"#### Testing Model before implenting Gradio"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "de89843a-08ac-4431-8c83-21a93c05f764",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rio Tinto: Providing the Materials for a Sustainable Future\n",
"\n",
"## About Rio Tinto\n",
"\n",
"Rio Tinto is a global mining and metals company, operating in 35 countries with over 60,000 employees. Their purpose is to find better ways to provide the materials the world needs. Continuous improvement and innovation are at the core of their DNA, as they work to responsibly supply the metals and minerals critical for urbanization and the transition to a low-carbon economy.\n",
"\n",
"## Our Products\n",
"\n",
"Rio Tinto's diverse portfolio includes:\n",
"\n",
"- Iron Ore: The primary raw material used to make steel, which is strong, long-lasting and cost-efficient.\n",
"- Aluminium: A lightweight, durable and recyclable metal.\n",
"- Copper: A tough, malleable, corrosion-resistant and recyclable metal that is an excellent conductor of heat and electricity.\n",
"- Lithium: The lightest of all metals, a key element for low-carbon technologies.\n",
"- Diamonds: Ethically-sourced, high-quality diamonds.\n",
"\n",
"## Sustainability and Innovation\n",
"\n",
"Sustainability is at the heart of Rio Tinto's operations. They are targeting net zero emissions by 2050 and investing in nature-based solutions to complement their decarbonization efforts. Innovation is a key focus, with research and development into new technologies to improve efficiency and reduce environmental impact.\n",
"\n",
"## Careers and Culture\n",
"\n",
"Rio Tinto values its 60,000 employees and is committed to fostering a diverse and inclusive workplace. They offer a wide range of career opportunities, from mining and processing to engineering, finance, and more. Rio Tinto's culture is centered on safety, collaboration, and continuous improvement, with a strong emphasis on sustainability and responsible business practices.\n",
"\n",
"## Conclusion\n",
"\n",
"Rio Tinto is a global leader in the mining and metals industry, providing the materials essential for a sustainable future. Through their commitment to innovation, sustainability, and their talented workforce, Rio Tinto is well-positioned to meet the world's growing demand for critical resources.\n",
"\u001b[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\u001b[0m\n",
"LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.\n",
"\n",
"<generator object call_llm.<locals>.<genexpr> at 0x7f80ca5da0c0>\n"
]
}
],
"source": [
"MODEL=\"claude-3-haiku-20240307\"\n",
"DEBUG_OUTPUT=False\n",
"streaming=True\n",
"result = create_brochure(company_name=\"Rio Tinto\", url=\"http://www.riotinto.com\", model=MODEL, streaming=streaming)\n",
"\n",
"if streaming:\n",
" for chunk in result:\n",
" print(chunk, end=\"\", flush=True)\n",
"else:\n",
" print(result)\n"
]
},
{
"cell_type": "markdown",
"id": "1f330c92-6280-4dae-b4d8-717a56edb236",
"metadata": {},
"source": [
"#### Gradio Setup\n",
"Associate Dropdown values with the model we want to use.\n",
"Link: https://www.gradio.app/docs/gradio/dropdown#initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2f38862-3728-4bba-9e16-6f9fab276145",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"DEBUG_OUTPUT=True\n",
"view = gr.Interface(\n",
" fn=create_brochure,\n",
" inputs=[\n",
" gr.Textbox(label=\"Company name:\"),\n",
" gr.Textbox(label=\"Landing page URL including http:// or https://\"),\n",
" gr.Dropdown(choices=[(\"GPT 4o Mini\", \"gpt-4o-mini\"), \n",
" (\"Claude Haiku 3\", \"claude-3-haiku-20240307\"), \n",
" (\"Gemini 2.0 Flash\", \"gemini/gemini-2.0-flash\")], \n",
" label=\"Select model\"),\n",
" gr.Checkbox(label=\"Stream\")\n",
" ],\n",
" outputs=[gr.Markdown(label=\"Brochure:\")],\n",
" flagging_mode=\"never\"\n",
")\n",
"view.launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0981136-2067-43b8-b17d-83560dd609ce",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,195 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "989184c3-676b-4a68-8841-387ba0776e1d",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"import ollama"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0ac9605-d28a-4c19-97e3-1dd3f9ac99ba",
"metadata": {},
"outputs": [],
"source": [
"system_message = \"You are a helpful assistant for an Airline called FlightAI. \"\n",
"system_message += \"Give short, courteous answers, no more than 1 sentence. Respond to greetings and general conversation politely.\"\n",
"system_message += \"Always be accurate. If you don't know the answer, say so.\"\n",
"system_message += \"When a user asks for information that requires external data or action, use the available tools to get that information Specifically\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "533e6edf-454a-493d-b0a7-dbc29a5f3930",
"metadata": {},
"outputs": [],
"source": [
"def chat(message, history):\n",
" messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n",
" response = ollama.chat(model=\"llama3.2\", messages=messages)\n",
" return response['message']['content']\n",
"\n",
"gr.ChatInterface(fn=chat, type=\"messages\").launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac22d421-a241-4c1f-bac4-db2150099ecc",
"metadata": {},
"outputs": [],
"source": [
"ticket_prices = {\"london\": \"$799\", \"paris\": \"$899\", \"tokyo\": \"$1400\", \"berlin\": \"$499\"}\n",
"\n",
"def get_ticket_price(destination_city):\n",
" print(f\"Tool get_ticket_price called for {destination_city}\")\n",
" city = destination_city.lower()\n",
" return ticket_prices.get(city, \"Unknown\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a0381b1-375c-44ac-8757-2fdde2c76541",
"metadata": {},
"outputs": [],
"source": [
"price_function = {\n",
" \"name\": \"get_ticket_price\",\n",
" \"description\": \"Get the price of a return ticket to the destination city. Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"destination_city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city that the customer wants to travel to\",\n",
" },\n",
" },\n",
" \"required\": [\"destination_city\"],\n",
" \"additionalProperties\": False\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce5a7fd0-1ce1-4b53-873e-f55d1e39d847",
"metadata": {},
"outputs": [],
"source": [
"#tools = [{\"type\": \"function\", \"function\": price_function}]\n",
"tools = [\n",
" {\n",
" \"type\":\"function\",\n",
" \"function\":{\n",
" \"name\": \"get_ticket_price\",\n",
" \"description\": \"Get the price of a return ticket to the destination city. Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"destination_city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city that the customer wants to travel to\"\n",
" },\n",
" },\n",
" \"required\": [\"destination_city\"],\n",
" \"additionalProperties\": False\n",
" },\n",
" },\n",
" }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06eab709-3f05-4697-a6a8-5f5bc1f442a5",
"metadata": {},
"outputs": [],
"source": [
"def handle_tool_call(message):\n",
" tool_call = message.tool_calls[0]\n",
" arguments = tool_call.function.arguments\n",
" city = arguments.get('destination_city')\n",
" price = get_ticket_price(city)\n",
" response = {\n",
" \"role\": \"tool\",\n",
" \"content\": json.dumps({\"destination_city\": city,\"price\": price}),\n",
" # \"tool_call_id\": tool_call.id\n",
" }\n",
" return response, city"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7f9af23-0683-40c3-a70b-0a385754688c",
"metadata": {},
"outputs": [],
"source": [
"def chat(message, history):\n",
" messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n",
" response = ollama.chat(model=\"llama3.2\", messages=messages,tools=tools)\n",
" if response['message'].get('tool_calls'):\n",
" message = response['message']\n",
" response, city = handle_tool_call(message)\n",
" messages.append(message)\n",
" messages.append(response)\n",
" response = ollama.chat(model=\"llama3.2\", messages=messages)\n",
" \n",
" return response['message']['content']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcfa39e2-92ce-48df-b735-f9bbfe638c81",
"metadata": {},
"outputs": [],
"source": [
"gr.ChatInterface(fn=chat, type=\"messages\").launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f5044e9-0ae8-4d88-a22f-d1180ab52434",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,605 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "SFA6R-4jL7SS"
},
"source": [
"# Synthetic Data Generator Notebook\n",
"## About\n",
"This colab notebook demonstrates the use of Frontier and Open-source LLM models for generating synthetic dataset for a business scenario provided by the user. From a UI interface implemented in gradio, a user can define their business scenario in detail, select the number of records needed along with the its format and adjust the number of max output tokens to be generated by the chosen LLM.\n",
"\n",
"It does not stop here. Once the records have been produced in the LLM output, it can be extracted and stored in a file, format same as set by user before. The file is stored in colab notebook under the contents directory. All of this is extraction is done with the help of the 're' library. My first time using it and I totally enjoyed learning it.\n",
"\n",
"## Outlook\n",
"Sometimes the response is loaded with the user prompt and a lot of tags when using an open-source models, such as Mixtral from Mistral. This is because of the prompt format being used. The 'assistant' 'role' format does not suit them. This is an optimization to look for and can be easily done by using custom prompt template for such models and these templates are hinted on their huggingface repo."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ip4I4Lff3B2M"
},
"source": [
"## Install & Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8zVlW-GMcBaU",
"outputId": "0c473564-fb93-41a9-c819-e6aa2382d75a"
},
"outputs": [],
"source": [
"!pip install -q gradio anthropic requests torch bitsandbytes transformers accelerate openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YKVNzE5sFH2l"
},
"outputs": [],
"source": [
"# imports\n",
"import re\n",
"import os\n",
"import sys\n",
"import gc\n",
"import io\n",
"import json\n",
"import anthropic\n",
"import gradio as gr\n",
"import requests\n",
"import subprocess\n",
"import google.generativeai as ggai\n",
"import torch\n",
"import tempfile\n",
"import shutil\n",
"from io import StringIO\n",
"import pandas as pd\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"from openai import OpenAI\n",
"from pathlib import Path\n",
"from datetime import datetime\n",
"from IPython.display import Markdown, display, update_display\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "LWpD6bZv3mAR"
},
"source": [
"## HuggingFace Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aeC2oWY2FTv7"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8Au2UPVy3vn5"
},
"source": [
"## Frontier Models configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "geBBsd14X3UL"
},
"outputs": [],
"source": [
"openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))\n",
"anthropic_client = anthropic.Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))\n",
"ggai.configure(api_key=userdata.get('GOOGLE_API_KEY'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tCnDIOlKgjbO"
},
"source": [
"## Defining Prompts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gkwXZsxofAU1"
},
"outputs": [],
"source": [
"system_prompt = \"\"\"\n",
"You are a synthetic dataset generator. Your role is to create synthetic dataset that infers structured data schemas from business scenarios given by the user.\n",
"\n",
"Your task is to:\n",
"1. Understand the user's business problem(s) or use case(s).\n",
"2. Identify the key fields needed to support that scenario.\n",
"3. Define appropriate field names, data types, and formats.\n",
"4. Generate synthetic records that match the inferred schema.\n",
"\n",
"Guidelines:\n",
"- Use realistic field names and values. Do not invent unrelated fields or values.\n",
"- Choose sensible data types: string, integer, float, date, boolean, enum, etc.\n",
"- Respect logical constraints (e.g., age range, date ranges, email formats).\n",
"- Output the dataset in the format the user requests (json, csv, txt, markdown table).\n",
"- If the scenario is vague or broad, make reasonable assumptions and explain them briefly before generating the dataset.\n",
"- Always generate a dataset that supports the business use case logically.\n",
"\n",
"Before generating the data, display the inferred schema in a readable format.\n",
"\"\"\"\n",
"\n",
"# trial_user_prompt = \"Im building a churn prediction model for a telecom company. Can you generate a synthetic dataset with 100 rows?\"\n",
"def get_user_prompt(business_problem, no_of_samples, file_format):\n",
" return f\"\"\"\n",
" The business scenario for which I want you to generate a dataset is defined below:\n",
" {business_problem}\n",
"\n",
" Generate a synthetic dataset of {no_of_samples} records in {file_format} format.\n",
" When generating the dataset, wrap it between the '<<<>>>' tag. Make sure the tag is there in the output.\n",
" Do not include any other special characters in between the tags, other than the ones required in producing the correct format of data.\n",
" For examples: When a 'csv' format is given, only the ',' character can be used in between the tags.\n",
" \"\"\""
]
},
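{
"cell_type": "markdown",
"metadata": {
"id": "aA1bB2cC3dD4"
},
"source": [
"Why the '<<<>>>' wrapper matters: the extraction step later recovers the dataset from the raw LLM response with a regex. A tiny sketch on a hypothetical response:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eE5fF6gG7hH8"
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Hypothetical LLM response wrapping a CSV dataset in the <<<>>> tags\n",
"sample = \"Here is your data:\\n<<<>>>\\nid,age\\n1,34\\n2,29\\n<<<>>>\"\n",
"matches = re.findall(r\"<<<>>>[\\s\\r\\n]*(.*?)[\\s\\r\\n]*<<<>>>\", sample, re.DOTALL)\n",
"print(matches[-1])  # the CSV content between the tags"
]
},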
{
"cell_type": "markdown",
"metadata": {
"id": "yNpVf9-oQdoO"
},
"source": [
"### Quanitzation Config"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3ErZ315MQdU3"
},
"outputs": [],
"source": [
"# This allows us to load the model into memory and use less memory\n",
"def get_quantization_config():\n",
" return BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "clGtRh0N4951"
},
"source": [
"## HF Model inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MAhyn1ehb3Dh"
},
"outputs": [],
"source": [
"# All in one HuggingFace Model Response function\n",
"def run_hfmodel_and_get_response(prompt, model_name, output_tokens):\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(prompt, return_tensors=\"pt\")\n",
" if torch.cuda.is_available():\n",
" inputs = inputs.to(\"cuda\")\n",
" streamer = TextStreamer(tokenizer)\n",
" if \"microsoft/bitnet-b1.58-2B-4T\" in model_name:\n",
" model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", trust_remote_code=True)\n",
" elif \"tiiuae/Falcon-E-3B-Instruct\" in model_name:\n",
" model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", torch_dtype=torch.float16 )\n",
" else:\n",
" model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", quantization_config=get_quantization_config())\n",
" outputs = model.generate(inputs, max_new_tokens=output_tokens, streamer=streamer)\n",
" response = tokenizer.decode(outputs[0])\n",
" del model, inputs, tokenizer, outputs\n",
" gc.collect()\n",
" torch.cuda.empty_cache()\n",
" return response"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Gh_Ny1aM-L8z"
},
"source": [
"## Frontier Models Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "h11WlZNhfHCR"
},
"outputs": [],
"source": [
"# ChatGPT, Claude and Gemini response function\n",
"def get_chatgpt_response(prompt, model_name, output_tokens):\n",
" response = openai_client.chat.completions.create(\n",
" model=model_name,\n",
" messages=prompt,\n",
" max_tokens=output_tokens,\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"def get_claude_response(prompt, model_name, output_tokens):\n",
" response = anthropic_client.messages.create(\n",
" model=model_name,\n",
" max_tokens=output_tokens,\n",
" system=system_prompt,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": prompt,\n",
" }\n",
" ],\n",
" )\n",
" return response.content[0].text\n",
"\n",
"def get_gemini_response(prompt, model_name, output_tokens):\n",
" model = ggai.GenerativeModel(\n",
" model_name=model_name,\n",
" system_instruction=system_prompt,\n",
" )\n",
"\n",
" response = model.generate_content(prompt, generation_config={\n",
" \"max_output_tokens\": output_tokens,\n",
" \"temperature\": 0.7,\n",
" })\n",
" return response.text"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nzHbM_WQvRgT"
},
"source": [
"## Gradio Implementation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uFWZqw1R-al_"
},
"source": [
"### Dropdowns Selection Lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rOzEb0o--aD7"
},
"outputs": [],
"source": [
"# Dropdown List Values for the user\n",
"MODEL_TYPES=[\"GPT\", \"Claude\", \"Gemini\", \"HuggingFace\"]\n",
"OPENAI_MODEL_NAMES=[\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n",
"ANTHROPIC_MODELS=[\"claude-3-7-sonnet-latest\", \"claude-3-5-haiku-latest\", \"claude-3-opus-latest\"]\n",
"GOOGLE_MODELS=[\"gemini-2.0-flash\", \"gemini-1.5-pro\"]\n",
"HUGGINGFACE_MODELS=[\n",
" \"meta-llama/Llama-3.2-3B-Instruct\",\n",
" \"microsoft/bitnet-b1.58-2B-4T\",\n",
" \"ByteDance-Seed/Seed-Coder-8B-Instruct\",\n",
" \"tiiuae/Falcon-E-3B-Instruct\",\n",
" \"Qwen/Qwen2.5-7B-Instruct\"\n",
"]\n",
"MODEL_NAMES = {\n",
" \"GPT\": OPENAI_MODEL_NAMES,\n",
" \"Claude\": ANTHROPIC_MODELS,\n",
" \"Gemini\": GOOGLE_MODELS,\n",
" \"HuggingFace\": HUGGINGFACE_MODELS\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sbXGL8_4-oKc"
},
"source": [
"### UI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_0NCY7FgCVHj"
},
"outputs": [],
"source": [
"with gr.Blocks() as generator_ui:\n",
" gr.Markdown(\"# 🧠 Business Scenario → Synthetic Dataset Generator\")\n",
"\n",
" with gr.Row():\n",
" with gr.Column(scale=3):\n",
" with gr.Row():\n",
" dataset_size=gr.Number(value=10, label=\"Enter the number of data samples to generate.\", show_label=True)\n",
" format=gr.Dropdown([\"json\", \"csv\", \"txt\", \"markdown\"], label=\"Select the format for the dataset\", show_label=True)\n",
" with gr.Row():\n",
" scenario=gr.Textbox(label=\"Business Scenario\", lines=5, placeholder=\"Describe your business scenario here\")\n",
" with gr.Row():\n",
" error = gr.Markdown(visible=False)\n",
" with gr.Row():\n",
" clear = gr.Button(\"Clear Everything\")\n",
" submit = gr.Button(\"Generate Dataset\", variant=\"primary\")\n",
"\n",
" with gr.Column(scale=1):\n",
" model_type = gr.Dropdown(MODEL_TYPES, label=\"Model Type\", show_label=True, info=\"Select the model type you want to use\")\n",
" model_name = gr.Dropdown(MODEL_NAMES[model_type.value], label=\"Model Name\", show_label=True, allow_custom_value=True, info=\"Select the model name or enter one manually\")\n",
" output_tokens= gr.Number(value=1000, label=\"Enter the max number of output tokens to generate.\", show_label=True, info=\"This will impact the length of the response containg the dataset\")\n",
"\n",
" with gr.Row():\n",
" # Chatbot Interface\n",
" chatbot = gr.Chatbot(\n",
" type='messages',\n",
" label='Chatbot',\n",
" show_label=True,\n",
" height=300,\n",
" resizable=True,\n",
" elem_id=\"chatbot\",\n",
" avatar_images=(\"🧑\", \"🤖\",)\n",
" )\n",
" with gr.Row(variant=\"compact\"):\n",
" extract_btn = gr.Button(\"Extract and Save Dataset\", variant=\"huggingface\", visible=False)\n",
" file_name = gr.Textbox(label=\"Enter file name here (without file extension)\", placeholder=\"e.g. cancer_synthetic, warehouse_synthetic (no digits)\", visible=False)\n",
" with gr.Row():\n",
" markdown_preview = gr.Markdown(visible = False)\n",
" dataset_preview = gr.Textbox(label=\"Dataset Preview\",visible=False)\n",
" with gr.Row():\n",
" file_saved = gr.Textbox(visible=False)\n",
"\n",
" def run_inference(scenario, model_type, model_name, output_tokens, dataset_size, format):\n",
" \"\"\"Run the model and get the response\"\"\"\n",
" model_type=model_type.lower()\n",
" print(f\"scenario: {scenario}\")\n",
" print(f\"model_type: {model_type}\")\n",
" print(f\"model_name: {model_name}\")\n",
" if not scenario.strip():\n",
" return gr.update(value=\"❌ **Error:** Please define a scenario first!\",visible=True), []\n",
"\n",
" user_prompt = get_user_prompt(scenario, dataset_size, format)\n",
" prompt = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt},\n",
" ]\n",
"\n",
" if model_type == \"gpt\":\n",
" response = get_chatgpt_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n",
" elif model_type == \"claude\":\n",
" response = get_claude_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n",
" elif model_type == \"gemini\":\n",
" response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n",
" else:\n",
" response = run_hfmodel_and_get_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n",
" torch.cuda.empty_cache()\n",
" history = [\n",
" {\"role\": \"user\", \"content\": scenario},\n",
" {\"role\": \"assistant\", \"content\": response}\n",
" ]\n",
" return gr.update(visible=False), history\n",
"\n",
" def extract_dataset_string(response):\n",
" \"\"\"Extract dataset content between defined tags using regex.\"\"\"\n",
" # Remove known artificial tokens (common in HuggingFace or Claude)\n",
" response = re.sub(r\"<\\[.*?\\]>\", \"\", response)\n",
"\n",
" # Remove system or prompt echo if repeated before dataset\n",
" response = re.sub(r\"(?is)^.*?<<<\", \"<<<\", response.strip(), count=1)\n",
"\n",
" # 1. Match strict <<<>>>...<<<>>> tag blocks (use last match)\n",
" matches = re.findall(r\"<<<>>>[\\s\\r\\n]*(.*?)[\\s\\r\\n]*<<<>>>\", response, re.DOTALL)\n",
" if matches:\n",
" return matches[-1].strip()\n",
"\n",
" # 2. Match loose <<< ... >>> format\n",
" matches = re.findall(r\"<<<[\\s\\r\\n]*(.*?)[\\s\\r\\n]*>>>\", response, re.DOTALL)\n",
" if matches:\n",
" return matches[-1].strip()\n",
"\n",
" # 3. Match final fallback: take everything after last <<< as raw data\n",
" last_open = response.rfind(\"<<<\")\n",
" if last_open != -1:\n",
" raw = response[last_open + 3 :].strip()\n",
" # Optionally cut off noisy trailing notes, explanations, etc.\n",
" raw = re.split(r\"\\n\\s*\\n|Explanation:|Note:|---\", raw)[0]\n",
" return raw.strip()\n",
"\n",
" return \"Could not extract dataset! Try again with a different model.\"\n",
"\n",
" def extract_dataset_from_response(chatbot_history, file_name, file_type):\n",
" \"\"\"Extract dataset and update in gradio UI components\"\"\"\n",
" response = chatbot_history[-1][\"content\"]\n",
" if not response:\n",
" return gr.update(visible=True, value=\"Could not find LLM Response! Try again.\"), gr.update(visible=False)\n",
"\n",
" # match = re.search(r'<<<\\s*(.*?)\\s*>>>', response, re.DOTALL)\n",
" # print(match)\n",
" # if match and match.group(1).strip() == \"\":\n",
" # match = re.search(r'<<<>>>\\s*(.*?)\\s*<<<>>>', response, re.DOTALL)\n",
" # print(match)\n",
" # if match is None:\n",
" # return gr.update(visible=True, value=\"Could not extract dataset! Try again with a different model.\"), gr.update(visible=False)\n",
" # dataset = match.group(1).strip()\n",
" dataset = extract_dataset_string(response)\n",
" if dataset == \"Could not extract dataset! Try again with a different model.\":\n",
" return gr.update(visible=True, value=dataset), gr.update(visible=False)\n",
" text = save_dataset(dataset, file_type, file_name)\n",
" return gr.update(visible=True, value=text), gr.update(visible=True, value=dataset)\n",
"\n",
" def save_dataset(dataset, file_format, file_name):\n",
" \"\"\"Save dataset to a file based on the selected format.\"\"\"\n",
" file_name=file_name+\".\"+file_format\n",
" print(dataset)\n",
" print(file_name)\n",
" if file_format == \"json\":\n",
" try:\n",
" data = json.loads(dataset)\n",
" with open(file_name, \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(data, f, indent=4)\n",
" return \"Dataset saved successfully!\"\n",
" except:\n",
" return \"Could not save dataset! Try again in another format.\"\n",
" elif file_format == \"csv\":\n",
" try:\n",
" df = pd.read_csv(StringIO(dataset))\n",
" df.to_csv(file_name, index=False)\n",
" return \"Dataset saved successfully!\"\n",
" except:\n",
" return \"Could not save dataset! Try again in another format.\"\n",
" elif file_format == \"txt\":\n",
" try:\n",
" with open(file_name, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(dataset)\n",
" return \"Dataset saved successfully!\"\n",
" except:\n",
" return \"Could not save dataset! Try again in another format.\"\n",
"\n",
" def clear_chat():\n",
" \"\"\"Clear the chat history.\"\"\"\n",
" return \"\", [], gr.update(visible=False), gr.update(visible=False)\n",
"\n",
" def show_extract_btn(chatbot_history, format):\n",
" \"\"\"Show the extract button if the response has been displayed in the chatbot and format is not set to markdown\"\"\"\n",
" if chatbot_history == []:\n",
" return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)\n",
" if format == \"markdown\":\n",
" return gr.update(visible=True, value=chatbot_history[1][\"content\"]), gr.update(visible=False), gr.update(visible=False)\n",
" return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)\n",
"\n",
" extract_btn.click(\n",
" fn=extract_dataset_from_response,\n",
" inputs=[chatbot, file_name, format],\n",
" outputs=[file_saved, dataset_preview]\n",
" )\n",
"\n",
" chatbot.change(\n",
" fn=show_extract_btn,\n",
" inputs=[chatbot, format],\n",
" outputs=[markdown_preview, extract_btn, file_name]\n",
" )\n",
"\n",
" model_type.change(\n",
" fn=lambda x: gr.update(choices=MODEL_NAMES[x], value=MODEL_NAMES[x][0]),\n",
" inputs=[model_type],\n",
" outputs=[model_name]\n",
" )\n",
"\n",
" submit.click(\n",
" fn=run_inference,\n",
" inputs=[scenario, model_type, model_name, output_tokens, dataset_size, format],\n",
" outputs=[error, chatbot],\n",
" show_progress=True\n",
" )\n",
"\n",
" clear.click(\n",
" clear_chat,\n",
" outputs=[scenario, chatbot, dataset_preview, file_saved]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "kzDUJahK8uRN",
"outputId": "c5674be2-b262-4439-ae91-4f3e1f49e041"
},
"outputs": [],
"source": [
"# Example Scenarios\n",
"\n",
"# Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n",
"# Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n",
"# Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n",
"# Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n",
"# Create a dataset of hospitalized patients to predict readmission within 30 days. Include patient ID, age, gender, number of prior admissions, diagnosis codes, length of stay, discharge type, medications prescribed, and readmission label.\n",
"# Generate a dataset for predicting medical appointment no-shows. Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n",
"\n",
"generator_ui.launch(share=True, debug=True, inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_9HIC_AzfZBZ"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,569 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "BSbc4VbLi2Ek"
},
"source": [
"# Synthetic Dataset generator\n",
"- 🚀 Live Demo: https://huggingface.co/spaces/lisekarimi/datagen\n",
"- 🧑‍💻 Repo: https://github.com/lisekarimi/datagen\n",
"\n",
"---\n",
"\n",
"- 🌍 **Task**: Generate realistic synthetic datasets\n",
"- 🎯 **Supported Data Types**: Tabular, Text, Time-series\n",
"- 🧠 **Models**: GPT (OpenAI) , Claude (Anthropic), CodeQwen1.5-7B-Chat (via Hugging Face Inference) / Llama (in Google Colab through T4 GPU)\n",
"- 🚀 **Tools**: Python, Gradio UI, OpenAI / Anthropic / HuggingFace APIs\n",
"- 📤 **Output Formats**: JSON and CSV file\n",
"- 🧑‍💻 **Skill Level**: Intermediate\n",
"\n",
"🎯 **How It Works**\n",
"\n",
"1⃣ Define your business problem or dataset topic.\n",
"\n",
"2⃣ Choose the dataset type, output format, model, and number of samples.\n",
"\n",
"3⃣ The LLM generates the code; you can adjust or modify it as needed.\n",
"\n",
"4⃣ Execute the code to generate your output file.\n",
"\n",
"🛠️ **Requirements** \n",
"- ⚙️ **Hardware**: ✅ GPU required (model download); Google Colab recommended (T4)\n",
"- 🔑 OpenAI API Key (for GPT) \n",
"- 🔑 Anthropic API Key (for Claude) \n",
"- 🔑 Hugging Face Token \n",
"\n",
"**Deploy CodeQwen Endpoint:**\n",
"- Visit https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat\n",
"- Click **Deploy** → **Inference Endpoints** → **Create Endpoint** (requires credit card)\n",
"- Copy your endpoint URL: `https://[id].us-east-1.aws.endpoints.huggingface.cloud`\n",
"\n",
"⚙️ **Customizable by user** \n",
"- 🤖 Selected model: GPT / Claude / Llama / Code Qwen\n",
"- 📜 `system_prompt`: Controls model behavior (concise, accurate, structured) \n",
"- 💬 `user_prompt`: Dynamic — include other fields\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9E-Ioggxi2Em"
},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pR-ftUatjEGd",
"outputId": "ae5668c5-c369-4066-bbbf-b560fb28e39a"
},
"outputs": [],
"source": [
"# Install required packages in Google Colab\n",
"%pip install -q python-dotenv gradio anthropic openai requests torch bitsandbytes transformers sentencepiece accelerate"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VPmk2-Ggi2Em"
},
"outputs": [],
"source": [
"import re\n",
"import sys\n",
"import subprocess\n",
"import threading\n",
"import anthropic\n",
"import torch\n",
"import gradio as gr\n",
"from openai import OpenAI\n",
"from huggingface_hub import InferenceClient, login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DUQ55_oji2En"
},
"source": [
"## Initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MiicxGawi2En"
},
"outputs": [],
"source": [
"# Google Colab User Data\n",
"# Ensure you have set the following in your Google Colab environment:\n",
"openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n",
"anthropic_api_key = userdata.get(\"ANTHROPIC_API_KEY\")\n",
"hf_token = userdata.get('HF_TOKEN')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OPENAI_MODEL = \"gpt-4o-mini\"\n",
"CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"\n",
"code_qwen = \"Qwen/CodeQwen1.5-7B-Chat\"\n",
"CODE_QWEN_URL = \"https://zfkokxzs1xrqv13v.us-east-1.aws.endpoints.huggingface.cloud\"\n",
"\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai = OpenAI(api_key=openai_api_key)\n",
"claude = anthropic.Anthropic(api_key=anthropic_api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ipA1F440i2En"
},
"source": [
"## Prompts definition"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JgtqCyRji2En"
},
"outputs": [],
"source": [
"system_message = \"\"\"\n",
"You are a helpful assistant whose main purpose is to generate datasets for business problems.\n",
"\n",
"Be less verbose.\n",
"Be accurate and concise.\n",
"\n",
"The user will describe a business problem. Based on this, you must generate a synthetic dataset that fits the context.\n",
"\n",
"The dataset should be saved in a specific format such as CSV, JSON — the desired format will be specified by the user.\n",
"\n",
"The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries.\n",
"\n",
"When saving a DataFrame to JSON using `to_json()`, do not use the `encoding` parameter. Instead, manually open the file with `open()` and specify the encoding. Then pass the file object to `to_json()`.\n",
"\n",
"Ensure Python code blocks are correctly indented, especially inside `with`, `for`, `if`, `try`, and `def` blocks.\n",
"\n",
"Return only the Python code that generates and saves the dataset.\n",
"After saving the file, print the code that was executed and a message confirming the dataset was generated successfully.\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Bk6saP4oi2Eo"
},
"outputs": [],
"source": [
"def user_prompt(**input_data):\n",
" user_prompt = f\"\"\"\n",
" Generate a synthetic {input_data[\"dataset_type\"].lower()} dataset in {input_data[\"output_format\"].upper()} format.\n",
" Business problem: {input_data[\"business_problem\"]}\n",
" Samples: {input_data[\"num_samples\"]}\n",
" \"\"\"\n",
" return user_prompt\n"
]
},
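{
"cell_type": "markdown",
"metadata": {
"id": "iI9jJ0kK1lL2"
},
"source": [
"A quick look at the prompt this builds, with hypothetical inputs:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mM3nN4oO5pP6"
},
"outputs": [],
"source": [
"print(user_prompt(\n",
"    dataset_type=\"Tabular\",\n",
"    output_format=\"csv\",\n",
"    business_problem=\"Predict churn for a telecom provider\",\n",
"    num_samples=10,\n",
"))"
]
},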
{
"cell_type": "markdown",
"metadata": {
"id": "XnrPiAZ7i2Eo"
},
"source": [
"## Call API for Closed Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Sx7hHKczi2Eo"
},
"outputs": [],
"source": [
"def stream_gpt(user_prompt):\n",
" stream = openai.chat.completions.create(\n",
" model=OPENAI_MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ],\n",
" stream=True,\n",
" )\n",
"\n",
" response = \"\"\n",
" for chunk in stream:\n",
" response += chunk.choices[0].delta.content or \"\"\n",
" yield response\n",
"\n",
" return response\n",
"\n",
"\n",
"def stream_claude(user_prompt):\n",
" result = claude.messages.stream(\n",
" model=CLAUDE_MODEL,\n",
" max_tokens=2000,\n",
" system=system_message,\n",
" messages=[\n",
" {\"role\": \"user\",\"content\": user_prompt}\n",
" ]\n",
" )\n",
" reply = \"\"\n",
" with result as stream:\n",
" for text in stream.text_stream:\n",
" reply += text\n",
" yield reply\n",
" print(text, end=\"\", flush=True)\n",
" return reply\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PUPeZ4xPi2Eo"
},
"source": [
"## Call Open Source Models\n",
"- Llama is downloaded and run on T4 GPU (Google Colab).\n",
"- Code Qwen is run through inference endpoint"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W0AuZT2uk0Sd"
},
"outputs": [],
"source": [
"def stream_llama(user_prompt):\n",
" try:\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ]\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
" )\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)\n",
"\n",
" thread = threading.Thread(target=model.generate, kwargs={\n",
" \"input_ids\": inputs,\n",
" \"max_new_tokens\": 1000,\n",
" \"pad_token_id\": tokenizer.eos_token_id,\n",
" \"streamer\": streamer\n",
" })\n",
" thread.start()\n",
"\n",
" started = False\n",
" reply = \"\"\n",
"\n",
" for new_text in streamer:\n",
" if not started:\n",
" if \"<|start_header_id|>assistant<|end_header_id|>\" in new_text:\n",
" started = True\n",
" new_text = new_text.split(\"<|start_header_id|>assistant<|end_header_id|>\")[-1].strip()\n",
" else:\n",
" continue\n",
"\n",
" if \"<|eot_id|>\" in new_text:\n",
" new_text = new_text.replace(\"<|eot_id|>\", \"\")\n",
" if new_text.strip():\n",
" reply += new_text\n",
" yield reply\n",
" break\n",
"\n",
" if new_text.strip():\n",
" reply += new_text\n",
" yield reply\n",
"\n",
" return reply\n",
"\n",
" except Exception as e:\n",
" print(f\"LLaMA error: {e}\")\n",
" raise\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V0JS_6THi2Eo"
},
"outputs": [],
"source": [
"def stream_code_qwen(user_prompt):\n",
" tokenizer = AutoTokenizer.from_pretrained(code_qwen)\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ]\n",
" text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" client = InferenceClient(CODE_QWEN_URL, token=hf_token)\n",
" stream = client.text_generation(text, stream=True, details=True, max_new_tokens=3000)\n",
" result = \"\"\n",
" for r in stream:\n",
" result += r.token.text\n",
" yield result"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PqG57dJIi2Eo"
},
"source": [
"## Select the model and generate the ouput"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YqSKnklRi2Eo"
},
"outputs": [],
"source": [
"def generate_from_inputs(model, **input_data):\n",
" # print(\"🔍 input_data received:\", input_data)\n",
" user_prompt_str = user_prompt(**input_data)\n",
"\n",
" if model == \"GPT\":\n",
" result = stream_gpt(user_prompt_str)\n",
" elif model == \"Claude\":\n",
" result = stream_claude(user_prompt_str)\n",
" elif model == \"Llama\":\n",
" result = stream_llama(user_prompt_str)\n",
" elif model == \"Code Qwen\":\n",
" result = stream_code_qwen(user_prompt_str)\n",
" else:\n",
" raise ValueError(\"Unknown model\")\n",
"\n",
" for stream_so_far in result:\n",
" yield stream_so_far\n",
"\n",
" return result\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zG6_TSfni2Eo"
},
"outputs": [],
"source": [
"def handle_generate(business_problem, dataset_type, dataset_format, num_samples, model):\n",
" input_data = {\n",
" \"business_problem\": business_problem,\n",
" \"dataset_type\": dataset_type,\n",
" \"output_format\": dataset_format,\n",
" \"num_samples\": num_samples,\n",
" }\n",
"\n",
" response = generate_from_inputs(model, **input_data)\n",
" for chunk in response:\n",
" yield chunk\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p5DQcx71i2Ep"
},
"source": [
"## Extract python code from the LLM output and execute it locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NcEkmsnai2Ep",
"jp-MarkdownHeadingCollapsed": true
},
"outputs": [],
"source": [
"def extract_code(text):\n",
" match = re.search(r\"```python(.*?)```\", text, re.DOTALL)\n",
"\n",
" if match:\n",
" code = match.group(0).strip()\n",
" else:\n",
" code = \"\"\n",
" print(\"No matching substring found.\")\n",
"\n",
" return code.replace(\"```python\\n\", \"\").replace(\"```\", \"\")\n",
"\n",
"\n",
"def execute_code_in_virtualenv(text, python_interpreter=sys.executable):\n",
" if not python_interpreter:\n",
" raise EnvironmentError(\"Python interpreter not found in the specified virtual environment.\")\n",
"\n",
" code_str = extract_code(text)\n",
" command = [python_interpreter, '-c', code_str]\n",
"\n",
" try:\n",
" result = subprocess.run(command, check=True, capture_output=True, text=True)\n",
" stdout = result.stdout\n",
" return stdout\n",
"\n",
" except subprocess.CalledProcessError as e:\n",
" return f\"Execution error:\\n{e}\"\n"
]
},
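{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check for extract_code (a minimal sketch; the reply below is made up)\n",
"sample_reply = \"Here is the script:\\n```python\\nprint('hello dataset')\\n```\\nLet me know if it helps.\"\n",
"print(extract_code(sample_reply))  # expected: print('hello dataset')\n"
]
},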
{
"cell_type": "markdown",
"metadata": {
"id": "DQgEyFzJi2Ep"
},
"source": [
"## Gradio interface"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SEiZVkdFi2Ep"
},
"outputs": [],
"source": [
"def update_output_format(dataset_type):\n",
" if dataset_type in [\"Tabular\", \"Time-series\"]:\n",
" return gr.update(choices=[\"JSON\", \"csv\"], value=\"JSON\")\n",
" elif dataset_type == \"Text\":\n",
" return gr.update(choices=[\"JSON\"], value=\"JSON\")\n",
"\n",
"with gr.Blocks() as ui:\n",
" gr.Markdown(\"## Create a dataset for a business problem\")\n",
"\n",
" with gr.Column():\n",
" business_problem = gr.Textbox(label=\"Business problem\", lines=2)\n",
" dataset_type = gr.Dropdown(\n",
" [\"Tabular\", \"Time-series\", \"Text\"], label=\"Dataset type\"\n",
" )\n",
"\n",
" output_format = gr.Dropdown( choices=[\"JSON\", \"csv\"], value=\"JSON\",label=\"Output Format\")\n",
"\n",
" num_samples = gr.Number(label=\"Number of samples\", value=10, precision=0)\n",
"\n",
" model = gr.Dropdown([\"GPT\", \"Claude\", \"Llama\", \"Code Qwen\"], label=\"Select model\", value=\"GPT\")\n",
"\n",
" dataset_type.change(update_output_format,inputs=[dataset_type], outputs=[output_format])\n",
"\n",
" with gr.Row():\n",
" with gr.Column():\n",
" dataset_run = gr.Button(\"Create a dataset\")\n",
" gr.Markdown(\"\"\"⚠️ For Llama and Code Qwen: The generated code might not be optimal. It's recommended to review it before execution.\n",
" Some mistakes may occur.\"\"\")\n",
"\n",
" with gr.Column():\n",
" code_run = gr.Button(\"Execute code for a dataset\")\n",
" gr.Markdown(\"\"\"⚠️ Be cautious when sharing this app with code execution publicly, as it could pose safety risks.\n",
" The execution of user-generated code may lead to potential vulnerabilities, and its important to use this tool responsibly.\"\"\")\n",
"\n",
" with gr.Row():\n",
" dataset_out = gr.Textbox(label=\"Generated Dataset\")\n",
" code_out = gr.Textbox(label=\"Executed code\")\n",
"\n",
" dataset_run.click(\n",
" handle_generate,\n",
" inputs=[business_problem, dataset_type, output_format, num_samples, model],\n",
" outputs=[dataset_out]\n",
" )\n",
"\n",
" code_run.click(\n",
" execute_code_in_virtualenv,\n",
" inputs=[dataset_out],\n",
" outputs=[code_out]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 646
},
"id": "jCAkTEtMi2Ep",
"outputId": "deeeb1a7-c432-4007-eba2-cbcc28dbc0ff"
},
"outputs": [],
"source": [
"ui.launch(inbrowser=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
"""new_training_with_RAG.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1gi8FPI1dtnxBNTf86JdmXQ0BYqnKz7LS
# Predict Product Prices
"""
!nvidia-smi
!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb
import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments,
    set_seed, BitsAndBytesConfig, GenerationConfig)
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt
#LangChain & RAG Imports
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.vectorstores import Chroma
import chromadb
from langchain.embeddings import HuggingFaceEmbeddings
# Commented out IPython magic to ensure Python compatibility.
# Constants
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
#BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.1'
PROJECT_NAME = "pricer-optim"
HF_USER = "Adriana213"
# Data
DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182
RUN_NAME = f"{PROJECT_NAME}-{datetime.now():%Y%m%d_%H%M%S}"
HUB_MODEL_NAME = f"{HF_USER}/{RUN_NAME}"
# Hyperparameters for QLoRA
LORA_R = 8
LORA_ALPHA = 32
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.10
QUANT_4_BIT = True
# Hyperparameters for Training
EPOCHS = 2
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.05
OPTIMIZER = "paged_adamw_32bit"
STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200 # kept for potential future use
# %matplotlib inline
HUB_MODEL_NAME
"""### Log in to HuggingFace & get Data"""
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)
torch.cuda.empty_cache()
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
"""## Now load the Tokenizer and Model
The model is "quantized" - we are reducing the precision to 4 bits.
"""
# Pick the right quantization
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )
# Load the Tokenizer and the Model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")
"""# Data Collator
"""
from trl import DataCollatorForCompletionOnlyLM
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
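# The collator masks every label token up to and including the response template
# "Price is $", so the loss is computed only on the tokens that follow it -- the price itself.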
"""# Set up the configuration for Training"""
# LoRA Config
lora_parameters = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)
# Training Config
train_parameters = SFTConfig(
    output_dir=RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=4,
    eval_strategy="no",
    eval_steps=EVAL_STEPS,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=5,
    logging_steps=50,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True,
    report_to='none',
)
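# Sanity check on the batch arithmetic: effective batch size
# = per-device batch * gradient accumulation steps = 16 * 1 = 16 sequences per optimizer step.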
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train,
    eval_dataset=test,
    peft_config=lora_parameters,
    args=train_parameters,
    data_collator=collator,
)
"""## Fine Tuning"""
fine_tuning.train()
fine_tuning.model.push_to_hub(RUN_NAME, private=True)
print(f"Saved to the hub: {RUN_NAME}")
"""# Implement RAG"""
HF_USER = "Adriana213"
RUN_NAME = "pricer-optim-20250514_061529"
fine_tuned_model = PeftModel.from_pretrained(base_model, f"{HF_USER}/{RUN_NAME}")
print(f"✅ Loaded fine-tuned adapter: {HF_USER}/{RUN_NAME}")
base_model = fine_tuned_model
"""## Build Chroma index"""
docs = [
    Document(page_content=text, metadata={'price': price})
    for text, price in zip(train['text'], train['price'])
]
# Create embeddings & persist Chroma index
embedding = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
chroma = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory='chroma_train_index'
)
chroma.persist()
print('Chroma index built and persisted.')
"""## RAG Prediction Function"""
generation_config = GenerationConfig(
    max_new_tokens=10,
    do_sample=False
)
def predict_price_rag(desc: str, k: int = 3) -> float:
    hits = chroma.similarity_search(desc, k=k)
    shot_strs = [
        f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}'
        for doc in hits
    ]
    prompt = "\n\n".join(shot_strs) + f"\n\nDescription: {desc}\nPrice is $"
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    out = base_model.generate(**inputs, generation_config=generation_config)
    text = tokenizer.decode(
        out[0, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    ).strip()
    return float(re.findall(r"\d+(?:\.\d+)?", text)[0])
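# Illustrative usage (assumes the Chroma index above has been built; the description is made up):
# predict_price_rag("Stainless steel 6-quart pressure cooker with digital timer", k=3)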
!zip -r chroma_index.zip chroma_train_index
from google.colab import files
files.download("chroma_index.zip")

View File

@@ -0,0 +1,258 @@
# -*- coding: utf-8 -*-
"""Testing Fine-tuned model with RAG
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1J8P8cwqwhBo3CNIZaEFe6BMRw0WUfEqy
## Predict Product Prices
### And now, to evaluate our fine-tuned open source model
"""
!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb
import os
import re
import math
from google.colab import userdata
from huggingface_hub import login
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    BitsAndBytesConfig, GenerationConfig)
from datasets import load_dataset
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import matplotlib.pyplot as plt
# Commented out IPython magic to ensure Python compatibility.
# Constants
BASE_MODEL = "meta-llama/Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "Adriana213"
RUN_NAME = "optim-20250514_061529"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
# Data
DATASET_NAME = f"{HF_USER}/pricer-data"
# Hyperparameters for QLoRA
QUANT_4_BIT = True
# %matplotlib inline
# Used for writing to output in color
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
"""### Log in to HuggingFace
"""
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
test[0]
"""## Now load the Tokenizer and Model"""
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )
# Load the Tokenizer and the Model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
# Load the fine-tuned model with PEFT
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
fine_tuned_model
"""# Evaluation"""
def extract_price(s):
    if "Price is $" in s:
        contents = s.split("Price is $")[1]
        contents = contents.replace(',', '')
        match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
        return float(match.group()) if match else 0
    return 0
extract_price("Price is $a fabulous 899.99 or so")
# Original prediction function takes the most likely next token
def model_predict(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
    response = tokenizer.decode(outputs[0])
    return extract_price(response)
# top_K = 3
# def improved_model_predict(prompt, device="cuda"):
#     set_seed(42)
#     inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
#     attention_mask = torch.ones(inputs.shape, device=device)
#     with torch.no_grad():
#         outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
#         next_token_logits = outputs.logits[:, -1, :].to('cpu')
#         next_token_probs = F.softmax(next_token_logits, dim=-1)
#     top_prob, top_token_id = next_token_probs.topk(top_K)
#     prices, weights = [], []
#     for i in range(top_K):
#         predicted_token = tokenizer.decode(top_token_id[0][i])
#         probability = top_prob[0][i]
#         try:
#             result = float(predicted_token)
#         except ValueError as e:
#             result = 0.0
#         if result > 0:
#             prices.append(result)
#             weights.append(probability)
#     if not prices:
#         return 0.0, 0.0
#     total = sum(weights)
#     weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]
#     return sum(weighted_prices).item()
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
chroma = Chroma(
    persist_directory="chroma_train_index",
    embedding_function=embedder
)
gen_config = GenerationConfig(max_new_tokens=10, do_sample=False)
def predict_price_rag(desc: str, k: int = 3) -> float:
    docs = chroma.similarity_search(desc, k=k)
    shots = "\n\n".join(f"Description: {d.page_content}\nPrice is ${d.metadata['price']}"
                        for d in docs)
    prompt = f"{shots}\n\nDescription: {desc}\nPrice is $"
    inp = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)
    out = fine_tuned_model.generate(**inp, generation_config=gen_config)
    txt = tokenizer.decode(out[0, inp["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
    return float(re.findall(r"\d+(?:\.\d+)?", txt)[0])
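# The assembled prompt is a k-shot pattern of retrieved neighbours followed by the query,
# roughly (illustrative shape):
#
#   Description: <retrieved product text>
#   Price is $24.99
#
#   Description: <query product text>
#   Price is $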
class Tester:
    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        if error < 40 or error / truth < 0.2:
            return "green"
        elif error < 80 or error / truth < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        title = datapoint["text"].split("\n\n")[1][:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        max_error = max(self.errors)
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        cls(function, data).run()
Tester.test(predict_price_rag, test)