Merge branch 'main' of github.com:bluebells1/llm_engineering into sm-branch-wk4

This commit is contained in:
Susan Martin
2025-06-24 10:22:44 +01:00
18 changed files with 4211 additions and 9 deletions

View File

@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "38795b24-9801-4cfb-a000-ccd7f41e6128",
"metadata": {},
"source": [
"\n",
"# 🧠 Multi-Product Competitor Intelligence Summarizer using Web Scraping + LLM\n",
"\n",
"This notebook scrapes product pages using `Selenium`, collects the product information, and summarizes key features and comparison insights using `Ollama (LLaMA3) and OpenAI`.\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"\n",
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
"metadata": {},
"outputs": [],
"source": [
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
"\n",
"system_prompt = \"Summarize the following product information for comparison.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38245e18",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 📦 Install required packages (run once)\n",
"!pip install selenium bs4 requests\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88ae528b-aefe-4c64-b927-676e739194af",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4a831a5",
"metadata": {},
"outputs": [],
"source": [
"def summarize_with_openai(text, model=\"gpt-4o-mini\"):\n",
" response = openai.chat.completions.create(\n",
" model=model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": text}\n",
" ],\n",
" temperature=0.7\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef65cd72",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# ⚙️ Selenium setup (headless)\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.common.by import By\n",
"import time\n",
"\n",
"def scrape_text_from_url(url):\n",
" options = Options()\n",
" options.add_argument(\"--headless=new\")\n",
" driver = webdriver.Chrome(options=options)\n",
" driver.get(url)\n",
" time.sleep(3)\n",
" \n",
" # You can tune this selector depending on the site\n",
" body = driver.find_element(By.TAG_NAME, 'body')\n",
" text = body.text\n",
" driver.quit()\n",
" return text.strip()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36e19014",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 🧠 LLM Prompting using Ollama (local llama3)\n",
"import subprocess\n",
"\n",
"def summarize_with_ollama(text):\n",
" prompt = f\"Summarize the following product description:\\n\\n{text}\\n\\nSummary:\"\n",
" try:\n",
" print(\"inside ollama\")\n",
" result = subprocess.run(\n",
" [\"ollama\", \"run\", \"llama3.2\"],\n",
" input=prompt,\n",
" capture_output=True, text=True, check=True, encoding=\"utf-8\"\n",
" )\n",
" print(\"git result\")\n",
" return result.stdout.strip()\n",
" except subprocess.CalledProcessError as e:\n",
" return f\"Error running ollama: {e.stderr}\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e04cea6e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 🔁 Analyze multiple product URLs and summarize\n",
"product_urls = {\n",
" \"iPhone 15 Pro\": \"https://www.apple.com/in/iphone-15-pro/\",\n",
" \"Samsung S24 Ultra\": \"https://www.samsung.com/in/smartphones/galaxy-s24-ultra/\",\n",
"}\n",
"\n",
"product_texts = {}\n",
"\n",
"for name, url in product_urls.items():\n",
" print(f\"Scraping {name} ...\")\n",
" product_texts[name] = scrape_text_from_url(url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ebd5a20",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 📄 Display side-by-side summaries\n",
"for name, text in product_texts.items():\n",
" print(f\"\\n🔹 {name} Summary with Ollama:\")\n",
" print(summarize_with_ollama(text))\n",
"\n",
" print(f\"\\n🔹 {name} Summary with OpenAI GPT:\")\n",
" print(summarize_with_openai(text))\n",
" print(\"=\"*100)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "935e0081-ccf5-4d9a-a984-ee82c77c04a2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,357 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "53211323-6a09-452a-b471-98e22d92bfc2",
"metadata": {},
"source": [
"# 🌐 WebPage Summarizer\n",
"---\n",
"- 🌍 **Task:** Summarizing webpage content using AI. \n",
"- 🧠 **Model:** OpenAI's ``gpt-4o-mini`` and ``llama3.2:3b`` for text summarization. \n",
"- 🕵️‍♂️ **Data Extraction:** Selenium for handling both static and JavaScript-rendered websites. \n",
"- 📌 **Output Format:** Markdown-formatted summaries. \n",
"- 🔗 **Scope:** Processes only the given webpage URL (not the entire site). \n",
"- 🚀 **Tools:** Python, Requests, Selenium, BeautifulSoup, OpenAI API, Ollama. \n",
"- 🧑‍💻 **Skill Level:** Beginner.\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key (for GPT model)\n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"- Google Chrome browser installed\n",
"\n",
"**✨ This script handles both JavaScript and non-JavaScript websites using Selenium with Chrome WebDriver for reliable content extraction from modern web applications.**\n",
"\n",
"Let's get started and automate website summarization! 🚀\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/01_basic_llm_project.jpg?raw=true)\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "d70aa4b0",
"metadata": {},
"source": [
"## 🛠️ Environment Setup & Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebf2fa36",
"metadata": {},
"outputs": [],
"source": [
"%pip install selenium webdriver-manager"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1dcf1d9d-c540-4900-b14e-ad36a28fc822",
"metadata": {},
"outputs": [],
"source": [
"# ===========================\n",
"# System & Environment\n",
"# ===========================\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"# ===========================\n",
"# Web Scraping\n",
"# ===========================\n",
"import time\n",
"from bs4 import BeautifulSoup\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
"# ===========================\n",
"# AI-related\n",
"# ===========================\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"import ollama"
]
},
{
"cell_type": "markdown",
"id": "cc20642b",
"metadata": {},
"source": [
"## 🔐 Model Configuration & Authentication"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8598c299-05ca-492e-b085-6bcc2f7dda0d",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if not api_key:\n",
" raise ValueError(\"OPENAI_API_KEY not found in environment variables\")\n",
"\n",
"print(\"✅ API key loaded successfully!\")\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8098defb",
"metadata": {},
"outputs": [],
"source": [
"MODEL_OPENAI = \"gpt-4o-mini\"\n",
"MODEL_OLLAMA = \"llama3.2:3b\""
]
},
{
"cell_type": "markdown",
"id": "2bd1d83f",
"metadata": {},
"source": [
"## 🌐 Web Scraping Infrastructure"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6fe5114",
"metadata": {},
"outputs": [],
"source": [
"class WebsiteCrawler:\n",
" def __init__(self, url):\n",
" self.url = url\n",
" self.title = \"\"\n",
" self.text = \"\"\n",
" self.scrape()\n",
"\n",
" def scrape(self):\n",
" try:\n",
" # Chrome options\n",
" chrome_options = Options()\n",
" chrome_options.add_argument(\"--headless\")\n",
" chrome_options.add_argument(\"--no-sandbox\")\n",
" chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
" chrome_options.add_argument(\"--disable-gpu\")\n",
" chrome_options.add_argument(\"--window-size=1920,1080\")\n",
" chrome_options.add_argument(\"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\")\n",
"\n",
" # Try to find Chrome\n",
" chrome_paths = [\n",
" r\"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe\",\n",
" r\"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe\",\n",
" r\"C:\\Users\\{}\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe\".format(os.getenv('USERNAME')),\n",
" ]\n",
"\n",
" chrome_binary = None\n",
" for path in chrome_paths:\n",
" if os.path.exists(path):\n",
" chrome_binary = path\n",
" break\n",
"\n",
" if chrome_binary:\n",
" chrome_options.binary_location = chrome_binary\n",
"\n",
" # Create driver\n",
" driver = webdriver.Chrome(options=chrome_options)\n",
" driver.set_page_load_timeout(30)\n",
"\n",
" print(f\"🔍 Loading: {self.url}\")\n",
" driver.get(self.url)\n",
"\n",
" # Wait for page to load\n",
" time.sleep(5)\n",
"\n",
" # Try to wait for main content\n",
" try:\n",
" WebDriverWait(driver, 10).until(\n",
" EC.presence_of_element_located((By.TAG_NAME, \"main\"))\n",
" )\n",
" except Exception:\n",
" try:\n",
" WebDriverWait(driver, 10).until(\n",
" EC.presence_of_element_located((By.TAG_NAME, \"body\"))\n",
" )\n",
" except Exception:\n",
" pass # Continue anyway\n",
"\n",
" # Get title and page source\n",
" self.title = driver.title\n",
" page_source = driver.page_source\n",
" driver.quit()\n",
"\n",
" print(f\"✅ Page loaded: {self.title}\")\n",
"\n",
" # Parse with BeautifulSoup\n",
" soup = BeautifulSoup(page_source, 'html.parser')\n",
"\n",
" # Remove unwanted elements\n",
" for element in soup([\"script\", \"style\", \"img\", \"input\", \"button\", \"nav\", \"footer\", \"header\"]):\n",
" element.decompose()\n",
"\n",
" # Get main content\n",
" main = soup.find('main') or soup.find('article') or soup.find('.content') or soup.find('body')\n",
" if main:\n",
" self.text = main.get_text(separator=\"\\n\", strip=True)\n",
" else:\n",
" self.text = soup.get_text(separator=\"\\n\", strip=True)\n",
"\n",
" # Clean up text\n",
" lines = [line.strip() for line in self.text.split('\\n') if line.strip() and len(line.strip()) > 2]\n",
" self.text = '\\n'.join(lines[:200]) # Limit to first 200 lines\n",
"\n",
" print(f\"📄 Extracted {len(self.text)} characters\")\n",
"\n",
" except Exception as e:\n",
" print(f\"❌ Error occurred: {e}\")\n",
" self.title = \"Error occurred\"\n",
" self.text = \"Could not scrape website content\""
]
},
{
"cell_type": "markdown",
"id": "d727feff",
"metadata": {},
"source": [
"## 🧠 Prompt Engineering & Templates"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02e3a673-a8a1-4101-a441-3816f7ab9e4d",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
"and provides a short summary, ignoring text that might be navigation related. \\\n",
"Respond in markdown.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86bb80f9-9e7c-4825-985f-9b83fe50839f",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89998b18-77aa-4aaf-a137-f0d078d61f75",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]"
]
},
{
"cell_type": "markdown",
"id": "cde36d4f",
"metadata": {},
"source": [
"## 📝 Summarization "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5636affe",
"metadata": {},
"outputs": [],
"source": [
"def summarize_gpt(url):\n",
" \"\"\"Scrape website and summarize with GPT\"\"\"\n",
" site = WebsiteCrawler(url)\n",
"\n",
" if \"Error occurred\" in site.title or len(site.text) < 50:\n",
" print(f\"❌ Failed to scrape meaningful content from {url}\")\n",
" return\n",
"\n",
" print(\"🤖 Creating summary...\")\n",
"\n",
" # Create summary\n",
" response = openai.chat.completions.create(\n",
" model=MODEL_OPENAI,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(site)}\n",
" ]\n",
" )\n",
"\n",
" web_summary = response.choices[0].message.content\n",
" display(Markdown(web_summary))\n",
"\n",
"summarize_gpt('https://openai.com')\n",
"# summarize_gpt('https://stripe.com')\n",
"# summarize_gpt('https://vercel.com')\n",
"# summarize_gpt('https://react.dev')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90b9a8f8-0c1c-40c8-a4b3-e8e1fcd29df5",
"metadata": {},
"outputs": [],
"source": [
"def summarize_ollama(url):\n",
" website = WebsiteCrawler(url)\n",
" response = ollama.chat(\n",
" model=MODEL_OLLAMA,\n",
" messages=messages_for(website))\n",
" display(Markdown(response['message']['content'])) # Generate and display output\n",
"\n",
"summarize_ollama('https://github.com')\n",
"# summarize_ollama('https://nextjs.org')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "dc8af57c-23a9-452e-9fc3-0e5027edda14",
"metadata": {},
"source": [
"# AI-powered Brochure Generator\n",
"---\n",
"- 🌍 Task: Generate a company brochure using its name and website for clients, investors, and recruits.\n",
"- 🧠 Model: Toggle `USE_OPENAI` to switch between OpenAI and Ollama models\n",
"- 🕵️‍♂️ Data Extraction: Scraping website content and filtering key links (About, Products, Careers, Contact).\n",
"- 📌 Output Format: a Markdown-formatted brochure streamed in real-time.\n",
"- 🚀 Tools: BeautifulSoup, OpenAI API, and IPython display, ollama.\n",
"- 🧑‍💻 Skill Level: Intermediate.\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key \n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "ec869f2c",
"metadata": {},
"source": [
"## 🧩 System Design Overview\n",
"\n",
"### Class Structure\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_class_diagram.png?raw=true)\n",
"\n",
"This code consists of three main classes:\n",
"\n",
"1. **`Website`**: \n",
" - Scrapes and processes webpage content. \n",
" - Extracts **text** and **links** from a given URL. \n",
"\n",
"2. **`LLMClient`**: \n",
" - Handles interactions with **OpenAI or Ollama (`llama3`, `deepseek`, `qwen`)**. \n",
" - Uses `get_relevant_links()` to filter webpage links. \n",
" - Uses `generate_brochure()` to create and stream a Markdown-formatted brochure. \n",
"\n",
"3. **`BrochureGenerator`**: \n",
" - Uses `Website` to scrape the main webpage and relevant links. \n",
" - Uses `LLMClient` to filter relevant links and generate a brochure. \n",
" - Calls `generate()` to run the entire process.\n",
"\n",
"### Workflow\n",
"\n",
"1. **`main()`** initializes `BrochureGenerator` and calls `generate()`. \n",
"2. **`generate()`** calls **`LLMClient.get_relevant_links()`** to extract relevant links using **LLM (OpenAI/Ollama)**. \n",
"3. **`Website` scrapes the webpage**, extracting **text and links** from the given URL. \n",
"4. **Relevant links are re-scraped** using `Website` to collect additional content. \n",
"5. **All collected content is passed to `LLMClient.generate_brochure()`**. \n",
"6. **`LLMClient` streams the generated brochure** using **OpenAI or Ollama**. \n",
"7. **The final brochure is displayed in Markdown format.**\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_process.png?raw=true)\n",
"\n",
"\n",
"### Intermediate reasoning\n",
"\n",
"In this workflow, we have intermediate reasoning because the LLM is called twice:\n",
"\n",
"1. **First LLM call**: Takes raw links → filters/selects relevant ones (reasoning step).\n",
"2. **Second LLM call**: Takes selected content → generates final brochure.\n",
"\n",
"🧠 **LLM output becomes LLM input** — thats intermediate reasoning.\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/02_llm_intermd_reasoning.png?raw=true)"
]
},
{
"cell_type": "markdown",
"id": "4b286461-35ee-4bc5-b07d-af554923e36d",
"metadata": {},
"source": [
"## 📦 Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fe5670c-5146-474b-9e75-484210533f55",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"import json\n",
"import ollama\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import display, Markdown, update_display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "markdown",
"id": "f3e23181-1e66-410d-a910-1fb4230f8088",
"metadata": {},
"source": [
"## 🧠 Define the Model\n",
"\n",
"The user can switch between OpenAI and Ollama by changing a single variable (`USE_OPENAI`). The model selection is dynamic."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2bd452-0cf4-4fec-9542-e1c86584c23f",
"metadata": {},
"outputs": [],
"source": [
"# Load API key\n",
"load_dotenv()\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"if not api_key or not api_key.startswith('sk-'):\n",
" raise ValueError(\"Invalid OpenAI API key. Check your .env file.\")\n",
"\n",
"# Define the model dynamically\n",
"USE_OPENAI = True # True to use openai and False to use Ollama\n",
"MODEL = 'gpt-4o-mini' if USE_OPENAI else 'llama3.2:3b'\n",
"\n",
"openai_client = OpenAI() if USE_OPENAI else None"
]
},
{
"cell_type": "markdown",
"id": "4fd997b7-1b89-4817-b53a-078164f5f71f",
"metadata": {},
"source": [
"## 🏗️ Define Classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aed1af59-8b8f-4add-98dc-a9f1b5b511a5",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
" \"\"\"\n",
" A utility class to scrape and process website content.\n",
" \"\"\"\n",
" def __init__(self, url):\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" self.text = self.extract_text(soup)\n",
" self.links = self.extract_links(soup)\n",
"\n",
" def extract_text(self, soup):\n",
" if soup.body:\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" return soup.body.get_text(separator=\"\\n\", strip=True)\n",
" return \"\"\n",
"\n",
" def extract_links(self, soup):\n",
" links = [link.get('href') for link in soup.find_all('a')]\n",
" return [link for link in links if link and 'http' in link]\n",
"\n",
" def get_contents(self):\n",
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea04dc7e-ff4c-4113-83b7-0bddcf5072b9",
"metadata": {},
"outputs": [],
"source": [
"class LLMClient:\n",
" def __init__(self, model=MODEL):\n",
" self.model = model\n",
"\n",
" def get_relevant_links(self, website):\n",
" link_system_prompt = \"\"\"\n",
" You are given a list of links from a company website.\n",
" Select only relevant links for a brochure (About, Company, Careers, Products, Contact).\n",
" Exclude login, terms, privacy, and emails.\n",
"\n",
" ### **Instructions**\n",
" - Return **only valid JSON**.\n",
" - **Do not** include explanations, comments, or Markdown.\n",
" - Example output:\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"about\", \"url\": \"https://company.com/about\"},\n",
" {\"type\": \"contact\", \"url\": \"https://company.com/contact\"},\n",
" {\"type\": \"product\", \"url\": \"https://company.com/products\"}\n",
" ]\n",
" }\n",
" \"\"\"\n",
"\n",
" user_prompt = f\"\"\"\n",
" Here is the list of links on the website of {website.url}:\n",
" Please identify the relevant web links for a company brochure. Respond in JSON format.\n",
" Do not include login, terms of service, privacy, or email links.\n",
" Links (some might be relative links):\n",
" {', '.join(website.links)}\n",
" \"\"\"\n",
"\n",
" if USE_OPENAI:\n",
" response = openai_client.chat.completions.create(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" )\n",
" return json.loads(response.choices[0].message.content.strip())\n",
" else:\n",
" response = ollama.chat(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" )\n",
" result = response.get(\"message\", {}).get(\"content\", \"\").strip()\n",
" try:\n",
" return json.loads(result) # Attempt to parse JSON\n",
" except json.JSONDecodeError:\n",
" print(\"Error: Response is not valid JSON\")\n",
" return {\"links\": []} # Return empty list if parsing fails\n",
"\n",
"\n",
" def generate_brochure(self, company_name, content, language):\n",
" system_prompt = \"\"\"\n",
" You are a professional translator and writer who creates fun and engaging brochures.\n",
" Your task is to read content from a companys website and write a short, humorous, joky,\n",
" and entertaining brochure for potential customers, investors, and job seekers.\n",
" Include details about the companys culture, customers, and career opportunities if available.\n",
" Respond in Markdown format.\n",
" \"\"\"\n",
"\n",
" user_prompt = f\"\"\"\n",
" Create a fun brochure for '{company_name}' using the following content:\n",
" {content[:5000]}\n",
" Respond in {language} only, and format your response correctly in Markdown.\n",
" Do NOT escape characters or return extra backslashes.\n",
" \"\"\"\n",
"\n",
" if USE_OPENAI:\n",
" response_stream = openai_client.chat.completions.create(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in response_stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
" else:\n",
" response_stream = ollama.chat(\n",
" model=self.model,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" full_text = \"\"\n",
" for chunk in response_stream:\n",
" if \"message\" in chunk:\n",
" content = chunk[\"message\"][\"content\"] or \"\"\n",
" full_text += content\n",
" update_display(Markdown(full_text), display_id=display_handle.display_id)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c69651f-e004-421e-acc5-c439e57a8762",
"metadata": {},
"outputs": [],
"source": [
"class BrochureGenerator:\n",
" \"\"\"\n",
" Main class to generate a company brochure.\n",
" \"\"\"\n",
" def __init__(self, company_name, url, language='English'):\n",
" self.company_name = company_name\n",
" self.url = url\n",
" self.language = language\n",
" self.website = Website(url)\n",
" self.llm_client = LLMClient()\n",
"\n",
" def generate(self):\n",
" links = self.llm_client.get_relevant_links(self.website)\n",
" content = self.website.get_contents()\n",
"\n",
" for link in links['links']:\n",
" linked_website = Website(link['url'])\n",
" content += f\"\\n\\n{link['type']}:\\n\"\n",
" content += linked_website.get_contents()\n",
"\n",
" self.llm_client.generate_brochure(self.company_name, content, self.language)\n"
]
},
{
"cell_type": "markdown",
"id": "1379d39d",
"metadata": {},
"source": [
"## 📝 Generate Brochure"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a63519a-1981-477b-9de1-f1ff9be94201",
"metadata": {},
"outputs": [],
"source": [
"def main():\n",
" company_name = \"Tour Eiffel\"\n",
" url = \"https://www.toureiffel.paris/fr\"\n",
" language = \"French\"\n",
"\n",
" generator = BrochureGenerator(company_name, url, language)\n",
" generator.generate()\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,142 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "6e907206-4c13-4698-91c6-9ca1c32be8e7",
"metadata": {},
"source": [
"# TechExplainAI\n",
"---\n",
"\n",
"AI-driven tool that provides concise, structured explanations for technical questions and code snippets.\n",
"\n",
"- 🌍 Task: AI-powered technical explanation generator\n",
"- 🧠 Model: OpenAI's `GPT-4o-mini`, Ollama's `llama3.2:3b`\n",
"- 📌 Output Format: Markdown with real-time streaming\n",
"- 🧑‍💻 Skill Level: Beginner\n",
"- 🔄 Interaction Mode: User enters a technical question → AI generates a structured, concise explanation\n",
"- 🎯 Purpose: Quickly explain technical concepts and Python code snippets\n",
"- 🔧 Customization: Users can modify the models, prompts, and formatting as needed\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key\n",
"- Install Ollama and pull llama3.2:3b or another lightweight model\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f743c87a-ed80-43d5-84ad-c78c8bdacb09",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"import ollama\n",
"from dotenv import load_dotenv\n",
"from IPython.display import display, Markdown, update_display\n",
"\n",
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"\n",
"# Set up OpenAI API key\n",
"OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')\n",
"if not OPENAI_API_KEY:\n",
" raise ValueError(\"Please set your OpenAI API key in environment variables.\")\n",
"\n",
"# Constants\n",
"MODEL_GPT = \"gpt-4o-mini\"\n",
"MODEL_LLAMA = \"llama3.2:3b\"\n",
"\n",
"# Prompt user for question (until input is provided)\n",
"while True:\n",
" question = input(\"Hello, I am your personal technical tutor. Enter your question: \").strip()\n",
" if question:\n",
" break # Proceed only if a valid question is entered\n",
" print(\"Question cannot be empty. Please enter a question.\")\n",
"\n",
"# Common user prompt\n",
"user_prompt = f\"\"\"\n",
"Please give a detailed explanation to the following question: {question}.\n",
"Be less verbose.\n",
"Provide a clear and concise explanation without unnecessary elaboration.\n",
"\"\"\"\n",
"\n",
"# Common system prompt\n",
"system_prompt = \"\"\"\n",
"You are a helpful AI assistant that explains Python code in a clear and concise manner. Provide structured explanations and examples when necessary.\n",
"Be less verbose.\n",
"\"\"\"\n",
"\n",
"def ask_openai():\n",
" \"\"\"Gets response from OpenAI's GPT model with streaming.\"\"\"\n",
" print(\"\\n\\n\\n🚀🤖🚀 Response from OpenAI GPT-4o-mini 🚀🤖🚀\")\n",
" client = openai.OpenAI(api_key=OPENAI_API_KEY)\n",
" response_stream = client.chat.completions.create(\n",
" model=MODEL_GPT,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
" response = \"\"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" for chunk in response_stream:\n",
" response += chunk.choices[0].delta.content or ''\n",
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
"\n",
"def ask_ollama():\n",
" \"\"\"Gets response from Ollama's Llama 3.2 model with streaming.\"\"\"\n",
" print(\"\\n\\n\\n🔥✨🔥 Response from Llama 3.2 🔥✨🔥\\n\")\n",
" response = ollama.chat(\n",
" model=MODEL_LLAMA,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ],\n",
" stream=True\n",
" )\n",
"\n",
" display_handle = display(Markdown(\"\"), display_id=True)\n",
" full_text = \"\"\n",
" for chunk in response:\n",
" if \"message\" in chunk:\n",
" content = chunk[\"message\"][\"content\"] or \"\"\n",
" full_text += content\n",
" update_display(Markdown(full_text), display_id=display_handle.display_id)\n",
"\n",
"# Call the functions\n",
"ask_openai()\n",
"ask_ollama()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,247 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "8ce13728-0040-43cc-82cd-e10c838ef71c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🌍 Detected language: PT\n",
"🔗 Preview of extracted text:\n",
"\n",
"ITASAT2 irá atuar para aplicações científicas e de defesa\n",
"Publicado em 14/04/2025 - 14h15\n",
"O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.\n",
"Participaram do encontro representantes do\n",
"...\n",
"\n",
"Amount of words: 526\n",
"\n",
"\n",
"📊 Usage Report\n",
"🧾 Prompt tokens: 927\n",
"🧠 Completion tokens: 309\n",
"🔢 Total tokens: 1236\n",
"💰 Total cost: $0.000927\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"# 📝 Summary\n",
"\n",
"The ITA (Instituto Tecnológico de Aeronáutica) is working on the ITASAT 2 project, a new microsatellite geared towards scientific and defense applications! 🌟 This initiative was highlighted at the Preliminary Design Review (PDR) held from March 17 to 19, with participation from notable organizations such as NASA and the Brazilian Space Agency (AEB). This is a fantastic collaboration that spans both domestic and international partnerships how exciting is that? \n",
"\n",
"ITASAT 2 will consist of a constellation of three CubeSats focusing on monitoring the Earth's ionosphere and assessing plasma bubble formation. Interestingly, it also has defense applications such as geolocating radio frequency sources and optical identification of uncooperative vessels a crucial capability for maritime security!\n",
"\n",
"The PDR showcased the team's technical and managerial capabilities, receiving unanimous approval to proceed with the project. Its great to see such thorough preparation reflecting the dedication of the ITA team! \n",
"\n",
"The CubeSats themselves are cubic nano or microsatellites, and the ITASAT 2 is of the 16U variety, meaning it's made up of 16 units measuring 10 cm each just amazing how compact these technologies can be! Additionally, the CEI is also developing another CubeSat called SelenITA, which will contribute to NASA's Artemis mission to study the Moon! 🌕\n",
"\n",
"Keep an eye on this remarkable project as it continues to develop the future of space exploration and defense technology looks bright! 🚀"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Import Libraries\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from langdetect import detect, LangDetectException\n",
"from dotenv import load_dotenv\n",
"\n",
"from IPython.display import Markdown, display\n",
"\n",
"# Load .env variables\n",
"load_dotenv()\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"if not openai_api_key:\n",
" raise ValueError(\"⚠️ OPENAI_API_KEY not found in .env file.\")\n",
"\n",
"# Generating object to work with GPT tasks \n",
"openai = OpenAI()\n",
"\n",
"# Class to work with text extraction, processing and summarizing from a given url\n",
"class WebPageSummarizer():\n",
"    \"\"\"\n",
"    Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library. It also includes pricing.\n",
"    \"\"\"\n",
"    def __init__(self, url: str, summary_detail: str = \"high\", show_summary: bool = True, language_of_reference = \"English\", model: str = \"gpt-4o-mini\") -> None:\n",
"\n",
"        # Initial summarizer settings\n",
"        self.url = url\n",
"        self.model = model\n",
"        self.show_summary = show_summary  # when True, summarize() prints a preview of the extracted text\n",
"        self.summary_detail = summary_detail  # \"high\" or \"low\"; validated below\n",
"        self.language_of_reference = language_of_reference\n",
"        # Maps human-readable language names to the ISO 639-1 codes returned by langdetect\n",
"        self.language_code_map = {\n",
"            \"english\": \"en\",\n",
"            \"portuguese\": \"pt\",\n",
"            \"spanish\": \"es\",\n",
"            \"french\": \"fr\",\n",
"            \"german\": \"de\",\n",
"            \"italian\": \"it\",\n",
"            \"japanese\": \"ja\",\n",
"            \"chinese\": \"zh\",\n",
"            \"korean\": \"ko\",\n",
"        }\n",
"        \n",
"        # USD rates per 1K tokens, split by input (prompt) and output (completion);\n",
"        # consumed by __calculate_cost, which divides token counts by 1000\n",
"        self.model_pricing = {\n",
"            \"gpt-4o-mini\": {\"input\": 0.0005, \"output\": 0.0015},\n",
"            \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
"            \"gpt-4-turbo\": {\"input\": 0.01, \"output\": 0.03},\n",
"            \"gpt-4\": {\"input\": 0.03, \"output\": 0.06}, # Rarely used now\n",
"            \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015}\n",
"        }\n",
"\n",
"        # Browser-like User-Agent so simple bot filters don't reject the request\n",
"        self.headers = {\n",
"            \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \"\n",
"                          \"(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36\"\n",
"        }\n",
"\n",
"        if self.summary_detail not in [\"high\", \"low\"]:\n",
"            raise Exception(\"\"\"Please select summary detail as either \"high\" or \"low\".\"\"\")\n",
"\n",
"    def __extract_text(self):\n",
"        \"\"\"Fetch self.url and store its paragraph text in self.text, capped at 7000 words.\"\"\"\n",
"        response = requests.get(self.url, headers=self.headers)\n",
"        if response.status_code != 200:\n",
"            raise Exception(f\"Failed to fetch page. Status code: {response.status_code}\")\n",
"        \n",
"        soup = BeautifulSoup(response.text, \"html.parser\")\n",
"        \n",
"        # Try to extract meaningful content\n",
"        paragraphs = soup.find_all(\"p\")\n",
"        \n",
"        # Join all paragraph text\n",
"        self.text = \"\\n\".join([p.get_text() for p in paragraphs if p.get_text().strip() != \"\"])\n",
"\n",
"        # Guarantee limit of text to summary\n",
"        # NOTE(review): when the cap applies, rejoining on spaces discards the original newlines\n",
"        max_words = 7000\n",
"        if len(self.text.split()) > max_words:\n",
"            self.text = \" \".join(self.text.split()[:max_words])\n",
" \n",
"    def __detect_language(self):\n",
"        \"\"\"Detect the fetched page's language and resolve the target language code.\n",
"\n",
"        Sets self.language_url (ISO code of the page text, or \"unknown\") and\n",
"        self.target_language_code. Raises ValueError if language_of_reference is\n",
"        not one of the supported languages in self.language_code_map.\n",
"        \"\"\"\n",
"        # Detect language\n",
"        try:\n",
"            self.language_url = detect(self.text)\n",
"        except LangDetectException:\n",
"            self.language_url = \"unknown\"\n",
"\n",
"        # Normalize and resolve target language code\n",
"        target_language_name = self.language_of_reference.lower().strip()\n",
"        self.target_language_code = self.language_code_map.get(target_language_name)\n",
"        \n",
"        if not self.target_language_code:\n",
"            # Bug fix: previously referenced undefined global LANGUAGE_CODE_MAP (NameError);\n",
"            # the supported-language map lives on the instance\n",
"            raise ValueError(f\"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map.keys())}\")\n",
"\n",
"        print(f\"🌍 Detected language: {self.language_url.upper()}\")\n",
"        \n",
"        if self.show_summary:\n",
"            print(\"🔗 Preview of extracted text:\\n\")\n",
"            print(self.text[:500] + \"\\n...\\n\")\n",
"            print(f\"Amount of words: {len(self.text.split())}\\n\")\n",
"\n",
"    def __calculate_cost(self, prompt_tokens: int, completion_tokens: int) -> float:\n",
"        \"\"\"\n",
"        Return the total cost in USD of one API call for the configured model.\n",
"        \"\"\"\n",
"        rates = self.model_pricing.get(self.model)\n",
"        if rates is None:\n",
"            raise ValueError(f\"\"\"Pricing not available for model \"{self.model}\". Add it to model_pricing.\"\"\")\n",
"\n",
"        # Rates are quoted per 1,000 tokens, separately for prompt (input) and completion (output)\n",
"        cost_for_prompt = (prompt_tokens / 1000) * rates[\"input\"]\n",
"        cost_for_completion = (completion_tokens / 1000) * rates[\"output\"]\n",
"        return cost_for_prompt + cost_for_completion\n",
"\n",
"    def summarize(self)-> str:\n",
"        \"\"\"\n",
"        Extract the page text, detect its language, and return a markdown summary from the model.\n",
"        Also prints a token-usage and cost report as a side effect.\n",
"        \"\"\"\n",
"        self.__extract_text()\n",
"        self.__detect_language()\n",
"        \n",
"        # Prompt for system definition\n",
"        # Bug fix: \"tile\" -> \"title\" so the model is asked for a heading, not a 'tile'\n",
"        self.system_prompt = f\"\"\" \n",
"        You are an assistant that analyzes the contents of a website and provides a summary. \n",
"        Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.\n",
"        If you find text that might be navigation related or ad related please ignore. Respond in markdown. \n",
"        Also, can you please start your summary with the title \"📝 Summary\"?\n",
"        \n",
"        Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. \n",
"        \"\"\"\n",
"\n",
"        self.content = f\"\"\"The text to summarize is as follows: {self.text}\"\"\"\n",
"\n",
"        # If the page language differs from the requested output language, ask for a translation first\n",
"        if self.language_url != self.target_language_code:\n",
"            self.system_prompt = f\"\"\"The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. \n",
"            {self.system_prompt.strip()}\n",
"            \"\"\"\n",
"\n",
"        response = openai.chat.completions.create(model=self.model, messages=[{\"role\":\"system\", \"content\":self.system_prompt}, \n",
"                                                                              {\"role\": \"user\", \"content\":self.content}])\n",
"\n",
"        # Cost calculation and usage report\n",
"        usage = response.usage\n",
"        total_cost = self.__calculate_cost(usage.prompt_tokens, usage.completion_tokens)\n",
"        \n",
"        print(\"\\n📊 Usage Report\")\n",
"        print(f\"🧾 Prompt tokens: {usage.prompt_tokens}\")\n",
"        print(f\"🧠 Completion tokens: {usage.completion_tokens}\")\n",
"        print(f\"🔢 Total tokens: {usage.total_tokens}\")\n",
"        print(f\"💰 Total cost: ${total_cost:.6f}\\n\\n\\n\")\n",
"\n",
"        return response.choices[0].message.content\n",
"\n",
"\n",
"web_page_summarizer = WebPageSummarizer(\"http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada\", summary_detail = \"low\")\n",
"display(Markdown(web_page_summarizer.summarize()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af5a186a-bb25-4cf4-a6d2-6034cd493bc4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
"metadata": {},
"outputs": [],
"source": [
"!pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb6636be-e43f-4896-aadd-cafda003ed4e",
"metadata": {},
"outputs": [],
"source": [
"!pip install -q -U google-genai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfe66209-1d33-4292-80f1-20e11baf4bc3",
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"from google import genai\n",
"from google.genai import types\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2b4306c-17d0-46fe-a889-7440ff809dc6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"#load env\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('GEMINI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "markdown",
"id": "08ec6fec-886c-4a0c-a046-e8643ad700d3",
"metadata": {},
"source": [
"# Let's make a simple call to check that our model is working"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89143d5c-0013-4f7e-8e1f-f7db7e936f0d",
"metadata": {},
"outputs": [],
"source": [
"client = genai.Client(api_key=api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1144b77a-6785-479a-ab4f-bb0ab5624b49",
"metadata": {},
"outputs": [],
"source": [
"\n",
"response = client.models.generate_content(\n",
" model=\"gemini-2.5-flash-preview-05-20\",\n",
" contents=[\"hi gemini\"]\n",
")\n",
"print(response.text)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbf3836c-19b8-44e1-904a-f265925c2786",
"metadata": {},
"outputs": [],
"source": [
"\n",
"class Website:\n",
"    \"\"\"Fetch a URL with Selenium Chrome and expose its title and visible body text.\"\"\"\n",
"    def __init__(self, url, driver_path=None, wait_time=3):\n",
"        self.url = url\n",
"        self.wait_time = wait_time\n",
"\n",
"        # Headless Chrome settings\n",
"        options = Options()\n",
"        # options.add_argument(\"--headless\") \n",
"        # Headless mode runs the browser in the background (invisible).\n",
"        # However, some websites (like openai.com) block headless browsers.\n",
"        # So if this line is active, the page may not load correctly and you may not get the full content.\n",
"        options.add_argument(\"--disable-gpu\")\n",
"        options.add_argument(\"--no-sandbox\")\n",
"        options.add_argument(\"--window-size=1920x1080\")\n",
"\n",
"        # Driver path\n",
"        if driver_path:\n",
"            service = Service(executable_path=driver_path)\n",
"        else:\n",
"            service = Service() \n",
"\n",
"        # Start browser; bug fix: quit the driver even if the page load raises,\n",
"        # so a failed fetch no longer leaks a Chrome process\n",
"        driver = webdriver.Chrome(service=service, options=options)\n",
"        try:\n",
"            driver.get(url)\n",
"\n",
"            # Wait for the loading page\n",
"            time.sleep(self.wait_time)\n",
"\n",
"            # Take page source\n",
"            html = driver.page_source\n",
"        finally:\n",
"            driver.quit()\n",
"\n",
"        # Analysis with BeautifulSoup \n",
"        soup = BeautifulSoup(html, 'html.parser')\n",
"        self.title = soup.title.string if soup.title else \"No title found\"\n",
"\n",
"        # Clean irrelevant tags\n",
"        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
"            irrelevant.decompose()\n",
"\n",
"        self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are an academic research assistant specialized in summarizing scholarly papers. Follow this workflow rigorously:\n",
"\n",
"Step 1: Document Verification\n",
"Verify if the input is a research paper by checking for:\n",
"\n",
"Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)\n",
"\n",
"Technical/scholarly language\n",
"\n",
"Citations (in-text or bibliography)\n",
"\n",
"Research claims or data analysis\n",
"If NOT a research paper:\n",
"→ Respond: \"This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization.\"\n",
"\n",
"Step 2: Structured Summary (If verified)\n",
"Generate a 5-section summary in this exact format:\n",
"\n",
"1. Research Question\n",
"[Identify core problem/gap addressed in 1 sentence]\n",
"\n",
"2. Methodology\n",
"[Study design, data sources, analytical techniques in 2 bullet points]\n",
"\n",
"3. Key Findings\n",
"[3-4 quantified results with numerical evidence from tables/figures]\n",
"\n",
"4. Limitations\n",
"[2 major constraints acknowledged by authors]\n",
"\n",
"5. Significance\n",
"[Impact on field & practical implications in 1 sentence]\n",
"\n",
"Critical Rules:\n",
"Accuracy Priority: Never invent data. Write \"Not specified\" for missing elements\n",
"\n",
"Source Anchoring: Cite page/paragraph numbers for claims (e.g., \"Fig 3 shows 24% improvement\")\n",
"\n",
"Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline\n",
"\n",
"Bias Alert: Flag any undeclared funding/sponsorship conflicts\n",
"\n",
"Output Format: Strict Markdown with section headers, 200-word maximum\n",
"\n",
"Example Output:\n",
"1. Research Question\n",
"How does microplastic concentration affect zebrafish neural development?\n",
"\n",
"2. Methodology\n",
"\n",
"Exposed embryos to 0.1-10μm PET particles (5-100mg/L) for 96h\n",
"\n",
"Quantified gene expression (RT-qPCR) and behavioral assays (Open Field Test)\n",
"\n",
"3. Key Findings\n",
"▲ 40% reduction in neuron count at 50mg/L exposure (p<0.01, Fig 2B)\n",
"■ 2.3x increase in anxiolytic behavior (Table 3)\n",
"▼ 17% downregulation in shha expression (p=0.03)\n",
"\n",
"4. Limitations\n",
" \n",
"Used static exposure vs dynamic aquatic environments\n",
"\n",
"Limited proteomic validation\n",
"\n",
"5. Significance\n",
"Establishes dose-dependent neurotoxicity thresholds for aquatic toxicology regulations.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
"metadata": {},
"outputs": [],
"source": [
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
"    \"\"\"Build the user prompt asking for a markdown summary of the given Website object.\"\"\"\n",
"    return (\n",
"        f\"You are looking at a website titled {website.title}\"\n",
"        \"\\nThe contents of this website is as follows; please provide a summary of this website in markdown.\\n\\n\"\n",
"        + website.text\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4257406-089b-45a3-bfb5-272004360a49",
"metadata": {},
"outputs": [],
"source": [
"def summarize(url):\n",
"    \"\"\"Scrape the page at `url` with the Website class and return Gemini's summary text.\n",
"\n",
"    Uses notebook-level globals: client (genai.Client), system_prompt, Website.\n",
"    \"\"\"\n",
"    website = Website(url)\n",
"    response = client.models.generate_content(\n",
"        model=\"gemini-2.5-flash-preview-05-20\",\n",
"        config=types.GenerateContentConfig(\n",
"            system_instruction=system_prompt),\n",
"        contents=user_prompt_for(website)\n",
"    )\n",
"\n",
"    return response.text\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f68b32ae-9e65-4aa4-ae8d-cc2482c4a2e2",
"metadata": {},
"outputs": [],
"source": [
"def display_summary(url):\n",
"    \"\"\"Summarize the page at `url` and render the result as Markdown in the notebook.\"\"\"\n",
"    display(Markdown(summarize(url)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae52543c-01c1-4262-b53c-95ef4e5a93aa",
"metadata": {},
"outputs": [],
"source": [
"display_summary(\"https://onlinelibrary.wiley.com/doi/full/10.1155/2021/8812542\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,557 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "ae1ef804-3504-488d-af86-5a0da36fea78",
"metadata": {},
"source": [
"# ☀️🏃‍♀️ WeatherMate\n",
"----\n",
"\n",
"**WeatherMate** is a conversational **AI agent** that analyzes real-time weather conditions and suggests the best activities and events based on location. Whether it's sunny, rainy, or snowy, WeatherMate helps you make the most of your day! \n",
"\n",
"Here's how it works:\n",
"1. Get current weather conditions for the user's location.\n",
"2. Recommend suitable indoor or outdoor activities based on the weather.\n",
"3. Find relevant events using the Ticketmaster API.\n",
"4. Merge both activity suggestions and events into a single, structured response.\n",
"\n",
"---\n",
"\n",
"Large Language Models (LLMs), by themselves, cannot fetch real-time data such as weather information. To enable LLMs to access and use such real-time data, we integrate **external tools.** \n",
"\n",
"In this notebook, we will implement a weather API, allowing the assistant to fetch real-time weather information and use it for personalized activity suggestions based on current weather conditions. This is an essential step in transforming an LLM into a more interactive and data-driven AI assistant.\n",
"\n",
"\n",
"In this notebook, we will develop a conversational AI Agent that helps users receive personalized activity recommendations based on real-time weather data.\n",
"\n",
"- 🧑‍💻 Skill Level: Advanced\n",
"- 📤 Output Format: conversational chat\n",
"- 🚀 Tools:\n",
" - Weather API integration \n",
" - Ticketmaster API\n",
" - OpenAI with external tool handling\n",
" - Gradio for the UI\n",
"\n",
"🛠️ Requirements\n",
"- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n",
"- 🔑 OpenAI API Key\n",
"- 🔑 Weather API integration (https://www.weatherapi.com)\n",
"- 🔑 Ticketmaster API (https://developer.ticketmaster.com/explore/)\n",
"\n",
"⚙️ Customizable by user\n",
"- 🤖 Selected model\n",
"- 📜 system_prompt: Controls model behavior\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"id": "ad262788",
"metadata": {},
"source": [
"**Class Diagram**\n",
"\n",
"![](https://github.com/lisekarimi/lexo/blob/main/assets/05_weather_class_diagram.png?raw=true)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6b7a492-f510-4ba4-bbc3-239675d389dd",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import json\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from datetime import datetime\n",
"\n",
"# Initialization\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"if not openai_api_key:\n",
" print(\"❌ OpenAI API Key is missing!\")\n",
"\n",
"weather_api_key = os.getenv('WEATHERAPI_KEY')\n",
"if not weather_api_key:\n",
" print(\"❌ Weather API Key is missing!\")\n",
"\n",
"ticketmaster_api_key = os.getenv('TICKETMASTER_KEY')\n",
"if not ticketmaster_api_key:\n",
" print(\"❌ TicketMaster API Key is missing!\")\n",
"\n",
"\n",
"MODEL = \"gpt-4o-mini\"\n",
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "347dbe00-5826-4aa6-9d2c-9d028fc33ec8",
"metadata": {},
"outputs": [],
"source": [
"# Get today's date and day name\n",
"today_str = datetime.today().strftime('%Y-%m-%d')\n",
"day_name = datetime.today().strftime('%A')\n",
"\n",
"nb_activity = 10\n",
"\n",
"\n",
"system_message = f\"\"\"\n",
"You are a fun and helpful assistant for an Activity Suggestion App.\n",
"Your job is to recommend **up to {nb_activity} activities** based on the real-time weather fetched from the API, ensuring a mix of **indoor, outdoor, and event-based activities** whenever possible.\n",
"\n",
"The total must always be **10 or fewer**, following this rule:\n",
"**nb_events + nb_indoors + nb_outdoors ≤ 10**.\n",
"\n",
"You must **analyze and think carefully** to determine the best combination of activities and events for the user.\n",
"- Evaluate **weather conditions** to decide if outdoor activities are suitable.\n",
"- Check **event availability** and select the most relevant ones.\n",
"- Balance **indoor, outdoor, and event-based activities** dynamically to provide the best experience.\n",
"\n",
"If one of these categories is unavailable, that's fine—just provide the best possible suggestions without exceeding **10 activities**.\n",
"Deliver everything **in one go—no waiting!**\n",
"\n",
"\n",
"### **Understanding Relative Dates**\n",
"- Always interpret relative dates based on **{today_str} ({day_name})**.\n",
"- The weekend always refers to Saturday and Sunday.\n",
"- \"Next {day_name}\" should refer to the **closest upcoming occurrence** of that day.\n",
"- If the user asks for a time range (e.g., \"the next 3 days\"), calculate the **exact date range** starting from today.\n",
"- If no specific date is mentioned, **assume today by default**.\n",
"- **Do not ask for confirmation** when interpreting dates—just assume the correct date and proceed confidently unless there's real ambiguity.\n",
"\n",
"### **Activity and Event Suggestion Process**\n",
"To provide the best {nb_activity} activity recommendations, follow these steps:\n",
"Step 1: Retrieve Weather Data Use the Weather API to get current conditions for the user's location.\n",
"Step 2: Suggest Activities Recommend suitable indoor or outdoor activities based on the weather.\n",
"Step 3: Fetch Events (if available) Use the Ticketmaster API to find relevant events in the users area.\n",
"Step 4: Combine Everything Merge both event listings and activity suggestions into a single, well-structured response.\n",
"This entire process should be done seamlessly in one go without making the user wait.\n",
"\n",
"### **How to Handle Each API**\n",
"- **Weather API Handling**:\n",
" - If the user requests a relative date (e.g., \"tomorrow,\" \"next Monday\"), calculate the number of days from today.\n",
" - Provide the weather forecast only for the requested date, ignoring any other days in the response.\n",
" - If no weather data is available, inform the user in a friendly, light-hearted way.\n",
" - The forecast is limited to 14 days, so if the user requests a longer period, politely let him know.\n",
"\n",
"- **Ticketmaster API Handling**:\n",
" - If the user asks for events today, set the start date as todays date.\n",
" - If the user asks for any specific weekday, find the next occurrence of that day and use it as the start date.\n",
" - If the user asks for a range of days (e.g., \"the next 3 days\"), use todays date as the start date.\n",
" - The country corresponding to the user's city must be represented using the ISO Alpha-2 Code (e.g., FR for France, US for the United States, CA for Canada, DK for Denmark).\n",
" - If more than 5 events are found, ask the user for their interests to refine the search, using a one-word keyword like 'music,' 'cinema,' or 'theater.'\n",
" - If no events are found, explicitly inform the user in a friendly, funny way.\n",
" - Do not mention Ticketmaster unless necessary; simply state that you are checking for events.\n",
"\n",
"### **User Interaction Rules**\n",
"- If the user **doesnt mention a city**, **ask them to provide one**.\n",
"- If an event search fails, do **not** mention Ticketmaster; simply say that no events were found.\n",
"- Ensure all activity suggestions are provided **in one response**, combining weather-based activities and event suggestions.\n",
"\n",
"\n",
"### **Event Formatting in Output**\n",
"**If Ticketmaster events are available**, format the output as follows:\n",
"Here are some events that may interest you:\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"(And don't forget to separate these gems with a snazzy divider)\n",
"\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"(Another divider, because we like to keep things fresh!)\n",
"\n",
"**Event Name**:\n",
"- 📅 Date: Give the date like 19th March 2025\n",
"- 📍 Venue:\n",
"- 🔗 Ticket Link: Put the URL here\n",
"\n",
"### **Tone and Style**\n",
"**Keep it short, fun, and dont forget to add a dash of humor!**\n",
"Your job is to keep the user smiling while giving them the **best activities for the day**.\n",
"Be **accurate and concise**, but lets keep it **light and lively!** 🎉\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578da33d-be38-4c75-8a96-9d6bfc1af99b",
"metadata": {},
"outputs": [],
"source": [
"class WeatherAPI:\n",
"    def get_weather(self, city: str, days: int) -> dict:\n",
"        \"\"\"Fetches weather data for the given city for the next 'days' number of days.\n",
"\n",
"        Returns {\"city\": ..., \"forecast\": [{\"date\": ..., \"temp\": ...}, ...]} on success,\n",
"        or an {\"error\": ...} dict when the city is unknown or the request fails.\n",
"        \"\"\"\n",
"        url = \"https://api.weatherapi.com/v1/forecast.json\"\n",
"        params = {\"key\": weather_api_key, \"q\": city, \"days\": days}\n",
"        # Robustness fix: a timeout keeps the chat responsive if the weather service hangs\n",
"        response = requests.get(url, params=params, timeout=10)\n",
"\n",
"        if response.status_code == 200:\n",
"            data = response.json()\n",
"            # Keep only the fields the assistant needs: date and average temperature\n",
"            forecast = []\n",
"            for day in data[\"forecast\"][\"forecastday\"]:\n",
"                forecast.append({\n",
"                    \"date\": day[\"date\"],\n",
"                    \"temp\": day[\"day\"][\"avgtemp_c\"]\n",
"                })\n",
"\n",
"            result = {\n",
"                \"city\": city,\n",
"                \"forecast\": forecast\n",
"            }\n",
"            return result\n",
"        else:\n",
"            return {\"error\": f\"City '{city}' not found or other issue. Please check the city name and try again.\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "305f9f18-8556-4b49-9f6b-4a2233eefae9",
"metadata": {},
"outputs": [],
"source": [
"from abc import ABC, abstractmethod\n",
"\n",
"class BaseEventAPI(ABC):\n",
"    @abstractmethod\n",
"    def get_events(self, city, country_code, keywords, start_date, size=10):\n",
"        \"\"\"Fetches upcoming events from an event provider.\"\"\"\n",
"        pass  # Subclasses must implement this method\n",
"\n",
"class TicketmasterAPI(BaseEventAPI):\n",
"    # Consistency fix: the abstract method declared (..., size) while this override took\n",
"    # (..., start_date); both now share (..., start_date, size=10), which stays\n",
"    # backward-compatible with existing positional calls.\n",
"    def get_events(self, city, country_code, keywords, start_date, size=10):\n",
"        \"\"\"Fetches upcoming events from Ticketmaster for a given city.\n",
"\n",
"        Returns a list of {name, date, venue, url} dicts (possibly empty),\n",
"        or an {\"error\": ...} dict when the HTTP request fails.\n",
"        \"\"\"\n",
"        url = \"https://app.ticketmaster.com/discovery/v2/events.json\"\n",
"        params = {\n",
"            \"apikey\": ticketmaster_api_key,\n",
"            \"city\": city,\n",
"            \"countryCode\": country_code,\n",
"            \"keyword\": \",\".join(keywords or []),  # tolerate keywords=None (optional in the tool schema)\n",
"            # Bug fix: size was hard-coded to 10 even though the tool schema passes a size argument\n",
"            \"size\": size,\n",
"            \"startDateTime\": start_date\n",
"        }\n",
"\n",
"        # Robustness fix: a timeout keeps the chat responsive if Ticketmaster hangs\n",
"        response = requests.get(url, params=params, timeout=10)\n",
"\n",
"        if response.status_code == 200:\n",
"            data = response.json()\n",
"            events = data.get(\"_embedded\", {}).get(\"events\", [])\n",
"            return [\n",
"                {\n",
"                    \"name\": event[\"name\"],\n",
"                    \"date\": event[\"dates\"][\"start\"][\"localDate\"],\n",
"                    \"venue\": event[\"_embedded\"][\"venues\"][0][\"name\"],\n",
"                    \"url\": event.get(\"url\", \"N/A\")  # Using .get() to avoid KeyError\n",
"                }\n",
"                for event in events\n",
"            ] if events else []\n",
"        else:\n",
"            return {\"error\": f\"API request failed! Status: {response.status_code}, Response: {response.text}\"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c60820f-4e9f-4851-8330-52c8fd676259",
"metadata": {},
"outputs": [],
"source": [
"class ChatAssistant:\n",
" def __init__(self):\n",
" self.model = MODEL\n",
" self.tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_weather\",\n",
" \"description\": \"Get the current weather and forecast for the destination city.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city for which the weather is being requested.\"\n",
" },\n",
" \"days\": {\n",
" \"type\": \"integer\",\n",
" \"description\": \"The number of days for the weather forecast (can be 1, 2, 6, or 10).\"\n",
" }\n",
" },\n",
" \"required\": [\"city\", \"days\"],\n",
" \"additionalProperties\": False\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_ticketmaster_events\",\n",
" \"description\": \"Fetch upcoming events from Ticketmaster.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"city\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"City where the events are searched.\"\n",
" },\n",
" \"country_code\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"Country code for filtering results.\"\n",
" },\n",
" \"keywords\": {\n",
" \"type\": \"array\",\n",
" \"items\": {\n",
" \"type\": \"string\"\n",
" },\n",
" \"description\": \"Optional keywords for event search (e.g., 'music', 'concert').\"\n",
" },\n",
" \"size\": {\n",
" \"type\": \"integer\",\n",
" \"description\": \"Number of events to fetch.\"\n",
" },\n",
" \"start_date\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"Start date for the event search.\"\n",
" }\n",
" },\n",
" \"required\": [\"city\", \"country_code\", \"size\", \"start_date\"],\n",
" \"additionalProperties\": False\n",
" }\n",
" }\n",
" }\n",
" ]\n",
"\n",
" def chat(self, user_message, history, weather_api, event_apis):\n",
" # Build the conversation\n",
" messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": user_message}]\n",
"\n",
" # OpenAI response\n",
" response = openai.chat.completions.create(model=self.model, messages=messages, tools=self.tools, stream=True)\n",
"\n",
" recovered_pieces = {\n",
" \"content\": None,\n",
" \"role\": \"assistant\",\n",
" \"tool_calls\": {}\n",
" }\n",
" last_tool_calls = {}\n",
" has_tool_call = False\n",
" result = \"\" # Initialize result accumulator\n",
" # previous_index = None # Track the last processed index\n",
"\n",
" for chunk in response:\n",
" delta = chunk.choices[0].delta\n",
" finish_reason = chunk.choices[0].finish_reason\n",
"\n",
" # Handle tool call detection\n",
" if delta.tool_calls and finish_reason in [None, \"tool_calls\"]:\n",
" has_tool_call = True\n",
" piece = delta.tool_calls[0] # Get the first piece in the tool call\n",
"\n",
" # Create a dictionary for the tool call if it doesn't exist yet\n",
" recovered_pieces[\"tool_calls\"][piece.index] = recovered_pieces[\"tool_calls\"].get(\n",
" piece.index, {\"id\": None, \"function\": {\"arguments\": \"\", \"name\": \"\"}, \"type\": \"function\"}\n",
" )\n",
"\n",
" if piece.id:\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"id\"] = piece.id\n",
" if piece.function.name:\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"name\"] = piece.function.name\n",
" recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"arguments\"] += piece.function.arguments\n",
"\n",
" # Store the tool call in the dictionary by index\n",
" last_tool_calls[piece.index] = recovered_pieces[\"tool_calls\"][piece.index]\n",
"\n",
" # Store content in result and yield\n",
" else:\n",
" result += delta.content or \"\"\n",
" if result.strip():\n",
" yield result\n",
"\n",
"\n",
" # Handle tool call scenario\n",
" if has_tool_call:\n",
" # Handle the tool calls\n",
" response = self.handle_tool_call(last_tool_calls, weather_api, event_apis)\n",
"\n",
" if response: # Only iterate if response is not None\n",
" tool_calls_list = [tool_call for tool_call in last_tool_calls.values()]\n",
" messages.append({\"role\": \"assistant\", \"tool_calls\": tool_calls_list}) # Append the tool calls to the messages\n",
"\n",
" # Dynamically process each tool call response and append it to the message history\n",
" for res in response:\n",
" messages.append({\n",
" \"role\": \"tool\",\n",
" \"tool_call_id\": res[\"tool_call_id\"],\n",
" \"content\": json.dumps(res[\"content\"])\n",
" })\n",
"\n",
" # New OpenAI request with tool response\n",
" response = openai.chat.completions.create(model=self.model, messages=messages, stream=True)\n",
"\n",
" result = \"\" # Reset result before second stream\n",
" for chunk in response:\n",
" result += chunk.choices[0].delta.content or \"\"\n",
" if result.strip():\n",
" yield result\n",
"\n",
"\n",
" def handle_tool_call(self, tool_call, weather_api, event_apis):\n",
" stored_values = {} # Dictionary to store the valid value for each field\n",
"\n",
" for index, call in tool_call.items():\n",
" # Load the arguments for each tool call dynamically\n",
" arguments = json.loads(call[\"function\"][\"arguments\"])\n",
"\n",
" # Iterate over all keys dynamically\n",
" for key, value in arguments.items():\n",
" # Update the field if it's currently None or hasn't been set before\n",
" if key not in stored_values or stored_values[key] is None:\n",
" stored_values[key] = value\n",
"\n",
" city = stored_values.get('city')\n",
" days = stored_values.get('days')\n",
" country_code = stored_values.get('country_code')\n",
" keywords = stored_values.get('keywords', [])\n",
" # size = stored_values.get('size')\n",
" start_date = stored_values.get('start_date')\n",
" start_date = str(start_date) + \"T00:00:00Z\"\n",
"\n",
" weather_data = None\n",
" event_data = None\n",
"\n",
" # Iteration over tool_call\n",
" for call in tool_call.values():\n",
" if call[\"function\"][\"name\"] == \"get_weather\":\n",
" weather_data = weather_api.get_weather(city, days)\n",
"\n",
" if call[\"function\"][\"name\"] == \"get_ticketmaster_events\":\n",
" event_data = event_apis[\"ticketmaster\"].get_events(city, country_code, keywords, start_date)\n",
"\n",
" responses = []\n",
"\n",
" # Ensure weather response is always included\n",
" weather_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_weather\"), None)\n",
" if weather_data and \"forecast\" in weather_data:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"weather\": weather_data[\"forecast\"]},\n",
" \"tool_call_id\": weather_tool_call_id\n",
" })\n",
" elif weather_tool_call_id:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"message\": \"No weather data available for this location.\"},\n",
" \"tool_call_id\": weather_tool_call_id\n",
" })\n",
"\n",
" # Ensure event response is always included\n",
" event_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_ticketmaster_events\"), None)\n",
" if event_data:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"events\": event_data},\n",
" \"tool_call_id\": event_tool_call_id\n",
" })\n",
" elif event_tool_call_id:\n",
" responses.append({\n",
" \"role\": \"assistant\",\n",
" \"content\": {\"message\": \"No events found for this location.\"},\n",
" \"tool_call_id\": event_tool_call_id\n",
" })\n",
"\n",
" # print(\"Final responses:\", responses)\n",
" return responses\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "191a3a9e-95e1-4ca6-8992-4a5bafb9b8ff",
"metadata": {},
"outputs": [],
"source": [
"# GradioInterface class to handle the Gradio UI\n",
"class GradioInterface:\n",
" def __init__(self, activity_assistant):\n",
" self.activity_assistant = activity_assistant\n",
"\n",
" def launch(self):\n",
" # Gradio chat interface\n",
" gr.ChatInterface(fn=self.activity_assistant.chat, type=\"messages\").launch()\n",
"\n",
"# ActivityAssistant setup\n",
"class ActivityAssistant:\n",
" def __init__(self):\n",
" self.weather_api = WeatherAPI() # Interact with the Weather API\n",
" self.event_apis = { # Interact with the Events API\n",
" \"ticketmaster\": TicketmasterAPI()\n",
" }\n",
" self.chat_assistant = ChatAssistant() # This will handle conversation with OpenAI\n",
"\n",
" def chat(self, user_message, history):\n",
" # Forward the user message and conversation history to ChatAssistant\n",
" response_stream = self.chat_assistant.chat(user_message, history, self.weather_api, self.event_apis)\n",
" for chunk in response_stream:\n",
" yield chunk"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b501e8e-2e10-4ab7-b523-1d4b8ad358e8",
"metadata": {},
"outputs": [],
"source": [
"# Main execution\n",
"if __name__ == \"__main__\":\n",
" activity_assistant = ActivityAssistant()\n",
" gradio_interface = GradioInterface(activity_assistant)\n",
" gradio_interface.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,420 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6a08763a-aed6-4f91-94d0-80a3c0e2665b",
"metadata": {},
"source": [
"### Weeks 2 - Day 2 - Gradio Chatbot with LiteLLM (Model Routing)"
]
},
{
"cell_type": "markdown",
"id": "a4f38c58-5ceb-4d5e-b538-c1acdc881f73",
"metadata": {},
"source": [
"**Author** : [Marcus Rosen](https://github.com/MarcusRosen)"
]
},
{
"cell_type": "markdown",
"id": "36f4814a-2bfc-4631-97d7-7a474fa1cc8e",
"metadata": {},
"source": [
"[LiteLLM](https://docs.litellm.ai/docs/) provides the ability to call different LLM providers via a unified interface, returning results in OpenAI compatible formats.\n",
"\n",
"Features:\n",
"- Model Selection in Gradio (Anthropic, OpenAI, Gemini)\n",
"- Single Inference function for all model providers via LiteLLM (call_llm)\n",
"- Streaming **NOTE:** Bug when trying to stream in Gradio, but works directly in Notebook\n",
"- Debug Tracing"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "b6c12598-4773-4f85-93ca-0128d74fbca0",
"metadata": {},
"outputs": [],
"source": [
"from litellm import completion\n",
"import gradio as gr\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"import requests\n",
"import json"
]
},
{
"cell_type": "markdown",
"id": "d24be370-5347-47fb-a58e-21a1b5409ab2",
"metadata": {},
"source": [
"#### Load API Keys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e03afbe9-16aa-434c-a701-b3bfe75e927d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key exists and begins sk-ant-\n",
"Google API Key exists and begins AIzaSyDC\n"
]
}
],
"source": [
"# Load environment variables in a file called .env\n",
"# Print the key prefixes to help with any debugging\n",
"\n",
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GEMINI_API_KEY')\n",
"\n",
"if openai_api_key:\n",
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
"else:\n",
" print(\"OpenAI API Key not set\")\n",
" \n",
"if anthropic_api_key:\n",
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
"else:\n",
" print(\"Anthropic API Key not set\")\n",
"\n",
"if google_api_key:\n",
" print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n",
" # import google.generativeai\n",
" # google.generativeai.configure()\n",
"else:\n",
" print(\"Gemini API Key not set\")"
]
},
{
"cell_type": "markdown",
"id": "66e46447-0e73-49ef-944a-d1e8fae4986e",
"metadata": {},
"source": [
"### Use LiteLLM to abstract out the model provider"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "473c2029-ca74-4f1e-92ac-05f7817ff7df",
"metadata": {},
"outputs": [],
"source": [
"def call_llm(model, system_prompt, user_prompt, json_format_response=False, streaming=False):\n",
"    \"\"\"Call any LiteLLM-supported model with a system/user prompt pair.\n",
"\n",
"    json_format_response asks the provider for a JSON-object response;\n",
"    streaming makes the function return a generator of text deltas instead\n",
"    of the final string.\n",
"    \"\"\"\n",
"    if DEBUG_OUTPUT:\n",
"        print(\"call_llm()\")\n",
"        print(f\"streaming={streaming}\")\n",
"        print(f\"json_format_response={json_format_response}\")\n",
"\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt}\n",
"    ]\n",
"\n",
"    payload = {\n",
"        \"model\": model,\n",
"        \"messages\": messages\n",
"    }\n",
"    # Use JSON response format\n",
"    # Link: https://docs.litellm.ai/docs/completion/json_mode\n",
"    if json_format_response:\n",
"        # FIX: the original wrote `payload[\"response_format\"]: {...}`, which is\n",
"        # an annotation statement (a no-op), so JSON mode was never requested.\n",
"        payload[\"response_format\"] = {\"type\": \"json_object\"}\n",
"\n",
"    if streaming:\n",
"        payload[\"stream\"] = True\n",
"        response = completion(**payload)\n",
"        # Return a generator expression instead of using yield in the function\n",
"        return (part.choices[0].delta.content or \"\" for part in response)\n",
"    else:\n",
"        response = completion(**payload)\n",
"        return response[\"choices\"][0][\"message\"][\"content\"]"
]
},
{
"cell_type": "markdown",
"id": "f45e0972-a6a0-4237-8a69-e6f165f30e0d",
"metadata": {},
"source": [
"### Brochure building functions"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "c76d4ff9-0f18-49d0-a9b5-2c6c0bad359a",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"    \"\"\"\n",
"    A utility class to represent a Website that we have scraped, now with links.\n",
"\n",
"    Attributes: url, body (raw response bytes), title, text (visible text with\n",
"    scripts/styles/images/inputs stripped), links (non-empty href values).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, url):\n",
"        self.url = url\n",
"        # FIX: timeout added so a hung server cannot block the notebook forever\n",
"        response = requests.get(url, headers=headers, timeout=30)\n",
"        self.body = response.content\n",
"        soup = BeautifulSoup(self.body, 'html.parser')\n",
"        self.title = soup.title.string if soup.title else \"No title found\"\n",
"        if soup.body:\n",
"            # Strip tags that carry no useful text for an LLM prompt\n",
"            for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
"                irrelevant.decompose()\n",
"            self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
"        else:\n",
"            self.text = \"\"\n",
"        links = [link.get('href') for link in soup.find_all('a')]\n",
"        self.links = [link for link in links if link]\n",
"\n",
"    def get_contents(self):\n",
"        \"\"\"Return title + visible text formatted for inclusion in a prompt.\"\"\"\n",
"        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "ff41b687-3a46-4bca-a031-1148b91a4fdf",
"metadata": {},
"outputs": [],
"source": [
"def get_links(url, model):\n",
"    \"\"\"Ask the LLM which links on `url` belong in a company brochure.\n",
"\n",
"    Returns the parsed JSON dict: {\"links\": [{\"type\": ..., \"url\": ...}]}.\n",
"    Raises json.JSONDecodeError if the model ignores the raw-JSON instruction.\n",
"    \"\"\"\n",
"    if DEBUG_OUTPUT:\n",
"        print(\"get_links()\")\n",
"    website = Website(url)\n",
"\n",
"    link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
"    You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
"    such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
"    link_system_prompt += \"You should respond in raw JSON exactly as specified in this example. DO NOT USE MARKDOWN.\"\n",
"    # FIX: the careers-page example used a colon instead of a comma\n",
"    # ({\"type\": \"careers page\": \"url\": ...}), i.e. invalid JSON that could\n",
"    # teach the model to emit unparseable output.\n",
"    link_system_prompt += \"\"\"\n",
"    {\n",
"        \"links\": [\n",
"            {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
"            {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
"        ]\n",
"    }\n",
"    \"\"\"\n",
"\n",
"    result = call_llm(model=model,\n",
"                      system_prompt=link_system_prompt,\n",
"                      user_prompt=get_links_user_prompt(website),\n",
"                      json_format_response=True,\n",
"                      streaming=False)\n",
"    if DEBUG_OUTPUT:\n",
"        print(result)\n",
"    return json.loads(result)\n",
"\n",
"def get_links_user_prompt(website):\n",
" if DEBUG_OUTPUT:\n",
" print(\"get_links_user_prompt()\")\n",
" \n",
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
" user_prompt += \"Links (some might be relative links):\\n\"\n",
" user_prompt += \"\\n\".join(website.links)\n",
"\n",
" if DEBUG_OUTPUT:\n",
" print(user_prompt)\n",
" \n",
" return user_prompt\n",
"\n",
"def get_all_details(url, model):\n",
" if DEBUG_OUTPUT:\n",
" print(\"get_all_details()\")\n",
" \n",
" result = \"Landing page:\\n\"\n",
" result += Website(url).get_contents()\n",
" links = get_links(url, model)\n",
" if DEBUG_OUTPUT:\n",
" print(\"Found links:\", links)\n",
" for link in links[\"links\"]:\n",
" result += f\"\\n\\n{link['type']}\\n\"\n",
" result += Website(link[\"url\"]).get_contents()\n",
" return result\n",
"\n",
"def get_brochure_user_prompt(company_name, url, model):\n",
" \n",
" if DEBUG_OUTPUT:\n",
" print(\"get_brochure_user_prompt()\")\n",
" \n",
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
" user_prompt += get_all_details(url, model)\n",
" user_prompt = user_prompt[:5000] # Truncate if more than 5,000 characters\n",
" return user_prompt\n"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "cf7512a1-a498-44e8-a234-9affb72efe60",
"metadata": {},
"outputs": [],
"source": [
"def create_brochure(company_name, url, model, streaming):\n",
"    \"\"\"Build a markdown company brochure from its website.\n",
"\n",
"    Returns a generator of partial text when `streaming` is True, otherwise\n",
"    the complete brochure string.\n",
"    \"\"\"\n",
"    system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
"and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
"Include details of company culture, customers and careers/jobs if you have the information.\"\n",
"    brochure_prompt = get_brochure_user_prompt(company_name, url, model)\n",
"    # call_llm already returns a generator when streaming=True, so the result\n",
"    # can be handed straight back to the caller in both modes\n",
"    return call_llm(\n",
"        model=model,\n",
"        system_prompt=system_prompt,\n",
"        user_prompt=brochure_prompt,\n",
"        streaming=streaming,\n",
"    )"
]
},
{
"cell_type": "markdown",
"id": "ecb6d212-ddb6-4170-81bf-8f3ea54479f8",
"metadata": {},
"source": [
"#### Testing Model before implementing Gradio"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "de89843a-08ac-4431-8c83-21a93c05f764",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rio Tinto: Providing the Materials for a Sustainable Future\n",
"\n",
"## About Rio Tinto\n",
"\n",
"Rio Tinto is a global mining and metals company, operating in 35 countries with over 60,000 employees. Their purpose is to find better ways to provide the materials the world needs. Continuous improvement and innovation are at the core of their DNA, as they work to responsibly supply the metals and minerals critical for urbanization and the transition to a low-carbon economy.\n",
"\n",
"## Our Products\n",
"\n",
"Rio Tinto's diverse portfolio includes:\n",
"\n",
"- Iron Ore: The primary raw material used to make steel, which is strong, long-lasting and cost-efficient.\n",
"- Aluminium: A lightweight, durable and recyclable metal.\n",
"- Copper: A tough, malleable, corrosion-resistant and recyclable metal that is an excellent conductor of heat and electricity.\n",
"- Lithium: The lightest of all metals, a key element for low-carbon technologies.\n",
"- Diamonds: Ethically-sourced, high-quality diamonds.\n",
"\n",
"## Sustainability and Innovation\n",
"\n",
"Sustainability is at the heart of Rio Tinto's operations. They are targeting net zero emissions by 2050 and investing in nature-based solutions to complement their decarbonization efforts. Innovation is a key focus, with research and development into new technologies to improve efficiency and reduce environmental impact.\n",
"\n",
"## Careers and Culture\n",
"\n",
"Rio Tinto values its 60,000 employees and is committed to fostering a diverse and inclusive workplace. They offer a wide range of career opportunities, from mining and processing to engineering, finance, and more. Rio Tinto's culture is centered on safety, collaboration, and continuous improvement, with a strong emphasis on sustainability and responsible business practices.\n",
"\n",
"## Conclusion\n",
"\n",
"Rio Tinto is a global leader in the mining and metals industry, providing the materials essential for a sustainable future. Through their commitment to innovation, sustainability, and their talented workforce, Rio Tinto is well-positioned to meet the world's growing demand for critical resources.\n",
"\u001b[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\u001b[0m\n",
"LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.\n",
"\n",
"<generator object call_llm.<locals>.<genexpr> at 0x7f80ca5da0c0>\n"
]
}
],
"source": [
"MODEL=\"claude-3-haiku-20240307\"\n",
"DEBUG_OUTPUT=False\n",
"streaming=True\n",
"result = create_brochure(company_name=\"Rio Tinto\", url=\"http://www.riotinto.com\", model=MODEL, streaming=streaming)\n",
"\n",
"if streaming:\n",
" for chunk in result:\n",
" print(chunk, end=\"\", flush=True)\n",
"else:\n",
" print(result)\n"
]
},
{
"cell_type": "markdown",
"id": "1f330c92-6280-4dae-b4d8-717a56edb236",
"metadata": {},
"source": [
"#### Gradio Setup\n",
"Associate Dropdown values with the model we want to use.\n",
"Link: https://www.gradio.app/docs/gradio/dropdown#initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2f38862-3728-4bba-9e16-6f9fab276145",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"DEBUG_OUTPUT=True\n",
"view = gr.Interface(\n",
" fn=create_brochure,\n",
" inputs=[\n",
" gr.Textbox(label=\"Company name:\"),\n",
" gr.Textbox(label=\"Landing page URL including http:// or https://\"),\n",
" gr.Dropdown(choices=[(\"GPT 4o Mini\", \"gpt-4o-mini\"), \n",
" (\"Claude Haiku 3\", \"claude-3-haiku-20240307\"), \n",
" (\"Gemini 2.0 Flash\", \"gemini/gemini-2.0-flash\")], \n",
" label=\"Select model\"),\n",
" gr.Checkbox(label=\"Stream\")\n",
" ],\n",
" outputs=[gr.Markdown(label=\"Brochure:\")],\n",
" flagging_mode=\"never\"\n",
")\n",
"view.launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0981136-2067-43b8-b17d-83560dd609ce",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,569 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "BSbc4VbLi2Ek"
},
"source": [
"# Synthetic Dataset generator\n",
"- 🚀 Live Demo: https://huggingface.co/spaces/lisekarimi/datagen\n",
"- 🧑‍💻 Repo: https://github.com/lisekarimi/datagen\n",
"\n",
"---\n",
"\n",
"- 🌍 **Task**: Generate realistic synthetic datasets\n",
"- 🎯 **Supported Data Types**: Tabular, Text, Time-series\n",
"- 🧠 **Models**: GPT (OpenAI) , Claude (Anthropic), CodeQwen1.5-7B-Chat (via Hugging Face Inference) / Llama (in Google Colab through T4 GPU)\n",
"- 🚀 **Tools**: Python, Gradio UI, OpenAI / Anthropic / HuggingFace APIs\n",
"- 📤 **Output Formats**: JSON and CSV file\n",
"- 🧑‍💻 **Skill Level**: Intermediate\n",
"\n",
"🎯 **How It Works**\n",
"\n",
"1⃣ Define your business problem or dataset topic.\n",
"\n",
"2⃣ Choose the dataset type, output format, model, and number of samples.\n",
"\n",
"3⃣ The LLM generates the code; you can adjust or modify it as needed.\n",
"\n",
"4⃣ Execute the code to generate your output file.\n",
"\n",
"🛠️ **Requirements** \n",
"- ⚙️ **Hardware**: ✅ GPU required (model download); Google Colab recommended (T4)\n",
"- 🔑 OpenAI API Key (for GPT) \n",
"- 🔑 Anthropic API Key (for Claude) \n",
"- 🔑 Hugging Face Token \n",
"\n",
"**Deploy CodeQwen Endpoint:**\n",
"- Visit https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat\n",
"- Click **Deploy** → **Inference Endpoints** → **Create Endpoint** (requires credit card)\n",
"- Copy your endpoint URL: `https://[id].us-east-1.aws.endpoints.huggingface.cloud`\n",
"\n",
"⚙️ **Customizable by user** \n",
"- 🤖 Selected model: GPT / Claude / Llama / Code Qwen\n",
"- 📜 `system_prompt`: Controls model behavior (concise, accurate, structured) \n",
"- 💬 `user_prompt`: Dynamic — include other fields\n",
"\n",
"---\n",
"📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9E-Ioggxi2Em"
},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pR-ftUatjEGd",
"outputId": "ae5668c5-c369-4066-bbbf-b560fb28e39a"
},
"outputs": [],
"source": [
"# Install required packages in Google Colab\n",
"%pip install -q python-dotenv gradio anthropic openai requests torch bitsandbytes transformers sentencepiece accelerate"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VPmk2-Ggi2Em"
},
"outputs": [],
"source": [
"import re\n",
"import sys\n",
"import subprocess\n",
"import threading\n",
"import anthropic\n",
"import torch\n",
"import gradio as gr\n",
"from openai import OpenAI\n",
"from huggingface_hub import InferenceClient, login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DUQ55_oji2En"
},
"source": [
"## Initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MiicxGawi2En"
},
"outputs": [],
"source": [
"# Google Colab User Data\n",
"# Ensure you have set the following in your Google Colab environment:\n",
"openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n",
"anthropic_api_key = userdata.get(\"ANTHROPIC_API_KEY\")\n",
"hf_token = userdata.get('HF_TOKEN')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OPENAI_MODEL = \"gpt-4o-mini\"\n",
"CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"\n",
"code_qwen = \"Qwen/CodeQwen1.5-7B-Chat\"\n",
"CODE_QWEN_URL = \"https://zfkokxzs1xrqv13v.us-east-1.aws.endpoints.huggingface.cloud\"\n",
"\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai = OpenAI(api_key=openai_api_key)\n",
"claude = anthropic.Anthropic(api_key=anthropic_api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ipA1F440i2En"
},
"source": [
"## Prompts definition"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JgtqCyRji2En"
},
"outputs": [],
"source": [
"system_message = \"\"\"\n",
"You are a helpful assistant whose main purpose is to generate datasets for business problems.\n",
"\n",
"Be less verbose.\n",
"Be accurate and concise.\n",
"\n",
"The user will describe a business problem. Based on this, you must generate a synthetic dataset that fits the context.\n",
"\n",
"The dataset should be saved in a specific format such as CSV, JSON — the desired format will be specified by the user.\n",
"\n",
"The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries.\n",
"\n",
"When saving a DataFrame to JSON using `to_json()`, do not use the `encoding` parameter. Instead, manually open the file with `open()` and specify the encoding. Then pass the file object to `to_json()`.\n",
"\n",
"Ensure Python code blocks are correctly indented, especially inside `with`, `for`, `if`, `try`, and `def` blocks.\n",
"\n",
"Return only the Python code that generates and saves the dataset.\n",
"After saving the file, print the code that was executed and a message confirming the dataset was generated successfully.\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Bk6saP4oi2Eo"
},
"outputs": [],
"source": [
"def user_prompt(**input_data):\n",
" user_prompt = f\"\"\"\n",
" Generate a synthetic {input_data[\"dataset_type\"].lower()} dataset in {input_data[\"output_format\"].upper()} format.\n",
" Business problem: {input_data[\"business_problem\"]}\n",
" Samples: {input_data[\"num_samples\"]}\n",
" \"\"\"\n",
" return user_prompt\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XnrPiAZ7i2Eo"
},
"source": [
"## Call API for Closed Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Sx7hHKczi2Eo"
},
"outputs": [],
"source": [
"def stream_gpt(user_prompt):\n",
" stream = openai.chat.completions.create(\n",
" model=OPENAI_MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ],\n",
" stream=True,\n",
" )\n",
"\n",
" response = \"\"\n",
" for chunk in stream:\n",
" response += chunk.choices[0].delta.content or \"\"\n",
" yield response\n",
"\n",
" return response\n",
"\n",
"\n",
"def stream_claude(user_prompt):\n",
" result = claude.messages.stream(\n",
" model=CLAUDE_MODEL,\n",
" max_tokens=2000,\n",
" system=system_message,\n",
" messages=[\n",
" {\"role\": \"user\",\"content\": user_prompt}\n",
" ]\n",
" )\n",
" reply = \"\"\n",
" with result as stream:\n",
" for text in stream.text_stream:\n",
" reply += text\n",
" yield reply\n",
" print(text, end=\"\", flush=True)\n",
" return reply\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PUPeZ4xPi2Eo"
},
"source": [
"## Call Open Source Models\n",
"- Llama is downloaded and run on T4 GPU (Google Colab).\n",
"- Code Qwen is run through inference endpoint"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "W0AuZT2uk0Sd"
},
"outputs": [],
"source": [
"def stream_llama(user_prompt):\n",
" try:\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ]\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
" )\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)\n",
"\n",
" thread = threading.Thread(target=model.generate, kwargs={\n",
" \"input_ids\": inputs,\n",
" \"max_new_tokens\": 1000,\n",
" \"pad_token_id\": tokenizer.eos_token_id,\n",
" \"streamer\": streamer\n",
" })\n",
" thread.start()\n",
"\n",
" started = False\n",
" reply = \"\"\n",
"\n",
" for new_text in streamer:\n",
" if not started:\n",
" if \"<|start_header_id|>assistant<|end_header_id|>\" in new_text:\n",
" started = True\n",
" new_text = new_text.split(\"<|start_header_id|>assistant<|end_header_id|>\")[-1].strip()\n",
" else:\n",
" continue\n",
"\n",
" if \"<|eot_id|>\" in new_text:\n",
" new_text = new_text.replace(\"<|eot_id|>\", \"\")\n",
" if new_text.strip():\n",
" reply += new_text\n",
" yield reply\n",
" break\n",
"\n",
" if new_text.strip():\n",
" reply += new_text\n",
" yield reply\n",
"\n",
" return reply\n",
"\n",
" except Exception as e:\n",
" print(f\"LLaMA error: {e}\")\n",
" raise\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V0JS_6THi2Eo"
},
"outputs": [],
"source": [
"def stream_code_qwen(user_prompt):\n",
" tokenizer = AutoTokenizer.from_pretrained(code_qwen)\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\",\"content\": user_prompt},\n",
" ]\n",
" text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" client = InferenceClient(CODE_QWEN_URL, token=hf_token)\n",
" stream = client.text_generation(text, stream=True, details=True, max_new_tokens=3000)\n",
" result = \"\"\n",
" for r in stream:\n",
" result += r.token.text\n",
" yield result"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PqG57dJIi2Eo"
},
"source": [
"## Select the model and generate the output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YqSKnklRi2Eo"
},
"outputs": [],
"source": [
"def generate_from_inputs(model, **input_data):\n",
"    \"\"\"Route the request to the streaming generator for the chosen model.\"\"\"\n",
"    user_prompt_str = user_prompt(**input_data)\n",
"\n",
"    streamers = {\n",
"        \"GPT\": stream_gpt,\n",
"        \"Claude\": stream_claude,\n",
"        \"Llama\": stream_llama,\n",
"        \"Code Qwen\": stream_code_qwen,\n",
"    }\n",
"    if model not in streamers:\n",
"        raise ValueError(\"Unknown model\")\n",
"\n",
"    yield from streamers[model](user_prompt_str)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zG6_TSfni2Eo"
},
"outputs": [],
"source": [
"def handle_generate(business_problem, dataset_type, dataset_format, num_samples, model):\n",
"    \"\"\"Gradio callback: bundle the form fields and stream the model reply.\"\"\"\n",
"    request = {\n",
"        \"business_problem\": business_problem,\n",
"        \"dataset_type\": dataset_type,\n",
"        \"output_format\": dataset_format,\n",
"        \"num_samples\": num_samples,\n",
"    }\n",
"\n",
"    yield from generate_from_inputs(model, **request)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p5DQcx71i2Ep"
},
"source": [
"## Extract python code from the LLM output and execute it locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NcEkmsnai2Ep",
"jp-MarkdownHeadingCollapsed": true
},
"outputs": [],
"source": [
"def extract_code(text):\n",
" match = re.search(r\"```python(.*?)```\", text, re.DOTALL)\n",
"\n",
" if match:\n",
" code = match.group(0).strip()\n",
" else:\n",
" code = \"\"\n",
" print(\"No matching substring found.\")\n",
"\n",
" return code.replace(\"```python\\n\", \"\").replace(\"```\", \"\")\n",
"\n",
"\n",
"def execute_code_in_virtualenv(text, python_interpreter=sys.executable):\n",
"    \"\"\"Extract the ```python block from `text` and run it in a subprocess.\n",
"\n",
"    Returns the child's stdout on success, or an error report including the\n",
"    child's stderr (which the original discarded) when execution fails.\n",
"    \"\"\"\n",
"    if not python_interpreter:\n",
"        raise EnvironmentError(\"Python interpreter not found in the specified virtual environment.\")\n",
"\n",
"    code_str = extract_code(text)\n",
"    command = [python_interpreter, '-c', code_str]\n",
"\n",
"    try:\n",
"        result = subprocess.run(command, check=True, capture_output=True, text=True)\n",
"        return result.stdout\n",
"\n",
"    except subprocess.CalledProcessError as e:\n",
"        # FIX: surface the script's stderr so users can see the actual traceback\n",
"        return f\"Execution error:\\n{e}\\n{e.stderr}\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DQgEyFzJi2Ep"
},
"source": [
"## Gradio interface"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SEiZVkdFi2Ep"
},
"outputs": [],
"source": [
"def update_output_format(dataset_type):\n",
"    \"\"\"Restrict the output-format dropdown: Text datasets only support JSON.\"\"\"\n",
"    if dataset_type == \"Text\":\n",
"        return gr.update(choices=[\"JSON\"], value=\"JSON\")\n",
"    if dataset_type in [\"Tabular\", \"Time-series\"]:\n",
"        return gr.update(choices=[\"JSON\", \"csv\"], value=\"JSON\")\n",
"\n",
"with gr.Blocks() as ui:\n",
" gr.Markdown(\"## Create a dataset for a business problem\")\n",
"\n",
" with gr.Column():\n",
" business_problem = gr.Textbox(label=\"Business problem\", lines=2)\n",
" dataset_type = gr.Dropdown(\n",
" [\"Tabular\", \"Time-series\", \"Text\"], label=\"Dataset type\"\n",
" )\n",
"\n",
" output_format = gr.Dropdown( choices=[\"JSON\", \"csv\"], value=\"JSON\",label=\"Output Format\")\n",
"\n",
" num_samples = gr.Number(label=\"Number of samples\", value=10, precision=0)\n",
"\n",
" model = gr.Dropdown([\"GPT\", \"Claude\", \"Llama\", \"Code Qwen\"], label=\"Select model\", value=\"GPT\")\n",
"\n",
" dataset_type.change(update_output_format,inputs=[dataset_type], outputs=[output_format])\n",
"\n",
" with gr.Row():\n",
" with gr.Column():\n",
" dataset_run = gr.Button(\"Create a dataset\")\n",
" gr.Markdown(\"\"\"⚠️ For Llama and Code Qwen: The generated code might not be optimal. It's recommended to review it before execution.\n",
" Some mistakes may occur.\"\"\")\n",
"\n",
" with gr.Column():\n",
" code_run = gr.Button(\"Execute code for a dataset\")\n",
" gr.Markdown(\"\"\"⚠️ Be cautious when sharing this app with code execution publicly, as it could pose safety risks.\n",
" The execution of user-generated code may lead to potential vulnerabilities, and its important to use this tool responsibly.\"\"\")\n",
"\n",
" with gr.Row():\n",
" dataset_out = gr.Textbox(label=\"Generated Dataset\")\n",
" code_out = gr.Textbox(label=\"Executed code\")\n",
"\n",
" dataset_run.click(\n",
" handle_generate,\n",
" inputs=[business_problem, dataset_type, output_format, num_samples, model],\n",
" outputs=[dataset_out]\n",
" )\n",
"\n",
" code_run.click(\n",
" execute_code_in_virtualenv,\n",
" inputs=[dataset_out],\n",
" outputs=[code_out]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 646
},
"id": "jCAkTEtMi2Ep",
"outputId": "deeeb1a7-c432-4007-eba2-cbcc28dbc0ff"
},
"outputs": [],
"source": [
"ui.launch(inbrowser=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
"""new_training_with_RAG.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1gi8FPI1dtnxBNTf86JdmXQ0BYqnKz7LS
# Predict Product Prices
"""
!nvidia-smi
!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb
import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import (
AutoModelForCausalLM, AutoTokenizer, TrainingArguments,
set_seed, BitsAndBytesConfig, GenerationConfig)
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt
#LangChain & RAG Imports
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.vectorstores import Chroma
import chromadb
from langchain.embeddings import HuggingFaceEmbeddings
# Commented out IPython magic to ensure Python compatibility.
# Constants
# Base model to fine-tune; the Mistral alternative is kept for experimentation.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
#BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.1'
PROJECT_NAME = "pricer-optim"
HF_USER = "Adriana213"
# Data
DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182
# Timestamped run name: used for the output dir, W&B-style run name, and Hub repo id.
RUN_NAME = f"{PROJECT_NAME}-{datetime.now():%Y%m%d_%H%M%S}"
HUB_MODEL_NAME = f"{HF_USER}/{RUN_NAME}"
# Hyperparameters for QLoRA
LORA_R = 8
LORA_ALPHA = 32
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.10
QUANT_4_BIT = True
# Hyperparameters for Training
EPOCHS = 2
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.05
OPTIMIZER = "paged_adamw_32bit"
STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200 # kept for potential future use
# %matplotlib inline
HUB_MODEL_NAME
"""### Log in to HuggingFace & get Data"""
# Colab secret; the token needs write access to push checkpoints to the Hub.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)
torch.cuda.empty_cache()
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
"""## Now load the Tokenizer and Model
The model is "quantized" - we are reducing the precision to 4 bits.
"""
# Pick the right quantization
# 4-bit NF4 with double quantization (QLoRA setup); otherwise fall back to 8-bit.
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )
# Load the Tokenizer and the Model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Llama has no pad token; reuse EOS and right-pad so completion-only masking works.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")
"""# Data Collator
"""
from trl import DataCollatorForCompletionOnlyLM
# Mask the loss on everything before "Price is $" so training targets only the price.
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template,
                                           tokenizer=tokenizer)
"""# Set up the configuration for Training"""
# LoRA Config
lora_parameters = LoraConfig(
lora_alpha = LORA_ALPHA,
lora_dropout = LORA_DROPOUT,
r = LORA_R,
bias = "none",
task_type = "CAUSAL_LM",
target_modules = TARGET_MODULES,
)
# Training Config
train_parameters = SFTConfig(
output_dir = RUN_NAME,
num_train_epochs = EPOCHS,
per_device_train_batch_size = BATCH_SIZE,
per_device_eval_batch_size = 4,
eval_strategy = "no",
eval_steps = EVAL_STEPS,
gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
optim = OPTIMIZER,
save_steps = SAVE_STEPS,
save_total_limit = 5,
logging_steps = 50,
learning_rate = LEARNING_RATE,
weight_decay = 0.01,
fp16=False,
bf16=True,
max_grad_norm=0.3,
max_steps=-1,
warmup_ratio = WARMUP_RATIO,
group_by_length=True,
lr_scheduler_type = LR_SCHEDULER_TYPE,
run_name = RUN_NAME,
max_seq_length = MAX_SEQUENCE_LENGTH,
dataset_text_field = "text",
save_strategy = "steps",
hub_strategy = "every_save",
push_to_hub = True,
hub_model_id = HUB_MODEL_NAME,
hub_private_repo = True,
report_to = 'none',
)
fine_tuning = SFTTrainer(
model = base_model,
train_dataset = train,
eval_dataset=test,
peft_config = lora_parameters,
args = train_parameters,
data_collator = collator,
)
"""## Fine Tuning"""
fine_tuning.train()
fine_tuning.model.push_to_hub(RUN_NAME, private=True)
print(f"Saved to the hub: {RUN_NAME}")
"""# Implement RAG"""
HF_USER = "Adriana213"
RUN_NAME = "pricer-optim-20250514_061529"
fine_tuned_model = PeftModel.from_pretrained(base_model, f"{HF_USER}/{RUN_NAME}")
print(f"✅ Loaded fine-tuned adapter: {HF_USER}/{RUN_NAME}")
base_model = fine_tuned_model
"""## Build Chroma index"""
docs = [
Document(page_content=text, metadata = {'price': price})
for text, price in zip(train['text'], train['price'])
]
# Create embeddings & persist Chroma index
embedding = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')
chroma = Chroma.from_documents(
documents = docs,
embedding = embedding,
persist_directory = 'chroma_train_index'
)
chroma.persist()
print('Chroma index built and persisted.')
"""## RAG Prediction Function"""
# Greedy decoding config for the price completion.
# Fix: the keyword is `max_new_tokens` (plural) — the original `max_new_token`
# was silently accepted-and-ignored by GenerationConfig, so the cap never applied.
# `temperature` is dropped because it has no effect when do_sample=False (greedy).
generation_config = GenerationConfig(
    max_new_tokens = 10,
    do_sample = False
)
def predict_price_rag(desc: str, k: int = 3) -> float:
    """Few-shot RAG price prediction.

    Retrieves the k most similar training descriptions from the Chroma index,
    builds a few-shot prompt that ends in "Price is $", and greedily decodes
    the continuation with the fine-tuned model.

    Args:
        desc: Product description to price.
        k: Number of retrieved examples to include as shots.

    Returns:
        The first number parsed from the generated completion.

    Raises:
        ValueError: If the model output contains no parseable number
            (the original code raised a bare IndexError here).
    """
    hits = chroma.similarity_search(desc, k = k)
    shot_strs = [
        f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}'
        for doc in hits
    ]
    prompt = "\n\n".join(shot_strs) + f"\n\nDescription: {desc}\nPrice is $"
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    out = base_model.generate(**inputs, generation_config=generation_config)
    # Decode only the newly generated tokens (slice off the prompt).
    text = tokenizer.decode(
        out[0, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    ).strip()
    # Fix: the original pattern r"\d+\.?\d+" requires at least two digits, so a
    # single-digit price like "5" could never match. Accept integers and decimals.
    matches = re.findall(r"\d+(?:\.\d+)?", text)
    if not matches:
        raise ValueError(f"No numeric price found in model output: {text!r}")
    return float(matches[0])
!zip -r chroma_index.zip chroma_train_index
from google.colab import files
files.download("chroma_index.zip")

View File

@@ -0,0 +1,258 @@
# -*- coding: utf-8 -*-
"""Testing Fine-tuned model with RAG
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1J8P8cwqwhBo3CNIZaEFe6BMRw0WUfEqy
## Predict Product Prices
### And now, to evaluate our fine-tuned open source model
"""
!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb
import os
import re
import math
from google.colab import userdata
from huggingface_hub import login
import torch
import torch.nn.functional as F
from transformers import (
AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, GenerationConfig)
from datasets import load_dataset
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import matplotlib.pyplot as plt
# Commented out IPython magic to ensure Python compatibility.
# Constants
# NOTE(review): the training script used "meta-llama/Meta-Llama-3.1-8B" — confirm
# this repo id resolves to the same base the adapter was fine-tuned on.
BASE_MODEL = "meta-llama/Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "Adriana213"
RUN_NAME = "optim-20250514_061529"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"
# Data
DATASET_NAME = f"{HF_USER}/pricer-data"
# Hyperparameters for QLoRA
QUANT_4_BIT = True
# %matplotlib inline
# Used for writing to output in color (ANSI terminal escape codes)
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}
"""### Log in to HuggingFace
"""
# Colab secret; read access to the dataset and fine-tuned adapter is required.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']
test[0]
"""## Now load the Tokenizer and Model"""
# 4-bit NF4 quantization (matches the QLoRA training setup) or 8-bit fallback.
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )
# Load the Tokenizer and the Model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Llama has no pad token; reuse EOS and right-pad.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
# Load the fine-tuned model with PEFT (adapter weights pulled from the Hub)
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)
print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")
fine_tuned_model
"""# Evaluation"""
def extract_price(s: str) -> float:
    """Parse the first price that follows "Price is $" in a model completion.

    Thousands separators are stripped (e.g. "1,299" -> 1299.0). Returns 0 when
    the marker is absent or no number follows it.
    """
    if "Price is $" in s:
        contents = s.split("Price is $")[1]
        contents = contents.replace(',','')
        # Fix: group the alternation so the optional sign applies to plain
        # integers too — the original r"[-+]?\d*\.\d+|\d+" attached the sign
        # only to the decimal branch, so "-899" parsed as 899.
        match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", contents)
        return float(match.group()) if match else 0
    return 0
extract_price("Price is $a fabulous 899.99 or so")
# Original prediction function takes the most likely next token
def model_predict(prompt):
    """Generate up to 3 tokens after `prompt` and parse the price from them.

    Fix: derive the device from the model instead of hardcoding "cuda", so the
    function also works if the model was placed on a different device by
    device_map="auto".
    """
    device = fine_tuned_model.device
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(inputs.shape, device=device)
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
    # Decode the whole sequence; extract_price splits on "Price is $" so the
    # prompt portion is harmless.
    response = tokenizer.decode(outputs[0])
    return extract_price(response)
# top_K = 3
# def improved_model_predict(prompt, device="cuda"):
# set_seed(42)
# inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
# attention_mask = torch.ones(inputs.shape, device=device)
# with torch.no_grad():
# outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
# next_token_logits = outputs.logits[:, -1, :].to('cpu')
# next_token_probs = F.softmax(next_token_logits, dim=-1)
# top_prob, top_token_id = next_token_probs.topk(top_K)
# prices, weights = [], []
# for i in range(top_K):
# predicted_token = tokenizer.decode(top_token_id[0][i])
# probability = top_prob[0][i]
# try:
# result = float(predicted_token)
# except ValueError as e:
# result = 0.0
# if result > 0:
# prices.append(result)
# weights.append(probability)
# if not prices:
# return 0.0, 0.0
# total = sum(weights)
# weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]
# return sum(weighted_prices).item()
# The embedding model must match the one used to build the persisted index.
embedder = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
# Re-open the Chroma index persisted by the training notebook.
chroma = Chroma(
    persist_directory = "chroma_train_index",
    embedding_function = embedder
)
# Greedy decoding; 10 new tokens is plenty for a price.
gen_config = GenerationConfig(max_new_tokens=10, do_sample=False)
def predict_price_rag(desc: str, k: int = 3) -> float:
    """Few-shot RAG price prediction with the fine-tuned model.

    Retrieves the k nearest training descriptions, assembles a few-shot prompt
    ending in "Price is $", and parses the first number the model generates.

    Raises:
        ValueError: If no number can be parsed from the completion
            (the original code raised a bare IndexError here).
    """
    docs = chroma.similarity_search(desc, k=k)
    shots = "\n\n".join(f"Description: {d.page_content}\nPrice is ${d.metadata['price']}"
                        for d in docs)
    prompt = f"{shots}\n\nDescription: {desc}\nPrice is $"
    inp = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)
    out = fine_tuned_model.generate(**inp, generation_config=gen_config)
    txt = tokenizer.decode(out[0, inp["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
    # Fix: r"\d+\.?\d+" requires at least two digits and so can never match a
    # single-digit price like "5". Accept integers and decimals.
    matches = re.findall(r"\d+(?:\.\d+)?", txt)
    if not matches:
        raise ValueError(f"No numeric price found in model output: {txt!r}")
    return float(matches[0])
class Tester:
    """Run a price predictor over the first `size` items of a dataset, print a
    color-coded line per item, and chart model guesses against ground truth.

    Green/orange/red reflect absolute and relative error thresholds; the final
    report shows mean absolute error, RMSLE, and the green-hit rate.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        # Default title is the predictor's name, e.g. "predict_price_rag" -> "Predict Price Rag".
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = size
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Bucket a prediction by absolute OR relative error."""
        if error<40 or error/truth < 0.2:
            return "green"
        elif error<80 or error/truth < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Predict item i, record error metrics, and print a color-coded summary."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        # Squared log error; the +1 guards log(0) for zero prices/guesses.
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # NOTE(review): assumes the item text contains at least one "\n\n"
        # separator before the title — confirm against the dataset format.
        title = datapoint["text"].split("\n\n")[1][:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter guesses vs truths with a y=x reference line."""
        # Fix: removed an unused `max_error = max(self.errors)` computation.
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Summarize mean error, RMSLE, and hit rate, then draw the chart."""
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first `size` datapoints and report."""
        # Fix: removed dead `self.error = 0` (never read anywhere).
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience one-shot: construct a Tester and run it."""
        cls(function, data).run()
Tester.test(predict_price_rag, test)

View File

@@ -44,7 +44,6 @@
"from sentence_transformers import SentenceTransformer\n", "from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n", "from datasets import load_dataset\n",
"import chromadb\n", "import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n", "from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go" "import plotly.graph_objects as go"
] ]
@@ -77,6 +76,18 @@
"login(hf_token, add_to_git_credential=True)" "login(hf_token, add_to_git_credential=True)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "8491f550-df4a-4c8f-a260-a7a419e8efb6",
"metadata": {},
"outputs": [],
"source": [
"# Another import after Logging in to Hugging Face - thank you Trung N.!\n",
"\n",
"from items import Item"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "3d4995a4-f67f-4871-87df-8c6439b06366", "id": "3d4995a4-f67f-4871-87df-8c6439b06366",

View File

@@ -44,7 +44,6 @@
"from sentence_transformers import SentenceTransformer\n", "from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n", "from datasets import load_dataset\n",
"import chromadb\n", "import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n", "from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go" "import plotly.graph_objects as go"
] ]
@@ -174,7 +173,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.11" "version": "3.11.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -44,7 +44,6 @@
"from sentence_transformers import SentenceTransformer\n", "from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n", "from datasets import load_dataset\n",
"import chromadb\n", "import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n", "from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go" "import plotly.graph_objects as go"
] ]
@@ -166,7 +165,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.11" "version": "3.11.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -48,7 +48,6 @@
"from sentence_transformers import SentenceTransformer\n", "from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n", "from datasets import load_dataset\n",
"import chromadb\n", "import chromadb\n",
"from items import Item\n",
"from testing import Tester" "from testing import Tester"
] ]
}, },
@@ -66,6 +65,31 @@
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')" "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "ce73b034-9ec1-4533-ba41-3e57c7878b61",
"metadata": {},
"outputs": [],
"source": [
"# Log in to HuggingFace\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c01daad-86b0-4bc0-91ba-20a64df043ed",
"metadata": {},
"outputs": [],
"source": [
"# Another import after Logging in to Hugging Face - thank you Trung N.!\n",
"\n",
"from items import Item"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -495,7 +519,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.11" "version": "3.11.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -84,6 +84,31 @@
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')" "os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "1006966f-96b7-4e1a-93f0-2bb9a09057c8",
"metadata": {},
"outputs": [],
"source": [
"# Log in to HuggingFace\n",
"\n",
"hf_token = os.environ['HF_TOKEN']\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de0e4b22-ee61-4b79-95bc-3cd707d5f83d",
"metadata": {},
"outputs": [],
"source": [
"# Another import after Logging in to Hugging Face - thank you Trung N.!\n",
"\n",
"from items import Item"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

View File

@@ -78,7 +78,7 @@
" </td>\n", " </td>\n",
" <td>\n", " <td>\n",
" <h2 style=\"color:#f71;\">Additional resource: more sophisticated planning agent</h2>\n", " <h2 style=\"color:#f71;\">Additional resource: more sophisticated planning agent</h2>\n",
" <span style=\"color:#f71;\">The Planning Agent that we use in the next cell is simply a python script that calls the other Agents; frankly that's all we require for this project. But if you're intrigued to see a more Autonomous version in which we give the Planning Agent tools and allow it to decide which Agents to call, see my implementation of <a href=\"https://github.com/ed-donner/agentic/blob/main/workshop/agents/autonomous_planning_agent.py\">AutonomousPlanningAgent</a> in my related repo, <a href=\"https://github.com/ed-donner/agentic\">Agentic</a>. This is an example with multiple tools that dynamically decides which function to call.\n", " <span style=\"color:#f71;\">The Planning Agent that we use in the next cell is simply a python script that calls the other Agents; frankly that's all we require for this project. But if you're intrigued to see a more Autonomous version in which we give the Planning Agent tools and allow it to decide which Agents to call, see my implementation of <a href=\"https://github.com/ed-donner/agentic/blob/main/workshop/price_agents/autonomous_planning_agent.py\">AutonomousPlanningAgent</a> in my related repo, <a href=\"https://github.com/ed-donner/agentic\">Agentic</a>. This is an example with multiple tools that dynamically decides which function to call.\n",
" </span>\n", " </span>\n",
" </td>\n", " </td>\n",
" </tr>\n", " </tr>\n",
@@ -144,7 +144,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.11" "version": "3.11.12"
} }
}, },
"nbformat": 4, "nbformat": 4,