Files
LLM_Engineering_OLD/week1/community-contributions/week1-EXERCISE_rewrite-internal-tools-code-and-UI-image_jeannine-jordan.ipynb
2025-05-30 06:15:59 -04:00

350 lines
15 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5",
"metadata": {},
"source": [
"# End of week 1 exercise\n",
"\n",
"To demonstrate your familiarity with the OpenAI API, and also Ollama, build a tool that takes a technical question, \n",
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1070317-3ed9-4659-abe3-828943230e03",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import concurrent.futures\n",
"import json\n",
"import os\n",
"import random\n",
"import re\n",
"import threading\n",
"import time\n",
"from typing import List\n",
"from urllib.parse import urlparse, urljoin\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display, update_display, Image\n",
"from openai import OpenAI\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from webdriver_manager.chrome import ChromeDriverManager"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a456906-915a-4bfd-bb9d-57e505c5093f",
"metadata": {},
"outputs": [],
"source": [
"# constants\n",
"\n",
"# Load variables such as OPENAI_API_KEY from a local .env file (if one exists)\n",
"# before the client is created, since the client is built without an explicit key.\n",
"load_dotenv()\n",
"\n",
"MODEL = 'gpt-4o-mini'\n",
"openai = OpenAI()\n",
"MODEL_GPT = 'gpt-4o-mini'\n",
"MODEL_LLAMA = 'llama3.2'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8d7923c-5f28-4c30-8556-342d7c8497c1",
"metadata": {},
"outputs": [],
"source": [
"# set up environment\n",
"\n",
"# A modified class to fetch and parse fully rendered pages\n",
"class NewWebsite:\n",
"    \"\"\"Fetch a fully rendered page with headless Chrome and extract its text, title and links.\n",
"\n",
"    Instances share one lazily created browser unless a driver is passed in.\n",
"    Because that one browser may be used from several threads, every use of a\n",
"    driver is serialized through ``_driver_lock`` so that the HTML read back\n",
"    always belongs to the URL this instance asked for.\n",
"\n",
"    Attributes set by ``__init__``:\n",
"        url: the address that was requested.\n",
"        driver: the WebDriver used for the request.\n",
"        text: visible body text, or \"Error loading content\" on failure.\n",
"        title: page title, \"No title found\", or \"Error\" on failure.\n",
"        links: absolute URLs built from the page's anchors (empty on failure).\n",
"    \"\"\"\n",
"\n",
"    shared_driver = None  # Class variable to share browser instance\n",
"    _driver_lock = threading.Lock()  # Guards creation, use and shutdown of the browser\n",
"\n",
"    def __init__(self, url, driver=None):\n",
"        \"\"\"Load ``url`` immediately and populate ``text``, ``title`` and ``links``.\"\"\"\n",
"        self.url = url\n",
"        self.driver = driver or NewWebsite._get_shared_driver()\n",
"        self.text, self.title, self.links = self._scrape_content()\n",
"\n",
"    @classmethod\n",
"    def _get_shared_driver(cls):\n",
"        \"\"\"Return the shared headless Chrome driver, starting it on first use.\"\"\"\n",
"        with cls._driver_lock:\n",
"            if cls.shared_driver is None:\n",
"                # Set up headless Chrome options\n",
"                options = Options()\n",
"                options.add_argument(\"--headless=new\")\n",
"                options.add_argument(\"--disable-gpu\")\n",
"                options.add_argument(\"--no-sandbox\")\n",
"                options.add_argument(\"--disable-dev-shm-usage\")\n",
"                options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n",
"\n",
"                service = Service(ChromeDriverManager().install())\n",
"                cls.shared_driver = webdriver.Chrome(service=service, options=options)\n",
"            return cls.shared_driver\n",
"\n",
"    def _scrape_content(self):\n",
"        \"\"\"Load the page and return ``(text, title, links)``.\n",
"\n",
"        Failures are reported on stdout and mapped to the placeholder tuple\n",
"        ``(\"Error loading content\", \"Error\", [])`` instead of being raised.\n",
"        \"\"\"\n",
"        try:\n",
"            # Hold the lock from navigation until the HTML has been copied out, so\n",
"            # another thread cannot point the same browser at a different URL meanwhile.\n",
"            with NewWebsite._driver_lock:\n",
"                self.driver.get(self.url)\n",
"                # Wait (up to 15 seconds) until at least one link is present in the page\n",
"                WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n",
"                # Allow JS-rendered content to settle\n",
"                time.sleep(2)\n",
"                # Get the page source after rendering\n",
"                page_source = self.driver.page_source\n",
"\n",
"            soup = BeautifulSoup(page_source, \"html.parser\")\n",
"\n",
"            # Drop elements that carry no readable text\n",
"            for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
"                tag.decompose()\n",
"\n",
"            title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n",
"            body = soup.body\n",
"            text = body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n",
"\n",
"            # Extract and clean links\n",
"            links = []\n",
"            for link_tag in soup.find_all(\"a\", href=True):\n",
"                href = link_tag[\"href\"].strip()\n",
"                if href and not href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n",
"                    # Resolve relative links against the page that contained them\n",
"                    links.append(urljoin(self.url, href))\n",
"\n",
"            return text, title, links\n",
"\n",
"        except Exception as e:\n",
"            # Best effort: keep the placeholder result, but say what went wrong\n",
"            print(f\"Could not load {self.url}: {e}\")\n",
"            return \"Error loading content\", \"Error\", []\n",
"\n",
"    def get_contents(self):\n",
"        \"\"\"Return the title and text formatted as one prompt-ready string.\"\"\"\n",
"        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
"\n",
"    # Close the driver\n",
"    @classmethod\n",
"    def close_driver(cls):\n",
"        \"\"\"Quit the shared browser, if any, so that a later instance starts a fresh one.\"\"\"\n",
"        with cls._driver_lock:\n",
"            if cls.shared_driver:\n",
"                cls.shared_driver.quit()\n",
"                cls.shared_driver = None\n",
"\n",
"# System prompt for the link-selection call; the example shows the JSON shape expected back\n",
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
"link_system_prompt += \"\"\"\n",
"{\n",
"    \"links\": [\n",
"        {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
"        {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
"    ]\n",
"}\n",
"\"\"\"\n",
"\n",
"def get_links_user_prompt(website):\n",
"    \"\"\"Build the user message that asks the model to pick brochure-worthy links.\n",
"\n",
"    `website` must expose `url` and `links` (an iterable of strings); the links\n",
"    are listed one per line at the end of the message.\n",
"    \"\"\"\n",
"    intro = f\"Here is the list of links on the website of {website.url} - \"\n",
"    request = (\n",
"        \"please decide which of these are relevant web links for a brochure about the company, \"\n",
"        \"respond with the full https URL in JSON format. \"\n",
"        \"Do not include Terms of Service, Privacy, email links.\\n\"\n",
"    )\n",
"    heading = \"Links (some might be relative links):\\n\"\n",
"    link_lines = \"\\n\".join(website.links)\n",
"    return \"\".join([intro, request, heading, link_lines])\n",
"\n",
"def get_links(url):\n",
"    \"\"\"Ask the model which links found on `url` are relevant for a brochure.\n",
"\n",
"    The page is scraped with NewWebsite, its links are sent to MODEL together\n",
"    with link_system_prompt, and the reply is parsed from JSON and returned.\n",
"    \"\"\"\n",
"    page = NewWebsite(url)\n",
"    chat_messages = [\n",
"        {\"role\": \"system\", \"content\": link_system_prompt},\n",
"        {\"role\": \"user\", \"content\": get_links_user_prompt(page)},\n",
"    ]\n",
"    completion = openai.chat.completions.create(\n",
"        model=MODEL,\n",
"        messages=chat_messages,\n",
"        response_format={\"type\": \"json_object\"},  # request a JSON object as the reply\n",
"    )\n",
"    reply_text = completion.choices[0].message.content\n",
"    return json.loads(reply_text)\n",
"\n",
"def scrape_link(link):\n",
"    \"\"\"Scrape one model-selected link and return it as a labelled text section.\n",
"\n",
"    `link` is expected to be a dict with \"type\" and \"url\" keys. Failures, including\n",
"    a malformed `link`, are returned as an \"Error loading page\" section instead of\n",
"    being raised, so one bad entry cannot abort the whole run.\n",
"    \"\"\"\n",
"    # Resolve the label up front and defensively: the error branch needs it too,\n",
"    # and the entry comes from a model reply that may be missing keys.\n",
"    label = link.get(\"type\", \"unknown page\") if isinstance(link, dict) else \"unknown page\"\n",
"    try:\n",
"        page = NewWebsite(link[\"url\"])\n",
"        return f\"\\n\\n{label}\\n{page.get_contents()}\"\n",
"    except Exception as e:\n",
"        return f\"\\n\\n{label}\\nError loading page: {e}\"\n",
"\n",
"# Threaded scraper for linked pages\n",
"def get_all_details_rendered_concurrently(url):\n",
"    \"\"\"Return the landing page of `url` plus every model-selected linked page as one string.\n",
"\n",
"    Linked pages are submitted to a thread pool and appended in completion order,\n",
"    so their order can differ between runs. The shared browser is closed before\n",
"    returning, whether or not a step failed.\n",
"    \"\"\"\n",
"    try:\n",
"        result = \"Landing page:\\n\"\n",
"        result += NewWebsite(url).get_contents()\n",
"\n",
"        # LLM-filtered link generator\n",
"        links = get_links(url)\n",
"        print(\"Found links:\", links)\n",
"\n",
"        # The model is asked for {\"links\": [...]}, but its reply may have another shape\n",
"        selected = (links.get(\"links\") or []) if isinstance(links, dict) else []\n",
"\n",
"        with concurrent.futures.ThreadPoolExecutor() as executor:\n",
"            futures = [executor.submit(scrape_link, link) for link in selected]\n",
"            for future in concurrent.futures.as_completed(futures):\n",
"                result += future.result()\n",
"\n",
"        return result\n",
"    finally:\n",
"        # Close shared browser, also when a step above raised, so Chrome is not left running\n",
"        NewWebsite.close_driver()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f0d0137-52b0-47a8-81a8-11a90a010798",
"metadata": {},
"outputs": [],
"source": [
"# System prompt for the proposal step; edit this text to change the brief given to the model\n",
"\n",
"system_prompt = \"You are an LLM Engineer that analyzes the contents of several relevant pages from a company website \\\n",
"and rewrites internal tools and systems and rebuilds them end-to-end, starting from scratch. Starting with the online application at cardiff.co/apply, \\\n",
"Tell me why you're best suited to be the lead of this project and work with our 12 year resident developer to implement a \\\n",
"state of the art solution in record time. Include backend architecture, model orchestration, how you handle latency, cost and user experience, \\\n",
"and details of how you would achieve this goal based on company culture and industries served if you have the information, \\\n",
"and walk me through the details like you're explaining it to a sharp product owner. Respond in markdown.\"\n",
"\n",
"\n",
"def get_solution_user_prompt(company_name, url, max_chars=5_000):\n",
"    \"\"\"Build the user message for the proposal step from the scraped site.\n",
"\n",
"    Args:\n",
"        company_name: name used to introduce the company to the model.\n",
"        url: landing page passed to get_all_details_rendered_concurrently.\n",
"        max_chars: upper bound on the length of the returned prompt, in characters.\n",
"            Defaults to 5,000; pass None to disable truncation.\n",
"\n",
"    Returns:\n",
"        The prompt text, cut off after `max_chars` characters.\n",
"    \"\"\"\n",
"    user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
"    user_prompt += \"Here are the contents of its landing page and other relevant pages; use this information to build a solution to rewrite the company's application in markdown.\\n\"\n",
"    user_prompt += get_all_details_rendered_concurrently(url)\n",
"    return user_prompt[:max_chars]  # Truncate if longer than max_chars\n",
"\n",
"def create_solution(company_name, url):\n",
"    \"\"\"Ask MODEL for a rebuild proposal for the company, render it and return it.\n",
"\n",
"    The reply is shown in the notebook as Markdown and also returned as a string.\n",
"    \"\"\"\n",
"    chat_messages = [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": get_solution_user_prompt(company_name, url)},\n",
"    ]\n",
"    completion = openai.chat.completions.create(model=MODEL, messages=chat_messages)\n",
"    proposal_markdown = completion.choices[0].message.content\n",
"\n",
"    display(Markdown(proposal_markdown))\n",
"    return proposal_markdown\n",
"\n",
"#create_solution(\"Cardiff\", \"https://cardiff.co\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60ce7000-a4a5-4cce-a261-e75ef45063b4",
"metadata": {},
"outputs": [],
"source": [
"# Turn the proposal into code (streamed into the notebook) and a UI mockup image\n",
"\n",
"# NOTE(review): new_system_prompt is not referenced by any cell visible in this notebook;\n",
"# develop_from_proposal builds its own system message.\n",
"new_system_prompt = (\n",
"    \"You are a Senior Engineer that analyzes the planned solution given to you for a company website \"\n",
"    \"and you rewrite code for rebuilding internal tools and systems end-to-end based on the proposed solutions. \"\n",
"    \"Start with the online application at cardiff.co/apply, use canvas and write code for the proposed solution \"\n",
"    \"in the appropriate language that best suits the task for backend architecture, model orchestration, how you handle latency, cost and user experience wherever possible.\"\n",
")\n",
"\n",
"# Folder that receives the generated code files and the mockup image\n",
"output_dir = \"cardiff_rebuild_output\"\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"def save_code_blocks(markdown_text, base_filename=\"cardiff_code\", output_dir=\"cardiff_rebuild_output\"):\n",
"    \"\"\"Write every fenced code block found in `markdown_text` to its own file.\n",
"\n",
"    Args:\n",
"        markdown_text: Markdown text that may contain fenced code blocks.\n",
"        base_filename: prefix of each file name; files are numbered _part1, _part2, ...\n",
"        output_dir: folder to write into; created if it does not exist.\n",
"\n",
"    Returns:\n",
"        The paths of the files written, in the order the blocks appear.\n",
"    \"\"\"\n",
"    # Fence labels whose usual file extension differs from the label itself\n",
"    extension_for = {\n",
"        \"python\": \"py\",\n",
"        \"javascript\": \"js\",\n",
"        \"typescript\": \"ts\",\n",
"        \"bash\": \"sh\",\n",
"        \"shell\": \"sh\",\n",
"        \"markdown\": \"md\",\n",
"        \"c++\": \"cpp\",\n",
"        \"c#\": \"cs\",\n",
"        \"text\": \"txt\",\n",
"        \"plaintext\": \"txt\",\n",
"    }\n",
"\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    code_blocks = re.findall(r\"```(.*?)\\n(.*?)```\", markdown_text, re.DOTALL)\n",
"    saved_files = []\n",
"\n",
"    for idx, (language, code) in enumerate(code_blocks, 1):\n",
"        # The fence label is model output: use only its first word, and keep only\n",
"        # letters and digits so it can never add path separators to the file name.\n",
"        words = language.strip().lower().split()\n",
"        label = words[0] if words else \"\"\n",
"        ext = extension_for.get(label) or re.sub(r\"[^a-z0-9]\", \"\", label) or \"txt\"\n",
"        filename = f\"{base_filename}_part{idx}.{ext}\"\n",
"        filepath = os.path.join(output_dir, filename)\n",
"        with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
"            f.write(code)\n",
"        saved_files.append(filepath)\n",
"\n",
"    return saved_files\n",
"\n",
"def develop_from_proposal(proposal_text, company_name):\n",
"    \"\"\"Generate code and a UI mockup image from a proposal and save both to disk.\n",
"\n",
"    Streams a gpt-4o completion into the notebook as Markdown, writes its fenced\n",
"    code blocks to files with save_code_blocks, then requests a wireframe image\n",
"    from dall-e-3, downloads it into `output_dir` and displays it.\n",
"\n",
"    Args:\n",
"        proposal_text: the proposal to implement, sent as the user message.\n",
"        company_name: used in the image prompt and in the image file name.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the generated image cannot be downloaded.\n",
"    \"\"\"\n",
"    # Stream code generation from GPT-4o.\n",
"    # Adjacent literals are used instead of a backslash continuation so that the\n",
"    # source indentation does not end up inside the prompt text.\n",
"    system = (\n",
"        \"You are a senior software engineer. Use the following proposal to generate production-ready code to \"\n",
"        \"implement the backend, frontend, and any orchestration described. Write clean, documented code in markdown format.\"\n",
"    )\n",
"\n",
"    stream = openai.chat.completions.create(\n",
"        model=\"gpt-4o\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system},\n",
"            {\"role\": \"user\", \"content\": proposal_text}\n",
"        ],\n",
"        stream=True\n",
"    )\n",
"\n",
"    response = \"\"\n",
"    display_handle = display(Markdown(\"\"), display_id=True)\n",
"    for chunk in stream:\n",
"        if not chunk.choices:\n",
"            continue  # guard against chunks that arrive with an empty choices list\n",
"        content = chunk.choices[0].delta.content or \"\"\n",
"        response += content\n",
"        update_display(Markdown(response), display_id=display_handle.display_id)\n",
"\n",
"    saved_files = save_code_blocks(response)\n",
"\n",
"    # Generate a UI design mockup image\n",
"    image_prompt = f\"A modern, mobile-friendly UI wireframe for a business loan application system for {company_name}. Clean layout, input fields for business name, revenue, loan amount, industry, and contact info. Includes a step-by-step progress bar, submit button, and secure branding.\"\n",
"\n",
"    img_response = openai.images.generate(\n",
"        model=\"dall-e-3\",\n",
"        prompt=image_prompt,\n",
"        n=1,\n",
"        size=\"1024x1024\"\n",
"    )\n",
"\n",
"    image_url = img_response.data[0].url\n",
"    # Keep only letters, digits, \"-\" and \"_\" so any company name yields a valid file name\n",
"    safe_name = re.sub(r\"[^a-z0-9_-]+\", \"_\", company_name.lower()).strip(\"_\") or \"company\"\n",
"    img_path = os.path.join(output_dir, f\"{safe_name}_ui_mockup.png\")\n",
"    image_download = requests.get(image_url, timeout=60)\n",
"    image_download.raise_for_status()  # do not save an HTTP error page as a .png\n",
"    with open(img_path, 'wb') as handler:\n",
"        handler.write(image_download.content)\n",
"\n",
"    print(\"Code files saved to:\", saved_files)\n",
"    print(\"UI mockup saved at:\", img_path)\n",
"\n",
"    display(Markdown(\"### Proposed UI Design\"))\n",
"    # Show the saved copy so the output does not depend on the remote URL staying reachable\n",
"    display(Image(filename=img_path))\n",
"\n",
"# Run the pipeline end to end: write the proposal first, then build from it\n",
"proposal = create_solution(company_name=\"Cardiff\", url=\"https://cardiff.co\")\n",
"develop_from_proposal(proposal_text=proposal, company_name=\"Cardiff\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538",
"metadata": {},
"outputs": [],
"source": [
"# Get Llama 3.2 to answer"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}