350 lines
15 KiB
Plaintext
350 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5",
|
|
"metadata": {},
|
|
"source": [
|
|
"# End of week 1 exercise\n",
|
|
"\n",
|
|
"To demonstrate your familiarity with the OpenAI API, and also Ollama, build a tool that takes a technical question, \n",
|
|
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c1070317-3ed9-4659-abe3-828943230e03",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# imports\n",
|
|
"\n",
|
|
"import os\n",
|
|
"import requests\n",
|
|
"import json\n",
|
|
"from typing import List\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from IPython.display import Markdown, display, update_display, Image\n",
|
|
"from openai import OpenAI\n",
|
|
"from selenium import webdriver\n",
|
|
"from selenium.webdriver.chrome.options import Options\n",
|
|
"from selenium.webdriver.chrome.service import Service\n",
|
|
"from selenium.webdriver.common.by import By\n",
|
|
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
|
"from selenium.webdriver.support import expected_conditions as EC\n",
|
|
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
|
"from urllib.parse import urlparse, urljoin\n",
|
|
"import time\n",
|
|
"import random\n",
|
|
"import concurrent.futures\n",
|
|
"import re"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4a456906-915a-4bfd-bb9d-57e505c5093f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# constants\n",
|
|
"\n",
|
|
"MODEL = 'gpt-4o-mini'\n",
|
|
"openai = OpenAI()\n",
|
|
"MODEL_GPT = 'gpt-4o-mini'\n",
|
|
"MODEL_LLAMA = 'llama3.2'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a8d7923c-5f28-4c30-8556-342d7c8497c1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# scraping helpers: rendered-page fetcher and LLM-based link selection\n",
|
|
"\n",
|
|
"# A modified class to fetch and parse fully rendered pages\n",
|
|
"class NewWebsite:\n",
|
|
" shared_driver = None # Class variable to share browser instance\n",
|
|
"\n",
|
|
" def __init__(self, url, driver=None):\n",
|
|
" self.url = url\n",
|
|
" self.driver = driver or NewWebsite._get_shared_driver()\n",
|
|
" self.text, self.title, self.links = self._scrape_content()\n",
|
|
" \n",
|
|
" @classmethod\n",
|
|
" def _get_shared_driver(cls):\n",
|
|
" if cls.shared_driver is None:\n",
|
|
" # Set up headless Chrome options\n",
|
|
" options = Options()\n",
|
|
" options.add_argument(\"--headless=new\")\n",
|
|
" options.add_argument(\"--disable-gpu\")\n",
|
|
" options.add_argument(\"--no-sandbox\")\n",
|
|
" options.add_argument(\"--disable-dev-shm-usage\")\n",
|
|
" options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n",
|
|
"\n",
|
|
" service = Service(ChromeDriverManager().install())\n",
|
|
" cls.shared_driver = webdriver.Chrome(service=service, options=options)\n",
|
|
" return cls.shared_driver\n",
|
|
"\n",
|
|
" def _scrape_content(self):\n",
|
|
" try:\n",
|
|
" self.driver.get(self.url)\n",
|
|
" # Wait up to 15 seconds for at least one link to be present in the rendered page\n",
|
|
" WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n",
|
|
" # Allow JS-rendered content to settle\n",
|
|
" time.sleep(2)\n",
|
|
"\n",
|
|
" # Get the page source after rendering\n",
|
|
" soup = BeautifulSoup(self.driver.page_source, \"html.parser\")\n",
|
|
" \n",
|
|
" for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n",
|
|
" tag.decompose()\n",
|
|
" \n",
|
|
" title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n",
|
|
" body = soup.body\n",
|
|
" text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n",
|
|
"\n",
|
|
" # Extract and clean links\n",
|
|
" links = []\n",
|
|
" for link_tag in soup.find_all(\"a\", href=True):\n",
|
|
" href = link_tag[\"href\"].strip()\n",
|
|
" if href and not href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n",
|
|
" full_url = urljoin(self.url, href)\n",
|
|
" links.append(full_url)\n",
|
|
" \n",
|
|
" return text, title, links\n",
|
|
" \n",
|
|
" except Exception as e:\n",
|
|
" return \"Error loading content\", \"Error\", []\n",
|
|
"\n",
|
|
" def get_contents(self):\n",
|
|
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n",
|
|
"\n",
|
|
" # Close the driver\n",
|
|
" @classmethod\n",
|
|
" def close_driver(cls):\n",
|
|
" if cls.shared_driver:\n",
|
|
" cls.shared_driver.quit()\n",
|
|
" cls.shared_driver = None\n",
|
|
"\n",
|
|
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
|
|
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
|
|
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
|
|
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
|
|
"link_system_prompt += \"\"\"\n",
|
|
"{\n",
|
|
" \"links\": [\n",
|
|
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
|
|
" {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
|
|
" ]\n",
|
|
"}\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"def get_links_user_prompt(website):\n",
|
|
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
|
|
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
|
|
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
|
|
" user_prompt += \"Links (some might be relative links):\\n\"\n",
|
|
" user_prompt += \"\\n\".join(website.links)\n",
|
|
" return user_prompt\n",
|
|
"\n",
|
|
"def get_links(url):\n",
|
|
" website = NewWebsite(url)\n",
|
|
" response = openai.chat.completions.create(\n",
|
|
" model=MODEL,\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
|
|
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
|
|
" ],\n",
|
|
" response_format={\"type\": \"json_object\"}\n",
|
|
" )\n",
|
|
" result = response.choices[0].message.content\n",
|
|
" return json.loads(result)\n",
|
|
"\n",
|
|
"def scrape_link(link):\n",
|
|
" try:\n",
|
|
" page = NewWebsite(link[\"url\"])\n",
|
|
" return f\"\\n\\n{link['type']}\\n{page.get_contents()}\"\n",
|
|
" except Exception as e:\n",
|
|
" return f\"\\n\\n{link['type']}\\nError loading page: {e}\"\n",
|
|
"\n",
|
|
"# Threaded scraper for linked pages\n",
|
|
"def get_all_details_rendered_concurrently(url):\n",
|
|
" result = \"Landing page:\\n\"\n",
|
|
" result += NewWebsite(url).get_contents()\n",
|
|
"\n",
|
|
" # LLM-filtered link generator\n",
|
|
" links = get_links(url)\n",
|
|
" print(\"Found links:\", links)\n",
|
|
"\n",
|
|
" with concurrent.futures.ThreadPoolExecutor() as executor:\n",
|
|
" future_to_link = {executor.submit(scrape_link, link): link for link in links[\"links\"]}\n",
|
|
" for future in concurrent.futures.as_completed(future_to_link):\n",
|
|
" result += future.result()\n",
|
|
"\n",
|
|
" # Close shared browser\n",
|
|
" NewWebsite.close_driver()\n",
|
|
" return result\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3f0d0137-52b0-47a8-81a8-11a90a010798",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# here is the question; type over this to ask something new\n",
|
|
"\n",
|
|
"system_prompt = \"You are an LLM Engineer that analyzes the contents of several relevant pages from a company website \\\n",
|
|
"and rewrites internal tools and systems and rebuilds them end-to-end, starting from scratch. Starting with the online application at cardiff.co/apply, \\\n",
|
|
"Tell me why you're best suited to be the lead of this project and work with our 12 year resident developer to implement a \\\n",
|
|
"state of the art solution in record time. Include backend architecture, model orchestration, how you handle latency, cost and user experience, \\\n",
|
|
"and details of how you would achieve this goal based on company culture and industries served if you have the information, \\\n",
|
|
"and walk me through the details like you're explaining it to a sharp product owner. Respond in markdown.\"\\\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_solution_user_prompt(company_name, url):\n",
|
|
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
|
|
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a solution to rewrite the company's application in markdown.\\n\"\n",
|
|
" #user_prompt += get_all_details(url)\n",
|
|
" user_prompt += get_all_details_rendered_concurrently(url)\n",
|
|
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
|
" return user_prompt\n",
|
|
"\n",
|
|
"def create_solution(company_name, url):\n",
|
|
" response = openai.chat.completions.create(\n",
|
|
" model=MODEL,\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
|
" {\"role\": \"user\", \"content\": get_solution_user_prompt(company_name, url)}\n",
|
|
" ],\n",
|
|
" )\n",
|
|
" result = response.choices[0].message.content\n",
|
|
" display(Markdown(result))\n",
|
|
"\n",
|
|
" return result\n",
|
|
"\n",
|
|
"#create_solution(\"Cardiff\", \"https://cardiff.co\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "60ce7000-a4a5-4cce-a261-e75ef45063b4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Draft a proposal with gpt-4o-mini, then stream generated code from gpt-4o\n",
|
|
"\n",
|
|
"new_system_prompt = \"You are a Senior Engineer that analyzes the planned solution given to you for a company website \\\n",
|
|
"and you rewrite code for rebuilding internal tools and systems end-to-end based on the proposed solutions. \\\n",
|
|
"Start with the online application at cardiff.co/apply, use canvas and write code for the proposed solution \\\n",
|
|
"in the appropriate language that best suits the task for backend architecture, model orchestration, how you handle latency, cost and user experience wherever possible.\"\n",
|
|
"\n",
|
|
"output_dir = \"cardiff_rebuild_output\"\n",
|
|
"os.makedirs(output_dir, exist_ok=True)\n",
|
|
"\n",
|
|
"def save_code_blocks(markdown_text, base_filename=\"cardiff_code\"):\n",
|
|
" output_dir = \"cardiff_rebuild_output\"\n",
|
|
" os.makedirs(output_dir, exist_ok=True)\n",
|
|
" \n",
|
|
" code_blocks = re.findall(r\"```(.*?)\\n(.*?)```\", markdown_text, re.DOTALL)\n",
|
|
" saved_files = []\n",
|
|
"\n",
|
|
" for idx, (language, code) in enumerate(code_blocks, 1):\n",
|
|
" ext = language.strip() if language else \"txt\"\n",
|
|
" filename = f\"{base_filename}_part{idx}.{ext}\"\n",
|
|
" filepath = os.path.join(output_dir, filename)\n",
|
|
" with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
|
|
" f.write(code)\n",
|
|
" saved_files.append(filepath)\n",
|
|
"\n",
|
|
" return saved_files\n",
|
|
"\n",
|
|
"def develop_from_proposal(proposal_text, company_name):\n",
|
|
" # Stream code generation from GPT-4o\n",
|
|
" system = \"You are a senior software engineer. Use the following proposal to generate production-ready code to \\\n",
|
|
" implement the backend, frontend, and any orchestration described. Write clean, documented code in markdown format.\"\n",
|
|
" \n",
|
|
" stream = openai.chat.completions.create(\n",
|
|
" model=\"gpt-4o\",\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": system},\n",
|
|
" {\"role\": \"user\", \"content\": proposal_text}\n",
|
|
" ],\n",
|
|
" stream=True\n",
|
|
" )\n",
|
|
"\n",
|
|
" response = \"\"\n",
|
|
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
|
" for chunk in stream:\n",
|
|
" content = chunk.choices[0].delta.content or \"\"\n",
|
|
" response += content\n",
|
|
" update_display(Markdown(response), display_id=display_handle.display_id)\n",
|
|
"\n",
|
|
" saved_files = save_code_blocks(response)\n",
|
|
" \n",
|
|
" # Generate a UI design mockup image\n",
|
|
" image_prompt = f\"A modern, mobile-friendly UI wireframe for a business loan application system for {company_name}. Clean layout, input fields for business name, revenue, loan amount, industry, and contact info. Includes a step-by-step progress bar, submit button, and secure branding.\"\n",
|
|
" \n",
|
|
" img_response = openai.images.generate(\n",
|
|
" model=\"dall-e-3\",\n",
|
|
" prompt=image_prompt,\n",
|
|
" n=1,\n",
|
|
" size=\"1024x1024\"\n",
|
|
" )\n",
|
|
" \n",
|
|
" image_url = img_response.data[0].url\n",
|
|
" img_path = os.path.join(output_dir, f\"{company_name.lower()}_ui_mockup.png\")\n",
|
|
" with open(img_path, 'wb') as handler:\n",
|
|
" handler.write(requests.get(image_url).content)\n",
|
|
"\n",
|
|
" print(\"Code files saved to:\", saved_files)\n",
|
|
" print(\"UI mockup saved at:\", img_path)\n",
|
|
"\n",
|
|
" display(Markdown(\"### Proposed UI Design\"))\n",
|
|
" display(Image(url=image_url))\n",
|
|
"\n",
|
|
"proposal = create_solution(\"Cardiff\", \"https://cardiff.co\")\n",
|
|
"develop_from_proposal(proposal, \"Cardiff\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Get Llama 3.2 to answer"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|