{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b7c1d0e2-brochure-intro",
   "metadata": {},
   "source": [
    "# Company brochure generator\n",
    "\n",
    "Given a company name and website URL, this notebook:\n",
    "\n",
    "1. asks an LLM (through the OpenRouter endpoint) which links on the site are relevant for a brochure,\n",
    "2. fetches the landing page and the selected pages with the local `scraper` module,\n",
    "3. streams a markdown brochure built from that text, and\n",
    "4. streams a translation of the brochure into a language of your choice.\n",
    "\n",
    "Requires `OPENROUTER_API_KEY` in the environment or in a `.env` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e5da3f5-ebd0-4e20-ab89-95847187287b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "from typing import List\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display, update_display, clear_output\n",
    "from openai import OpenAI\n",
    "\n",
    "from scraper import fetch_website_links, fetch_website_contents"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3a9e5f1-brochure-setup",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86adec56-3b27-46da-9b1a-1e5946a76a09",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENROUTER_API_KEY')\n",
    "\n",
    "# Check the key\n",
    "\n",
    "if not api_key:\n",
    "    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
    "elif api_key.strip() != api_key:\n",
    "    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
    "else:\n",
    "    print(\"API key found and looks good so far!\")\n",
    "\n",
    "# NOTE(review): `headers` is not referenced anywhere else in this notebook.\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "}\n",
    "\n",
    "openrouter_url = \"https://openrouter.ai/api/v1\"\n",
    "openai = OpenAI(api_key=api_key, base_url=openrouter_url)\n",
    "\n",
    "# OpenRouter model ids are namespaced by provider (\"<provider>/<model>\").\n",
    "MODEL = \"openai/gpt-5-nano\"\n",
    "\n",
    "# Tunables: how many scraped links are shown to the model, and the cap on the brochure prompt size.\n",
    "MAX_LINKS = 20\n",
    "MAX_PROMPT_CHARS = 20000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abf2f706-2709-404a-9fb7-774a9f57dd11",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "company_name = input(\"Enter the company name: \").strip()\n",
    "url = input(\"Enter the company url: \").strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4b8f6a2-brochure-links",
   "metadata": {},
   "source": [
    "## Link selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "153fa3d1-3ce5-46d0-838d-3e95a4b8628b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# System prompt for the link-selection call: describes the task and shows the expected JSON shape.\n",
    "link_system_prompt = \"You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
    "link_system_prompt += \"You should respond in JSON as in this example:\"\n",
    "link_system_prompt += \"\"\"\n",
    " EXAMPLE 1:\n",
    " {\n",
    " \"links\": [\n",
    " {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
    " {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
    " ]\n",
    " }\n",
    " EXAMPLE 2:\n",
    " {\n",
    " \"links\": [\n",
    " {\"type\": \"company blog\", \"url\": \"https://blog.example.com\"},\n",
    " {\"type\": \"our story\", \"url\": \"https://example.com/our-story\"}\n",
    " ]\n",
    " }\n",
    " \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fcacc2e-7445-4d8a-aa80-489d3a2247ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links_user_prompt(url, max_links=MAX_LINKS):\n",
    "    \"\"\"Build the user prompt asking the model to pick brochure-relevant links.\n",
    "\n",
    "    Fetches the links of `url` with `fetch_website_links` and lists at most\n",
    "    `max_links` of them, one per line, below the instructions.\n",
    "    \"\"\"\n",
    "    # Tolerate a scraper that returns None instead of an empty list.\n",
    "    links = fetch_website_links(url) or []\n",
    "    user_prompt = f\"Here is the list of links on the website of {url} - \"\n",
    "    user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\\n\"\n",
    "    user_prompt += \"Links (some might be relative links):\\n\"\n",
    "    user_prompt += \"\\n\".join(links[:max_links])\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfe222c5-0d3e-4be2-85e1-596ab9d407dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links(url):\n",
    "    \"\"\"Ask the model which links found on `url` belong in a brochure.\n",
    "\n",
    "    Returns the parsed JSON reply as a dict that always contains a \"links\"\n",
    "    list; the list is empty when the reply is missing, is not valid JSON,\n",
    "    or does not have the requested shape.\n",
    "    \"\"\"\n",
    "    response = openai.chat.completions.create(\n",
    "        model=MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": link_system_prompt},\n",
    "            {\"role\": \"user\", \"content\": get_links_user_prompt(url)},\n",
    "        ],\n",
    "        response_format={\"type\": \"json_object\"},\n",
    "    )\n",
    "    result = response.choices[0].message.content\n",
    "    try:\n",
    "        # `content` can be None, which json.loads would reject with a TypeError.\n",
    "        parsed = json.loads(result or \"{}\")\n",
    "    except json.JSONDecodeError as e:\n",
    "        print(f\"Could not parse the link selection as JSON: {e}\")\n",
    "        parsed = {}\n",
    "    if not isinstance(parsed, dict):\n",
    "        parsed = {}\n",
    "    if not isinstance(parsed.get(\"links\"), list):\n",
    "        parsed[\"links\"] = []\n",
    "    return parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c964bdce-be5d-41c7-a8d7-8e25e58463c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_details(url):\n",
    "    \"\"\"Return the landing page text followed by the text of each selected link.\n",
    "\n",
    "    Links that have no url, or whose page cannot be fetched, are reported with\n",
    "    a printed message and skipped.\n",
    "    \"\"\"\n",
    "    result = \"Landing page:\\n\"\n",
    "    result += fetch_website_contents(url)\n",
    "    links = get_links(url)\n",
    "\n",
    "    for link in links.get(\"links\", []):\n",
    "        link_url = link.get(\"url\") if isinstance(link, dict) else None\n",
    "        if not link_url:\n",
    "            print(f\"Omitted link without a url: {link}\")\n",
    "            continue\n",
    "        link_type = link.get(\"type\", \"link\")\n",
    "        try:\n",
    "            # Fetch before appending anything so a failed page leaves no orphan header.\n",
    "            contents = fetch_website_contents(link_url)\n",
    "        except Exception as e:\n",
    "            print(f\"Omitted link: {link_url}: {e}\")\n",
    "            continue\n",
    "        result += f\"\\n\\n### Link: {link_type}\\n\"\n",
    "        result += contents\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5c7a1b3-brochure-generate",
   "metadata": {},
   "source": [
    "## Brochure generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5337019a-b789-49d7-bf10-0f15148c0276",
   "metadata": {},
   "outputs": [],
   "source": [
    "# System prompt for the brochure call: tone, output format and the section outline.\n",
    "system_prompt = (\n",
    "    \"You are an assistant that analyzes the contents of several relevant pages from a company website \"\n",
    "    \"and creates a great type of brochure about the company for prospective customers, investors, and recruits. \"\n",
    "    \"Respond in markdown. Include details of company culture, customers, and careers/jobs if you have the information. Add emoticons where ever possible.\\n\\n\"\n",
    "\n",
    "    \"Please structure the brochure using the following sections:\\n\"\n",
    "    \"1. **Introduction**: A brief overview of the company.\\n\"\n",
    "    \"2. **Company Culture**: Emphasize fun, atmosphere, and any unique cultural elements.\\n\"\n",
    "    \"3. **Customers**: Mention notable customers or industries.\\n\"\n",
    "    \"4. **Careers/Jobs**: Highlight career opportunities.\\n\"\n",
    "    \"5. **Conclusion**: Wrap up with a final lighthearted message.\\n\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dd4f2d4-8189-452a-b15a-c09ae5894ac8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_brochure_user_prompt(company_name, url, max_chars=MAX_PROMPT_CHARS):\n",
    "    \"\"\"Build the brochure user prompt from the scraped site text.\n",
    "\n",
    "    The prompt names the company, adds the output of `get_all_details(url)`\n",
    "    and is truncated to `max_chars` characters.\n",
    "    \"\"\"\n",
    "    user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
    "    user_prompt += \"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
    "    user_prompt += get_all_details(url)\n",
    "    return user_prompt[:max_chars]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ab4bfef-eb22-43fb-8a46-f1f6a225793b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# An opening code fence with an optional language tag, and the matching closing fence.\n",
    "_FENCE_OPEN = re.compile(r\"^\\s*```[\\w-]*(?:\\n|$)\")\n",
    "_FENCE_CLOSE = re.compile(r\"\\n?```\\s*$\")\n",
    "\n",
    "\n",
    "def _strip_markdown_fence(text):\n",
    "    \"\"\"Remove a code fence wrapping the whole reply; leave the body text untouched.\"\"\"\n",
    "    opened = _FENCE_OPEN.match(text)\n",
    "    if not opened:\n",
    "        return text\n",
    "    return _FENCE_CLOSE.sub(\"\", text[opened.end():])\n",
    "\n",
    "\n",
    "def stream_brochure(company=None, website=None):\n",
    "    \"\"\"Stream a markdown brochure into the cell output and return its text.\n",
    "\n",
    "    `company` and `website` fall back to the notebook-level `company_name`\n",
    "    and `url` when omitted. The text shown is also kept in the global\n",
    "    `brochure_text`, which `user_translate_brochure` reads by default.\n",
    "    \"\"\"\n",
    "    global brochure_text\n",
    "    brochure_text = \"\"\n",
    "    company = company_name if company is None else company\n",
    "    website = url if website is None else website\n",
    "\n",
    "    stream = openai.chat.completions.create(\n",
    "        model=MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": system_prompt},\n",
    "            {\"role\": \"user\", \"content\": get_brochure_user_prompt(company, website)},\n",
    "        ],\n",
    "        stream=True,\n",
    "    )\n",
    "\n",
    "    raw = \"\"\n",
    "    display_handle = display(Markdown(\"\"), display_id=True)\n",
    "    for chunk in stream:\n",
    "        # A chunk can arrive with an empty `choices` list (e.g. a usage-only chunk).\n",
    "        if not chunk.choices:\n",
    "            continue\n",
    "        content = chunk.choices[0].delta.content or \"\"\n",
    "        if not content:\n",
    "            continue\n",
    "        raw += content\n",
    "        # Re-derive from the raw text each time so a fence split across chunks is still removed.\n",
    "        brochure_text = _strip_markdown_fence(raw)\n",
    "        update_display(Markdown(brochure_text), display_id=display_handle.display_id)\n",
    "    return brochure_text"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6d2b4c8-brochure-translate",
   "metadata": {},
   "source": [
    "## Translation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7828c747-7872-48e2-b3e6-faab95ba76cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_translate_brochure(language, text=None):\n",
    "    \"\"\"Stream a translation of the brochure into `language` and return it.\n",
    "\n",
    "    Translates `text` when given, otherwise the global `brochure_text` set by\n",
    "    `stream_brochure`. The cell output is cleared before the translation is shown.\n",
    "    Returns an empty string, without calling the model, when there is nothing to translate.\n",
    "    \"\"\"\n",
    "    source_text = globals().get(\"brochure_text\", \"\") if text is None else text\n",
    "    if not source_text:\n",
    "        print(\"No brochure text to translate - run stream_brochure() first.\")\n",
    "        return \"\"\n",
    "\n",
    "    clear_output(wait=True)\n",
    "\n",
    "    translation_stream = openai.chat.completions.create(\n",
    "        model=MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"user\", \"content\": f\"Translate the following to {language}:\\n {source_text}\"}\n",
    "        ],\n",
    "        stream=True,\n",
    "    )\n",
    "\n",
    "    display_handle = display(Markdown(\"\"), display_id=True)\n",
    "    translated_text = \"\"\n",
    "\n",
    "    for chunk in translation_stream:\n",
    "        # A chunk can arrive with an empty `choices` list (e.g. a usage-only chunk).\n",
    "        if not chunk.choices:\n",
    "            continue\n",
    "        content = chunk.choices[0].delta.content or \"\"\n",
    "        if content:\n",
    "            translated_text += content\n",
    "            update_display(Markdown(translated_text), display_id=display_handle.display_id)\n",
    "    return translated_text"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7e3c5d9-brochure-run",
   "metadata": {},
   "source": [
    "## Run\n",
    "\n",
    "The brochure is streamed first; once a language is entered, the output is replaced by the streamed translation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6cfa92a-8a86-485d-a7e1-1651705ee6dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "brochure_markdown = stream_brochure(company_name, url)\n",
    "language_choice = input(\"Enter the language to translate the brochure into (e.g., 'French'): \").strip()\n",
    "# Assigned (not left as a bare last expression) so the returned text is not echoed below the rendered markdown.\n",
    "translated_brochure = user_translate_brochure(language_choice, brochure_markdown)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}