Files
LLM_Engineering_OLD/week1/community-contributions/company_brochure_relevent_links.ipynb
Natalia Kaczyńska 3343d77152 add solutions for week1
2025-10-17 20:20:03 +02:00

280 lines
9.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4e5da3f5-ebd0-4e20-ab89-95847187287b",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"from typing import List\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display, update_display, clear_output\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"import os\n",
"from scraper import fetch_website_links, fetch_website_contents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86adec56-3b27-46da-9b1a-1e5946a76a09",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENROUTER_API_KEY')\n",
"\n",
"# Report the key's state: usable, padded with whitespace, or missing.\n",
"if api_key and api_key == api_key.strip():\n",
"    print(\"API key found and looks good so far!\")\n",
"elif api_key:\n",
"    print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
"    print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"\n",
"# Browser-style User-Agent header; no cell visible in this notebook reads it.\n",
"headers = {\n",
"    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"# Point the OpenAI client at the OpenRouter base URL.\n",
"MODEL = \"gpt-5-nano\"\n",
"openrouter_url = \"https://openrouter.ai/api/v1\"\n",
"openai = OpenAI(base_url=openrouter_url, api_key=api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abf2f706-2709-404a-9fb7-774a9f57dd11",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Ask for the brochure target. Surrounding whitespace (common with copy/paste)\n",
"# is removed so it does not end up in the prompts or the request URL.\n",
"company_name = input(\"Enter the company name: \").strip()\n",
"url = input(\"Enter the company url: \").strip()\n",
"\n",
"# A bare domain such as \"example.com\" has no scheme; default to https so the\n",
"# fetch helpers are given an absolute URL.\n",
"if url and \"://\" not in url:\n",
"    url = f\"https://{url}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "153fa3d1-3ce5-46d0-838d-3e95a4b8628b",
"metadata": {},
"outputs": [],
"source": [
"# System prompt for the link-selection call: the task description, the\n",
"# response-format instruction, then two sample JSON replies.\n",
"link_system_prompt = (\n",
"    \"You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
"    \"You should respond in JSON as in this example:\"\n",
"    \"\"\"\n",
" EXAMPLE 1:\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
" {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
" ]\n",
" }\n",
" EXAMPLE 2:\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"company blog\", \"url\": \"https://blog.example.com\"},\n",
" {\"type\": \"our story\", \"url\": \"https://example.com/our-story\"}\n",
" ]\n",
" }\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fcacc2e-7445-4d8a-aa80-489d3a2247ec",
"metadata": {},
"outputs": [],
"source": [
"def get_links_user_prompt(url, max_links=20):\n",
"    \"\"\"Build the user prompt asking the model to pick brochure-worthy links.\n",
"\n",
"    Args:\n",
"        url: Address of the website; its links are obtained with\n",
"            fetch_website_links(url).\n",
"        max_links: Maximum number of links listed in the prompt. Defaults to\n",
"            20, the limit that used to be hard-coded.\n",
"\n",
"    Returns:\n",
"        The instruction text followed by the selected links, one per line.\n",
"    \"\"\"\n",
"    user_prompt = f\"Here is the list of links on the website of {url} - \"\n",
"    user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\\n\"\n",
"    user_prompt += \"Links (some might be relative links):\\n\"\n",
"    # Drop empty entries and repeats (keeping first-seen order) before applying\n",
"    # the cap, so duplicates do not use up the limited number of slots.\n",
"    links = list(dict.fromkeys(link for link in (fetch_website_links(url) or []) if link))\n",
"    user_prompt += \"\\n\".join(links[:max_links])\n",
"    return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfe222c5-0d3e-4be2-85e1-596ab9d407dc",
"metadata": {},
"outputs": [],
"source": [
"def get_links(url):\n",
"    \"\"\"Ask the model which links found on `url` belong in a company brochure.\n",
"\n",
"    Sends link_system_prompt and get_links_user_prompt(url) to MODEL and\n",
"    requests a JSON object response.\n",
"\n",
"    Returns:\n",
"        The parsed reply, a dict whose \"links\" value is a list. If the reply\n",
"        is empty, is not valid JSON, or has no \"links\" list, the fallback\n",
"        {\"links\": []} is returned so callers can always iterate result[\"links\"].\n",
"    \"\"\"\n",
"    response = openai.chat.completions.create(\n",
"        model=MODEL,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": link_system_prompt},\n",
"            {\"role\": \"user\", \"content\": get_links_user_prompt(url)},\n",
"        ],\n",
"        response_format={\"type\": \"json_object\"},\n",
"    )\n",
"    result = response.choices[0].message.content\n",
"    try:\n",
"        # `or \"\"` turns a missing (None) message body into a parse error below.\n",
"        parsed = json.loads(result or \"\")\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"Could not parse the link selection as JSON: {e}\")\n",
"        return {\"links\": []}\n",
"    if not isinstance(parsed, dict) or not isinstance(parsed.get(\"links\"), list):\n",
"        return {\"links\": []}\n",
"    return parsed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c964bdce-be5d-41c7-a8d7-8e25e58463c5",
"metadata": {},
"outputs": [],
"source": [
"def get_all_details(url):\n",
"    \"\"\"Collect the landing page text plus the text of every selected link.\n",
"\n",
"    Args:\n",
"        url: Address of the company's landing page.\n",
"\n",
"    Returns:\n",
"        One string: \"Landing page:\" followed by the landing page contents,\n",
"        then a \"### Link: <type>\" section for each link from get_links(url)\n",
"        whose contents could be fetched. Links that fail to load are reported\n",
"        with print() and left out.\n",
"    \"\"\"\n",
"    result = \"Landing page:\\n\"\n",
"    result += fetch_website_contents(url)\n",
"    links = get_links(url)\n",
"\n",
"    # Tolerate a reply that is not the expected {\"links\": [...]} shape.\n",
"    entries = links.get(\"links\", []) if isinstance(links, dict) else []\n",
"    for link in entries:\n",
"        if not isinstance(link, dict) or not link.get(\"url\"):\n",
"            continue\n",
"        link_url = link[\"url\"]\n",
"        link_type = link.get(\"type\", \"page\")\n",
"        try:\n",
"            page_contents = fetch_website_contents(link_url)\n",
"        except Exception as e:\n",
"            print(f\"Omitted link: {link_url}: {e}\")\n",
"            continue\n",
"        # Append the heading only after a successful fetch, so a failed link\n",
"        # does not leave an empty section behind.\n",
"        result += f\"\\n\\n### Link: {link_type}\\n\"\n",
"        result += page_contents\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5337019a-b789-49d7-bf10-0f15148c0276",
"metadata": {},
"outputs": [],
"source": [
"# System prompt for the brochure call, assembled line by line: the role\n",
"# description, a blank line, then the numbered section outline.\n",
"system_prompt = \"\\n\".join([\n",
"    (\n",
"        \"You are an assistant that analyzes the contents of several relevant pages from a company website \"\n",
"        \"and creates a great type of brochure about the company for prospective customers, investors, and recruits. \"\n",
"        \"Respond in markdown. Include details of company culture, customers, and careers/jobs if you have the information. Add emoticons where ever possible.\"\n",
"    ),\n",
"    \"\",\n",
"    \"Please structure the brochure using the following sections:\",\n",
"    \"1. **Introduction**: A brief overview of the company.\",\n",
"    \"2. **Company Culture**: Emphasize fun, atmosphere, and any unique cultural elements.\",\n",
"    \"3. **Customers**: Mention notable customers or industries.\",\n",
"    \"4. **Careers/Jobs**: Highlight career opportunities.\",\n",
"    \"5. **Conclusion**: Wrap up with a final lighthearted message.\",\n",
"    \"\",\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1dd4f2d4-8189-452a-b15a-c09ae5894ac8",
"metadata": {},
"outputs": [],
"source": [
"def get_brochure_user_prompt(company_name, url, max_chars=20_000):\n",
"    \"\"\"Build the user prompt for the brochure request.\n",
"\n",
"    Args:\n",
"        company_name: Name of the company, inserted into the prompt.\n",
"        url: Landing page address passed to get_all_details().\n",
"        max_chars: Maximum length of the returned prompt. Defaults to 20000,\n",
"            the limit that used to be hard-coded.\n",
"\n",
"    Returns:\n",
"        The instructions followed by the gathered page contents, truncated\n",
"        to at most max_chars characters.\n",
"    \"\"\"\n",
"    user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
"    user_prompt += \"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
"    user_prompt += get_all_details(url)\n",
"    # Truncate to bound the size of the request.\n",
"    return user_prompt[:max_chars]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ab4bfef-eb22-43fb-8a46-f1f6a225793b",
"metadata": {},
"outputs": [],
"source": [
"def _strip_markdown_fence(text):\n",
"    \"\"\"Remove a wrapping ``` / ```markdown code fence from text, if present.\"\"\"\n",
"    stripped = text.strip()\n",
"    if stripped.startswith(\"```\"):\n",
"        # Drop the whole opening fence line, including any language tag.\n",
"        _, _, stripped = stripped.partition(\"\\n\")\n",
"    if stripped.endswith(\"```\"):\n",
"        stripped = stripped[:-3]\n",
"    return stripped\n",
"\n",
"\n",
"def stream_brochure():\n",
"    \"\"\"Stream a brochure for the global company_name / url and render it live.\n",
"\n",
"    The text received so far is re-rendered as Markdown after every chunk.\n",
"    Only a code fence wrapping the whole reply is removed; the word\n",
"    \"markdown\" and backticks inside the brochure itself are left alone.\n",
"    The cleaned text is stored in the global brochure_text, which\n",
"    user_translate_brochure() reads.\n",
"    \"\"\"\n",
"    global brochure_text\n",
"    brochure_text = \"\"\n",
"\n",
"    stream = openai.chat.completions.create(\n",
"        model=MODEL,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system_prompt},\n",
"            {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)},\n",
"        ],\n",
"        stream=True,\n",
"    )\n",
"\n",
"    response = \"\"\n",
"    display_handle = display(Markdown(\"\"), display_id=True)\n",
"    for chunk in stream:\n",
"        # A streamed chunk can arrive with an empty choices list (for example a\n",
"        # usage-only chunk); indexing [0] on it would raise IndexError.\n",
"        if not chunk.choices:\n",
"            continue\n",
"        response += chunk.choices[0].delta.content or ''\n",
"        brochure_text = _strip_markdown_fence(response)\n",
"        update_display(Markdown(brochure_text), display_id=display_handle.display_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7828c747-7872-48e2-b3e6-faab95ba76cb",
"metadata": {},
"outputs": [],
"source": [
"def user_translate_brochure(language):\n",
"    \"\"\"Translate the most recently generated brochure and stream the result.\n",
"\n",
"    Args:\n",
"        language: Name of the target language, inserted into the prompt.\n",
"\n",
"    Reads the global brochure_text set by stream_brochure(). If it is missing\n",
"    or blank, a message is printed and nothing is sent to the model; the\n",
"    existing cell output is left in place in that case.\n",
"    \"\"\"\n",
"    # Look the global up defensively: it only exists once stream_brochure() ran.\n",
"    source_text = globals().get(\"brochure_text\", \"\")\n",
"    if not source_text.strip():\n",
"        print(\"No brochure to translate yet - run stream_brochure() first.\")\n",
"        return\n",
"\n",
"    clear_output(wait=True)\n",
"\n",
"    translation_stream = openai.chat.completions.create(\n",
"        model=MODEL,\n",
"        messages=[\n",
"            {\"role\": \"user\", \"content\": f\"Translate the following to {language}:\\n {source_text}\"}\n",
"        ],\n",
"        stream=True,\n",
"    )\n",
"\n",
"    display_handle = display(Markdown(\"\"), display_id=True)\n",
"    translated_text = \"\"\n",
"\n",
"    for chunk in translation_stream:\n",
"        # A streamed chunk can arrive with an empty choices list; skip it\n",
"        # instead of raising IndexError.\n",
"        if not chunk.choices:\n",
"            continue\n",
"        content = chunk.choices[0].delta.content or \"\"\n",
"        if content:\n",
"            translated_text += content\n",
"            update_display(Markdown(translated_text), display_id=display_handle.display_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6cfa92a-8a86-485d-a7e1-1651705ee6dc",
"metadata": {},
"outputs": [],
"source": [
"# Generate the brochure, then offer a translation of it.\n",
"stream_brochure()\n",
"language_choice = input(\"Enter the language to translate the brochure into (e.g., 'French'): \").strip()\n",
"\n",
"# A blank answer would produce a translation prompt with no target language,\n",
"# so the translation step is skipped in that case.\n",
"if language_choice:\n",
"    user_translate_brochure(language_choice)\n",
"else:\n",
"    print(\"No language entered - skipping translation.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}