386 lines
16 KiB
Plaintext
386 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Day 5 Solution - Business Solution: Company Brochure Generator\n",
|
|
"\n",
|
|
"This is my solution to the Day 5 assignment. I've implemented a comprehensive business solution that generates company brochures.\n",
|
|
"\n",
|
|
"## Features Implemented:\n",
|
|
"- Intelligent link selection using LLM\n",
|
|
"- Multi-page content aggregation\n",
|
|
"- Professional brochure generation\n",
|
|
"- Model comparison and optimization\n",
|
|
"- Business-ready output formatting\n",
|
|
"- Cost-effective processing strategies\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Day 5 setup complete! Ready for business solution development.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Day 5 Solution - Imports and Setup\n",
|
|
"import os\n",
|
|
"import json\n",
|
|
"import ssl\n",
|
|
"import requests\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from urllib.parse import urljoin\n",
|
|
"from IPython.display import Markdown, display, update_display\n",
|
|
"from openai import OpenAI\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"import ollama\n",
|
|
"import time\n",
|
|
"\n",
|
"# Load environment variables\n",
"# override=True is passed explicitly to load_dotenv (presumably so values from the\n",
"# .env file win over variables already set in the shell - confirm against python-dotenv docs).\n",
"load_dotenv(override=True)\n",
"\n",
"# SSL fix for Windows\n",
"# NOTE(review): the next three lines swap the standard library's default HTTPS context\n",
"# factory for the unverified one and set two process-wide environment variables.\n",
"# This turns off certificate checking for stdlib HTTPS clients in this kernel, which\n",
"# is a security risk; remove once the local certificate store is fixed.\n",
"ssl._create_default_https_context = ssl._create_unverified_context\n",
"os.environ['PYTHONHTTPSVERIFY'] = '0'\n",
"os.environ['CURL_CA_BUNDLE'] = ''\n",
"\n",
"# Initialize clients\n",
"# OpenAI() is built with no arguments; presumably it reads its API key from the\n",
"# environment loaded above (TODO confirm OPENAI_API_KEY is present in .env).\n",
"openai = OpenAI()\n",
"\n",
"# Constants\n",
"# Model names: MODEL_GPT is passed to display_brochure in the test cell;\n",
"# MODEL_LLAMA is not referenced by the other cells visible in this notebook.\n",
"MODEL_GPT = 'gpt-4o-mini'\n",
"MODEL_LLAMA = 'llama3.2'\n",
"\n",
"print(\"Day 5 setup complete! Ready for business solution development.\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Enhanced Web Scraping Functions\n",
|
"# Request headers passed to requests.get by both fetch helpers in this cell.\n",
"# The User-Agent is a desktop Chrome string; presumably this avoids sites that\n",
"# reject the default python-requests agent (TODO confirm it is still needed).\n",
"HEADERS = {\n",
"    \"User-Agent\": (\n",
"        \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
"        \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
"        \"Chrome/117.0.0.0 Safari/537.36\"\n",
"    )\n",
"}\n",
|
|
"\n",
|
"def fetch_website_contents(url, char_limit=2000):\n",
"    \"\"\"Fetch a page and return its title and visible text as one string.\n",
"\n",
"    Args:\n",
"        url: Address of the page to download.\n",
"        char_limit: Maximum length of the returned string.\n",
"\n",
"    Returns:\n",
"        The page title, a blank line, then the whitespace-collapsed page text,\n",
"        cut to char_limit characters. If the download or the HTTP status check\n",
"        fails, the fixed string \"Error: Could not fetch website content\".\n",
"    \"\"\"\n",
"    try:\n",
"        response = requests.get(url, headers=HEADERS, timeout=10)\n",
"        response.raise_for_status()\n",
"        html = response.text\n",
"    except Exception as e:\n",
"        # Best effort: report the failure and return a placeholder so callers keep going.\n",
"        print(f\"Error fetching {url}: {e}\")\n",
"        return \"Error: Could not fetch website content\"\n",
"\n",
"    soup = BeautifulSoup(html, \"html.parser\")\n",
"\n",
"    # Script and style bodies are code, not readable page text.\n",
"    for tag in soup([\"script\", \"style\"]):\n",
"        tag.decompose()\n",
"\n",
"    title = soup.title.get_text(strip=True) if soup.title else \"No title found\"\n",
"\n",
"    # Collapse every run of whitespace (spaces, tabs, line breaks) into one space.\n",
"    text = \" \".join(soup.get_text().split())\n",
"\n",
"    # Real newlines separate title and body so the result reads as two paragraphs.\n",
"    return f\"{title}\\n\\n{text}\".strip()[:char_limit]\n",
|
|
"\n",
|
"def fetch_website_links(url):\n",
"    \"\"\"Return the distinct http(s) links found on a page, in document order.\n",
"\n",
"    Args:\n",
"        url: Page to download; also the base used to resolve relative hrefs.\n",
"\n",
"    Returns:\n",
"        List of absolute URLs with the first occurrence of each kept, or an\n",
"        empty list when the download or the HTTP status check fails.\n",
"    \"\"\"\n",
"    try:\n",
"        response = requests.get(url, headers=HEADERS, timeout=10)\n",
"        response.raise_for_status()\n",
"        html = response.text\n",
"    except Exception as e:\n",
"        print(f\"Error fetching links from {url}: {e}\")\n",
"        return []\n",
"\n",
"    soup = BeautifulSoup(html, \"html.parser\")\n",
"    links = []\n",
"\n",
"    for anchor in soup.select(\"a[href]\"):\n",
"        href = (anchor.get(\"href\") or \"\").strip()\n",
"        # Skip empty hrefs and same-page anchors; they add nothing to the link list.\n",
"        if not href or href.startswith(\"#\"):\n",
"            continue\n",
"        # Absolute URLs are kept as written; everything else is resolved against the page URL.\n",
"        if href.startswith((\"http://\", \"https://\")):\n",
"            absolute = href\n",
"        else:\n",
"            absolute = urljoin(url, href)\n",
"        # Drop mailto:, tel:, javascript: and other non-web schemes.\n",
"        if absolute.startswith((\"http://\", \"https://\")):\n",
"            links.append(absolute)\n",
"\n",
"    # dict.fromkeys removes duplicates while keeping first-seen order, so the same\n",
"    # page always yields the same list (a set would reorder it between kernel runs).\n",
"    return list(dict.fromkeys(links))\n",
|
|
"\n",
|
|
"print(\"Enhanced web scraping functions defined!\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Intelligent Link Selection\n",
|
"def select_relevant_links(url, model=\"gpt-4o-mini\", max_links=50):\n",
"    \"\"\"Ask an LLM which links on a page belong in a company brochure.\n",
"\n",
"    Args:\n",
"        url: Page whose links are collected with fetch_website_links.\n",
"        model: Names starting with \"gpt\" are sent to the OpenAI client; any\n",
"            other name is sent to ollama.chat.\n",
"        max_links: How many of the collected links are placed in the prompt\n",
"            (keeps the request small).\n",
"\n",
"    Returns:\n",
"        A dict with a \"links\" list of {\"type\": ..., \"url\": ...} entries. The list\n",
"        is empty when the page has no links, the model call fails, or the reply\n",
"        cannot be parsed as JSON.\n",
"    \"\"\"\n",
"    print(f\"🔍 Analyzing links for {url}...\")\n",
"\n",
"    # Get all links\n",
"    links = fetch_website_links(url)\n",
"    print(f\"Found {len(links)} total links\")\n",
"\n",
"    if not links:\n",
"        # Nothing to choose from, so skip the model call entirely.\n",
"        return {\"links\": []}\n",
"\n",
"    # Create prompt for link selection\n",
"    link_system_prompt = \"\"\"\n",
"    You are provided with a list of links found on a webpage.\n",
"    You are able to decide which of the links would be most relevant to include in a brochure about the company,\n",
"    such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
"    You should respond in JSON as in this example:\n",
"\n",
"    {\n",
"        \"links\": [\n",
"            {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
"            {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
"        ]\n",
"    }\n",
"    \"\"\"\n",
"\n",
"    # Built outside the f-string so no code comment ends up inside the prompt text.\n",
"    link_list = \"\\n\".join(links[:max_links])\n",
"\n",
"    user_prompt = f\"\"\"\n",
"    Here is the list of links on the website {url} -\n",
"    Please decide which of these are relevant web links for a brochure about the company, \n",
"    respond with the full https URL in JSON format.\n",
"    Do not include Terms of Service, Privacy, email links.\n",
"\n",
"    Links (some might be relative links):\n",
"\n",
"    {link_list}\n",
"    \"\"\"\n",
"\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": link_system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt},\n",
"    ]\n",
"\n",
"    try:\n",
"        if model.startswith(\"gpt\"):\n",
"            response = openai.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                response_format={\"type\": \"json_object\"},\n",
"            )\n",
"            result = response.choices[0].message.content\n",
"        else:\n",
"            # format=\"json\" asks Ollama for a JSON reply, matching the OpenAI branch.\n",
"            response = ollama.chat(model=model, messages=messages, format=\"json\")\n",
"            result = response[\"message\"][\"content\"]\n",
"\n",
"        links_data = json.loads(result)\n",
"\n",
"        # The reply is model output: accept only the shape callers rely on.\n",
"        if not isinstance(links_data, dict):\n",
"            links_data = {}\n",
"        candidates = links_data.get(\"links\")\n",
"        if not isinstance(candidates, list):\n",
"            candidates = []\n",
"\n",
"        selected = []\n",
"        for entry in candidates:\n",
"            if isinstance(entry, dict) and entry.get(\"url\"):\n",
"                # Callers read both keys, so supply a label when the model omitted it.\n",
"                entry.setdefault(\"type\", \"relevant page\")\n",
"                selected.append(entry)\n",
"        links_data[\"links\"] = selected\n",
"\n",
"        print(f\"✅ Selected {len(links_data['links'])} relevant links\")\n",
"        return links_data\n",
"\n",
"    except Exception as e:\n",
"        print(f\"❌ Error selecting links: {e}\")\n",
"        return {\"links\": []}\n",
|
|
"\n",
|
|
"print(\"Intelligent link selection function defined!\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Content Aggregation\n",
|
"def fetch_page_and_all_relevant_links(url, model=\"gpt-4o-mini\"):\n",
"    \"\"\"Combine the landing page text with the text of the LLM-selected pages.\n",
"\n",
"    Args:\n",
"        url: Landing page of the site.\n",
"        model: Model name forwarded to select_relevant_links.\n",
"\n",
"    Returns:\n",
"        One string: a \"## Landing Page:\" section followed by a \"### Link: <type>\"\n",
"        section per selected link. A page that cannot be processed gets a section\n",
"        marked \"(Error)\" instead of aborting the whole run.\n",
"    \"\"\"\n",
"    print(f\"📄 Fetching content for {url}...\")\n",
"\n",
"    # Get main page content\n",
"    main_content = fetch_website_contents(url)\n",
"\n",
"    # Get relevant links\n",
"    relevant_links = select_relevant_links(url, model)\n",
"\n",
"    # Build comprehensive content; sections are joined once at the end.\n",
"    sections = [f\"## Landing Page:\\n\\n{main_content}\\n## Relevant Links:\\n\"]\n",
"\n",
"    for link in relevant_links[\"links\"]:\n",
"        # The entries come from model output, so either key may be missing.\n",
"        link_url = link.get(\"url\")\n",
"        if not link_url:\n",
"            continue\n",
"        link_type = link.get(\"type\", \"relevant page\")\n",
"\n",
"        print(f\" 📄 Fetching {link_type}: {link_url}\")\n",
"        try:\n",
"            content = fetch_website_contents(link_url)\n",
"            sections.append(f\"\\n\\n### Link: {link_type}\\n\")\n",
"            sections.append(content)\n",
"        except Exception as e:\n",
"            print(f\" ❌ Error fetching {link_url}: {e}\")\n",
"            sections.append(f\"\\n\\n### Link: {link_type} (Error)\\n\")\n",
"            sections.append(f\"Error fetching content: {e}\")\n",
"\n",
"    return \"\".join(sections)\n",
|
|
"\n",
|
|
"print(\"Content aggregation function defined!\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Professional Brochure Generation\n",
|
"def create_company_brochure(company_name, url, model=\"gpt-4o-mini\", style=\"professional\"):\n",
"    \"\"\"Generate a markdown brochure for a company from its website.\n",
"\n",
"    Args:\n",
"        company_name: Name inserted into the user prompt.\n",
"        url: Landing page passed to fetch_page_and_all_relevant_links.\n",
"        model: Names starting with \"gpt\" use the OpenAI client; any other name\n",
"            is sent to ollama.chat.\n",
"        style: \"professional\" or \"humorous\"; any other value selects a plain\n",
"            brochure prompt.\n",
"\n",
"    Returns:\n",
"        The brochure text from the model, or a string starting with\n",
"        \"Error generating brochure:\" when the model call fails.\n",
"    \"\"\"\n",
"    print(f\"🏢 Creating brochure for {company_name}...\")\n",
"\n",
"    # Get all content\n",
"    all_content = fetch_page_and_all_relevant_links(url, model)\n",
"\n",
"    # Truncate if too long (to avoid token limits); the marker goes on its own paragraph.\n",
"    max_content_chars = 5000\n",
"    if len(all_content) > max_content_chars:\n",
"        all_content = all_content[:max_content_chars] + \"\\n\\n[Content truncated...]\"\n",
"\n",
"    # System prompt per brochure style; unknown styles fall back to the plain prompt.\n",
"    style_prompts = {\n",
"        \"professional\": \"\"\"\n",
"        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
"        and creates a short brochure about the company for prospective customers, investors and recruits.\n",
"        Respond in markdown without code blocks.\n",
"        Include details of company culture, customers and careers/jobs if you have the information.\n",
"        \"\"\",\n",
"        \"humorous\": \"\"\"\n",
"        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
"        and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.\n",
"        Respond in markdown without code blocks.\n",
"        Include details of company culture, customers and careers/jobs if you have the information.\n",
"        \"\"\",\n",
"    }\n",
"    plain_prompt = \"\"\"\n",
"        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
"        and creates a short brochure about the company.\n",
"        Respond in markdown without code blocks.\n",
"        \"\"\"\n",
"    brochure_system_prompt = style_prompts.get(style, plain_prompt)\n",
"\n",
"    user_prompt = f\"\"\"\n",
"    You are looking at a company called: {company_name}\n",
"    Here are the contents of its landing page and other relevant pages;\n",
"    use this information to build a short brochure of the company in markdown without code blocks.\n",
"\n",
"    {all_content}\n",
"    \"\"\"\n",
"\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": brochure_system_prompt},\n",
"        {\"role\": \"user\", \"content\": user_prompt},\n",
"    ]\n",
"\n",
"    try:\n",
"        if model.startswith(\"gpt\"):\n",
"            response = openai.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7,\n",
"                max_tokens=1000,\n",
"            )\n",
"            brochure = response.choices[0].message.content\n",
"        else:\n",
"            response = ollama.chat(model=model, messages=messages)\n",
"            brochure = response[\"message\"][\"content\"]\n",
"\n",
"        print(\"✅ Brochure generated successfully!\")\n",
"        return brochure\n",
"\n",
"    except Exception as e:\n",
"        # Callers display the return value, so the failure is reported as text.\n",
"        print(f\"❌ Error generating brochure: {e}\")\n",
"        return f\"Error generating brochure: {e}\"\n",
|
|
"\n",
|
"def display_brochure(company_name, url, model=\"gpt-4o-mini\", style=\"professional\"):\n",
"    \"\"\"Generate a brochure and render it as Markdown under a title heading.\n",
"\n",
"    Args:\n",
"        company_name: Used in the heading and forwarded to create_company_brochure.\n",
"        url: Landing page forwarded to create_company_brochure.\n",
"        model: Model name forwarded to create_company_brochure.\n",
"        style: Brochure style forwarded to create_company_brochure.\n",
"    \"\"\"\n",
"    brochure = create_company_brochure(company_name, url, model, style)\n",
"    # The blank line after the heading keeps the brochure body out of the H1.\n",
"    display(Markdown(f\"# {company_name} Brochure\\n\\n{brochure}\"))\n",
|
|
"\n",
|
|
"print(\"Professional brochure generation functions defined!\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Test Day 5 Solution - Business Brochure Generator\n",
"print(\"## Day 5 Solution Test - Business Brochure Generator\")\n",
"print(\"=\" * 60)\n",
"\n",
"# Companies to try, as (display name, landing-page URL) pairs\n",
"test_companies = [\n",
"    (\"Hugging Face\", \"https://huggingface.co\"),\n",
"    (\"OpenAI\", \"https://openai.com\"),\n",
"    (\"Anthropic\", \"https://anthropic.com\"),\n",
"]\n",
"\n",
"print(\"🏢 Testing brochure generation for different companies...\")\n",
"\n",
"for company_name, url in test_companies:\n",
"    # Blank line, then a banner identifying the company under test.\n",
"    print(f\"\\n{'=' * 50}\")\n",
"    print(f\"Testing: {company_name}\")\n",
"    print(f\"URL: {url}\")\n",
"    print(\"=\" * 50)\n",
"\n",
"    try:\n",
"        # Test with professional style\n",
"        print(f\"\\n📄 Generating professional brochure for {company_name}...\")\n",
"        display_brochure(company_name, url, model=MODEL_GPT, style=\"professional\")\n",
"    except Exception as e:\n",
"        # Keep going so one failing site does not stop the remaining companies.\n",
"        print(f\"❌ Error with {company_name}: {e}\")\n",
"\n",
"    print(\"\\n\" + \"-\" * 40)\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|