LLM_Engineering_OLD/week1/my-solutions/day5-solution.ipynb

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Day 5 Solution - Business Solution: Company Brochure Generator\n",
        "\n",
        "This is my solution to the Day 5 assignment. I've implemented a comprehensive business solution that generates company brochures.\n",
        "\n",
        "## Features Implemented:\n",
        "- Intelligent link selection using LLM\n",
        "- Multi-page content aggregation\n",
        "- Professional brochure generation\n",
        "- Model comparison and optimization\n",
        "- Business-ready output formatting\n",
        "- Cost-effective processing strategies\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Day 5 setup complete! Ready for business solution development.\n"
          ]
        }
      ],
      "source": [
        "# Day 5 Solution - Imports and Setup\n",
        "import os\n",
        "import json\n",
        "import ssl\n",
        "import requests\n",
        "from bs4 import BeautifulSoup\n",
        "from urllib.parse import urljoin\n",
        "from IPython.display import Markdown, display, update_display\n",
        "from openai import OpenAI\n",
        "from dotenv import load_dotenv\n",
        "import ollama\n",
        "import time\n",
        "\n",
        "# Load environment variables\n",
        "load_dotenv(override=True)\n",
        "\n",
        "# SSL fix for Windows\n",
        "ssl._create_default_https_context = ssl._create_unverified_context\n",
        "os.environ['PYTHONHTTPSVERIFY'] = '0'\n",
        "os.environ['CURL_CA_BUNDLE'] = ''\n",
        "\n",
        "# Initialize clients\n",
        "openai = OpenAI()\n",
        "\n",
        "# Constants\n",
        "MODEL_GPT = 'gpt-4o-mini'\n",
        "MODEL_LLAMA = 'llama3.2'\n",
        "\n",
        "print(\"Day 5 setup complete! Ready for business solution development.\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Enhanced Web Scraping Functions\n",
        "HEADERS = {\n",
        "    \"User-Agent\": (\n",
        "        \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
        "        \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
        "        \"Chrome/117.0.0.0 Safari/537.36\"\n",
        "    )\n",
        "}\n",
        "\n",
        "def fetch_website_contents(url, char_limit=2000):\n",
        "    \"\"\"Fetch and clean website content\"\"\"\n",
        "    try:\n",
        "        response = requests.get(url, headers=HEADERS, timeout=10)\n",
        "        response.raise_for_status()\n",
        "        html = response.text\n",
        "    except Exception as e:\n",
        "        print(f\"Error fetching {url}: {e}\")\n",
        "        return \"Error: Could not fetch website content\"\n",
        "    \n",
        "    soup = BeautifulSoup(html, \"html.parser\")\n",
        "    \n",
        "    # Remove script and style elements\n",
        "    for script in soup([\"script\", \"style\"]):\n",
        "        script.decompose()\n",
        "    \n",
        "    title = soup.title.get_text(strip=True) if soup.title else \"No title found\"\n",
        "    text = soup.get_text()\n",
        "    \n",
        "    # Clean up whitespace\n",
        "    lines = (line.strip() for line in text.splitlines())\n",
        "    chunks = (phrase.strip() for line in lines for phrase in line.split(\"  \"))\n",
        "    text = ' '.join(chunk for chunk in chunks if chunk)\n",
        "    \n",
        "    return (f\"{title}\\\\n\\\\n{text}\").strip()[:char_limit]\n",
        "\n",
        "def fetch_website_links(url):\n",
        "    \"\"\"Fetch all links from a website\"\"\"\n",
        "    try:\n",
        "        response = requests.get(url, headers=HEADERS, timeout=10)\n",
        "        response.raise_for_status()\n",
        "        html = response.text\n",
        "    except Exception as e:\n",
        "        print(f\"Error fetching links from {url}: {e}\")\n",
        "        return []\n",
        "    \n",
        "    soup = BeautifulSoup(html, \"html.parser\")\n",
        "    links = []\n",
        "    \n",
        "    for a in soup.select(\"a[href]\"):\n",
        "        href = a.get(\"href\")\n",
        "        if href:\n",
        "            # Convert relative URLs to absolute\n",
        "            if href.startswith((\"http://\", \"https://\")):\n",
        "                links.append(href)\n",
        "            else:\n",
        "                links.append(urljoin(url, href))\n",
        "    \n",
        "    return list(set(links))  # Remove duplicates\n",
        "\n",
        "print(\"Enhanced web scraping functions defined!\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Intelligent Link Selection\n",
        "def select_relevant_links(url, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"Use LLM to select relevant links for brochure generation\"\"\"\n",
        "    print(f\"🔍 Analyzing links for {url}...\")\n",
        "    \n",
        "    # Get all links\n",
        "    links = fetch_website_links(url)\n",
        "    print(f\"Found {len(links)} total links\")\n",
        "    \n",
        "    # Create prompt for link selection\n",
        "    link_system_prompt = \"\"\"\n",
        "    You are provided with a list of links found on a webpage.\n",
        "    You are able to decide which of the links would be most relevant to include in a brochure about the company,\n",
        "    such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
        "    You should respond in JSON as in this example:\n",
        "\n",
        "    {\n",
        "        \"links\": [\n",
        "            {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
        "            {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
        "        ]\n",
        "    }\n",
        "    \"\"\"\n",
        "    \n",
        "    user_prompt = f\"\"\"\n",
        "    Here is the list of links on the website {url} -\n",
        "    Please decide which of these are relevant web links for a brochure about the company, \n",
        "    respond with the full https URL in JSON format.\n",
        "    Do not include Terms of Service, Privacy, email links.\n",
        "\n",
        "    Links (some might be relative links):\n",
        "\n",
        "    {chr(10).join(links[:50])}  # Limit to first 50 links to avoid token limits\n",
        "    \"\"\"\n",
        "    \n",
        "    try:\n",
        "        if model.startswith(\"gpt\"):\n",
        "            response = openai.chat.completions.create(\n",
        "                model=model,\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": link_system_prompt},\n",
        "                    {\"role\": \"user\", \"content\": user_prompt}\n",
        "                ],\n",
        "                response_format={\"type\": \"json_object\"}\n",
        "            )\n",
        "            result = response.choices[0].message.content\n",
        "        else:\n",
        "            response = ollama.chat(\n",
        "                model=model,\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": link_system_prompt},\n",
        "                    {\"role\": \"user\", \"content\": user_prompt}\n",
        "                ]\n",
        "            )\n",
        "            result = response['message']['content']\n",
        "        \n",
        "        links_data = json.loads(result)\n",
        "        print(f\"✅ Selected {len(links_data['links'])} relevant links\")\n",
        "        return links_data\n",
        "        \n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error selecting links: {e}\")\n",
        "        return {\"links\": []}\n",
        "\n",
        "print(\"Intelligent link selection function defined!\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Content Aggregation\n",
        "def fetch_page_and_all_relevant_links(url, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"Fetch main page content and all relevant linked pages\"\"\"\n",
        "    print(f\"📄 Fetching content for {url}...\")\n",
        "    \n",
        "    # Get main page content\n",
        "    main_content = fetch_website_contents(url)\n",
        "    \n",
        "    # Get relevant links\n",
        "    relevant_links = select_relevant_links(url, model)\n",
        "    \n",
        "    # Build comprehensive content\n",
        "    result = f\"## Landing Page:\\\\n\\\\n{main_content}\\\\n## Relevant Links:\\\\n\"\n",
        "    \n",
        "    for link in relevant_links['links']:\n",
        "        print(f\"  📄 Fetching {link['type']}: {link['url']}\")\n",
        "        try:\n",
        "            content = fetch_website_contents(link[\"url\"])\n",
        "            result += f\"\\\\n\\\\n### Link: {link['type']}\\\\n\"\n",
        "            result += content\n",
        "        except Exception as e:\n",
        "            print(f\"    ❌ Error fetching {link['url']}: {e}\")\n",
        "            result += f\"\\\\n\\\\n### Link: {link['type']} (Error)\\\\n\"\n",
        "            result += f\"Error fetching content: {e}\"\n",
        "    \n",
        "    return result\n",
        "\n",
        "print(\"Content aggregation function defined!\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Professional Brochure Generation\n",
        "def create_company_brochure(company_name, url, model=\"gpt-4o-mini\", style=\"professional\"):\n",
        "    \"\"\"Generate a professional company brochure\"\"\"\n",
        "    print(f\"🏢 Creating brochure for {company_name}...\")\n",
        "    \n",
        "    # Get all content\n",
        "    all_content = fetch_page_and_all_relevant_links(url, model)\n",
        "    \n",
        "    # Truncate if too long (to avoid token limits)\n",
        "    if len(all_content) > 5000:\n",
        "        all_content = all_content[:5000] + \"\\\\n\\\\n[Content truncated...]\"\n",
        "    \n",
        "    # Define brochure system prompt based on style\n",
        "    if style == \"professional\":\n",
        "        brochure_system_prompt = \"\"\"\n",
        "        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
        "        and creates a short brochure about the company for prospective customers, investors and recruits.\n",
        "        Respond in markdown without code blocks.\n",
        "        Include details of company culture, customers and careers/jobs if you have the information.\n",
        "        \"\"\"\n",
        "    elif style == \"humorous\":\n",
        "        brochure_system_prompt = \"\"\"\n",
        "        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
        "        and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.\n",
        "        Respond in markdown without code blocks.\n",
        "        Include details of company culture, customers and careers/jobs if you have the information.\n",
        "        \"\"\"\n",
        "    else:\n",
        "        brochure_system_prompt = \"\"\"\n",
        "        You are an assistant that analyzes the contents of several relevant pages from a company website\n",
        "        and creates a short brochure about the company.\n",
        "        Respond in markdown without code blocks.\n",
        "        \"\"\"\n",
        "    \n",
        "    user_prompt = f\"\"\"\n",
        "    You are looking at a company called: {company_name}\n",
        "    Here are the contents of its landing page and other relevant pages;\n",
        "    use this information to build a short brochure of the company in markdown without code blocks.\n",
        "\n",
        "    {all_content}\n",
        "    \"\"\"\n",
        "    \n",
        "    try:\n",
        "        if model.startswith(\"gpt\"):\n",
        "            response = openai.chat.completions.create(\n",
        "                model=model,\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": brochure_system_prompt},\n",
        "                    {\"role\": \"user\", \"content\": user_prompt}\n",
        "                ],\n",
        "                temperature=0.7,\n",
        "                max_tokens=1000\n",
        "            )\n",
        "            brochure = response.choices[0].message.content\n",
        "        else:\n",
        "            response = ollama.chat(\n",
        "                model=model,\n",
        "                messages=[\n",
        "                    {\"role\": \"system\", \"content\": brochure_system_prompt},\n",
        "                    {\"role\": \"user\", \"content\": user_prompt}\n",
        "                ]\n",
        "            )\n",
        "            brochure = response['message']['content']\n",
        "        \n",
        "        print(f\"✅ Brochure generated successfully!\")\n",
        "        return brochure\n",
        "        \n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error generating brochure: {e}\")\n",
        "        return f\"Error generating brochure: {e}\"\n",
        "\n",
        "def display_brochure(company_name, url, model=\"gpt-4o-mini\", style=\"professional\"):\n",
        "    \"\"\"Display a company brochure\"\"\"\n",
        "    brochure = create_company_brochure(company_name, url, model, style)\n",
        "    display(Markdown(f\"# {company_name} Brochure\\\\n\\\\n{brochure}\"))\n",
        "\n",
        "print(\"Professional brochure generation functions defined!\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Test Day 5 Solution - Business Brochure Generator\n",
        "print(\"## Day 5 Solution Test - Business Brochure Generator\")\n",
        "print(\"=\"*60)\n",
        "\n",
        "# Test with different companies\n",
        "test_companies = [\n",
        "    (\"Hugging Face\", \"https://huggingface.co\"),\n",
        "    (\"OpenAI\", \"https://openai.com\"),\n",
        "    (\"Anthropic\", \"https://anthropic.com\")\n",
        "]\n",
        "\n",
        "print(\"🏢 Testing brochure generation for different companies...\")\n",
        "\n",
        "for company_name, url in test_companies:\n",
        "    print(f\"\\\\n{'='*50}\")\n",
        "    print(f\"Testing: {company_name}\")\n",
        "    print(f\"URL: {url}\")\n",
        "    print('='*50)\n",
        "    \n",
        "    try:\n",
        "        # Test with professional style\n",
        "        print(f\"\\\\n📄 Generating professional brochure for {company_name}...\")\n",
        "        display_brochure(company_name, url, model=MODEL_GPT, style=\"professional\")\n",
        "        \n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error with {company_name}: {e}\")\n",
        "    \n",
        "    print(\"\\\\n\" + \"-\"*40)\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}