{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1054e1c9-142a-4059-bfe6-f9be6073fb72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt\n",
    "\n",
    "import os\n",
    "import requests\n",
    "import json\n",
    "from typing import List\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from openai import OpenAI\n",
    "import ollama"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9e59a6ba-d7e1-4834-b3ff-86321e354ade",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "MODEL = \"llama3.2\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0ea82fa1-0986-4749-9d7e-d6a23dd88722",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A class to represent a Webpage\n",
    "\n",
    "# Some websites need you to use proper headers when fetching them:\n",
    "headers = {\n",
    " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "}\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A utility class to represent a Website that we have scraped, now with links\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        response = requests.get(url, headers=headers)\n",
    "        self.body = response.content\n",
    "        soup = BeautifulSoup(self.body, 'html.parser')\n",
    "        self.title = soup.title.string if soup.title else \"No title found\"\n",
    "        if soup.body:\n",
    "            for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "                irrelevant.decompose()\n",
    "            self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
    "        else:\n",
    "            self.text = \"\"\n",
    "        links = [link.get('href') for link in soup.find_all('a')]\n",
    "        self.links = [link for link in links if link]\n",
    "\n",
    "    def get_contents(self):\n",
    "        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2351a604-c280-48fb-84d2-272512535414",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://edwarddonner.com/',\n",
       " 'https://edwarddonner.com/connect-four/',\n",
       " 'https://edwarddonner.com/outsmart/',\n",
       " 'https://edwarddonner.com/about-me-and-about-nebula/',\n",
       " 'https://edwarddonner.com/posts/',\n",
       " 'https://edwarddonner.com/',\n",
       " 'https://news.ycombinator.com',\n",
       " 'https://nebula.io/?utm_source=ed&utm_medium=referral',\n",
       " 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',\n",
       " 'https://patents.google.com/patent/US20210049536A1/',\n",
       " 'https://www.linkedin.com/in/eddonner/',\n",
       " 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',\n",
       " 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',\n",
       " 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',\n",
       " 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',\n",
       " 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',\n",
       " 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',\n",
       " 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',\n",
       " 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',\n",
       " 'https://edwarddonner.com/',\n",
       " 'https://edwarddonner.com/connect-four/',\n",
       " 'https://edwarddonner.com/outsmart/',\n",
       " 'https://edwarddonner.com/about-me-and-about-nebula/',\n",
       " 'https://edwarddonner.com/posts/',\n",
       " 'mailto:hello@mygroovydomain.com',\n",
       " 'https://www.linkedin.com/in/eddonner/',\n",
       " 'https://twitter.com/edwarddonner',\n",
       " 'https://www.facebook.com/edward.donner.52']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ed = Website(\"https://edwarddonner.com\")\n",
    "ed.links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e2dd2206-0343-4bf2-8037-de587ff6fe10",
   "metadata": {},
   "outputs": [],
   "source": [
    "link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
    "link_system_prompt += \"You should respond in JSON as in this example:\"\n",
    "link_system_prompt += \"\"\"\n",
    "{\n",
    "    \"links\": [\n",
    "        {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
    "        {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
    "    ]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d891f202-352c-4f93-97c4-ab773daacc60",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
      "You should respond in JSON as in this example:\n",
      "{\n",
      "    \"links\": [\n",
      "        {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
      "        {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
      "    ]\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(link_system_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "89be55aa-7236-4d3c-8459-b9c992cd68f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links_user_prompt(website):\n",
    "    user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
    "    user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
    "Do not include Terms of Service, Privacy, email links.\\n\"\n",
    "    user_prompt += \"Links (some might be relative links):\\n\"\n",
    "    user_prompt += \"\\n\".join(website.links)\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ec4ed9d2-9b54-4d33-adba-328b47cdde1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n",
      "Links (some might be relative links):\n",
      "https://edwarddonner.com/\n",
      "https://edwarddonner.com/connect-four/\n",
      "https://edwarddonner.com/outsmart/\n",
      "https://edwarddonner.com/about-me-and-about-nebula/\n",
      "https://edwarddonner.com/posts/\n",
      "https://edwarddonner.com/\n",
      "https://news.ycombinator.com\n",
      "https://nebula.io/?utm_source=ed&utm_medium=referral\n",
      "https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html\n",
      "https://patents.google.com/patent/US20210049536A1/\n",
      "https://www.linkedin.com/in/eddonner/\n",
      "https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/\n",
      "https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/\n",
      "https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/\n",
      "https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/\n",
      "https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/\n",
      "https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/\n",
      "https://edwarddonner.com/2024/11/13/llm-engineering-resources/\n",
      "https://edwarddonner.com/2024/11/13/llm-engineering-resources/\n",
      "https://edwarddonner.com/\n",
      "https://edwarddonner.com/connect-four/\n",
      "https://edwarddonner.com/outsmart/\n",
      "https://edwarddonner.com/about-me-and-about-nebula/\n",
      "https://edwarddonner.com/posts/\n",
      "mailto:hello@mygroovydomain.com\n",
      "https://www.linkedin.com/in/eddonner/\n",
      "https://twitter.com/edwarddonner\n",
      "https://www.facebook.com/edward.donner.52\n"
     ]
    }
   ],
   "source": [
    "print(get_links_user_prompt(ed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "228cdeea-5c05-45a4-8afe-e6ef8f02810a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import logging\n",
    "import pprint\n",
    "#pprint.pprint(response)\n",
    "\n",
    "import re\n",
    "\n",
    "def extract_json_from_text(text):\n",
    "    \"\"\"\n",
    "    Extract the first JSON object found in the text.\n",
    "    \"\"\"\n",
    "    match = re.search(r'\\{.*\\}', text, re.DOTALL)\n",
    "    if match:\n",
    "        return match.group(0)\n",
    "    return None\n",
    "\n",
    "def get_links(url):\n",
    "    website = Website(url)\n",
    "    \n",
    "    try:\n",
    "        response = ollama.chat(\n",
    "            model=\"llama3.2\",\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": link_system_prompt},\n",
    "                {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
    "            ]\n",
    "        )\n",
    "\n",
    "        result = response['message']['content']\n",
    "       \n",
    "        # Log the raw result for debugging\n",
    "        logging.debug(f\"Raw result: {result}\")\n",
    "\n",
    "       \n",
    "        if isinstance(result, str):\n",
    "            if not result.strip():\n",
    "                logging.warning(\"Result string is empty.\")\n",
    "                return None\n",
    "\n",
    "            json_text = extract_json_from_text(result)\n",
    "            if not json_text:\n",
    "                logging.warning(\"No JSON object found in the result string.\")\n",
    "                return None\n",
    "\n",
    "            logging.debug(f\"Extracted JSON string: {repr(json_text)}\")\n",
    "\n",
    "            try:\n",
    "                return json.loads(json_text)\n",
    "            except json.JSONDecodeError as e:\n",
    "                logging.error(f\"JSON decoding error: {e}\")\n",
    "                logging.debug(f\"Problematic JSON string: {repr(json_text)}\")\n",
    "                return None\n",
    "        \n",
    "    except Exception as e:\n",
    "        logging.exception(\"An unexpected error occurred in get_links.\")\n",
    "        return None\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3ce0b67e-8483-418a-bcf3-836910381e2d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'links': [{'type': 'About page', 'url': 'https://huggingface.co/'},\n",
       "  {'type': 'Company page', 'url': 'https://huggingface.co/'},\n",
       "  {'type': 'Careers/Jobs page',\n",
       "   'url': 'https://apply.workable.com/huggingface/'},\n",
       "  {'type': 'Blog page', 'url': 'https://blog.huggingface.co/'},\n",
       "  {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'},\n",
       "  {'type': 'Twitter handle', 'url': 'https://twitter.com/huggingface'},\n",
       "  {'type': 'LinkedIn company page',\n",
       "   'url': 'https://www.linkedin.com/company/huggingface/'}]}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_links(\"https://huggingface.co\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "aeb09b75-33ea-4638-bc01-6c3d738f0060",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "def is_url_reachable(url, timeout=5):\n",
    "    try:\n",
    "        response = requests.head(url, timeout=timeout)\n",
    "        return response.status_code < 400\n",
    "    except requests.RequestException:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5f2f9cc5-de4f-43d8-a803-97c11c7e91c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_details(url):\n",
    "    if is_url_reachable(url,5):\n",
    "        result = \"Landing page:\\n\"\n",
    "        result += Website(url).get_contents()\n",
    "        links = get_links(url)\n",
    "        print(\"Found links:\", links)\n",
    "        for link in links[\"links\"]:\n",
    "            result += f\"\\n\\n{link['type']}\\n\"\n",
    "            result += Website(link[\"url\"]).get_contents()\n",
    "        return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cd405ade-6b44-45c5-aeb4-724cf6cce8f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'About page', 'url': 'https://huggingface.co/about'}]}\n",
      "Landing page:\n",
      "Webpage Title:\n",
      "Hugging Face – The AI community building the future.\n",
      "Webpage Contents:\n",
      "Hugging Face\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Posts\n",
      "Docs\n",
      "Enterprise\n",
      "Pricing\n",
      "Log In\n",
      "Sign Up\n",
      "The AI community building the future.\n",
      "The platform where the machine learning community collaborates on models, datasets, and applications.\n",
      "Explore AI Apps\n",
      "or\n",
      "Browse 1M+ models\n",
      "Trending on\n",
      "this week\n",
      "Models\n",
      "nvidia/parakeet-tdt-0.6b-v2\n",
      "Updated\n",
      "about 12 hours ago\n",
      "•\n",
      "167k\n",
      "•\n",
      "868\n",
      "nari-labs/Dia-1.6B\n",
      "Updated\n",
      "2 days ago\n",
      "•\n",
      "173k\n",
      "•\n",
      "2.18k\n",
      "Lightricks/LTX-Video\n",
      "Updated\n",
      "1 day ago\n",
      "•\n",
      "291k\n",
      "•\n",
      "1.49k\n",
      "ACE-Step/ACE-Step-v1-3.5B\n",
      "Updated\n",
      "3 days ago\n",
      "•\n",
      "427\n",
      "lodestones/Chroma\n",
      "Updated\n",
      "2 days ago\n",
      "•\n",
      "538\n",
      "Browse 1M+ models\n",
      "Spaces\n",
      "Running\n",
      "6.59k\n",
      "6.59k\n",
      "DeepSite\n",
      "🐳\n",
      "Generate any application with DeepSeek\n",
      "Running\n",
      "on\n",
      "CPU Upgrade\n",
      "618\n",
      "618\n",
      "Computer Agent\n",
      "🖥\n",
      "Interact with an AI agent to perform web tasks\n",
      "Running\n",
      "on\n",
      "Zero\n",
      "333\n",
      "333\n",
      "DreamO\n",
      "🐨\n",
      "A Unified Framework for Image Customization\n",
      "Running\n",
      "368\n",
      "368\n",
      "FLUX Pro Unlimited\n",
      "🔥\n",
      "Use the FLUX-Pro model as much as you want.\n",
      "Running\n",
      "on\n",
      "Zero\n",
      "360\n",
      "360\n",
      "ACE Step\n",
      "😻\n",
      "A Step Towards Music Generation Foundation Model\n",
      "Browse 400k+ applications\n",
      "Datasets\n",
      "openbmb/Ultra-FineWeb\n",
      "Updated\n",
      "7 days ago\n",
      "•\n",
      "3.5k\n",
      "•\n",
      "59\n",
      "nvidia/OpenCodeReasoning\n",
      "Updated\n",
      "12 days ago\n",
      "•\n",
      "15.4k\n",
      "•\n",
      "410\n",
      "PrimeIntellect/INTELLECT-2-RL-Dataset\n",
      "Updated\n",
      "3 days ago\n",
      "•\n",
      "338\n",
      "•\n",
      "33\n",
      "nvidia/OpenMathReasoning\n",
      "Updated\n",
      "7 days ago\n",
      "•\n",
      "36.5k\n",
      "•\n",
      "227\n",
      "DMindAI/DMind_Benchmark\n",
      "Updated\n",
      "about 3 hours ago\n",
      "•\n",
      "2.16k\n",
      "•\n",
      "73\n",
      "Browse 250k+ datasets\n",
      "The Home of Machine Learning\n",
      "Create, discover and collaborate on ML better.\n",
      "The collaboration platform\n",
      "Host and collaborate on unlimited public models, datasets and applications.\n",
      "Move faster\n",
      "With the HF Open source stack.\n",
      "Explore all modalities\n",
      "Text, image, video, audio or even 3D.\n",
      "Build your portfolio\n",
      "Share your work with the world and build your ML profile.\n",
      "Sign Up\n",
      "Accelerate your ML\n",
      "We provide paid Compute and Enterprise solutions.\n",
      "Compute\n",
      "Deploy on optimized\n",
      "Inference Endpoints\n",
      "or update your\n",
      "Spaces applications\n",
      "to a GPU in a few clicks.\n",
      "View pricing\n",
      "Starting at $0.60/hour for GPU\n",
      "Enterprise\n",
      "Give your team the most advanced platform to build AI with enterprise-grade security, access controls and\n",
      "\t\t\tdedicated support.\n",
      "Getting started\n",
      "Starting at $20/user/month\n",
      "Single Sign-On\n",
      "Regions\n",
      "Priority Support\n",
      "Audit Logs\n",
      "Resource Groups\n",
      "Private Datasets Viewer\n",
      "More than 50,000 organizations are using Hugging Face\n",
      "Ai2\n",
      "Enterprise\n",
      "non-profit\n",
      "•\n",
      "757 models\n",
      "•\n",
      "3.27k followers\n",
      "AI at Meta\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "2.12k models\n",
      "•\n",
      "5.97k followers\n",
      "Amazon\n",
      "company\n",
      "•\n",
      "20 models\n",
      "•\n",
      "3.16k followers\n",
      "Google\n",
      "company\n",
      "•\n",
      "991 models\n",
      "•\n",
      "13.2k followers\n",
      "Intel\n",
      "company\n",
      "•\n",
      "221 models\n",
      "•\n",
      "2.54k followers\n",
      "Microsoft\n",
      "company\n",
      "•\n",
      "374 models\n",
      "•\n",
      "12.4k followers\n",
      "Grammarly\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "10 models\n",
      "•\n",
      "160 followers\n",
      "Writer\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "21 models\n",
      "•\n",
      "267 followers\n",
      "Our Open Source\n",
      "We are building the foundation of ML tooling with the community.\n",
      "Transformers\n",
      "144,388\n",
      "State-of-the-art ML for PyTorch, TensorFlow, JAX\n",
      "Diffusers\n",
      "29,004\n",
      "State-of-the-art Diffusion models in PyTorch\n",
      "Safetensors\n",
      "3,266\n",
      "Safe way to store/distribute neural network weights\n",
      "Hub Python Library\n",
      "2,600\n",
      "Python client to interact with the Hugging Face Hub\n",
      "Tokenizers\n",
      "9,684\n",
      "Fast tokenizers optimized for research & production\n",
      "TRL\n",
      "13,756\n",
      "Train transformers LMs with reinforcement learning\n",
      "Transformers.js\n",
      "13,601\n",
      "State-of-the-art ML running directly in your browser\n",
      "smolagents\n",
      "18,686\n",
      "Smol library to build great agents in Python\n",
      "PEFT\n",
      "18,418\n",
      "Parameter-efficient finetuning for large language models\n",
      "Datasets\n",
      "20,121\n",
      "Access & share datasets for any ML tasks\n",
      "Text Generation Inference\n",
      "10,119\n",
      "Serve language models with TGI optimized toolkit\n",
      "Accelerate\n",
      "8,722\n",
      "Train PyTorch models with multi-GPU, TPU, mixed precision\n",
      "System theme\n",
      "Website\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Tasks\n",
      "Inference Endpoints\n",
      "HuggingChat\n",
      "Company\n",
      "About\n",
      "Brand assets\n",
      "Terms of service\n",
      "Privacy\n",
      "Jobs\n",
      "Press\n",
      "Resources\n",
      "Learn\n",
      "Documentation\n",
      "Blog\n",
      "Forum\n",
      "Service Status\n",
      "Social\n",
      "GitHub\n",
      "Twitter\n",
      "LinkedIn\n",
      "Discord\n",
      "\n",
      "\n",
      "\n",
      "home page\n",
      "Webpage Title:\n",
      "Hugging Face – The AI community building the future.\n",
      "Webpage Contents:\n",
      "Hugging Face\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Posts\n",
      "Docs\n",
      "Enterprise\n",
      "Pricing\n",
      "Log In\n",
      "Sign Up\n",
      "The AI community building the future.\n",
      "The platform where the machine learning community collaborates on models, datasets, and applications.\n",
      "Explore AI Apps\n",
      "or\n",
      "Browse 1M+ models\n",
      "Trending on\n",
      "this week\n",
      "Models\n",
      "nvidia/parakeet-tdt-0.6b-v2\n",
      "Updated\n",
      "about 12 hours ago\n",
      "•\n",
      "167k\n",
      "•\n",
      "868\n",
      "nari-labs/Dia-1.6B\n",
      "Updated\n",
      "2 days ago\n",
      "•\n",
      "173k\n",
      "•\n",
      "2.18k\n",
      "Lightricks/LTX-Video\n",
      "Updated\n",
      "1 day ago\n",
      "•\n",
      "291k\n",
      "•\n",
      "1.49k\n",
      "ACE-Step/ACE-Step-v1-3.5B\n",
      "Updated\n",
      "3 days ago\n",
      "•\n",
      "427\n",
      "lodestones/Chroma\n",
      "Updated\n",
      "2 days ago\n",
      "•\n",
      "538\n",
      "Browse 1M+ models\n",
      "Spaces\n",
      "Running\n",
      "6.59k\n",
      "6.59k\n",
      "DeepSite\n",
      "🐳\n",
      "Generate any application with DeepSeek\n",
      "Running\n",
      "on\n",
      "CPU Upgrade\n",
      "618\n",
      "618\n",
      "Computer Agent\n",
      "🖥\n",
      "Interact with an AI agent to perform web tasks\n",
      "Running\n",
      "on\n",
      "Zero\n",
      "333\n",
      "333\n",
      "DreamO\n",
      "🐨\n",
      "A Unified Framework for Image Customization\n",
      "Running\n",
      "368\n",
      "368\n",
      "FLUX Pro Unlimited\n",
      "🔥\n",
      "Use the FLUX-Pro model as much as you want.\n",
      "Running\n",
      "on\n",
      "Zero\n",
      "360\n",
      "360\n",
      "ACE Step\n",
      "😻\n",
      "A Step Towards Music Generation Foundation Model\n",
      "Browse 400k+ applications\n",
      "Datasets\n",
      "openbmb/Ultra-FineWeb\n",
      "Updated\n",
      "7 days ago\n",
      "•\n",
      "3.5k\n",
      "•\n",
      "59\n",
      "nvidia/OpenCodeReasoning\n",
      "Updated\n",
      "12 days ago\n",
      "•\n",
      "15.4k\n",
      "•\n",
      "410\n",
      "PrimeIntellect/INTELLECT-2-RL-Dataset\n",
      "Updated\n",
      "3 days ago\n",
      "•\n",
      "338\n",
      "•\n",
      "33\n",
      "nvidia/OpenMathReasoning\n",
      "Updated\n",
      "7 days ago\n",
      "•\n",
      "36.5k\n",
      "•\n",
      "227\n",
      "DMindAI/DMind_Benchmark\n",
      "Updated\n",
      "about 3 hours ago\n",
      "•\n",
      "2.16k\n",
      "•\n",
      "73\n",
      "Browse 250k+ datasets\n",
      "The Home of Machine Learning\n",
      "Create, discover and collaborate on ML better.\n",
      "The collaboration platform\n",
      "Host and collaborate on unlimited public models, datasets and applications.\n",
      "Move faster\n",
      "With the HF Open source stack.\n",
      "Explore all modalities\n",
      "Text, image, video, audio or even 3D.\n",
      "Build your portfolio\n",
      "Share your work with the world and build your ML profile.\n",
      "Sign Up\n",
      "Accelerate your ML\n",
      "We provide paid Compute and Enterprise solutions.\n",
      "Compute\n",
      "Deploy on optimized\n",
      "Inference Endpoints\n",
      "or update your\n",
      "Spaces applications\n",
      "to a GPU in a few clicks.\n",
      "View pricing\n",
      "Starting at $0.60/hour for GPU\n",
      "Enterprise\n",
      "Give your team the most advanced platform to build AI with enterprise-grade security, access controls and\n",
      "\t\t\tdedicated support.\n",
      "Getting started\n",
      "Starting at $20/user/month\n",
      "Single Sign-On\n",
      "Regions\n",
      "Priority Support\n",
      "Audit Logs\n",
      "Resource Groups\n",
      "Private Datasets Viewer\n",
      "More than 50,000 organizations are using Hugging Face\n",
      "Ai2\n",
      "Enterprise\n",
      "non-profit\n",
      "•\n",
      "757 models\n",
      "•\n",
      "3.27k followers\n",
      "AI at Meta\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "2.12k models\n",
      "•\n",
      "5.97k followers\n",
      "Amazon\n",
      "company\n",
      "•\n",
      "20 models\n",
      "•\n",
      "3.16k followers\n",
      "Google\n",
      "company\n",
      "•\n",
      "991 models\n",
      "•\n",
      "13.2k followers\n",
      "Intel\n",
      "company\n",
      "•\n",
      "221 models\n",
      "•\n",
      "2.54k followers\n",
      "Microsoft\n",
      "company\n",
      "•\n",
      "374 models\n",
      "•\n",
      "12.4k followers\n",
      "Grammarly\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "10 models\n",
      "•\n",
      "160 followers\n",
      "Writer\n",
      "Enterprise\n",
      "company\n",
      "•\n",
      "21 models\n",
      "•\n",
      "267 followers\n",
      "Our Open Source\n",
      "We are building the foundation of ML tooling with the community.\n",
      "Transformers\n",
      "144,388\n",
      "State-of-the-art ML for PyTorch, TensorFlow, JAX\n",
      "Diffusers\n",
      "29,004\n",
      "State-of-the-art Diffusion models in PyTorch\n",
      "Safetensors\n",
      "3,266\n",
      "Safe way to store/distribute neural network weights\n",
      "Hub Python Library\n",
      "2,600\n",
      "Python client to interact with the Hugging Face Hub\n",
      "Tokenizers\n",
      "9,684\n",
      "Fast tokenizers optimized for research & production\n",
      "TRL\n",
      "13,756\n",
      "Train transformers LMs with reinforcement learning\n",
      "Transformers.js\n",
      "13,601\n",
      "State-of-the-art ML running directly in your browser\n",
      "smolagents\n",
      "18,686\n",
      "Smol library to build great agents in Python\n",
      "PEFT\n",
      "18,418\n",
      "Parameter-efficient finetuning for large language models\n",
      "Datasets\n",
      "20,121\n",
      "Access & share datasets for any ML tasks\n",
      "Text Generation Inference\n",
      "10,119\n",
      "Serve language models with TGI optimized toolkit\n",
      "Accelerate\n",
      "8,722\n",
      "Train PyTorch models with multi-GPU, TPU, mixed precision\n",
      "System theme\n",
      "Website\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Tasks\n",
      "Inference Endpoints\n",
      "HuggingChat\n",
      "Company\n",
      "About\n",
      "Brand assets\n",
      "Terms of service\n",
      "Privacy\n",
      "Jobs\n",
      "Press\n",
      "Resources\n",
      "Learn\n",
      "Documentation\n",
      "Blog\n",
      "Forum\n",
      "Service Status\n",
      "Social\n",
      "GitHub\n",
      "Twitter\n",
      "LinkedIn\n",
      "Discord\n",
      "\n",
      "\n",
      "\n",
      "About page\n",
      "Webpage Title:\n",
      "about (Sergei)\n",
      "Webpage Contents:\n",
      "Hugging Face\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Posts\n",
      "Docs\n",
      "Enterprise\n",
      "Pricing\n",
      "Log In\n",
      "Sign Up\n",
      "Sergei\n",
      "about\n",
      "Follow\n",
      "AlbertRuan's profile picture\n",
      "selvivincent's profile picture\n",
      "jondelamothe's profile picture\n",
      "5\n",
      "\t\t\t\t\tfollowers\n",
      "·\n",
      "0 following\n",
      "AI & ML interests\n",
      "None yet\n",
      "Organizations\n",
      "None yet\n",
      "models\n",
      "0\n",
      "None public yet\n",
      "datasets\n",
      "0\n",
      "None public yet\n",
      "System theme\n",
      "Company\n",
      "TOS\n",
      "Privacy\n",
      "About\n",
      "Jobs\n",
      "Website\n",
      "Models\n",
      "Datasets\n",
      "Spaces\n",
      "Pricing\n",
      "Docs\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(get_all_details(\"https://huggingface.co\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8361b67c-4063-499a-b0a7-583971dd6c48",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
    "Include details of company culture, customers and careers/jobs if you have the information.\"\n",
    "\n",
    "# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':\n",
    "\n",
    "# system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
    "# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
    "# Include details of company culture, customers and careers/jobs if you have the information.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "0acd22ba-1dd9-40e8-b33d-1d6b88b5e4e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_brochure_user_prompt(company_name, url):\n",
    "    try:\n",
    "        if is_url_reachable(url):\n",
    "            web_content = get_all_details(url)[:5000] \n",
    "            user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
    "            user_prompt += f\"Use the name {company_name} clearly in the brochure.\\n\"\n",
    "            user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
    "            user_prompt += f\"\\n\\nReminder: the company name is {company_name}.\"\n",
    "            #user_prompt += get_all_details(url)\n",
    "            #user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
    "            user_prompt += web_content\n",
    "            return user_prompt\n",
    "    except requests.RequestException:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "89b8b16c-0914-440e-8a1b-54959b0ae7d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co'}, {'type': 'Research Papers', 'url': 'https://huggingface.co/docs/transformers'}]}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the brochure user prompt for Hugging Face; the return value is shown as the cell output\n",
    "get_brochure_user_prompt(\"HuggingFace\", \"https://huggingface.co\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "77528cd7-2460-4768-8d8c-a849f19f6381",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "def is_url_reachable1(url, timeout=5):\n",
    "    \"\"\"\n",
    "    Check whether a URL answers a HEAD request with a non-error status.\n",
    "\n",
    "    Args:\n",
    "        url: Address to probe.\n",
    "        timeout: Seconds to wait for the server before giving up.\n",
    "\n",
    "    Returns:\n",
    "        True if the final response has a status code below 400; False otherwise,\n",
    "        including when the request raises requests.RequestException.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # requests.head does not follow redirects by default, so a redirect that\n",
    "        # leads to a dead page would look reachable; follow them and judge the\n",
    "        # final response. Send the same browser-like headers the Website class\n",
    "        # uses, since some sites need them.\n",
    "        response = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)\n",
    "        return response.status_code < 400\n",
    "    except requests.RequestException:\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "b3f37ce1-ad44-46ff-8f18-74b537acaa9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_brochure(company_name, url, model=\"llama3.2\"):\n",
    "    \"\"\"\n",
    "    Generate a brochure for a company with Ollama and render it as Markdown.\n",
    "\n",
    "    Args:\n",
    "        company_name: Name of the company the brochure is about.\n",
    "        url: Landing page of the company.\n",
    "        model: Name of the Ollama model that writes the brochure.\n",
    "\n",
    "    Returns:\n",
    "        None after the brochure has been displayed; False when the URL is not\n",
    "        reachable, the user prompt could not be built, or a\n",
    "        requests.RequestException is raised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        if not is_url_reachable(url, 5):\n",
    "            # Stop here: without a chat call there is no `response` to read below\n",
    "            return False\n",
    "        user_prompt = get_brochure_user_prompt(company_name, url)\n",
    "        if not user_prompt:\n",
    "            # A falsy prompt means scraping failed; never send it to the model\n",
    "            return False\n",
    "        response = ollama.chat(\n",
    "            model=model,\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": system_prompt},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ]\n",
    "        )\n",
    "    except requests.RequestException:\n",
    "        return False\n",
    "\n",
    "    result = response['message']['content']\n",
    "    display(Markdown(result))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "1e8a5ac2-b7e2-4c98-9615-5baba00e2dd0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}]}\n"
     ]
    },
    {
     "data": {
      "text/markdown": [
       "# HuggingFace: Empowering AI Innovation\n",
       "\n",
       "[Image: A futuristic illustration of a brain with glowing neural connections]\n",
       "\n",
       "At Hugging Face, we're building the future of Artificial Intelligence. Our platform is a collaborative space where machine learning practitioners, researchers, and developers come together to create, share, and apply AI models.\n",
       "\n",
       "## What We Do\n",
       "\n",
       "We provide an open-source foundation for machine learning tooling, enabling users to:\n",
       "\n",
       "* Build and deploy models for text, image, video, audio, or 3D applications\n",
       "* Host and collaborate on unlimited public models, datasets, and applications\n",
       "* Accelerate their ML work with our optimized inference endpoints and computing solutions\n",
       "\n",
       "## Our Community\n",
       "\n",
       "With over 1 million+ models available, our community is a hub of innovation. We're proud to have partnered with leading organizations such as:\n",
       "\n",
       "* Meta AI2\n",
       "* Amazon AI\n",
       "* Google AI\n",
       "* Intel AI\n",
       "* Microsoft AI\n",
       "* Grammarly AI\n",
       "* Writer AI\n",
       "\n",
       "## Our Technologies\n",
       "\n",
       "We've developed a range of cutting-edge technologies to support our platform, including:\n",
       "\n",
       "* **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, and JAX\n",
       "* **Diffusers**: State-of-the-art Diffusion models in PyTorch\n",
       "* **Safetensors**: A safe way to store/distribute neural network weights\n",
       "* **Hub Python Library**: A Python client to interact with our Hugging Face Hub\n",
       "\n",
       "## Join Our Mission\n",
       "\n",
       "Ready to accelerate your ML work? Explore our platform, sign up for a free account, and start building your portfolio. Our Open Source initiatives are always looking for contributors to help shape the future of AI.\n",
       "\n",
       "### Get Started\n",
       "\n",
       "* [Sign Up](#) for a free account\n",
       "* [Explore Models](#) and datasets\n",
       "* [Browse Spaces](#) for collaboration and deployment\n",
       "* [Learn About Enterprise Solutions](#)\n",
       "\n",
       "Join us in building a brighter future with AI."
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Generate a brochure for Hugging Face and render it as Markdown\n",
    "create_brochure(\"HuggingFace\", \"https://huggingface.co\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "6ca16d59-1be8-44ef-8590-f5390e4debef",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_brochure(company_name, url, model=\"llama3.2\"):\n",
    "    \"\"\"\n",
    "    Generate a brochure with Ollama and render it incrementally as Markdown.\n",
    "\n",
    "    Args:\n",
    "        company_name: Name of the company the brochure is about.\n",
    "        url: Landing page of the company.\n",
    "        model: Name of the Ollama model that writes the brochure.\n",
    "\n",
    "    Returns:\n",
    "        None when the URL is not reachable (after printing a message) or once the\n",
    "        stream has been fully displayed; False when the user prompt could not be\n",
    "        built or a requests.RequestException is raised.\n",
    "    \"\"\"\n",
    "    if not is_url_reachable(url):\n",
    "        print(\"❌ URL not reachable\")\n",
    "        return\n",
    "    try:\n",
    "        user_prompt = get_brochure_user_prompt(company_name, url)\n",
    "        if not user_prompt:\n",
    "            # A falsy prompt means scraping failed; never send it to the model\n",
    "            return False\n",
    "        stream = ollama.chat(\n",
    "            model=model,\n",
    "            messages=[\n",
    "                {\"role\": \"system\", \"content\": system_prompt},\n",
    "                {\"role\": \"user\", \"content\": user_prompt}\n",
    "            ],\n",
    "            stream=True\n",
    "        )\n",
    "    except requests.RequestException:\n",
    "        return False\n",
    "\n",
    "    response = \"\"\n",
    "    display_handle = display(Markdown(\"\"), display_id=True)\n",
    "    for chunk in stream:\n",
    "        content = chunk.get('message', {}).get('content', '')\n",
    "        if content:\n",
    "            response += content\n",
    "            # Strip code fences from the accumulated text, not from each chunk,\n",
    "            # so a fence split across two chunks is still removed\n",
    "            update_display(Markdown(response.replace(\"```\", \"\")), display_id=display_handle.display_id)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "0f156311-cc32-4bce-9645-7d10a50eae06",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co/'}, {'type': 'Company page', 'url': 'https://huggingface.co/brand'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}]}\n"
     ]
    },
    {
     "data": {
      "text/markdown": [
       "# HuggingFace: Building a Future of AI Collaboration\n",
       "\n",
       "Welcome to Hugging Face, the premier platform for machine learning community collaboration. Our mission is to empower developers and researchers to build, discover, and share models, datasets, and applications that drive innovation in AI.\n",
       "\n",
       "## About Us\n",
       "\n",
       "At Hugging Face, we believe that AI should be accessible to everyone. That's why we've built a collaborative ecosystem where experts and enthusiasts can come together to create, learn, and grow. Our platform hosts over 1 million pre-trained models, 250k+ datasets, and thousands of applications.\n",
       "\n",
       "## What We Offer\n",
       "\n",
       "* **Models**: Browse our vast library of pre-trained models, including state-of-the-art Transformers, Diffusers, and more.\n",
       "* **Datasets**: Access a wide range of high-quality datasets for various AI tasks, from text generation to computer vision.\n",
       "* **Spaces**: Host and collaborate on unlimited public models, datasets, and applications with our powerful collaboration platform.\n",
       "* **Compute**: Deploy your models on optimized inference endpoints or upgrade your spaces applications to leverage GPU computing.\n",
       "\n",
       "## Our Community\n",
       "\n",
       "Hugging Face has partnered with leading companies in AI research, including Meta, Google, Amazon, Intel, Microsoft, and Grammarly. Our community consists of over 50,000 organizations, with notable members like AI2 (non-profit), AI at Meta, Amazon, Google, Intel, and Microsoft.\n",
       "\n",
       "## Our Technology\n",
       "\n",
       "* **Transformers**: State-of-the-art machine learning toolkit for PyTorch, TensorFlow, and JAX.\n",
       "* **Diffusers**: Advanced diffusion models in PyTorch.\n",
       "* **Safetensors**: Safe way to store/distribute neural network weights.\n",
       "* **Hub Python Library**: Python client to interact with the Hugging Face Hub.\n",
       "\n",
       "## Join Our Community\n",
       "\n",
       "Whether you're a researcher, developer, or enthusiast, we invite you to join our vibrant community. Share your work, learn from others, and accelerate your AI journey with Hugging Face.\n",
       "\n",
       "### Get Started\n",
       "\n",
       "* Sign up for free access to our platform.\n",
       "* Explore our tutorials, documentation, and blog.\n",
       "* Join our Discord community to connect with fellow users.\n",
       "\n",
       "Hugging Face is more than just a platform – it's a movement. Let's build the future of AI together!"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Stream a brochure for Hugging Face, updating the Markdown output as chunks arrive\n",
    "stream_brochure(\"HuggingFace\", \"https://huggingface.co\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "c2c46421-77df-490a-adb5-13c43bd43b80",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a86ab994-a6cd-4e4e-90a3-1416fa6ce658",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a8064b0-8cbe-4e47-b1e5-49d3a9a91d4b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}