440 lines
14 KiB
Plaintext
440 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1054e1c9-142a-4059-bfe6-f9be6073fb72",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# imports\n",
|
|
"# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt\n",
|
|
"\n",
|
|
"import os\n",
|
|
"import requests\n",
|
|
"import json\n",
|
|
"from typing import List\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from IPython.display import Markdown, display, update_display\n",
|
|
"from openai import OpenAI\n",
|
|
"import ollama"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9e59a6ba-d7e1-4834-b3ff-86321e354ade",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"load_dotenv(override=True)\n",
|
|
"MODEL = \"llama3.2\"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0ea82fa1-0986-4749-9d7e-d6a23dd88722",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# A class to represent a Webpage\n",
|
|
"\n",
|
|
"# Some websites need you to use proper headers when fetching them:\n",
|
|
"headers = {\n",
|
|
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
|
"}\n",
|
|
"\n",
|
|
"class Website:\n",
|
|
" \"\"\"\n",
|
|
" A utility class to represent a Website that we have scraped, now with links\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" def __init__(self, url):\n",
|
|
" self.url = url\n",
|
|
" response = requests.get(url, headers=headers)\n",
|
|
" self.body = response.content\n",
|
|
" soup = BeautifulSoup(self.body, 'html.parser')\n",
|
|
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
|
" if soup.body:\n",
|
|
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
|
" irrelevant.decompose()\n",
|
|
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
|
|
" else:\n",
|
|
" self.text = \"\"\n",
|
|
" links = [link.get('href') for link in soup.find_all('a')]\n",
|
|
" self.links = [link for link in links if link]\n",
|
|
"\n",
|
|
" def get_contents(self):\n",
|
|
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2351a604-c280-48fb-84d2-272512535414",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ed = Website(\"https://edwarddonner.com\")\n",
|
|
"ed.links"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e2dd2206-0343-4bf2-8037-de587ff6fe10",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
|
|
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
|
|
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
|
|
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
|
|
"link_system_prompt += \"\"\"\n",
|
|
"{\n",
|
|
" \"links\": [\n",
|
|
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
|
|
" {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
|
|
" ]\n",
|
|
"}\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d891f202-352c-4f93-97c4-ab773daacc60",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(link_system_prompt)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "89be55aa-7236-4d3c-8459-b9c992cd68f5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_links_user_prompt(website):\n",
|
|
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
|
|
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
|
|
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
|
|
" user_prompt += \"Links (some might be relative links):\\n\"\n",
|
|
" user_prompt += \"\\n\".join(website.links)\n",
|
|
" return user_prompt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ec4ed9d2-9b54-4d33-adba-328b47cdde1a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(get_links_user_prompt(ed))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "228cdeea-5c05-45a4-8afe-e6ef8f02810a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"import logging\n",
|
|
"import pprint\n",
|
|
"#pprint.pprint(response)\n",
|
|
"\n",
|
|
"import re\n",
|
|
"\n",
|
|
"def extract_json_from_text(text):\n",
|
|
" \"\"\"\n",
|
|
" Extract the first JSON object found in the text.\n",
|
|
" \"\"\"\n",
|
|
" match = re.search(r'\\{.*\\}', text, re.DOTALL)\n",
|
|
" if match:\n",
|
|
" return match.group(0)\n",
|
|
" return None\n",
|
|
"\n",
|
|
"def get_links(url):\n",
|
|
" website = Website(url)\n",
|
|
" \n",
|
|
" try:\n",
|
|
" response = ollama.chat(\n",
|
|
" model=\"llama3.2\",\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
|
|
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
|
|
" ]\n",
|
|
" )\n",
|
|
"\n",
|
|
" result = response['message']['content']\n",
|
|
" \n",
|
|
" # Log the raw result for debugging\n",
|
|
" logging.debug(f\"Raw result: {result}\")\n",
|
|
"\n",
|
|
" \n",
|
|
" if isinstance(result, str):\n",
|
|
" if not result.strip():\n",
|
|
" logging.warning(\"Result string is empty.\")\n",
|
|
" return None\n",
|
|
"\n",
|
|
" json_text = extract_json_from_text(result)\n",
|
|
" if not json_text:\n",
|
|
" logging.warning(\"No JSON object found in the result string.\")\n",
|
|
" return None\n",
|
|
"\n",
|
|
" logging.debug(f\"Extracted JSON string: {repr(json_text)}\")\n",
|
|
"\n",
|
|
" try:\n",
|
|
" return json.loads(json_text)\n",
|
|
" except json.JSONDecodeError as e:\n",
|
|
" logging.error(f\"JSON decoding error: {e}\")\n",
|
|
" logging.debug(f\"Problematic JSON string: {repr(json_text)}\")\n",
|
|
" return None\n",
|
|
" \n",
|
|
" except Exception as e:\n",
|
|
" logging.exception(\"An unexpected error occurred in get_links.\")\n",
|
|
" return None\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3ce0b67e-8483-418a-bcf3-836910381e2d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"get_links(\"https://huggingface.co\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "aeb09b75-33ea-4638-bc01-6c3d738f0060",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import requests\n",
|
|
"\n",
|
|
"def is_url_reachable(url, timeout=5):\n",
|
|
" try:\n",
|
|
" response = requests.head(url, timeout=timeout)\n",
|
|
" return response.status_code < 400\n",
|
|
" except requests.RequestException:\n",
|
|
" return False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5f2f9cc5-de4f-43d8-a803-97c11c7e91c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_all_details(url):\n",
|
|
" if is_url_reachable(url,5):\n",
|
|
" result = \"Landing page:\\n\"\n",
|
|
" result += Website(url).get_contents()\n",
|
|
" links = get_links(url)\n",
|
|
" print(\"Found links:\", links)\n",
|
|
" for link in links[\"links\"]:\n",
|
|
" result += f\"\\n\\n{link['type']}\\n\"\n",
|
|
" result += Website(link[\"url\"]).get_contents()\n",
|
|
" return result"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cd405ade-6b44-45c5-aeb4-724cf6cce8f6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(get_all_details(\"https://huggingface.co\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8361b67c-4063-499a-b0a7-583971dd6c48",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
|
|
"and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
|
|
"Include details of company culture, customers and careers/jobs if you have the information.\"\n",
|
|
"\n",
|
|
"# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':\n",
|
|
"\n",
|
|
"# system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
|
|
"# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
|
|
"# Include details of company culture, customers and careers/jobs if you have the information.\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0acd22ba-1dd9-40e8-b33d-1d6b88b5e4e3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_brochure_user_prompt(company_name, url):\n",
|
|
" try:\n",
|
|
" if is_url_reachable(url):\n",
|
|
" web_content = get_all_details(url)[:5000] \n",
|
|
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
|
|
" user_prompt += f\"Use the name {company_name} clearly in the brochure.\\n\"\n",
|
|
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
|
|
" user_prompt += f\"\\n\\nReminder: the company name is {company_name}.\"\n",
|
|
" #user_prompt += get_all_details(url)\n",
|
|
" #user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
|
" user_prompt += web_content\n",
|
|
" return user_prompt\n",
|
|
" except requests.RequestException:\n",
|
|
" return False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "89b8b16c-0914-440e-8a1b-54959b0ae7d0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"get_brochure_user_prompt(\"HuggingFace\", \"https://huggingface.co\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "77528cd7-2460-4768-8d8c-a849f19f6381",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import requests\n",
|
|
"\n",
|
|
"def is_url_reachable1(url, timeout=5):\n",
|
|
" try:\n",
|
|
" response = requests.head(url, timeout=timeout)\n",
|
|
" return response.status_code < 400\n",
|
|
" except requests.RequestException:\n",
|
|
" return False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b3f37ce1-ad44-46ff-8f18-74b537acaa9b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def create_brochure(company_name, url):\n",
|
|
" try:\n",
|
|
" if is_url_reachable(url,5):\n",
|
|
" response = ollama.chat(\n",
|
|
" model=\"llama3.2\",\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
|
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
|
|
" ]\n",
|
|
" )\n",
|
|
" \n",
|
|
" result = response['message']['content']\n",
|
|
" display(Markdown(result))\n",
|
|
" except requests.RequestException:\n",
|
|
" return False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1e8a5ac2-b7e2-4c98-9615-5baba00e2dd0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"create_brochure(\"HuggingFace\", \"https://huggingface.co\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6ca16d59-1be8-44ef-8590-f5390e4debef",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def stream_brochure(company_name, url):\n",
|
|
" if not is_url_reachable(url):\n",
|
|
" print(\"❌ URL not reachable\")\n",
|
|
" return\n",
|
|
" try:\n",
|
|
" #if is_url_reachable(url,5):\n",
|
|
" stream = ollama.chat(\n",
|
|
" model=\"llama3.2\",\n",
|
|
" messages=[\n",
|
|
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
|
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
|
|
" ],\n",
|
|
" stream=True\n",
|
|
" )\n",
|
|
" \n",
|
|
" #result = response['message']['content']\n",
|
|
" # display(Markdown(result))\n",
|
|
" except requests.RequestException:\n",
|
|
" return False\n",
|
|
" \n",
|
|
" response = \"\"\n",
|
|
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
|
" #for chunk in stream:\n",
|
|
" #response += chunk.choices[0].delta.content or ''\n",
|
|
" #response += chunk['message']['content'] or ''\n",
|
|
" #response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
|
|
" #update_display(Markdown(response), display_id=display_handle.display_id)\n",
|
|
"\n",
|
|
" for chunk in stream:\n",
|
|
" content = chunk.get('message', {}).get('content', '')\n",
|
|
" if content:\n",
|
|
" response += content.replace(\"```\", \"\")\n",
|
|
" update_display(Markdown(response), display_id=display_handle.display_id)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0f156311-cc32-4bce-9645-7d10a50eae06",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"stream_brochure(\"HuggingFace\", \"https://huggingface.co\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|