Added my contributions to community-contributions
This commit is contained in:
221
week1/community-contributions/wk1-day1-RBG-all-sites-jina.ipynb
Normal file
221
week1/community-contributions/wk1-day1-RBG-all-sites-jina.ipynb
Normal file
@@ -0,0 +1,221 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# My First Lab = My 1st Frontier LLM Project\n",
|
||||
"## Summarize All Websites without Selenium\n",
|
||||
"This simple \"app\" uses Jina (https://jina.ai/reader) to turn all websites into markdown before summarizing by an LLM. As their website says: \"Convert a URL to LLM-friendly input, by simply adding r.jina.ai in front\". They have other tools that look useful too.\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests # added for jina\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"# from scraper import fetch_website_contents # not needed for jina\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables from a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n",
|
||||
"\n",
|
||||
"# Setup access to the frontier model\n",
|
||||
"\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1-a: Define the user prompt\n",
|
||||
"\n",
|
||||
"user_prompt_prefix = \"\"\"\n",
|
||||
"Here are the contents of a website.\n",
|
||||
"Provide a short summary of this website.\n",
|
||||
"If it includes news or announcements, then summarize these too.\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1-b: Define the system prompt\n",
|
||||
"\n",
|
||||
"system_prompt = \"\"\"\n",
|
||||
"You are a smart assistant that analyzes the contents of a website,\n",
|
||||
"and provides a short, clear, summary, ignoring text that might be navigation related.\n",
|
||||
"Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Add the website content to the user prompt\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_prefix + website}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 5: Change the content utility to use jina\n",
|
||||
"\n",
|
||||
"def fetch_url_content(url):\n",
|
||||
" jina_reader_url = f\"https://r.jina.ai/{url}\"\n",
|
||||
" try:\n",
|
||||
" response = requests.get(jina_reader_url)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
" return response.text\n",
|
||||
" except requests.exceptions.RequestException as e:\n",
|
||||
" print(f\"Error fetching URL: {e}\")\n",
|
||||
" return None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 3: Call OpenAI & Step 4: print the result\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = fetch_url_content(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-5-nano\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" summary = response.choices[0].message.content\n",
|
||||
" return display(Markdown(summary))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://openai.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Content Summary vs Technical Summary\n",
|
||||
"\n",
|
||||
"In my work a technical summary of a website, or group of websites, would be useful too. For example, does it render on the server (HTML) or in the browser (JavaScript), what content management system (CMS) was used, how many pages, how many outbound links, how many inbound links, etc. Doing this exercise I realized LLMs can help with analyzing content, but I may need other tools to count pages, links, and other specifications.\n",
|
||||
"\n",
|
||||
"A \"Shout Out\" to whoever put \"Market_Research_Agent.ipynb\" in the Community-Contributions. It is a great example of using an LLM as a management consultant. I think Jina might help with this usecase by offering web search results through an API to feed to your LLM. Here is the system prompt from that notebook and I plan to use this format often.\n",
|
||||
"\n",
|
||||
"system_prompt = \"\"\"You are to act like a Mckinsey Consultant specializing in market research. \n",
|
||||
"1) You are to follow legal guidelines and never give immoral advice. \n",
|
||||
"2) Your job is to maximise profits for your clients by analysing their companies initiatives and giving out recommendations for newer initiatives.\\n \n",
|
||||
"3) Follow industry frameworks for reponses always give simple answers and stick to the point.\n",
|
||||
"4) If possible try to see what competitors exist and what market gap can your clients company exploit.\n",
|
||||
"5) Further more, USe SWOT, Porters 5 forces to summarize your recommendations, Give confidence score with every recommendations\n",
|
||||
"6) Try to give unique solutions by seeing what the market gap is, if market gap is ambiguious skip this step\n",
|
||||
"7) add an estimate of what rate the revenue of the comapany will increase at provided they follow the guidelines, give conservating estimates keeping in account non ideal conditions.\n",
|
||||
"8) if the website isnt of a company or data isnt available, give out an error message along the lines of more data required for analysis\"\"\""
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user