Files
LLM_Engineering_OLD/community-contributions/muhammad_qasim_sheikh/Week 1/Day 5/brochure.ipynb

208 lines
6.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"id": "57499cf2",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display, update_display\n",
"from scraper import fetch_website_links, fetch_website_contents\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "310a13f3",
"metadata": {},
"outputs": [],
"source": [
"# Load variables from a local .env file; override=True lets .env values\n",
"# replace any already present in the process environment.\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"if not api_key:\n",
"    # Fail fast with an actionable message rather than a later auth error.\n",
"    raise RuntimeError(\n",
"        \"OPENAI_API_KEY is not set; add it to your .env file or environment.\"\n",
"    )\n",
"\n",
"# Pass the validated key explicitly so the lookup above is actually used.\n",
"client = OpenAI(api_key=api_key)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "79226a7f",
"metadata": {},
"outputs": [],
"source": [
"# System prompt for the link-selection call in get_relevant_links: tells the\n",
"# model which kinds of links to keep and fixes the JSON shape of the reply\n",
"# (a \"links\" list of objects with \"type\" and \"url\" keys).\n",
"link_analyzer_prompt = \"\"\"\n",
"You are a skilled research analyst. Your task is to identify the most useful introductory links for a given topic from a list of URLs. \n",
"You must ignore forum posts, product pages, and social media links. Focus on high-quality articles, documentation, and educational resources.\n",
"Respond ONLY with a JSON object in the following format:\n",
"{\n",
" \"links\": [\n",
" {\"type\": \"overview_article\", \"url\": \"https://...\"},\n",
" {\"type\": \"technical_docs\", \"url\": \"https://...\"},\n",
" {\"type\": \"history_summary\", \"url\": \"https://...\"}\n",
" ]\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "73d02b52",
"metadata": {},
"outputs": [],
"source": [
"# System prompt template for the brief-writing call. `{topic}` is filled in\n",
"# with str.format() by create_research_brief, so any literal braces added\n",
"# to this text must be doubled ({{ and }}).\n",
"briefing_prompt = \"\"\"\n",
"You are an expert intelligence analyst. You will be given raw text from several articles about a topic. \n",
"Your mission is to synthesize this information into a clear and structured research brief. \n",
"The brief must contain the following sections in Markdown:\n",
"\n",
"Research Brief: {topic}\n",
"\n",
"1. Executive Summary\n",
"(A one-paragraph overview of the entire topic.)\n",
"\n",
"2. Key Concepts\n",
"(Use bullet points to list and explain the most important terms and ideas.)\n",
"\n",
"3. Important Figures / Events\n",
"(List the key people, organizations, or historical events relevant to the topic.)\n",
"\n",
"4. Further Reading\n",
"(Provide a list of the original URLs you analyzed for deeper study.)\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ab04efb6",
"metadata": {},
"outputs": [],
"source": [
"def get_relevant_links(topic: str, starting_url: str) -> dict:\n",
"    \"\"\"Ask the model which links found on ``starting_url`` are worth reading.\n",
"\n",
"    Args:\n",
"        topic: Research topic used to judge relevance.\n",
"        starting_url: Page whose links are collected via fetch_website_links.\n",
"\n",
"    Returns:\n",
"        The model's reply parsed from JSON. The system prompt requests the\n",
"        shape {\"links\": [{\"type\": ..., \"url\": ...}, ...]}; an empty\n",
"        {\"links\": []} is returned when the page yields no links.\n",
"\n",
"    Raises:\n",
"        json.JSONDecodeError: If the reply is not valid JSON.\n",
"    \"\"\"\n",
"    # getting all links from the starting URL\n",
"    links_on_page = fetch_website_links(starting_url)\n",
"    if not links_on_page:\n",
"        # Nothing to rank: skip the API call instead of inviting made-up URLs.\n",
"        return {\"links\": []}\n",
"\n",
"    # Build the list outside the f-string: a backslash inside an f-string\n",
"    # expression is a SyntaxError before Python 3.12.\n",
"    links_text = \"\\n\".join(links_on_page)\n",
"\n",
"    # user prompt for the Link Analyst\n",
"    user_prompt = f\"\"\"\n",
"    Please analyze the following links related to the topic \"{topic}\" and return the most relevant ones for a research brief.\n",
"    The main URL is {starting_url}. Make sure all returned URLs are absolute.\n",
"\n",
"    Links:\n",
"    {links_text}\n",
"    \"\"\"\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=\"gpt-4o-mini\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": link_analyzer_prompt},\n",
"            {\"role\": \"user\", \"content\": user_prompt}\n",
"        ],\n",
"        # JSON mode: the reply is constrained to a single JSON object.\n",
"        response_format={\"type\": \"json_object\"}\n",
"    )\n",
"\n",
"    result_json = response.choices[0].message.content\n",
"    return json.loads(result_json)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ef6ef363",
"metadata": {},
"outputs": [],
"source": [
"def get_all_content(links_data: dict) -> str:\n",
"    \"\"\"Fetch every selected link and combine the pages into one text blob.\n",
"\n",
"    Args:\n",
"        links_data: Mapping with a \"links\" list of {\"type\", \"url\"} dicts, as\n",
"            returned by get_relevant_links. Entries that are not dicts or\n",
"            have no \"url\" are skipped, as are repeated URLs.\n",
"\n",
"    Returns:\n",
"        One \"Content from <url>\" section per page that was fetched, followed\n",
"        by an \"Original URLs for Reference\" list of those URLs.\n",
"    \"\"\"\n",
"    sections = []\n",
"    original_urls = []\n",
"\n",
"    # `or []` also covers a reply where \"links\" is present but null.\n",
"    for link in links_data.get(\"links\") or []:\n",
"        # Model output is not guaranteed to follow the requested schema.\n",
"        url = link.get(\"url\") if isinstance(link, dict) else None\n",
"        if not url or url in original_urls:\n",
"            continue\n",
"        try:\n",
"            content = fetch_website_contents(url)\n",
"        except Exception as exc:\n",
"            # One unreachable page should not abort the whole brief;\n",
"            # report it and move on.\n",
"            print(f\"Skipping {url}: {exc}\")\n",
"            continue\n",
"        original_urls.append(url)\n",
"        sections.append(f\"Content from {url} \\n{content}\\n\\n\")\n",
"\n",
"    # Join once instead of growing a string inside the loop.\n",
"    return \"\".join(sections) + \"Original URLs for Reference\\n\" + \"\\n\".join(original_urls)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c2020492",
"metadata": {},
"outputs": [],
"source": [
"def create_research_brief(topic: str, starting_url: str, max_content_chars: int = 15000):\n",
"    \"\"\"Stream a Markdown research brief on ``topic`` into the notebook output.\n",
"\n",
"    Selects links from ``starting_url``, fetches their contents, and asks the\n",
"    model to write the brief, re-rendering one display area as text arrives.\n",
"\n",
"    Args:\n",
"        topic: Subject of the brief; also substituted into the system prompt.\n",
"        starting_url: Page whose links seed the research.\n",
"        max_content_chars: Upper bound on how many characters of fetched\n",
"            content are sent to the model.\n",
"\n",
"    Returns:\n",
"        None. The brief is shown through IPython display.\n",
"    \"\"\"\n",
"    relevant_links = get_relevant_links(topic, starting_url)\n",
"    full_content = get_all_content(relevant_links)\n",
"\n",
"    # get_all_content puts the URL list at the very end of its text, so the\n",
"    # truncation below can cut it off. Send the URLs separately so the\n",
"    # 'Further Reading' section always has its sources.\n",
"    source_urls = [\n",
"        link[\"url\"]\n",
"        for link in relevant_links.get(\"links\") or []\n",
"        if isinstance(link, dict) and link.get(\"url\")\n",
"    ]\n",
"    urls_text = \"\\n\".join(source_urls)\n",
"\n",
"    user_prompt = f\"\"\"\n",
"    Please create a research brief on the topic \"{topic}\" using the following content.\n",
"    Remember to include the original URLs in the 'Further Reading' section.\n",
"\n",
"    Original URLs:\n",
"    {urls_text}\n",
"\n",
"    Content:\n",
"    {full_content[:max_content_chars]}\n",
"    \"\"\"\n",
"\n",
"    stream = client.chat.completions.create(\n",
"        model=\"gpt-4o-mini\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": briefing_prompt.format(topic=topic)},\n",
"            {\"role\": \"user\", \"content\": user_prompt}\n",
"        ],\n",
"        stream=True\n",
"    )\n",
"\n",
"    response = \"\"\n",
"    display_handle = display(Markdown(\"\"), display_id=True)\n",
"    for chunk in stream:\n",
"        if not chunk.choices:\n",
"            # Defensive: a streamed chunk can arrive with an empty choices list.\n",
"            continue\n",
"        # delta.content is None on chunks that carry no text.\n",
"        response += chunk.choices[0].delta.content or ''\n",
"        update_display(Markdown(response), display_id=display_handle.display_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "594e940c",
"metadata": {},
"outputs": [],
"source": [
"# Example run: collects links from the Wikipedia page, fetches the selected\n",
"# pages, and streams the resulting brief into this cell's output.\n",
"create_research_brief(\n",
" topic=\"The Rise of Artificial Intelligence\", \n",
" starting_url=\"https://en.wikipedia.org/wiki/Artificial_intelligence\"\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm-engineering",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}