# Research-brief generator: scrape a starting URL, have an LLM pick the best
# links, scrape those, and stream a structured Markdown brief into the notebook.

import os
import json

from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

from scraper import fetch_website_links, fetch_website_contents

# OpenAI() reads OPENAI_API_KEY from the environment on its own, so loading
# the .env file is the only setup needed (the previous `api_key = os.getenv(...)`
# result was never used anywhere).
load_dotenv(override=True)
client = OpenAI()

# Model used for both the link-analysis and briefing calls.
MODEL = "gpt-4o-mini"
# Cap on how much scraped text is fed to the briefing call, to stay within a
# safe context-window budget for the model above.
MAX_CONTENT_CHARS = 15_000

# System prompt for the first LLM call: select the most useful research links.
# The model is forced into JSON mode downstream, so the example shape here is
# the contract get_relevant_links() parses.
link_analyzer_prompt = """
You are a skilled research analyst. Your task is to identify the most useful introductory links for a given topic from a list of URLs. 
You must ignore forum posts, product pages, and social media links. Focus on high-quality articles, documentation, and educational resources.
Respond ONLY with a JSON object in the following format:
{
    "links": [
        {"type": "overview_article", "url": "https://..."},
        {"type": "technical_docs", "url": "https://..."},
        {"type": "history_summary", "url": "https://..."}
    ]
}
"""

# System prompt for the second LLM call: synthesize the brief. {topic} is
# filled in with str.format() at call time (it is the only placeholder, so
# the literal braces above in link_analyzer_prompt are never formatted).
briefing_prompt = """
You are an expert intelligence analyst. You will be given raw text from several articles about a topic. 
Your mission is to synthesize this information into a clear and structured research brief. 
The brief must contain the following sections in Markdown:

Research Brief: {topic}

1. Executive Summary
(A one-paragraph overview of the entire topic.)

2. Key Concepts
(Use bullet points to list and explain the most important terms and ideas.)

3. Important Figures / Events
(List the key people, organizations, or historical events relevant to the topic.)

4. Further Reading
(Provide a list of the original URLs you analyzed for deeper study.)
"""


def get_relevant_links(topic: str, starting_url: str) -> dict:
    """Scrape all links from starting_url and ask the LLM to pick the best ones.

    Args:
        topic: Research topic used to steer the link selection.
        starting_url: Page whose outbound links are harvested.

    Returns:
        The parsed JSON object from the model, expected to be of the form
        {"links": [{"type": ..., "url": ...}, ...]}.

    Raises:
        json.JSONDecodeError: if the model response is not valid JSON
            (unlikely, since response_format forces JSON mode).
    """
    links_on_page = fetch_website_links(starting_url)

    # Join outside the f-string: a backslash inside an f-string expression is
    # a SyntaxError before Python 3.12 (PEP 701), and a named variable reads
    # better regardless.
    links_text = "\n".join(links_on_page)

    user_prompt = f"""
    Please analyze the following links related to the topic "{topic}" and return the most relevant ones for a research brief.
    The main URL is {starting_url}. Make sure all returned URLs are absolute.

    Links:
    {links_text}
    """

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_analyzer_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # JSON mode guarantees syntactically valid JSON output.
        response_format={"type": "json_object"},
    )

    return json.loads(response.choices[0].message.content)


def get_all_content(links_data: dict) -> str:
    """Fetch the contents of every selected link and concatenate them.

    Args:
        links_data: Dict as returned by get_relevant_links(); entries missing
            a "url" key (or an absent/empty "links" list) are skipped.

    Returns:
        One string with a labelled section per URL, followed by a trailing
        list of the original URLs so the briefing model can cite them.
    """
    sections: list[str] = []
    original_urls: list[str] = []

    for link in links_data.get("links", []):
        url = link.get("url")
        if not url:
            continue
        original_urls.append(url)
        try:
            content = fetch_website_contents(url)
        except Exception as exc:
            # Best-effort aggregation: one dead link should not sink the
            # entire brief. Record the failure so it is visible in the output.
            content = f"(failed to fetch: {exc})"
        sections.append(f"Content from {url} \n{content}\n\n")

    sections.append("Original URLs for Reference\n" + "\n".join(original_urls))
    return "".join(sections)


def create_research_brief(topic: str, starting_url: str) -> None:
    """Build and stream a Markdown research brief into the notebook output.

    Pipeline: pick relevant links from starting_url, scrape them, then stream
    the synthesized brief token-by-token into a live-updating display cell.

    Args:
        topic: Subject of the brief; also substituted into the system prompt.
        starting_url: Seed page whose links are analyzed.
    """
    relevant_links = get_relevant_links(topic, starting_url)
    full_content = get_all_content(relevant_links)

    user_prompt = f"""
    Please create a research brief on the topic "{topic}" using the following content.
    Remember to include the original URLs in the 'Further Reading' section.

    Content:
    {full_content[:MAX_CONTENT_CHARS]}
    """

    stream = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": briefing_prompt.format(topic=topic)},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
    )

    # Stream into a single display handle so the Markdown cell updates in
    # place instead of appending a new output per chunk.
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""
        update_display(Markdown(response), display_id=display_handle.display_id)


# Demo: generate a brief from the Wikipedia AI article.
create_research_brief(
    topic="The Rise of Artificial Intelligence",
    starting_url="https://en.wikipedia.org/wiki/Artificial_intelligence",
)