Added new folders and files in muhammad_qasim_sheikh directory
@@ -0,0 +1,207 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "57499cf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from scraper import fetch_website_links, fetch_website_contents\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "310a13f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "client = OpenAI(api_key=api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "79226a7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "link_analyzer_prompt = \"\"\"\n",
    "You are a skilled research analyst. Your task is to identify the most useful introductory links for a given topic from a list of URLs.\n",
    "You must ignore forum posts, product pages, and social media links. Focus on high-quality articles, documentation, and educational resources.\n",
    "Respond ONLY with a JSON object in the following format:\n",
    "{\n",
    "  \"links\": [\n",
    "    {\"type\": \"overview_article\", \"url\": \"https://...\"},\n",
    "    {\"type\": \"technical_docs\", \"url\": \"https://...\"},\n",
    "    {\"type\": \"history_summary\", \"url\": \"https://...\"}\n",
    "  ]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "73d02b52",
   "metadata": {},
   "outputs": [],
   "source": [
    "briefing_prompt = \"\"\"\n",
    "You are an expert intelligence analyst. You will be given raw text from several articles about a topic.\n",
    "Your mission is to synthesize this information into a clear and structured research brief.\n",
    "The brief must contain the following sections in Markdown:\n",
    "\n",
    "Research Brief: {topic}\n",
    "\n",
    "1. Executive Summary\n",
    "(A one-paragraph overview of the entire topic.)\n",
    "\n",
    "2. Key Concepts\n",
    "(Use bullet points to list and explain the most important terms and ideas.)\n",
    "\n",
    "3. Important Figures / Events\n",
    "(List the key people, organizations, or historical events relevant to the topic.)\n",
    "\n",
    "4. Further Reading\n",
    "(Provide a list of the original URLs you analyzed for deeper study.)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ab04efb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_relevant_links(topic: str, starting_url: str) -> dict:\n",
    "\n",
    "    # Get all links found on the starting URL\n",
    "    links_on_page = fetch_website_links(starting_url)\n",
    "\n",
    "    # Build the user prompt for the link analyst\n",
    "    user_prompt = f\"\"\"\n",
    "    Please analyze the following links related to the topic \"{topic}\" and return the most relevant ones for a research brief.\n",
    "    The main URL is {starting_url}. Make sure all returned URLs are absolute.\n",
    "\n",
    "    Links:\n",
    "    {\"\\n\".join(links_on_page)}\n",
    "    \"\"\"\n",
    "\n",
    "    # Request a JSON object so the reply can be parsed directly\n",
    "    response = client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\",\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": link_analyzer_prompt},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ],\n",
    "        response_format={\"type\": \"json_object\"}\n",
    "    )\n",
    "\n",
    "    result_json = response.choices[0].message.content\n",
    "    relevant_links = json.loads(result_json)\n",
    "    return relevant_links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ef6ef363",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_content(links_data: dict) -> str:\n",
    "    all_content = \"\"\n",
    "    original_urls = []\n",
    "    # Fetch the text of each selected link and keep its URL for the reference list\n",
    "    for link in links_data.get(\"links\", []):\n",
    "        url = link.get(\"url\")\n",
    "        if url:\n",
    "            original_urls.append(url)\n",
    "            content = fetch_website_contents(url)\n",
    "            all_content += f\"Content from {url}\\n{content}\\n\\n\"\n",
    "\n",
    "    all_content += \"Original URLs for Reference\\n\" + \"\\n\".join(original_urls)\n",
    "    return all_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c2020492",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_research_brief(topic: str, starting_url: str):\n",
    "    relevant_links = get_relevant_links(topic, starting_url)\n",
    "    full_content = get_all_content(relevant_links)\n",
    "\n",
    "    user_prompt = f\"\"\"\n",
    "    Please create a research brief on the topic \"{topic}\" using the following content.\n",
    "    Remember to include the original URLs in the 'Further Reading' section.\n",
    "\n",
    "    Content:\n",
    "    {full_content[:15000]}\n",
    "    \"\"\"\n",
    "    # Stream the response and re-render the accumulated Markdown as each chunk arrives\n",
    "    stream = client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\",\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": briefing_prompt.format(topic=topic)},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ],\n",
    "        stream=True\n",
    "    )\n",
    "\n",
    "    response = \"\"\n",
    "    display_handle = display(Markdown(\"\"), display_id=True)\n",
    "    for chunk in stream:\n",
    "        response += chunk.choices[0].delta.content or ''\n",
    "        update_display(Markdown(response), display_id=display_handle.display_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "594e940c",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_research_brief(\n",
    "    topic=\"The Rise of Artificial Intelligence\",\n",
    "    starting_url=\"https://en.wikipedia.org/wiki/Artificial_intelligence\"\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm-engineering",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
import requests


# Standard browser headers to send when fetching a website
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string if soup.title else "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url):
    """
    Return the links on the website at the given url.
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it! One possible shape is sketched below.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]
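The fetch_website_links docstring invites a class-based refactor so each page is fetched and parsed only once. A minimal sketch follows, assuming the same BeautifulSoup approach; the class name Website and its attribute names are illustrative, not part of the committed code.

from bs4 import BeautifulSoup
import requests


class Website:
    """Fetch and parse a page once; expose its title, text, and links."""

    def __init__(self, url, headers=None):
        self.url = url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            # Strip elements that carry no readable text before extracting
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Reuse the already-parsed document for the links: no second request
        self.links = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    def contents(self, limit=2_000):
        """Title plus body text, truncated like fetch_website_contents."""
        return (self.title + "\n\n" + self.text)[:limit]

A single site = Website(url, headers=headers) could then serve both get_relevant_links (via site.links) and get_all_content (via site.contents()), halving the number of HTTP requests.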