Added new folders and files in muhammad_qasim_sheikh directory
@@ -0,0 +1,207 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "57499cf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from scraper import fetch_website_links, fetch_website_contents\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "310a13f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "client = OpenAI(api_key=api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "79226a7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "link_analyzer_prompt = \"\"\"\n",
    "You are a skilled research analyst. Your task is to identify the most useful introductory links for a given topic from a list of URLs.\n",
    "You must ignore forum posts, product pages, and social media links. Focus on high-quality articles, documentation, and educational resources.\n",
    "Respond ONLY with a JSON object in the following format:\n",
    "{\n",
    "  \"links\": [\n",
    "    {\"type\": \"overview_article\", \"url\": \"https://...\"},\n",
    "    {\"type\": \"technical_docs\", \"url\": \"https://...\"},\n",
    "    {\"type\": \"history_summary\", \"url\": \"https://...\"}\n",
    "  ]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "73d02b52",
   "metadata": {},
   "outputs": [],
   "source": [
    "briefing_prompt = \"\"\"\n",
    "You are an expert intelligence analyst. You will be given raw text from several articles about a topic.\n",
    "Your mission is to synthesize this information into a clear and structured research brief.\n",
    "The brief must contain the following sections in Markdown:\n",
    "\n",
    "Research Brief: {topic}\n",
    "\n",
    "1. Executive Summary\n",
    "(A one-paragraph overview of the entire topic.)\n",
    "\n",
    "2. Key Concepts\n",
    "(Use bullet points to list and explain the most important terms and ideas.)\n",
    "\n",
    "3. Important Figures / Events\n",
    "(List the key people, organizations, or historical events relevant to the topic.)\n",
    "\n",
    "4. Further Reading\n",
    "(Provide a list of the original URLs you analyzed for deeper study.)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ab04efb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_relevant_links(topic: str, starting_url: str) -> dict:\n",
    "\n",
    "    # Get all links found on the starting URL\n",
    "    links_on_page = fetch_website_links(starting_url)\n",
    "\n",
    "    # Build the user prompt for the link analyst\n",
    "    user_prompt = f\"\"\"\n",
    "    Please analyze the following links related to the topic \"{topic}\" and return the most relevant ones for a research brief.\n",
    "    The main URL is {starting_url}. Make sure all returned URLs are absolute.\n",
    "\n",
    "    Links:\n",
    "    {\"\\n\".join(links_on_page)}\n",
    "    \"\"\"\n",
    "\n",
    "    # Request a JSON object so the reply can be parsed directly\n",
    "    response = client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\",\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": link_analyzer_prompt},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ],\n",
    "        response_format={\"type\": \"json_object\"}\n",
    "    )\n",
    "\n",
    "    result_json = response.choices[0].message.content\n",
    "    relevant_links = json.loads(result_json)\n",
    "    return relevant_links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ef6ef363",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_content(links_data: dict) -> str:\n",
    "    all_content = \"\"\n",
    "    original_urls = []\n",
    "    # Fetch the text of each selected link and keep its URL for the reference list\n",
    "    for link in links_data.get(\"links\", []):\n",
    "        url = link.get(\"url\")\n",
    "        if url:\n",
    "            original_urls.append(url)\n",
    "            content = fetch_website_contents(url)\n",
    "            all_content += f\"Content from {url}\\n{content}\\n\\n\"\n",
    "\n",
    "    all_content += \"Original URLs for Reference\\n\" + \"\\n\".join(original_urls)\n",
    "    return all_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c2020492",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_research_brief(topic: str, starting_url: str):\n",
    "    relevant_links = get_relevant_links(topic, starting_url)\n",
    "    full_content = get_all_content(relevant_links)\n",
    "\n",
    "    user_prompt = f\"\"\"\n",
    "    Please create a research brief on the topic \"{topic}\" using the following content.\n",
    "    Remember to include the original URLs in the 'Further Reading' section.\n",
    "\n",
    "    Content:\n",
    "    {full_content[:15000]}\n",
    "    \"\"\"\n",
    "    # Stream the response and re-render the accumulated Markdown as each chunk arrives\n",
    "    stream = client.chat.completions.create(\n",
    "        model=\"gpt-4o-mini\",\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": briefing_prompt.format(topic=topic)},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ],\n",
    "        stream=True\n",
    "    )\n",
    "\n",
    "    response = \"\"\n",
    "    display_handle = display(Markdown(\"\"), display_id=True)\n",
    "    for chunk in stream:\n",
    "        response += chunk.choices[0].delta.content or ''\n",
    "        update_display(Markdown(response), display_id=display_handle.display_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "594e940c",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_research_brief(\n",
    "    topic=\"The Rise of Artificial Intelligence\",\n",
    "    starting_url=\"https://en.wikipedia.org/wiki/Artificial_intelligence\"\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm-engineering",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
import requests


# Standard browser headers to send when fetching a website
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string if soup.title else "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url):
    """
    Return the links on the website at the given url.
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it! One possible shape is sketched below.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]
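The fetch_website_links docstring invites a class-based refactor so each page is fetched and parsed only once. A minimal sketch follows, assuming the same BeautifulSoup approach; the class name Website and its attribute names are illustrative, not part of the committed code.

from bs4 import BeautifulSoup
import requests


class Website:
    """Fetch and parse a page once; expose its title, text, and links."""

    def __init__(self, url, headers=None):
        self.url = url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            # Strip elements that carry no readable text before extracting
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Reuse the already-parsed document for the links: no second request
        self.links = [a.get("href") for a in soup.find_all("a") if a.get("href")]

    def contents(self, limit=2_000):
        """Title plus body text, truncated like fetch_website_contents."""
        return (self.title + "\n\n" + self.text)[:limit]

A single site = Website(url, headers=headers) could then serve both get_relevant_links (via site.links) and get_all_content (via site.contents()), halving the number of HTTP requests.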