Add my notebook to community-contributions

2025-05-28 12:50:14 -06:00
parent d6b4bfbef0
commit 0714e7d1d9
1 changed files with 127 additions and 0 deletions
--- a/week1/community-contributions/Day1-finance-journal-summarizer.ipynb
+++ b/week1/community-contributions/Day1-finance-journal-summarizer.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "import os, textwrap, time, requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from openai import OpenAI\n",
+    "from dotenv import load_dotenv\n",
+    "from urllib.parse import urljoin\n",
+    "\n",
+    "# ------------------ ENV & OpenAI ------------------\n",
+    "load_dotenv(override=True)\n",
+    "openai = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
+    "\n",
+    "UA        = (\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
+    "             \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\")\n",
+    "BASE_URL  = \"https://www.cambridge.org\"\n",
+    "JFQA_URL  = f\"{BASE_URL}/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n",
+    "\n",
+    "# ------------------ Helpers ------------------\n",
+    "def fetch_latest_issue(url: str) -> list[dict]:\n",
+    "    \"\"\"Return unique {title, link} dicts for each research article.\"\"\"\n",
+    "    soup = BeautifulSoup(\n",
+    "        requests.get(url, headers={\"User-Agent\": UA}, timeout=30).text,\n",
+    "        \"html.parser\"\n",
+    "    )\n",
+    "\n",
+    "    anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n",
+    "    seen, articles = set(), []\n",
+    "    for a in anchors:\n",
+    "        href = a[\"href\"].split(\"?\")[0]           # strip tracking params\n",
+    "        if href in seen:                         # de‑duplicate\n",
+    "            continue\n",
+    "        seen.add(href)\n",
+    "        title = a.get_text(\" \", strip=True)\n",
+    "        full  = urljoin(BASE_URL, href)\n",
+    "        articles.append({\"title\": title, \"link\": full})\n",
+    "    print(f\"Found {len(articles)} unique article links.\")\n",
+    "    return articles\n",
+    "\n",
+    "def fetch_article_details(link: str) -> dict:\n",
+    "    soup = BeautifulSoup(\n",
+    "        requests.get(link, headers={\"User-Agent\": UA}, timeout=30).text,\n",
+    "        \"html.parser\"\n",
+    "    )\n",
+    "\n",
+    "    # abstract\n",
+    "    abs_tag   = soup.find(\"div\", class_=\"abstract\")\n",
+    "    abstract  = abs_tag.get_text(\" \", strip=True) if abs_tag else \"N/A\"\n",
+    "\n",
+    "    # publication date (meta is most reliable)\n",
+    "    meta_date = soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n",
+    "    pub_date  = meta_date[\"content\"] if meta_date else \"N/A\"\n",
+    "\n",
+    "    # authors (multiple <meta name=\"citation_author\"> tags)\n",
+    "    authors   = [m[\"content\"] for m in soup.find_all(\"meta\",\n",
+    "                       attrs={\"name\": \"citation_author\"})]\n",
+    "    authors_str = \", \".join(authors) or \"N/A\"\n",
+    "\n",
+    "    return {\"abstract\": abstract, \"pub_date\": pub_date, \"authors\": authors_str}\n",
+    "\n",
+    "def summarise(txt: str) -> str:\n",
+    "    prompt = (\"Summarise the following finance‑paper abstract in 2‑3 sentences, \"\n",
+    "              \"mentioning the question, method, and main finding.\\n\\n\"\n",
+    "              f\"Abstract:\\n{txt}\")\n",
+    "    try:\n",
+    "        rsp = openai.chat.completions.create(\n",
+    "            model=\"gpt-4o-mini\",\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\",\n",
+    "                 \"content\": \"You are a helpful finance research assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": prompt}],\n",
+    "            temperature=0.2, max_tokens=120\n",
+    "        )\n",
+    "        return rsp.choices[0].message.content.strip()\n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️  summarise error → {e}\")\n",
+    "        return \"Summary unavailable.\"\n",
+    "\n",
+    "def scrape_jfqa_latest() -> None:\n",
+    "    for art in fetch_latest_issue(JFQA_URL):\n",
+    "        det = fetch_article_details(art[\"link\"])\n",
+    "        if det[\"abstract\"] == \"N/A\":\n",
+    "            print(f\"\\n📘 {art['title']}  —  no abstract found.\")\n",
+    "            continue\n",
+    "\n",
+    "        summary = summarise(det[\"abstract\"])\n",
+    "        print(f\"\\n📘 {art['title']}\")\n",
+    "        print(f\"   Authors: {det['authors']}\")\n",
+    "        print(f\"   Date   : {det['pub_date']}\")\n",
+    "        print(f\"   Journal: JFQA (Latest Issue)\")\n",
+    "        print(\"   Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n",
+    "        print(\"-\" * 90)\n",
+    "        time.sleep(1.0)      # polite gap between OpenAI calls\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    scrape_jfqa_latest()\n"
+   ],
+   "id": "e20b182f6258f0be",
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}