diff --git a/week1/community-contributions/Day1-finance-journal-summarizer.ipynb b/week1/community-contributions/Day1-finance-journal-summarizer.ipynb
new file mode 100644
index 0000000..cffa355
--- /dev/null
+++ b/week1/community-contributions/Day1-finance-journal-summarizer.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "import os\n",
+    "import textwrap\n",
+    "import time\n",
+    "from urllib.parse import urljoin\n",
+    "\n",
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from dotenv import load_dotenv\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "# ------------------ ENV & OpenAI ------------------\n",
+    "load_dotenv(override=True)\n",
+    "openai = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
+    "\n",
+    "UA = (\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
+    "      \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\")\n",
+    "BASE_URL = \"https://www.cambridge.org\"\n",
+    "JFQA_URL = f\"{BASE_URL}/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n",
+    "\n",
+    "# ------------------ Helpers ------------------\n",
+    "def _get_soup(url: str) -> BeautifulSoup:\n",
+    "    \"\"\"Download *url* and parse the response body with html.parser.\n",
+    "\n",
+    "    Raises requests.HTTPError on a 4xx/5xx response, so an error page is\n",
+    "    never scraped as if it were journal content.\n",
+    "    \"\"\"\n",
+    "    resp = requests.get(url, headers={\"User-Agent\": UA}, timeout=30)\n",
+    "    resp.raise_for_status()\n",
+    "    return BeautifulSoup(resp.text, \"html.parser\")\n",
+    "\n",
+    "def fetch_latest_issue(url: str) -> list[dict]:\n",
+    "    \"\"\"Return unique {title, link} dicts for each research article.\n",
+    "\n",
+    "    Every <a> whose href contains \"/article/\" is treated as an article\n",
+    "    link. Links are de-duplicated on the href with its query string\n",
+    "    removed, then made absolute against BASE_URL.\n",
+    "    Raises requests.RequestException if the issue page cannot be fetched.\n",
+    "    \"\"\"\n",
+    "    soup = _get_soup(url)\n",
+    "\n",
+    "    anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n",
+    "    titles: dict[str, str] = {}  # href -> title, in first-seen order\n",
+    "    for a in anchors:\n",
+    "        href = a[\"href\"].split(\"?\")[0]  # strip tracking params\n",
+    "        text = a.get_text(\" \", strip=True)\n",
+    "        # Keep the first non-empty text per href, so an anchor with no\n",
+    "        # text (e.g. one wrapping only an image) cannot blank the title.\n",
+    "        if not titles.get(href):\n",
+    "            titles[href] = text\n",
+    "    articles = [{\"title\": title, \"link\": urljoin(BASE_URL, href)}\n",
+    "                for href, title in titles.items()]\n",
+    "    print(f\"Found {len(articles)} unique article links.\")\n",
+    "    return articles\n",
+    "\n",
+    "def fetch_article_details(link: str) -> dict:\n",
+    "    \"\"\"Scrape abstract, publication date and authors from an article page.\n",
+    "\n",
+    "    Returns a dict with keys \"abstract\", \"pub_date\" and \"authors\"; a field\n",
+    "    that is missing or empty on the page is the string \"N/A\".\n",
+    "    Raises requests.RequestException if the page cannot be fetched.\n",
+    "    \"\"\"\n",
+    "    soup = _get_soup(link)\n",
+    "\n",
+    "    # abstract; an empty abstract div counts as missing so that no\n",
+    "    # summary request is made for blank text\n",
+    "    abs_tag = soup.find(\"div\", class_=\"abstract\")\n",
+    "    abstract = (abs_tag.get_text(\" \", strip=True) if abs_tag else \"\") or \"N/A\"\n",
+    "\n",
+    "    # publication date (meta is most reliable); .get() because a meta tag\n",
+    "    # without a content attribute would make [\"content\"] raise KeyError\n",
+    "    meta_date = soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n",
+    "    pub_date = (meta_date.get(\"content\") if meta_date else None) or \"N/A\"\n",
+    "\n",
+    "    # authors (multiple tags)\n",
+    "    author_tags = soup.find_all(\"meta\", attrs={\"name\": \"citation_author\"})\n",
+    "    authors = [m[\"content\"] for m in author_tags if m.get(\"content\")]\n",
+    "    authors_str = \", \".join(authors) or \"N/A\"\n",
+    "\n",
+    "    return {\"abstract\": abstract, \"pub_date\": pub_date, \"authors\": authors_str}\n",
+    "\n",
+    "def summarise(txt: str) -> str:\n",
+    "    \"\"\"Ask gpt-4o-mini for a 2-3 sentence summary of the abstract *txt*.\n",
+    "\n",
+    "    Best-effort: on any failure the error is printed and the placeholder\n",
+    "    \"Summary unavailable.\" is returned instead of raising.\n",
+    "    \"\"\"\n",
+    "    prompt = (\"Summarise the following finance‑paper abstract in 2‑3 sentences, \"\n",
+    "              \"mentioning the question, method, and main finding.\\n\\n\"\n",
+    "              f\"Abstract:\\n{txt}\")\n",
+    "    try:\n",
+    "        rsp = openai.chat.completions.create(\n",
+    "            model=\"gpt-4o-mini\",\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\",\n",
+    "                 \"content\": \"You are a helpful finance research assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": prompt}],\n",
+    "            temperature=0.2, max_tokens=120\n",
+    "        )\n",
+    "        return rsp.choices[0].message.content.strip()\n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️ summarise error → {e}\")\n",
+    "        return \"Summary unavailable.\"\n",
+    "\n",
+    "def scrape_jfqa_latest() -> None:\n",
+    "    \"\"\"Print authors, date and a model-written summary per article.\n",
+    "\n",
+    "    Articles whose page cannot be fetched or that have no abstract are\n",
+    "    reported and skipped; the remaining articles are still processed.\n",
+    "    \"\"\"\n",
+    "    for art in fetch_latest_issue(JFQA_URL):\n",
+    "        try:\n",
+    "            det = fetch_article_details(art[\"link\"])\n",
+    "        except requests.RequestException as e:\n",
+    "            # one unreachable article must not abort the whole run\n",
+    "            print(f\"\\n📘 {art['title']} — could not fetch article ({e}).\")\n",
+    "            continue\n",
+    "        if det[\"abstract\"] == \"N/A\":\n",
+    "            print(f\"\\n📘 {art['title']} — no abstract found.\")\n",
+    "            continue\n",
+    "\n",
+    "        summary = summarise(det[\"abstract\"])\n",
+    "        print(f\"\\n📘 {art['title']}\")\n",
+    "        print(f\" Authors: {det['authors']}\")\n",
+    "        print(f\" Date : {det['pub_date']}\")\n",
+    "        print(f\" Journal: JFQA (Latest Issue)\")\n",
+    "        print(\" Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n",
+    "        print(\"-\" * 90)\n",
+    "        time.sleep(1.0)  # polite gap between OpenAI calls\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    scrape_jfqa_latest()\n"
+   ],
+   "id": "e20b182f6258f0be",
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}