Files
LLM_Engineering_OLD/week1/community-contributions/Day1-finance-journal-summarizer.ipynb
2025-05-28 12:50:14 -06:00

128 lines
5.1 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"source": [
"import os, textwrap, time, requests\n",
"from bs4 import BeautifulSoup\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"from urllib.parse import urljoin\n",
"\n",
"# ------------------ ENV & OpenAI ------------------\n",
"# Read the local .env file first; override=True lets its values replace\n",
"# variables that are already set in the process environment.\n",
"load_dotenv(override=True)\n",
"openai = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))\n",
"\n",
"# User-Agent header sent with every page request made by the fetch helpers.\n",
"UA = (\n",
"    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
"    \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\"\n",
")\n",
"\n",
"# Site root; relative article links are resolved against it.\n",
"BASE_URL = \"https://www.cambridge.org\"\n",
"# Landing page of the journal's most recent issue.\n",
"JFQA_URL = BASE_URL + \"/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n",
"\n",
"# ------------------ Helpers ------------------\n",
"def fetch_latest_issue(url: str) -> list[dict]:\n",
"    \"\"\"Return unique {title, link} dicts for each research article.\n",
"\n",
"    Downloads ``url`` and collects, in page order, every anchor whose\n",
"    ``href`` contains ``/article/``.\n",
"\n",
"    Args:\n",
"        url: Address of the issue listing page.\n",
"\n",
"    Returns:\n",
"        One ``{\"title\": str, \"link\": str}`` dict per distinct article path,\n",
"        with ``link`` made absolute against ``BASE_URL``. An empty list is\n",
"        returned (after printing a warning) when the page cannot be\n",
"        downloaded or the server answers with an HTTP error status.\n",
"    \"\"\"\n",
"    try:\n",
"        resp = requests.get(url, headers={\"User-Agent\": UA}, timeout=30)\n",
"        # Without this check a 403/404 error page would be parsed as if it\n",
"        # were the issue listing.\n",
"        resp.raise_for_status()\n",
"    except requests.RequestException as e:\n",
"        print(f\"⚠️ fetch_latest_issue error → {e}\")\n",
"        return []\n",
"\n",
"    soup = BeautifulSoup(resp.text, \"html.parser\")\n",
"    anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n",
"\n",
"    # Keyed by cleaned href; dict insertion order keeps the page order.\n",
"    by_href: dict[str, dict] = {}\n",
"    for a in anchors:\n",
"        # Drop tracking params and #fragments so variants of one URL collapse.\n",
"        href = a[\"href\"].split(\"?\")[0].split(\"#\")[0]\n",
"        title = a.get_text(\" \", strip=True)\n",
"        known = by_href.get(href)\n",
"        if known is None:\n",
"            by_href[href] = {\"title\": title, \"link\": urljoin(BASE_URL, href)}\n",
"        elif not known[\"title\"] and title:\n",
"            # The first anchor had no text (e.g. it wrapped an image);\n",
"            # take the title from a later anchor to the same article.\n",
"            known[\"title\"] = title\n",
"\n",
"    articles = list(by_href.values())\n",
"    print(f\"Found {len(articles)} unique article links.\")\n",
"    return articles\n",
"\n",
"def fetch_article_details(link: str) -> dict:\n",
"    \"\"\"Download one article page and extract abstract, date and authors.\n",
"\n",
"    Args:\n",
"        link: Absolute URL of the article page.\n",
"\n",
"    Returns:\n",
"        A dict with the string keys ``abstract``, ``pub_date`` and\n",
"        ``authors`` (authors joined with \", \"). Any field that cannot be\n",
"        found is ``\"N/A\"``; all three are ``\"N/A\"`` (after printing a\n",
"        warning) when the page cannot be downloaded or the server answers\n",
"        with an HTTP error status.\n",
"    \"\"\"\n",
"    details = {\"abstract\": \"N/A\", \"pub_date\": \"N/A\", \"authors\": \"N/A\"}\n",
"\n",
"    try:\n",
"        resp = requests.get(link, headers={\"User-Agent\": UA}, timeout=30)\n",
"        # An error page must not be mistaken for an article without abstract.\n",
"        resp.raise_for_status()\n",
"    except requests.RequestException as e:\n",
"        print(f\"⚠️ fetch_article_details error → {e}\")\n",
"        return details\n",
"\n",
"    soup = BeautifulSoup(resp.text, \"html.parser\")\n",
"\n",
"    # abstract (an empty container counts as missing)\n",
"    abs_tag = soup.find(\"div\", class_=\"abstract\")\n",
"    if abs_tag:\n",
"        details[\"abstract\"] = abs_tag.get_text(\" \", strip=True) or \"N/A\"\n",
"\n",
"    # publication date, read from the citation_publication_date meta tag;\n",
"    # .get() tolerates a tag that has no content attribute\n",
"    meta_date = soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n",
"    if meta_date and meta_date.get(\"content\"):\n",
"        details[\"pub_date\"] = meta_date[\"content\"]\n",
"\n",
"    # authors (one citation_author meta tag per author)\n",
"    author_tags = soup.find_all(\"meta\", attrs={\"name\": \"citation_author\"})\n",
"    authors = [m[\"content\"] for m in author_tags if m.get(\"content\")]\n",
"    details[\"authors\"] = \", \".join(authors) or \"N/A\"\n",
"\n",
"    return details\n",
"\n",
"def summarise(txt: str) -> str:\n",
"    \"\"\"Ask the chat model for a 2-3 sentence summary of a paper abstract.\n",
"\n",
"    Args:\n",
"        txt: Abstract text to summarise.\n",
"\n",
"    Returns:\n",
"        The model's reply with surrounding whitespace removed, or\n",
"        ``\"Summary unavailable.\"`` when ``txt`` is empty, the reply is\n",
"        empty, or the API call fails (the error is printed, not raised).\n",
"    \"\"\"\n",
"    fallback = \"Summary unavailable.\"\n",
"    if not txt or not txt.strip():\n",
"        # Nothing to summarise; skip the API call entirely.\n",
"        return fallback\n",
"\n",
"    # The requested length (2-3 sentences) has to fit the max_tokens cap below.\n",
"    prompt = (\"Summarise the following finance-paper abstract in 2-3 sentences, \"\n",
"              \"mentioning the question, method, and main finding.\\n\\n\"\n",
"              f\"Abstract:\\n{txt}\")\n",
"    try:\n",
"        rsp = openai.chat.completions.create(\n",
"            model=\"gpt-4o-mini\",\n",
"            messages=[\n",
"                {\"role\": \"system\",\n",
"                 \"content\": \"You are a helpful finance research assistant.\"},\n",
"                {\"role\": \"user\", \"content\": prompt}],\n",
"            temperature=0.2, max_tokens=120\n",
"        )\n",
"        # The reply content can be None; treat that like an empty reply\n",
"        # instead of relying on the AttributeError from None.strip().\n",
"        content = rsp.choices[0].message.content\n",
"        return (content or \"\").strip() or fallback\n",
"    except Exception as e:\n",
"        # Deliberately best-effort: one failed call must not stop the run.\n",
"        print(f\"⚠️ summarise error → {e}\")\n",
"        return fallback\n",
"\n",
"def scrape_jfqa_latest() -> None:\n",
"    \"\"\"Print a summary card for every article in the latest JFQA issue.\n",
"\n",
"    For each link returned by ``fetch_latest_issue(JFQA_URL)`` the article\n",
"    page is fetched, its abstract is summarised via ``summarise`` and the\n",
"    title, authors, date and summary are printed. Articles without an\n",
"    abstract, or whose page cannot be downloaded, are reported and skipped.\n",
"    \"\"\"\n",
"    for art in fetch_latest_issue(JFQA_URL):\n",
"        try:\n",
"            det = fetch_article_details(art[\"link\"])\n",
"        except requests.RequestException as e:\n",
"            # One unreachable article must not abort the remaining ones.\n",
"            print(f\"\\n📘 {art['title']} — details unavailable ({e}).\")\n",
"            continue\n",
"\n",
"        if det[\"abstract\"] == \"N/A\":\n",
"            print(f\"\\n📘 {art['title']} — no abstract found.\")\n",
"            continue\n",
"\n",
"        summary = summarise(det[\"abstract\"])\n",
"        print(f\"\\n📘 {art['title']}\")\n",
"        print(f\" Authors: {det['authors']}\")\n",
"        print(f\" Date : {det['pub_date']}\")\n",
"        print(\" Journal: JFQA (Latest Issue)\")\n",
"        # shorten() also collapses internal whitespace, keeping the card compact\n",
"        print(\" Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n",
"        print(\"-\" * 90)\n",
"        time.sleep(1.0)  # polite gap between OpenAI calls\n",
"\n",
"if __name__ == \"__main__\":\n",
" scrape_jfqa_latest()\n"
],
"id": "e20b182f6258f0be",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}