Merge pull request #414 from armangoudarzi70/my-contribution
Add my notebook to community-contributions: scrape the latest JFQA issue and summarise each abstract with gpt-4o-mini
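The cell reads its API key with load_dotenv and os.getenv, so it assumes an OPENAI_API_KEY entry in a local .env file (or already in the environment); a minimal preflight sketch, using the same calls the notebook itself makes:

    # Hypothetical preflight check before running the cell.
    import os
    from dotenv import load_dotenv
    load_dotenv(override=True)
    assert os.getenv("OPENAI_API_KEY"), "Put OPENAI_API_KEY=... in a .env file first"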
@@ -0,0 +1,127 @@
{
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "import os, textwrap, time, requests\n",
    "from bs4 import BeautifulSoup\n",
    "from openai import OpenAI\n",
    "from dotenv import load_dotenv\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "# ------------------ ENV & OpenAI ------------------\n",
    "load_dotenv(override=True)\n",
    "openai = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
    "\n",
    "UA = (\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
    "      \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\")\n",
    "BASE_URL = \"https://www.cambridge.org\"\n",
    "JFQA_URL = f\"{BASE_URL}/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n",
    "\n",
    "# ------------------ Helpers ------------------\n",
    "def fetch_latest_issue(url: str) -> list[dict]:\n",
    "    \"\"\"Return unique {title, link} dicts for each research article.\"\"\"\n",
    "    soup = BeautifulSoup(\n",
    "        requests.get(url, headers={\"User-Agent\": UA}, timeout=30).text,\n",
    "        \"html.parser\"\n",
    "    )\n",
    "\n",
    "    anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n",
    "    seen, articles = set(), []\n",
    "    for a in anchors:\n",
    "        href = a[\"href\"].split(\"?\")[0]  # strip tracking params\n",
    "        if href in seen:  # de-duplicate\n",
    "            continue\n",
    "        seen.add(href)\n",
    "        title = a.get_text(\" \", strip=True)\n",
    "        full = urljoin(BASE_URL, href)\n",
    "        articles.append({\"title\": title, \"link\": full})\n",
    "    print(f\"Found {len(articles)} unique article links.\")\n",
    "    return articles\n",
    "\n",
    "def fetch_article_details(link: str) -> dict:\n",
    "    soup = BeautifulSoup(\n",
    "        requests.get(link, headers={\"User-Agent\": UA}, timeout=30).text,\n",
    "        \"html.parser\"\n",
    "    )\n",
    "\n",
    "    # abstract\n",
    "    abs_tag = soup.find(\"div\", class_=\"abstract\")\n",
    "    abstract = abs_tag.get_text(\" \", strip=True) if abs_tag else \"N/A\"\n",
    "\n",
    "    # publication date (meta is most reliable)\n",
    "    meta_date = soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n",
    "    pub_date = meta_date[\"content\"] if meta_date else \"N/A\"\n",
    "\n",
    "    # authors (multiple <meta name=\"citation_author\"> tags)\n",
    "    authors = [m[\"content\"] for m in soup.find_all(\"meta\",\n",
    "                                                   attrs={\"name\": \"citation_author\"})]\n",
    "    authors_str = \", \".join(authors) or \"N/A\"\n",
    "\n",
    "    return {\"abstract\": abstract, \"pub_date\": pub_date, \"authors\": authors_str}\n",
    "\n",
"def summarise(txt: str) -> str:\n",
|
||||
" prompt = (\"Summarise the following finance‑paper abstract in 2‑3 sentences, \"\n",
|
||||
" \"mentioning the question, method, and main finding.\\n\\n\"\n",
|
||||
" f\"Abstract:\\n{txt}\")\n",
|
||||
" try:\n",
|
||||
" rsp = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\",\n",
|
||||
" \"content\": \"You are a helpful finance research assistant.\"},\n",
|
||||
" {\"role\": \"user\", \"content\": prompt}],\n",
|
||||
" temperature=0.2, max_tokens=120\n",
|
||||
" )\n",
|
||||
" return rsp.choices[0].message.content.strip()\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"⚠️ summarise error → {e}\")\n",
|
||||
" return \"Summary unavailable.\"\n",
|
||||
"\n",
|
||||
"def scrape_jfqa_latest() -> None:\n",
|
||||
" for art in fetch_latest_issue(JFQA_URL):\n",
|
||||
" det = fetch_article_details(art[\"link\"])\n",
|
||||
" if det[\"abstract\"] == \"N/A\":\n",
|
||||
" print(f\"\\n📘 {art['title']} — no abstract found.\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" summary = summarise(det[\"abstract\"])\n",
|
||||
" print(f\"\\n📘 {art['title']}\")\n",
|
||||
" print(f\" Authors: {det['authors']}\")\n",
|
||||
" print(f\" Date : {det['pub_date']}\")\n",
|
||||
" print(f\" Journal: JFQA (Latest Issue)\")\n",
|
||||
" print(\" Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n",
|
||||
" print(\"-\" * 90)\n",
|
||||
" time.sleep(1.0) # polite gap between OpenAI calls\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" scrape_jfqa_latest()\n"
|
||||
],
|
||||
"id": "e20b182f6258f0be",
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
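For a quick smoke test of the pipeline, the helpers compose directly; a minimal sketch that summarises only the first article of the latest issue, assuming the cell above has been run so fetch_latest_issue, fetch_article_details, summarise, and JFQA_URL are all in scope:

    # Hypothetical quick check: exercise each helper once without a full scrape.
    articles = fetch_latest_issue(JFQA_URL)
    if articles:
        details = fetch_article_details(articles[0]["link"])
        print(details["authors"], details["pub_date"])
        print(summarise(details["abstract"]))

This keeps the run to one HTTP fetch per helper and a single OpenAI call, which is handy for checking that the Cambridge markup and the model settings still line up before looping over the whole issue.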