import pymupdf  # PyMuPDF


def extract_text(pdf_path, *, verbose=True):
    """Extract the text of every page of a PDF and return it as one string.

    The document is opened with PyMuPDF inside a ``with`` block, so it is
    closed even when reading the metadata or a page raises.

    Parameters:
        pdf_path (str): The file path to the PDF document.
        verbose (bool): Keyword-only. When true (the default, which matches
            the previous behaviour) the document title, author and the full
            extracted text are printed. Pass ``verbose=False`` to extract
            without writing anything to stdout.

    Returns:
        str: The text of all pages in document order, with a newline
        appended after each page's text.

    Raises:
        Whatever ``pymupdf.open`` or ``page.get_text`` raise (for example on
        a missing or unreadable file); nothing is caught here.
    """
    with pymupdf.open(pdf_path) as doc:
        if verbose:
            # Fall back to an empty dict / empty string so a document that
            # carries no metadata does not abort the extraction.
            metadata = doc.metadata or {}
            print(f"Document title: {metadata.get('title', '')}")
            print(f"Document author: {metadata.get('author', '')}")

        # Build the result with a single join instead of growing a string
        # with += once per page.
        all_text = "".join(page.get_text() + "\n" for page in doc)

    if verbose:
        print("\nText from all pages:")
        print(all_text)

    return all_text
'Bearer ' + os.environ.get('OLLAMA_API_KEY')}\n", + ")" + ], + "id": "4be731227f848288", + "outputs": [], + "execution_count": 22 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-20T08:01:35.004035Z", + "start_time": "2025-10-20T08:01:34.990890Z" + } + }, + "cell_type": "code", + "source": "pdf_content = pdf_extractor.extract_text(\"sample.pdf\")", + "id": "912aacb46475d2ab", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document title: \n", + "Document author: \n", + "\n", + "Text from all pages:\n", + "The Mountain Guardian\n", + "High above the clouds, where the wind howled through jagged peaks and snow kissed the stone,\n", + "there lived a man whose name few remembered. The villagers below called him Kaelen the Silent, a\n", + "ghost among the mountains, a legend whispered around fires. For decades, no one had seen him\n", + "descend, yet strange lights often danced in the night sky above the cliffs - lights that bent and\n", + "shimmered like the northern auroras, though no aurora ever touched those skies.\n", + "Kaelen had not always been alone. Once, he was a warrior - the greatest of his kind. Born with an\n", + "unnatural power that hummed beneath his skin, he could command the very essence of the world:\n", + "stones shifted at his will, rivers bent their flow, and storms obeyed his call. The elders had declared\n", + "him chosen, a guardian meant to protect the realm. But power was a double-edged blade, and when\n", + "war came, it cut too deep.\n", + "In the final battle of the Age of Blades, Kaelen's strength saved thousands - and doomed just as\n", + "many. In a moment of desperation, he unleashed his full might upon the invading armies, shattering\n", + "the ground and swallowing them whole. The land itself screamed under the force. Cities crumbled,\n", + "forests burned, and the blood of both friend and foe stained the soil. The war ended that day, but the\n", + "cost was too high. 
Wracked with guilt, Kaelen vanished into the mountains, vowing never again to\n", + "wield his gift.\n", + "Years passed. Seasons turned. Legends grew. The world moved on, forgetting the man who once\n", + "shaped its fate. But Kaelen did not forget. Each dawn, he stood at the edge of the cliff and watched\n", + "the valley below - the rivers he had diverted, the scars he had carved into the land. He lived simply:\n", + "gathering herbs, carving wooden charms, speaking to no one but the wind. Yet the power still\n", + "thrummed beneath his skin, restless and waiting.\n", + "One winter, a storm unlike any other swept through the mountains. Villages were buried beneath\n", + "snow, and beasts from the frozen north roamed far beyond their borders. Among them came a\n", + "darkness more terrible than any blizzard: an ancient force, long sealed away, had awakened. Its\n", + "shadow crept across the land, devouring light and life alike. And with it came a name Kaelen\n", + "thought he would never hear again - the Order of the Dawn, the same elders who had once called\n", + "him guardian.\n", + "They came to his mountain, desperate and broken. \"The world needs you,\" they said. \"Only you can\n", + "stop this.\"\n", + "\n", + "Kaelen turned away. \"The world needs peace,\" he whispered. \"And I am no bringer of peace.\"\n", + "But the cries of the valley reached him - the weeping of children, the howls of the dying, the\n", + "whispers of a world on the brink. The guilt he had carried for decades began to shift, transforming\n", + "into something else: resolve. Perhaps his power was never meant to destroy or to save. Perhaps it\n", + "was meant to balance - to stand between chaos and order.\n", + "At dawn, Kaelen descended the mountain for the first time in forty years. His footsteps shook the\n", + "ground. The wind followed in his wake. 
The villagers stared in awe as the man from legend walked\n", + "among them, cloak billowing like a storm cloud.\n", + "The darkness waited beyond the valley, patient and hungry. Kaelen felt its presence - ancient,\n", + "powerful, and mocking. But he did not falter. This time, he would not wield his gift as a weapon of\n", + "wrath. This time, he would master it.\n", + "And as the first clash of power shook the heavens, the world realized that the guardian had returned\n", + "- not as a destroyer, not as a savior, but as a man who understood that true strength lies not in\n", + "isolation, but in purpose.\n", + "\n", + "\n" + ] + } + ], + "execution_count": 23 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-20T08:01:36.914001Z", + "start_time": "2025-10-20T08:01:36.911275Z" + } + }, + "cell_type": "code", + "source": [ + "system_prompt = \"\"\"You are a snarky assistant that analyzes the contents of a pdf,\n", + "and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n", + "Respond in markdown. 
Do not wrap the markdown in a code block - respond just with the markdown.\"\"\"\n", + "\n", + "user_prompt = \"\"\"\n", + " Here are the contents of a pdf.\n", + " Provide a short summary of this pdf.\n", + "\"\"\"\n", + "\n", + "user_prompt += pdf_content\n" + ], + "id": "a665eb55a5cce433", + "outputs": [], + "execution_count": 24 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-20T08:01:38.255895Z", + "start_time": "2025-10-20T08:01:38.253714Z" + } + }, + "cell_type": "code", + "source": [ + "messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + "]" + ], + "id": "9cf97ff1a01c4a0b", + "outputs": [], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-20T08:05:57.525835Z", + "start_time": "2025-10-20T08:05:57.522774Z" + } + }, + "cell_type": "code", + "source": "response = client.chat('gpt-oss:120b-cloud', messages=messages, stream=True)", + "id": "3c08773150a59b12", + "outputs": [], + "execution_count": 41 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-20T08:06:02.788455Z", + "start_time": "2025-10-20T08:05:59.261571Z" + } + }, + "cell_type": "code", + "source": [ + "from IPython.display import display, Markdown\n", + "\n", + "output = \"\"\n", + "for part in response:\n", + " content = part['message']['content']\n", + " output += content\n", + " # print(content, end='', flush=True)\n", + "\n", + "display(Markdown(output))\n" + ], + "id": "13553a2bef707111", + "outputs": [ + { + "data": { + "text/plain": [ + "" + ], + "text/markdown": "## TL;DR: The “Mountain Guardian” is basically **Brooding Goliath #12** \n\n- **Kaelen the Silent**: Once a god‑level warrior who could bend rocks, rivers, and storms to his whims. Think “Avatar” meets “Grumpy Old Man”. 
\n- **War trauma**: He demolished an entire invading army, erased whole cities, and then got a massive case of *oops‑I‑did‑that* guilt, so he retreated to his alpine Airbnb for 40 years. \n- **Mountaintop hermit life**: Collects herbs, carves wooden charms, and talks to the wind—basically a D&D NPC with an overpowered “power‑under‑the‑skin” passive. \n- **Plot twist**: A cosmic snow‑storm + ancient evil + the Order of the Dawn (the same folks who called him “guardian”) knock on his door. “World needs you!” they cry. \n- **Kaelen’s epiphany**: “Peace = staying on my mountain” → “Maybe I can actually *use* my powers without blowing everything up.” \n- **Climactic comeback**: He finally descends, shakes the valley (literally), and fights the darkness—not as a smiting juggernaut, but as a reluctantly responsible adult with a purpose. \n\n**Bottom line:** A brooding, guilt‑ridden superhero finally decides to get off his rock and do his job. The moral? Even the biggest hermits can’t ignore the world forever—especially when it’s on fire. 
🌋🗻✨" + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + } + ], + "execution_count": 42 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/_mansoor/Week1Day1/pdf_summarizer/sample.pdf b/week1/community-contributions/_mansoor/Week1Day1/pdf_summarizer/sample.pdf new file mode 100644 index 0000000000000000000000000000000000000000..80130941d73fbc5d64b13e2062ef3c7fd0b5264e GIT binary patch literal 3731 zcmbVPcU)7~7grh8hyp@EhVTR_tB?VKu#vF$Ml7D23OaBXaF#P z>>mu#*N3`BhT;IIqhBBn0^Y-MBq{(0SDhhHXB>r03&-Fn0OEH5O(s#nbqb)#of{fL za3m~O3U~ThnmL|8#f1Yys62d4P#@zSmy^Vh6dX{9IDSwQgh<% zVb3lwJ$hQAI!)c4&L^ZoIg{73&)H%nnzCnS#G^8`FQR0M3>Vs)3$s$T)(}c-_ zP@hrYte)EZID5LQV&c97-*h|Xf|uxYuJ^?Db+0Ll;-n@eR_!3PUe+>q#LcO-xh#yL zJYp2|m3QIs3v zmAC`GS6Gb{&m~*QH%e>`^(OgCS5No7DPNI1(UQ8#D91V~W$VSUckQ^{(`oU|J?#TP z$hvWF<}|1KJsK-02Z0E@ zU>ja;?2O6oH_-Jx7htL7R5(d{q}Go=q_W})~tGK`J1GfLiXJMAv0@>*=*-=%kF7;YpY8IR_=Ba>s|LET&gY76(j9b zaur)~ittz6I__;*Oc8%ZXl;41S|VDZ_~#Jzg!)cLp`as#0+TSM2{iwF#<_ry%eGj*Q!=a-(qlqQk3vAS#M%Zs|(_^YC1oW>WF$169r`3@VfRAO6? 
z9ueJ<+te<3RE>h@rakI<5VKY}5F8U3F=8at90Cnvq-)ynklSb8SSl{+5u~oIX1+}z zmMB-E31_1F!aFNXJ)L=K#W<&+!ybhF%lUa#yGmeem9muSUa^Xgg{od6san6~&DDu~ zvvQKS$cpO}Cx+A_E;S@P%|pCBLfou{3WmgO9kY6|DpywnS=*BlCZ?T8fUG&p*@;;E zN<4TL^NpkD!&1#R<0V3zp+V=kc84;?!|(W4Peg?YC2aPIPnns8^+@1ZBh8Nev!Wxj z0*8&dB8)vrRxd3T#qTCRnT8!XzhBDDnpdKhqR%WIbSJ02dM1eUsT&y2>d3p&nvseQ zu0L>UcWI?kj=N!0+stbU(sMaB$}=$4Dpj2F@eTXA+bH2D(V|H>zCLS2oawxAJXV@c z-eTN6f5%3z_12t6ZkpUODMvu8spFP)&!Wmnn^_&cLt86muF|inoG?4rE648HYXqTp z*IX&t5@osFnvXGaN~%a?MO6#$Ydhm&^#0quO1ZpgxT)3VokFx-Wc14p;n=OL4#aJ> z+?kgVo6)NPvb+ZGhOH{f=!uNCT?&ou^dya)SB`32hqzCoOvc(nb>(;Wy!Wyep|)RG z9(lvNdD}h=GbjH<#5ae$tCPhb-pS#VU%DZ6K(HA?;YXyzb(r(po1fY|=e-&;5$8@R zzV9`X{wt3n$7Eqt+g{#G-AZ_Ry#>RM#awhxGThac#)smhNY~*>)z;Pv9CCmTEO69ZC-2#I>vOX$R zC$GNiT9M!6grI2!#phBCHp4S}C}ppBn&wp97H5Ol2}Vf0_nDX2gK6IH+;6rqOk$qTm zF<7u}V@1KqWJvvJ?)0sV#^9M%YvD-qXvuj~pXECbPaKx08#w=**%@^~u;XU^I)&1n zBYi5)@y6XPBcUtS^VJek8zfC@y?0t3MaailnEp1>Tef`+nP4GmS2mI+lGlb1{FOGG5no4)t@1^wSD;VUHGe>vK9)Z%)-|nw?z}W7Uh`n^dQVPonw$iA zd*CfAlYH+6-j--@nA#BOt@W-X)tM4pV1)orfaT>5JC`{rnxv6~ zA^x&LKKk95UDK!`Hh2Z7t2GrHw)<`3Zue~DqXxCqu~AA<^ucAypb74{FHvXR;eC~Q z7x4&HQ!FZdRJd+#XyxjljLiJd!c9bpz*I<~!g45!n8I4w|J?{buYH#qQ0R5v_zfzV zWkN>_i^rGWgeGXV1-|gexVQGiig<9YnP8E$$urLDQ|!yHZqs<0xoEq!d&83F_CG&s zZeRP4u9pIV+TgJi;0JC`>If8o z(uAv{bdEv(Y=X7_F5DdKmLX6RKMIaJ0-%;SLIjSA$M~t4kO^1_)P?3x{nUGNe=%?f z)Qkjn%Xm^C0QJC=4wESOFTh`vp#HUz|0bW?=>P5^kPL*AF)lbN@B?M+c3t zB7#K0ywT8`rrT#a9siN2Xh3#eP3tp`$La7_y^-KQ~=xpPXBb9&es~T zGz>@|lpG_?xk};c+Uf`(h)NCBg+fCk146L@>Vaf(AmJ0SP{)voP-QSuAfKS($s|*d zasW!Ex(FBo35OwIFl{7EQ%m_T%HZ4xezEWOKReow>PH|4a=A|+$aFqaABRvJ=`e;X z^h;hq&GA%AfJ7q@Kx4UkZT%w2G|(dsBpjMdpb<%b(L)c%1wdc`91el~aRFLLqy`cQ z06t>~6dcSo?gfy(U>b1PKQM3u=%cUq!C)xR$6sS`lm-~RzhOFDi@(MYIxsLQUt=2o zM~4Oi{tvwx8er=Gah4kHhbQ2|K@tfj8=isV+6#3ilR*N>^)r|OR-^zjz>V;y9H3DB U!l|EJqoJvRfG8-KIhaHK2Z>@#lmGw# literal 0 HcmV?d00001