Add PDF summarizer notebook and supporting extractor script
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
import pymupdf # PyMuPDF
|
||||
def extract_text(pdf_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    The document's title and author (taken from the PDF metadata) are printed
    first, then the full extracted text is printed and returned. The document
    is opened with a context manager, so it is closed even if reading the
    metadata or extracting a page raises.

    Parameters:
        pdf_path (str): The file path to the PDF document.

    Returns:
        str: The text of all pages in page order, with a newline appended
            after each page's text.

    Raises:
        Any error raised by pymupdf while opening or reading the file is
        propagated unchanged to the caller.
    """
    with pymupdf.open(pdf_path) as doc:
        # PyMuPDF documents `metadata` as possibly None (e.g. for a document
        # that still needs a password), and individual keys may be absent;
        # fall back to an empty value instead of failing on the lookup.
        metadata = doc.metadata or {}
        print(f"Document title: {metadata.get('title', '')}")
        print(f"Document author: {metadata.get('author', '')}")

        # Join once instead of growing a string with += for every page.
        all_text = "".join(page.get_text() + "\n" for page in doc)

    print("\nText from all pages:")
    print(all_text)

    return all_text
|
||||
@@ -0,0 +1,254 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:30.691815Z",
|
||||
"start_time": "2025-10-20T08:01:30.689588Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import pdf_extractor\n",
|
||||
"import os\n",
|
||||
"from ollama import Client"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 20
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:32.070132Z",
|
||||
"start_time": "2025-10-20T08:01:32.064843Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.environ.get('OLLAMA_API_KEY')\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key found\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found\")\n"
|
||||
],
|
||||
"id": "7c1e78571e54895f",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"API key found\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 21
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:33.313806Z",
|
||||
"start_time": "2025-10-20T08:01:33.305667Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"client = Client(\n",
|
||||
" host=\"https://ollama.com\",\n",
|
||||
" headers={'Authorization': 'Bearer ' + os.environ.get('OLLAMA_API_KEY')}\n",
|
||||
")"
|
||||
],
|
||||
"id": "4be731227f848288",
|
||||
"outputs": [],
|
||||
"execution_count": 22
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:35.004035Z",
|
||||
"start_time": "2025-10-20T08:01:34.990890Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "pdf_content = pdf_extractor.extract_text(\"sample.pdf\")",
|
||||
"id": "912aacb46475d2ab",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Document title: \n",
|
||||
"Document author: \n",
|
||||
"\n",
|
||||
"Text from all pages:\n",
|
||||
"The Mountain Guardian\n",
|
||||
"High above the clouds, where the wind howled through jagged peaks and snow kissed the stone,\n",
|
||||
"there lived a man whose name few remembered. The villagers below called him Kaelen the Silent, a\n",
|
||||
"ghost among the mountains, a legend whispered around fires. For decades, no one had seen him\n",
|
||||
"descend, yet strange lights often danced in the night sky above the cliffs - lights that bent and\n",
|
||||
"shimmered like the northern auroras, though no aurora ever touched those skies.\n",
|
||||
"Kaelen had not always been alone. Once, he was a warrior - the greatest of his kind. Born with an\n",
|
||||
"unnatural power that hummed beneath his skin, he could command the very essence of the world:\n",
|
||||
"stones shifted at his will, rivers bent their flow, and storms obeyed his call. The elders had declared\n",
|
||||
"him chosen, a guardian meant to protect the realm. But power was a double-edged blade, and when\n",
|
||||
"war came, it cut too deep.\n",
|
||||
"In the final battle of the Age of Blades, Kaelen's strength saved thousands - and doomed just as\n",
|
||||
"many. In a moment of desperation, he unleashed his full might upon the invading armies, shattering\n",
|
||||
"the ground and swallowing them whole. The land itself screamed under the force. Cities crumbled,\n",
|
||||
"forests burned, and the blood of both friend and foe stained the soil. The war ended that day, but the\n",
|
||||
"cost was too high. Wracked with guilt, Kaelen vanished into the mountains, vowing never again to\n",
|
||||
"wield his gift.\n",
|
||||
"Years passed. Seasons turned. Legends grew. The world moved on, forgetting the man who once\n",
|
||||
"shaped its fate. But Kaelen did not forget. Each dawn, he stood at the edge of the cliff and watched\n",
|
||||
"the valley below - the rivers he had diverted, the scars he had carved into the land. He lived simply:\n",
|
||||
"gathering herbs, carving wooden charms, speaking to no one but the wind. Yet the power still\n",
|
||||
"thrummed beneath his skin, restless and waiting.\n",
|
||||
"One winter, a storm unlike any other swept through the mountains. Villages were buried beneath\n",
|
||||
"snow, and beasts from the frozen north roamed far beyond their borders. Among them came a\n",
|
||||
"darkness more terrible than any blizzard: an ancient force, long sealed away, had awakened. Its\n",
|
||||
"shadow crept across the land, devouring light and life alike. And with it came a name Kaelen\n",
|
||||
"thought he would never hear again - the Order of the Dawn, the same elders who had once called\n",
|
||||
"him guardian.\n",
|
||||
"They came to his mountain, desperate and broken. \"The world needs you,\" they said. \"Only you can\n",
|
||||
"stop this.\"\n",
|
||||
"\n",
|
||||
"Kaelen turned away. \"The world needs peace,\" he whispered. \"And I am no bringer of peace.\"\n",
|
||||
"But the cries of the valley reached him - the weeping of children, the howls of the dying, the\n",
|
||||
"whispers of a world on the brink. The guilt he had carried for decades began to shift, transforming\n",
|
||||
"into something else: resolve. Perhaps his power was never meant to destroy or to save. Perhaps it\n",
|
||||
"was meant to balance - to stand between chaos and order.\n",
|
||||
"At dawn, Kaelen descended the mountain for the first time in forty years. His footsteps shook the\n",
|
||||
"ground. The wind followed in his wake. The villagers stared in awe as the man from legend walked\n",
|
||||
"among them, cloak billowing like a storm cloud.\n",
|
||||
"The darkness waited beyond the valley, patient and hungry. Kaelen felt its presence - ancient,\n",
|
||||
"powerful, and mocking. But he did not falter. This time, he would not wield his gift as a weapon of\n",
|
||||
"wrath. This time, he would master it.\n",
|
||||
"And as the first clash of power shook the heavens, the world realized that the guardian had returned\n",
|
||||
"- not as a destroyer, not as a savior, but as a man who understood that true strength lies not in\n",
|
||||
"isolation, but in purpose.\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 23
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:36.914001Z",
|
||||
"start_time": "2025-10-20T08:01:36.911275Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"system_prompt = \"\"\"You are a snarky assistant that analyzes the contents of a pdf,\n",
|
||||
"and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n",
|
||||
"Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\"\"\"\n",
|
||||
"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
" Here are the contents of a pdf.\n",
|
||||
" Provide a short summary of this pdf.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"user_prompt += pdf_content\n"
|
||||
],
|
||||
"id": "a665eb55a5cce433",
|
||||
"outputs": [],
|
||||
"execution_count": 24
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:01:38.255895Z",
|
||||
"start_time": "2025-10-20T08:01:38.253714Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
"]"
|
||||
],
|
||||
"id": "9cf97ff1a01c4a0b",
|
||||
"outputs": [],
|
||||
"execution_count": 25
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:05:57.525835Z",
|
||||
"start_time": "2025-10-20T08:05:57.522774Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "response = client.chat('gpt-oss:120b-cloud', messages=messages, stream=True)",
|
||||
"id": "3c08773150a59b12",
|
||||
"outputs": [],
|
||||
"execution_count": 41
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-20T08:06:02.788455Z",
|
||||
"start_time": "2025-10-20T08:05:59.261571Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from IPython.display import display, Markdown\n",
|
||||
"\n",
|
||||
"output = \"\"\n",
|
||||
"for part in response:\n",
|
||||
" content = part['message']['content']\n",
|
||||
" output += content\n",
|
||||
" # print(content, end='', flush=True)\n",
|
||||
"\n",
|
||||
"display(Markdown(output))\n"
|
||||
],
|
||||
"id": "13553a2bef707111",
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
],
|
||||
"text/markdown": "## TL;DR: The “Mountain Guardian” is basically **Brooding Goliath #12** \n\n- **Kaelen the Silent**: Once a god‑level warrior who could bend rocks, rivers, and storms to his whims. Think “Avatar” meets “Grumpy Old Man”. \n- **War trauma**: He demolished an entire invading army, erased whole cities, and then got a massive case of *oops‑I‑did‑that* guilt, so he retreated to his alpine Airbnb for 40 years. \n- **Mountaintop hermit life**: Collects herbs, carves wooden charms, and talks to the wind—basically a D&D NPC with an overpowered “power‑under‑the‑skin” passive. \n- **Plot twist**: A cosmic snow‑storm + ancient evil + the Order of the Dawn (the same folks who called him “guardian”) knock on his door. “World needs you!” they cry. \n- **Kaelen’s epiphany**: “Peace = staying on my mountain” → “Maybe I can actually *use* my powers without blowing everything up.” \n- **Climactic comeback**: He finally descends, shakes the valley (literally), and fights the darkness—not as a smiting juggernaut, but as a reluctantly responsible adult with a purpose. \n\n**Bottom line:** A brooding, guilt‑ridden superhero finally decides to get off his rock and do his job. The moral? Even the biggest hermits can’t ignore the world forever—especially when it’s on fire. 🌋🗻✨"
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data",
|
||||
"jetTransient": {
|
||||
"display_id": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"execution_count": 42
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
Reference in New Issue
Block a user