Merge pull request #460 from mkash96/reddit-analyser
Add Day 1 & 2 Reddit Analysis notebooks, PDF, and README to community…
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,409 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9413d98a-352a-47b7-b84b-5b4a61b3c002",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reddit Post Analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "97ebfa77-33f8-4cd1-9204-d73aeefc0fea",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. **Sets the Role and Tone** \n",
|
||||
" Instructs the AI to act as an **expert analyst** specializing in extracting insights from online forums like Reddit.\n",
|
||||
"\n",
|
||||
"2. **Guides Sentiment Analysis** \n",
|
||||
" Asks the AI to evaluate overall sentiment (e.g., positive, neutral, negative), and to present it as approximate percentages with a brief rationale.\n",
|
||||
"\n",
|
||||
"3. **Groups and Labels Themes** \n",
|
||||
" Instructs the AI to identify and cluster **key discussion themes**, perspectives, and emotional tones. Each theme should be explained and illustrated with **example comments**.\n",
|
||||
"\n",
|
||||
"4. **Creates an Insights Table** \n",
|
||||
" Requests a structured table with fields like *Perspectives, Frustrations, Tools, Suggestions* to concisely summarize the discussion’s core insights.\n",
|
||||
"\n",
|
||||
"5. **Describes Community Dynamics** \n",
|
||||
" Asks the AI to assess the **interaction style** (e.g., supportive, sarcastic, argumentative) and note any social patterns (e.g., consensus or conflict)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "425868ba-faec-4754-87f5-650f7529b319",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9596f40f-5add-4602-91e3-cd7d2c753c33",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import praw\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display, Image\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9e1a9999-4aad-416d-90fe-3b0841a4f455",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Load Credentials"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "847843ce-ebf9-4f48-b625-82e3ed687c81",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c615d79b-55a0-4eb1-ad8b-a2e28c11b49e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reddit = praw.Reddit(\n",
|
||||
" client_id=os.getenv(\"REDDIT_CLIENT_ID\"),\n",
|
||||
" client_secret=os.getenv(\"REDDIT_CLIENT_SECRET\"),\n",
|
||||
" user_agent=os.getenv(\"REDDIT_USER_AGENT\"),\n",
|
||||
" username=os.getenv(\"REDDIT_USERNAME\"),\n",
|
||||
" password=os.getenv(\"REDDIT_PASSWORD\")\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Authenticated as:\", reddit.user.me())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6df2224d-ecfd-4e07-9bc8-102eff257d69",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "21ba0482-79e5-45ec-81d7-8611312c6b9e",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Reddit Post Scraper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8dc5276d-2d38-4651-9db0-c353076d6096",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"class RedditPostScraper:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.submission = reddit.submission(url=url)\n",
|
||||
" self.submission.comments.replace_more(limit=None)\n",
|
||||
" self._title = self.submission.title\n",
|
||||
" self._text = self.submission.selftext\n",
|
||||
" self._comments = \"\"\n",
|
||||
" self._formatted_comments = [] # for reprocessing if needed\n",
|
||||
"\n",
|
||||
" def _generate_comments(self):\n",
|
||||
" comments_list = []\n",
|
||||
" for top_level in self.submission.comments:\n",
|
||||
" top_author = top_level.author.name if top_level.author else \"[deleted]\"\n",
|
||||
" comments_list.append(f\"{top_author}: {top_level.body}\")\n",
|
||||
"\n",
|
||||
" for reply in top_level.replies:\n",
|
||||
" reply_author = reply.author.name if reply.author else \"[deleted]\"\n",
|
||||
" comments_list.append(\n",
|
||||
" f\"{reply_author} replied to {top_author}'s comment: {reply.body}\"\n",
|
||||
" )\n",
|
||||
" self._formatted_comments = comments_list\n",
|
||||
"\n",
|
||||
" def title(self):\n",
|
||||
" return f\"Title:\\n{self._title}\\n{self._text}\"\n",
|
||||
"\n",
|
||||
" def comments(self, max_words=None):\n",
|
||||
" if not self._formatted_comments:\n",
|
||||
" self._generate_comments()\n",
|
||||
"\n",
|
||||
" output_comments = []\n",
|
||||
" total_words = 0\n",
|
||||
"\n",
|
||||
" for comment in self._formatted_comments:\n",
|
||||
" word_count = len(comment.split())\n",
|
||||
" if max_words and total_words + word_count > max_words:\n",
|
||||
" break\n",
|
||||
" output_comments.append(comment)\n",
|
||||
" total_words += word_count\n",
|
||||
"\n",
|
||||
" return \"Text:\\n\" + \"\\n\\n\".join(output_comments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3121cad0-4e2c-4d78-88e2-e72c6b99e2bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# post = RedditPostScraper(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")\n",
|
||||
"# print(post.title())\n",
|
||||
"# print(post.comments(2000))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "569760f6-5d68-40c1-9227-374c8e04d70a",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### System and User Prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "22c0e89a-c076-4616-ae9b-b4cd588f39ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = '''You are an expert analyst specializing in extracting insights from online discussion forums. You will be given the title of a Reddit post and a list of comments (some with replies). Your task is to analyze the sentiment of the discussion and extract structured insights that reflect the collective responses.\n",
|
||||
"\n",
|
||||
"Your response **must be in well-formatted Markdown**. Use clear section headers (`##`, `###`), bullet points, and tables where appropriate.\n",
|
||||
"\n",
|
||||
"Perform the following tasks:\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 1. Overall Sentiment Breakdown\n",
|
||||
"\n",
|
||||
"- Determine the overall sentiment of the responses (e.g., positive, negative, neutral, mixed).\n",
|
||||
"- Express the sentiment as approximate percentages (e.g., 60% positive, 25% neutral, 15% negative).\n",
|
||||
"- Provide a short explanation for why the sentiment skews this way, referring to tone, topic sensitivity, controversy, humor, or supportiveness.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 2. Thematic Grouping of Comments\n",
|
||||
"\n",
|
||||
"- Identify key recurring **themes, perspectives, or discussion threads** in the comments.\n",
|
||||
"- For each theme, create a subheading.\n",
|
||||
"- Under each:\n",
|
||||
" - Briefly describe the focus or tone of that cluster (e.g., personal stories, criticism, questions, jokes).\n",
|
||||
" - Include 1–2 **example comments** using quote formatting (`>`), preferably ones with replies or high engagement.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 3. Insights Table\n",
|
||||
"\n",
|
||||
"If applicable, extract and structure insights into the following table. Leave any column empty if it’s not relevant to the post type:\n",
|
||||
"\n",
|
||||
"| Perspectives/ Motivations | Pains/ Concerns/ Frustrations | Tools / References / Resources | Suggestions / Solutions |\n",
|
||||
"|-------------------------------|----------------------------------|--------------------------------------|------------------------------------|\n",
|
||||
"| - ... | - ... | - ... | - ... |\n",
|
||||
"\n",
|
||||
"- Populate this table with concise bullet points.\n",
|
||||
"- Adapt categories to match the discussion type (e.g., switch \"Suggestions\" to \"Reactions\" if it's a news thread).\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## 4. Tone and Community Dynamics\n",
|
||||
"\n",
|
||||
"- Comment on the **style and culture** of interaction: humor, sarcasm, empathy, trolling, intellectual debate, etc.\n",
|
||||
"- Mention any noticeable social dynamics: agreement/disagreement, echo chambers, respectful debate, or hostility.\n",
|
||||
"- Include casual or emotional comments if they illustrate community personality.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"**Respond only in well-formatted Markdown.** Structure your output for clarity and insight, suitable for rendering in documentation, reports, or dashboards. Do not summarize every comment — focus on patterns, perspectives, and collective signals.\n",
|
||||
"\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cf9d15d6-4f9a-45fd-96ed-d7097c7f03d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(post):\n",
|
||||
" user_prompt = f\"You are looking at a Reddit discussion titled:\\n\\n{post.title()}\\n\\n\"\n",
|
||||
" user_prompt += \"Below are the responses from various users. Analyze them according to the system prompt provided.\\n\"\n",
|
||||
" user_prompt += \"Make sure your response is structured in Markdown with headers, lists, and tables as instructed.\\n\\n\"\n",
|
||||
" user_prompt += post.comments(4000)\n",
|
||||
" return user_prompt\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f18c581c-ea30-4a43-9223-8c184dedb37e",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Generating Responses"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aadf8f41-aca3-41be-b18b-cb49a67ba256",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "feac9c61-f1f8-48f0-9189-bc60ac7fd755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize(url):\n",
|
||||
" website = RedditPostScraper(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12b1d6dd-2d62-4136-8b8e-0a92134d4261",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# summarize(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd48253d-cdca-4c29-b4f2-c470290de63b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7e0825a9-a3b0-43a0-b69c-cf0ce81d77d2",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Example Usage"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a61a482-ec70-4e29-b99c-0d82298a32b1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1a336777-a06e-4535-b68d-a6470eb1d701",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://www.reddit.com/r/AskReddit/comments/1lam10k/how_do_you_feel_about_the_no_kings_protest/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b6b12074-ffb6-4a6d-bdd2-bbbb78f82781",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://www.reddit.com/r/canada/comments/1laq8ok/donald_trump_is_a_convicted_felon_could_he_be/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "63b805e5-183f-439b-bfe7-9ee6bbe4a5b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
# Reddit Post Analyzer – GPT & Open Source Approaches
|
||||
|
||||
This project consists of two Jupyter notebooks that demonstrate different methods for analyzing Reddit post data:
|
||||
|
||||
- **Day 1:** `Day1_RedditAnalysis_gpt.ipynb` – Uses GPT-based sentiment and insight extraction from Reddit posts and comments.
|
||||
- **Day 2:** `day2_RedditAnalysis_opensource.ipynb` – Implements an open-source alternative for Reddit data processing and basic sentiment/thematic analysis.
|
||||
|
||||
---
|
||||
|
||||
## 📌 Features
|
||||
|
||||
- Reddit post and comment scraping using PRAW
|
||||
- GPT-based sentiment summarization and insight structuring (Day 1)
|
||||
- Open-source sentiment and thematic analysis pipeline (Day 2)
|
||||
- Markdown-formatted output suitable for reporting
|
||||
|
||||
---
|
||||
|
||||
## 🛠️ Setup Instructions
|
||||
|
||||
### Reddit API Credentials Setup
|
||||
|
||||
To access Reddit data, you need to create a Reddit app and obtain credentials:
|
||||
|
||||
#### Steps to Get Your Reddit API Keys:
|
||||
|
||||
1. Go to [https://www.reddit.com/prefs/apps](https://www.reddit.com/prefs/apps).
|
||||
2. Scroll to the bottom and click **“create another app”** or **“create app”**.
|
||||
3. Choose the **“script”** option.
|
||||
4. Fill in the following fields:
|
||||
- **name:** e.g., Reddit Analyzer
|
||||
- **redirect uri:** `http://localhost:8080`
|
||||
- **description:** *(optional)*
|
||||
5. After creating the app, you will get:
|
||||
- **client ID** (displayed under the app name)
|
||||
- **client secret**
|
||||
6. Keep note of your Reddit **username** and **password** (these are used with script apps)
|
||||
|
||||
#### Store your credentials in a `.env` file:
|
||||
|
||||
Create a `.env` file in the root directory with the following format:
|
||||
|
||||
```env
|
||||
REDDIT_CLIENT_ID=your_client_id
|
||||
REDDIT_CLIENT_SECRET=your_client_secret
|
||||
REDDIT_USER_AGENT=your_custom_user_agent
|
||||
REDDIT_USERNAME=your_reddit_username
|
||||
REDDIT_PASSWORD=your_reddit_password
|
||||
```
|
||||
|
||||
These will be securely loaded into your script using the `dotenv` package.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Running the Notebooks
|
||||
|
||||
Make sure to activate your virtual environment (if applicable), install dependencies, and run the notebooks cell by cell in **Jupyter Lab** or **VS Code**.
|
||||
|
||||
---
|
||||
@@ -0,0 +1,436 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c22d46c-d08b-4dbd-bdf5-338adce95e1a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reddit Post Analysis using open source models (llama 3.2, deepseek r1, mistral:7b)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bfc5335b-53a8-4cd1-b1a8-95496ae4856d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. **Sets the Role and Tone** \n",
|
||||
" Instructs the AI to act as an **expert analyst** specializing in extracting insights from online forums like Reddit.\n",
|
||||
"\n",
|
||||
"2. **Guides Sentiment Analysis** \n",
|
||||
" Asks the AI to evaluate overall sentiment (e.g., positive, neutral, negative), and to present it as approximate percentages with a brief rationale.\n",
|
||||
"\n",
|
||||
"3. **Groups and Labels Themes** \n",
|
||||
" Instructs the AI to identify and cluster **key discussion themes**, perspectives, and emotional tones. Each theme should be explained and illustrated with **example comments**.\n",
|
||||
"\n",
|
||||
"4. **Creates an Insights Table** \n",
|
||||
" Requests a structured table with fields like *Perspectives, Frustrations, Tools, Suggestions* to concisely summarize the discussion’s core insights.\n",
|
||||
"\n",
|
||||
"5. **Describes Community Dynamics** \n",
|
||||
" Asks the AI to assess the **interaction style** (e.g., supportive, sarcastic, argumentative) and note any social patterns (e.g., consensus or conflict)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6104a23f-c43a-48dc-a018-cddb8bea75d1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import praw\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "07de5c1d-1930-49ca-a026-2265e5432327",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Load Credentials"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "83fdd570-83a3-4e18-a94e-969c557978d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"reddit = praw.Reddit(\n",
|
||||
" client_id=os.getenv(\"REDDIT_CLIENT_ID\"),\n",
|
||||
" client_secret=os.getenv(\"REDDIT_CLIENT_SECRET\"),\n",
|
||||
" user_agent=os.getenv(\"REDDIT_USER_AGENT\"),\n",
|
||||
" username=os.getenv(\"REDDIT_USERNAME\"),\n",
|
||||
" password=os.getenv(\"REDDIT_PASSWORD\")\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Authenticated as:\", reddit.user.me())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6a8a58d8-6755-4e22-be97-232c2f7ea07c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f6b5b086-a4aa-40d2-a721-b3b8781d7ccf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reddit Post Scraper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "09c7a428-db62-4353-9fa5-d12bbdc4477c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class RedditPostScraper:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.submission = reddit.submission(url=url)\n",
|
||||
" self.submission.comments.replace_more(limit=None)\n",
|
||||
" self._title = self.submission.title\n",
|
||||
" self._text = self.submission.selftext\n",
|
||||
" self._comments = \"\"\n",
|
||||
" self._formatted_comments = [] # for reprocessing if needed\n",
|
||||
"\n",
|
||||
" def _generate_comments(self):\n",
|
||||
" comments_list = []\n",
|
||||
" for top_level in self.submission.comments:\n",
|
||||
" top_author = top_level.author.name if top_level.author else \"[deleted]\"\n",
|
||||
" comments_list.append(f\"{top_author}: {top_level.body}\")\n",
|
||||
"\n",
|
||||
" for reply in top_level.replies:\n",
|
||||
" reply_author = reply.author.name if reply.author else \"[deleted]\"\n",
|
||||
" comments_list.append(\n",
|
||||
" f\"{reply_author} replied to {top_author}'s comment: {reply.body}\"\n",
|
||||
" )\n",
|
||||
" self._formatted_comments = comments_list\n",
|
||||
"\n",
|
||||
" def title(self):\n",
|
||||
" return f\"Title:\\n{self._title}\\n{self._text}\"\n",
|
||||
"\n",
|
||||
" def comments(self, max_words=None):\n",
|
||||
" if not self._formatted_comments:\n",
|
||||
" self._generate_comments()\n",
|
||||
"\n",
|
||||
" output_comments = []\n",
|
||||
" total_words = 0\n",
|
||||
"\n",
|
||||
" for comment in self._formatted_comments:\n",
|
||||
" word_count = len(comment.split())\n",
|
||||
" if max_words and total_words + word_count > max_words:\n",
|
||||
" break\n",
|
||||
" output_comments.append(comment)\n",
|
||||
" total_words += word_count\n",
|
||||
"\n",
|
||||
" return \"Text:\\n\" + \"\\n\\n\".join(output_comments)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3cece64a-ca54-4961-b04e-40f8057e2e78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### System and User Prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "029de240-398e-4339-b90c-e6e90a96bcb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = '''You are an expert analyst specializing in extracting insights from online discussion forums. You will be given the title of a Reddit post and a list of comments (some with replies). Your task is to analyze the sentiment of the discussion and extract structured insights that reflect the collective responses.\n",
|
||||
"Your response **must be in well-formatted Markdown**. Use clear section headers (`##`, `###`), bullet points, and tables where appropriate.\n",
|
||||
"Perform the following tasks:\n",
|
||||
"---\n",
|
||||
"## 1. Overall Sentiment Breakdown\n",
|
||||
"- Determine the overall sentiment of the responses (e.g., positive, negative, neutral, mixed).\n",
|
||||
"- Express the sentiment as approximate percentages (e.g., 60% positive, 25% neutral, 15% negative).\n",
|
||||
"- Provide a short explanation for why the sentiment skews this way, referring to tone, topic sensitivity, controversy, humor, or supportiveness.\n",
|
||||
"---\n",
|
||||
"## 2. Thematic Grouping of Comments\n",
|
||||
"- Identify key recurring **themes, perspectives, or discussion threads** in the comments.\n",
|
||||
"- For each theme, create a subheading.\n",
|
||||
"- Under each:\n",
|
||||
" - Briefly describe the focus or tone of that cluster (e.g., personal stories, criticism, questions, jokes).\n",
|
||||
" - Include 1–2 **example comments** using quote formatting (`>`), preferably ones with replies or high engagement.\n",
|
||||
"---\n",
|
||||
"## 3. Insights Table\n",
|
||||
"If applicable, extract and structure insights into the following table. Leave any column empty if it’s not relevant to the post type:\n",
|
||||
"| Perspectives/ Motivations | Pains/ Concerns/ Frustrations | Tools / References / Resources | Suggestions / Solutions |\n",
|
||||
"|-------------------------------|----------------------------------|--------------------------------------|------------------------------------|\n",
|
||||
"| - ... | - ... | - ... | - ... |\n",
|
||||
"- Populate this table with concise bullet points.\n",
|
||||
"- Adapt categories to match the discussion type (e.g., switch \"Suggestions\" to \"Reactions\" if it's a news thread).\n",
|
||||
"---\n",
|
||||
"## 4. Tone and Community Dynamics\n",
|
||||
"- Comment on the **style and culture** of interaction: humor, sarcasm, empathy, trolling, intellectual debate, etc.\n",
|
||||
"- Mention any noticeable social dynamics: agreement/disagreement, echo chambers, respectful debate, or hostility.\n",
|
||||
"- Include casual or emotional comments if they illustrate community personality.\n",
|
||||
"---\n",
|
||||
"**Respond only in well-formatted Markdown.** Structure your output for clarity and insight, suitable for rendering in documentation, reports, or dashboards. Do not summarize every comment — focus on patterns, perspectives, and collective signals.\n",
|
||||
"\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "350d8eea-005b-474e-9b57-cdb4004d8144",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(post):\n",
|
||||
" user_prompt = f\"You are looking at a Reddit discussion titled:\\n\\n{post.title()}\\n\\n\"\n",
|
||||
" user_prompt += \"Below are the responses from various users. Analyze them according to the system prompt provided.\\n\"\n",
|
||||
" user_prompt += \"Make sure your response is structured in Markdown with headers, lists, and tables as instructed.\\n\\n\"\n",
|
||||
" user_prompt += post.comments(1000)\n",
|
||||
" return user_prompt\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bf23ed3b-8583-444e-ac62-3d415f771462",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# post = RedditPostScraper(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")\n",
|
||||
"# print(post.title())\n",
|
||||
"# print(post.comments())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4e37f2e1-6eef-4c27-a442-97a6ff3dbf2a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Generating messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0781921b-e4e0-49f8-b34a-fd1017be6150",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "544c81a2-37c2-491e-8ef4-ac5d56173b72",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### llama 3.2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d3dd0a2a-ddf2-4bd1-823d-b49fa44a09ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"def summarizellama(url):\n",
|
||||
" website = RedditPostScraper(url)\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model = \"llama3.2\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "717ccb6d-f6c9-4f36-ad69-686f3f1bd26b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summaryllama(url):\n",
|
||||
" summary = summarizellama(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f981fe9-ed2d-4546-8fb3-c0f8048e3474",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summaryllama(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e3091dcf-f8b3-4d1a-a85c-3a9ebed2ac6c",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### deepseek"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "55e465fa-e29d-4ed3-8f44-71964d2f866b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"def summarizedeepseek(url):\n",
|
||||
" website = RedditPostScraper(url)\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model = \"deepseek-r1\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "40c26a89-97a8-4883-857a-fb13fea9222d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summarydeepseek(url):\n",
|
||||
" summary = summarizedeepseek(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "362b871e-8f4d-47fa-b01d-bbe3082dd271",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summarydeepseek(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3841bb1e-e885-4cb5-88f6-b6698ccbb77f",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Mistral"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6d913e07-31b4-439d-a861-c4fd99012588",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!ollama pull mistral:7b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ab881745-990c-4158-935b-36075c1dacde",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"def summarizeMistral(url):\n",
|
||||
" website = RedditPostScraper(url)\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model = \"mistral:7b\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d5de3db6-ba69-43e8-9f6c-0945dbafa308",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summaryMistral(url):\n",
|
||||
" summary = summarizeMistral(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7ea97e30-44be-45dc-ad2f-b6951ecc0190",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summaryMistral(\"https://www.reddit.com/r/running/comments/1l77osa/pushing_through_a_run/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "38e4aabe-b111-4ddb-af6c-6d4ff7d6f26b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user