Merge branch 'main' of github.com:ed-donner/llm_engineering

This commit is contained in:
Edward Donner
2025-01-05 12:51:27 -05:00
11 changed files with 3268 additions and 1 deletion


@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "306f1a67-4f1c-4aed-8f80-2a8458a1bce5",
"metadata": {},
"source": [
"# Stock data analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"\n",
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
]
},
{
"cell_type": "markdown",
"id": "6900b2a8-6384-4316-8aaa-5e519fca4254",
"metadata": {},
"source": [
"# Connecting to OpenAI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51d42a08-188e-4c56-9578-47cd549bd1d8",
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import urlencode\n",
"import datetime\n",
"\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "682eff74-55c4-4d4b-b267-703edbc293c7",
"metadata": {},
"outputs": [],
"source": [
"class YahooFinanceWebsite:\n",
" def __init__(self, stock_symbol):\n",
" \"\"\"\n",
" Create this Website object from the given url using the BeautifulSoup library\n",
" \"\"\"\n",
" self.stock_symbol = stock_symbol.upper()\n",
"\n",
" def __build_url(self, params):\n",
" base_url = f\"https://finance.yahoo.com/quote/{self.stock_symbol}/history/\"\n",
" query_string = urlencode(params)\n",
" return f\"{base_url}?{query_string}\"\n",
"\n",
" def get_stock_data(self):\n",
" datetime_now = datetime.datetime.now()\n",
" datetime_year_ago = datetime_now - datetime.timedelta(days=365)\n",
" params = {\"frequency\": \"1wk\", \"period1\": datetime_year_ago.timestamp(), \"period2\": datetime_now.timestamp()}\n",
" url = self.__build_url(params)\n",
" response = requests.get(url, headers=headers)\n",
"\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" \n",
" title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
"\n",
" html_table_data = soup.find(\"table\")\n",
"\n",
" return title, html_table_data"
]
},
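{
"cell_type": "markdown",
"id": "b7c3f2a1-4d5e-4b6c-9a8d-1e2f3a4b5c6d",
"metadata": {},
"source": [
"Before wiring in the LLM, it can help to sanity-check the scraper on its own. The cell below is a minimal sketch, assuming a hypothetical symbol (\"MSFT\"); it just confirms that Yahoo returned a page title and a history table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8d4a3b2-5e6f-4c7d-8b9e-2f3a4b5c6d7e",
"metadata": {},
"outputs": [],
"source": [
"# A quick sanity check of the scraper alone - \"MSFT\" is just an example symbol\n",
"title, table = YahooFinanceWebsite(\"MSFT\").get_stock_data()\n",
"print(title)\n",
"print(\"History table found\" if table is not None else \"No table found - Yahoo may have changed its markup\")"
]
},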
{
"cell_type": "code",
"execution_count": null,
"id": "70b8d7e7-51e7-4392-9b85-9ac9f67a907c",
"metadata": {},
"outputs": [],
"source": [
"def build_stock_analysis_prompt(stock_symbol, title, stock_table_data):\n",
" sys_prompt = r\"\"\"You are an assistant that analyzes the contents of HTML formated table that contains data on a specific stock.\n",
" The HTML table contains the date, open price, close price, low and highs aggregated for every week over one year timeframe.\n",
" Ignoring text, tags or html attributes that might be navigation related. \n",
" Respond in Markdown format\"\"\"\n",
" \n",
" user_prompt = f\"The data provided below in the HTML table format for {stock_symbol} from the Yahoo Finances.\\\n",
" Make the explaination easy enough for a newbie to understand. \\\n",
" Analyze and Summarize the trends on this stock:\\n{stock_table_data}\\n\\n\\\n",
" Also, calculate the total returns in percentage one could have expected over this period.\"\n",
" \n",
" return [\n",
" {\"role\": \"system\", \"content\": sys_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]"
]
},
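{
"cell_type": "markdown",
"id": "d9e5b4c3-6f7a-4d8e-9caf-3a4b5c6d7e8f",
"metadata": {},
"source": [
"You can inspect the messages structure before spending any tokens. The sketch below feeds a tiny made-up table through the prompt builder - no API call involved."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eaf6c5d4-7a8b-4e9f-8db1-4b5c6d7e8f9a",
"metadata": {},
"outputs": [],
"source": [
"# Preview the prompts with made-up inputs (no API call, no cost)\n",
"sample_table = \"<table><tr><td>Jan 1, 2024</td><td>100</td><td>105</td></tr></table>\"\n",
"for message in build_stock_analysis_prompt(\"TEST\", \"Sample page title\", sample_table):\n",
"    print(f\"{message['role']}:\\n{message['content']}\\n\")"
]
},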
{
"cell_type": "code",
"execution_count": null,
"id": "de514421-4cc8-4881-85b4-97f03e94c589",
"metadata": {},
"outputs": [],
"source": [
"def analyze_stock_trends(stock_symbol):\n",
" stock_data_page = YahooFinanceWebsite(stock_symbol)\n",
" title, stock_table_data = stock_data_page.get_stock_data()\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = build_stock_analysis_prompt(stock_symbol, title, stock_table_data)\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"def display_analysis(stock_symbol):\n",
" display(Markdown(analyze_stock_trends(stock_symbol)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41acc36f-484a-4257-a240-cf27520e7396",
"metadata": {},
"outputs": [],
"source": [
"display_analysis(\"GOOG\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e09541f-bbc4-4cf3-a1ef-9ed5e1b718e4",
"metadata": {},
"outputs": [],
"source": [
"display_analysis(\"PFE\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6af9395-0c5c-4265-a309-baba786bfa71",
"metadata": {},
"outputs": [],
"source": [
"display_analysis(\"AAPL\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afe4f6d1-a6ea-44b5-81ae-8e756cfc0d84",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -0,0 +1,213 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9ae10427-6ca2-4ac0-b6a0-e9206dd3cb52",
"metadata": {},
"source": [
"### Using OpenAI gpt-4o-mini model to generate social media posts for events"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "477fe060-a11f-424f-bac4-34c5121cf437",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "61f012e5-cdba-48cb-ae74-df9659c23d90",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key found and looks good so far!\n"
]
}
],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv()\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "19c79615-57aa-40e0-a83b-891f43df4f65",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "68ad05f8-dfcc-47b1-ba16-b35bedeff48b",
"metadata": {},
"outputs": [],
"source": [
"# A class to represent a Webpage\n",
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
"\n",
"# Some websites need you to use proper headers when fetching them:\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}\n",
"\n",
"class Website:\n",
"\n",
" def __init__(self, url):\n",
" \"\"\"\n",
" Create this Website object from the given url using the BeautifulSoup library\n",
" \"\"\"\n",
" self.url = url\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
" self.title = soup.title.string if soup.title else \"No title found\"\n",
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
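{
"cell_type": "markdown",
"id": "fb17d6e5-8b9c-4fa1-9ec2-5c6d7e8f9a0b",
"metadata": {},
"source": [
"A quick check that scraping works before involving the model. The URL below is just a placeholder - swap in any page you like."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c28e7f6-9cad-4b12-8fd3-6d7e8f9a0b1c",
"metadata": {},
"outputs": [],
"source": [
"# Quick scraper check (placeholder URL - substitute any page)\n",
"site = Website(\"https://example.com\")\n",
"print(site.title)\n",
"print(site.text[:200])"
]
},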
{
"cell_type": "code",
"execution_count": 14,
"id": "acff6c95-77a5-40f0-bf9f-7d47cec987fc",
"metadata": {},
"outputs": [],
"source": [
"# See how this function creates exactly the format above\n",
"\n",
"def messages_for(website):\n",
" return [\n",
" {\"role\": \"system\", \"content\": \"You are an assistant that analyzes the contents of a website \\\n",
"and provides a short summary, ignoring text that might be navigation related. \\\n",
"Respond in markdown.\"},\n",
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
" ]\n",
"\n",
"# A function that writes a User Prompt that asks for summaries of websites:\n",
"\n",
"def user_prompt_for(website):\n",
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a short summary of this website in markdown. \\\n",
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
" user_prompt += website.text\n",
" return user_prompt\n",
" \n",
"# Generate a summary of content fetched by scraping the website\n",
"\n",
"def summarize(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-4o-mini\",\n",
" messages = messages_for(website)\n",
" )\n",
" return response.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b43f8cda-8a61-4773-83b2-bb8fe55a0cb2",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Twitter Post:** \n",
"🚀 Join us online for #StartupMastery on Jan 7, 6-9 PM GMT! Explore Lean Startup, Agile, & Design Thinking methodologies. Gain practical skills, access resources, and earn a certificate! Tickets from €74.98. Don't miss out! 🎟️🌟\n",
"\n",
"**Instagram Post:** \n",
"🌟 Ready to boost your startup skills? Join us for **Startup Mastery**! 💡 On January 7 from 6 PM to 9 PM GMT, dive into Lean Startup, Agile, and Design Thinking with top-notch experts. Access recorded sessions, worksheets, and get certified! 🎟️ Tickets from €74.98. See you online! 🚀✨ #StartupMastery #LeanStartup #Agile #DesignThinking\n",
"\n",
"**Facebook Post:** \n",
"🗓️ Exciting opportunity for entrepreneurs and startup enthusiasts! Attend our **Startup Mastery** online workshop on January 7, from 6 PM to 9 PM GMT. Learn about Lean Startup, Agile, and Design Thinking methodologies to enhance your startup journey. Enjoy a transformative experience with insights on MVP development, rapid prototyping, and feedback loops. Plus, you'll get access to recorded sessions and can earn a certificate! Limited tickets available from €74.98. Organizers: Lean Agile Zone. Dont miss out! 🚀 #StartupMastery #Entrepreneurship"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Step 1: Create your prompts\n",
"WEBSITE_LINK = \"https://www.eventbrite.ie/e/startup-mastery-leveraging-lean-startup-agile-and-design-thinking-tickets-920474252267?aff=ebdssbcategorybrowse&keep_tld=1\"\n",
"\n",
"system_prompt = \"You are an assistant that analyzes the contents of an event \\\n",
"and provides short summaries for a Twitter post, an instagram post and a facebook post.\\\n",
"Ensure the summaries abide by the platform rules for each of the platforms.\"\n",
"\n",
"website_summary = summarize(WEBSITE_LINK)\n",
"user_prompt = f\"The events details are as follows: {website_summary}. Please summarize the above. Capture details like time and location, please capture them as well.\"\n",
"\n",
"# Step 2: Make the messages list\n",
"\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt},\n",
"]\n",
"\n",
"# Step 3: Call OpenAI\n",
"\n",
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
"\n",
"# Step 4: print the result\n",
"\n",
"display(Markdown(response.choices[0].message.content))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -0,0 +1,276 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1b6fe0c1-931e-4194-bcfe-0716d8f75b50",
"metadata": {},
"source": [
"# Youtube Video Summarization\n",
"\n",
"## My First Frontier LLM Project!\n",
"\n",
"Welcome to my first LLM-based project! The goal of this project is to leverage large language models (LLMs) to summarize YouTube videos. Currently, it only supports English transcriptions, so instead of watching the entire video, you can simply read the summary!\n",
"\n",
"## Important Note\n",
"Be mindful when testing with longer videos, as they may consume significant resources and could lead to high costs on your ChatGPT bill.\n",
"You can switch to Ollama for free usage if you're looking to reduce costs.\n"
]
},
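{
"cell_type": "markdown",
"id": "1d39f8a7-adbe-4c23-9ae4-7e8f9a0b1c2d",
"metadata": {},
"source": [
"As noted above, you can point the same OpenAI client at a local Ollama server to avoid API costs. The cell below is a sketch, assuming Ollama is installed and running (`ollama serve`) with a model such as `llama3.2` pulled; Ollama exposes an OpenAI-compatible endpoint, so the rest of the notebook works unchanged if you also pass the local model name in the chat completion calls."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e4a09b8-becf-4d34-8bf5-8f9a0b1c2d3e",
"metadata": {},
"outputs": [],
"source": [
"# Optional cost-saving swap: point the client at a local Ollama server instead of\n",
"# OpenAI. Assumes `ollama serve` is running and a model like llama3.2 is pulled.\n",
"# Uncomment to use, and pass model=\"llama3.2\" in the chat completion calls below.\n",
"\n",
"# from openai import OpenAI\n",
"# openai = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\")"
]
},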
{
"cell_type": "code",
"execution_count": null,
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
"metadata": {},
"outputs": [],
"source": [
"!pip install youtube-transcript-api openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a082ddaf-abf5-4e6c-8112-74846c768301",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"\n",
"from openai import OpenAI\n",
"from youtube_transcript_api import YouTubeTranscriptApi\n",
"import re\n",
"\n",
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
"metadata": {},
"outputs": [],
"source": [
"# Load environment variables in a file called .env\n",
"\n",
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"# Check the key\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
"elif not api_key.startswith(\"sk-proj-\"):\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
"elif api_key.strip() != api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e793b2-6775-426a-a139-4848291d0463",
"metadata": {},
"outputs": [],
"source": [
"class YoutubeVideoID:\n",
" def __init__(self, url):\n",
" self.url = url\n",
" self.video_id = self.extract_video_id(url)\n",
"\n",
" def extract_video_id(self, url):\n",
" \"\"\"\n",
" Extracts the YouTube video ID from a given URL.\n",
" Supports both regular and shortened URLs.\n",
" \"\"\"\n",
" # Regular expression to match YouTube video URL and extract the video ID\n",
" regex = r\"(?:https?:\\/\\/)?(?:www\\.)?(?:youtube\\.com\\/(?:[^\\/\\n\\s]+\\/\\S+\\/|\\S*\\?v=)|(?:youtu\\.be\\/))([a-zA-Z0-9_-]{11})\"\n",
" match = re.match(regex, url)\n",
" \n",
" if match:\n",
" return match.group(1)\n",
" else:\n",
" raise ValueError(\"Invalid YouTube URL\")\n",
"\n",
" def __str__(self):\n",
" return f\"Video ID: {self.video_id}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
"metadata": {},
"outputs": [],
"source": [
"# Example usage\n",
"video_url = \"https://www.youtube.com/watch?v=kqaMIFEz15s\"\n",
"\n",
"yt_video = YoutubeVideoID(video_url)\n",
"print(yt_video)"
]
},
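{
"cell_type": "markdown",
"id": "3f5b1ac9-cfd1-4e45-9ca6-9a0b1c2d3e4f",
"metadata": {},
"source": [
"The docstring above says shortened URLs are supported too - a quick check with the same video in `youtu.be` form:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a6c2bd1-d1e2-4f56-8db7-0b1c2d3e4f5a",
"metadata": {},
"outputs": [],
"source": [
"# The regex also accepts shortened youtu.be links - same video, short form\n",
"print(YoutubeVideoID(\"https://youtu.be/kqaMIFEz15s\"))"
]
},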
{
"cell_type": "code",
"execution_count": null,
"id": "f724be3c-bdeb-4079-b4be-f12608144484",
"metadata": {},
"outputs": [],
"source": [
"def get_transcript(video_id, language='en'):\n",
" try:\n",
" # Try to get the transcript in the desired language (Indonesian by default)\n",
" transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
" # Join all the 'text' fields into a single string\n",
" return \" \".join([item['text'] for item in transcript])\n",
" except Exception as e:\n",
" print(f\"Error fetching transcript: {e}\")\n",
" return None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12e302fa-f564-4ec6-a08f-b3b3ce549396",
"metadata": {},
"outputs": [],
"source": [
"# Fetch transcript using the video ID\n",
"transcript_text = get_transcript(yt_video.video_id)\n",
"print(len(transcript_text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
"metadata": {},
"outputs": [],
"source": [
"# Function to summarize text using ChatGPT\n",
"def summarize_text(text):\n",
" try:\n",
" system_prompts = \"\"\"\n",
" You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:\n",
" \n",
" - Capture the key points of the content.\n",
" - Keep the summary brief and easy to understand.\n",
" - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.\n",
" - Use bullet points where appropriate to enhance clarity and structure.\n",
" \"\"\"\n",
" response = openai.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompts},\n",
" {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
" ],\n",
" max_tokens=200\n",
" )\n",
" return response.choices[0].message.content\n",
" except Exception as e:\n",
" print(f\"Error summarizing text: {e}\")\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad646bc4-a11a-4c44-b941-54befdbf9bc6",
"metadata": {},
"outputs": [],
"source": [
"def split_text(text, chunk_size=3000):\n",
" \"\"\"\n",
" Splits large text into smaller chunks based on the given chunk size.\n",
" Ensures that chunks end with a full stop where possible to maintain sentence integrity.\n",
" \n",
" :param text: str, the text to be split\n",
" :param chunk_size: int, maximum size of each chunk (default 3000 characters)\n",
" :return: list of str, where each str is a chunk of text\n",
" \"\"\"\n",
" chunks = []\n",
" while len(text) > chunk_size:\n",
" # Find the last full stop within or at the chunk size\n",
" split_point = text.rfind('.', 0, chunk_size + 1) # +1 to include the period itself if it's at chunk_size\n",
" if split_point == -1: # No period found within the chunk size\n",
" split_point = chunk_size\n",
" \n",
" # Append the chunk, ensuring we don't strip spaces that might be part of the sentence structure\n",
" chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])\n",
" text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]\n",
" \n",
" # Add the remaining text as the final chunk, only strip if there's content\n",
" if text:\n",
" chunks.append(text.strip())\n",
" \n",
" return chunks\n",
"\n",
"transcript_chunks = split_text(transcript_text)\n",
"\n",
"# Now you can summarize each chunk individually\n",
"summaries = []\n",
"for chunk in transcript_chunks:\n",
" summary = summarize_text(chunk)\n",
" summaries.append(summary)\n",
"\n",
"\n",
"# Combine the individual summaries into one\n",
"full_summary = \" \".join(summaries)\n",
"display(Markdown(full_summary))\n"
]
},
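{
"cell_type": "markdown",
"id": "5b7d3ce2-e2f3-4a67-9ec8-1c2d3e4f5a6b",
"metadata": {},
"source": [
"Concatenated per-chunk summaries can read disjointedly. An optional final reduce pass, sketched below, feeds the combined text back through `summarize_text` to produce one coherent summary - at the cost of one extra API call."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c8e4df3-f3a4-4b78-8fd9-2d3e4f5a6b7c",
"metadata": {},
"outputs": [],
"source": [
"# Optional reduce pass: merge the chunk summaries into one coherent summary\n",
"# (one extra API call - skip if the combined summary above reads well enough)\n",
"final_summary = summarize_text(full_summary)\n",
"if final_summary:\n",
"    display(Markdown(final_summary))"
]
},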
{
"cell_type": "code",
"execution_count": null,
"id": "6b266fdc-da31-4d79-8982-be77f03be59f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "792c814d-73f8-4c1e-a0bb-b654b40e4d8b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}