Merge branch 'main' of https://github.com/DharmaTejaYadlapati/llm_engineering
@@ -0,0 +1,179 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "606e9c73-50fe-46b9-8df3-ae2246c00a3e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Business Use Case - LLM based Resume Upgrader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "919f6546-80ec-4d4c-8a80-00228f50e4a0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b2f5b02c-f782-4578-8a91-07891c39ceb0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"steps to perform\n",
|
||||
"-> load API key from env file\n",
|
||||
"-> create a function to call llm api\n",
|
||||
"-> create messages for system prompt and user prompt\n",
|
||||
"-> display the llm output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "31aaa20e-4996-43cb-b43a-a1aef80fd391",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv()\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"# error handling\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92f65c91-ca7f-47e6-9fd7-d63b278ba264",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "98fc7bac-07c8-4801-9225-8f843837f3c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# system prompt\n",
|
||||
"\n",
|
||||
"system_prompt = \"\"\"You are a helpful resume editor assistant that provides required assistance in changing a resume to match the given job descrption role \\\n",
|
||||
"You are given a resume and job description, your job is to understand the resume and job description to suggest upto 6 missing key words in the resume. Then you have to \n",
|
||||
"suggest how the user can improve his resume by giving upto 3 example sentences using the suggest keywords to fit into their resume.\n",
|
||||
"by using the following structure provide your response \\\n",
|
||||
"Sturcture:\n",
|
||||
"Job role : [Job Role]:\n",
|
||||
"Candidate Name : [Candidate Name]\n",
|
||||
"Missing Key words in Resume Based on Given job description:\n",
|
||||
" - [] Missing key words\n",
|
||||
" -[] Missing key words\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Suggestion:\n",
|
||||
" - [] # write a sentence including the key words to put them in the resume\n",
|
||||
" - [] # write a sentence including the key words to put them in the resume\n",
|
||||
"\n",
|
||||
"Guidelines:\n",
|
||||
"- give proper keyword suggestions which are essential for the job function. Do not give any unnecesary suggestions\n",
|
||||
"- Keep the suggested sentences less that 50 words\n",
|
||||
"- \n",
|
||||
"\"\"\"\n",
|
||||
"user_prompt = f'Give me suggestions on how to improve my resume and for the given job description '\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0d9c40b5-8e27-41b9-8b88-2c83e7d2b3ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# call openai api\n",
|
||||
"def resume_upgrader(resume:str, job_description:str):\n",
|
||||
" user_prompt = f'Give me suggestions on how to improve my resume {resume} and for the given job description {job_description}'\n",
|
||||
" messages = [\n",
|
||||
" {'role': 'system', 'content': system_prompt},\n",
|
||||
" {'role': 'user', 'content': user_prompt}\n",
|
||||
" ]\n",
|
||||
" try:\n",
|
||||
" \n",
|
||||
" response = openai.chat.completions.create(model =\"gpt-4o-mini\", messages = messages)\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
" except:\n",
|
||||
" print('got error while retting the response from api')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5aa29465-c119-4178-90f1-3ebdc9eeb11a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def print_api_response(response_markdown):\n",
|
||||
" \"\"\"Print the markdown response\"\"\"\n",
|
||||
" display(Markdown(response_markdown))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82a92034-6722-4e78-a901-b4ef2b9cbb84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"resume = input(\"Paste your resume in here\")\n",
|
||||
"job_description = input(\"paste your job descritpion here\")\n",
|
||||
"response = resume_upgrader(resume, job_description)\n",
|
||||
"print_api_response(response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d0be536f-e890-473f-8c68-767bc0e3b47c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,189 @@
|
||||
# AI Property Rental Assistant
|
||||
|
||||
A Jupyter notebook that implements an intelligent property rental assistant: it scrapes real estate listings from OnTheMarket and uses a local LLM (DeepSeek R1) to analyze and recommend properties based on user requirements.
|
||||
|
||||
## Features
|
||||
|
||||
- **Web Scraping**: Automatically fetches property listings from OnTheMarket
|
||||
- **AI-Powered Analysis**: Uses DeepSeek R1 model via Ollama for intelligent recommendations
|
||||
- **Personalized Recommendations**: Filters and ranks properties based on:
|
||||
- Budget constraints
|
||||
- Number of bedrooms
|
||||
- Tenant type (student, family, professional)
|
||||
- Location preferences
|
||||
- **Clean Output**: Returns formatted markdown with top 3-5 property recommendations
|
||||
- **Smart Filtering**: Handles cases where no suitable properties are found with helpful suggestions
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.7+
|
||||
- Ollama installed and running locally
|
||||
- DeepSeek R1 14B model pulled in Ollama
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Clone the repository**
|
||||
```bash
|
||||
git clone <your-repo-url>
|
||||
cd property-rental-assistant
|
||||
```
|
||||
|
||||
2. **Install required Python packages**
|
||||
```bash
|
||||
pip install requests beautifulsoup4 ollama ipython jupyter
|
||||
```
|
||||
|
||||
3. **Install and setup Ollama**
|
||||
```bash
|
||||
# Install Ollama (macOS/Linux)
|
||||
curl -fsSL https://ollama.ai/install.sh | sh
|
||||
|
||||
# For Windows, download from: https://ollama.ai/download
|
||||
```
|
||||
|
||||
4. **Pull the DeepSeek R1 model**
|
||||
```bash
|
||||
ollama pull deepseek-r1:14b
|
||||
```
|
||||
|
||||
5. **Start Ollama server**
|
||||
```bash
|
||||
ollama serve
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Running the Notebook
|
||||
|
||||
1. **Start Jupyter Notebook**
|
||||
```bash
|
||||
jupyter notebook
|
||||
```
|
||||
|
||||
2. **Open the notebook**
|
||||
Navigate to `property_rental_assistant.ipynb` in the Jupyter interface
|
||||
|
||||
3. **Run all cells**
|
||||
Click `Cell` → `Run All` or use `Shift + Enter` to run cells individually
|
||||
|
||||
### Customizing Search Parameters
|
||||
|
||||
Modify the `user_needs` variable in the notebook:
|
||||
```python
|
||||
user_needs = "I'm a student looking for a 2-bedroom house in Durham under £2,000/month"
|
||||
```
|
||||
|
||||
Other examples:
|
||||
- `"Family of 4 looking for 3-bedroom house with garden in Durham, budget £2,500/month"`
|
||||
- `"Professional couple seeking modern 1-bed apartment near city center, max £1,500/month"`
|
||||
- `"Student group needs 4-bedroom house near Durham University, £600/month per person"`
|
||||
|
||||
### Changing the Property Website
|
||||
|
||||
Update the `website_url` variable in the notebook:
|
||||
```python
|
||||
website_url = "https://www.onthemarket.com/to-rent/property/durham/"
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────┐ ┌─────────────┐
|
||||
│ OnTheMarket │────▶│ Web Scraper │────▶│ Ollama │
|
||||
│ Website │ │ (BeautifulSoup)│ │ (DeepSeek R1)│
|
||||
└─────────────────┘ └──────────────┘ └─────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────┐
|
||||
│ AI-Generated Recommendations │
|
||||
│ • Top 5 matching properties │
|
||||
│ • Filtered by requirements │
|
||||
│ • Markdown formatted output │
|
||||
└─────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
property-rental-assistant/
|
||||
│
|
||||
├── property_rental_assistant.ipynb # Main Jupyter notebook
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## 🔧 Configuration
|
||||
|
||||
### Ollama API Settings
|
||||
```python
|
||||
OLLAMA_API = "http://localhost:11434/api/chat" # Default Ollama endpoint
|
||||
MODEL = "deepseek-r1:14b" # Model to use
|
||||
```
|
||||
|
||||
### Web Scraping Settings
|
||||
```python
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
}
|
||||
timeout = 10 # Request timeout in seconds
|
||||
```
|
||||
|
||||
### Content Limits
|
||||
```python
|
||||
website.text[:4000] # Truncate content to 4000 chars for token limits
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Web Scraping**: The `Website` class fetches and parses HTML content from the property listing URL
|
||||
2. **Content Cleaning**: Removes scripts, styles, and images to extract clean text
|
||||
3. **Prompt Engineering**: Combines system prompt with user requirements and scraped data
|
||||
4. **LLM Analysis**: Sends the prompt to DeepSeek R1 via Ollama API
|
||||
5. **Recommendation Generation**: The AI analyzes listings and returns top matches in markdown format
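
In code, the flow maps onto the notebook's helpers roughly like this (a condensed sketch; `Website`, `system_prompt`, `user_prompt_for_renting`, and `house_renting_ollama` are all defined in `property_rental_assistant.ipynb`):

```python
from IPython.display import Markdown, display

# 1-2. Scrape and clean the listings page
website = Website("https://www.onthemarket.com/to-rent/property/durham/")

# 3. Combine the system prompt, user requirements, and scraped text
user_needs = "I'm a student looking for a 2-bedroom house in Durham under £2,000/month"
user_prompt = user_prompt_for_renting(website, user_needs)

# 4. Send the prompt to DeepSeek R1 via Ollama
output = house_renting_ollama(system_prompt, user_prompt)

# 5. Render the markdown recommendations
display(Markdown(output))
```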
|
||||
|
||||
## 🛠️ Troubleshooting
|
||||
|
||||
### Ollama Connection Error
|
||||
```
|
||||
Error communicating with Ollama: [Errno 111] Connection refused
|
||||
```
|
||||
**Solution**: Ensure Ollama is running with `ollama serve`
|
||||
|
||||
### Model Not Found
|
||||
```
|
||||
Error: model 'deepseek-r1:14b' not found
|
||||
```
|
||||
**Solution**: Pull the model with `ollama pull deepseek-r1:14b`
|
||||
|
||||
### Web Scraping Blocked
|
||||
```
|
||||
Error fetching website: 403 Forbidden
|
||||
```
|
||||
**Solution**: The website may be blocking automated requests. Try:
|
||||
- Updating the User-Agent string
|
||||
- Adding delays between requests
|
||||
- Using a proxy or VPN
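
For example, a different User-Agent and a short pause before retrying can be set like this (a minimal sketch; the header value and the 2-second delay are only illustrations):

```python
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

time.sleep(2)  # brief pause before retrying
response = requests.get(
    "https://www.onthemarket.com/to-rent/property/durham/",
    headers=headers,
    timeout=10,
)
```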
|
||||
|
||||
### Insufficient Property Data
|
||||
If recommendations are poor quality, the scraper may not be capturing listing details properly. Check:
|
||||
- The website structure hasn't changed
|
||||
- The content truncation limit (4000 chars) isn't too restrictive
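
If the 4,000-character cap is the culprit, raise it where the user prompt is built (a sketch; 8,000 is an arbitrary larger value and must still fit the model's context window):

```python
page_content = website.text[:8000]  # was website.text[:4000]
```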
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- [ ] Support multiple property websites (Rightmove, Zoopla, SpareRoom)
|
||||
- [ ] Interactive CLI for dynamic user input
|
||||
- [ ] Property image analysis
|
||||
- [ ] Save search history and favorite properties
|
||||
- [ ] Email notifications for new matching properties
|
||||
- [ ] Price trend analysis
|
||||
- [ ] Commute time calculations to specified locations
|
||||
- [ ] Multi-language support
|
||||
- [ ] Web interface with Flask/FastAPI
|
||||
- [ ] Docker containerization
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- [Ollama](https://ollama.ai/) for local LLM hosting
|
||||
- [DeepSeek](https://www.deepseek.com/) for the R1 model
|
||||
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for web scraping
|
||||
- [OnTheMarket](https://www.onthemarket.com/) for property data
|
||||
@@ -0,0 +1,217 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57112e5c-7b0f-4ba7-9022-ae21e8ac0f42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b71a051-fc0e-46a9-8b1b-b58f685e800d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"deepseek-r1:14b\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed3be9dc-d459-46ac-a8eb-f9b932c4302f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.url = url\n",
|
||||
" try:\n",
|
||||
" response = requests.get(url, headers=headers, timeout=10)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" if soup.body:\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
|
||||
" else:\n",
|
||||
" self.text = \"No body content found\"\n",
|
||||
" except requests.RequestException as e:\n",
|
||||
" print(f\"Error fetching website: {e}\")\n",
|
||||
" self.title = \"Error loading page\"\n",
|
||||
" self.text = \"Could not load page content\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "17ea76f8-38d9-40b9-8aba-eb957d690a0d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Without Ollama package"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a6fd698-8e59-4cd7-bb53-b9375e50f899",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def house_renting(system_prompt, user_prompt):\n",
|
||||
" messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ]\n",
|
||||
" payload = {\n",
|
||||
" \"model\": MODEL,\n",
|
||||
" \"messages\": messages,\n",
|
||||
" \"stream\": False\n",
|
||||
" }\n",
|
||||
" response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)\n",
|
||||
" return response.json()['message']['content']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c826a52c-d1d3-493a-8b7c-6e75b848b453",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introducing Ollama package "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "519e27da-eeff-4c1b-a8c6-e680fdf01da2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ollama\n",
|
||||
"\n",
|
||||
"def house_renting_ollama(system_prompt, user_prompt):\n",
|
||||
" try:\n",
|
||||
" messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ]\n",
|
||||
" response = ollama.chat(model=MODEL, messages=messages)\n",
|
||||
" return response['message']['content']\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"Error communicating with Ollama: {e}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "60e98b28-06d9-4303-b8ca-f7b798244eb4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"\"\"\n",
|
||||
"You are a helpful real estate assistant specializing in UK property rentals. Your job is to guide users in finding houses to rent, especially in Durham. Follow these rules:\n",
|
||||
"1. Always ask clarifying questions if user input is vague. Determine location, budget, number of bedrooms, and tenant type (e.g. student, family, professional).\n",
|
||||
"2. Use structured data provided from the website (like property listings) to identify relevant options.\n",
|
||||
"3. If listings are provided, filter and rank them based on the user's preferences.\n",
|
||||
"4. Recommend up to 5 top properties with rent price, bedroom count, key features, and location.\n",
|
||||
"5. Always respond in markdown with clean formatting using headers, bold text, and bullet points.\n",
|
||||
"6. If no listings match well, provide tips (e.g. \"try adjusting your budget or search radius\").\n",
|
||||
"7. Stay concise, helpful, and adapt to whether the user is a student, family, couple, or solo tenant.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"def user_prompt_for_renting(website, user_needs):\n",
|
||||
" return f\"\"\"\n",
|
||||
"I want to rent a house and here's what I'm looking for:\n",
|
||||
"{user_needs}\n",
|
||||
"\n",
|
||||
"Here are the property listings I found on the website titled: \"{website.title}\".\n",
|
||||
"\n",
|
||||
"Please analyze them and recommend the best 3–5 options that match my needs. If none are suitable, tell me why and offer suggestions.\n",
|
||||
"\n",
|
||||
"The page content is below:\n",
|
||||
"{website.text[:4000]}\n",
|
||||
"\"\"\" # content is truncated for token limits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ef420f4b-e3d2-4fbd-bf6f-811f2c8536e0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ollama Package"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1cf128af-4ece-41ab-b353-5c8564c7de1d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if __name__ == \"__main__\": \n",
|
||||
" print(\"Starting AI Property Rental Assistant...\")\n",
|
||||
" print(\"=\" * 50)\n",
|
||||
" \n",
|
||||
" website_url = \"https://www.onthemarket.com/to-rent/property/durham/\"\n",
|
||||
" print(f\"🔍 Scraping properties from: {website_url}\")\n",
|
||||
" \n",
|
||||
" website = Website(website_url)\n",
|
||||
" print(f\"Website Title: {website.title}\")\n",
|
||||
" print(f\"Content Length: {len(website.text)} characters\")\n",
|
||||
" print(f\"Successfully scraped property listings\\n\")\n",
|
||||
" \n",
|
||||
" user_needs = \"I'm a student looking for a 2-bedroom house in Durham under £2,000/month\"\n",
|
||||
" print(f\"User Requirements: {user_needs}\\n\")\n",
|
||||
" \n",
|
||||
" user_prompt = user_prompt_for_renting(website, user_needs)\n",
|
||||
" print(\"Generating AI recommendations...\")\n",
|
||||
" \n",
|
||||
" # Choose which method to use (comment out the one you don't want)\n",
|
||||
" \n",
|
||||
" # Method 1: Using ollama Python library\n",
|
||||
" output = house_renting_ollama(system_prompt, user_prompt)\n",
|
||||
" \n",
|
||||
" # Method 2: Using direct API call\n",
|
||||
" # output = house_renting(system_prompt, user_prompt)\n",
|
||||
" \n",
|
||||
" display(Markdown(output))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python [conda env:llms]",
|
||||
"language": "python",
|
||||
"name": "conda-env-llms-py"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/Day-1_email_summarizers.ipynb (new file, 103 lines)
@@ -0,0 +1,103 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7a6bb51",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import library\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"# Load your API key from an .env file\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7ac4cdf9",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"system_prompt = \"you are a helpful assistant that suggests an appropriate short subject line for an email based on its contents.\"\n",
|
||||
"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
"Hi John,\n",
|
||||
"I hope this email finds you well. I wanted to follow up on our meeting last week regarding the quarterly budget proposal.\n",
|
||||
"After reviewing the numbers with my team, we've identified some areas where we can reduce costs by approximately 15% without impacting our core operations. This would involve consolidating some vendor contracts and optimizing our software licensing.\n",
|
||||
"Could we schedule a meeting next week to discuss these findings in detail? I'm available Tuesday through Thursday afternoon.\n",
|
||||
"Looking forward to hearing from you.\n",
|
||||
"\n",
|
||||
"Best regards,\n",
|
||||
"Sarah\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a77ca09e",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 2: Make the messages list\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8404f0fe",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a4875f7",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 4: Print the result\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/Day-2_exercise_with_ollama3.ipynb (new file, 290 lines)
@@ -0,0 +1,290 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "135717e7",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "29a9e634",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTION 1\n",
|
||||
"# using openai\n",
|
||||
"\n",
|
||||
"# message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"# client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"not-needed\")\n",
|
||||
"# response = openai.chat.completions.create(model=`<name of model>`, messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"# print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "306993ed",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTION 2\n",
|
||||
"# using Ollama\n",
|
||||
"\n",
|
||||
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"model=\"llama3\"\n",
|
||||
"response=ollama.chat(model=model,messages=[{\"role\":\"user\",\"content\":message}])\n",
|
||||
"print(response[\"message\"][\"content\"])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "856f767b",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4ce558dc",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5e3956f8",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "99d791b4",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "5d89b748",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "9a97d3e2",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response=ollama.chat(model=model,messages=messages_for(website))\n",
|
||||
" return(response[\"message\"][\"content\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec13fe0a",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "e3ade092",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "be2d49e6",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1ccbf33b",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae3d0eae",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/Top Tech products.ipynb (new file, 181 lines)
@@ -0,0 +1,181 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bbd8585e-0a28-4fd9-80b5-690569f93e16",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#This notebook will help you to get top tech products with by providing category and subcategory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df039118-f462-4a8b-949e-53d3a726e292",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"aa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e2ffd2e5-d061-446c-891e-15a6d1958ab6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92e26007-521f-4ea2-9df9-edd77dd7e183",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "27d21593-8feb-42e4-bbc0-2e949b51137d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
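    "# Split 'category_subcategory_budget' (e.g. 'phone_gaming_40000') and rejoin the parts as 'category-subcategory-budget'\n",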
"def tech_product(category_subcategory_budget):\n",
|
||||
" parts = category_subcategory_budget.split('_')\n",
|
||||
" return f\"{parts[0]}-{parts[1]}-{parts[2]}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd978d25-5b84-4122-af7c-116f2bf72179",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(products):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": \"you are a tech product expert and you need to suggest the best suited product available in India basis the input received in the form of category-subcategory-budget (in inr),\\\n",
|
||||
" revert with category and subcategory and show the product links as well along with pros and cons, respond in markdown\"},\n",
|
||||
" {\"role\": \"user\", \"content\": tech_product(products)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b916db7a-81a4-41d9-87c2-a2346fd874d2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages_for(\"phone_gaming_40000\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b4bb3f1-95de-4eb5-afe1-068744f93301",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_top_products(category_subcategory):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages= messages_for(category_subcategory)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c9272942-acfe-4fca-bd0a-3435c1ee6691",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_top_products('phone_gaming_30000')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c2b3b9a-aceb-4f00-8c8d-8f6837ab94fc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_markdown(category_subcategory_budget):\n",
|
||||
" output = get_top_products(category_subcategory_budget)\n",
|
||||
" display(Markdown(output))\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6c135dd7-4ed4-48ee-ba3f-9b4ca1c32149",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_markdown('Console_gaming_100000')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0ba06c55-7ef9-47eb-aeaf-3c4a7b29bccc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/ai-powered-marketing-brochures-gpt-5/.gitignore (new file, vendored, 210 lines)
@@ -0,0 +1,210 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[codz]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
#uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
#poetry.toml
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||
#pdm.lock
|
||||
#pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# pixi
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||
#pixi.lock
|
||||
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||
.pixi
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.envrc
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# Abstra
|
||||
# Abstra is an AI-powered process automation framework.
|
||||
# Ignore directories containing user credentials, local state, and settings.
|
||||
# Learn more at https://abstra.io/docs
|
||||
.abstra/
|
||||
|
||||
# Visual Studio Code
|
||||
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||
# you could uncomment the following to ignore the entire vscode folder
|
||||
# .vscode/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
# Cursor
|
||||
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
||||
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
||||
# refer to https://docs.cursor.com/context/ignore-files
|
||||
.cursorignore
|
||||
.cursorindexingignore
|
||||
|
||||
# Marimo
|
||||
marimo/_static/
|
||||
marimo/_lsp/
|
||||
__marimo__/
|
||||
|
||||
|
||||
.*-env
|
||||
@@ -0,0 +1,207 @@
|
||||
from ai_core import AICore
|
||||
from ai_brochure_config import AIBrochureConfig
|
||||
from extractor_of_relevant_links import ExtractorOfRelevantLinks
|
||||
from website import Website
|
||||
from openai.types.responses import Response
|
||||
from rich.console import Console
|
||||
from rich.markdown import Markdown
|
||||
from requests import Session
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from json import loads
|
||||
|
||||
class BrochureCreator(AICore[str]):
|
||||
"""
|
||||
Builds a short Markdown brochure for a company or individual by:
|
||||
- extracting relevant links from the website,
|
||||
- inferring the entity name and status,
|
||||
- and prompting the model using the collected page content.
|
||||
"""
|
||||
|
||||
@property
|
||||
def _website(self) -> Website:
|
||||
"""Return the main Website instance to analyze."""
|
||||
return self.__website
|
||||
|
||||
@property
|
||||
def _extractor(self) -> ExtractorOfRelevantLinks:
|
||||
"""Return the helper responsible for extracting relevant links."""
|
||||
return self.__extractor
|
||||
|
||||
def __init__(self, config: AIBrochureConfig, website: Website) -> None:
|
||||
"""
|
||||
Initialize the brochure creator with configuration and target website.
|
||||
|
||||
Parameters:
|
||||
config: AI and runtime configuration.
|
||||
website: The root website to analyze and summarize.
|
||||
"""
|
||||
system_behavior: str = ("You are an assistant that analyzes the contents of several relevant pages from a company website "
|
||||
"and creates a short brochure about the company for prospective customers, investors and recruits. "
|
||||
"Include details of company culture, customers and careers/jobs if information is available. ")
|
||||
super().__init__(config, system_behavior)
|
||||
self.__website: Website = website
|
||||
self.__extractor: ExtractorOfRelevantLinks = ExtractorOfRelevantLinks(config, website)
|
||||
|
||||
def create_brochure(self) -> str:
|
||||
"""
|
||||
Create a short Markdown brochure based on the website's content.
|
||||
|
||||
Returns:
|
||||
A Markdown string with the brochure, or a fallback message if no relevant pages were found.
|
||||
"""
|
||||
relevant_pages: list[dict[str, str | Website]] = self._get_relevant_pages()
|
||||
if not relevant_pages:
|
||||
return "No relevant pages found to create a brochure."
|
||||
|
||||
brochure_prompt_part: str = self._form_brochure_prompt(relevant_pages)
|
||||
inferred_company_name, inferred_status = self._infer_entity(brochure_prompt_part)
|
||||
|
||||
full_brochure_prompt: str = self._form_full_prompt(inferred_company_name, inferred_status)
|
||||
response: str = self.ask(full_brochure_prompt)
|
||||
return response
|
||||
|
||||
def _get_relevant_pages(self) -> list[dict[str, str | Website]]:
|
||||
"""
|
||||
Resolve relevant links into Website objects using a shared session and concurrency.
|
||||
"""
|
||||
relevant_pages: list[dict[str, str | Website]] = []
|
||||
relevant_links: list[dict[str, str]] = self._extractor.extract_relevant_links()["links"]
|
||||
# Limit the number of pages to fetch to keep latency and token usage reasonable.
|
||||
MAX_PAGES: int = 6
|
||||
links_subset = relevant_links[:MAX_PAGES]
|
||||
|
||||
def build_page(item: dict[str, str], session: Session) -> dict[str, str | Website] | None:
|
||||
try:
|
||||
url = str(item["url"])
|
||||
page_type = str(item["type"])
|
||||
return {"type": page_type, "page": Website(url, session=session)}
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
with Session() as session, ThreadPoolExecutor(max_workers=4) as executor:
|
||||
futures = [executor.submit(build_page, link, session) for link in links_subset]
|
||||
for fut in as_completed(futures):
|
||||
res = fut.result()
|
||||
if res:
|
||||
relevant_pages.append(res)
|
||||
|
||||
return relevant_pages
|
||||
|
||||
def _truncate_text(self, text: str, limit: int) -> str:
|
||||
"""
|
||||
Truncate text to 'limit' characters to reduce tokens and latency.
|
||||
"""
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return text[: max(0, limit - 20)] + "... [truncated]"
|
||||
|
||||
def _form_brochure_prompt(self, relevant_pages: list[dict[str, str | Website]]) -> str:
|
||||
"""
|
||||
Assemble a prompt that includes the main page and relevant pages' titles and text.
|
||||
|
||||
Parameters:
|
||||
relevant_pages: List of page descriptors returned by _get_relevant_pages.
|
||||
|
||||
Returns:
|
||||
A prompt string containing quoted sections per page.
|
||||
"""
|
||||
QUOTE_DELIMITER: str = "\n\"\"\"\n"
|
||||
MAX_MAIN_CHARS = 6000
|
||||
MAX_PAGE_CHARS = 3000
|
||||
prompt: str = (
|
||||
f"Main page:{QUOTE_DELIMITER}"
|
||||
f"Title: {self._website.title}\n"
|
||||
f"Text:\n{self._truncate_text(self._website.text, MAX_MAIN_CHARS)}{QUOTE_DELIMITER}\n"
|
||||
)
|
||||
|
||||
for page in relevant_pages:
|
||||
if isinstance(page['page'], Website) and not page['page'].fetch_failed:
|
||||
prompt += (
|
||||
f"{page['type']}:{QUOTE_DELIMITER}"
|
||||
f"Title: {page['page'].title}\n"
|
||||
f"Text:\n{self._truncate_text(page['page'].text, MAX_PAGE_CHARS)}{QUOTE_DELIMITER}\n"
|
||||
)
|
||||
|
||||
return prompt
|
||||
|
||||
def _infer_entity(self, brochure_prompt_part: str) -> tuple[str, str]:
|
||||
"""
|
||||
Infer both the entity name and status in a single model call to reduce latency.
|
||||
Returns:
|
||||
(name, status) where status is 'company' or 'individual'.
|
||||
"""
|
||||
prompt = (
|
||||
"From the following website excerpts, infer the entity name and whether it is a company or an individual. "
|
||||
"Respond strictly as JSON with keys 'name' and 'status' (status must be 'company' or 'individual').\n"
|
||||
f"{brochure_prompt_part}"
|
||||
)
|
||||
raw = self.ask(prompt)
|
||||
try:
|
||||
data: dict[str, str] = loads(raw)
|
||||
name: str = str(data.get("name", "")).strip() or "Unknown"
|
||||
status: str = str(data.get("status", "")).strip().lower()
|
||||
if status not in ("company", "individual"):
|
||||
status = "company"
|
||||
return name, status
|
||||
except Exception:
|
||||
# Fallback: use entire output as name, assume company
|
||||
return raw.strip() or "Unknown", "company"
|
||||
|
||||
def _form_full_prompt(self, inferred_company_name: str, inferred_status: str) -> str:
|
||||
"""
|
||||
Build the final brochure-generation prompt using the inferred entity and prior history.
|
||||
|
||||
Parameters:
|
||||
inferred_company_name: The inferred entity name.
|
||||
inferred_status: Either 'company' or 'individual'.
|
||||
|
||||
Returns:
|
||||
A final prompt instructing the model to produce a Markdown brochure.
|
||||
"""
|
||||
full_prompt: str = (f"You are looking at a {inferred_status} called {inferred_company_name}, to whom website {self._website.website_url} belongs.\n"
|
||||
f"Build a short brochure about the {inferred_status}. Use the information from the website that is already stored in the history.\n"
|
||||
"Your response must be in a Markdown format.")
|
||||
return full_prompt
|
||||
|
||||
def ask(self, question: str) -> str:
|
||||
"""
|
||||
Send a question to the model, update chat history, and return the text output.
|
||||
|
||||
Parameters:
|
||||
question: The user prompt.
|
||||
|
||||
Returns:
|
||||
The model output text.
|
||||
"""
|
||||
self.history_manager.add_user_message(question)
|
||||
response: Response = self._ai_api.responses.create(
|
||||
model=self.config.model_name,
|
||||
instructions=self.history_manager.system_behavior,
|
||||
input=self.history_manager.chat_history,
|
||||
reasoning={ "effort": "low" }
|
||||
)
|
||||
self.history_manager.add_assistant_message(response)
|
||||
return response.output_text
|
||||
|
||||
console: Console = Console()
|
||||
|
||||
def display_markdown(content: str) -> None:
|
||||
"""
|
||||
Render Markdown content to the console using rich.
|
||||
"""
|
||||
console.print(Markdown(content))
|
||||
|
||||
def show_summary(summary: str) -> None:
|
||||
"""
|
||||
Print a Markdown summary if provided; otherwise print a fallback message.
|
||||
"""
|
||||
if summary:
|
||||
display_markdown(summary)
|
||||
else:
|
||||
console.print("No summary found.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
website: Website = Website("<put your site address here>")
|
||||
brochure_creator: BrochureCreator = BrochureCreator(AIBrochureConfig(), website)
|
||||
brochure: str = brochure_creator.create_brochure()
|
||||
display_markdown(brochure)
|
||||
@@ -0,0 +1,59 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
class AIBrochureConfig:
|
||||
"""
|
||||
Configuration class to load environment variables.
|
||||
"""
|
||||
|
||||
def __get_config_value(self, key: str):
|
||||
"""
|
||||
Get the value of an environment variable.
|
||||
"""
|
||||
if not key:
|
||||
raise ValueError("Key must be provided")
|
||||
|
||||
value: str | None = os.getenv(key)
|
||||
if not value:
|
||||
raise ValueError(f"Environment variable '{key}' not found")
|
||||
|
||||
return value
|
||||
|
||||
def _get_str(self, key: str) -> str:
|
||||
"""
|
||||
Get a string value from the environment variables.
|
||||
"""
|
||||
return self.__get_config_value(key)
|
||||
|
||||
def _get_int(self, key: str) -> int:
|
||||
"""
|
||||
Get an integer value from the environment variables.
|
||||
"""
|
||||
value = self.__get_config_value(key)
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
raise ValueError(f"Environment variable '{key}' must be an integer")
|
||||
|
||||
@property
|
||||
def openai_api_key(self) -> str:
|
||||
"""
|
||||
Get the OpenAI API key from the environment variables.
|
||||
"""
|
||||
if self.__openai_api_key == "":
|
||||
self.__openai_api_key = self._get_str("OPENAI_API_KEY")
|
||||
return self.__openai_api_key
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
"""
|
||||
Get the model name from the environment variables.
|
||||
"""
|
||||
if self.__model_name == "":
|
||||
self.__model_name = self._get_str("MODEL_NAME")
|
||||
return self.__model_name
|
||||
|
||||
def __init__(self) -> None:
|
||||
load_dotenv(dotenv_path=".env")
|
||||
self.__openai_api_key: str = ""
|
||||
self.__model_name: str = ""
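
# Minimal usage sketch (an illustration; assumes a .env file next to this module
# containing OPENAI_API_KEY and MODEL_NAME):
#
#     config = AIBrochureConfig()
#     print(config.model_name)       # prints whatever MODEL_NAME is set to
#     key = config.openai_api_key    # raises ValueError if the variable is missing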
|
||||
@@ -0,0 +1,181 @@
|
||||
import openai
|
||||
from abc import ABC, abstractmethod
|
||||
from ai_brochure_config import AIBrochureConfig
|
||||
from typing import Any, cast, Generic, TypeVar
|
||||
from openai.types.responses import ResponseInputItemParam, Response, ResponseOutputMessage
|
||||
|
||||
TAiResponse = TypeVar('TAiResponse', default=Any)
|
||||
|
||||
class HistoryManager:
|
||||
"""
|
||||
Manage chat history and system behavior for a conversation with the model.
|
||||
"""
|
||||
@property
|
||||
def chat_history(self) -> list[ResponseInputItemParam]:
|
||||
"""
|
||||
Return the accumulated conversation as a list of response input items.
|
||||
"""
|
||||
return self.__chat_history
|
||||
|
||||
@property
|
||||
def system_behavior(self) -> str:
|
||||
"""
|
||||
Return the system behavior (instructions) used for this conversation.
|
||||
"""
|
||||
return self.__system_behavior
|
||||
|
||||
def __init__(self, system_behavior: str) -> None:
|
||||
"""
|
||||
Initialize the history manager.
|
||||
|
||||
Parameters:
|
||||
system_behavior: The system instruction string for the conversation.
|
||||
"""
|
||||
self.__chat_history: list[ResponseInputItemParam] = []
|
||||
self.__system_behavior: str = system_behavior
|
||||
|
||||
def add_user_message(self, message: str) -> None:
|
||||
"""
|
||||
Append a user message to the chat history.
|
||||
|
||||
Parameters:
|
||||
message: The user text to add.
|
||||
"""
|
||||
self.__chat_history.append({
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": message}],
|
||||
})
|
||||
|
||||
def add_assistant_message(self, output_message: Response) -> None:
|
||||
"""
|
||||
Append the assistant's output to the chat history.
|
||||
|
||||
Parameters:
|
||||
output_message: The model response to convert and store.
|
||||
"""
|
||||
for out in output_message.output:
|
||||
# Convert the Pydantic output model to an input item shape
|
||||
self.__chat_history.append(
|
||||
cast(ResponseInputItemParam, out.model_dump(exclude_unset=True))
|
||||
)
|
||||
|
||||
|
||||
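# Usage sketch for HistoryManager (illustrative values only):
# history = HistoryManager("You are a helpful assistant.")
# history.add_user_message("Hello!")
# history.chat_history
# -> [{"role": "user", "content": [{"type": "input_text", "text": "Hello!"}]}]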
class AICore(ABC, Generic[TAiResponse]):
|
||||
"""
|
||||
Abstract base class for AI core functionalities.
|
||||
"""
|
||||
@property
|
||||
def config(self) -> AIBrochureConfig:
|
||||
"""
|
||||
Return the stored AIBrochureConfig for this instance.
|
||||
|
||||
Returns:
|
||||
AIBrochureConfig: The current configuration used by this object.
|
||||
|
||||
Notes:
|
||||
- This accessor returns the internal configuration reference. Mutating the returned
|
||||
object may affect the internal state of this instance.
|
||||
- To change the configuration, use the appropriate setter or factory method rather
|
||||
than modifying the returned value in-place.
|
||||
"""
|
||||
return self.__config
|
||||
|
||||
@config.setter
|
||||
def config(self, config: AIBrochureConfig | None) -> None:
|
||||
"""
|
||||
Set the instance configuration for the AI brochure generator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
config : AIBrochureConfig | None
|
||||
The configuration to assign to the instance. If None, the instance's
|
||||
configuration will be reset to a newly created default AIBrochureConfig.
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
|
||||
Notes
|
||||
-----
|
||||
This method stores the provided configuration on a private attribute
|
||||
"""
|
||||
if config is None:
|
||||
self.__config = AIBrochureConfig()
|
||||
else:
|
||||
self.__config = config
|
||||
|
||||
@property
|
||||
def _ai_api(self) -> openai.OpenAI:
|
||||
"""
|
||||
Return the cached OpenAI API client, initializing it on first access.
|
||||
|
||||
This private helper lazily constructs and caches an openai.OpenAI client using
|
||||
the API key found on self.config.openai_api_key. On the first call, if the
|
||||
client has not yet been created, the method verifies that self.config is set,
|
||||
creates the client with openai.OpenAI(api_key=...), stores it on
|
||||
self.__ai_api, and returns it. Subsequent calls return the same cached
|
||||
instance.
|
||||
|
||||
Returns:
|
||||
openai.OpenAI: A configured OpenAI API client.
|
||||
|
||||
Raises:
|
||||
ValueError: If self.config is None when attempting to initialize the client.
|
||||
|
||||
Notes:
|
||||
- The method mutates self.__ai_api as a side effect (caching).
|
||||
- The caller should treat this as a private implementation detail.
|
||||
- Thread safety is not guaranteed; concurrent initialization may result in
|
||||
multiple client instances if invoked from multiple threads simultaneously.
|
||||
"""
|
||||
if self.__ai_api is None:
|
||||
if self.config is None:
|
||||
raise ValueError("Configuration must be set before accessing AI API")
|
||||
self.__ai_api = openai.OpenAI(api_key=self.config.openai_api_key)
|
||||
return self.__ai_api
|
||||
|
||||
@property
|
||||
def history_manager(self) -> HistoryManager:
|
||||
"""
|
||||
Return the history manager for this AI core instance.
|
||||
|
||||
This property provides access to the HistoryManager that tracks the chat
|
||||
history and system behavior.
|
||||
|
||||
Returns:
|
||||
HistoryManager: The current history manager. This property always returns
|
||||
a HistoryManager instance and never None.
|
||||
"""
|
||||
return self.__history_manager
|
||||
|
||||
def __init__(self, config: AIBrochureConfig, system_behavior: str) -> None:
|
||||
"""
|
||||
Initializes the AI core with the provided configuration.
|
||||
|
||||
Parameters:
|
||||
config (AIBrochureConfig): The configuration object for the AI core.
|
||||
system_behavior (str): The behavior of the system.
|
||||
"""
|
||||
# Initialize all instance-level attributes here
|
||||
self.__config: AIBrochureConfig = config
|
||||
self.__history_manager: HistoryManager = HistoryManager(system_behavior)
|
||||
self.__ai_api: openai.OpenAI | None = None
|
||||
|
||||
if __debug__:
|
||||
# Sanity check: confirm attributes are initialized
|
||||
assert hasattr(self, "_AICore__config")
|
||||
assert hasattr(self, "_AICore__history_manager")
|
||||
assert hasattr(self, "_AICore__ai_api")
|
||||
|
||||
@abstractmethod
|
||||
def ask(self, question: str) -> TAiResponse:
|
||||
"""
|
||||
Ask a question to the AI model.
|
||||
|
||||
Parameters:
|
||||
question: The question to ask.
|
||||
|
||||
Returns:
|
||||
TAiResponse: The model's response type defined by the subclass.
|
||||
"""
|
||||
pass
|
||||
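# Sketch of a minimal concrete subclass (hypothetical, for illustration only;
# ExtractorOfRelevantLinks in the next module shows the real pattern):
# class EchoAI(AICore[str]):
#     def ask(self, question: str) -> str:
#         self.history_manager.add_user_message(question)
#         return question  # a real subclass would call self._ai_api here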
@@ -0,0 +1,91 @@
|
||||
from ai_brochure_config import AIBrochureConfig
|
||||
from website import Website
|
||||
from ai_core import AICore
|
||||
from openai.types.responses import Response
|
||||
from json import loads
|
||||
|
||||
RelevantLinksDict = dict[str, list[dict[str, str]]]
|
||||
|
||||
class ExtractorOfRelevantLinks(AICore[RelevantLinksDict]):
|
||||
"""
|
||||
Extractor for relevant links from a website.
|
||||
"""
|
||||
|
||||
@property
|
||||
def website(self) -> Website:
|
||||
"""Return the root Website whose links are being analyzed."""
|
||||
return self.__website
|
||||
|
||||
def __init__(self, config: AIBrochureConfig, website: Website) -> None:
|
||||
"""
|
||||
Initialize the extractor with configuration and target website.
|
||||
|
||||
Parameters:
|
||||
config: AI and runtime configuration.
|
||||
website: The Website from which links were collected.
|
||||
"""
|
||||
system_behavior: str = ("You are an expert in creation of online advertisement materials."
|
||||
"You are going to be provided with a list of links found on a website."
|
||||
"You are able to decide which of the links would be most relevant to include in a brochure about the company,"
|
||||
"such as links to an About page or a Company page or Careers/Jobs pages.\n"
|
||||
"You should respond in JSON as in this example:")
|
||||
system_behavior += """
|
||||
{
|
||||
"links": [
|
||||
{"type": "about page", "url": "https://www.example.com/about"},
|
||||
{"type": "company page", "url": "https://www.another_example.net/company"},
|
||||
{"type": "careers page", "url": "https://ex.one_more_example.org/careers"}
|
||||
]
|
||||
}
|
||||
"""
|
||||
super().__init__(config, system_behavior)
|
||||
self.__website: Website = website
|
||||
|
||||
def get_links_user_prompt(self) -> str:
|
||||
"""
|
||||
Build a user prompt listing discovered links and instructions for relevance filtering.
|
||||
|
||||
Returns:
|
||||
A string to send to the model listing links and guidance.
|
||||
"""
|
||||
starter_part: str = (f"Here is a list of links found on the website of {self.website.website_url} - "
|
||||
"please decide which of these links are relevant web links for a brochure about company."
|
||||
"Respond with full HTTPS URLs. Avoid including Terms of Service, Privacy, email links.\n"
|
||||
"Links (some might be relative links):\n")
|
||||
|
||||
links_part: str = "\n".join(f"- {link}" for link in self.website.links_on_page) if self.website.links_on_page else "No links found."
|
||||
|
||||
return starter_part + links_part
|
||||
|
||||
def extract_relevant_links(self) -> RelevantLinksDict:
|
||||
"""
|
||||
Request the model to select relevant links for brochure creation.
|
||||
|
||||
Returns:
|
||||
A dictionary with a 'links' array containing objects with 'type' and 'url'.
|
||||
"""
|
||||
user_prompt = self.get_links_user_prompt()
|
||||
response = self.ask(user_prompt)
|
||||
return response
|
||||
|
||||
def ask(self, question: str) -> RelevantLinksDict:
|
||||
"""
|
||||
Send a question to the model and parse the JSON response.
|
||||
|
||||
Parameters:
|
||||
question: The prompt to submit.
|
||||
|
||||
Returns:
|
||||
RelevantLinksDict: Parsed JSON containing selected links.
|
||||
"""
|
||||
self.history_manager.add_user_message(question)
|
||||
|
||||
response: Response = self._ai_api.responses.create(
|
||||
model=self.config.model_name,
|
||||
instructions=self.history_manager.system_behavior,
|
||||
reasoning={ "effort": "low" },
|
||||
input=self.history_manager.chat_history
|
||||
)
|
||||
|
||||
self.history_manager.add_assistant_message(response)
|
||||
return loads(response.output_text)
|
||||
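# Usage sketch (hypothetical URL, illustrative only):
# config = AIBrochureConfig()
# site = Website("https://www.example.com")
# extractor = ExtractorOfRelevantLinks(config, site)
# for link in extractor.extract_relevant_links().get("links", []):
#     print(link["type"], link["url"])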
@@ -0,0 +1,5 @@
|
||||
python-dotenv
|
||||
openai
|
||||
bs4
|
||||
requests
|
||||
rich
|
||||
@@ -0,0 +1,286 @@
|
||||
from ipaddress import ip_address, IPv4Address, IPv6Address
|
||||
from urllib.parse import ParseResult, urlparse
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from requests import get, RequestException, Session
|
||||
|
||||
class Extractor:
|
||||
"""
|
||||
Extracts and processes content from HTML response text using BeautifulSoup.
|
||||
"""
|
||||
__soup: BeautifulSoup
|
||||
|
||||
__extracted_title: str = ""
|
||||
@property
|
||||
def extracted_title(self) -> str:
|
||||
"""
|
||||
Returns the extracted title from the HTML content.
|
||||
"""
|
||||
if not self.__extracted_title:
|
||||
self.__extracted_title = self.get_title()
|
||||
return self.__extracted_title
|
||||
|
||||
__extracted_text: str = ""
|
||||
@property
|
||||
def extracted_text(self) -> str:
|
||||
"""
|
||||
Returns the extracted main text content from the HTML, excluding irrelevant tags.
|
||||
"""
|
||||
if not self.__extracted_text:
|
||||
self.__extracted_text = self.get_text()
|
||||
return self.__extracted_text
|
||||
|
||||
__extracted_links_on_page: list[str] | None = None
|
||||
@property
|
||||
def extracted_links_on_page(self) -> list[str]:
|
||||
"""
|
||||
Return all href values found on the page.
|
||||
|
||||
Notes:
|
||||
- Only anchor tags with an href are included.
|
||||
- Values are returned as-is (may be relative or absolute).
|
||||
"""
|
||||
if self.__extracted_links_on_page is None:
|
||||
self.__extracted_links_on_page = [str(a.get("href")) for a in self._soup.find_all('a', href=True) if isinstance(a, Tag)]
|
||||
return self.__extracted_links_on_page
|
||||
|
||||
@property
|
||||
def _soup(self) -> BeautifulSoup:
|
||||
"""
|
||||
Returns the BeautifulSoup object for the HTML content.
|
||||
"""
|
||||
return self.__soup
|
||||
|
||||
def __init__(self, response_text_content: str) -> None:
|
||||
"""
|
||||
Initializes the Extractor with HTML response text.
|
||||
|
||||
Parameters:
|
||||
response_text_content (str): The HTML response text to be processed.
|
||||
"""
|
||||
self.__soup = BeautifulSoup(response_text_content, "html.parser")
|
||||
self.__extracted_links_on_page = None
|
||||
|
||||
def get_title(self) -> str:
|
||||
"""
|
||||
Extracts the title from the HTML content.
|
||||
"""
|
||||
return self._soup.title.get_text() if self._soup.title is not None else "No title"
|
||||
|
||||
def get_text(self) -> str:
|
||||
"""
|
||||
Extracts and cleans the main text content from the HTML, removing irrelevant tags.
|
||||
"""
|
||||
for irrelevant in self._soup.find_all(["script", "style", "img", "figure", "video", "audio", "button", "svg", "canvas", "input", "form", "meta"]):
|
||||
irrelevant.decompose()
|
||||
raw_text: str = self._soup.get_text(separator="\n")
|
||||
cleaned_text: str = " ".join(raw_text.split())
|
||||
return cleaned_text if cleaned_text else "No content"
|
||||
|
||||
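# Usage sketch for Extractor (hypothetical HTML snippet, illustrative only):
# extractor = Extractor("<html><head><title>Acme</title></head><body><p>Hello world</p></body></html>")
# extractor.extracted_title  # -> "Acme"
# extractor.extracted_text   # -> "Hello world"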
class Website:
|
||||
"""
|
||||
A class to represent a website.
|
||||
"""
|
||||
|
||||
__DEFAULT_ALLOWED_DOMAINS: list[str] = [".com", ".org", ".net"]
|
||||
|
||||
__title: str = ""
|
||||
__website_url: str = ""
|
||||
__text: str = ""
|
||||
__allowed_domains: list[str] = []
|
||||
__links_on_page: list[str] | None = None
|
||||
|
||||
@property
|
||||
def title(self) -> str:
|
||||
"""
|
||||
Returns the title of the website.
|
||||
"""
|
||||
return self.__title
|
||||
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""
|
||||
Returns the main text content of the website.
|
||||
"""
|
||||
return self.__text
|
||||
|
||||
@property
|
||||
def website_url(self) -> str:
|
||||
"""
|
||||
Returns the URL of the website.
|
||||
"""
|
||||
return self.__website_url
|
||||
|
||||
@property
|
||||
def links_on_page(self) -> list[str] | None:
|
||||
"""
|
||||
Returns the list of links extracted from the website.
|
||||
"""
|
||||
return self.__links_on_page
|
||||
|
||||
@property
|
||||
def _allowed_domains(self) -> list[str]:
|
||||
"""
|
||||
Returns the list of allowed domain suffixes.
|
||||
"""
|
||||
return self.__allowed_domains
|
||||
|
||||
@_allowed_domains.setter
|
||||
def _allowed_domains(self, value: list[str] | str) -> None:
|
||||
"""
|
||||
Sets the list of allowed domain suffixes.
|
||||
Filters out empty strings and ensures each suffix starts with a dot.
|
||||
"""
|
||||
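# Example (illustrative): "com, .org , net" is normalized to [".com", ".org", ".net"]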
if isinstance(value, str):
|
||||
value = [
|
||||
item.strip() if item.strip().startswith(".") else f".{item.strip()}"
|
||||
for item in value.split(",")
|
||||
if item.strip()
|
||||
]
|
||||
else:
|
||||
value = [
|
||||
item if item.startswith(".") else f".{item}"
|
||||
for item in value
|
||||
if item
|
||||
]
|
||||
self.__allowed_domains = value
|
||||
|
||||
def _set_website_url(self, value: str) -> None:
|
||||
"""
|
||||
Protected: set the website URL after validating and fetch website data.
|
||||
Use this from inside the class to initialize or change the URL.
|
||||
"""
|
||||
if not value:
|
||||
raise ValueError("Website URL must be provided")
|
||||
|
||||
parsed_url: ParseResult = urlparse(value)
|
||||
|
||||
self._validate(parsed_url)
|
||||
|
||||
self.__website_url = value
|
||||
self.__fetch_website_data()
|
||||
|
||||
@property
|
||||
def fetch_failed(self) -> bool:
|
||||
"""
|
||||
Returns whether the website data fetch failed.
|
||||
"""
|
||||
return self.__fetch_failed
|
||||
|
||||
def _validate(self, parsed_url: ParseResult) -> None:
|
||||
"""
|
||||
Validate the parsed URL.
|
||||
|
||||
Parameters:
|
||||
parsed_url: The parsed URL to validate.
|
||||
|
||||
Raises:
|
||||
ValueError: If the URL is missing parts, uses an invalid scheme,
|
||||
points to a local/private address, or is not in allowed domains.
|
||||
"""
|
||||
if not parsed_url.netloc or parsed_url.scheme not in ("http", "https"):
|
||||
raise ValueError("Website URL must be a valid URL")
|
||||
|
||||
if not parsed_url.hostname:
|
||||
raise ValueError("Website URL must contain a valid hostname")
|
||||
|
||||
if self.__is_local_address(parsed_url.hostname):
|
||||
raise ValueError("Website URL must not be a local address")
|
||||
|
||||
if not self.__is_allowed_domain(parsed_url.hostname):
|
||||
raise ValueError("Website URL must be an allowed domain")
|
||||
|
||||
def __is_local_address(self, hostname: str) -> bool:
|
||||
"""
|
||||
Check if the given hostname is a local address.
|
||||
|
||||
Parameters:
|
||||
hostname (str): The hostname to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the hostname is a local address, False otherwise.
|
||||
"""
|
||||
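# Examples (illustrative): "127.0.0.1" and "10.0.0.1" are treated as local; "example.com" is not.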
if hostname in ("localhost", "127.0.0.1", "::1"):
|
||||
return True
|
||||
|
||||
try:
|
||||
ip: IPv4Address | IPv6Address = ip_address(hostname)
|
||||
if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return False
|
||||
|
||||
def __is_allowed_domain(self, hostname: str) -> bool:
|
||||
"""
|
||||
Check if the given hostname is an allowed domain.
|
||||
|
||||
Parameters:
|
||||
hostname (str): The hostname to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the hostname is an allowed domain, False otherwise.
|
||||
"""
|
||||
allowed_domains = [".com", ".org", ".net", ".io"]
|
||||
return any(hostname.endswith(domain) for domain in allowed_domains)
|
||||
|
||||
def __fetch_website_data(self) -> None:
|
||||
"""
|
||||
Fetch website content and populate title, text, and links.
|
||||
|
||||
Side effects:
|
||||
- Sets internal state: __title, __text, __links_on_page, __fetch_failed.
|
||||
- Performs an HTTP GET with a browser-like User-Agent.
|
||||
"""
|
||||
try:
|
||||
get_fn = self.__session.get if self.__session else get
|
||||
response = get_fn(
|
||||
self.website_url,
|
||||
timeout=10,
|
||||
verify=True,
|
||||
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"}
|
||||
)
|
||||
except RequestException as e:
|
||||
self.__title = "Error"
|
||||
self.__text = str(e)
|
||||
self.__fetch_failed = True
|
||||
return
|
||||
|
||||
if response.ok:
|
||||
extractor: Extractor = Extractor(response.text)
|
||||
self.__title = extractor.extracted_title
|
||||
self.__text = extractor.extracted_text
|
||||
self.__links_on_page = extractor.extracted_links_on_page
|
||||
else:
|
||||
if response.status_code == 404:
|
||||
self.__title = "Not Found"
|
||||
self.__text = "The requested page was not found (404)."
|
||||
else:
|
||||
self.__title = "Error"
|
||||
self.__text = f"Error: {response.status_code} - {response.reason}"
|
||||
self.__fetch_failed = True
|
||||
|
||||
def __init__(self, website_url: str, allowed_domains: list[str] | str | None = None, session: Session | None = None) -> None:
|
||||
"""
|
||||
Initializes the Website object and fetches its data.
|
||||
|
||||
Parameters:
|
||||
website_url (str): The URL of the website to fetch.
|
||||
allowed_domains (list[str] | str, optional): A list of allowed domain suffixes.
|
||||
If a string is provided, it should be a comma-separated list of domain suffixes (e.g., ".com,.org,.net").
|
||||
session (requests.Session | None, optional): Reused HTTP session for connection pooling.
|
||||
"""
|
||||
self.__fetch_failed: bool = False
|
||||
self.__session: Session | None = session
|
||||
if allowed_domains is None:
|
||||
self._allowed_domains = self.__DEFAULT_ALLOWED_DOMAINS.copy()
|
||||
else:
|
||||
self._allowed_domains = allowed_domains
|
||||
# Use protected setter internally so the public API exposes only the getter.
|
||||
self._set_website_url(website_url)
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
Returns a string representation of the Website object.
|
||||
"""
|
||||
return f"Website(title={self.title}, url={self.website_url})"
|
||||
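# Usage sketch for Website (hypothetical URL; performs a real HTTP GET when run):
# site = Website("https://www.example.com")
# if not site.fetch_failed:
#     print(site.title)
#     print((site.links_on_page or [])[:5])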
@@ -0,0 +1,402 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9905f163-759f-474b-8f7a-7d14da0df44d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### BUSINESS CHALLENGE: Using Multi-shot Prompting\n",
|
||||
"#### Day 5\n",
|
||||
"\n",
|
||||
"Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits.\n",
|
||||
"\n",
|
||||
"We will be provided a company name and their primary website."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a0895f24-65ff-4624-8ae0-15d2d400d8f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"from typing import List\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7794aa70-5962-4669-b86f-b53639f4f9ea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize and constants\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n",
|
||||
" print(\"API key looks good so far\")\n",
|
||||
"else:\n",
|
||||
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
|
||||
" \n",
|
||||
"MODEL = 'gpt-4o-mini'\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "63bf8631-2746-4255-bec1-522855d3e812",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" \"\"\"\n",
|
||||
" A utility class to represent a Website that we have scraped, now with links\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" self.body = response.content\n",
|
||||
" soup = BeautifulSoup(self.body, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" if soup.body:\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
|
||||
" else:\n",
|
||||
" self.text = \"\"\n",
|
||||
" links = [link.get('href') for link in soup.find_all('a')]\n",
|
||||
" self.links = [link for link in links if link]\n",
|
||||
"\n",
|
||||
" def get_contents(self):\n",
|
||||
" return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1e7bb527-e769-4245-bb91-ae65e64593ff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## First step: Have GPT-4o-mini figure out which links are relevant\n",
|
||||
"\n",
|
||||
"### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1ce303ae-b967-4261-aadc-02dafa54db4a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
|
||||
"You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
|
||||
"such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
|
||||
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
|
||||
"link_system_prompt += \"\"\"\n",
|
||||
"{\n",
|
||||
" \"links\": [\n",
|
||||
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
|
||||
" {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
|
||||
" ]\n",
|
||||
"}\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d24a4c0c-a1d1-4897-b2a7-4128d25c2e08",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_links_user_prompt(website):\n",
|
||||
" user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
|
||||
" user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
|
||||
"Do not include Terms of Service, Privacy, email links.\\n\"\n",
|
||||
" user_prompt += \"Links (some might be relative links):\\n\"\n",
|
||||
" user_prompt += \"\\n\".join(website.links)\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8103fc11-5bc0-41c4-8c97-502c9e96429c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_links(url): # 1st inference\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
|
||||
" ],\n",
|
||||
" response_format={\"type\": \"json_object\"}\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" return json.loads(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc84a695-515d-4292-9a95-818f4fe3d20e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"huggingface = Website(\"https://huggingface.co\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "91896908-1632-41fc-9b8b-39a7638d8dd1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Second step: make the brochure!\n",
|
||||
"\n",
|
||||
"Assemble all the details into another prompt to GPT4-o"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ab7c54e3-e654-4b1f-8671-09194b628aa0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_all_details(url): # 1st inference wrapper\n",
|
||||
" result = \"Landing page:\\n\"\n",
|
||||
" result += Website(url).get_contents()\n",
|
||||
" links = get_links(url) # inference\n",
|
||||
" # print(\"Found links:\", links)\n",
|
||||
" for link in links[\"links\"]:\n",
|
||||
" result += f\"\\n\\n{link['type']}\\n\"\n",
|
||||
" result += Website(link[\"url\"]).get_contents()\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ea9f54d1-a248-4c56-a1de-6633193de5bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n",
|
||||
"and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n",
|
||||
"Include details of company culture, customers and careers/jobs if you have the information.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "13412c85-badd-4d79-a5ac-8283e4bb832f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_brochure_user_prompt(company_name, url):\n",
|
||||
" user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
|
||||
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
|
||||
" user_prompt += get_all_details(url) # inference wrapper\n",
|
||||
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "addc0047-ea73-4748-abc3-747ff343c134",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_brochure(company_name, url): # 2nd inference\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82a3b61a-da26-4265-840a-0a93f81cd048",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"brochure_english = create_brochure(\"HuggingFace\", \"https://huggingface.co\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5d165e3f-8fe2-4712-b098-d34d9fabe583",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display(Markdown(brochure_english))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "107a2100-3f7d-4f16-8ba7-b5da602393c6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_brochure(company_name, url):\n",
|
||||
" stream = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n",
|
||||
" ],\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" response = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" response += chunk.choices[0].delta.content or ''\n",
|
||||
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(response), display_id=display_handle.display_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26cbe9b5-3603-49a1-a676-75c7ddaacdb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"stream_brochure(\"HuggingFace\", \"https://huggingface.co\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c10d8189-7f79-4991-abc4-0764369b7d64",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Third step: Translate the entire brochure to Spanish"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "666817eb-1e8b-4fee-bbab-c0dbfe2ea7c0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a brochure \\\n",
|
||||
"and translates to Spanish. Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c48adb12-bc3c-48f9-ab38-b7ca895195f6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def translate_user_prompt(company_name, url):\n",
|
||||
" user_prompt = f\"Please translate the following brochure content to Spanish\\n\"\n",
|
||||
" user_prompt += create_brochure(company_name, url) # inference wrapper\n",
|
||||
" # user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b92b61ac-3be3-4e84-9000-ec8233697b81",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"translate_user_prompt(\"HuggingFace\", \"https://huggingface.co\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6bfd04f4-4381-4730-ac5d-c9fa02f906df",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def translate_brochure(): # 3rd inference\n",
|
||||
" stream = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": translate_user_prompt(\"HuggingFace\", \"https://huggingface.co\")}\n",
|
||||
" ],\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" response = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" response += chunk.choices[0].delta.content or ''\n",
|
||||
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(response), display_id=display_handle.display_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bb78ed28-9ecd-4c08-ae96-d7473cbc97dd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"translate_brochure()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,270 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f60dab2a-a377-4761-8be3-69a3b8124ca6",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pdfplumber\n",
|
||||
"import re\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"def parse_transaction_line(line):\n",
|
||||
" # More specific pattern that captures each component'\n",
|
||||
" pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})\\s+(-?[\\d,]+\\.\\d{2})$'\n",
|
||||
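" # Hypothetical sample line this pattern targets (date, description, amount, running balance):\n",
" #   '03/14 GROCERY STORE 54.23 1,234.56'\n",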
" match = re.match(pattern, line.strip())\n",
|
||||
" \n",
|
||||
" if match:\n",
|
||||
" date, description, amount, balance = match.groups()\n",
|
||||
" return {\n",
|
||||
" 'date': date,\n",
|
||||
" 'description': description.strip(),\n",
|
||||
" 'amount': amount,\n",
|
||||
" 'balance': balance\n",
|
||||
" }\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"def parse_Credit_Card_transaction_line(line):\n",
|
||||
" # More specific pattern that captures each component'\n",
|
||||
" pattern = r'^(\\d{2}/\\d{2})\\s+(.+?)\\s+(-?[\\d,]+\\.\\d{2})$'\n",
|
||||
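" # Hypothetical sample line (no running balance column on credit card statements):\n",
" #   '03/14 COFFEE SHOP 4.50'\n",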
" match = re.match(pattern, line.strip())\n",
|
||||
" \n",
|
||||
" if match:\n",
|
||||
" date, description, amount = match.groups()\n",
|
||||
" return {\n",
|
||||
" 'date': date,\n",
|
||||
" 'description': description.strip(),\n",
|
||||
" 'amount': amount\n",
|
||||
" }\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"# \n",
|
||||
"def extract_transactions_CA_from_pdf(pdf_path):\n",
|
||||
" transactions = []\n",
|
||||
" \n",
|
||||
" with pdfplumber.open(pdf_path) as pdf:\n",
|
||||
" for page in pdf.pages:\n",
|
||||
" text = page.extract_text()\n",
|
||||
" for line in text.split(\"\\n\"):\n",
|
||||
" parsed = parse_transaction_line(line)\n",
|
||||
" if parsed:\n",
|
||||
" transactions.append(parsed)\n",
|
||||
" return transactions\n",
|
||||
"\n",
|
||||
"def extract_transactions_CreditCard_from_pdf(pdf_path):\n",
|
||||
" transactions = []\n",
|
||||
" \n",
|
||||
" with pdfplumber.open(pdf_path) as pdf:\n",
|
||||
" for page in pdf.pages:\n",
|
||||
" text = page.extract_text()\n",
|
||||
" for line in text.split(\"\\n\"):\n",
|
||||
" parsed = parse_Credit_Card_transaction_line(line)\n",
|
||||
" if parsed:\n",
|
||||
" transactions.append(parsed)\n",
|
||||
" return transactions\n",
|
||||
"# print(transactions, len(transactions)) # check first 10 extracted lines\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "82c34eac-fc30-41d6-8325-77efc48d0dd8",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "769ee512-75f5-480a-9407-f9c4cd46b679",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# ---------- STEP 3: Build prompts ----------\n",
|
||||
"\n",
|
||||
"def build_prompts(transactions):\n",
|
||||
" system_prompt = \"\"\"\n",
|
||||
"You are a personal financial assistant.\n",
|
||||
"Your job is to analyze bank transactions, categorize each expense into categories such as:\n",
|
||||
"Food, Clothing, Rent, Utilities, Entertainment, Travel, Health, Miscellaneous, and Others.\n",
|
||||
"\n",
|
||||
"Your responsibilities:\n",
|
||||
"\n",
|
||||
"Categorize all transactions and compute total spending per category.\n",
|
||||
"\n",
|
||||
"Identify the top 5 categories by total spending.\n",
|
||||
"\n",
|
||||
"Detect high-frequency purchases, even if individual amounts are small (e.g., $4 coffee bought 40 times).\n",
|
||||
"\n",
|
||||
"For these, group transactions by merchant/description and count frequency.\n",
|
||||
"\n",
|
||||
"Highlight the top 5 frequent purchases, with both frequency and total spend.\n",
|
||||
"\n",
|
||||
"Provide a practical summary of spending habits, covering both biggest expenses and frequent small purchases.\n",
|
||||
"\n",
|
||||
"Suggest 2–3 actionable recommendations to reduce spending, targeting both:\n",
|
||||
"\n",
|
||||
"Big categories (e.g., Rent, Travel, Entertainment).\n",
|
||||
"\n",
|
||||
"Small but frequent “habit expenses” (e.g., coffee, fast food, subscriptions).\n",
|
||||
"\n",
|
||||
"The output should be a valid JSON object with this structure:\n",
|
||||
"{\n",
|
||||
" \"summary\": {\n",
|
||||
" \"Food\": <amount>,\n",
|
||||
" \"Clothing\": <amount>,\n",
|
||||
" \"Rent\": <amount>,\n",
|
||||
" \"Utilities\": <amount>,\n",
|
||||
" \"Entertainment\": <amount>,\n",
|
||||
" \"Travel\": <amount>,\n",
|
||||
" \"Health\": <amount>,\n",
|
||||
" \"Miscellaneous\": <amount>,\n",
|
||||
" \"Others\": <amount>\n",
|
||||
" },\n",
|
||||
" \"total_expenses\": <total>,\n",
|
||||
" \"top_5_categories\": [ {\"category\": <name>, \"amount\": <amount>} ],\n",
|
||||
" \"top_5_frequent_purchases\": [ {\"item\": <merchant/description>, \"count\": <frequency>, \"total\": <amount>} ],\n",
|
||||
" \"insights\": \"<short paragraph summary of spending, including both big categories and frequent small habits>\",\n",
|
||||
" \"recommendations\": [ \"<tip1>\", \"<tip2>\", \"<tip3>\" ]\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
" user_prompt = \"Here are my bank account transactions for the past few months:\\n\\n\"\n",
|
||||
" for txn in transactions:\n",
|
||||
" user_prompt += f\"- Date: {txn['date']}, Description: {txn['description']}, Amount: {txn['amount']}\\n\"\n",
|
||||
"\n",
|
||||
" user_prompt += \"\"\"\n",
|
||||
"Please analyze these transactions according to the instructions in the system prompt.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
" return system_prompt, user_prompt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "307ca02b-2df6-4996-85e7-d073f74592f5",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
"slideshow": {
|
||||
"slide_type": ""
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ---------- STEP 4: Call OpenAI ----------\n",
|
||||
"def analyze_transactions(pdf_path):\n",
|
||||
" transactions = extract_transactions_CreditCard_from_pdf(pdf_path)\n",
|
||||
" system_prompt, user_prompt = build_prompts(transactions)\n",
|
||||
"\n",
|
||||
" client = OpenAI() # assumes OPENAI_API_KEY is set in env\n",
|
||||
"\n",
|
||||
" response = client.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ],\n",
|
||||
" response_format={\"type\": \"json_object\"} # ensures valid JSON\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" return json.loads(result)\n",
|
||||
"\n",
|
||||
"# ---------- MAIN ----------\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" cc_pdf_file = \"cc_statement.pdf\"\n",
|
||||
" # To Debug in case of failures\n",
|
||||
" # transactions = extract_transactions_from_pdf(pdf_file)\n",
|
||||
" # print(cc_transactions,len(cc_transactions))\n",
|
||||
" # system_prompt, user_prompt = build_prompts(cc_transactions)\n",
|
||||
" # print(system_prompt, user_prompt)\n",
|
||||
"\n",
|
||||
" # Analyse the function to create a smart alert\n",
|
||||
" cc_transactions = extract_transactions_CreditCard_from_pdf(cc_pdf_file)\n",
|
||||
" analysis = analyze_transactions(cc_pdf_file)\n",
|
||||
" print(\"=========================================\")\n",
|
||||
" print(\"=== Top 5 Spending Habits & Insights ====\")\n",
|
||||
" print(\"=========================================\")\n",
|
||||
" print(json.dumps(analysis, indent=2))\n",
|
||||
" print(\"=========================================\")\n",
|
||||
" print(\"=========================================\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "831922f4-5efd-4cba-9975-54767b65f6d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/day-1-thesis_pdf_summarizer.ipynb
@@ -0,0 +1,305 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "581151c0-941e-47b3-a3e0-2da65ba70087",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "47353a41-4b47-499e-9460-fd645345f591",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"API key found and looks good so far\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"load_dotenv()\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print('No API key was found')\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"API key is found but is not in the proper format\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dbfbb29a-3452-45a0-b9b3-4e329ac776fb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "88ffe256-e46a-45e8-a616-0ac574aa7085",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"\"\"You are a research summarizer specialized in wireless communication systems and propagation modeling. Your task is to summarize a research thesis in no more than 1000 words. The summary must be clear, structured, and written in markdown format.\n",
|
||||
"\n",
|
||||
"The summary should include the following sections:\n",
|
||||
"\n",
|
||||
"1. **Title and Authors** – Provide the full title of the thesis and author name(s).\n",
|
||||
"2. **Objective / Research Problem** – Clearly state the core research goal or question addressed in the thesis.\n",
|
||||
"3. **Scientific and Regional Background** – Explain the technical context of radio wave propagation, and why studying it in the Horn of Africa region is important.\n",
|
||||
"4. **Methodology** – Summarize the modeling techniques, data sources, simulation tools, frequency bands (e.g., microwave, millimeter), and measurement or evaluation methods used.\n",
|
||||
"5. **Key Findings** – Highlight the quantitative and qualitative results, including differences between precipitation and clear-air conditions, and observed trends across geographic locations.\n",
|
||||
"6. **Conclusion** – Describe the primary outcomes and how they advance understanding in wireless communications.\n",
|
||||
"7. **Limitations** – Point out any constraints (e.g., lack of in-situ measurement, simulation assumptions).\n",
|
||||
"8. **Future Work** – Suggest next steps for improving or extending this research.\n",
|
||||
"9. **Real-World Applications** – Discuss how the models or findings could improve wireless network planning, 5G deployment, or link budgeting in East Africa and similar regions.\n",
|
||||
"\n",
|
||||
"Use academic language but keep it concise, clear, and structured for a technical reader. Output in markdown format only.\n",
|
||||
"\"\"\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "5f3f7b1a-865f-44cc-854d-9e9e7771eb82",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: ipywidgets in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (8.1.7)\n",
|
||||
"Collecting pdfplumber\n",
|
||||
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
|
||||
"Requirement already satisfied: comm>=0.1.3 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipywidgets) (0.2.3)\n",
|
||||
"Requirement already satisfied: ipython>=6.1.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipywidgets) (9.4.0)\n",
|
||||
"Requirement already satisfied: traitlets>=4.3.1 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipywidgets) (5.14.3)\n",
|
||||
"Requirement already satisfied: widgetsnbextension~=4.0.14 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipywidgets) (4.0.14)\n",
|
||||
"Requirement already satisfied: jupyterlab_widgets~=3.0.15 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipywidgets) (3.0.15)\n",
|
||||
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
|
||||
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
|
||||
"Requirement already satisfied: Pillow>=9.1 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from pdfplumber) (11.3.0)\n",
|
||||
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
|
||||
" Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)\n",
|
||||
"Requirement already satisfied: charset-normalizer>=2.0.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
|
||||
"Requirement already satisfied: cryptography>=36.0.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from pdfminer.six==20250506->pdfplumber) (45.0.6)\n",
|
||||
"Requirement already satisfied: cffi>=1.14 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
|
||||
"Requirement already satisfied: pycparser in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from cffi>=1.14->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
|
||||
"Requirement already satisfied: colorama in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)\n",
|
||||
"Requirement already satisfied: decorator in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
|
||||
"Requirement already satisfied: ipython-pygments-lexers in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)\n",
|
||||
"Requirement already satisfied: jedi>=0.16 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
|
||||
"Requirement already satisfied: matplotlib-inline in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n",
|
||||
"Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.51)\n",
|
||||
"Requirement already satisfied: pygments>=2.4.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)\n",
|
||||
"Requirement already satisfied: stack_data in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
|
||||
"Requirement already satisfied: typing_extensions>=4.6 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from ipython>=6.1.0->ipywidgets) (4.14.1)\n",
|
||||
"Requirement already satisfied: wcwidth in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
|
||||
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)\n",
|
||||
"Requirement already satisfied: executing>=1.2.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (2.2.0)\n",
|
||||
"Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (3.0.0)\n",
|
||||
"Requirement already satisfied: pure_eval in c:\\users\\esku4\\anaconda3\\envs\\llms\\lib\\site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (0.2.3)\n",
|
||||
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
|
||||
"Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
|
||||
" ---------------------------------------- 0.0/5.6 MB ? eta -:--:--\n",
|
||||
" --------------------------------------- 5.5/5.6 MB 30.7 MB/s eta 0:00:01\n",
|
||||
" ---------------------------------------- 5.6/5.6 MB 22.9 MB/s 0:00:00\n",
|
||||
"Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)\n",
|
||||
" ---------------------------------------- 0.0/2.9 MB ? eta -:--:--\n",
|
||||
" ---------------------------------------- 2.9/2.9 MB 28.0 MB/s 0:00:00\n",
|
||||
"Installing collected packages: pypdfium2, pdfminer.six, pdfplumber\n",
|
||||
"\n",
|
||||
" ---------------------------------------- 0/3 [pypdfium2]\n",
|
||||
" ---------------------------------------- 0/3 [pypdfium2]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" ------------- -------------------------- 1/3 [pdfminer.six]\n",
|
||||
" -------------------------- ------------- 2/3 [pdfplumber]\n",
|
||||
" ---------------------------------------- 3/3 [pdfplumber]\n",
|
||||
"\n",
|
||||
"Successfully installed pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip install ipywidgets pdfplumber"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "06dcfc1d-b106-4b9a-9346-6dd6af4a4015",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"UNIVERSITY OF KWAZULU-NATAL\n",
|
||||
"Radio Wave Propagation Modeling under\n",
|
||||
"Precipitation and Clear-air at Microwave\n",
|
||||
"and Millimetric Bands over Wireless Links\n",
|
||||
"in the Horn of Africa\n",
|
||||
"Feyisa Debo Diba\n",
|
||||
"February, 2017\n",
|
||||
"Supervisor: Professor Thomas J. Afullo\n",
|
||||
"Co-supervisor: Dr. Akintunde Ayodeji Alonge\n",
|
||||
"Radio Wave Propagation Modeling under\n",
|
||||
"Precipitation and Clear-air at Microwave\n",
|
||||
"and Millimetric Bands over Wireless Links\n",
|
||||
"in the Horn of Africa\n",
|
||||
"Feyisa Debo Diba\n",
|
||||
"In fulfillment of the Degree of Doctor of Philosophy in\n",
|
||||
"Electronic Engineering, College of Agriculture, Engineering\n",
|
||||
"and Science, University of KwaZulu-Natal, Durban\n",
|
||||
"February, 2017\n",
|
||||
"Supervisor:\n",
|
||||
"As the candidate’s Supervisor, I agree/do not agree to the submission of this thesis\n",
|
||||
"Professor T.J. Afullo ———————————-\n",
|
||||
"Date—————————————————\n",
|
||||
"Co-Supervisor:\n",
|
||||
"Dr. Akintunde Ayodeji Alonge\n",
|
||||
"As the candidate’s Co.Supervisor, I agree to the submission of this thesis\n",
|
||||
"Dr. A. A. Alonge ———————————-\n",
|
||||
"Date—————————————————\n",
|
||||
"ii\n",
|
||||
"DECLARATION 1 - PLAGIARISM\n",
|
||||
"I, Feyisa Debo Diba\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Cell 3: Download and extract from PDF URL\n",
|
||||
"pdf_url = (\n",
|
||||
" \"https://researchspace.ukzn.ac.za/server/api/core/bitstreams/\"\n",
|
||||
" \"29218203-bfc8-4fcb-bc63-9afba3341910/content\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = requests.get(pdf_url)\n",
|
||||
"if response.status_code != 200:\n",
|
||||
" raise Exception(f\"Failed to download PDF (Status code: {response.status_code})\")\n",
|
||||
"\n",
|
||||
"with pdfplumber.open(BytesIO(response.content)) as pdf:\n",
|
||||
" thesis_text = \"\\n\".join(page.extract_text() for page in pdf.pages if page.extract_text())\n",
|
||||
"\n",
|
||||
"# Optional Preview\n",
|
||||
"print(thesis_text[:1000])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "84c544db-64a0-4181-beb0-1cc72bc88466",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"# Summary of the Research Thesis\n",
|
||||
"\n",
|
||||
"## 1. Title and Authors\n",
|
||||
"**Title:** Radio Wave Propagation Modeling under Precipitation and Clear-air at Microwave and Millimetric Bands over Wireless Links in the Horn of Africa \n",
|
||||
"**Author:** Feyisa Debo Diba \n",
|
||||
"**Supervisors:** Professor Thomas J. Afullo, Dr. Akintunde Ayodeji Alonge \n",
|
||||
"\n",
|
||||
"## 2. Objective / Research Problem\n",
|
||||
"The thesis investigates radio wave propagation modeling in clear air and precipitation conditions over wireless communication systems in the Horn of Africa, specifically Ethiopia. The research aims to address the attenuation problem caused by precipitation for systems operating at higher frequency bands.\n",
|
||||
"\n",
|
||||
"## 3. Scientific and Regional Background\n",
|
||||
"The congestion of lower operating frequency bands has led to the rapid growth of utilizing higher frequency spectrum for wireless communication systems. However, the Horn of Africa, particularly Ethiopia, lacks comprehensive studies on propagation modeling under different atmospheric conditions. This research provides valuable insights for the region, contributing to the efficient operation of wireless networks.\n",
|
||||
"\n",
|
||||
"## 4. Methodology\n",
|
||||
"The research uses three years of atmospheric data (temperature, pressure, relative humidity) from the National Meteorological Agency of Ethiopia and clear air signal measurements over terrestrial Line-of-Sight (LOS) links from EthioTelecom. Rainfall data from a Davis Vantage weather station installed at Jimma University, Ethiopia, are also used. The study applies the ITU-R model for refractivity gradient prediction and the Rice-Holmberg (R-H) model for one-minute rain rate distribution. A semi-Markovian model is used for rainfall event characterization and generation.\n",
|
||||
"\n",
|
||||
"## 5. Key Findings\n",
|
||||
"The research derived radio climatological parameters for different rain and clear air fade models. It also proposed rainfall rate conversion factors for Ethiopian sites and developed rainfall rate and fade margin contour maps for Ethiopia. The study found that the sojourn time of spikes in every rain regime is appropriately described by Erlang-k distribution. The number of spikes of generated rainfall events and the corresponding sojourn times follow the power-law relationship.\n",
|
||||
"\n",
|
||||
"## 6. Conclusion\n",
|
||||
"The research provides a comprehensive analysis of radio wave propagation under different atmospheric conditions in Ethiopia. The findings contribute to the understanding of the impact of atmospheric conditions on wireless communication systems operating at higher frequency bands.\n",
|
||||
"\n",
|
||||
"## 7. Limitations\n",
|
||||
"The research is limited by the availability and quality of atmospheric and signal level data. The simulation models also have inherent assumptions that may affect the accuracy of the results.\n",
|
||||
"\n",
|
||||
"## 8. Future Work\n",
|
||||
"Future research could focus on refining the models used in this study by incorporating more data and improving the simulation techniques. Studies could also be extended to other regions in the Horn of Africa.\n",
|
||||
"\n",
|
||||
"## 9. Real-World Applications\n",
|
||||
"The findings of this research can improve wireless network planning and 5G deployment in East Africa. The models developed can also be used in link budgeting, which is crucial for the design and operation of wireless communication systems."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Cell 4: Summarize via OpenAI\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": f\"Here is the thesis text (truncated):\\n\\n{thesis_text[:10000]}\"}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4\",\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.3\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"summary = response.choices[0].message.content.strip()\n",
|
||||
"display(Markdown(summary))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e1cdf9ec-5efb-4d4b-8de2-83648865f092",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \"Eres un analista acostumbrado a trabajar con correos electrónicos que contiene un gran conocimiento sobre la mejor manera de resumir contenido releveante \\\n",
|
||||
"dejando de lado cualquier información que no despierte interés o no sea el tema principal del correo. Tu función será leer contenido de correos y definir un listado de las 3 mejores opciones con el formato: Opción *numero de la opción*: *sujeto* Motivo: *que palabras clave dentro del texto has utilizado para llegar a esa conclusion y la relación semántica con tu idea\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
"Tengo un correo que le quiero enviar a mi profesor pero no se muy bien como llamarlo, ayudame. El correo es el siguiente:\n",
|
||||
"Hola profe,\n",
|
||||
"Ultimamente estoy disfrutando mucho sus clases y la información que presenta me parece muy importante. Este fin de semana me voy de vacaciones y no podré\n",
|
||||
"ir a sus clases la semana que viene. Me gustaría si pudiera pasarme los pdfs de la siguiente semana para echarle un vistazo por mi cuenta durante mi ausencia en Francia.\n",
|
||||
"\n",
|
||||
"Un saludo,\n",
|
||||
"Daniel.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [{\"role\" : \"system\" , \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}]\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create( \n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages)\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,202 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5c527a13-459e-4a46-b00e-f2c5056de155",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Research Paper Summarizer with Text Highlighting"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "861a0be5-6da7-4f66-8f82-bc083a913f9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "74bf6765-53b6-457b-ac2d-0d1afa7fbf8f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"API key found and looks good so far!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "227ed7af-d539-4c87-988b-80e6e049c863",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dcaadf8b-456d-48ca-af9d-9f57d3414308",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6315093f-be68-408e-a5e1-6a2e4ea675e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at an article website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"I'm also looking for complete statements containing the following keywords (if found): \\\n",
|
||||
"'large circuit model', 'ChipGPT' \\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"article = Website(\"https://arxiv.org/html/2401.12224v1\")\n",
|
||||
"# print(user_prompt_for(article))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ff8a4112-f118-4866-b6cf-82675de0a38d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a scientific \\\n",
|
||||
"article for a PhD student (who has to read a lot of papers and journals). The \\\n",
|
||||
"user will provide the article website and keyword(s) they are looking to learn and \\\n",
|
||||
"cite from. Your job is to summarize the paper and point out all the statements \\\n",
|
||||
"containing the specific keyword(s) the user typed. \\\n",
|
||||
"Respond in markdown.\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"#messages_for(article)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "b5e47bea-403d-48c3-ab9d-4d6adef83241",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9f6ac1bc-5bc8-4daa-8174-d201400e517a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://arxiv.org/html/2401.12224v1\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,260 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2588fbba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Website Analysis and Summarization with Selenium and OpenAI\n",
|
||||
"\n",
|
||||
"> This notebook demonstrates how to extract and summarize the main content of any website using Selenium for dynamic extraction and OpenAI for generating concise summaries in Mexican Spanish.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"This notebook provides a workflow to automatically analyze websites, extract relevant text, and generate a short summary using a language model. Navigation elements are ignored, focusing on news, announcements, and main content.\n",
|
||||
"\n",
|
||||
"## Features\n",
|
||||
"- Extracts relevant text from web pages using Selenium and BeautifulSoup.\n",
|
||||
"- Generates automatic summaries using OpenAI's language models.\n",
|
||||
"- Presents results in markdown format.\n",
|
||||
"\n",
|
||||
"## Requirements\n",
|
||||
"- Python 3.8+\n",
|
||||
"- Google Chrome browser installed\n",
|
||||
"- The following Python packages:\n",
|
||||
" - selenium\n",
|
||||
" - webdriver-manager\n",
|
||||
" - beautifulsoup4\n",
|
||||
" - openai\n",
|
||||
" - python-dotenv\n",
|
||||
" - requests\n",
|
||||
"- An OpenAI API key (project key, starting with `sk-proj-`)\n",
|
||||
"- Internet connection\n",
|
||||
"\n",
|
||||
"## How to Use\n",
|
||||
"1. Install the required packages:\n",
|
||||
" ```bash\n",
|
||||
" pip install selenium webdriver-manager undetected-chromedriver beautifulsoup4 openai python-dotenv requests\n",
|
||||
" ```\n",
|
||||
"2. Add your OpenAI API key to a `.env` file as `OPENAI_API_KEY`.\n",
|
||||
"3. Run the notebook cells in order. You can change the target website URL in the code to analyze different sites.\n",
|
||||
"4. The summary will be displayed in markdown format below the code cell.\n",
|
||||
"\n",
|
||||
"**Note:** Some websites may block automated access. The notebook includes options to simulate a real user and avoid bot detection, but results may vary depending on the site's protections.\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc7c2ade",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"import undetected_chromedriver as uc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2d21987",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the environment variables from .env\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bbb3a8ed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5313aa64",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Website:\n",
|
||||
" def __init__(self, url, headless=True, wait_time=10):\n",
|
||||
" self.url = url # Website URL to analyze\n",
|
||||
" self.title = None # Title of the website\n",
|
||||
" self.text = None # Extracted text from the website\n",
|
||||
" \n",
|
||||
" # Chrome options configuration for Selenium\n",
|
||||
" options = Options()\n",
|
||||
" if headless:\n",
|
||||
" options.add_argument(\"--headless=new\") # Run Chrome in headless mode (no window)\n",
|
||||
" options.add_argument(\"--disable-gpu\") # Disable GPU acceleration\n",
|
||||
" options.add_argument(\"--no-sandbox\") # Disable Chrome sandbox (required for some environments)\n",
|
||||
" options.add_argument(\"--window-size=1920,1080\") # Set window size to simulate a real user\n",
|
||||
" # Simulate a real user-agent to avoid bot detection\n",
|
||||
" options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\")\n",
|
||||
" \n",
|
||||
" # Initialize Chrome WebDriver\n",
|
||||
" self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n",
|
||||
" self.driver.get(url) # Open the URL in the browser\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Wait until the <body> element is present in the page\n",
|
||||
" WebDriverWait(self.driver, wait_time).until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
|
||||
" html = self.driver.page_source # Get the full HTML of the page\n",
|
||||
" soup = BeautifulSoup(html, 'html.parser') # Parse HTML with BeautifulSoup\n",
|
||||
" self.title = soup.title.string if soup.title else 'No title found' # Extract the title\n",
|
||||
" if soup.body:\n",
|
||||
" # Remove irrelevant elements from the body\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" # Extract clean text from the body\n",
|
||||
" self.text = soup.body.get_text(separator='\\n', strip=True)\n",
|
||||
" else:\n",
|
||||
" self.text = \"No body found\" # If no body is found, indicate it\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error accessing the site: {e}\") # Print error to console\n",
|
||||
" self.text = \"Error accessing the site\" # Store error in the attribute\n",
|
||||
" finally:\n",
|
||||
" self.driver.quit() # Always close the browser, whether or not an error occurred"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e902c6b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown in Mexican Spanish.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eaee8f36",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ac4ed8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Creates messages for the OpenAI API\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1536d537",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Creates a summary for the given URL\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fe135339",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shows the summary for the given URL\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a301ab4e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://openai.com/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/day1.ipynb (new file, 817 lines)
@@ -0,0 +1,817 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# YOUR FIRST LAB\n",
|
||||
"### Please read this section. This is valuable to get you prepared, even if it's a long read -- it's important stuff.\n",
|
||||
"\n",
|
||||
"## Your first Frontier LLM Project\n",
|
||||
"\n",
|
||||
"Let's build a useful LLM solution - in a matter of minutes.\n",
|
||||
"\n",
|
||||
"By the end of this course, you will have built an autonomous Agentic AI solution with 7 agents that collaborate to solve a business problem. All in good time! We will start with something smaller...\n",
|
||||
"\n",
|
||||
"Our goal is to code a new kind of Web Browser. Give it a URL, and it will respond with a summary. The Reader's Digest of the internet!!\n",
|
||||
"\n",
|
||||
"Before starting, you should have completed the setup for [PC](../SETUP-PC.md) or [Mac](../SETUP-mac.md) and you hopefully launched this jupyter lab from within the project root directory, with your environment activated.\n",
|
||||
"\n",
|
||||
"## If you're new to Jupyter Lab\n",
|
||||
"\n",
|
||||
"Welcome to the wonderful world of Data Science experimentation! Once you've used Jupyter Lab, you'll wonder how you ever lived without it. Simply click in each \"cell\" with code in it, such as the cell immediately below this text, and hit Shift+Return to execute that cell. As you wish, you can add a cell with the + button in the toolbar, and print values of variables, or try out variations. \n",
|
||||
"\n",
|
||||
"I've written a notebook called [Guide to Jupyter](Guide%20to%20Jupyter.ipynb) to help you get more familiar with Jupyter Labs, including adding Markdown comments, using `!` to run shell commands, and `tqdm` to show progress.\n",
|
||||
"\n",
|
||||
"## If you're new to the Command Line\n",
|
||||
"\n",
|
||||
"Please see these excellent guides: [Command line on PC](https://chatgpt.com/share/67b0acea-ba38-8012-9c34-7a2541052665) and [Command line on Mac](https://chatgpt.com/canvas/shared/67b0b10c93a081918210723867525d2b). \n",
|
||||
"\n",
|
||||
"## If you'd prefer to work in IDEs\n",
|
||||
"\n",
|
||||
"If you're more comfortable in IDEs like VSCode, Cursor or PyCharm, they both work great with these lab notebooks too. \n",
|
||||
"If you'd prefer to work in VSCode, [here](https://chatgpt.com/share/676f2e19-c228-8012-9911-6ca42f8ed766) are instructions from an AI friend on how to configure it for the course.\n",
|
||||
"\n",
|
||||
"## If you'd like to brush up your Python\n",
|
||||
"\n",
|
||||
"I've added a notebook called [Intermediate Python](Intermediate%20Python.ipynb) to get you up to speed. But you should give it a miss if you already have a good idea what this code does: \n",
|
||||
"`yield from {book.get(\"author\") for book in books if book.get(\"author\")}`\n",
|
||||
"\n",
|
||||
"## I am here to help\n",
|
||||
"\n",
|
||||
"If you have any problems at all, please do reach out. \n",
|
||||
"I'm available through the platform, or at ed@edwarddonner.com, or at https://www.linkedin.com/in/eddonner/ if you'd like to connect (and I love connecting!) \n",
|
||||
"And this is new to me, but I'm also trying out X/Twitter at [@edwarddonner](https://x.com/edwarddonner) - if you're on X, please show me how it's done 😂 \n",
|
||||
"\n",
|
||||
"## More troubleshooting\n",
|
||||
"\n",
|
||||
"Please see the [troubleshooting](troubleshooting.ipynb) notebook in this folder to diagnose and fix common problems. At the very end of it is a diagnostics script with some useful debug info.\n",
|
||||
"\n",
|
||||
"## For foundational technical knowledge (eg Git, APIs, debugging) \n",
|
||||
"\n",
|
||||
"If you're relatively new to programming -- I've got your back! While it's ideal to have some programming experience for this course, there's only one mandatory prerequisite: plenty of patience. 😁 I've put together a set of self-study guides that cover Git and GitHub, APIs and endpoints, beginner python and more.\n",
|
||||
"\n",
|
||||
"This covers Git and GitHub; what they are, the difference, and how to use them: \n",
|
||||
"https://github.com/ed-donner/agents/blob/main/guides/03_git_and_github.ipynb\n",
|
||||
"\n",
|
||||
"This covers technical foundations: \n",
|
||||
"ChatGPT vs API; taking screenshots; Environment Variables; Networking basics; APIs and endpoints: \n",
|
||||
"https://github.com/ed-donner/agents/blob/main/guides/04_technical_foundations.ipynb\n",
|
||||
"\n",
|
||||
"This covers Python for beginners, and making sure that a `NameError` never trips you up: \n",
|
||||
"https://github.com/ed-donner/agents/blob/main/guides/06_python_foundations.ipynb\n",
|
||||
"\n",
|
||||
"This covers the essential techniques for figuring out errors: \n",
|
||||
"https://github.com/ed-donner/agents/blob/main/guides/08_debugging.ipynb\n",
|
||||
"\n",
|
||||
"And you'll find other useful guides in the same folder in GitHub. Some information applies to my other Udemy course (eg Async Python) but most of it is very relevant for LLM engineering.\n",
|
||||
"\n",
|
||||
"## If this is old hat!\n",
|
||||
"\n",
|
||||
"If you're already comfortable with today's material, please hang in there; you can move swiftly through the first few labs - we will get much more in depth as the weeks progress. Ultimately we will fine-tune our own LLM to compete with OpenAI!\n",
|
||||
"\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../important.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#900;\">Please read - important note</h2>\n",
|
||||
" <span style=\"color:#900;\">The way I collaborate with you may be different to other courses you've taken. I prefer not to type code while you watch. Rather, I execute Jupyter Labs, like this, and give you an intuition for what's going on. My suggestion is that you carefully execute this yourself, <b>after</b> watching the lecture. Add print statements to understand what's going on, and then come up with your own variations. If you have a Github account, use this to showcase your variations. Not only is this essential practice, but it demonstrates your skills to others, including perhaps future clients or employers...</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../resources.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#f71;\">This code is a live resource - keep an eye out for my emails</h2>\n",
|
||||
" <span style=\"color:#f71;\">I push updates to the code regularly. As people ask questions, I add more examples or improved commentary. As a result, you'll notice that the code below isn't identical to the videos. Everything from the videos is here; but I've also added better explanations and new models like DeepSeek. Consider this like an interactive book.<br/><br/>\n",
|
||||
" I try to send emails regularly with important updates related to the course. You can find this in the 'Announcements' section of Udemy in the left sidebar. You can also choose to receive my emails via your Notification Settings in Udemy. I'm respectful of your inbox and always try to add value with my emails!\n",
|
||||
" </span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"<table style=\"margin: 0; text-align: left;\">\n",
|
||||
" <tr>\n",
|
||||
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
||||
" <img src=\"../business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
||||
" </td>\n",
|
||||
" <td>\n",
|
||||
" <h2 style=\"color:#181;\">Business value of these exercises</h2>\n",
|
||||
" <span style=\"color:#181;\">A final thought. While I've designed these notebooks to be educational, I've also tried to make them enjoyable. We'll do fun things like have LLMs tell jokes and argue with each other. But fundamentally, my goal is to teach skills you can apply in business. I'll explain business implications as we go, and it's worth keeping this in mind: as you build experience with models and techniques, think of ways you could put this into action at work today. Please do contact me if you'd like to discuss more or if you have ideas to bounce off me.</span>\n",
|
||||
" </td>\n",
|
||||
" </tr>\n",
|
||||
"</table>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6900b2a8-6384-4316-8aaa-5e519fca4254",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Connecting to OpenAI (or Ollama)\n",
|
||||
"\n",
|
||||
"The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI. \n",
|
||||
"\n",
|
||||
"If you'd like to use free Ollama instead, please see the README section \"Free Alternative to Paid APIs\", and if you're not sure how to do this, there's a full solution in the solutions folder (day1_with_ollama.ipynb).\n",
|
||||
"\n",
|
||||
"## Troubleshooting if you have problems:\n",
|
||||
"\n",
|
||||
"Head over to the [troubleshooting](troubleshooting.ipynb) notebook in this folder for step by step code to identify the root cause and fix it!\n",
|
||||
"\n",
|
||||
"If you make a change, try restarting the \"Kernel\" (the python process sitting behind this notebook) by Kernel menu >> Restart Kernel and Clear Outputs of All Cells. Then try this notebook again, starting at the top.\n",
|
||||
"\n",
|
||||
"Or, contact me! Message me or email ed@edwarddonner.com and we will get this to work.\n",
|
||||
"\n",
|
||||
"Any concerns about API costs? See my notes in the README - costs should be minimal, and you can control it at every point. You can also use Ollama as a free alternative, which we discuss during Day 2."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "442fc84b-0815-4f40-99ab-d9a5da6bda91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's make a quick call to a Frontier model to get started, as a preview!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "a58394bf-1e45-46af-9bfd-01e24da6f49a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hello! It’s great to hear from you! How can I help you today?\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n",
|
||||
"\n",
|
||||
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2aa190e5-cb31-456a-96cc-db109919cd78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## OK onwards with our first project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Home - Edward Donner\n",
|
||||
"Home\n",
|
||||
"Connect Four\n",
|
||||
"Outsmart\n",
|
||||
"An arena that pits LLMs against each other in a battle of diplomacy and deviousness\n",
|
||||
"About\n",
|
||||
"Posts\n",
|
||||
"Well, hi there.\n",
|
||||
"I’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (\n",
|
||||
"very\n",
|
||||
"amateur) and losing myself in\n",
|
||||
"Hacker News\n",
|
||||
", nodding my head sagely to things I only half understand.\n",
|
||||
"I’m the co-founder and CTO of\n",
|
||||
"Nebula.io\n",
|
||||
". We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I’m previously the founder and CEO of AI startup untapt,\n",
|
||||
"acquired in 2021\n",
|
||||
".\n",
|
||||
"We work with groundbreaking, proprietary LLMs verticalized for talent, we’ve\n",
|
||||
"patented\n",
|
||||
"our matching model, and our award-winning platform has happy customers and tons of press coverage.\n",
|
||||
"Connect\n",
|
||||
"with me for more!\n",
|
||||
"May 28, 2025\n",
|
||||
"Connecting my courses – become an LLM expert and leader\n",
|
||||
"May 18, 2025\n",
|
||||
"2025 AI Executive Briefing\n",
|
||||
"April 21, 2025\n",
|
||||
"The Complete Agentic AI Engineering Course\n",
|
||||
"January 23, 2025\n",
|
||||
"LLM Workshop – Hands-on with Agents – resources\n",
|
||||
"Navigation\n",
|
||||
"Home\n",
|
||||
"Connect Four\n",
|
||||
"Outsmart\n",
|
||||
"An arena that pits LLMs against each other in a battle of diplomacy and deviousness\n",
|
||||
"About\n",
|
||||
"Posts\n",
|
||||
"Get in touch\n",
|
||||
"ed [at] edwarddonner [dot] com\n",
|
||||
"www.edwarddonner.com\n",
|
||||
"Follow me\n",
|
||||
"LinkedIn\n",
|
||||
"Twitter\n",
|
||||
"Facebook\n",
|
||||
"Subscribe to newsletter\n",
|
||||
"Type your email…\n",
|
||||
"Subscribe\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://edwarddonner.com\")\n",
|
||||
"print(ed.title)\n",
|
||||
"print(ed.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a478a0c-2c53-48ff-869c-4d08199931e1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Types of prompts\n",
|
||||
"\n",
|
||||
"You may know this already - but if not, you will get very familiar with it!\n",
|
||||
"\n",
|
||||
"Models like GPT4o have been trained to receive instructions in a particular way.\n",
|
||||
"\n",
|
||||
"They expect to receive:\n",
|
||||
"\n",
|
||||
"**A system prompt** that tells them what task they are performing and what tone they should use\n",
|
||||
"\n",
|
||||
"**A user prompt** -- the conversation starter that they should reply to"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "f0275b1b-7cfe-4f9d-abfa-7650d378da0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"You are looking at a website titled Home - Edward Donner\n",
|
||||
"The contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n",
|
||||
"\n",
|
||||
"Home\n",
|
||||
"Connect Four\n",
|
||||
"Outsmart\n",
|
||||
"An arena that pits LLMs against each other in a battle of diplomacy and deviousness\n",
|
||||
"About\n",
|
||||
"Posts\n",
|
||||
"Well, hi there.\n",
|
||||
"I’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (\n",
|
||||
"very\n",
|
||||
"amateur) and losing myself in\n",
|
||||
"Hacker News\n",
|
||||
", nodding my head sagely to things I only half understand.\n",
|
||||
"I’m the co-founder and CTO of\n",
|
||||
"Nebula.io\n",
|
||||
". We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I’m previously the founder and CEO of AI startup untapt,\n",
|
||||
"acquired in 2021\n",
|
||||
".\n",
|
||||
"We work with groundbreaking, proprietary LLMs verticalized for talent, we’ve\n",
|
||||
"patented\n",
|
||||
"our matching model, and our award-winning platform has happy customers and tons of press coverage.\n",
|
||||
"Connect\n",
|
||||
"with me for more!\n",
|
||||
"May 28, 2025\n",
|
||||
"Connecting my courses – become an LLM expert and leader\n",
|
||||
"May 18, 2025\n",
|
||||
"2025 AI Executive Briefing\n",
|
||||
"April 21, 2025\n",
|
||||
"The Complete Agentic AI Engineering Course\n",
|
||||
"January 23, 2025\n",
|
||||
"LLM Workshop – Hands-on with Agents – resources\n",
|
||||
"Navigation\n",
|
||||
"Home\n",
|
||||
"Connect Four\n",
|
||||
"Outsmart\n",
|
||||
"An arena that pits LLMs against each other in a battle of diplomacy and deviousness\n",
|
||||
"About\n",
|
||||
"Posts\n",
|
||||
"Get in touch\n",
|
||||
"ed [at] edwarddonner [dot] com\n",
|
||||
"www.edwarddonner.com\n",
|
||||
"Follow me\n",
|
||||
"LinkedIn\n",
|
||||
"Twitter\n",
|
||||
"Facebook\n",
|
||||
"Subscribe to newsletter\n",
|
||||
"Type your email…\n",
|
||||
"Subscribe\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(user_prompt_for(ed))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ea211b5f-28e1-4a86-8e52-c0b7677cadcc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Messages\n",
|
||||
"\n",
|
||||
"The API from OpenAI expects to receive messages in a particular structure.\n",
|
||||
"Many of the other APIs share this structure:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"[\n",
|
||||
" {\"role\": \"system\", \"content\": \"system message goes here\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"user message goes here\"}\n",
|
||||
"]\n",
|
||||
"```\n",
|
||||
"To give you a preview, the next 2 cells make a rather simple call - we won't stretch the mighty GPT (yet!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f25dcd35-0cd0-4235-9f64-ac37ed9eaaa5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is 2 + 2?\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "21ed95c5-7001-47de-a36d-1d6673b403ce",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Oh, you’re going for the big math questions now, huh? Well, if you insist on dragging me into elementary school territory, the answer is 4. Shocking, I know.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# To give you a preview -- calling OpenAI with system and user messages:\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for GPT-4o-mini, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "36478464-39ee-485c-9f3f-6a4e458dbc9c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'role': 'system',\n",
|
||||
" 'content': 'You are an assistant that analyzes the contents of a website and provides a short summary, ignoring text that might be navigation related. Respond in markdown.'},\n",
|
||||
" {'role': 'user',\n",
|
||||
" 'content': 'You are looking at a website titled Home - Edward Donner\\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\\n\\nHome\\nConnect Four\\nOutsmart\\nAn arena that pits LLMs against each other in a battle of diplomacy and deviousness\\nAbout\\nPosts\\nWell, hi there.\\nI’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (\\nvery\\namateur) and losing myself in\\nHacker News\\n, nodding my head sagely to things I only half understand.\\nI’m the co-founder and CTO of\\nNebula.io\\n. We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I’m previously the founder and CEO of AI startup untapt,\\nacquired in 2021\\n.\\nWe work with groundbreaking, proprietary LLMs verticalized for talent, we’ve\\npatented\\nour matching model, and our award-winning platform has happy customers and tons of press coverage.\\nConnect\\nwith me for more!\\nMay 28, 2025\\nConnecting my courses – become an LLM expert and leader\\nMay 18, 2025\\n2025 AI Executive Briefing\\nApril 21, 2025\\nThe Complete Agentic AI Engineering Course\\nJanuary 23, 2025\\nLLM Workshop – Hands-on with Agents – resources\\nNavigation\\nHome\\nConnect Four\\nOutsmart\\nAn arena that pits LLMs against each other in a battle of diplomacy and deviousness\\nAbout\\nPosts\\nGet in touch\\ned [at] edwarddonner [dot] com\\nwww.edwarddonner.com\\nFollow me\\nLinkedIn\\nTwitter\\nFacebook\\nSubscribe to newsletter\\nType your email…\\nSubscribe'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Try this out, and then try for a few more websites\n",
|
||||
"\n",
|
||||
"messages_for(ed)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'# Summary of Edward Donner\\'s Website\\n\\nThe website is the personal platform of Ed Donner, a software engineer and co-founder/CTO of Nebula.io, an AI-focused company that aims to help individuals discover their potential through technology. Ed expresses his passion for coding, experimenting with large language models (LLMs), and interests in DJing and electronic music production.\\n\\n## Key Sections:\\n- **About Ed**: Provides personal background, detailing his experience in AI startups, including his previous venture, untapt, which was acquired in 2021. He highlights the use of patented matching models and LLMs in talent management.\\n- **Connect Four & Outsmart**: Features interactive games or platforms where LLMs engage in diplomatic and strategic challenges.\\n- **Courses & Announcements**:\\n - **May 28, 2025**: Announced a course focused on becoming an LLM expert and leader.\\n - **May 18, 2025**: Announcement for the 2025 AI Executive Briefing.\\n - **April 21, 2025**: Introduction of \"The Complete Agentic AI Engineering Course.\"\\n - **January 23, 2025**: A workshop providing hands-on experience with agents and associated resources.\\n\\nThe website also encourages visitors to connect and engage through various social media platforms and a newsletter subscription.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"# Summary of \"Home - Edward Donner\"\n",
|
||||
"\n",
|
||||
"The website is dedicated to Edward Donner, a co-founder and CTO of Nebula.io, a platform focused on leveraging AI to assist individuals in discovering their potential and engaging with talent. Edward expresses an interest in coding, experiments with large language models (LLMs), DJing, and electronic music production. He has a history as the founder of an AI startup, untapt, which was acquired in 2021.\n",
|
||||
"\n",
|
||||
"## Key Features:\n",
|
||||
"- **Connect Four**: A game involving LLMs competing in diplomacy and strategy.\n",
|
||||
"- **About**: Information about Edward's professional background and interests.\n",
|
||||
"- **Courses and Workshops**: \n",
|
||||
" - **Recent Announcements**:\n",
|
||||
" - **May 28, 2025**: Launch of a program to become an LLM expert and leader.\n",
|
||||
" - **May 18, 2025**: Announcement of a 2025 AI Executive Briefing.\n",
|
||||
" - **April 21, 2025**: Introduction of the Complete Agentic AI Engineering Course.\n",
|
||||
" - **January 23, 2025**: A hands-on LLM Workshop focusing on resources related to agents.\n",
|
||||
"\n",
|
||||
"The content emphasizes his passion for AI and education within the industry."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Let's try more websites\n",
|
||||
"\n",
|
||||
"Note that this will only work on websites that can be scraped using this simplistic approach.\n",
|
||||
"\n",
|
||||
"Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n",
|
||||
"\n",
|
||||
"Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n",
|
||||
"\n",
|
||||
"But many websites will work just fine!"
|
||||
]
|
||||
},
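{
"cell_type": "code",
"execution_count": null,
"id": "selenium-sketch-cell",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of fetching a JavaScript-rendered page with Selenium, as mentioned above.\n",
"# It assumes Selenium 4 and Chrome are installed (Selenium Manager fetches a matching driver);\n",
"# see the community-contributions folder for a fuller implementation.\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"\n",
"def fetch_rendered_text(url, wait_time=3):\n",
"    options = Options()\n",
"    options.add_argument(\"--headless\")\n",
"    options.add_argument(\"--disable-gpu\")\n",
"    driver = webdriver.Chrome(options=options)\n",
"    driver.get(url)\n",
"    time.sleep(wait_time)  # crude wait for client-side rendering\n",
"    html = driver.page_source\n",
"    driver.quit()\n",
"    soup = BeautifulSoup(html, 'html.parser')\n",
"    return soup.get_text(separator=\"\\n\", strip=True)"
]
},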
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "45d83403-a24c-44b5-84ac-961449b4008f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"# CNN Website Summary\n",
|
||||
"\n",
|
||||
"CNN is a major news outlet that provides comprehensive coverage of world events, featuring sections on U.S. and international news, political analysis, business, health, entertainment, and sports. The site emphasizes real-time updates and includes various multimedia formats such as videos and articles.\n",
|
||||
"\n",
|
||||
"### Notable Articles and Updates\n",
|
||||
"\n",
|
||||
"- **Ukraine-Russia War**: Pro-Ukraine protests are ongoing, and there are discussions regarding dignitaries meeting on U.S. soil amidst rising tensions.\n",
|
||||
" \n",
|
||||
"- **Israel-Hamas Conflict**: Analysis and reports highlight significant developments including Israel’s settlement plans which may impact the future of a Palestinian state.\n",
|
||||
"\n",
|
||||
"- **Health**: New heart health guidelines suggest going alcohol-free; studies indicate a high level of stress among teenagers.\n",
|
||||
"\n",
|
||||
"- **Entertainment**:\n",
|
||||
" - Megadeth is set to release its final album and embark on a farewell tour.\n",
|
||||
" - Taylor Swift's recent appearances are noted for cultural impact.\n",
|
||||
"\n",
|
||||
"- **Science**: Climate-related findings unveil vulnerabilities in GPS and satellites due to pollution.\n",
|
||||
"\n",
|
||||
"- **Business**: Discussions are ongoing about potential government stake in Intel, affecting stock prices.\n",
|
||||
"\n",
|
||||
"### Additional Features\n",
|
||||
"CNN also offers a variety of interactive content including quizzes, games, and newsletters tailored to reader interests. The site encourages user engagement through feedback on advertisements and technical issues.\n",
|
||||
"\n",
|
||||
"Overall, CNN remains a significant source for breaking news and in-depth analysis across a broad spectrum of topics."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "75e9fd40-b354-4341-991e-863ef2e59db7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Subject: Leave Notification: Medical Emergency (Aug 17-21, 2025)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \"you are the email assistant, which provide the subject of the email\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
" please provide the appropriate subject for below email\n",
|
||||
"hi team,\n",
|
||||
"due to some medical emergency , i will be on leave for 5 days starting\n",
|
||||
"from 17-08-2025 to 21-08-2025.\n",
|
||||
"\n",
|
||||
"please call me in case of any urgency.\n",
|
||||
"\n",
|
||||
"regards\n",
|
||||
"Rahul\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36ed9f14-b349-40e9-a42c-b367e77f8bda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## An extra exercise for those who enjoy web scraping\n",
|
||||
"\n",
|
||||
"You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. If you have experience with Selenium, Playwright or similar, then feel free to improve the Website class to use them. In the community-contributions folder, you'll find an example Selenium solution from a student (thank you!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eeab24dc-5f90-4570-b542-b0585aca3eb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sharing your code\n",
|
||||
"\n",
|
||||
"I'd love it if you share your code afterwards so I can share it with others! You'll notice that some students have already made changes (including a Selenium implementation) which you will find in the community-contributions folder. If you'd like add your changes to that folder, submit a Pull Request with your new versions in that folder and I'll merge your changes.\n",
|
||||
"\n",
|
||||
"If you're not an expert with git (and I am not!) then GPT has given some nice instructions on how to submit a Pull Request. It's a bit of an involved process, but once you've done it once it's pretty clear. As a pro-tip: it's best if you clear the outputs of your Jupyter notebooks (Edit >> Clean outputs of all cells, and then Save) for clean notebooks.\n",
|
||||
"\n",
|
||||
"Here are good instructions courtesy of an AI friend: \n",
|
||||
"https://chatgpt.com/share/677a9cb5-c64c-8012-99e0-e06e88afd293"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4484fcf-8b39-4c3f-9674-37970ed71988",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,211 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d955d75d-4970-48fe-983e-a2a850cecfc5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"import PyPDF2\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6e1e5dd3-f91a-466b-8fd4-2dbf4eedf101",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override = True)\n",
|
||||
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"API key doesn't look correct, check it\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"It looks like API key has an extra space - check it\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key looks good, moving on!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67a6e583-1ef7-4b77-8886-c0e8c619933c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34a07806-dd68-4a86-8b6e-e1b2aaf0daa1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# path to the CV\n",
|
||||
"path = \"/Users/yanasklar/Documents/For applying/CV/СV_YanaSklyar_c.pdf\"\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Vacancy:\n",
|
||||
" def __init__(self, url, instructions = \"\"):\n",
|
||||
" self.url = url\n",
|
||||
" \n",
|
||||
" # configure Chrome settings\n",
|
||||
" options = Options()\n",
|
||||
" # options.add_argument(\"--headless\") \n",
|
||||
" \"\"\"\n",
|
||||
" Headless mode runs the browser in the background (invisible).\n",
|
||||
" However, some websites (like openai.com) block headless browsers.\n",
|
||||
" So if this line is active, the page may not load correctly and you may not get the full content.\n",
|
||||
" \"\"\"\n",
|
||||
" options.add_argument(\"--disable-gpu\")\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--window-size=1920x1080\")\n",
|
||||
"\n",
|
||||
" # use webdriver-manager to manage ChromeDriver\n",
|
||||
" service = Service(ChromeDriverManager().install())\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
" time.sleep(3) # let the page load\n",
|
||||
"\n",
|
||||
" # take the source of the page\n",
|
||||
" page_source = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # analyse with BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
"\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"img\", \"script\", \"style\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator='\\n', strip=True)\n",
|
||||
"\n",
|
||||
" # read CV\n",
|
||||
" with open(path, 'rb') as f:\n",
|
||||
" reader = PyPDF2.PdfReader(f)\n",
|
||||
" cv_text = \"\"\n",
|
||||
" for page in reader.pages:\n",
|
||||
" text = page.extract_text()\n",
|
||||
" if text:\n",
|
||||
" cv_text += text + \"\\n\"\n",
|
||||
" self.cv_text = cv_text\n",
|
||||
"\n",
|
||||
" # summarise and print the description of the job\n",
|
||||
" message = f\"\"\"Here is the content of a webpage: {self.text}.\n",
|
||||
" Find job description on that page,\n",
|
||||
" summarise it, include the list requirements and other important details.\n",
|
||||
" \"\"\"\n",
|
||||
" messages = [{\"role\":\"user\", \"content\":message}]\n",
|
||||
" response = openai.chat.completions.create(model='gpt-4o-mini', messages = messages)\n",
|
||||
" print(\"The job description: \", response.choices[0].message.content)\n",
|
||||
"\n",
|
||||
" # create prompts\n",
|
||||
" self.system_prompt = \"\"\"You are a career assistant specializing in writing cover letter.\n",
|
||||
" Your tasks:\n",
|
||||
" 1. Read the candidate's CV (provided as text).\n",
|
||||
" 2. Read the job description (provided from a webpage).\n",
|
||||
" 3. Write a concise and compelling cover letter, that:\n",
|
||||
" - Hightlights the most relevant experience and skills from the CV,\n",
|
||||
" - Aligns directly wit the requirements in the job description,\n",
|
||||
" - Adapts to cultural and professional norms in Israel.\n",
|
||||
" The letter should be no longer than half a page, persuasive and tailored to make the applicant stand out.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" user_prompt = f\"\"\"\n",
|
||||
" Here is my CV:\n",
|
||||
" {self.cv_text}\n",
|
||||
" \n",
|
||||
" The job vacancy is from the website {self.title}.\n",
|
||||
" Here is the decription of the vacancy:\n",
|
||||
" {self.text}\n",
|
||||
" Please write a cover letter that connects my background to this vacancy.\n",
|
||||
" Make it persuasive and suitable for Israeli job market.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" if instructions:\n",
|
||||
" user_prompt += f\"Additional instructions: {instructions}\"\n",
|
||||
" self.user_prompt = user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9160b9f5-177b-4477-8e54-3a212f275a22",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def cover_letter(url, instructions = \"\"):\n",
|
||||
" vacancy = Vacancy(url, instructions)\n",
|
||||
" messages = [\n",
|
||||
" {\"role\":\"system\", \"content\":vacancy.system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\":vacancy.user_prompt}\n",
|
||||
" ]\n",
|
||||
" response = openai.chat.completions.create(model='gpt-4o-mini', messages=messages)\n",
|
||||
" if not response:\n",
|
||||
" print(\"smt went wrong\")\n",
|
||||
" print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1de4b55c-a8da-445f-9865-c7a8bafdbc3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a = \"https://www.linkedin.com/jobs/view/4285898438/?alternateChannel=search&eBP=CwEAAAGY3R5LOabDLOVTy6xvBcSlWyAkIXQz8IRkSM3rgsqTPtvcEvUSnq980O7oLV2Hh_ldTpc2cBBmRq1IRnLtp7TzEcUvndFEXeCuviA5yo7oFYfW7KoEp4SPNzmf3D9LtnSgk9Iudy3skk6n3hVOtyDpx8Zm0AiTWPvdwCaZ_w5Xu8lAG797NRNDco71ynm99LmCOC9Go7DdDQ2eLewamc4SOsA4xWcXy0GmZVy3kBF1AprK3ylAYR2wrm5-hp4lRpbbfUxXjkEOG6H_GbPpKtN-N8mYnMd9w_cej5qQmTFX86gqSi6HuXFtK0h46TbOS5r-YQksVd1Yb4kYZnDznWXPLbxp04xVJSPzsHoa05wQdOfZ2UUSoMTJmic3n3qfV2u9Bp8n4sLYtINpzKdvm4eADGGkN-nR3O2oPeas9XjGbBwNdjXHAcX_PJoRwlFdQ1gVkYQEF1T7qAfXUJoUt-fv4oLxGnIgV6yJuMgw&refId=9NA7Bvt%2FhCqDkFNRGu1dPA%3D%3D&trackingId=W11hvpcIjHA%2FjU%2FFZ%2B1uAA%3D%3D\"\n",
|
||||
"b = \"The style of the cover letter should informal, as if i talked to a friend about my background\"\n",
|
||||
"cover_letter(a, b)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0feb3cbe-686a-4a97-9ca3-a0cb32a24c5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python (llms)",
|
||||
"language": "python",
|
||||
"name": "llms"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "935fe7b1-1807-4f75-863d-4c118e425a19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip show selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eabbbc62-1de1-4883-9b3e-9c90145ea6c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.edge.options import Options as EdgeOptions # Import EdgeOptions\n",
|
||||
"from selenium.webdriver.edge.service import Service as EdgeService # Import EdgeService\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" def __init__(self, url, driver_path=None, wait_time=3):\n",
|
||||
" self.url = url\n",
|
||||
" self.wait_time = wait_time\n",
|
||||
"\n",
|
||||
" # Headless Edge settings\n",
|
||||
" options = EdgeOptions() # Use EdgeOptions\n",
|
||||
" # options.add_argument(\"--headless\")\n",
|
||||
" options.add_argument(\"--disable-gpu\")\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--window-size=1920x1080\")\n",
|
||||
"\n",
|
||||
" # Driver path\n",
|
||||
" if driver_path:\n",
|
||||
" # For Edge, you might need to specify the path to msedgedriver\n",
|
||||
" # For driver download, https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads\n",
|
||||
" service = EdgeService(executable_path=driver_path) # Use EdgeService\n",
|
||||
" else:\n",
|
||||
" # If msedgedriver.exe is in your system's PATH, you can omit executable_path\n",
|
||||
" service = EdgeService()\n",
|
||||
"\n",
|
||||
" # Start browser\n",
|
||||
" # Use webdriver.Edge() for Microsoft Edge\n",
|
||||
" driver = webdriver.Edge(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Wait for the loading page\n",
|
||||
" time.sleep(self.wait_time)\n",
|
||||
"\n",
|
||||
" # Take page source\n",
|
||||
" html = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # Analysis with BeautifulSoup \n",
|
||||
" soup = BeautifulSoup(html, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
"\n",
|
||||
" # Clean irrelevant tags\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
"\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"site = Website(\"https://openai.com\", driver_path=\"/Users/klee/Documents/edgedriver_mac64_m1/msedgedriver\")\n",
|
||||
"print(\"Title:\", site.title)\n",
|
||||
"print(\"\\nFirst 500 character:\\n\", site.text[:500])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
319
week1/community-contributions/day2_exercise_using_input.ipynb
Normal file
@@ -0,0 +1,319 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Welcome to your first assignment!\n",
|
||||
"\n",
|
||||
"Instructions are below. Please give this a try, and look in the solutions folder if you get stuck (or feel free to ask me!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29ddd15d-a3c5-4f4e-a678-873f56162724",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dac0a679-599c-441f-9bf2-ddc73d35b940",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a messages list using the same format that we used for OpenAI\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"user\", \"content\": \"Describe some of the business applications of Generative AI\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7bb9c624-14f0-4945-a719-8ddb64f66f47",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"payload = {\n",
|
||||
" \"model\": MODEL,\n",
|
||||
" \"messages\": messages,\n",
|
||||
" \"stream\": False\n",
|
||||
" }"
|
||||
]
|
||||
},
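{
"cell_type": "code",
"execution_count": null,
"id": "ollama-http-sketch",
"metadata": {},
"outputs": [],
"source": [
"# The constants and payload above target Ollama's HTTP endpoint but are never actually sent.\n",
"# A minimal sketch of the direct HTTP call, assuming an Ollama server is listening at OLLAMA_API:\n",
"\n",
"response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)\n",
"print(response.json()['message']['content'])"
]
},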
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7745b9c4-57dc-4867-9180-61fa5db55eb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ollama\n",
|
||||
"\n",
|
||||
"response = ollama.chat(model=MODEL, messages=messages)\n",
|
||||
"print(response['message']['content'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4704e10-f5fb-4c15-a935-f046c06fb13d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Alternative approach - using OpenAI python library to connect to Ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23057e00-b6fc-4678-93a9-6b31cb704bff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# There's actually an alternative approach that some people might prefer\n",
|
||||
"# You can use the OpenAI client python library to call Ollama:\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"\n",
|
||||
"response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1622d9bb-5c68-4d4e-9ca4-b492c751f898",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NOW the exercise for you\n",
|
||||
"\n",
|
||||
"Take the code from day1 and incorporate it here, to build a website summarizer that uses Llama 3.2 running locally instead of OpenAI; use either of the above approaches."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "0c1f84c4-4cc0-4085-8ea5-871a8ca46a47",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "890852ab-2cd4-41dc-b168-6bd1360b967a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "9d398f9a-c66e-42b5-91b4-5417944b8408",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_generator(website) -> str:\n",
|
||||
" user_prompt = f\"You will act as a website summarizer with knowledge of Web Content Accessibility Guidelines. You will look into the web: {website.title} and \"\n",
|
||||
" user_prompt += \"break down the relevant information about it in this categories: What is the website about, \\\n",
|
||||
" to whom the website belongs and what practises should improve to have a better user experience. \\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
"\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "156d7c67-b714-4156-9f69-faf0c50aaf13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_generator(user_prompt : str) -> list[dict[str, str]]:\n",
|
||||
" messages = [{\"role\" : \"user\", \"content\" : user_prompt}]\n",
|
||||
"\n",
|
||||
" return messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "f07c4143-6cc5-4d28-846c-a373564e9264",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_request_reader() -> str:\n",
|
||||
" while True:\n",
|
||||
" website_url = input(\"Define what website you want to summarize by giving the url: \")\n",
|
||||
" if website_url.lower().startswith(\"http\"):\n",
|
||||
" return website_url\n",
|
||||
" print(\"URL not valid. Please provide a full url.\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "94933255-2ca8-40b5-8f74-865d3e781058",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarizer_bot():\n",
|
||||
" website_url = user_request_reader()\n",
|
||||
" website = Website(website_url)\n",
|
||||
" \n",
|
||||
" user_prompt = user_prompt_generator(website)\n",
|
||||
" messages = messages_generator(user_prompt)\n",
|
||||
"\n",
|
||||
" response = ollama.chat(model=MODEL, messages=messages)\n",
|
||||
" print(response['message']['content'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "2d81faa4-25b3-4d5d-8f36-93772e449b5c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Define what website you want to summarize by giving the url: test.com\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"URL not valid. Please provide a full url.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Define what website you want to summarize by giving the url: https://edwarddonner.com\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"**Summary:**\n",
|
||||
"\n",
|
||||
"The website \"Home - Edward Donner\" belongs to Edward Donner, a co-founder and CTO of Nebula.io, an AI startup. The website is about Edward's interests in writing code, experimenting with Large Language Models (LLMs), and DJing, as well as his work in applying AI to help people discover their potential.\n",
|
||||
"\n",
|
||||
"**Categories:**\n",
|
||||
"\n",
|
||||
"### What is the website about?\n",
|
||||
"\n",
|
||||
"The website is primarily about Edward Donner's personal brand, showcasing his expertise in AI and LLMs. It includes information about his work at Nebula.io, which applies AI to talent management. The website also features a \"Connect Four\" arena where LLMs compete against each other, as well as sections for learning more about LLMs and staying up-to-date with Edward's courses and publications.\n",
|
||||
"\n",
|
||||
"### To whom does the website belong?\n",
|
||||
"\n",
|
||||
"The website belongs to Edward Donner, a co-founder and CTO of Nebula.io. It appears to be a personal website or blog, showcasing his expertise and interests in AI and LLMs.\n",
|
||||
"\n",
|
||||
"### Practices to improve for better user experience:\n",
|
||||
"\n",
|
||||
"1. **Clearer navigation**: The website's menu is simple but not intuitive. Adding clear categories or sections would help users quickly find the information they're looking for.\n",
|
||||
"2. **More detailed about section**: The \"About\" section provides a brief overview of Edward's work and interests, but it could be more detailed and comprehensive.\n",
|
||||
"3. **Improved accessibility**: While the website is likely following general web accessibility guidelines, there are no clear indications of this on the page. Adding alt text to images, providing a clear font size and color scheme, and ensuring sufficient contrast between background and foreground would improve the user experience for people with disabilities.\n",
|
||||
"4. **Better calls-to-action (CTAs)**: The website could benefit from more prominent CTAs, guiding users towards specific actions such as signing up for courses or following Edward on social media.\n",
|
||||
"5. **SEO optimization**: The website's content and meta tags appear to be optimized for search engines, but a more thorough SEO analysis would help identify areas for improvement.\n",
|
||||
"\n",
|
||||
"Overall, the website provides a clear overview of Edward Donner's interests and expertise in AI and LLMs, but could benefit from some tweaks to improve accessibility, navigation, and CTAs.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The call\n",
|
||||
"summarizer_bot()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
338
week1/community-contributions/day5-GitaScripting.ipynb
Normal file
@@ -0,0 +1,338 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7759922b-12c9-44e0-8ac3-5f2a02b321d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import fitz # PyMuPDF\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"from typing import List\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a266273a-05e3-451e-a318-428726cfa39c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize and constants\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n",
|
||||
" print(\"API key looks good so far\")\n",
|
||||
"else:\n",
|
||||
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
|
||||
" \n",
|
||||
"MODEL = 'gpt-4o-mini'\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45566572-dd66-48dc-ab7b-6adbe26eacba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exceptions = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "effc0e7b-d668-48b3-86d0-dbb5d8fe3d55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Building system prompt\n",
|
||||
"def get_verse_system_prompt():\n",
|
||||
" system_prompt = \"You are a spiritual student who classifies the versus of the BhagavadGita according to a given theme.\\n\"\n",
|
||||
" system_prompt += \"Given a theme, you should pick a verse from any chapter and give it's location in the form of index chapter.verse_number (6.2)\\n\"\n",
|
||||
" system_prompt += \"You should respond in JSON as in this example:\\n\"\n",
|
||||
" system_prompt += \"\"\"\n",
|
||||
" {\"title\": \"Chapter 3, Verse 21 (3.21)\", \"verse\": \"कर्मणा ह्यपि संसिद्धिम्\n",
|
||||
" आस्थिता जनकादय:।\n",
|
||||
" लोकसंग्रहमेवापि\n",
|
||||
" सम्पश्यन्कर्तुमर्हसि॥\"}\n",
|
||||
" \"\"\"\n",
|
||||
" return system_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bbfb1035-b183-4481-9b49-3cc1b12b42e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(get_verse_system_prompt())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6acdcd6c-1fc5-4c71-81d0-665e25808e46",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define user prompt\n",
|
||||
"def get_verse_user_prompt(theme):\n",
|
||||
" user_prompt = f'''\n",
|
||||
" Here is the theme : {theme},\n",
|
||||
" Please find a verse from BhagavadGita excluding {exceptions} for a given theme {theme}\n",
|
||||
" '''#excluding those results which are already used\n",
|
||||
" \n",
|
||||
" user_prompt += \"If the verse is not in the exceptions for a given theme and used for a different theme, you are free to suggest it for a different theme.\"\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "72f5c755-ec2d-4545-9a31-0f6b2e5ed4da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(get_verse_user_prompt('motivation'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "304d432c-7216-4a90-a5d8-db36b193657d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Call openAI to return versus\n",
|
||||
"def get_verses(theme):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": get_verse_system_prompt()},\n",
|
||||
" {\"role\": \"user\", \"content\": get_verse_user_prompt(theme)}\n",
|
||||
" ],\n",
|
||||
" response_format={\"type\": \"json_object\"}\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" result = json.loads(result)\n",
|
||||
"\n",
|
||||
" #Remember those results which are suggested now\n",
|
||||
" combination = (theme, result['title'])\n",
|
||||
" exceptions.append(combination)\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b90eeb35-e10e-48ee-ade6-e0594da8c51b",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(get_verses('motivation'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5b8925e-52e4-4cb7-9205-51c65ed88fb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# So far we have fetched the new verses relevant to a given theme \n",
|
||||
"# Lets generate a script for producting youtube video"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8ff0862b-0310-4174-ad12-64047932dc9e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#def function for system prompt\n",
|
||||
"def get_script_system_prompt(tone, theme, format):\n",
|
||||
" sys_prompt = 'You are a script writer for a youtube spiritual channel\\n'\n",
|
||||
" sys_prompt += 'You are given a verse like below: \\n'\n",
|
||||
" sys_prompt += str(get_verses(theme))\n",
|
||||
" sys_prompt += '\\n'\n",
|
||||
" sys_prompt += f'Give me an engaging script in a {tone} tone for a {format} format video for audience like youth seeking purpose, spiritual seekers, indians abroad, scholars and curious minds.'\n",
|
||||
"\n",
|
||||
" return sys_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "47476516-cd2f-4b16-b378-a70617bbe284",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(get_script_system_prompt('Motivating','motivation','long'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e305525b-8dde-4e93-927a-e24531827498",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# def function for user prompt\n",
|
||||
"def get_script_user_prompt(format, theme):\n",
|
||||
" user_prompt = f'Given the verse, help me generate a detailed script suitable for {format} format video.\\n'\n",
|
||||
" user_prompt += f'Please give me the complete verse, its meaning, a relevant story having a dilemma which the verse solves and the interpretation of the verse with respect to {theme}.\\n'\n",
|
||||
" user_prompt += 'Let the script give cues about video editing, host actions.'\n",
|
||||
" user_prompt += 'given the below example, please follow the format:\\n'\n",
|
||||
" user_prompt += \"\"\"\n",
|
||||
" [Opening Scene - Soft Instrumental Music Playing]\n",
|
||||
"\n",
|
||||
" [Cut to Host in a serene setting, perhaps by a river or in a lush green garden.]\n",
|
||||
"\n",
|
||||
" Host: (Smiling at the camera) \"Namaste, dear viewers! Welcome back to our channel, where we explore the depths of spirituality and seek to ignite the flame of wisdom within you. Today, we delve into a profound verse from the Bhagavad Gita that speaks to the very essence of life and identity.\"\n",
|
||||
"\n",
|
||||
" [Text On Screen: Chapter 2, Verse 13 (2.13)]\n",
|
||||
"\n",
|
||||
" Host: (With a sense of reverence) \"Let’s first take a moment to recite this verse together. It goes like this:\n",
|
||||
"\n",
|
||||
" देहिनोऽस्मिन्न्यथा देहे कौमारं यौवनं जरा।\n",
|
||||
" तथादेहान्तरप्राप्तिर्धीरस्तत्र न मुह्यति॥\n",
|
||||
"\n",
|
||||
" Now, let’s understand the essence of this verse.\"\n",
|
||||
"\n",
|
||||
" [Cut to Graphic: Verse Translation with Key Concepts Highlighted]\n",
|
||||
"\n",
|
||||
" Host Voiceover: (Calm and engaging tone) \"The meaning of this beautiful verse translates to: 'Just as the body undergoes changes from childhood to old age, similarly, the soul transitions from one body to another. The wise, who understand this, are never bewildered by these changes.'\n",
|
||||
"\n",
|
||||
" [Cut back to Host]\n",
|
||||
"\n",
|
||||
" Host: (Nodding, creating a connection)\n",
|
||||
" \"So, why is this verse so important, especially for us as young seekers of purpose? It highlights a profound truth—that our identities are not confined by our physical forms or the stages of life we experience. Instead, we are eternal beings who are constantly evolving.\"\n",
|
||||
"\n",
|
||||
" [Scene Transition - Soft Music Playing]\n",
|
||||
"\n",
|
||||
" [Cut to a Story Animation - A young man named Arjun in a busy city]\n",
|
||||
"\n",
|
||||
" Host (Voiceover): \"Let me share a relatable story. Meet Arjun. Like many of us, he was once full of dreams and aspirations. He excelling in school, pursuing a career in engineering. But as the years passed, he faced a crossroads. As the pressure mounted, he began to question his identity.\n",
|
||||
"\n",
|
||||
" (Visuals show Arjun overwhelmed by societal expectations, with people pushing him in different directions.)\n",
|
||||
"\n",
|
||||
" He felt distinct phases of life pulling at him: childhood dreams, youthful ambitions, and the looming responsibilities of adulthood. The changing seasons of his life left him confused and wondering if he had lost his true self.\"\n",
|
||||
"\n",
|
||||
" [Cut back to Host, empathetic tone]\n",
|
||||
"\n",
|
||||
" Host: \"Have you ever felt like Arjun? It’s a dilemma we all face, especially in today's fast-paced world where expectations can cloud our true identity. But just like our verse suggests, we should recognize that these changes don’t define us. They are simply part of the journey.\"\n",
|
||||
"\n",
|
||||
" [Scene Transition - Calm Music Playing while Host meditates]\n",
|
||||
"\n",
|
||||
" Host: (Speaking gently) \"Let’s take a moment to reflect. When we are sad, does that sadness define us? Or when we achieve success, do we become defined solely by that success? The answer isn't as straightforward as it seems. Here’s the catch: our essence is beyond these transient states. Like the body, our identities are fluid.\"\n",
|
||||
"\n",
|
||||
" [Cut to Visuals of Nature - flowing rivers, trees shedding leaves, etc.]\n",
|
||||
"\n",
|
||||
" Host Voiceover: \"Imagine the endless cycle of nature—the changing seasons, the growth, the decay, and rebirth. Just like the leaves that drop to make way for new growth, our experiences contribute to our spiritual evolution.\"\n",
|
||||
"\n",
|
||||
" [Cut back to Host - Inviting and Warm Tone]\n",
|
||||
"\n",
|
||||
" Host: \"Just as the wise who understand the transformation of the soul remain unshaken, we, too, can cultivate that wisdom to rise above the chaos of change. Recognize your true essence—beyond the body, the roles, the titles. Understand that your spirit is eternal.\"\n",
|
||||
"\n",
|
||||
" [Scene Transition - Soft Inspirational Music Begins]\n",
|
||||
"\n",
|
||||
" Host: (Passionately) \"So how can we embody this truth in our daily lives? Here’s a small exercise: Each day, take a few moments to meditate on who you really are. Write down what aspects of your identity are tied to transient things. Challenge yourself—what happens when you peel these layers away?\"\n",
|
||||
"\n",
|
||||
" [Cut to host with a pad, writing ideas]\n",
|
||||
"\n",
|
||||
" [Scene Transition - Editing Cues - Show engaging graphics of identity, layers of a person, etc.]\n",
|
||||
"\n",
|
||||
" Host Voiceover: \"Each effort towards understanding and embracing our true self draws us closer to the realization that we are eternal souls, having a human experience. This is the wisdom that can empower you to stand tall against the adversities of life.\"\n",
|
||||
"\n",
|
||||
" [Cut back to Host]\n",
|
||||
"\n",
|
||||
" Host: (Concluding) \"Thank you for joining me today in this exploration of Chapter 2, Verse 13 of the Bhagavad Gita. Remember, when you feel lost in the complexities of life, return to this teachings and remind yourself that you are not just a body; you are an eternal being on a magnificent journey.\n",
|
||||
"\n",
|
||||
" [Closing Scene - Uplifting Music Playing]\n",
|
||||
"\n",
|
||||
" Host: \"Don’t forget to like, share, and subscribe if you found resonance in this message. And share your thoughts in the comments below. What did you find most challenging in your own journey of self-identity? Let’s connect and support each other in our spiritual quests. Until next time, stay enlightened, stay inspired!\"\n",
|
||||
"\n",
|
||||
" [End Screen with Subscribe Button and Previous Video Suggestions]\n",
|
||||
"\n",
|
||||
" [End of Script]\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c4b29cb9-d8d1-413a-8152-4250e2430a42",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(get_script_user_prompt('long','motivation'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1dfa60ce-9e88-4f7d-8e60-ac37a0aafc15",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_script(tone, theme, format):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": get_script_system_prompt(tone,theme,format)},\n",
|
||||
" {\"role\": \"user\", \"content\": get_script_user_prompt(format,theme)}\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" display(Markdown(result))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec86c436-42ae-4313-b12f-4fad42ab2227",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"create_script('motivating','self-identity','long')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,329 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ab446e4-219c-4589-aa8f-9386adcf5c60",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Project Overview\n",
|
||||
"This project combines web scraping with OpenAI’s GPT models to summarize online training content. It extracts material from Microsoft’s **Quantum Computing Fundamentals** learning path, cleans it, and generates concise summaries per lesson as well as an overall course summary. \n",
|
||||
"\n",
|
||||
"## Key Features\n",
|
||||
"- Fetches and parses webpages using **requests** and **BeautifulSoup** \n",
|
||||
"- Produces summaries in multiple languages (e.g., English, Spanish, or any language) and at varying levels of detail (short, medium, detailed) \n",
|
||||
"- Summarizes individual lessons on demand or processes entire learning paths \n",
|
||||
"- Presents results as clean, structured **Markdown** directly in the notebook \n",
|
||||
"\n",
|
||||
"## Tech Stack\n",
|
||||
"- **Model**: GPT-4o-mini \n",
|
||||
"- **Language**: Python \n",
|
||||
"- **Libraries**: BeautifulSoup, OpenAI \n",
|
||||
"\n",
|
||||
"## Purpose\n",
|
||||
"This project demonstrates how AI can streamline the understanding of technical documentation and online courses by generating multilingual, customizable summaries. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables from .env file (not included)\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"training_website = Website(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")\n",
|
||||
"print(training_website.title)\n",
|
||||
"print(training_website.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a system prompt function that can use different language and length \n",
|
||||
"\n",
|
||||
"def build_system_prompt(language=\"Spanish\", length=\"short\"):\n",
|
||||
" return f\"\"\"You are an assistant that analyzes the contents of a website and provides a {length} summary, ignoring text that might be navigation related.\n",
|
||||
" Respond in 20 words or less markdown, and respond in {language}.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "987c95a6-6618-4d22-a2c3-3038a9d3f154",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary in {language} of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a846c89-81d8-4f48-9d62-7744d76694e2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(user_prompt_for(training_website))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for GPT-4o-mini, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def messages_for(website, language=\"Spanish\", length=\"short\"):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "425214b8-c5c5-4d7a-8b79-f9e151c9d54f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#call the OpenAI API. \n",
|
||||
"\n",
|
||||
"def summarize(url, language=\"Spanish\", length=\"short\"):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=messages_for(website, language, length)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
" "
|
||||
]
|
||||
},
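{
"cell_type": "code",
"execution_count": null,
"id": "links-from-path-sketch",
"metadata": {},
"outputs": [],
"source": [
"# get_links_from_path is called by summarize_training below but isn't defined in this notebook.\n",
"# A minimal sketch, assuming lesson/module pages on the learning path can be recognised by\n",
"# '/training/modules/' appearing in their href (an assumption about Microsoft Learn's URL scheme):\n",
"\n",
"def get_links_from_path(path_url):\n",
"    response = requests.get(path_url, headers=headers)\n",
"    soup = BeautifulSoup(response.content, 'html.parser')\n",
"    links = set()\n",
"    for anchor in soup.find_all('a', href=True):\n",
"        href = anchor['href']\n",
"        if '/training/modules/' in href:\n",
"            links.add(requests.compat.urljoin(path_url, href))  # resolve relative URLs\n",
"    return sorted(links)"
]
},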
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1c437357-d004-49f5-95c3-fce38aefcb5c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Summarize all the lessons in microsoft quantum computer training, having the option to summarize by lesson, or the training as a whole\n",
|
||||
"\n",
|
||||
"def summarize_training(path_url, language=\"Spanish\", length=\"short\"):\n",
|
||||
" links = get_links_from_path(path_url)\n",
|
||||
" print(f\"Found {len(links)} lessons\")\n",
|
||||
"\n",
|
||||
" all_summaries = []\n",
|
||||
"\n",
|
||||
" for link in links:\n",
|
||||
" print(f\"Summarizing {link}...\")\n",
|
||||
" summary = summarize(link, language, length)\n",
|
||||
" all_summaries.append(f\"### {link}\\n{summary}\\n\")\n",
|
||||
"\n",
|
||||
" combined_prompt = \"Here are summaries of each lesson:\\n\\n\" + \"\\n\".join(all_summaries)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
|
||||
" {\"role\": \"user\", \"content\": \"Please summarize the entire training path based on these lesson summaries:\\n\\n\" + combined_prompt}\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return \"\\n\".join(all_summaries) + \"\\n\\n## General Course Summary\\n\" + response.choices[0].message.content\n",
|
||||
" "
|
||||
]
|
||||
},
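`summarize_training` returns a single Markdown string containing each lesson summary followed by a combined course summary, so a typical call might look like this sketch (it relies on `get_links_from_path` defined earlier in the notebook, and the URL is just the path used further down):

```python
# Summarize the whole Microsoft quantum computing training path lesson by lesson
training_summary = summarize_training(
    "https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/",
    language="English",
    length="short",
)
display(Markdown(training_summary))
```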
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a68b1042-558a-4051-85e2-9ffd7a31a871",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Website Summarization Using llama\n",
|
||||
"### Week 1 Day 2 Exercise"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "176fcb2f-9ac7-460b-9fad-415e89c4920e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "b9c63761-c904-491b-92c7-e41eb319c3e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"# OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"# HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "afe29712-751c-4322-a4c6-aed01e6acf26",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "be3eeb3f-aec5-4ef8-9427-3b80b2dce919",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model = MODEL,\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
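The constants cell above keeps `OLLAMA_API` and `HEADERS` commented out; as a hedged alternative to the OpenAI-compatible client used in `summarize`, the same messages could be posted straight to Ollama's native chat endpoint. A minimal sketch, assuming Ollama is serving locally on port 11434 with `llama3.2` pulled, and reusing the `Website`, `messages_for` and `MODEL` names from this notebook:

```python
import requests

def summarize_via_native_api(url):
    # Same messages, but posted to Ollama's own /api/chat endpoint instead of the
    # OpenAI-compatible /v1 route; stream=False returns a single JSON response
    website = Website(url)
    payload = {"model": MODEL, "messages": messages_for(website), "stream": False}
    response = requests.post(
        "http://localhost:11434/api/chat",
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    return response.json()["message"]["content"]
```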
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "a78b587d-3a75-45a8-9ac5-f78dcddfa822",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/week-1_exercise.ipynb (new file, 337 lines)
@@ -0,0 +1,337 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "64d2e4a0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# End of Week 1 Exercise\n",
|
||||
"\n",
|
||||
"To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question,\n",
|
||||
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e62b915e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"import ollama\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import os\n",
|
||||
"from IPython.display import display, update_display, Markdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "8bdfc47a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_GPT = 'gpt-4o-mini'\n",
|
||||
"MODEL_LLAMA = 'llama3'\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"openai=OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "57983d03",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_messages(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Create properly formatted messages for API calls\"\"\"\n",
|
||||
" messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": \"You are a helpful technical assistant that provides clear, detailed explanations for technical questions.\"\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": prompt}\n",
|
||||
" ]\n",
|
||||
" return messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "a6bcb94d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def answer_with_openai(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Get answer using OpenAI API and print in stream\"\"\"\n",
|
||||
" try:\n",
|
||||
" messages = create_messages(prompt)\n",
|
||||
" stream = openai.chat.completions.create(\n",
|
||||
" model=MODEL_GPT,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.7,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" answer = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" answer += chunk.choices[0].delta.content\n",
|
||||
" # Clean up markdown formatting for display\n",
|
||||
" clean_answer = answer.replace(\"```\", \"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(clean_answer), display_id=display_handle.display_id)\n",
|
||||
" return answer\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"Error with OpenAI: {str(e)}\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "e96159ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def answer_with_ollama(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Get answer using Ollama API and print in stream\"\"\"\n",
|
||||
" try:\n",
|
||||
" messages = create_messages(prompt)\n",
|
||||
" stream = ollama.chat(\n",
|
||||
" model=MODEL_LLAMA,\n",
|
||||
" messages=messages,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" answer = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" if chunk['message']['content']:\n",
|
||||
" answer += chunk['message']['content']\n",
|
||||
" # Clean up markdown formatting for display\n",
|
||||
" clean_answer = answer.replace(\"```\", \"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(clean_answer), display_id=display_handle.display_id)\n",
|
||||
" return answer\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"Error with Ollama: {str(e)}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "ab72f8b6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def technical_qa_tool(question, use_openai=True, use_ollama=True):\n",
|
||||
" \"\"\"Main function to get technical explanations from both APIs\"\"\"\n",
|
||||
" print(f\"Question: {question}\")\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" \n",
|
||||
" if use_openai:\n",
|
||||
" print(\"\\n🤖 OpenAI Response:\")\n",
|
||||
" print(\"-\" * 40)\n",
|
||||
" answer_with_openai(question)\n",
|
||||
" \n",
|
||||
" if use_ollama:\n",
|
||||
" print(\"\\n🦙 Ollama Response:\")\n",
|
||||
" print(\"-\" * 40)\n",
|
||||
" answer_with_ollama(question)\n",
|
||||
" # display(Markdown(ollama_answer))\n",
|
||||
" \n",
|
||||
" print(\"\\n\" + \"=\" * 80)"
|
||||
]
|
||||
},
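Since `technical_qa_tool` exposes `use_openai` and `use_ollama` flags, it can also be pointed at a single backend; a small usage sketch (the question is only an example):

```python
# Ask only the local Llama model, skipping the OpenAI call
technical_qa_tool(
    "How does Python's GIL affect multithreaded CPU-bound code?",
    use_openai=False,
    use_ollama=True,
)
```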
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "1a6aa4a2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Question: What is the difference between supervised and unsupervised machine learning?\n",
|
||||
"================================================================================\n",
|
||||
"\n",
|
||||
"🤖 OpenAI Response:\n",
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Supervised and unsupervised machine learning are two primary categories of machine learning techniques, and they differ mainly in how they learn from data and the type of problems they are used to solve. Here’s a detailed explanation of each:\n",
|
||||
"\n",
|
||||
"### Supervised Machine Learning\n",
|
||||
"\n",
|
||||
"**Definition**: In supervised learning, the model is trained on a labeled dataset, meaning that each training example is paired with an output label. The goal is to learn a mapping from inputs (features) to the output labels.\n",
|
||||
"\n",
|
||||
"**Characteristics**:\n",
|
||||
"- **Labeled Data**: Requires a dataset that includes both the input features and the corresponding output labels.\n",
|
||||
"- **Objective**: The objective is to predict the output for new, unseen data based on the learned mapping from the training data.\n",
|
||||
"- **Common Techniques**:\n",
|
||||
" - **Regression**: For predicting continuous values (e.g., predicting house prices).\n",
|
||||
" - **Classification**: For predicting discrete labels (e.g., spam detection in emails).\n",
|
||||
"- **Examples**:\n",
|
||||
" - Predicting whether an email is spam or not based on various features (classification).\n",
|
||||
" - Forecasting sales figures based on historical sales data (regression).\n",
|
||||
"\n",
|
||||
"### Unsupervised Machine Learning\n",
|
||||
"\n",
|
||||
"**Definition**: In unsupervised learning, the model is trained on data that is not labeled, meaning that it does not have predefined output labels. The goal is to discover patterns, groupings, or structures within the data.\n",
|
||||
"\n",
|
||||
"**Characteristics**:\n",
|
||||
"- **Unlabeled Data**: Works with datasets that only have input features without any associated output labels.\n",
|
||||
"- **Objective**: The objective is to explore the data and find hidden patterns or intrinsic structures without specific guidance.\n",
|
||||
"- **Common Techniques**:\n",
|
||||
" - **Clustering**: Grouping similar data points together (e.g., customer segmentation).\n",
|
||||
" - **Dimensionality Reduction**: Reducing the number of features while retaining essential information (e.g., PCA - Principal Component Analysis).\n",
|
||||
"- **Examples**:\n",
|
||||
" - Grouping customers into segments based on purchasing behavior (clustering).\n",
|
||||
" - Reducing the dimensionality of a dataset to visualize it in two or three dimensions (dimensionality reduction).\n",
|
||||
"\n",
|
||||
"### Key Differences\n",
|
||||
"\n",
|
||||
"1. **Data Type**:\n",
|
||||
" - Supervised Learning: Requires labeled data.\n",
|
||||
" - Unsupervised Learning: Works with unlabeled data.\n",
|
||||
"\n",
|
||||
"2. **Goal**:\n",
|
||||
" - Supervised Learning: To learn a function that maps inputs to the correct outputs.\n",
|
||||
" - Unsupervised Learning: To identify patterns or groupings in the input data.\n",
|
||||
"\n",
|
||||
"3. **Applications**:\n",
|
||||
" - Supervised Learning: Typically used in scenarios where past data with known outcomes is available (e.g., fraud detection, image classification).\n",
|
||||
" - Unsupervised Learning: Used for exploratory data analysis or when the outcome is not known (e.g., market basket analysis, anomaly detection).\n",
|
||||
"\n",
|
||||
"In summary, the primary difference between supervised and unsupervised machine learning lies in the presence or absence of labeled data and the objectives of the learning process. Supervised learning aims to predict outcomes based on existing labels, while unsupervised learning seeks to identify hidden structures in data without predefined labels."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"🦙 Ollama Response:\n",
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"In machine learning, there are two main categories: supervised and unsupervised learning. The key difference lies in the type of data used to train the model and the goal of the learning process.\n",
|
||||
"\n",
|
||||
"**Supervised Learning**\n",
|
||||
"\n",
|
||||
"In supervised learning, you have a labeled dataset that contains both input data (features) and corresponding output labels or target variables. The goal is to learn a mapping between the input data and the output labels so that the model can make accurate predictions on new, unseen data.\n",
|
||||
"\n",
|
||||
"Here are some characteristics of supervised learning:\n",
|
||||
"\n",
|
||||
"1. Labeled training data: You have a dataset with input data and corresponding output labels.\n",
|
||||
"2. Specific goal: You want to predict the output label for a given input instance.\n",
|
||||
"3. Model evaluation: You evaluate the performance of your model using metrics like accuracy, precision, recall, F1 score, etc.\n",
|
||||
"\n",
|
||||
"Examples of supervised learning tasks include:\n",
|
||||
"\n",
|
||||
"* Image classification (e.g., recognizing dogs vs. cats)\n",
|
||||
"* Sentiment analysis (e.g., determining if text is positive or negative)\n",
|
||||
"* Regression problems (e.g., predicting house prices based on features like number of bedrooms and square footage)\n",
|
||||
"\n",
|
||||
"**Unsupervised Learning**\n",
|
||||
"\n",
|
||||
"In unsupervised learning, you have an unlabeled dataset, and the goal is to discover patterns, relationships, or structure in the data without a specific target variable. This type of learning is often used for exploratory data analysis, feature selection, and dimensionality reduction.\n",
|
||||
"\n",
|
||||
"Here are some characteristics of unsupervised learning:\n",
|
||||
"\n",
|
||||
"1. Unlabeled training data: You have a dataset with only input features (no output labels).\n",
|
||||
"2. No specific goal: You want to find interesting patterns or structure in the data.\n",
|
||||
"3. Model evaluation: You evaluate the performance of your model using metrics like silhouette score, Calinski-Harabasz index, etc.\n",
|
||||
"\n",
|
||||
"Examples of unsupervised learning tasks include:\n",
|
||||
"\n",
|
||||
"* Clustering (e.g., grouping customers based on their purchase history)\n",
|
||||
"* Dimensionality reduction (e.g., reducing the number of features in a dataset while preserving important information)\n",
|
||||
"* Anomaly detection (e.g., identifying unusual behavior or outliers in financial transactions)\n",
|
||||
"\n",
|
||||
"In summary, supervised learning involves training a model to make predictions based on labeled data, whereas unsupervised learning aims to discover patterns and relationships in unlabeled data."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the tool with a technical question\n",
|
||||
"technical_question = \"What is the difference between supervised and unsupervised machine learning?\"\n",
|
||||
"technical_qa_tool(technical_question)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0a976ce1",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9b0a539e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Interactive version - uncomment to use\n",
|
||||
"# user_question = input(\"Enter your technical question: \")\n",
|
||||
"# technical_qa_tool(user_question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# End of week 1 exercise\n",
|
||||
"\n",
|
||||
"To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question, \n",
|
||||
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c1070317-3ed9-4659-abe3-828943230e03",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import json\n",
|
||||
"from IPython.display import Markdown, display, update_display\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4a456906-915a-4bfd-bb9d-57e505c5093f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# constants\n",
|
||||
"\n",
|
||||
"MODEL_GPT = 'gpt-4o-mini'\n",
|
||||
"MODEL_LLAMA = 'llama3.2'\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "a8d7923c-5f28-4c30-8556-342d7c8497c1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"API key looks good so far\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# set up environment\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||
"if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n",
|
||||
" print(\"API key looks good so far\")\n",
|
||||
"else:\n",
|
||||
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "3f0d0137-52b0-47a8-81a8-11a90a010798",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# here is the question; type over this to ask something new\n",
|
||||
"system_prompt = \"You are a software engineering and data science expert and you have knowledge in all the areas of software engineering and latest technologies, trends. You should guide and help users with your technical solutions for all software engineering and data science related questions\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
"Please explain what this code does and why:\n",
|
||||
"yield from {book.get(\"author\") for book in books if book.get(\"author\")}\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "60ce7000-a4a5-4cce-a261-e75ef45063b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"This code snippet is utilizing a Python generator expression combined with the `yield from` statement to yield values from a set comprehension. Let's break it down:\n",
|
||||
"\n",
|
||||
"1. **Set Comprehension**:\n",
|
||||
" ```python\n",
|
||||
" {book.get(\"author\") for book in books if book.get(\"author\")}\n",
|
||||
" ```\n",
|
||||
" - This is a set comprehension that iterates over a collection called `books`.\n",
|
||||
" - For each `book`, it retrieves the value associated with the key `\"author\"` using the `get()` method.\n",
|
||||
" - The `if book.get(\"author\")` condition ensures that only books that have a valid (non-None or non-empty) author are included. This effectively filters out any books where the author is not present.\n",
|
||||
"\n",
|
||||
" As a result, this part creates a set of unique authors from the list of books. Since sets automatically discard duplicates, if multiple books have the same author, that author will only appear once in the resulting set.\n",
|
||||
"\n",
|
||||
"2. **Yielding Values**:\n",
|
||||
" ```python\n",
|
||||
" yield from\n",
|
||||
" ```\n",
|
||||
" - The `yield from` statement is used when you want to yield all values from an iterable. It allows a generator to yield all values from another generator or iterable.\n",
|
||||
" - In this context, it will yield each author from the set created by the comprehension.\n",
|
||||
"\n",
|
||||
"3. **Putting It All Together**:\n",
|
||||
" What this overall code does is:\n",
|
||||
" - It generates and yields unique authors from a collection of books, ensuring that each author is listed only once and only for books that actually specify an author.\n",
|
||||
"\n",
|
||||
"### Purpose:\n",
|
||||
"This code is useful in scenarios where you need to obtain a seemingly infinite generator of authors from a collection of books, processing each author one by one without creating a permanent list or set in memory, which can be beneficial for memory efficiency especially if you have a very large collection of books.\n",
|
||||
"\n",
|
||||
"### Example Usage:\n",
|
||||
"Here’s a basic example of how you might use this in a generator function:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"def get_unique_authors(books):\n",
|
||||
" yield from {book.get(\"author\") for book in books if book.get(\"author\")}\n",
|
||||
"\n",
|
||||
"# Example books list\n",
|
||||
"books = [\n",
|
||||
" {\"title\": \"Book 1\", \"author\": \"Author A\"},\n",
|
||||
" {\"title\": \"Book 2\", \"author\": \"Author B\"},\n",
|
||||
" {\"title\": \"Book 3\", \"author\": \"Author A\"},\n",
|
||||
" {\"title\": \"Book 4\", \"author\": None},\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for author in get_unique_authors(books):\n",
|
||||
" print(author)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"This would output:\n",
|
||||
"```\n",
|
||||
"Author A\n",
|
||||
"Author B\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"In this example, `Author A` only appears once, demonstrating the uniqueness provided by the set comprehension."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Get gpt-4o-mini to answer, with streaming\n",
|
||||
"response = openai.chat.completions.create(\n",
|
||||
" model=MODEL_GPT,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ],\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
"result = response.choices[0].message.content\n",
|
||||
"print(display(Markdown(result)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get Llama 3.2 to answer"
|
||||
]
|
||||
}
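The final cell above is only a placeholder comment; as a minimal sketch of how Llama 3.2 could answer the same question, assuming Ollama is running locally and exposing its OpenAI-compatible endpoint, and reusing `MODEL_LLAMA`, `system_prompt` and `user_prompt` from this notebook:

```python
# Get Llama 3.2 to answer via Ollama's OpenAI-compatible endpoint
ollama_via_openai = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
response = ollama_via_openai.chat.completions.create(
    model=MODEL_LLAMA,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
)
display(Markdown(response.choices[0].message.content))
```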
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llms",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
|
||||
class ScrapeWebsite:
|
||||
|
||||
def __init__(self, url, headers):
|
||||
""" Scraping Website which provides title and content"""
|
||||
self.url = url
|
||||
response = requests.get(self.url, headers=headers)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
self.title = soup.title.string if soup.title else "No title found"
|
||||
for irrelevant in soup.body(["script", "style", "img", "input"]):
|
||||
irrelevant.decompose()
|
||||
self.text = soup.body.get_text(separator="\n", strip=True)
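A short usage sketch for this module (the URL and User-Agent header are only illustrative values):

```python
if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    site = ScrapeWebsite("https://example.com", headers)
    print(site.title)          # page title, or "No title found"
    print(site.text[:500])     # first 500 characters of the cleaned body text
```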
|
||||
@@ -0,0 +1,186 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI \n",
|
||||
"from scrape_website import ScrapeWebsite"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29ddd15d-a3c5-4f4e-a678-873f56162724",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "42c8a8c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an analyst that analyses the content of the website \\\n",
|
||||
" provides summary and ignore text related to navigation. Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51e86dd1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; Please provide short summary in Markdown. Please include news and \\\n",
|
||||
" announcements\"\n",
|
||||
" user_prompt+=website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b69d7238",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\":\"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a56e99ea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9b4061d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarise(url):\n",
|
||||
" website = ScrapeWebsite(url, headers)\n",
|
||||
" ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
" response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages_for(website)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "65f96545",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarise(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23057e00-b6fc-4678-93a9-6b31cb704bff",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generative AI has numerous business applications across various industries. Here are some examples:\n",
|
||||
"\n",
|
||||
"1. **Marketing and Advertising**: Generative AI can create personalized product recommendations, generate targeted advertisements, and develop new marketing campaigns.\n",
|
||||
"2. **Content Creation**: AI-powered tools can assist in content creation, such as writing articles, generating social media posts, and creating videos, podcasts, and music.\n",
|
||||
"3. **Product Design and Development**: Generative AI can aid in designing products, such as 3D modeling, prototyping, and testing product feasibility.\n",
|
||||
"4. **Customer Service Chatbots**: AI-powered chatbots can provide personalized customer service, answering common queries, and helping resolve issues faster.\n",
|
||||
"5. **Language Translation**: Generative AI can translate languages in real-time, enabling businesses to communicate with global customers more effectively.\n",
|
||||
"6. **Data Analysis and Visualization**: AI can analyze large datasets, identify patterns, and create insights, making it easier for businesses to make informed decisions.\n",
|
||||
"7. **Cybersecurity Threat Detection**: Generative AI-powered systems can detect and respond to cyber threats more efficiently, reducing the risk of data breaches and attacks.\n",
|
||||
"8. **Supply Chain Optimization**: AI can optimize supply chain operations, predict demand, and identify opportunities for improvement, leading to increased efficiency and reduced costs.\n",
|
||||
"9. **Network Security**: Generative AI can analyze network traffic patterns, detect anomalies, and prevent cyber-attacks.\n",
|
||||
"10. **Finance and Banking**: AI-powered systems can detect financial fraud, predict customer creditworthiness, and generate credit reports.\n",
|
||||
"\n",
|
||||
"**Industry-specific applications:**\n",
|
||||
"\n",
|
||||
"1. **Healthcare**: AI can help with medical diagnosis, patient data analysis, and personalized medicine.\n",
|
||||
"2. **Manufacturing**: Generative AI can create optimized production schedules, predict equipment failures, and improve product quality.\n",
|
||||
"3. **Education**: AI-powered tools can develop personalized learning plans, automate grading, and provide educational resources.\n",
|
||||
"4. **Real Estate**: AI can help with property valuations, identify market trends, and analyze potential clients' needs.\n",
|
||||
"\n",
|
||||
"**Business benefits:**\n",
|
||||
"\n",
|
||||
"1. **Increased efficiency**: Automating mundane tasks frees up human resources for more strategic work.\n",
|
||||
"2. **Improved accuracy**: Generative AI reduces the likelihood of human error in decision-making and task execution.\n",
|
||||
"3. **Enhanced customer experience**: Personalized experiences are created through data-driven insights.\n",
|
||||
"4. **Competitive advantage**: Companies using AI can differentiate themselves from competitors by offering innovative services and products.\n",
|
||||
"\n",
|
||||
"As Generative AI continues to evolve, we can expect even more exciting applications across various industries, leading to increased efficiency, accuracy, and improved competitiveness for businesses worldwide.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://www.firstpost.com/world/united-states/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llms",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,265 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1e45263e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Web Data Extraction and Summarization using openAI Latest model gpt-5-mini"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df155151",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Import Libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "588f8e43",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI \n",
|
||||
"from scrape_website import ScrapeWebsite"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b5925769",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### load api key"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "6cca85ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "56703f80",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### ScrapWebsite using BeautifulSoup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3d60c909",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a8b73c27",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### System Prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "4a0c3bda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an analyst that analyses the content of the website \\\n",
|
||||
" provides summary and ignore text related to navigation. Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9117963b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### User Prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ab164d55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; Please provide short summary in Markdown. Please include news and \\\n",
|
||||
" announcements\"\n",
|
||||
" user_prompt+=website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "de7423fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Format messages in openAI standard"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "47c82247",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\":\"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6e9bb6e1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Summarise the content in website using openAI latest model gpt-5-mini"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "068d6bb2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarise(url):\n",
|
||||
" website = ScrapeWebsite(url, headers)\n",
|
||||
" openai = OpenAI()\n",
|
||||
" response = openai.chat.completions.create(model=\"gpt-5-mini\", messages=messages_for(website))\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7e6e9da6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Show summary as Markdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "cd86c2ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarise(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed5e50d2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "74a056b1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"# Summary — United States Of America | Firstpost (Live/Latest)\n",
|
||||
"\n",
|
||||
"Site focus: Live updates and rundowns of US and world news with emphasis on politics, justice, economy, national security, and breaking incidents. Coverage mixes headlines, investigations, opinion and special features/web stories.\n",
|
||||
"\n",
|
||||
"## Major news (headlines)\n",
|
||||
"- Police shooting near CDC/Emory in Atlanta: a suspected shooter and a police officer were killed after reports of an active shooter near the CDC and Emory University campuses. \n",
|
||||
"- Death of astronaut Jim Lovell (97): Apollo 13 commander and former Navy pilot died in a Chicago suburb. \n",
|
||||
"- Stephen Miran named to Fed Board (short-term): Trump appointed economist Stephen Miran to the Federal Reserve Board through Jan 2026; noted for support of tariffs and rate cuts. \n",
|
||||
"- Trump fires labour statistics chief: President Trump sacked the official overseeing labor data hours after a weak jobs report. \n",
|
||||
"- House panel subpoenas Clintons over Epstein: congressional subpoenas seek documents in relation to Jeffrey Epstein amid pressure on the administration over Epstein files. \n",
|
||||
"- Ghislaine Maxwell moved to lower-security prison in Texas amid scrutiny of Epstein files and government handling. \n",
|
||||
"- FBI/administration tension on Epstein Files: Trump said he would “release everything” after reports the FBI redacted names from the Epstein Files. \n",
|
||||
"- Probe launched into attorney who investigated Trump cases: US officials began a probe targeting Special Counsel Jack Smith. \n",
|
||||
"- NTSB finds technical issues in Army helicopter crash: investigation into crash that killed 67 people identified technical problems. \n",
|
||||
"- Trump unveils modified reciprocal tariffs: new executive order introduced modified tariffs on multiple countries; effective date possibly as late as Oct 5. \n",
|
||||
"- Trump-EU trade deal announced: reported pact imposing a 15% tariff on most EU goods, with large energy and investment components but unresolved issues remain. \n",
|
||||
"- Federal Reserve holds rates steady: Fed kept rates unchanged for a fifth meeting, despite political pressure from Trump. \n",
|
||||
"- White House remodel plan: Trump pushing to build a reported $200 million ballroom at the presidential residence, funded by Trump/donors per WH. \n",
|
||||
"- US citizenship test format under review: Trump administration considers reverting to the 2020 naturalisation test format, citing concerns the current test is too easy. \n",
|
||||
"- American Airlines incident in Denver: passengers evacuated after a Boeing plane caught fire (tire/maintenance issue) before takeoff. \n",
|
||||
"- John Bolton criticizes Tulsi Gabbard: former NSA lambastes Gabbard’s report on Obama as exaggerated and lacking substance. \n",
|
||||
"- Ohio solicitor general Mathura Sridharan trolled: Indian-origin jurist faced racist online backlash after appointment; Ohio AG responded strongly.\n",
|
||||
"\n",
|
||||
"## Announcements, features & recurring elements\n",
|
||||
"- Web stories and quick-read lists: travel/animals/safety themed pieces (e.g., “10 airport codes”, “10 animals that are naturally blue”, World Tiger Day lists). \n",
|
||||
"- Regular sections and shows highlighted in coverage: Firstpost America, Firstpost Africa, First Sports, Vantage, Fast and Factual, Between The Lines, Flashback, Live TV. \n",
|
||||
"- Events and special coverage teased: Raisina Dialogue, Champions Trophy, Delhi Elections 2025, Budget 2025, US Elections 2024, Firstpost Defence Summit. \n",
|
||||
"- Trending topics emphasized: Donald Trump, Narendra Modi, Elon Musk, United States, Joe Biden. \n",
|
||||
"- Quick-links / network: cross-promotion of other Network18 properties (News18, Moneycontrol, CNBC TV18, Forbes India).\n",
|
||||
"\n",
|
||||
"## Tone and emphasis\n",
|
||||
"- Heavy focus on US politics, Trump administration actions and controversies (Epstein Files, tariffs, personnel changes), justice probes, national security incidents, and major breaking events.\n",
|
||||
"- Mix of investigative/legal reporting, immediate breaking news, and light/web-story listicles.\n",
|
||||
"\n",
|
||||
"If you want, I can produce a one-page brief of just the Trump-related items, a timeline of the Epstein/Clinton/Subpoena coverage, or extract all headlines with publication order."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_summary(\"https://www.firstpost.com/world/united-states/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llms",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
week1/community-contributions/youtube_video_summarize.ipynb (new file, 216 lines)
@@ -0,0 +1,216 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"id": "8ca2e60d-17c0-40fc-91c6-c16915b39c06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re, html, json\n",
|
||||
"import requests\n",
|
||||
"from urllib.error import HTTPError\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled, VideoUnavailable\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\"\n",
|
||||
"api_key='ollama'\n",
|
||||
"\n",
|
||||
"def yt_title_desc_transcript(url: str, lang=\"en\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Returns {\"title\": str|None, \"description\": str|None, \"transcript\": str|None}.\n",
|
||||
" - Title via oEmbed (no API key).\n",
|
||||
" - Description scraped from the watch page (shortDescription).\n",
|
||||
" - Transcript via youtube-transcript-api, gracefully handling 400/disabled.\n",
|
||||
" \"\"\"\n",
|
||||
" # --- extract 11-char video id ---\n",
|
||||
" m = re.search(r\"(?:v=|/)([0-9A-Za-z_-]{11})|^([0-9A-Za-z_-]{11})$\", url)\n",
|
||||
" vid = (m.group(1) or m.group(2)) if m else None\n",
|
||||
" if not vid:\n",
|
||||
" return {\"title\": None, \"description\": None, \"transcript\": None}\n",
|
||||
"\n",
|
||||
" # --- title via oEmbed (very robust) ---\n",
|
||||
" title = None\n",
|
||||
" try:\n",
|
||||
" r = requests.get(\"https://www.youtube.com/oembed\",\n",
|
||||
" params={\"url\": f\"https://www.youtube.com/watch?v={vid}\", \"format\": \"json\"},\n",
|
||||
" timeout=10)\n",
|
||||
" if r.ok:\n",
|
||||
" title = r.json().get(\"title\")\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" # --- description from watch page (shortDescription in initial JSON) ---\n",
|
||||
" description = None\n",
|
||||
" try:\n",
|
||||
" page = requests.get(f\"https://www.youtube.com/watch?v={vid}\", timeout=10).text\n",
|
||||
" # Look for ytInitialPlayerResponse JSON\n",
|
||||
" jmatch = re.search(r\"ytInitialPlayerResponse\\s*=\\s*({.*?});\", page, re.DOTALL)\n",
|
||||
" if jmatch:\n",
|
||||
" data = json.loads(jmatch.group(1))\n",
|
||||
" desc = data.get(\"videoDetails\", {}).get(\"shortDescription\")\n",
|
||||
" if desc:\n",
|
||||
" description = html.unescape(desc)\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" # --- transcript (handle 400 cleanly) ---\n",
|
||||
" transcript_text = None\n",
|
||||
" try:\n",
|
||||
" items = YouTubeTranscriptApi.get_transcript(vid, languages=[lang])\n",
|
||||
" transcript_text = \" \".join(ch[\"text\"].strip() for ch in items if ch.get(\"text\"))\n",
|
||||
" except (NoTranscriptFound, TranscriptsDisabled, VideoUnavailable, HTTPError):\n",
|
||||
" # HTTPError covers the \"HTTP Error 400: Bad Request\" case\n",
|
||||
" transcript_text = None\n",
|
||||
" except Exception:\n",
|
||||
" transcript_text = None\n",
|
||||
"\n",
|
||||
" return {\"title\": title, \"description\": description, \"transcript\": transcript_text}\n"
|
||||
]
|
||||
},
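A quick sketch of inspecting what `yt_title_desc_transcript` returns before handing it to the summarizer (the video URL is only an example):

```python
# Fetch metadata for a video and check what came back
meta = yt_title_desc_transcript("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
print(meta["title"])
print((meta["description"] or "")[:200])
print("transcript available:", meta["transcript"] is not None)
```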
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "ad9be496-4e91-4562-90f3-54d11208da55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"system_prompt = '''\n",
|
||||
"You are an assistant that generates detailed yet concise summaries of YouTube videos.\n",
|
||||
"When the user provides a title and description of a YouTube video, your task is to write a coherent, engaging, and informative summary of around 500 words.\n",
|
||||
"The summary should:\n",
|
||||
"\n",
|
||||
"Capture the main themes and key points the video likely covers.\n",
|
||||
"\n",
|
||||
"Expand on the description logically, providing context and flow.\n",
|
||||
"\n",
|
||||
"Stay neutral, factual, and clear (no personal opinions).\n",
|
||||
"\n",
|
||||
"Be self-contained so it makes sense without needing to watch the video.\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "dd4be0bc-df1f-47e0-9e03-9b734117f80a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt(title, description):\n",
|
||||
" prompt = '''Provide me the YouTube video title and description.\\n\n",
|
||||
" I will generate a clear, engaging, and concise summary of the video content in around 500 words,\\n\n",
|
||||
" highlighting the main ideas, key points, and important details.\\n'''\n",
|
||||
" prompt += f'here is the title : {title} \\n Description : {description} '\n",
|
||||
" return prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "46896ad3-db1e-448a-8a03-036b9568c69f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_youtube(yt_url):\n",
|
||||
" ollama = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
" video_metadata = yt_title_desc_transcript(yt_url)\n",
|
||||
" stream = ollama.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages = [\n",
|
||||
" {\"role\":\"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\": user_prompt(video_metadata['title'], video_metadata['description'])}\n",
|
||||
" ],\n",
|
||||
" stream=True\n",
|
||||
" \n",
|
||||
" )\n",
|
||||
" response = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" response += chunk.choices[0].delta.content or ''\n",
|
||||
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(response), display_id=display_handle.display_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "b59f8773-c13e-4050-ad3c-b578d07ef5e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Here is a summary of the YouTube video:\n",
|
||||
"\n",
|
||||
"**Monta Re: A Baul-Inspired Tribute to the Mystic Guru Shankaracharya**\n",
|
||||
"\n",
|
||||
"The music video for \"Monta Re\" by Amit Trivedi, featuring Swanand Kirkire and Amitabh Bhattacharya, is a soulful tribute to the mystic guru Shankaracharya. Set in the Bengali folk music tradition, this song brings to life the ancient tales of Shankaracharya's spiritual journey.\n",
|
||||
"\n",
|
||||
"With elegant lyrics penned by Amitabh Bhattacharya, \"Monta Re\" transports listeners to the banks of the Ganges River, where Shankaracharya wandered in search of wisdom and inner peace. The song's haunting melodies and emotive vocals evoke a sense of longing and introspection, perfectly capturing the mystic guru's spiritual essence.\n",
|
||||
"\n",
|
||||
"The music video beautifully illustrates the baul-inspired style, with intricate traditional dance movements performed by a group of energetic dancers. The choreography seamlessly blends elements of Bengal's folk heritage with modern sensibilities, making the song an engaging watch for audience members interested in Indian classical music.\n",
|
||||
"\n",
|
||||
"**Music and Lyric Credit:**\n",
|
||||
"Amit Trivedi handles the music composition, ensuring that the melody complements the song's themes without overpowering them. Amitabh Bhattacharya takes credit for the lyrics, which tell stunning stories of Shankaracharya's spiritual adventures. The song features Swanand Kirkire and Amitabh Bhattacharya as vocalists, further enriching its emotional impact.\n",
|
||||
"\n",
|
||||
"**Relevance to Bengali Culture:**\n",
|
||||
"\"Monta Re\" is a heartwarming tribute to Bengal's rich cultural heritage. Inspired by the baul traditions of the region, this song honors Shankaracharya's life and spiritual journey without diminishing his significance in modern times. By showcasing these folk roots, \"Monta Re\" provides fans with an enriching sensory experience.\n",
|
||||
"\n",
|
||||
"You can listen to \"Monta Re\" along with other T-Series music videos released by Amit Trivedi at the links provided below:\n",
|
||||
"\n",
|
||||
"- Watch \"Ankahee\"\n",
|
||||
"- Check out \"Sawaar Loon\"\n",
|
||||
"- Explore \"Zinda Hoon\"\n",
|
||||
"\n",
|
||||
"Follow the official T-SERIES YouTube channel for an ever-growing variety of original music tracks!\n",
|
||||
"\n",
|
||||
"By embracing the richness of Bengali folk traditions, \"Monta Re\" embodies a musical reflection of Shankaracharya's extraordinary journey as both spiritual guide and symbol of timeless wisdom."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stream_youtube('https://youtu.be/99NUJ1cLbBI?list=RDdJ6_aU6auZc')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "649287ca-aff8-4b59-91b7-731c007e83a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||