Bootcamp: Solisoma(fix:edit only community_contributiions folder)

This commit is contained in:
unknown
2025-10-20 12:55:19 +01:00
parent 3346c67ba5
commit 1fe9ce7378
2 changed files with 1204 additions and 0 deletions

View File

@@ -0,0 +1,488 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "6df489a5",
"metadata": {},
"outputs": [],
"source": [
"# week1 -> day1\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"\n",
"#week2 -> day2\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8e7fbf42",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key:str = os.getenv('OPENAI_API_KEY')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b9266d13",
"metadata": {},
"outputs": [],
"source": [
"class SolveTechnicalQuestions:\n",
" _system_prompt = \"\"\"\n",
" You are a snarkyassistant that analyzes the contents of a website, \n",
" and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n",
" Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n",
" \"\"\"\n",
"\n",
" def __init__(self, model: str = \"gpt-4o-mini\") -> None:\n",
" self.openai_client = OpenAI()\n",
" self._MODEL = model\n",
"\n",
" def get_user_technical_question_prompt(self, question:str):\n",
" prompt = f\"\"\"\n",
" Answer this technical questio comprehensively:\n",
" Provide:\n",
" 1. A clear, accurate answer\n",
" 2. Code examples if relevant\n",
" 3. Best practices and recommendations\n",
" 4. Potential pitfalls or considerations\n",
" 5. Additional resources or references if helpful\n",
"\n",
" Format your response in a structured, easy-to-read manner.\n",
"\n",
" Question {question}\n",
" \"\"\"\n",
"\n",
" return prompt\n",
" \n",
" def set_system_prompt(self, system_prompt: str) -> None:\n",
" self._system_prompt = system_prompt\n",
" \n",
" def set_endpoint(self, endpoint: str, api_key: str = \"ollama\") -> None:\n",
" self.openai_client = OpenAI(base_url=endpoint, api_key=api_key)\n",
"\n",
" def set_model(self, model: str) -> None:\n",
" self._MODEL = model\n",
"\n",
" def start(self, stream=False):\n",
" try:\n",
" while True:\n",
" question = input(\">>> \")\n",
" \n",
" if question.strip().lower() in ['quit', 'exit', 'q']:\n",
" print(\"Goodbye!\")\n",
" break\n",
" \n",
" if not question.strip():\n",
" print(\"Please enter a question.\")\n",
" continue\n",
" \n",
" message = self.get_user_technical_question_prompt(question.strip())\n",
" \n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL, \n",
" messages=[\n",
" {\"role\": \"system\", \"content\": self._system_prompt},\n",
" {\"role\": \"user\", \"content\": message},\n",
" ],\n",
" stream=stream\n",
" )\n",
"\n",
" if stream:\n",
" full_response = \"\"\n",
" display_handle = display(Markdown(full_response), display_id=True)\n",
" for chunk in response:\n",
" if chunk.choices[0].delta.content:\n",
" full_response += chunk.choices[0].delta.content\n",
" update_display(Markdown(full_response), display_id=display_handle.display_id)\n",
" full_response += \"\\n\"\n",
" update_display(Markdown(full_response), display_id=display_handle.display_id)\n",
" else:\n",
" full_response = response.choices[0].message.content\n",
" display(Markdown(full_response))\n",
" \n",
" except KeyboardInterrupt:\n",
" print(\"\\nGoodbye!\")\n",
" except Exception as e:\n",
" print(f\"Error: {e}\")\n",
"\n",
" def start_with_gradio(self, question:str, stream=False):\n",
" if not question.strip():\n",
" return \"Please enter a question.\"\n",
" \n",
" message = self.get_user_technical_question_prompt(question.strip())\n",
" \n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL, \n",
" messages=[\n",
" {\"role\": \"system\", \"content\": self._system_prompt},\n",
" {\"role\": \"user\", \"content\": message},\n",
" ],\n",
" stream=stream\n",
" )\n",
"\n",
" if stream:\n",
" full_response = \"\"\n",
" for chunk in response:\n",
" if chunk.choices[0].delta.content:\n",
" full_response += chunk.choices[0].delta.content\n",
" yield full_response\n",
" full_response += \"\\n\"\n",
" yield full_response\n",
" else:\n",
" yield response.choices[0].message.content\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0bddb2e5",
"metadata": {},
"outputs": [],
"source": [
"TECHNICAL_SYSTEM_PROMPT = \"\"\"\n",
"You are an expert technical assistant with deep knowledge in:\n",
"\n",
"PROGRAMMING & DEVELOPMENT:\n",
"- Python, JavaScript, Java, C++, Go, Rust, TypeScript\n",
"- Web development (React, Vue, Angular, Node.js)\n",
"- Mobile development (iOS, Android, Flutter)\n",
"- DevOps (Docker, Kubernetes, CI/CD, AWS, Azure, GCP)\n",
"- Database systems (SQL, NoSQL, PostgreSQL, MongoDB)\n",
"- Software architecture patterns and best practices\n",
"\n",
"SYSTEMS & INFRASTRUCTURE:\n",
"- Operating systems (Linux, Windows, macOS)\n",
"- Networking protocols and security\n",
"- Cloud computing and distributed systems\n",
"- Monitoring, logging, and observability\n",
"- Performance optimization and scaling\n",
"\n",
"AI & MACHINE LEARNING:\n",
"- Machine learning algorithms and frameworks\n",
"- Deep learning (TensorFlow, PyTorch)\n",
"- Natural language processing\n",
"- Computer vision and image processing\n",
"- MLOps and model deployment\n",
"\n",
"RESPONSE GUIDELINES:\n",
"1. Provide accurate, up-to-date technical information\n",
"2. Include code examples when relevant\n",
"3. Explain complex concepts clearly\n",
"4. Suggest best practices and alternatives\n",
"5. Warn about potential pitfalls or security issues\n",
"6. Reference official documentation when appropriate\n",
"\n",
"Always prioritize accuracy and practical applicability in your technical responses.\n",
"\"\"\"\n",
"\n",
"Chat = SolveTechnicalQuestions()\n",
"Chat.set_system_prompt(TECHNICAL_SYSTEM_PROMPT)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8675757",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"## Understanding the `for` Loop\n",
"\n",
"### 1. Clear and Accurate Answer\n",
"\n",
"A `for` loop is a control flow statement used in programming to iterate over a sequence (like a list, tuple, string, or range) or perform a task a specific number of times. It allows you to execute a block of code repeatedly, which is crucial for automating repetitive tasks.\n",
"\n",
"### 2. Code Examples\n",
"\n",
"#### Python Example\n",
"\n",
"```python\n",
"# Iterate over a list\n",
"numbers = [1, 2, 3, 4, 5]\n",
"for number in numbers:\n",
" print(number)\n",
"```\n",
"\n",
"#### JavaScript Example\n",
"\n",
"```javascript\n",
"// Iterate over an array\n",
"const numbers = [1, 2, 3, 4, 5];\n",
"for (let number of numbers) {\n",
" console.log(number);\n",
"}\n",
"```\n",
"\n",
"#### Java Example\n",
"\n",
"```java\n",
"// Iterate over an array\n",
"int[] numbers = {1, 2, 3, 4, 5};\n",
"for (int number : numbers) {\n",
" System.out.println(number);\n",
"}\n",
"```\n",
"\n",
"### 3. Best Practices and Recommendations\n",
"\n",
"- **Use Descriptive Variable Names:** This improves code readability. Avoid vague names like `i` or `j`, unless they are commonly used as loop counters.\n",
" \n",
"- **Limit Loop Complexity:** Ensure that the logic inside the loop is straightforward. If the loop gets complicated, consider refactoring or extracting the logic into a separate function.\n",
"\n",
"- **Control Iteration with Care:** If you're iterating through large datasets, be mindful of performance impacts and consider alternatives (like list comprehensions in Python).\n",
"\n",
"### 4. Potential Pitfalls or Considerations\n",
"\n",
"- **Off-by-One Errors:** These are common when dealing with loop boundaries. Always double-check loop conditions to ensure you dont miss elements or go out of range.\n",
"\n",
"- **Infinite Loops:** Ensure that your loop has a condition that eventually becomes false, or it could result in an infinite loop, causing your program to hang.\n",
"\n",
"- **Modifying the Loop Variable:** Changing the loop variable within the loops body can lead to unexpected behaviors, especially in languages like Python.\n",
"\n",
"### 5. Additional Resources or References\n",
"\n",
"- [Python for Loop Documentation](https://docs.python.org/3/reference/compound_stmts.html#for)\n",
"- [JavaScript for Loop Documentation](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for)\n",
"- [Java for Loop Documentation](https://docs.oracle.com/javase/tutorial/java/nutsandbolts/ch04.html#for)\n",
"\n",
"These resources provide in-depth explanations and examples for different programming languages, and can be useful for further learning about `for` loops and loops in general.\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Goodbye!\n"
]
}
],
"source": [
"# Set stream to true to allow streaming of the response\n",
"# It Mimics REPL\n",
"# After running look up to see a terminal where you put in your question\n",
"Chat.start(stream=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7a086b95",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### What is an `if` Statement?\n",
"\n",
"An `if` statement is a fundamental control flow statement in programming that allows you to execute a block of code based on a specified condition. If the condition evaluates to `true`, the block of code will execute; otherwise, it will be skipped.\n",
"\n",
"#### 1. A Clear, Accurate Answer\n",
"\n",
"In programming, the `if` statement checks a condition. If the condition is `true`, the code inside the `if` block is executed. If the condition is `false`, the block is ignored. \n",
"\n",
"Heres the basic syntax in Python and JavaScript as examples:\n",
"\n",
"**Python Syntax:**\n",
"```python\n",
"if condition:\n",
" # Code to execute if condition is true\n",
"```\n",
"\n",
"**JavaScript Syntax:**\n",
"```javascript\n",
"if (condition) {\n",
" // Code to execute if condition is true\n",
"}\n",
"```\n",
"\n",
"#### 2. Code Examples\n",
"\n",
"**Python Example:**\n",
"```python\n",
"temperature = 30\n",
"\n",
"if temperature > 25:\n",
" print(\"It's a hot day!\")\n",
"```\n",
"\n",
"**JavaScript Example:**\n",
"```javascript\n",
"let temperature = 30;\n",
"\n",
"if (temperature > 25) {\n",
" console.log(\"It's a hot day!\");\n",
"}\n",
"```\n",
"\n",
"In both examples, if the `temperature` variable is greater than 25, the corresponding message will be printed to the console.\n",
"\n",
"#### 3. Best Practices and Recommendations\n",
"\n",
"- **Use Clear Conditions**: Ensure that the condition being evaluated is clear and understandable. \n",
"- **Avoid Complex Conditions**: If conditions become too complex, consider breaking them down into multiple `if` statements or using logical operators for clarity.\n",
"- **Indentation**: Properly indent your code blocks. This improves readability and maintainability.\n",
"- **Use `elif`/`else if` for Multiple Conditions**: When evaluating multiple conditions, use `elif` (Python) or `else if` (JavaScript) to make the logic cleaner.\n",
" \n",
" **Example with `elif`:**\n",
" ```python\n",
" score = 85\n",
"\n",
" if score >= 90:\n",
" print(\"Grade: A\")\n",
" elif score >= 80:\n",
" print(\"Grade: B\")\n",
" else:\n",
" print(\"Grade: C\")\n",
" ```\n",
"\n",
"#### 4. Potential Pitfalls or Considerations\n",
"\n",
"- **Boolean Context**: Ensure that the condition evaluates to a boolean (`true` or `false`). Improper conditions could result in unexpected behavior.\n",
"- **Missing `else` or `elif`**: If not handled correctly, cases that fall outside the specified conditions may go unnoticed. Consider using an `else` statement to capture any situations not defined in prior conditions.\n",
"- **Short-Circuit Evaluation**: In languages like Python and JavaScript, using logical operators (`and`, `or`) as conditions can lead to short-circuit evaluation, which might affect the execution of your code. Be cautious about using these in conditions.\n",
"\n",
"#### 5. Additional Resources or References\n",
"\n",
"- **Python Documentation on `if` Statements**: [Python If Statement](https://docs.python.org/3/tutorial/controlflow.html#if-statements)\n",
"- **JavaScript Documentation on Conditional Statements**: [MDN Web Docs - Conditionals](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Control_flow_and_error_handling#conditional_statements)\n",
"\n",
"Understanding how `if` statements work is crucial for implementing decision-making logic in your programs, enabling dynamic behavior based on varying conditions."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Goodbye!\n"
]
}
],
"source": [
"# Set stream to false to get a single response\n",
"Chat.start(stream=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95daf1f1",
"metadata": {},
"outputs": [],
"source": [
"# Ignore if you don't want to use ollama\n",
"# Here shows the ability to switch from one endpoint to another\n",
"Chat.set_endpoint(\"http://localhost:11434/v1\")\n",
"Chat.set_model(\"llama3.2\")\n",
"\n",
"Chat.start(stream=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c7d66ef7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7861\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gr_output = gr.Markdown(label=\"Response\")\n",
"stream_input = gr.Checkbox(label='Stream', value=False)\n",
"question_input = gr.Textbox(label=\"Question\", info=\"Ask it any technical question\", lines=1)\n",
"\n",
"interface = gr.Interface(\n",
" fn=Chat.start_with_gradio, \n",
" title=\"ChatGPT\", \n",
" inputs=[question_input, stream_input], \n",
" outputs=[gr_output], \n",
" flagging_mode=\"never\"\n",
")\n",
"\n",
"interface.launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed776b93",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,716 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "7dbe3347",
"metadata": {},
"source": [
"# 🚀 Advanced Web Scraping & AI Assistant - Week 1 Complete Exercise\n",
"\n",
"## 📋 **Notebook Overview**\n",
"\n",
"This notebook demonstrates the **complete evolution** of a web scraping solution through **Week 1** of the LLM Engineering course.\n",
"\n",
"### **Exercise Progression:**\n",
"- **Cells 1-7**: Week 1 Day 1 (basic scraping + AI)\n",
"- **Cell 8**: Week 1 Day 2 (Ollama integration) \n",
"- **Cells 9-13**: Week 1 Day 5 (advanced features + brochure generation)\n",
"\n",
"### **Key Learning Progression:**\n",
"1. **Day 1**: JavaScript scraping problem → Selenium solution\n",
"2. **Day 2**: Remote ↔ Local AI flexibility (OpenAI ↔ Ollama)\n",
"3. **Day 5**: Multi-page intelligence + business automation\n",
"\n",
"### **Technical Skills:**\n",
"- Selenium WebDriver, OpenAI API, Ollama, JSON processing, Class inheritance, Streaming responses\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9addd8d1",
"metadata": {},
"outputs": [],
"source": [
"# week1 -> day1\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium import webdriver\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"\n",
"# week1 -> day5\n",
"import json\n",
"from typing import Dict, List\n",
"\n",
"#week2 -> day2\n",
"import gradio as gr"
]
},
{
"cell_type": "markdown",
"id": "85bf7734",
"metadata": {},
"source": [
"## 📦 **Dependencies**\n",
"\n",
"**Week 1 Day 1**: Core scraping + AI integration\n",
"**Week 1 Day 5**: Added JSON processing + type hints\n"
]
},
{
"cell_type": "markdown",
"id": "f881e916",
"metadata": {},
"source": [
"## **Environment Setup**\n",
"\n",
"This cell loads the OpenAI API key from the `.env` file. The `override=True` parameter ensures that any existing environment variables are replaced with values from the `.env` file.\n",
"\n",
"**Important**: Make sure you have a `.env` file in your project root with:\n",
"```\n",
"OPENAI_API_KEY=your-actual-api-key-here\n",
"```\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7123ba55",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"api_key:str = os.getenv('OPENAI_API_KEY')"
]
},
{
"cell_type": "markdown",
"id": "ab17f1a7",
"metadata": {},
"source": [
"## 🏗️ **WebpageSummarizer Class**\n",
"\n",
"**Day 1**: Basic scraping + AI integration\n",
"**Day 2**: Remote ↔ Local flexibility (`set_endpoint`, `set_model`)\n",
"**Day 5**: Multi-page intelligence + brochure generation\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5e9cdf71",
"metadata": {},
"outputs": [],
"source": [
"class WebpageSummarizer:\n",
" # week1 -> day1\n",
" _system_prompt = \"\"\"\n",
" You are a snarkyassistant that analyzes the contents of a website, \n",
" and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n",
" Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n",
" \"\"\"\n",
" \n",
" # week1 -> day1\n",
" _MODEL = \"gpt-4o-mini\"\n",
"\n",
" # week1 -> day1\n",
" def __init__(self, model: str = _MODEL) -> None:\n",
" self.openai_client = OpenAI()\n",
" self.driver = webdriver.Chrome()\n",
" self._MODEL = model\n",
" \n",
" # week1 -> day1\n",
" def scrape_website(self, url: str) -> str:\n",
" self.driver.get(url)\n",
" self.driver.implicitly_wait(10)\n",
" title = self.driver.title\n",
" text_content = self.driver.find_element(By.TAG_NAME, \"body\").text\n",
" return title + \"\\n\\n\" + text_content\n",
"\n",
" # week1 -> day1\n",
" def summarize_text(self, url: str) -> str:\n",
" text = self.scrape_website(url)\n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": self._system_prompt},\n",
" {\"role\": \"user\", \"content\": text}\n",
" ]\n",
" )\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
" # week1 -> day1\n",
" def display_summary(self, url: str)-> None:\n",
" summary:str = self.summarize_text(url)\n",
" display(Markdown(summary))\n",
"\n",
" # week1 -> day2\n",
" def set_endpoint(self, endpoint: str, api_key: str = \"ollama\") -> None:\n",
" self.openai_client = OpenAI(base_url=endpoint, api_key=api_key)\n",
"\n",
" # week1 -> day2\n",
" def set_model(self, model: str) -> None:\n",
" self._MODEL = model\n",
"\n",
" # week1 -> day5\n",
" def set_system_prompt(self, system_prompt: str) -> None:\n",
" self._system_prompt = system_prompt\n",
"\n",
" # week1 -> day5\n",
" def scrape_website_links(self, url: str) -> list[str]:\n",
" self.driver.get(url)\n",
" self.driver.implicitly_wait(10)\n",
" \n",
" links = self.driver.find_elements(By.TAG_NAME, \"a\")\n",
" return [link.get_attribute(\"href\") for link in links \n",
" if link.get_attribute(\"href\") and link.get_attribute(\"href\").strip()]\n",
"\n",
" # week1 -> day5\n",
" def generate_user_prompt_to_select_relevant_links(self, url: str) -> str:\n",
" user_prompt = f\"\"\"\n",
" Here is the list of links on the website {url} -\n",
" Please decide which of these are relevant web links for a brochure about the company, \n",
" respond with the full https URL in JSON format.\n",
" Do not include Terms of Service, Privacy, email links.\n",
"\n",
" Links (some might be relative links):\n",
" \"\"\"\n",
" links = self.scrape_website_links(url)\n",
" user_prompt += \"\\n\".join(links)\n",
" return user_prompt\n",
"\n",
" # week1 -> day5\n",
" def select_relevant_links(self, url:str) -> Dict[str, List[Dict[str, str]]]:\n",
" message = self.generate_user_prompt_to_select_relevant_links(url)\n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": self._system_prompt},\n",
" {\"role\": \"user\", \"content\": message}\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
"\n",
" json_response = json.loads(response.choices[0].message.content)\n",
"\n",
" return json_response\n",
"\n",
" # week1 -> day5\n",
" def fetch_page_and_all_relevant_links(self, url):\n",
" contents = self.scrape_website(url)\n",
" relevant_links = self.select_relevant_links(url)\n",
" result = f\"## Landing Page:\\n\\n{contents}\\n## Relevant Links:\\n\"\n",
" for link in relevant_links[\"links\"]:\n",
" result += f\"\\n\\n### Link: {link[\"type\"]}\\n\"\n",
" result += self.scrape_website(link[\"url\"])\n",
" return result\n",
" \n",
" def get_user_prompt_for_brochure(self, company_name:str, url:str) -> str:\n",
" user_prompt = f\"\"\"\n",
" You are looking at a company called: {company_name}\n",
" Here are the contents of its landing page and other relevant pages;\n",
" use this information to build a short brochure of the company in markdown without code blocks.\\n\\n\n",
" \"\"\"\n",
" user_prompt += self.fetch_page_and_all_relevant_links(url)\n",
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
" return user_prompt\n",
"\n",
" # week1 -> day5\n",
" def generate_brochure(self, company_name:str, url:str, link_prompt: str, brochure_prompt: str, stream: bool = False) -> None:\n",
" self.set_system_prompt(link_prompt)\n",
" contents = self.get_user_prompt_for_brochure(company_name,url)\n",
" self.set_system_prompt(brochure_prompt)\n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL,\n",
" messages=[{\"role\": \"system\", \"content\": self._system_prompt}, {\"role\": \"user\", \"content\": contents}],\n",
" stream=stream # for streaming response\n",
" )\n",
"\n",
" if stream:\n",
" full_response = \"\"\n",
" display_handle = display(Markdown(full_response), display_id=True)\n",
" for chunk in response:\n",
" full_response += chunk.choices[0].delta.content or \"\"\n",
" update_display(Markdown(full_response), display_id=display_handle.display_id)\n",
" else:\n",
" result = response.choices[0].message.content\n",
" display(Markdown(result))\n",
"\n",
" # week2 -> day2\n",
" def generate_brochure_with_gradio(self, company_name:str, url:str, link_prompt: str, brochure_prompt: str, stream: bool = False):\n",
" self.set_system_prompt(link_prompt)\n",
" contents = self.get_user_prompt_for_brochure(company_name,url)\n",
" self.set_system_prompt(brochure_prompt)\n",
" response = self.openai_client.chat.completions.create(\n",
" model=self._MODEL,\n",
" messages=[{\"role\": \"system\", \"content\": self._system_prompt}, {\"role\": \"user\", \"content\": contents}],\n",
" stream=stream # for streaming response\n",
" )\n",
"\n",
" if stream:\n",
" full_response = \"\"\n",
" for chunk in response:\n",
" full_response += chunk.choices[0].delta.content or \"\"\n",
" yield full_response\n",
" else:\n",
" result = response.choices[0].message.content\n",
" yield result\n",
" "
]
},
{
"cell_type": "markdown",
"id": "cc085a2b",
"metadata": {},
"source": [
"## Demo: LinkedIn Summary\n",
"\n",
"This cell demonstrates the WebpageSummarizer in action by:\n",
"\n",
"1. **Creating an instance** with the GPT-5-nano model\n",
"2. **Scraping LinkedIn's homepage** - a JavaScript-heavy site that traditional scraping can't handle\n",
"3. **Generating a snarky summary** that captures the essence of LinkedIn's professional networking platform\n",
"\n",
"### What Happens:\n",
"- Selenium opens Chrome browser (visible window)\n",
"- Navigates to LinkedIn.com\n",
"- Waits for JavaScript to render all content\n",
"- Extracts all visible text from the page\n",
"- Sends content to OpenAI for summarization\n",
"- Displays the humorous, sarcastic summary in markdown format\n",
"\n",
"### Expected Output:\n",
"A witty, entertaining summary that captures LinkedIn's key features and business model with a humorous tone.\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cfe93bea",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)\n"
]
},
{
"data": {
"text/markdown": [
"LinkedIns homepage in a nutshell: a corporate buffet of jobs, courses, tools, and guilt-inducing “Open to Work” vibes, wrapped in a lot of navigation clutter.\n",
"\n",
"- Top Content: Curated posts and expert insights by topic (Career, Productivity, Finance, Soft Skills, Project Management, etc.). Yes, because your feed needed more buzzwords.\n",
"- Jobs: Find the right job or internship across a big menu of roles (Engineering, Marketing, IT, HR, Admin, Retail, etc.). Tempting you with endless openings.\n",
"- Post your job: Post a job for millions to see. Because nothing says “were hiring” like a public billboard.\n",
"- Software tools: Discover the best software—CRM, HRMS, Project Management, Help Desk, etc.—as if you were deciding which inbox to dread today.\n",
"- Games: Keep your mind sharp with daily games (Pinpoint, Queens, Crossclimb, Tango, Zip, Mini Sudoku). Productivity through micro-snacks!\n",
"- Open To Work: Privately tell recruiters or publicly broadcast youre looking for opportunities. Subtle as a neon sign.\n",
"- Connect and Learn: Find people you know, learn new skills, and choose topics to study. Professional life, now with more onboarding prompts.\n",
"- Who is LinkedIn for?: Anyone navigating professional life—because apparently thats everyone.\n",
"- Bottom line: Its a hub of professional action—job hunting, learning, toolshopping, and the occasional brain teaser to distract you from the grim reality of deadlines."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# week1 -> day1\n",
"Summarizer = WebpageSummarizer(\"gpt-5-nano\")\n",
"\n",
"Summarizer.display_summary(\"https://www.linkedin.com\")\n"
]
},
{
"cell_type": "markdown",
"id": "4816a966",
"metadata": {},
"source": [
"## 🔄 **Day 2 - Remote ↔ Local AI**\n",
"\n",
"Seamless switching between OpenAI (cloud) and Ollama (local) using `set_endpoint()`\n"
]
},
{
"cell_type": "markdown",
"id": "7b650e50",
"metadata": {},
"source": [
"## 🚀 **Day 5 - Multi-Page Intelligence**\n",
"\n",
"AI-powered link analysis + automated company brochure generation\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d586747e",
"metadata": {},
"outputs": [],
"source": [
"# week1 -> day2\n",
"Summarizer.set_endpoint(\"http://localhost:11434/v1\")\n",
"Summarizer.set_model(\"llama3.2\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43331574",
"metadata": {},
"outputs": [],
"source": [
"Summarizer.display_summary(\"https://www.linkedin.com\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e4e90b4a",
"metadata": {},
"outputs": [],
"source": [
"Summarizer = WebpageSummarizer(\"gpt-5-nano\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fb6b2d25",
"metadata": {},
"outputs": [],
"source": [
"LINK_SYSTEM_PROMPT = \"\"\"\n",
" You are provided with a list of links found on a webpage.\n",
" You are able to decide which of the links would be most relevant to include in a brochure about the company,\n",
" such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
" You should respond in JSON as in this example:\n",
"\n",
" {\n",
" \"links\": [\n",
" {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
" {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n",
" ]\n",
" }\n",
" \"\"\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "da54f8ca",
"metadata": {},
"outputs": [],
"source": [
"BRAND_SYSTEM_PROMPT = \"\"\" \n",
"You are an assistant that analyzes the contents of several relevant pages from a company website\n",
"and creates a short brochure about the company for prospective customers, investors and recruits.\n",
"Respond in markdown without code blocks.\n",
"Include details of company culture, customers and careers/jobs if you have the information. \n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b6055ce5",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"# Hugging Face — The AI community building the future\n",
"\n",
"Hugging Face is the collaboration platform at the heart of the machine learning community. We empower researchers, engineers, and end users to learn, share, and build open, ethical AI together.\n",
"\n",
"---\n",
"\n",
"## What we do\n",
"\n",
"- A vibrant platform where the ML community collaborates on models, datasets, and applications\n",
"- Browse 1M+ models, discover 400k+ apps, and explore 250k+ datasets\n",
"- Multi-modality support: text, image, video, audio, and even 3D\n",
"- Build and showcase your ML portfolio by sharing your work with the world\n",
"- Sign up to join a thriving ecosystem and accelerate your ML journey\n",
"\n",
"---\n",
"\n",
"## The platform (products and capabilities)\n",
"\n",
"- Hub for Models, Datasets, and Spaces\n",
" - Host and collaborate on unlimited public models, datasets, and applications\n",
"- HF Open Source Stack\n",
" - Move faster with a comprehensive open source foundation\n",
"- Inference & Deployment\n",
" - Inference Endpoints to deploy at scale; GPU-enabled Spaces in a few clicks\n",
" - Inference Providers give access to 45,000+ models via a single unified API (no service fees)\n",
"- HuggingChat Omni\n",
" - Chat with AI across the ecosystem\n",
"- Services for teams\n",
" - Enterprise-grade security, access controls, and dedicated support\n",
" - Starting at $20 per user per month\n",
"- Compute options\n",
" - Starting at $0.60/hour for GPU\n",
"- Open ecosystem\n",
" - Our open source projects power the ML toolchain and community\n",
" - Key projects include Transformers, Diffusers, Safetensors, Tokenizers, TRL, Transformers.js, smolagents, and more\n",
"\n",
"---\n",
"\n",
"## Our open source core\n",
"\n",
"Were building the foundation of ML tooling with the community. Our flagship projects include:\n",
"- Transformers (state-of-the-art models for PyTorch)\n",
"- Diffusers (diffusion models)\n",
"- Safetensors (safe storage/distribution of weights)\n",
"- Hub Python Library (Python client for the Hugging Face Hub)\n",
"- Tokenizers, TRL, Transformers.js, smolagents\n",
"- These projects power the vast Hugging Face ecosystem and enable researchers and developers to innovate openly\n",
"\n",
"---\n",
"\n",
"## Customers, partners, and impact\n",
"\n",
"- More than 50,000 organizations use Hugging Face\n",
"- Notable teams and enterprises rely on our platform, including leaders such as Meta AI, Amazon, Google, Microsoft, Intel, Grammarly, Writer, and more\n",
"- We support both individual researchers and large teams with scalable, secure solutions\n",
"\n",
"---\n",
"\n",
"## Culture, community, and values\n",
"\n",
"- Open and ethical AI future, built together with the community\n",
"- A learning-first, collaborative environment that values openness and sharing\n",
"- Strong emphasis on open source tooling and transparent collaboration\n",
"- A platform that empowers the next generation of ML engineers, scientists, and end users\n",
"\n",
"From brand storytelling to product strategy, we emphasize a cooperative, community-driven approach to advancing AI in a responsible way.\n",
"\n",
"---\n",
"\n",
"## Careers and how to join\n",
"\n",
"- We regularly post opportunities on our Careers page. If youre excited by open science, open source tooling, and building tools that empower thousands of practitioners, Hugging Face could be a great fit.\n",
"- Join a growing, mission-driven team that supports developers, researchers, and enterprise customers with cutting-edge AI tooling\n",
"\n",
"---\n",
"\n",
"## How to engage\n",
"\n",
"- Explore Models, Datasets, and Spaces\n",
"- Try HuggingChat Omni\n",
"- Sign up to build your ML portfolio and collaborate with the community\n",
"- For teams, learn about our enterprise options, security, and dedicated support\n",
"\n",
"---\n",
"\n",
"## Why invest or partner with Hugging Face\n",
"\n",
"- A thriving, open-source ecosystem with broad adoption across industry and academia\n",
"- A scalable platform that combines models, datasets, spaces, and applications under one roof\n",
"- A proven track record of enabling organizations to accelerate AI development while offering enterprise-grade security and support\n",
"- A growing customer base and a clear pathway from community tools to enterprise deployment\n",
"\n",
"---\n",
"\n",
"If youd like more detail on specific products, a few success stories, or to see current open roles, I can pull together a concise section tailored to investors, customers, or prospective hires."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Generate brochure without streaming the response\n",
"Summarizer.generate_brochure(\"Hugging Face\", \"https://huggingface.co\", LINK_SYSTEM_PROMPT, BRAND_SYSTEM_PROMPT)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ff5a5341",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"# Edward (Ed) Donner — Co-founder & CTO, Nebula.io\n",
"\n",
"A glimpse into the mission, technology, and culture behind Nebula.io, led by Ed Donner, with a focus on transforming recruitment through AI.\n",
"\n",
"## Who we are\n",
"- Edward (Ed) Donner is the co-founder and CTO of Nebula.io.\n",
"- Nebula.io applies Generative AI and other machine learning to help recruiters source, understand, engage, and manage talent.\n",
"- The platform uses a patented matching model that connects people with roles more accurately and quickly—without relying on keywords.\n",
"\n",
"## What we do\n",
"- Enable recruiters to source, understand, engage, and manage talent at scale.\n",
"- Use proprietary, verticalized LLMs tailored for talent and hiring workflows.\n",
"- Offer a patented matching model that improves accuracy and speed, with no keyword tyranny.\n",
"- Provide a platform that is award-winning and backed by press coverage, designed to help people discover roles where they will thrive.\n",
"- The product is described as free to try, offering a no-barrier way to explore its capabilities.\n",
"\n",
"## Our technology and approach\n",
"- Proprietary LLMs specialized for talent recruitment.\n",
"- A patented matching engine that aligns people with roles more effectively than traditional keyword-based methods.\n",
"- Emphasis on real-world impact: applying AI to help people discover their potential and pursue their Ikigai—finding roles where they can be fulfilled and successful.\n",
"- The platform supports Gen AI and Agentic AI use cases, including practical deployments at scale (evidenced by references to AWS-scale implementations).\n",
"\n",
"## Why Nebula.io matters\n",
"- Addressing a broad human capital challenge: many people feel uninspired or disengaged at work, and Nebula.io aims to change that by better matching individuals to meaningful roles.\n",
"- The long-term vision centers on raising human prosperity by helping people pursue fulfilling career paths.\n",
"\n",
"## History, credibility, and impact\n",
"- Origin: Nebula.io traces back to Eds prior venture, untapt (founded in 2013), which built talent marketplaces and data science tools for recruitment.\n",
"- Early recognition: selected for the Accenture FinTech Innovation Lab; named an American Banker Top 20 Company To Watch.\n",
"- Media coverage: features in Fast Company, Forbes, and American Banker; Ed has spoken publicly about AI and recruitment, including high-profile interviews.\n",
"- Legacy of real-world impact: Nebula.io builds on a track record of applying AI to recruitment challenges and delivering value to customers.\n",
"\n",
"## Culture and values\n",
"- Ikigai-driven philosophy: helping people discover their potential and pursue meaningful work.\n",
"- A hands-on, creative founder who blends technical rigor with curiosity (Eds interests include coding, experimenting with LLMs, DJing, and exploring tech culture).\n",
"- A pragmatic, impact-focused approach to AI—prioritizing real-world problems and measurable outcomes for customers and candidates alike.\n",
"\n",
"## Customers and impact\n",
"- The platform is used by recruiters today to source, understand, engage, and manage talent.\n",
"- The emphasis is on delivering a better, faster, more accurate matching experience—reducing reliance on keyword matching and accelerating hiring outcomes.\n",
"- While specific customer names arent listed on the public pages, the platform is described as having happy customers and broad press coverage, underscoring credibility and market reception.\n",
"\n",
"## Careers and opportunities\n",
"- The site highlights a culture of innovation and hands-on AI work, but does not list open job postings.\n",
"- For those inspired to work at the intersection of AI and talent, Nebula.io invites connections and conversations about opportunities to contribute to real-world hiring problems.\n",
"- If youre interested in joining or collaborating, consider reaching out to Ed Donner and exploring how your skills could fit the mission.\n",
"\n",
"## How to connect\n",
"- Email: ed [at] edwarddonner [dot] com\n",
"- Website: www.edwarddonner.com\n",
"- Follow Ed on social: LinkedIn, Twitter, Facebook\n",
"- Newsletter: Subscribe to updates and course offerings related to AI, LLMs, and talent acquisition\n",
"\n",
"## Why invest or partner with Nebula.io\n",
"- Strong founder-led vision focused on meaningful, measurable outcomes in hiring.\n",
"- Proven track record through prior ventures and credible industry recognition.\n",
"- Patent-backed technology offering a differentiated approach to talent matching.\n",
"- Clear social impact goal: helping people find roles where they will be fulfilled and productive, contributing to broader prosperity.\n",
"\n",
"If youd like a tailored brochure version for investors, customers, or potential recruits, I can adjust the emphasis and add any additional details youd like highlighted."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Generate brochure while streaming the response\n",
"Summarizer.generate_brochure(\"Ed Donner\", \"https://edwarddonner.com\", LINK_SYSTEM_PROMPT, BRAND_SYSTEM_PROMPT, stream=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "3f84d4c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7862\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7862/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate brochure using the Gradio interface\n",
"company_name = gr.Textbox(label=\"Company Name\", info=\"Write the name of the company\")\n",
"company_url = gr.Textbox(label=\"Company URL\", info=\"Write the URL of the company\")\n",
"link_system_prompt = gr.Textbox(\n",
" label=\"Link System Prompt\", \n",
" info=\"This is a system prompt to decide which of the links would be most relevant to include in a brochure about the company\", \n",
" value=LINK_SYSTEM_PROMPT\n",
")\n",
"brand_system_prompt = gr.Textbox(\n",
" label=\"Brand System Prompt\", \n",
" info=\"This is a system prompt that analyzes the contents of several relevant pages from a company website and creates a short brochure about the company for prospective customers, investors and recruits.\", \n",
" value=BRAND_SYSTEM_PROMPT\n",
")\n",
"stream_value = gr.Checkbox(label=\"Stream\", value=False)\n",
"gr_output = gr.Markdown(label=\"Response\")\n",
"\n",
"interface = gr.Interface(\n",
" fn=Summarizer.generate_brochure_with_gradio, \n",
" title=\"Brochure Generator\", \n",
" inputs=[company_name, company_url, link_system_prompt, brand_system_prompt, stream_value], \n",
" outputs=[gr_output], \n",
" flagging_mode=\"never\"\n",
")\n",
"\n",
"interface.launch(inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7114df30",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}