diff --git a/week1/community-contributions/solisoma/end_of_week_exercise.ipynb b/week1/community-contributions/solisoma/end_of_week_exercise.ipynb new file mode 100644 index 0000000..e879bf5 --- /dev/null +++ b/week1/community-contributions/solisoma/end_of_week_exercise.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "6df489a5", + "metadata": {}, + "outputs": [], + "source": [ + "# week1 -> day1\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from IPython.display import Markdown, display, update_display\n", + "from openai import OpenAI\n", + "\n", + "#week2 -> day2\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8e7fbf42", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "api_key:str = os.getenv('OPENAI_API_KEY')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b9266d13", + "metadata": {}, + "outputs": [], + "source": [ + "class SolveTechnicalQuestions:\n", + " _system_prompt = \"\"\"\n", + " You are a snarkyassistant that analyzes the contents of a website, \n", + " and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n", + " Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n", + " \"\"\"\n", + "\n", + " def __init__(self, model: str = \"gpt-4o-mini\") -> None:\n", + " self.openai_client = OpenAI()\n", + " self._MODEL = model\n", + "\n", + " def get_user_technical_question_prompt(self, question:str):\n", + " prompt = f\"\"\"\n", + " Answer this technical questio comprehensively:\n", + " Provide:\n", + " 1. A clear, accurate answer\n", + " 2. Code examples if relevant\n", + " 3. Best practices and recommendations\n", + " 4. Potential pitfalls or considerations\n", + " 5. Additional resources or references if helpful\n", + "\n", + " Format your response in a structured, easy-to-read manner.\n", + "\n", + " Question {question}\n", + " \"\"\"\n", + "\n", + " return prompt\n", + " \n", + " def set_system_prompt(self, system_prompt: str) -> None:\n", + " self._system_prompt = system_prompt\n", + " \n", + " def set_endpoint(self, endpoint: str, api_key: str = \"ollama\") -> None:\n", + " self.openai_client = OpenAI(base_url=endpoint, api_key=api_key)\n", + "\n", + " def set_model(self, model: str) -> None:\n", + " self._MODEL = model\n", + "\n", + " def start(self, stream=False):\n", + " try:\n", + " while True:\n", + " question = input(\">>> \")\n", + " \n", + " if question.strip().lower() in ['quit', 'exit', 'q']:\n", + " print(\"Goodbye!\")\n", + " break\n", + " \n", + " if not question.strip():\n", + " print(\"Please enter a question.\")\n", + " continue\n", + " \n", + " message = self.get_user_technical_question_prompt(question.strip())\n", + " \n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL, \n", + " messages=[\n", + " {\"role\": \"system\", \"content\": self._system_prompt},\n", + " {\"role\": \"user\", \"content\": message},\n", + " ],\n", + " stream=stream\n", + " )\n", + "\n", + " if stream:\n", + " full_response = \"\"\n", + " display_handle = display(Markdown(full_response), display_id=True)\n", + " for chunk in response:\n", + " if chunk.choices[0].delta.content:\n", + " full_response += chunk.choices[0].delta.content\n", + " update_display(Markdown(full_response), display_id=display_handle.display_id)\n", + " full_response += \"\\n\"\n", + " update_display(Markdown(full_response), display_id=display_handle.display_id)\n", + " else:\n", + " full_response = response.choices[0].message.content\n", + " display(Markdown(full_response))\n", + " \n", + " except KeyboardInterrupt:\n", + " print(\"\\nGoodbye!\")\n", + " except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n", + " def start_with_gradio(self, question:str, stream=False):\n", + " if not question.strip():\n", + " return \"Please enter a question.\"\n", + " \n", + " message = self.get_user_technical_question_prompt(question.strip())\n", + " \n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL, \n", + " messages=[\n", + " {\"role\": \"system\", \"content\": self._system_prompt},\n", + " {\"role\": \"user\", \"content\": message},\n", + " ],\n", + " stream=stream\n", + " )\n", + "\n", + " if stream:\n", + " full_response = \"\"\n", + " for chunk in response:\n", + " if chunk.choices[0].delta.content:\n", + " full_response += chunk.choices[0].delta.content\n", + " yield full_response\n", + " full_response += \"\\n\"\n", + " yield full_response\n", + " else:\n", + " yield response.choices[0].message.content\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0bddb2e5", + "metadata": {}, + "outputs": [], + "source": [ + "TECHNICAL_SYSTEM_PROMPT = \"\"\"\n", + "You are an expert technical assistant with deep knowledge in:\n", + "\n", + "PROGRAMMING & DEVELOPMENT:\n", + "- Python, JavaScript, Java, C++, Go, Rust, TypeScript\n", + "- Web development (React, Vue, Angular, Node.js)\n", + "- Mobile development (iOS, Android, Flutter)\n", + "- DevOps (Docker, Kubernetes, CI/CD, AWS, Azure, GCP)\n", + "- Database systems (SQL, NoSQL, PostgreSQL, MongoDB)\n", + "- Software architecture patterns and best practices\n", + "\n", + "SYSTEMS & INFRASTRUCTURE:\n", + "- Operating systems (Linux, Windows, macOS)\n", + "- Networking protocols and security\n", + "- Cloud computing and distributed systems\n", + "- Monitoring, logging, and observability\n", + "- Performance optimization and scaling\n", + "\n", + "AI & MACHINE LEARNING:\n", + "- Machine learning algorithms and frameworks\n", + "- Deep learning (TensorFlow, PyTorch)\n", + "- Natural language processing\n", + "- Computer vision and image processing\n", + "- MLOps and model deployment\n", + "\n", + "RESPONSE GUIDELINES:\n", + "1. Provide accurate, up-to-date technical information\n", + "2. Include code examples when relevant\n", + "3. Explain complex concepts clearly\n", + "4. Suggest best practices and alternatives\n", + "5. Warn about potential pitfalls or security issues\n", + "6. Reference official documentation when appropriate\n", + "\n", + "Always prioritize accuracy and practical applicability in your technical responses.\n", + "\"\"\"\n", + "\n", + "Chat = SolveTechnicalQuestions()\n", + "Chat.set_system_prompt(TECHNICAL_SYSTEM_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8675757", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Understanding the `for` Loop\n", + "\n", + "### 1. Clear and Accurate Answer\n", + "\n", + "A `for` loop is a control flow statement used in programming to iterate over a sequence (like a list, tuple, string, or range) or perform a task a specific number of times. It allows you to execute a block of code repeatedly, which is crucial for automating repetitive tasks.\n", + "\n", + "### 2. Code Examples\n", + "\n", + "#### Python Example\n", + "\n", + "```python\n", + "# Iterate over a list\n", + "numbers = [1, 2, 3, 4, 5]\n", + "for number in numbers:\n", + " print(number)\n", + "```\n", + "\n", + "#### JavaScript Example\n", + "\n", + "```javascript\n", + "// Iterate over an array\n", + "const numbers = [1, 2, 3, 4, 5];\n", + "for (let number of numbers) {\n", + " console.log(number);\n", + "}\n", + "```\n", + "\n", + "#### Java Example\n", + "\n", + "```java\n", + "// Iterate over an array\n", + "int[] numbers = {1, 2, 3, 4, 5};\n", + "for (int number : numbers) {\n", + " System.out.println(number);\n", + "}\n", + "```\n", + "\n", + "### 3. Best Practices and Recommendations\n", + "\n", + "- **Use Descriptive Variable Names:** This improves code readability. Avoid vague names like `i` or `j`, unless they are commonly used as loop counters.\n", + " \n", + "- **Limit Loop Complexity:** Ensure that the logic inside the loop is straightforward. If the loop gets complicated, consider refactoring or extracting the logic into a separate function.\n", + "\n", + "- **Control Iteration with Care:** If you're iterating through large datasets, be mindful of performance impacts and consider alternatives (like list comprehensions in Python).\n", + "\n", + "### 4. Potential Pitfalls or Considerations\n", + "\n", + "- **Off-by-One Errors:** These are common when dealing with loop boundaries. Always double-check loop conditions to ensure you don’t miss elements or go out of range.\n", + "\n", + "- **Infinite Loops:** Ensure that your loop has a condition that eventually becomes false, or it could result in an infinite loop, causing your program to hang.\n", + "\n", + "- **Modifying the Loop Variable:** Changing the loop variable within the loop’s body can lead to unexpected behaviors, especially in languages like Python.\n", + "\n", + "### 5. Additional Resources or References\n", + "\n", + "- [Python for Loop Documentation](https://docs.python.org/3/reference/compound_stmts.html#for)\n", + "- [JavaScript for Loop Documentation](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for)\n", + "- [Java for Loop Documentation](https://docs.oracle.com/javase/tutorial/java/nutsandbolts/ch04.html#for)\n", + "\n", + "These resources provide in-depth explanations and examples for different programming languages, and can be useful for further learning about `for` loops and loops in general.\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Goodbye!\n" + ] + } + ], + "source": [ + "# Set stream to true to allow streaming of the response\n", + "# It Mimics REPL\n", + "# After running look up to see a terminal where you put in your question\n", + "Chat.start(stream=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7a086b95", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### What is an `if` Statement?\n", + "\n", + "An `if` statement is a fundamental control flow statement in programming that allows you to execute a block of code based on a specified condition. If the condition evaluates to `true`, the block of code will execute; otherwise, it will be skipped.\n", + "\n", + "#### 1. A Clear, Accurate Answer\n", + "\n", + "In programming, the `if` statement checks a condition. If the condition is `true`, the code inside the `if` block is executed. If the condition is `false`, the block is ignored. \n", + "\n", + "Here’s the basic syntax in Python and JavaScript as examples:\n", + "\n", + "**Python Syntax:**\n", + "```python\n", + "if condition:\n", + " # Code to execute if condition is true\n", + "```\n", + "\n", + "**JavaScript Syntax:**\n", + "```javascript\n", + "if (condition) {\n", + " // Code to execute if condition is true\n", + "}\n", + "```\n", + "\n", + "#### 2. Code Examples\n", + "\n", + "**Python Example:**\n", + "```python\n", + "temperature = 30\n", + "\n", + "if temperature > 25:\n", + " print(\"It's a hot day!\")\n", + "```\n", + "\n", + "**JavaScript Example:**\n", + "```javascript\n", + "let temperature = 30;\n", + "\n", + "if (temperature > 25) {\n", + " console.log(\"It's a hot day!\");\n", + "}\n", + "```\n", + "\n", + "In both examples, if the `temperature` variable is greater than 25, the corresponding message will be printed to the console.\n", + "\n", + "#### 3. Best Practices and Recommendations\n", + "\n", + "- **Use Clear Conditions**: Ensure that the condition being evaluated is clear and understandable. \n", + "- **Avoid Complex Conditions**: If conditions become too complex, consider breaking them down into multiple `if` statements or using logical operators for clarity.\n", + "- **Indentation**: Properly indent your code blocks. This improves readability and maintainability.\n", + "- **Use `elif`/`else if` for Multiple Conditions**: When evaluating multiple conditions, use `elif` (Python) or `else if` (JavaScript) to make the logic cleaner.\n", + " \n", + " **Example with `elif`:**\n", + " ```python\n", + " score = 85\n", + "\n", + " if score >= 90:\n", + " print(\"Grade: A\")\n", + " elif score >= 80:\n", + " print(\"Grade: B\")\n", + " else:\n", + " print(\"Grade: C\")\n", + " ```\n", + "\n", + "#### 4. Potential Pitfalls or Considerations\n", + "\n", + "- **Boolean Context**: Ensure that the condition evaluates to a boolean (`true` or `false`). Improper conditions could result in unexpected behavior.\n", + "- **Missing `else` or `elif`**: If not handled correctly, cases that fall outside the specified conditions may go unnoticed. Consider using an `else` statement to capture any situations not defined in prior conditions.\n", + "- **Short-Circuit Evaluation**: In languages like Python and JavaScript, using logical operators (`and`, `or`) as conditions can lead to short-circuit evaluation, which might affect the execution of your code. Be cautious about using these in conditions.\n", + "\n", + "#### 5. Additional Resources or References\n", + "\n", + "- **Python Documentation on `if` Statements**: [Python If Statement](https://docs.python.org/3/tutorial/controlflow.html#if-statements)\n", + "- **JavaScript Documentation on Conditional Statements**: [MDN Web Docs - Conditionals](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Control_flow_and_error_handling#conditional_statements)\n", + "\n", + "Understanding how `if` statements work is crucial for implementing decision-making logic in your programs, enabling dynamic behavior based on varying conditions." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Goodbye!\n" + ] + } + ], + "source": [ + "# Set stream to false to get a single response\n", + "Chat.start(stream=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95daf1f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Ignore if you don't want to use ollama\n", + "# Here shows the ability to switch from one endpoint to another\n", + "Chat.set_endpoint(\"http://localhost:11434/v1\")\n", + "Chat.set_model(\"llama3.2\")\n", + "\n", + "Chat.start(stream=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c7d66ef7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7861\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gr_output = gr.Markdown(label=\"Response\")\n", + "stream_input = gr.Checkbox(label='Stream', value=False)\n", + "question_input = gr.Textbox(label=\"Question\", info=\"Ask it any technical question\", lines=1)\n", + "\n", + "interface = gr.Interface(\n", + " fn=Chat.start_with_gradio, \n", + " title=\"ChatGPT\", \n", + " inputs=[question_input, stream_input], \n", + " outputs=[gr_output], \n", + " flagging_mode=\"never\"\n", + ")\n", + "\n", + "interface.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed776b93", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/solisoma/week1_exercises.ipynb b/week1/community-contributions/solisoma/week1_exercises.ipynb new file mode 100644 index 0000000..6542f52 --- /dev/null +++ b/week1/community-contributions/solisoma/week1_exercises.ipynb @@ -0,0 +1,716 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7dbe3347", + "metadata": {}, + "source": [ + "# 🚀 Advanced Web Scraping & AI Assistant - Week 1 Complete Exercise\n", + "\n", + "## 📋 **Notebook Overview**\n", + "\n", + "This notebook demonstrates the **complete evolution** of a web scraping solution through **Week 1** of the LLM Engineering course.\n", + "\n", + "### **Exercise Progression:**\n", + "- **Cells 1-7**: Week 1 Day 1 (basic scraping + AI)\n", + "- **Cell 8**: Week 1 Day 2 (Ollama integration) \n", + "- **Cells 9-13**: Week 1 Day 5 (advanced features + brochure generation)\n", + "\n", + "### **Key Learning Progression:**\n", + "1. **Day 1**: JavaScript scraping problem → Selenium solution\n", + "2. **Day 2**: Remote ↔ Local AI flexibility (OpenAI ↔ Ollama)\n", + "3. **Day 5**: Multi-page intelligence + business automation\n", + "\n", + "### **Technical Skills:**\n", + "- Selenium WebDriver, OpenAI API, Ollama, JSON processing, Class inheritance, Streaming responses\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9addd8d1", + "metadata": {}, + "outputs": [], + "source": [ + "# week1 -> day1\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium import webdriver\n", + "from IPython.display import Markdown, display, update_display\n", + "from openai import OpenAI\n", + "\n", + "# week1 -> day5\n", + "import json\n", + "from typing import Dict, List\n", + "\n", + "#week2 -> day2\n", + "import gradio as gr" + ] + }, + { + "cell_type": "markdown", + "id": "85bf7734", + "metadata": {}, + "source": [ + "## 📦 **Dependencies**\n", + "\n", + "**Week 1 Day 1**: Core scraping + AI integration\n", + "**Week 1 Day 5**: Added JSON processing + type hints\n" + ] + }, + { + "cell_type": "markdown", + "id": "f881e916", + "metadata": {}, + "source": [ + "## **Environment Setup**\n", + "\n", + "This cell loads the OpenAI API key from the `.env` file. The `override=True` parameter ensures that any existing environment variables are replaced with values from the `.env` file.\n", + "\n", + "**Important**: Make sure you have a `.env` file in your project root with:\n", + "```\n", + "OPENAI_API_KEY=your-actual-api-key-here\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7123ba55", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "api_key:str = os.getenv('OPENAI_API_KEY')" + ] + }, + { + "cell_type": "markdown", + "id": "ab17f1a7", + "metadata": {}, + "source": [ + "## 🏗️ **WebpageSummarizer Class**\n", + "\n", + "**Day 1**: Basic scraping + AI integration\n", + "**Day 2**: Remote ↔ Local flexibility (`set_endpoint`, `set_model`)\n", + "**Day 5**: Multi-page intelligence + brochure generation\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5e9cdf71", + "metadata": {}, + "outputs": [], + "source": [ + "class WebpageSummarizer:\n", + " # week1 -> day1\n", + " _system_prompt = \"\"\"\n", + " You are a snarkyassistant that analyzes the contents of a website, \n", + " and provides a short, snarky, humorous summary, ignoring text that might be navigation related.\n", + " Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n", + " \"\"\"\n", + " \n", + " # week1 -> day1\n", + " _MODEL = \"gpt-4o-mini\"\n", + "\n", + " # week1 -> day1\n", + " def __init__(self, model: str = _MODEL) -> None:\n", + " self.openai_client = OpenAI()\n", + " self.driver = webdriver.Chrome()\n", + " self._MODEL = model\n", + " \n", + " # week1 -> day1\n", + " def scrape_website(self, url: str) -> str:\n", + " self.driver.get(url)\n", + " self.driver.implicitly_wait(10)\n", + " title = self.driver.title\n", + " text_content = self.driver.find_element(By.TAG_NAME, \"body\").text\n", + " return title + \"\\n\\n\" + text_content\n", + "\n", + " # week1 -> day1\n", + " def summarize_text(self, url: str) -> str:\n", + " text = self.scrape_website(url)\n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": self._system_prompt},\n", + " {\"role\": \"user\", \"content\": text}\n", + " ]\n", + " )\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + " # week1 -> day1\n", + " def display_summary(self, url: str)-> None:\n", + " summary:str = self.summarize_text(url)\n", + " display(Markdown(summary))\n", + "\n", + " # week1 -> day2\n", + " def set_endpoint(self, endpoint: str, api_key: str = \"ollama\") -> None:\n", + " self.openai_client = OpenAI(base_url=endpoint, api_key=api_key)\n", + "\n", + " # week1 -> day2\n", + " def set_model(self, model: str) -> None:\n", + " self._MODEL = model\n", + "\n", + " # week1 -> day5\n", + " def set_system_prompt(self, system_prompt: str) -> None:\n", + " self._system_prompt = system_prompt\n", + "\n", + " # week1 -> day5\n", + " def scrape_website_links(self, url: str) -> list[str]:\n", + " self.driver.get(url)\n", + " self.driver.implicitly_wait(10)\n", + " \n", + " links = self.driver.find_elements(By.TAG_NAME, \"a\")\n", + " return [link.get_attribute(\"href\") for link in links \n", + " if link.get_attribute(\"href\") and link.get_attribute(\"href\").strip()]\n", + "\n", + " # week1 -> day5\n", + " def generate_user_prompt_to_select_relevant_links(self, url: str) -> str:\n", + " user_prompt = f\"\"\"\n", + " Here is the list of links on the website {url} -\n", + " Please decide which of these are relevant web links for a brochure about the company, \n", + " respond with the full https URL in JSON format.\n", + " Do not include Terms of Service, Privacy, email links.\n", + "\n", + " Links (some might be relative links):\n", + " \"\"\"\n", + " links = self.scrape_website_links(url)\n", + " user_prompt += \"\\n\".join(links)\n", + " return user_prompt\n", + "\n", + " # week1 -> day5\n", + " def select_relevant_links(self, url:str) -> Dict[str, List[Dict[str, str]]]:\n", + " message = self.generate_user_prompt_to_select_relevant_links(url)\n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": self._system_prompt},\n", + " {\"role\": \"user\", \"content\": message}\n", + " ],\n", + " response_format={\"type\": \"json_object\"}\n", + " )\n", + "\n", + " json_response = json.loads(response.choices[0].message.content)\n", + "\n", + " return json_response\n", + "\n", + " # week1 -> day5\n", + " def fetch_page_and_all_relevant_links(self, url):\n", + " contents = self.scrape_website(url)\n", + " relevant_links = self.select_relevant_links(url)\n", + " result = f\"## Landing Page:\\n\\n{contents}\\n## Relevant Links:\\n\"\n", + " for link in relevant_links[\"links\"]:\n", + " result += f\"\\n\\n### Link: {link[\"type\"]}\\n\"\n", + " result += self.scrape_website(link[\"url\"])\n", + " return result\n", + " \n", + " def get_user_prompt_for_brochure(self, company_name:str, url:str) -> str:\n", + " user_prompt = f\"\"\"\n", + " You are looking at a company called: {company_name}\n", + " Here are the contents of its landing page and other relevant pages;\n", + " use this information to build a short brochure of the company in markdown without code blocks.\\n\\n\n", + " \"\"\"\n", + " user_prompt += self.fetch_page_and_all_relevant_links(url)\n", + " user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n", + " return user_prompt\n", + "\n", + " # week1 -> day5\n", + " def generate_brochure(self, company_name:str, url:str, link_prompt: str, brochure_prompt: str, stream: bool = False) -> None:\n", + " self.set_system_prompt(link_prompt)\n", + " contents = self.get_user_prompt_for_brochure(company_name,url)\n", + " self.set_system_prompt(brochure_prompt)\n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL,\n", + " messages=[{\"role\": \"system\", \"content\": self._system_prompt}, {\"role\": \"user\", \"content\": contents}],\n", + " stream=stream # for streaming response\n", + " )\n", + "\n", + " if stream:\n", + " full_response = \"\"\n", + " display_handle = display(Markdown(full_response), display_id=True)\n", + " for chunk in response:\n", + " full_response += chunk.choices[0].delta.content or \"\"\n", + " update_display(Markdown(full_response), display_id=display_handle.display_id)\n", + " else:\n", + " result = response.choices[0].message.content\n", + " display(Markdown(result))\n", + "\n", + " # week2 -> day2\n", + " def generate_brochure_with_gradio(self, company_name:str, url:str, link_prompt: str, brochure_prompt: str, stream: bool = False):\n", + " self.set_system_prompt(link_prompt)\n", + " contents = self.get_user_prompt_for_brochure(company_name,url)\n", + " self.set_system_prompt(brochure_prompt)\n", + " response = self.openai_client.chat.completions.create(\n", + " model=self._MODEL,\n", + " messages=[{\"role\": \"system\", \"content\": self._system_prompt}, {\"role\": \"user\", \"content\": contents}],\n", + " stream=stream # for streaming response\n", + " )\n", + "\n", + " if stream:\n", + " full_response = \"\"\n", + " for chunk in response:\n", + " full_response += chunk.choices[0].delta.content or \"\"\n", + " yield full_response\n", + " else:\n", + " result = response.choices[0].message.content\n", + " yield result\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "cc085a2b", + "metadata": {}, + "source": [ + "## Demo: LinkedIn Summary\n", + "\n", + "This cell demonstrates the WebpageSummarizer in action by:\n", + "\n", + "1. **Creating an instance** with the GPT-5-nano model\n", + "2. **Scraping LinkedIn's homepage** - a JavaScript-heavy site that traditional scraping can't handle\n", + "3. **Generating a snarky summary** that captures the essence of LinkedIn's professional networking platform\n", + "\n", + "### What Happens:\n", + "- Selenium opens Chrome browser (visible window)\n", + "- Navigates to LinkedIn.com\n", + "- Waits for JavaScript to render all content\n", + "- Extracts all visible text from the page\n", + "- Sends content to OpenAI for summarization\n", + "- Displays the humorous, sarcastic summary in markdown format\n", + "\n", + "### Expected Output:\n", + "A witty, entertaining summary that captures LinkedIn's key features and business model with a humorous tone.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cfe93bea", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)\n" + ] + }, + { + "data": { + "text/markdown": [ + "LinkedIn’s homepage in a nutshell: a corporate buffet of jobs, courses, tools, and guilt-inducing “Open to Work” vibes, wrapped in a lot of navigation clutter.\n", + "\n", + "- Top Content: Curated posts and expert insights by topic (Career, Productivity, Finance, Soft Skills, Project Management, etc.). Yes, because your feed needed more buzzwords.\n", + "- Jobs: Find the right job or internship across a big menu of roles (Engineering, Marketing, IT, HR, Admin, Retail, etc.). Tempting you with endless openings.\n", + "- Post your job: Post a job for millions to see. Because nothing says “we’re hiring” like a public billboard.\n", + "- Software tools: Discover the best software—CRM, HRMS, Project Management, Help Desk, etc.—as if you were deciding which inbox to dread today.\n", + "- Games: Keep your mind sharp with daily games (Pinpoint, Queens, Crossclimb, Tango, Zip, Mini Sudoku). Productivity through micro-snacks!\n", + "- Open To Work: Privately tell recruiters or publicly broadcast you’re looking for opportunities. Subtle as a neon sign.\n", + "- Connect and Learn: Find people you know, learn new skills, and choose topics to study. Professional life, now with more onboarding prompts.\n", + "- Who is LinkedIn for?: Anyone navigating professional life—because apparently that’s everyone.\n", + "- Bottom line: It’s a hub of professional action—job hunting, learning, toolshopping, and the occasional brain teaser to distract you from the grim reality of deadlines." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# week1 -> day1\n", + "Summarizer = WebpageSummarizer(\"gpt-5-nano\")\n", + "\n", + "Summarizer.display_summary(\"https://www.linkedin.com\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "4816a966", + "metadata": {}, + "source": [ + "## 🔄 **Day 2 - Remote ↔ Local AI**\n", + "\n", + "Seamless switching between OpenAI (cloud) and Ollama (local) using `set_endpoint()`\n" + ] + }, + { + "cell_type": "markdown", + "id": "7b650e50", + "metadata": {}, + "source": [ + "## 🚀 **Day 5 - Multi-Page Intelligence**\n", + "\n", + "AI-powered link analysis + automated company brochure generation\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d586747e", + "metadata": {}, + "outputs": [], + "source": [ + "# week1 -> day2\n", + "Summarizer.set_endpoint(\"http://localhost:11434/v1\")\n", + "Summarizer.set_model(\"llama3.2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43331574", + "metadata": {}, + "outputs": [], + "source": [ + "Summarizer.display_summary(\"https://www.linkedin.com\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e4e90b4a", + "metadata": {}, + "outputs": [], + "source": [ + "Summarizer = WebpageSummarizer(\"gpt-5-nano\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fb6b2d25", + "metadata": {}, + "outputs": [], + "source": [ + "LINK_SYSTEM_PROMPT = \"\"\"\n", + " You are provided with a list of links found on a webpage.\n", + " You are able to decide which of the links would be most relevant to include in a brochure about the company,\n", + " such as links to an About page, or a Company page, or Careers/Jobs pages.\n", + " You should respond in JSON as in this example:\n", + "\n", + " {\n", + " \"links\": [\n", + " {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n", + " {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n", + " ]\n", + " }\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "da54f8ca", + "metadata": {}, + "outputs": [], + "source": [ + "BRAND_SYSTEM_PROMPT = \"\"\" \n", + "You are an assistant that analyzes the contents of several relevant pages from a company website\n", + "and creates a short brochure about the company for prospective customers, investors and recruits.\n", + "Respond in markdown without code blocks.\n", + "Include details of company culture, customers and careers/jobs if you have the information. \n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b6055ce5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "# Hugging Face — The AI community building the future\n", + "\n", + "Hugging Face is the collaboration platform at the heart of the machine learning community. We empower researchers, engineers, and end users to learn, share, and build open, ethical AI together.\n", + "\n", + "---\n", + "\n", + "## What we do\n", + "\n", + "- A vibrant platform where the ML community collaborates on models, datasets, and applications\n", + "- Browse 1M+ models, discover 400k+ apps, and explore 250k+ datasets\n", + "- Multi-modality support: text, image, video, audio, and even 3D\n", + "- Build and showcase your ML portfolio by sharing your work with the world\n", + "- Sign up to join a thriving ecosystem and accelerate your ML journey\n", + "\n", + "---\n", + "\n", + "## The platform (products and capabilities)\n", + "\n", + "- Hub for Models, Datasets, and Spaces\n", + " - Host and collaborate on unlimited public models, datasets, and applications\n", + "- HF Open Source Stack\n", + " - Move faster with a comprehensive open source foundation\n", + "- Inference & Deployment\n", + " - Inference Endpoints to deploy at scale; GPU-enabled Spaces in a few clicks\n", + " - Inference Providers give access to 45,000+ models via a single unified API (no service fees)\n", + "- HuggingChat Omni\n", + " - Chat with AI across the ecosystem\n", + "- Services for teams\n", + " - Enterprise-grade security, access controls, and dedicated support\n", + " - Starting at $20 per user per month\n", + "- Compute options\n", + " - Starting at $0.60/hour for GPU\n", + "- Open ecosystem\n", + " - Our open source projects power the ML toolchain and community\n", + " - Key projects include Transformers, Diffusers, Safetensors, Tokenizers, TRL, Transformers.js, smolagents, and more\n", + "\n", + "---\n", + "\n", + "## Our open source core\n", + "\n", + "We’re building the foundation of ML tooling with the community. Our flagship projects include:\n", + "- Transformers (state-of-the-art models for PyTorch)\n", + "- Diffusers (diffusion models)\n", + "- Safetensors (safe storage/distribution of weights)\n", + "- Hub Python Library (Python client for the Hugging Face Hub)\n", + "- Tokenizers, TRL, Transformers.js, smolagents\n", + "- These projects power the vast Hugging Face ecosystem and enable researchers and developers to innovate openly\n", + "\n", + "---\n", + "\n", + "## Customers, partners, and impact\n", + "\n", + "- More than 50,000 organizations use Hugging Face\n", + "- Notable teams and enterprises rely on our platform, including leaders such as Meta AI, Amazon, Google, Microsoft, Intel, Grammarly, Writer, and more\n", + "- We support both individual researchers and large teams with scalable, secure solutions\n", + "\n", + "---\n", + "\n", + "## Culture, community, and values\n", + "\n", + "- Open and ethical AI future, built together with the community\n", + "- A learning-first, collaborative environment that values openness and sharing\n", + "- Strong emphasis on open source tooling and transparent collaboration\n", + "- A platform that empowers the next generation of ML engineers, scientists, and end users\n", + "\n", + "From brand storytelling to product strategy, we emphasize a cooperative, community-driven approach to advancing AI in a responsible way.\n", + "\n", + "---\n", + "\n", + "## Careers and how to join\n", + "\n", + "- We regularly post opportunities on our Careers page. If you’re excited by open science, open source tooling, and building tools that empower thousands of practitioners, Hugging Face could be a great fit.\n", + "- Join a growing, mission-driven team that supports developers, researchers, and enterprise customers with cutting-edge AI tooling\n", + "\n", + "---\n", + "\n", + "## How to engage\n", + "\n", + "- Explore Models, Datasets, and Spaces\n", + "- Try HuggingChat Omni\n", + "- Sign up to build your ML portfolio and collaborate with the community\n", + "- For teams, learn about our enterprise options, security, and dedicated support\n", + "\n", + "---\n", + "\n", + "## Why invest or partner with Hugging Face\n", + "\n", + "- A thriving, open-source ecosystem with broad adoption across industry and academia\n", + "- A scalable platform that combines models, datasets, spaces, and applications under one roof\n", + "- A proven track record of enabling organizations to accelerate AI development while offering enterprise-grade security and support\n", + "- A growing customer base and a clear pathway from community tools to enterprise deployment\n", + "\n", + "---\n", + "\n", + "If you’d like more detail on specific products, a few success stories, or to see current open roles, I can pull together a concise section tailored to investors, customers, or prospective hires." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generate brochure without streaming the response\n", + "Summarizer.generate_brochure(\"Hugging Face\", \"https://huggingface.co\", LINK_SYSTEM_PROMPT, BRAND_SYSTEM_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ff5a5341", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "# Edward (Ed) Donner — Co-founder & CTO, Nebula.io\n", + "\n", + "A glimpse into the mission, technology, and culture behind Nebula.io, led by Ed Donner, with a focus on transforming recruitment through AI.\n", + "\n", + "## Who we are\n", + "- Edward (Ed) Donner is the co-founder and CTO of Nebula.io.\n", + "- Nebula.io applies Generative AI and other machine learning to help recruiters source, understand, engage, and manage talent.\n", + "- The platform uses a patented matching model that connects people with roles more accurately and quickly—without relying on keywords.\n", + "\n", + "## What we do\n", + "- Enable recruiters to source, understand, engage, and manage talent at scale.\n", + "- Use proprietary, verticalized LLMs tailored for talent and hiring workflows.\n", + "- Offer a patented matching model that improves accuracy and speed, with no keyword tyranny.\n", + "- Provide a platform that is award-winning and backed by press coverage, designed to help people discover roles where they will thrive.\n", + "- The product is described as free to try, offering a no-barrier way to explore its capabilities.\n", + "\n", + "## Our technology and approach\n", + "- Proprietary LLMs specialized for talent recruitment.\n", + "- A patented matching engine that aligns people with roles more effectively than traditional keyword-based methods.\n", + "- Emphasis on real-world impact: applying AI to help people discover their potential and pursue their Ikigai—finding roles where they can be fulfilled and successful.\n", + "- The platform supports Gen AI and Agentic AI use cases, including practical deployments at scale (evidenced by references to AWS-scale implementations).\n", + "\n", + "## Why Nebula.io matters\n", + "- Addressing a broad human capital challenge: many people feel uninspired or disengaged at work, and Nebula.io aims to change that by better matching individuals to meaningful roles.\n", + "- The long-term vision centers on raising human prosperity by helping people pursue fulfilling career paths.\n", + "\n", + "## History, credibility, and impact\n", + "- Origin: Nebula.io traces back to Ed’s prior venture, untapt (founded in 2013), which built talent marketplaces and data science tools for recruitment.\n", + "- Early recognition: selected for the Accenture FinTech Innovation Lab; named an American Banker Top 20 Company To Watch.\n", + "- Media coverage: features in Fast Company, Forbes, and American Banker; Ed has spoken publicly about AI and recruitment, including high-profile interviews.\n", + "- Legacy of real-world impact: Nebula.io builds on a track record of applying AI to recruitment challenges and delivering value to customers.\n", + "\n", + "## Culture and values\n", + "- Ikigai-driven philosophy: helping people discover their potential and pursue meaningful work.\n", + "- A hands-on, creative founder who blends technical rigor with curiosity (Ed’s interests include coding, experimenting with LLMs, DJing, and exploring tech culture).\n", + "- A pragmatic, impact-focused approach to AI—prioritizing real-world problems and measurable outcomes for customers and candidates alike.\n", + "\n", + "## Customers and impact\n", + "- The platform is used by recruiters today to source, understand, engage, and manage talent.\n", + "- The emphasis is on delivering a better, faster, more accurate matching experience—reducing reliance on keyword matching and accelerating hiring outcomes.\n", + "- While specific customer names aren’t listed on the public pages, the platform is described as having happy customers and broad press coverage, underscoring credibility and market reception.\n", + "\n", + "## Careers and opportunities\n", + "- The site highlights a culture of innovation and hands-on AI work, but does not list open job postings.\n", + "- For those inspired to work at the intersection of AI and talent, Nebula.io invites connections and conversations about opportunities to contribute to real-world hiring problems.\n", + "- If you’re interested in joining or collaborating, consider reaching out to Ed Donner and exploring how your skills could fit the mission.\n", + "\n", + "## How to connect\n", + "- Email: ed [at] edwarddonner [dot] com\n", + "- Website: www.edwarddonner.com\n", + "- Follow Ed on social: LinkedIn, Twitter, Facebook\n", + "- Newsletter: Subscribe to updates and course offerings related to AI, LLMs, and talent acquisition\n", + "\n", + "## Why invest or partner with Nebula.io\n", + "- Strong founder-led vision focused on meaningful, measurable outcomes in hiring.\n", + "- Proven track record through prior ventures and credible industry recognition.\n", + "- Patent-backed technology offering a differentiated approach to talent matching.\n", + "- Clear social impact goal: helping people find roles where they will be fulfilled and productive, contributing to broader prosperity.\n", + "\n", + "If you’d like a tailored brochure version for investors, customers, or potential recruits, I can adjust the emphasis and add any additional details you’d like highlighted." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generate brochure while streaming the response\n", + "Summarizer.generate_brochure(\"Ed Donner\", \"https://edwarddonner.com\", LINK_SYSTEM_PROMPT, BRAND_SYSTEM_PROMPT, stream=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3f84d4c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7862\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate brochure using the Gradio interface\n", + "company_name = gr.Textbox(label=\"Company Name\", info=\"Write the name of the company\")\n", + "company_url = gr.Textbox(label=\"Company URL\", info=\"Write the URL of the company\")\n", + "link_system_prompt = gr.Textbox(\n", + " label=\"Link System Prompt\", \n", + " info=\"This is a system prompt to decide which of the links would be most relevant to include in a brochure about the company\", \n", + " value=LINK_SYSTEM_PROMPT\n", + ")\n", + "brand_system_prompt = gr.Textbox(\n", + " label=\"Brand System Prompt\", \n", + " info=\"This is a system prompt that analyzes the contents of several relevant pages from a company website and creates a short brochure about the company for prospective customers, investors and recruits.\", \n", + " value=BRAND_SYSTEM_PROMPT\n", + ")\n", + "stream_value = gr.Checkbox(label=\"Stream\", value=False)\n", + "gr_output = gr.Markdown(label=\"Response\")\n", + "\n", + "interface = gr.Interface(\n", + " fn=Summarizer.generate_brochure_with_gradio, \n", + " title=\"Brochure Generator\", \n", + " inputs=[company_name, company_url, link_system_prompt, brand_system_prompt, stream_value], \n", + " outputs=[gr_output], \n", + " flagging_mode=\"never\"\n", + ")\n", + "\n", + "interface.launch(inbrowser=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7114df30", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}