diff --git a/week1/community-contributions/accommodation_assistant.ipynb b/week1/community-contributions/accommodation_assistant.ipynb new file mode 100644 index 0000000..5276ac4 --- /dev/null +++ b/week1/community-contributions/accommodation_assistant.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "44b58c16-8319-4095-b194-85b58928e6fd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "import json\n", + "import re\n", + "from typing import List, Dict\n", + "from bs4 import BeautifulSoup\n", + "from openai import OpenAI\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.service import Service\n", + "from selenium.webdriver.chrome.options import Options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bcb4ab0-30f6-4f29-a97e-02ff6e287c37", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "MODEL = \"llama3.2\"\n", + "openai = OpenAI(base_url = \"http://localhost:11434/v1\", api_key = \"ollama\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6d30cf9-0b57-44b3-a81a-ccbd622140c3", + "metadata": {}, + "outputs": [], + "source": [ + "class HotelListing:\n", + " def __init__(self, name, price, url, features = None):\n", + " self.name = name\n", + " self.price = price\n", + " self.url = url\n", + " self.features = features or []\n", + " def to_dict(self):\n", + " return {\n", + " \"name\": self.name,\n", + " \"price\": self.price,\n", + " \"url\": self.url,\n", + " \"features\": self.features\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c547397-3e14-44dc-b08e-c192028d9ded", + "metadata": {}, + "outputs": [], + "source": [ + "class BookingParser:\n", + " def __init__(self, url, headers = None):\n", + " self.url = url\n", + " self.headers = headers or {\"User-Agent\": \"Mozilla/5.0\"}\n", + " self.listings = []\n", + " self.fetch_and_parse()\n", + "\n", + " def fetch_and_parse(self):\n", + " try:\n", + " request = requests.get(self.url, headers = self.headers, timeout = 10)\n", + " request.raise_for_status()\n", + " except Exception as e:\n", + " print(f\"Page download error: {e}\")\n", + " return\n", + "\n", + " soup = BeautifulSoup(request.content, \"html.parser\")\n", + "\n", + " hotel_cards = soup.find_all(\"div\", {\"data-stid\": \"property-listing-results\"})\n", + "\n", + " if not hotel_cards:\n", + " hotel_cards = soup.find_all(\"div\", class_ = re.compile(\"property-listing|property-card-card-results\"))\n", + "\n", + " for card in hotel_cards[:10]:\n", + " listing = self._parse_hotel_card(card)\n", + " if listing:\n", + " self.listings.append(listing)\n", + "\n", + " def _parse_hotel_card(self, card):\n", + " try:\n", + " name_element = card.find(\"a\", {\"data-stid\": \"open-hotel-information\"})\n", + " if not name_element:\n", + " name_element = card.find(\"h3\") or car.find(\"span\", class_ = re.compile(\"is-visually-hidden\"))\n", + " name = name_element.get_text(strip = True) if name_element else \"name unknown\"\n", + "\n", + " price_element = card.find(\"span\", {\"class\": \"uitk-badge-base-text\"})\n", + "\n", + " price_text = price_element.get_text(strip = True) if price_element else \"0\"\n", + " price_match = request.search(r'(\\d+)', price_text.replace('$', ''))\n", + " price = int(price_match.group(1)) if price_match else 0\n", + "\n", + " link_element = card.find(\"a\", href = True)\n", + " url = \"https://www.hotels.com\" + link_element[\"href\"] if link_element else \"\"\n", + "\n", + " features = []\n", + " feature_spans = card.select('[data-stid=\"sp-content-list\"]')\n", + " if feature_spans:\n", + " items = feature_spans[0].select('li[data-stid^=\"sp-content-item\"]')\n", + " \n", + " for item in items:\n", + " text = item.get_text(strip=True)\n", + " if text:\n", + " features.append(text.lower())\n", + "\n", + " card_text = card.get_text().lower()\n", + " if \"wi-fi\" in card_text or \"wifi\" in card_text:\n", + " features.append(\"wifi\")\n", + " if \"breakfest\" in card_text:\n", + " features.append(\"breakfest\")\n", + "\n", + " return HotelListing(name, price, url, features)\n", + " except Exception as e:\n", + " print(f\"Parsing hotel card error: {e}\")\n", + " return None\n", + "\n", + " def get_listings(self):\n", + " return [listing.to_dict() for listing in self.listings]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e700023-1f0c-4e8b-a823-c5e3ce9bfb28", + "metadata": {}, + "outputs": [], + "source": [ + "def make_prompt(listings: List[Dict], user_preferences: Dict):\n", + " prompt = (\n", + " \"You are an assistant and help a user in accommodation choosing.\\n\"\n", + " \"Below is a list of hotel offers and user preferences.\\n\"\n", + " \"HOTELS OFERTS:\\n\"\n", + " f\"{json.dumps(listings, ensure_ascii = False, indent = 1)}\\n\\n\"\n", + " \"USER PREFERENCES:\\n\"\n", + " f\"{json.dumps(user_preferences, ensure_ascii = False, indent = 1)}\\n\\n\"\n", + " \"For every ofert:\\n\"\n", + " \"1) Assess suitability in 0-10 rate (where 10 = ideal suitability)\\n\"\n", + " \"2) Give 2-3 short reasons for your assessment\\n\"\n", + " \"3) Please indicate if the price is within your budget\\n\"\n", + " \"Finally, list the TOP 3 best offers with justification.\\n\"\n", + " )\n", + " return prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fa69bd-162b-4088-91ab-fe1fc39b4a50", + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_listings(listings: List[Dict], preferences: Dict):\n", + " if not listings:\n", + " print(\"No offers to analyze.\")\n", + " return None\n", + "\n", + " prompt = make_prompt(listings, preferences)\n", + "\n", + " try:\n", + " response = openai.chat.completions.create(\n", + " model = MODEL,\n", + " messages = [\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are an expert in choosing the best accommodation.\\n\" \n", + " \"You analyze offers and advise users.\"\n", + " },\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " )\n", + "\n", + " result = response.choices[0].message.content\n", + " return result\n", + "\n", + " except Exception as e:\n", + " print(f\"Communication error with LLM: {e}\")\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4ade5a4-3a3c-422d-9740-d3b647996222", + "metadata": {}, + "outputs": [], + "source": [ + "def main():\n", + " url = (\"https://www.hotels.com/Hotel-Search?destination=Warsaw%20-%20Eastern%20Poland%2C%20Poland&d1=2025-10-18&startDate=2025-10-18&d2=2025-10-20&endDate=2025-10-20&adults=1&rooms=1®ionId=6057142&sort=RECOMMENDED&theme=&userIntent=&semdtl=&categorySearch=&useRewards=false&children=&latLong&pwaDialog=&daysInFuture&stayLength\")\n", + "\n", + " preferences = {\n", + " \"max_price\": 200,\n", + " \"must_have\": [\"wifi\", \"breakfest\"],\n", + " \"number_of_rooms\": 1,\n", + " \"localization\": \"Warsaw\"\n", + " }\n", + "\n", + " print(\"🔍 Oferts downloading from Hotels.com..\")\n", + " parser = BookingParser(url)\n", + " listings = parser.get_listings()\n", + "\n", + " print(f\"✅ Found {len(listings)} offerts\\n\")\n", + " print(\"=\"*60)\n", + "\n", + " print(\"FOUND OFFERTS:\\n\")\n", + " for i, listing in enumerate(listings, 1):\n", + " print(f\"\\n{i}. {listing['name']}\")\n", + " print(f\"Amount: {listing['price']} pln\")\n", + " print(f\"Features: {', '.join(listing['features']) if listing['features'] else 'Informations lack.'}\")\n", + "\n", + " analysis = analyze_listings(listings, preferences)\n", + "\n", + " if analysis:\n", + " print(analysis)\n", + " else:\n", + " print(\"❌ Analysis failed\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/company_brochure_relevent_links.ipynb b/week1/community-contributions/company_brochure_relevent_links.ipynb new file mode 100644 index 0000000..bcdb4b1 --- /dev/null +++ b/week1/community-contributions/company_brochure_relevent_links.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "4e5da3f5-ebd0-4e20-ab89-95847187287b", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "from typing import List\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display, update_display, clear_output\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "import os\n", + "from scraper import fetch_website_links, fetch_website_contents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86adec56-3b27-46da-9b1a-1e5946a76a09", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENROUTER_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "\n", + "openrouter_url = \"https://openrouter.ai/api/v1\"\n", + "openai = OpenAI(api_key=api_key, base_url=openrouter_url)\n", + "MODEL = \"gpt-5-nano\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abf2f706-2709-404a-9fb7-774a9f57dd11", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "company_name = input(\"Enter the company name: \")\n", + "url = input(\"Enter the company url: \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "153fa3d1-3ce5-46d0-838d-3e95a4b8628b", + "metadata": {}, + "outputs": [], + "source": [ + "link_system_prompt = \"You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n", + "link_system_prompt += \"You should respond in JSON as in this example:\"\n", + "link_system_prompt += \"\"\"\n", + " EXAMPLE 1:\n", + " {\n", + " \"links\": [\n", + " {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n", + " {\"type\": \"careers page\", \"url\": \"https://another.full.url/careers\"}\n", + " ]\n", + " }\n", + " EXAMPLE 2:\n", + " {\n", + " \"links\": [\n", + " {\"type\": \"company blog\", \"url\": \"https://blog.example.com\"},\n", + " {\"type\": \"our story\", \"url\": \"https://example.com/our-story\"}\n", + " ]\n", + " }\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fcacc2e-7445-4d8a-aa80-489d3a2247ec", + "metadata": {}, + "outputs": [], + "source": [ + "def get_links_user_prompt(url):\n", + " user_prompt = f\"Here is the list of links on the website of {url} - \"\n", + " user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\\n\"\n", + " user_prompt += \"Links (some might be relative links):\\n\"\n", + " links = fetch_website_links(url)\n", + " user_prompt += \"\\n\".join(links[:20])\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfe222c5-0d3e-4be2-85e1-596ab9d407dc", + "metadata": {}, + "outputs": [], + "source": [ + "def get_links(url):\n", + " response = openai.chat.completions.create(\n", + " model = MODEL,\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": link_system_prompt},\n", + " {\"role\": \"user\", \"content\": get_links_user_prompt(url)}\n", + " ],\n", + " response_format = {\"type\": \"json_object\"}\n", + " )\n", + " result = response.choices[0].message.content\n", + " return json.loads(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c964bdce-be5d-41c7-a8d7-8e25e58463c5", + "metadata": {}, + "outputs": [], + "source": [ + "def get_all_details(url):\n", + " result = \"Landing page:\\n\"\n", + " result += fetch_website_contents(url)\n", + " links = get_links(url)\n", + "\n", + " for link in links[\"links\"]:\n", + " result += f\"{link['type']}\\n\"\n", + " try:\n", + " result += f\"\\n\\n### Link: Link: {link['type']}\\n\"\n", + " result += fetch_website_contents(link[\"url\"])\n", + " except Exception as e:\n", + " print(f\"Omitted link: {link['url']}: {e}\")\n", + " continue\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5337019a-b789-49d7-bf10-0f15148c0276", + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = (\n", + " \"You are an assistant that analyzes the contents of several relevant pages from a company website \"\n", + " \"and creates a great type of brochure about the company for prospective customers, investors, and recruits. \"\n", + " \"Respond in markdown. Include details of company culture, customers, and careers/jobs if you have the information. Add emoticons where ever possible.\\n\\n\"\n", + "\n", + " \"Please structure the brochure using the following sections:\\n\"\n", + " \"1. **Introduction**: A brief overview of the company.\\n\"\n", + " \"2. **Company Culture**: Emphasize fun, atmosphere, and any unique cultural elements.\\n\"\n", + " \"3. **Customers**: Mention notable customers or industries.\\n\"\n", + " \"4. **Careers/Jobs**: Highlight career opportunities.\\n\"\n", + " \"5. **Conclusion**: Wrap up with a final lighthearted message.\\n\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dd4f2d4-8189-452a-b15a-c09ae5894ac8", + "metadata": {}, + "outputs": [], + "source": [ + "def get_brochure_user_prompt(company_name, url):\n", + " user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n", + " user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n", + " user_prompt += get_all_details(url)\n", + " user_prompt = user_prompt[:20000]\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ab4bfef-eb22-43fb-8a46-f1f6a225793b", + "metadata": {}, + "outputs": [], + "source": [ + "def stream_brochure():\n", + " global brochure_text\n", + " brochure_text = \"\"\n", + "\n", + " stream = openai.chat.completions.create(\n", + " model = MODEL,\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n", + " ],\n", + " stream = True\n", + " )\n", + "\n", + " response = \"\"\n", + " display_handle = display(Markdown(\"\"), display_id = True)\n", + " for chunk in stream:\n", + " content = chunk.choices[0].delta.content or ''\n", + " response += content\n", + " brochure_text += content\n", + " response = response.replace(\"```\", \"\"). replace(\"markdown\", \"\")\n", + " update_display(Markdown(response), display_id = display_handle.display_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7828c747-7872-48e2-b3e6-faab95ba76cb", + "metadata": {}, + "outputs": [], + "source": [ + "def user_translate_brochure(language):\n", + " clear_output(wait = True)\n", + "\n", + " translation_stream = openai.chat.completions.create(\n", + " model = MODEL,\n", + " messages = [\n", + " {\"role\": \"user\", \"content\": f\"Translate the following to {language}:\\n {brochure_text}\"}\n", + " ],\n", + " stream = True\n", + " )\n", + "\n", + " display_handle = display(Markdown(\"\"), display_id = True)\n", + " translated_text = \"\"\n", + "\n", + " for chunk in translation_stream:\n", + " content = chunk.choices[0].delta.content or \"\"\n", + " if content:\n", + " translated_text += content\n", + " update_display(Markdown(translated_text), display_id = display_handle.display_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6cfa92a-8a86-485d-a7e1-1651705ee6dc", + "metadata": {}, + "outputs": [], + "source": [ + "stream_brochure()\n", + "language_choice = input(\"Enter the language to translate the brochure into (e.g., 'French'): \")\n", + "user_translate_brochure(language_choice)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}