From cb992c9d91c9870e7c796855c7a1d3fa0b47b270 Mon Sep 17 00:00:00 2001 From: Sergei Sevriugin Date: Fri, 3 Oct 2025 07:16:17 +0200 Subject: [PATCH 1/2] feat: week 1 addon italian restaurant menu parser --- week1/menu_parser.ipynb | 259 ++++++++++++++++++++++++++++++++++++++++ week1/website.py | 32 +++++ 2 files changed, 291 insertions(+) create mode 100644 week1/menu_parser.ipynb create mode 100644 week1/website.py diff --git a/week1/menu_parser.ipynb b/week1/menu_parser.ipynb new file mode 100644 index 0000000..96bdb22 --- /dev/null +++ b/week1/menu_parser.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2025-10-02T18:07:54.689902Z", + "start_time": "2025-10-02T18:07:54.330580Z" + } + }, + "source": [ + "import os\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from IPython.display import Markdown, display\n", + "from openai import OpenAI\n", + "\n", + "from website import Website" + ], + "outputs": [], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:07:58.182655Z", + "start_time": "2025-10-02T18:07:58.176747Z" + } + }, + "cell_type": "code", + "source": [ + "link_system_prompt = \"You are provided with a list of links found on a Italian restaurant webpage. \\\n", + "You are able to decide which of the links would be most relevant to include in the restaurant menu, \\\n", + "such as links to an menu pdf file, Menù page, Piatti, or Bevande.\\n\"\n", + "link_system_prompt += \"You should respond in JSON as in this example:\"\n", + "link_system_prompt += \"\"\"\n", + "{\n", + " \"links\": [\n", + " {\"type\": \"menu pdf\", \"url\": \"https://www.ristoranteapprodo.com/Documenti/MenuEstivo2024.pdf\"},\n", + " {\"type\": \"menu page\", \"url\": \"https://www.giocapizza.com/men%C3%B9\"}\n", + " ]\n", + "}\n", + "\"\"\"" + ], + "id": "ff5d21dc8dd6bd29", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:01.823456Z", + "start_time": "2025-10-02T18:08:01.119076Z" + } + }, + "cell_type": "code", + "source": [ + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:\n", + " print(\"API key looks good so far\")\n", + "else:\n", + " print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n", + "\n", + "MODEL = 'gpt-4o-mini'\n", + "openai = OpenAI()\n", + "\n", + "ed = Website(\"https://www.giocapizza.com/\")\n", + "print(ed.links)" + ], + "id": "bae61e79319ead26", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key looks good so far\n", + "['https://www.giocapizza.com', 'tel:349-6705657', 'https://www.instagram.com/giocapizza/', 'https://www.facebook.com/giocapizza/', 'https://www.tripadvisor.it/Restaurant_Review-g2337656-d17784755-Reviews-Gioca_Pizza-Adrara_San_Martino_Province_of_Bergamo_Lombardy.html', 'https://www.youtube.com/@GiocaPizza', 'https://www.pinterest.jp/giocapizza/', 'https://www.giocapizza.com', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/servizi', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'mailto:giocapizza@gmail.com', 'http://www.sinapsisnc.com']\n" + ] + } + ], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:05.104624Z", + "start_time": "2025-10-02T18:08:05.102463Z" + } + }, + "cell_type": "code", + "source": [ + "def get_links_user_prompt(website):\n", + " user_prompt = f\"Here is the list of links on the italian restaurant website of {website.url} - \"\n", + " user_prompt += \"please decide which of these are relevant web links for the restaurant menu, respond with the full https URL in JSON format.\"\n", + " user_prompt += \"Links (some might be relative links):\\n\"\n", + " user_prompt += \"\\n\".join(website.links)\n", + " return user_prompt\n" + ], + "id": "1b5a43ae68ed636", + "outputs": [], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:08.740268Z", + "start_time": "2025-10-02T18:08:08.734461Z" + } + }, + "cell_type": "code", + "source": [ + "def get_links(url):\n", + " website = Website(url)\n", + " response = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": link_system_prompt},\n", + " {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n", + " ],\n", + " response_format={\"type\": \"json_object\"}\n", + " )\n", + " result = response.choices[0].message.content\n", + " return json.loads(result)\n" + ], + "id": "69e91ccd319153f7", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:15.402276Z", + "start_time": "2025-10-02T18:08:15.397800Z" + } + }, + "cell_type": "code", + "source": [ + "def get_all_details(url):\n", + " result = \"Landing page:\\n\"\n", + " result += Website(url).get_contents()\n", + " links = get_links(url)\n", + " print(\"Found links:\", links)\n", + " for link in links[\"links\"]:\n", + " result += f\"\\n\\n{link['type']}\\n\"\n", + " result += Website(link[\"url\"]).get_contents()\n", + " return result\n" + ], + "id": "e76a1deea9a05353", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:18.390851Z", + "start_time": "2025-10-02T18:08:18.387630Z" + } + }, + "cell_type": "code", + "source": [ + "system_prompt = \"You are an assistant that analyzes the contents of several menu pages from an italian restaurant website \\\n", + "and creates restaurant menu with dishes and prices in Euro. Respond in markdown.\"\n", + "\n", + "def get_restaurant_menu_user_prompt(company_name, url):\n", + " user_prompt = f\"You are looking at a restaurant called: {company_name}\\n\"\n", + " user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a restaurant menu in markdown.\\n\"\n", + " user_prompt += get_all_details(url)\n", + " user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n", + " return user_prompt\n" + ], + "id": "5f60f05dab091ec7", + "outputs": [], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:20.804552Z", + "start_time": "2025-10-02T18:08:20.800766Z" + } + }, + "cell_type": "code", + "source": [ + "def create_restaurant_menu(company_name, url):\n", + " response = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": get_restaurant_menu_user_prompt(company_name, url)}\n", + " ],\n", + " )\n", + " result = response.choices[0].message.content\n", + " display(Markdown(result))" + ], + "id": "32c64d933b194bc7", + "outputs": [], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-10-02T18:08:55.009134Z", + "start_time": "2025-10-02T18:08:32.164709Z" + } + }, + "cell_type": "code", + "source": "create_restaurant_menu(\"La Cascina\", \"https://www.lacascinacredaro.it/\")", + "id": "19bbd3984732895d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found links: {'links': [{'type': 'piatti', 'url': 'http://www.byserviziinternet.com/cascina/#piatti'}]}\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/markdown": "# La Cascina – Ristorante Pizzeria Menu\n\n## Antipasti (Starters)\n- **Bruschetta al Pomodoro** - €5.00 \n Grilled bread topped with fresh tomatoes, garlic, and basil.\n\n- **Crostini Toscani** - €7.00 \n Toasted bread with traditional chicken liver pâté.\n\n- **Tagliere di Salumi** - €9.00 \n Selection of cured meats served with pickles and bread.\n\n## Primi Piatti (First Courses)\n- **Gnocchetti di Patate con Erbette** - €10.00 \n Potato gnocchi with a blend of seasonal greens.\n\n- **Paccheri con Polipetti** - €12.00 \n Large tubular pasta with baby octopus in a tomato sauce.\n\n- **Risotto ai Frutti di Mare** - €15.00 \n Arborio rice cooked with fresh seafood.\n\n- **Tagliolini al Tartufo** - €14.00 \n Homemade tagliolini pasta with truffle sauce.\n\n- **Zuppa di Cipolle** - €8.00 \n Traditional onion soup topped with melted cheese.\n\n## Secondi Piatti (Main Courses)\n- **Filetto di Manzo** - €18.00 \n Grilled beef fillet served with a side of seasonal vegetables.\n\n- **Pollo alla Griglia** - €12.00 \n Grilled chicken breast served with rosemary potatoes.\n\n- **Branzino al Forno** - €17.00 \n Oven-baked sea bass served with a lemon-herb sauce.\n\n## Pizze (Pizzas)\n- **Margherita** - €8.00 \n Classic pizza with tomato sauce, mozzarella, and basil.\n\n- **Diavola** - €10.00 \n Spicy salami pizza with tomato sauce and mozzarella.\n\n- **Funghi e Prosciutto** - €11.00 \n Pizza topped with mushrooms and ham.\n\n- **Vegetariana** - €9.50 \n Mixed vegetable pizza with mozzarella.\n\n## Dessert\n- **Tiramisu** - €5.00 \n Classic coffee-flavored Italian dessert.\n\n- **Panna Cotta** - €5.50 \n Creamy dessert served with berry sauce.\n\n- **Gelato** - €4.00 \n Selection of homemade ice creams.\n\n## Bevande (Beverages)\n- **Acqua Naturale / Frizzante** - €2.50 \n Still or sparkling water.\n\n- **Birra Artigianale** - €4.00 \n Local craft beer.\n\n- **Vino della Casa** - €5.50 / glass \n House wine selection.\n\nFor reservations or inquiries, please contact us at +39 035 936383. \n**Address:** Via L. Cadorna, 9, 24060 - Credaro (BG) \n**Closed on Wednesdays**." + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + } + ], + "execution_count": 11 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/website.py b/week1/website.py new file mode 100644 index 0000000..f0e0631 --- /dev/null +++ b/week1/website.py @@ -0,0 +1,32 @@ +import requests +from bs4 import BeautifulSoup + +# A class to represent a Webpage + +# Some websites need you to use proper headers when fetching them: +headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" +} + +class Website: + """ + A utility class to represent a Website that we have scraped, now with links. + """ + + def __init__(self, url): + self.url = url + response = requests.get(url, headers=headers) + self.body = response.content + soup = BeautifulSoup(self.body, 'html.parser') + self.title = soup.title.string if soup.title else "No title found" + if soup.body: + for irrelevant in soup.body(["script", "style", "img", "input"]): + irrelevant.decompose() + self.text = soup.body.get_text(separator="\n", strip=True) + else: + self.text = "" + links = [link.get('href') for link in soup.find_all('a')] + self.links = [link for link in links if link] + + def get_contents(self): + return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n" \ No newline at end of file From 61ccef865aec742961139f91a9c7019093d88460 Mon Sep 17 00:00:00 2001 From: Sergei Sevriugin Date: Fri, 3 Oct 2025 07:45:36 +0200 Subject: [PATCH 2/2] feat: week 1 addon italian restaurant menu parser --- week1/{ => community-contributions/menu-parser}/menu_parser.ipynb | 0 week1/{ => community-contributions/menu-parser}/website.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename week1/{ => community-contributions/menu-parser}/menu_parser.ipynb (100%) rename week1/{ => community-contributions/menu-parser}/website.py (100%) diff --git a/week1/menu_parser.ipynb b/week1/community-contributions/menu-parser/menu_parser.ipynb similarity index 100% rename from week1/menu_parser.ipynb rename to week1/community-contributions/menu-parser/menu_parser.ipynb diff --git a/week1/website.py b/week1/community-contributions/menu-parser/website.py similarity index 100% rename from week1/website.py rename to week1/community-contributions/menu-parser/website.py