Merge pull request #704 from sevriugin/feat/menu-parser

feat: llm restaurant menu parser
2025-10-07 15:57:44 -04:00
parent 560ee89b26 61ccef865a
commit 211b2cbb0a
2 changed files with 291 additions and 0 deletions
--- a/week1/community-contributions/menu-parser/menu_parser.ipynb
+++ b/week1/community-contributions/menu-parser/menu_parser.ipynb
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:07:54.689902Z",
+     "start_time": "2025-10-02T18:07:54.330580Z"
+    }
+   },
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from dotenv import load_dotenv\n",
+    "from IPython.display import Markdown, display\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "from website import Website"
+   ],
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:07:58.182655Z",
+     "start_time": "2025-10-02T18:07:58.176747Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "link_system_prompt = \"You are provided with a list of links found on an Italian restaurant webpage. \\\n",
+    "You are able to decide which of the links would be most relevant to include in the restaurant menu, \\\n",
+    "such as links to a menu pdf file, Menù page, Piatti, or Bevande.\\n\"\n",
+    "link_system_prompt += \"You should respond in JSON as in this example:\"\n",
+    "link_system_prompt += \"\"\"\n",
+    "{\n",
+    "    \"links\": [\n",
+    "        {\"type\": \"menu pdf\", \"url\": \"https://www.ristoranteapprodo.com/Documenti/MenuEstivo2024.pdf\"},\n",
+    "        {\"type\": \"menu page\", \"url\": \"https://www.giocapizza.com/men%C3%B9\"}\n",
+    "    ]\n",
+    "}\n",
+    "\"\"\""
+   ],
+   "id": "ff5d21dc8dd6bd29",
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:01.823456Z",
+     "start_time": "2025-10-02T18:08:01.119076Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "load_dotenv(override=True)\n",
+    "api_key = os.getenv('OPENAI_API_KEY')\n",
+    "\n",
+    "if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:\n",
+    "    print(\"API key looks good so far\")\n",
+    "else:\n",
+    "    print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
+    "\n",
+    "MODEL = 'gpt-4o-mini'\n",
+    "openai = OpenAI()\n",
+    "\n",
+    "ed = Website(\"https://www.giocapizza.com/\")\n",
+    "print(ed.links)"
+   ],
+   "id": "bae61e79319ead26",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "API key looks good so far\n",
+      "['https://www.giocapizza.com', 'tel:349-6705657', 'https://www.instagram.com/giocapizza/', 'https://www.facebook.com/giocapizza/', 'https://www.tripadvisor.it/Restaurant_Review-g2337656-d17784755-Reviews-Gioca_Pizza-Adrara_San_Martino_Province_of_Bergamo_Lombardy.html', 'https://www.youtube.com/@GiocaPizza', 'https://www.pinterest.jp/giocapizza/', 'https://www.giocapizza.com', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/servizi', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'mailto:giocapizza@gmail.com', 'http://www.sinapsisnc.com']\n"
+     ]
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:05.104624Z",
+     "start_time": "2025-10-02T18:08:05.102463Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def get_links_user_prompt(website):\n",
+    "    user_prompt = f\"Here is the list of links on the Italian restaurant website of {website.url} - \"\n",
+    "    user_prompt += \"please decide which of these are relevant web links for the restaurant menu, respond with the full https URL in JSON format.\\n\"\n",
+    "    user_prompt += \"Links (some might be relative links):\\n\"\n",
+    "    user_prompt += \"\\n\".join(website.links)\n",
+    "    return user_prompt\n"
+   ],
+   "id": "1b5a43ae68ed636",
+   "outputs": [],
+   "execution_count": 5
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:08.740268Z",
+     "start_time": "2025-10-02T18:08:08.734461Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def get_links(url):\n",
+    "    website = Website(url)\n",
+    "    response = openai.chat.completions.create(\n",
+    "        model=MODEL,\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": link_system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
+    "      ],\n",
+    "        response_format={\"type\": \"json_object\"}\n",
+    "    )\n",
+    "    result = response.choices[0].message.content\n",
+    "    return json.loads(result)\n"
+   ],
+   "id": "69e91ccd319153f7",
+   "outputs": [],
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:15.402276Z",
+     "start_time": "2025-10-02T18:08:15.397800Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def get_all_details(url):\n",
+    "    result = \"Landing page:\\n\"\n",
+    "    result += Website(url).get_contents()\n",
+    "    links = get_links(url)\n",
+    "    print(\"Found links:\", links)\n",
+    "    for link in links[\"links\"]:\n",
+    "        result += f\"\\n\\n{link['type']}\\n\"\n",
+    "        result += Website(link[\"url\"]).get_contents()\n",
+    "    return result\n"
+   ],
+   "id": "e76a1deea9a05353",
+   "outputs": [],
+   "execution_count": 8
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:18.390851Z",
+     "start_time": "2025-10-02T18:08:18.387630Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "system_prompt = \"You are an assistant that analyzes the contents of several menu pages from an Italian restaurant website \\\n",
+    "and creates a restaurant menu with dishes and prices in Euro. Respond in markdown.\"\n",
+    "\n",
+    "def get_restaurant_menu_user_prompt(company_name, url):\n",
+    "    user_prompt = f\"You are looking at a restaurant called: {company_name}\\n\"\n",
+    "    user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a restaurant menu in markdown.\\n\"\n",
+    "    user_prompt += get_all_details(url)\n",
+    "    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
+    "    return user_prompt\n"
+   ],
+   "id": "5f60f05dab091ec7",
+   "outputs": [],
+   "execution_count": 9
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:20.804552Z",
+     "start_time": "2025-10-02T18:08:20.800766Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def create_restaurant_menu(company_name, url):\n",
+    "    response = openai.chat.completions.create(\n",
+    "        model=MODEL,\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": get_restaurant_menu_user_prompt(company_name, url)}\n",
+    "          ],\n",
+    "    )\n",
+    "    result = response.choices[0].message.content\n",
+    "    display(Markdown(result))"
+   ],
+   "id": "32c64d933b194bc7",
+   "outputs": [],
+   "execution_count": 10
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-10-02T18:08:55.009134Z",
+     "start_time": "2025-10-02T18:08:32.164709Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "create_restaurant_menu(\"La Cascina\", \"https://www.lacascinacredaro.it/\")",
+   "id": "19bbd3984732895d",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found links: {'links': [{'type': 'piatti', 'url': 'http://www.byserviziinternet.com/cascina/#piatti'}]}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ],
+      "text/markdown": "# La Cascina – Ristorante Pizzeria Menu\n\n## Antipasti (Starters)\n- **Bruschetta al Pomodoro** - €5.00  \n  Grilled bread topped with fresh tomatoes, garlic, and basil.\n\n- **Crostini Toscani** - €7.00  \n  Toasted bread with traditional chicken liver pâté.\n\n- **Tagliere di Salumi** - €9.00  \n  Selection of cured meats served with pickles and bread.\n\n## Primi Piatti (First Courses)\n- **Gnocchetti di Patate con Erbette** - €10.00  \n  Potato gnocchi with a blend of seasonal greens.\n\n- **Paccheri con Polipetti** - €12.00  \n  Large tubular pasta with baby octopus in a tomato sauce.\n\n- **Risotto ai Frutti di Mare** - €15.00  \n  Arborio rice cooked with fresh seafood.\n\n- **Tagliolini al Tartufo** - €14.00  \n  Homemade tagliolini pasta with truffle sauce.\n\n- **Zuppa di Cipolle** - €8.00  \n  Traditional onion soup topped with melted cheese.\n\n## Secondi Piatti (Main Courses)\n- **Filetto di Manzo** - €18.00  \n  Grilled beef fillet served with a side of seasonal vegetables.\n\n- **Pollo alla Griglia** - €12.00  \n  Grilled chicken breast served with rosemary potatoes.\n\n- **Branzino al Forno** - €17.00  \n  Oven-baked sea bass served with a lemon-herb sauce.\n\n## Pizze (Pizzas)\n- **Margherita** - €8.00  \n  Classic pizza with tomato sauce, mozzarella, and basil.\n\n- **Diavola** - €10.00  \n  Spicy salami pizza with tomato sauce and mozzarella.\n\n- **Funghi e Prosciutto** - €11.00  \n  Pizza topped with mushrooms and ham.\n\n- **Vegetariana** - €9.50  \n  Mixed vegetable pizza with mozzarella.\n\n## Dessert\n- **Tiramisu** - €5.00  \n  Classic coffee-flavored Italian dessert.\n\n- **Panna Cotta** - €5.50  \n  Creamy dessert served with berry sauce.\n\n- **Gelato** - €4.00  \n  Selection of homemade ice creams.\n\n## Bevande (Beverages)\n- **Acqua Naturale / Frizzante** - €2.50  \n  Still or sparkling water.\n\n- **Birra Artigianale** - €4.00  \n  Local craft beer.\n\n- **Vino della Casa** - €5.50 / glass  \n  House wine 
selection.\n\nFor reservations or inquiries, please contact us at +39 035 936383.  \n**Address:** Via L. Cadorna, 9, 24060 - Credaro (BG)  \n**Closed on Wednesdays**."
+     },
+     "metadata": {},
+     "output_type": "display_data",
+     "jetTransient": {
+      "display_id": null
+     }
+    }
+   ],
+   "execution_count": 11
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/week1/community-contributions/menu-parser/website.py
+++ b/week1/community-contributions/menu-parser/website.py
@@ -0,0 +1,32 @@
+import requests
+from bs4 import BeautifulSoup
+
+# Support code for scraping a single web page; the Website class is defined below.
+
+# Some websites need you to use proper headers when fetching them, so the
+# Website class sends this browser-like User-Agent with its request.
+headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
+}
+
+class Website:
+    """
+    A utility class to represent a Website that we have scraped, now with links.
+
+    Constructing an instance performs the HTTP GET and parses the response
+    straight away.
+
+    Attributes set by ``__init__``:
+        url: the URL that was requested, exactly as passed in.
+        body: the raw response bytes (``response.content``).
+        title: text of the page's ``<title>``, or "No title found".
+        text: text of ``<body>`` with script/style/img/input elements removed,
+            fragments joined by newlines; "" when the page has no ``<body>``.
+        links: every non-empty ``href`` taken from the page's ``<a>`` tags, in
+            document order and unmodified (entries may be relative,
+            duplicated, or non-http schemes such as ``mailto:``).
+    """
+
+    def __init__(self, url, timeout=10):
+        """
+        Fetch ``url`` and extract its title, visible text and links.
+
+        Args:
+            url: address of the page to download.
+            timeout: seconds passed to ``requests.get`` as its ``timeout``;
+                ``None`` restores the previous wait-forever behaviour.
+
+        Raises:
+            requests.exceptions.RequestException: if the request fails or
+                times out.
+        """
+        self.url = url
+        # Without a timeout, requests blocks indefinitely on a server that
+        # accepts the connection but never answers.
+        response = requests.get(url, headers=headers, timeout=timeout)
+        self.body = response.content
+        soup = BeautifulSoup(self.body, 'html.parser')
+
+        # soup.title.string is None when <title> is empty or contains nested
+        # markup; fall back so the title never renders as the text "None".
+        title = soup.title.string if soup.title else None
+        self.title = title if title else "No title found"
+
+        if soup.body:
+            # Drop elements that carry no readable text before extracting it.
+            for irrelevant in soup.body(["script", "style", "img", "input"]):
+                irrelevant.decompose()
+            self.text = soup.body.get_text(separator="\n", strip=True)
+        else:
+            self.text = ""
+
+        # Keep only anchors that actually carry a non-empty href.
+        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
+        self.links = [href for href in hrefs if href]
+
+    def get_contents(self):
+        """Return the page title and text as one labelled block of plain text."""
+        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"