Merge pull request #704 from sevriugin/feat/menu-parser

feat: llm restaurant menu parser
This commit is contained in:
Ed Donner
2025-10-07 15:57:44 -04:00
committed by GitHub
2 changed files with 291 additions and 0 deletions

View File

@@ -0,0 +1,259 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-10-02T18:07:54.689902Z",
"start_time": "2025-10-02T18:07:54.330580Z"
}
},
"source": [
"import os\n",
"import json\n",
"from dotenv import load_dotenv\n",
"from IPython.display import Markdown, display\n",
"from openai import OpenAI\n",
"\n",
"from website import Website"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:07:58.182655Z",
"start_time": "2025-10-02T18:07:58.176747Z"
}
},
"cell_type": "code",
"source": [
"link_system_prompt = \"You are provided with a list of links found on a Italian restaurant webpage. \\\n",
"You are able to decide which of the links would be most relevant to include in the restaurant menu, \\\n",
"such as links to an menu pdf file, Menù page, Piatti, or Bevande.\\n\"\n",
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
"link_system_prompt += \"\"\"\n",
"{\n",
" \"links\": [\n",
" {\"type\": \"menu pdf\", \"url\": \"https://www.ristoranteapprodo.com/Documenti/MenuEstivo2024.pdf\"},\n",
" {\"type\": \"menu page\", \"url\": \"https://www.giocapizza.com/men%C3%B9\"}\n",
" ]\n",
"}\n",
"\"\"\""
],
"id": "ff5d21dc8dd6bd29",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:01.823456Z",
"start_time": "2025-10-02T18:08:01.119076Z"
}
},
"cell_type": "code",
"source": [
"load_dotenv(override=True)\n",
"api_key = os.getenv('OPENAI_API_KEY')\n",
"\n",
"if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:\n",
" print(\"API key looks good so far\")\n",
"else:\n",
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
"\n",
"MODEL = 'gpt-4o-mini'\n",
"openai = OpenAI()\n",
"\n",
"ed = Website(\"https://www.giocapizza.com/\")\n",
"print(ed.links)"
],
"id": "bae61e79319ead26",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key looks good so far\n",
"['https://www.giocapizza.com', 'tel:349-6705657', 'https://www.instagram.com/giocapizza/', 'https://www.facebook.com/giocapizza/', 'https://www.tripadvisor.it/Restaurant_Review-g2337656-d17784755-Reviews-Gioca_Pizza-Adrara_San_Martino_Province_of_Bergamo_Lombardy.html', 'https://www.youtube.com/@GiocaPizza', 'https://www.pinterest.jp/giocapizza/', 'https://www.giocapizza.com', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/servizi', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'mailto:giocapizza@gmail.com', 'http://www.sinapsisnc.com']\n"
]
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:05.104624Z",
"start_time": "2025-10-02T18:08:05.102463Z"
}
},
"cell_type": "code",
"source": [
"def get_links_user_prompt(website):\n",
" user_prompt = f\"Here is the list of links on the italian restaurant website of {website.url} - \"\n",
" user_prompt += \"please decide which of these are relevant web links for the restaurant menu, respond with the full https URL in JSON format.\"\n",
" user_prompt += \"Links (some might be relative links):\\n\"\n",
" user_prompt += \"\\n\".join(website.links)\n",
" return user_prompt\n"
],
"id": "1b5a43ae68ed636",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:08.740268Z",
"start_time": "2025-10-02T18:08:08.734461Z"
}
},
"cell_type": "code",
"source": [
"def get_links(url):\n",
" website = Website(url)\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
" ],\n",
" response_format={\"type\": \"json_object\"}\n",
" )\n",
" result = response.choices[0].message.content\n",
" return json.loads(result)\n"
],
"id": "69e91ccd319153f7",
"outputs": [],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:15.402276Z",
"start_time": "2025-10-02T18:08:15.397800Z"
}
},
"cell_type": "code",
"source": [
"def get_all_details(url):\n",
" result = \"Landing page:\\n\"\n",
" result += Website(url).get_contents()\n",
" links = get_links(url)\n",
" print(\"Found links:\", links)\n",
" for link in links[\"links\"]:\n",
" result += f\"\\n\\n{link['type']}\\n\"\n",
" result += Website(link[\"url\"]).get_contents()\n",
" return result\n"
],
"id": "e76a1deea9a05353",
"outputs": [],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:18.390851Z",
"start_time": "2025-10-02T18:08:18.387630Z"
}
},
"cell_type": "code",
"source": [
"system_prompt = \"You are an assistant that analyzes the contents of several menu pages from an italian restaurant website \\\n",
"and creates restaurant menu with dishes and prices in Euro. Respond in markdown.\"\n",
"\n",
"def get_restaurant_menu_user_prompt(company_name, url):\n",
" user_prompt = f\"You are looking at a restaurant called: {company_name}\\n\"\n",
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a restaurant menu in markdown.\\n\"\n",
" user_prompt += get_all_details(url)\n",
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
" return user_prompt\n"
],
"id": "5f60f05dab091ec7",
"outputs": [],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:20.804552Z",
"start_time": "2025-10-02T18:08:20.800766Z"
}
},
"cell_type": "code",
"source": [
"def create_restaurant_menu(company_name, url):\n",
" response = openai.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": get_restaurant_menu_user_prompt(company_name, url)}\n",
" ],\n",
" )\n",
" result = response.choices[0].message.content\n",
" display(Markdown(result))"
],
"id": "32c64d933b194bc7",
"outputs": [],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-10-02T18:08:55.009134Z",
"start_time": "2025-10-02T18:08:32.164709Z"
}
},
"cell_type": "code",
"source": "create_restaurant_menu(\"La Cascina\", \"https://www.lacascinacredaro.it/\")",
"id": "19bbd3984732895d",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found links: {'links': [{'type': 'piatti', 'url': 'http://www.byserviziinternet.com/cascina/#piatti'}]}\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "# La Cascina Ristorante Pizzeria Menu\n\n## Antipasti (Starters)\n- **Bruschetta al Pomodoro** - €5.00 \n Grilled bread topped with fresh tomatoes, garlic, and basil.\n\n- **Crostini Toscani** - €7.00 \n Toasted bread with traditional chicken liver pâté.\n\n- **Tagliere di Salumi** - €9.00 \n Selection of cured meats served with pickles and bread.\n\n## Primi Piatti (First Courses)\n- **Gnocchetti di Patate con Erbette** - €10.00 \n Potato gnocchi with a blend of seasonal greens.\n\n- **Paccheri con Polipetti** - €12.00 \n Large tubular pasta with baby octopus in a tomato sauce.\n\n- **Risotto ai Frutti di Mare** - €15.00 \n Arborio rice cooked with fresh seafood.\n\n- **Tagliolini al Tartufo** - €14.00 \n Homemade tagliolini pasta with truffle sauce.\n\n- **Zuppa di Cipolle** - €8.00 \n Traditional onion soup topped with melted cheese.\n\n## Secondi Piatti (Main Courses)\n- **Filetto di Manzo** - €18.00 \n Grilled beef fillet served with a side of seasonal vegetables.\n\n- **Pollo alla Griglia** - €12.00 \n Grilled chicken breast served with rosemary potatoes.\n\n- **Branzino al Forno** - €17.00 \n Oven-baked sea bass served with a lemon-herb sauce.\n\n## Pizze (Pizzas)\n- **Margherita** - €8.00 \n Classic pizza with tomato sauce, mozzarella, and basil.\n\n- **Diavola** - €10.00 \n Spicy salami pizza with tomato sauce and mozzarella.\n\n- **Funghi e Prosciutto** - €11.00 \n Pizza topped with mushrooms and ham.\n\n- **Vegetariana** - €9.50 \n Mixed vegetable pizza with mozzarella.\n\n## Dessert\n- **Tiramisu** - €5.00 \n Classic coffee-flavored Italian dessert.\n\n- **Panna Cotta** - €5.50 \n Creamy dessert served with berry sauce.\n\n- **Gelato** - €4.00 \n Selection of homemade ice creams.\n\n## Bevande (Beverages)\n- **Acqua Naturale / Frizzante** - €2.50 \n Still or sparkling water.\n\n- **Birra Artigianale** - €4.00 \n Local craft beer.\n\n- **Vino della Casa** - €5.50 / glass \n House wine selection.\n\nFor reservations or inquiries, please 
contact us at +39 035 936383. \n**Address:** Via L. Cadorna, 9, 24060 - Credaro (BG) \n**Closed on Wednesdays**."
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
}
],
"execution_count": 11
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
# A class to represent a Webpage
# Some websites only respond properly when the request carries browser-like headers.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}
class Website:
    """
    A scraped web page: its title, visible body text and the hrefs of its links.

    All attributes are filled in by the constructor, which downloads and parses the page.
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and extract its title, text and links.

        Args:
            url: address of the page to fetch.
            timeout: seconds passed to `requests.get` as its timeout, so an
                unresponsive server cannot block the caller forever.

        Raises:
            requests.exceptions.RequestException: propagated from `requests.get`
                (connection failures, timeouts, invalid URLs).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = self._extract_title(soup)
        if soup.body:
            # Remove these elements before the body text is extracted.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Keep only anchors that actually carry an href; values are stored as
        # found in the page, so relative links stay relative.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    @staticmethod
    def _extract_title(soup):
        """Return the page title, or "No title found" when the title tag is missing or has no string."""
        title_tag = soup.title
        # `.string` can be None even when the tag exists; fall back instead of storing None.
        title_text = title_tag.string if title_tag is not None else None
        return title_text if title_text else "No title found"

    def get_contents(self):
        """Return the title and body text formatted as one labelled block of text."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"