Merge pull request #704 from sevriugin/feat/menu-parser
feat: llm restaurant menu parser
This commit is contained in:
259
week1/community-contributions/menu-parser/menu_parser.ipynb
Normal file
259
week1/community-contributions/menu-parser/menu_parser.ipynb
Normal file
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:07:54.689902Z",
|
||||
"start_time": "2025-10-02T18:07:54.330580Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"from website import Website"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:07:58.182655Z",
|
||||
"start_time": "2025-10-02T18:07:58.176747Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"link_system_prompt = \"You are provided with a list of links found on an Italian restaurant webpage. \\\n",
|
||||
"You are able to decide which of the links would be most relevant to include in the restaurant menu, \\\n",
|
||||
"such as links to a menu pdf file, Menù page, Piatti, or Bevande.\\n\"\n",
|
||||
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
|
||||
"link_system_prompt += \"\"\"\n",
|
||||
"{\n",
|
||||
" \"links\": [\n",
|
||||
" {\"type\": \"menu pdf\", \"url\": \"https://www.ristoranteapprodo.com/Documenti/MenuEstivo2024.pdf\"},\n",
|
||||
" {\"type\": \"menu page\", \"url\": \"https://www.giocapizza.com/men%C3%B9\"}\n",
|
||||
" ]\n",
|
||||
"}\n",
|
||||
"\"\"\""
|
||||
],
|
||||
"id": "ff5d21dc8dd6bd29",
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:01.823456Z",
|
||||
"start_time": "2025-10-02T18:08:01.119076Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:\n",
|
||||
" print(\"API key looks good so far\")\n",
|
||||
"else:\n",
|
||||
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
|
||||
"\n",
|
||||
"MODEL = 'gpt-4o-mini'\n",
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://www.giocapizza.com/\")\n",
|
||||
"print(ed.links)"
|
||||
],
|
||||
"id": "bae61e79319ead26",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"API key looks good so far\n",
|
||||
"['https://www.giocapizza.com', 'tel:349-6705657', 'https://www.instagram.com/giocapizza/', 'https://www.facebook.com/giocapizza/', 'https://www.tripadvisor.it/Restaurant_Review-g2337656-d17784755-Reviews-Gioca_Pizza-Adrara_San_Martino_Province_of_Bergamo_Lombardy.html', 'https://www.youtube.com/@GiocaPizza', 'https://www.pinterest.jp/giocapizza/', 'https://www.giocapizza.com', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/servizi', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'mailto:giocapizza@gmail.com', 'http://www.sinapsisnc.com']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 4
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:05.104624Z",
|
||||
"start_time": "2025-10-02T18:08:05.102463Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_links_user_prompt(website):\n",
|
||||
" user_prompt = f\"Here is the list of links on the italian restaurant website of {website.url} - \"\n",
|
||||
" user_prompt += \"please decide which of these are relevant web links for the restaurant menu, respond with the full https URL in JSON format.\"\n",
|
||||
" user_prompt += \"Links (some might be relative links):\\n\"\n",
|
||||
" user_prompt += \"\\n\".join(website.links)\n",
|
||||
" return user_prompt\n"
|
||||
],
|
||||
"id": "1b5a43ae68ed636",
|
||||
"outputs": [],
|
||||
"execution_count": 5
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:08.740268Z",
|
||||
"start_time": "2025-10-02T18:08:08.734461Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_links(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
|
||||
" ],\n",
|
||||
" response_format={\"type\": \"json_object\"}\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" return json.loads(result)\n"
|
||||
],
|
||||
"id": "69e91ccd319153f7",
|
||||
"outputs": [],
|
||||
"execution_count": 6
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:15.402276Z",
|
||||
"start_time": "2025-10-02T18:08:15.397800Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_all_details(url):\n",
|
||||
" result = \"Landing page:\\n\"\n",
|
||||
" result += Website(url).get_contents()\n",
|
||||
" links = get_links(url)\n",
|
||||
" print(\"Found links:\", links)\n",
|
||||
" for link in links[\"links\"]:\n",
|
||||
" result += f\"\\n\\n{link['type']}\\n\"\n",
|
||||
" result += Website(link[\"url\"]).get_contents()\n",
|
||||
" return result\n"
|
||||
],
|
||||
"id": "e76a1deea9a05353",
|
||||
"outputs": [],
|
||||
"execution_count": 8
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:18.390851Z",
|
||||
"start_time": "2025-10-02T18:08:18.387630Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of several menu pages from an italian restaurant website \\\n",
|
||||
"and creates restaurant menu with dishes and prices in Euro. Respond in markdown.\"\n",
|
||||
"\n",
|
||||
"def get_restaurant_menu_user_prompt(company_name, url):\n",
|
||||
" user_prompt = f\"You are looking at a restaurant called: {company_name}\\n\"\n",
|
||||
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a restaurant menu in markdown.\\n\"\n",
|
||||
" user_prompt += get_all_details(url)\n",
|
||||
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
||||
" return user_prompt\n"
|
||||
],
|
||||
"id": "5f60f05dab091ec7",
|
||||
"outputs": [],
|
||||
"execution_count": 9
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:20.804552Z",
|
||||
"start_time": "2025-10-02T18:08:20.800766Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def create_restaurant_menu(company_name, url):\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": get_restaurant_menu_user_prompt(company_name, url)}\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" result = response.choices[0].message.content\n",
|
||||
" display(Markdown(result))"
|
||||
],
|
||||
"id": "32c64d933b194bc7",
|
||||
"outputs": [],
|
||||
"execution_count": 10
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-10-02T18:08:55.009134Z",
|
||||
"start_time": "2025-10-02T18:08:32.164709Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "create_restaurant_menu(\"La Cascina\", \"https://www.lacascinacredaro.it/\")",
|
||||
"id": "19bbd3984732895d",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found links: {'links': [{'type': 'piatti', 'url': 'http://www.byserviziinternet.com/cascina/#piatti'}]}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
],
|
||||
"text/markdown": "# La Cascina – Ristorante Pizzeria Menu\n\n## Antipasti (Starters)\n- **Bruschetta al Pomodoro** - €5.00 \n Grilled bread topped with fresh tomatoes, garlic, and basil.\n\n- **Crostini Toscani** - €7.00 \n Toasted bread with traditional chicken liver pâté.\n\n- **Tagliere di Salumi** - €9.00 \n Selection of cured meats served with pickles and bread.\n\n## Primi Piatti (First Courses)\n- **Gnocchetti di Patate con Erbette** - €10.00 \n Potato gnocchi with a blend of seasonal greens.\n\n- **Paccheri con Polipetti** - €12.00 \n Large tubular pasta with baby octopus in a tomato sauce.\n\n- **Risotto ai Frutti di Mare** - €15.00 \n Arborio rice cooked with fresh seafood.\n\n- **Tagliolini al Tartufo** - €14.00 \n Homemade tagliolini pasta with truffle sauce.\n\n- **Zuppa di Cipolle** - €8.00 \n Traditional onion soup topped with melted cheese.\n\n## Secondi Piatti (Main Courses)\n- **Filetto di Manzo** - €18.00 \n Grilled beef fillet served with a side of seasonal vegetables.\n\n- **Pollo alla Griglia** - €12.00 \n Grilled chicken breast served with rosemary potatoes.\n\n- **Branzino al Forno** - €17.00 \n Oven-baked sea bass served with a lemon-herb sauce.\n\n## Pizze (Pizzas)\n- **Margherita** - €8.00 \n Classic pizza with tomato sauce, mozzarella, and basil.\n\n- **Diavola** - €10.00 \n Spicy salami pizza with tomato sauce and mozzarella.\n\n- **Funghi e Prosciutto** - €11.00 \n Pizza topped with mushrooms and ham.\n\n- **Vegetariana** - €9.50 \n Mixed vegetable pizza with mozzarella.\n\n## Dessert\n- **Tiramisu** - €5.00 \n Classic coffee-flavored Italian dessert.\n\n- **Panna Cotta** - €5.50 \n Creamy dessert served with berry sauce.\n\n- **Gelato** - €4.00 \n Selection of homemade ice creams.\n\n## Bevande (Beverages)\n- **Acqua Naturale / Frizzante** - €2.50 \n Still or sparkling water.\n\n- **Birra Artigianale** - €4.00 \n Local craft beer.\n\n- **Vino della Casa** - €5.50 / glass \n House wine selection.\n\nFor reservations or inquiries, 
please contact us at +39 035 936383. \n**Address:** Via L. Cadorna, 9, 24060 - Credaro (BG) \n**Closed on Wednesdays**."
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data",
|
||||
"jetTransient": {
|
||||
"display_id": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"execution_count": 11
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
32
week1/community-contributions/menu-parser/website.py
Normal file
32
week1/community-contributions/menu-parser/website.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# A class to represent a Webpage
|
||||
|
||||
# Certain sites only respond properly when the request carries browser-like
# headers, so every fetch sends this User-Agent:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}
|
||||
|
||||
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the URL that was requested.
        body:  the raw response bytes.
        title: the text of the page's <title>, or "No title found".
        text:  the text of <body> with script/style/img/input elements
               removed, one fragment per line ("" when there is no <body>).
        links: every non-empty href found on an <a> element, exactly as
               written in the page (relative links are not resolved and
               duplicates are kept).
    """

    def __init__(self, url, timeout=30):
        """
        Fetch *url* and parse its HTML.

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server before giving up;
                passed straight to ``requests.get``. Without it a stalled
                server would block the caller indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``soup.title.string`` is None when <title> is empty or contains
        # nested markup; fall back so the title is always a usable string.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually carry a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and text as one labelled string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
|
||||
Reference in New Issue
Block a user