feat: week 1 addon italian restaurant menu parser
This commit is contained in:
259
week1/menu_parser.ipynb
Normal file
259
week1/menu_parser.ipynb
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"id": "initial_id",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:07:54.689902Z",
|
||||||
|
"start_time": "2025-10-02T18:07:54.330580Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import json\n",
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"from IPython.display import Markdown, display\n",
|
||||||
|
"from openai import OpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"from website import Website"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:07:58.182655Z",
|
||||||
|
"start_time": "2025-10-02T18:07:58.176747Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"link_system_prompt = \"You are provided with a list of links found on an Italian restaurant webpage. \\\n",
|
||||||
|
"You are able to decide which of the links would be most relevant to include in the restaurant menu, \\\n",
|
||||||
|
"such as links to a menu pdf file, Menù page, Piatti, or Bevande.\\n\"\n",
|
||||||
|
"link_system_prompt += \"You should respond in JSON as in this example:\"\n",
|
||||||
|
"link_system_prompt += \"\"\"\n",
|
||||||
|
"{\n",
|
||||||
|
" \"links\": [\n",
|
||||||
|
" {\"type\": \"menu pdf\", \"url\": \"https://www.ristoranteapprodo.com/Documenti/MenuEstivo2024.pdf\"},\n",
|
||||||
|
" {\"type\": \"menu page\", \"url\": \"https://www.giocapizza.com/men%C3%B9\"}\n",
|
||||||
|
" ]\n",
|
||||||
|
"}\n",
|
||||||
|
"\"\"\""
|
||||||
|
],
|
||||||
|
"id": "ff5d21dc8dd6bd29",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:01.823456Z",
|
||||||
|
"start_time": "2025-10-02T18:08:01.119076Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"load_dotenv(override=True)\n",
|
||||||
|
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||||
|
"\n",
|
||||||
|
"if api_key and api_key.startswith('sk-proj-') and len(api_key) > 10:\n",
|
||||||
|
" print(\"API key looks good so far\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n",
|
||||||
|
"\n",
|
||||||
|
"MODEL = 'gpt-4o-mini'\n",
|
||||||
|
"openai = OpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"ed = Website(\"https://www.giocapizza.com/\")\n",
|
||||||
|
"print(ed.links)"
|
||||||
|
],
|
||||||
|
"id": "bae61e79319ead26",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"API key looks good so far\n",
|
||||||
|
"['https://www.giocapizza.com', 'tel:349-6705657', 'https://www.instagram.com/giocapizza/', 'https://www.facebook.com/giocapizza/', 'https://www.tripadvisor.it/Restaurant_Review-g2337656-d17784755-Reviews-Gioca_Pizza-Adrara_San_Martino_Province_of_Bergamo_Lombardy.html', 'https://www.youtube.com/@GiocaPizza', 'https://www.pinterest.jp/giocapizza/', 'https://www.giocapizza.com', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/servizi', 'https://www.giocapizza.com/menù', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'https://www.giocapizza.com/incorniciate', 'mailto:giocapizza@gmail.com', 'http://www.sinapsisnc.com']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 4
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:05.104624Z",
|
||||||
|
"start_time": "2025-10-02T18:08:05.102463Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def get_links_user_prompt(website):\n",
|
||||||
|
" user_prompt = f\"Here is the list of links on the italian restaurant website of {website.url} - \"\n",
|
||||||
|
" user_prompt += \"please decide which of these are relevant web links for the restaurant menu, respond with the full https URL in JSON format. \"\n",
|
||||||
|
" user_prompt += \"Links (some might be relative links):\\n\"\n",
|
||||||
|
" user_prompt += \"\\n\".join(website.links)\n",
|
||||||
|
" return user_prompt\n"
|
||||||
|
],
|
||||||
|
"id": "1b5a43ae68ed636",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:08.740268Z",
|
||||||
|
"start_time": "2025-10-02T18:08:08.734461Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def get_links(url):\n",
|
||||||
|
" website = Website(url)\n",
|
||||||
|
" response = openai.chat.completions.create(\n",
|
||||||
|
" model=MODEL,\n",
|
||||||
|
" messages=[\n",
|
||||||
|
" {\"role\": \"system\", \"content\": link_system_prompt},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
|
||||||
|
" ],\n",
|
||||||
|
" response_format={\"type\": \"json_object\"}\n",
|
||||||
|
" )\n",
|
||||||
|
" result = response.choices[0].message.content\n",
|
||||||
|
" return json.loads(result)\n"
|
||||||
|
],
|
||||||
|
"id": "69e91ccd319153f7",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 6
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:15.402276Z",
|
||||||
|
"start_time": "2025-10-02T18:08:15.397800Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def get_all_details(url):\n",
|
||||||
|
" result = \"Landing page:\\n\"\n",
|
||||||
|
" result += Website(url).get_contents()\n",
|
||||||
|
" links = get_links(url)\n",
|
||||||
|
" print(\"Found links:\", links)\n",
|
||||||
|
" for link in links[\"links\"]:\n",
|
||||||
|
" result += f\"\\n\\n{link['type']}\\n\"\n",
|
||||||
|
" result += Website(link[\"url\"]).get_contents()\n",
|
||||||
|
" return result\n"
|
||||||
|
],
|
||||||
|
"id": "e76a1deea9a05353",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 8
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:18.390851Z",
|
||||||
|
"start_time": "2025-10-02T18:08:18.387630Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"system_prompt = \"You are an assistant that analyzes the contents of several menu pages from an italian restaurant website \\\n",
|
||||||
|
"and creates a restaurant menu with dishes and prices in Euro. Respond in markdown.\"\n",
|
||||||
|
"\n",
|
||||||
|
"def get_restaurant_menu_user_prompt(company_name, url):\n",
|
||||||
|
" user_prompt = f\"You are looking at a restaurant called: {company_name}\\n\"\n",
|
||||||
|
" user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a restaurant menu in markdown.\\n\"\n",
|
||||||
|
" user_prompt += get_all_details(url)\n",
|
||||||
|
" user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
|
||||||
|
" return user_prompt\n"
|
||||||
|
],
|
||||||
|
"id": "5f60f05dab091ec7",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 9
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:20.804552Z",
|
||||||
|
"start_time": "2025-10-02T18:08:20.800766Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def create_restaurant_menu(company_name, url):\n",
|
||||||
|
" response = openai.chat.completions.create(\n",
|
||||||
|
" model=MODEL,\n",
|
||||||
|
" messages=[\n",
|
||||||
|
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": get_restaurant_menu_user_prompt(company_name, url)}\n",
|
||||||
|
" ],\n",
|
||||||
|
" )\n",
|
||||||
|
" result = response.choices[0].message.content\n",
|
||||||
|
" display(Markdown(result))"
|
||||||
|
],
|
||||||
|
"id": "32c64d933b194bc7",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-10-02T18:08:55.009134Z",
|
||||||
|
"start_time": "2025-10-02T18:08:32.164709Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": "create_restaurant_menu(\"La Cascina\", \"https://www.lacascinacredaro.it/\")",
|
||||||
|
"id": "19bbd3984732895d",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Found links: {'links': [{'type': 'piatti', 'url': 'http://www.byserviziinternet.com/cascina/#piatti'}]}\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
],
|
||||||
|
"text/markdown": "# La Cascina – Ristorante Pizzeria Menu\n\n## Antipasti (Starters)\n- **Bruschetta al Pomodoro** - €5.00 \n Grilled bread topped with fresh tomatoes, garlic, and basil.\n\n- **Crostini Toscani** - €7.00 \n Toasted bread with traditional chicken liver pâté.\n\n- **Tagliere di Salumi** - €9.00 \n Selection of cured meats served with pickles and bread.\n\n## Primi Piatti (First Courses)\n- **Gnocchetti di Patate con Erbette** - €10.00 \n Potato gnocchi with a blend of seasonal greens.\n\n- **Paccheri con Polipetti** - €12.00 \n Large tubular pasta with baby octopus in a tomato sauce.\n\n- **Risotto ai Frutti di Mare** - €15.00 \n Arborio rice cooked with fresh seafood.\n\n- **Tagliolini al Tartufo** - €14.00 \n Homemade tagliolini pasta with truffle sauce.\n\n- **Zuppa di Cipolle** - €8.00 \n Traditional onion soup topped with melted cheese.\n\n## Secondi Piatti (Main Courses)\n- **Filetto di Manzo** - €18.00 \n Grilled beef fillet served with a side of seasonal vegetables.\n\n- **Pollo alla Griglia** - €12.00 \n Grilled chicken breast served with rosemary potatoes.\n\n- **Branzino al Forno** - €17.00 \n Oven-baked sea bass served with a lemon-herb sauce.\n\n## Pizze (Pizzas)\n- **Margherita** - €8.00 \n Classic pizza with tomato sauce, mozzarella, and basil.\n\n- **Diavola** - €10.00 \n Spicy salami pizza with tomato sauce and mozzarella.\n\n- **Funghi e Prosciutto** - €11.00 \n Pizza topped with mushrooms and ham.\n\n- **Vegetariana** - €9.50 \n Mixed vegetable pizza with mozzarella.\n\n## Dessert\n- **Tiramisu** - €5.00 \n Classic coffee-flavored Italian dessert.\n\n- **Panna Cotta** - €5.50 \n Creamy dessert served with berry sauce.\n\n- **Gelato** - €4.00 \n Selection of homemade ice creams.\n\n## Bevande (Beverages)\n- **Acqua Naturale / Frizzante** - €2.50 \n Still or sparkling water.\n\n- **Birra Artigianale** - €4.00 \n Local craft beer.\n\n- **Vino della Casa** - €5.50 / glass \n House wine selection.\n\nFor reservations or inquiries, 
please contact us at +39 035 936383. \n**Address:** Via L. Cadorna, 9, 24060 - Credaro (BG) \n**Closed on Wednesdays**."
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data",
|
||||||
|
"jetTransient": {
|
||||||
|
"display_id": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 11
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
32
week1/website.py
Normal file
32
week1/website.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# A class to represent a Webpage
|
||||||
|
|
||||||
|
# Some websites need you to use proper headers when fetching them:
|
||||||
|
# Request headers passed to requests.get() by Website.__init__ below.
# The User-Agent value is a desktop Chrome-on-Windows browser string.
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
|
||||||
|
}
|
||||||
|
|
||||||
|
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Constructing an instance immediately fetches ``url`` with ``requests``
    (sending the module-level ``headers``) and parses the response bytes with
    BeautifulSoup's ``html.parser``.

    Attributes:
        url: The URL that was passed to the constructor.
        body: The raw response content (bytes) returned for ``url``.
        title: The text of the page's ``<title>``, or "No title found" when
            the page has no usable title.
        text: The text of ``<body>`` after removing ``script``, ``style``,
            ``img`` and ``input`` elements, one fragment per line; an empty
            string when the page has no ``<body>``.
        links: Every non-empty ``href`` of the page's ``<a>`` tags, in
            document order and unmodified (so relative URLs, ``tel:`` /
            ``mailto:`` links and duplicates are all kept as found).
    """

    def __init__(self, url, timeout=30):
        """
        Fetch and parse the page at ``url``.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up; it is
                forwarded to ``requests.get``. Pass ``None`` to wait forever.

        Raises:
            requests.exceptions.RequestException: If the request fails,
                including when the timeout expires.
        """
        self.url = url
        # Without an explicit timeout requests waits indefinitely, so one
        # unresponsive site would hang the caller.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # soup.title.string is None when <title> is empty or contains nested
        # tags; fall back to the placeholder instead of storing None.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and body text as one labelled string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
|
||||||
Reference in New Issue
Block a user