diff --git a/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summarizer.ipynb b/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summarizer.ipynb new file mode 100644 index 0000000..d18222b --- /dev/null +++ b/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summarizer.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1fecd49e", + "metadata": {}, + "source": [ + "# 🗺️ Google Maps Review Summarizer\n", + "\n", + "This Python app automates the process of fetching and summarizing Google Maps reviews for any business or location.\n", + "\n", + "## 🚀 Overview\n", + "The app performs two main tasks:\n", + "1. **Scrape Reviews** – Uses a web scraping script to extract reviews directly from Google Maps.\n", + "2. **Summarize Content** – Leverages OpenAI's language models to generate concise, insightful summaries of the collected reviews and analyze their sentiment.\n", + "\n", + "## 🧠 Tech Stack\n", + "- **Python** – Core language\n", + "- **Playwright** – For scraping reviews\n", + "- **OpenAI API** – For natural language summarization\n", + "- **Jupyter Notebook** – For exploration, testing, and demonstration\n", + "\n", + "### 🙏 Credits\n", + "The web scraping logic is **inspired by [Antonello Zanini’s blog post](https://blog.apify.com/how-to-scrape-google-reviews/)** on building a Google Reviews scraper. Special thanks for the valuable insights on **structuring and automating the scraping workflow**, which greatly informed the development of this improved scraper.\n", + "\n", + "This app, however, uses an **enhanced version of the scraper** that keeps scrolling to load more reviews until it reaches the configured review limit (`max_reviews`, 10 by default in this notebook; raise it to collect more).
If fewer reviews are available, the scraper stops scrolling earlier.\n", + "\n", + "## ✅ Sample Output\n", + "Here is a summary of reviews of a restaurant generated by the app.\n", + "\n", + "![Sample review summary generated by the app](google-map-review-summary.jpg)\n", + "\n", + "\n", + "---\n", + "\n", + "**Note:** This project is intended for educational and research purposes. Please ensure compliance with Google’s [Terms of Service](https://policies.google.com/terms) when scraping or using their data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df04a4aa", + "metadata": {}, + "outputs": [], + "source": [ + "#Activate the llm_engineering virtual environment\n", + "!source ../../../.venv/bin/activate \n", + "\n", + "#Make sure pip is available and up to date inside the venv\n", + "!python3 -m ensurepip --upgrade\n", + "\n", + "#Verify that pip now points to the venv path (should end with /.venv/bin/pip)\n", + "!which pip3\n", + "\n", + "#Install Playwright inside the venv\n", + "!pip3 install playwright\n", + "\n", + "#Download the required browser binaries and dependencies\n", + "!python3 -m playwright install" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1c794cfd", + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "from playwright.async_api import async_playwright\n", + "from IPython.display import Markdown, display\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "317af2b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key found and looks good so far!\n" + ] + } + ], + "source": [ + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please
head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6f142c79", + "metadata": {}, + "outputs": [], + "source": [ + "async def scroll_reviews_panel(page, max_scrolls=50, max_reviews=10):\n", + " \"\"\"\n", + " Scrolls through the reviews panel to lazy load all reviews.\n", + " \n", + " Args:\n", + " page: Playwright page object\n", + " max_scrolls: Maximum number of scroll attempts to prevent infinite loops\n", + " \n", + " Returns:\n", + " Number of reviews loaded\n", + " \"\"\"\n", + " # Find the scrollable reviews container\n", + " # Google Maps reviews are in a specific scrollable div\n", + " scrollable_div = page.locator('div[role=\"main\"] div[jslog$=\"mutable:true;\"]').first\n", + " \n", + " previous_review_count = 0\n", + " scroll_attempts = 0\n", + " no_change_count = 0\n", + "\n", + " print(\"Starting to scroll and load reviews...\")\n", + " \n", + " while scroll_attempts < max_scrolls:\n", + " # Get current count of reviews\n", + " review_elements = page.locator(\"div[data-review-id][jsaction]\")\n", + " current_review_count = await review_elements.count()\n", + " \n", + " #if we have loaded max_reviews, we will stop scrolling\n", + " if current_review_count >= max_reviews:\n", + " break\n", + "\n", + " print(f\"Scroll attempt {scroll_attempts + 1}: Found {current_review_count} reviews\")\n", + " \n", + " # Scroll to the bottom of the reviews panel\n", + " await scrollable_div.evaluate(\"\"\"\n", 
+ " (element) => {\n", + " element.scrollTo(0, element.scrollHeight + 100);\n", + " }\n", + " \"\"\")\n", + " \n", + " # Wait for potential new content to load\n", + " await asyncio.sleep(2)\n", + " \n", + " # Check if new reviews were loaded\n", + " if current_review_count == previous_review_count:\n", + " no_change_count += 1\n", + " # If count hasn't changed for 3 consecutive scrolls, we've likely reached the end\n", + " if no_change_count >= 3:\n", + " print(f\"No new reviews loaded after {no_change_count} attempts. Finished loading.\")\n", + " break\n", + " else:\n", + " no_change_count = 0\n", + " \n", + " previous_review_count = current_review_count\n", + " scroll_attempts += 1\n", + " \n", + " final_count = await review_elements.count()\n", + " print(f\"Finished scrolling. Total reviews loaded: {final_count}\")\n", + " return final_count" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f7f67b70", + "metadata": {}, + "outputs": [], + "source": [ + "async def scrape_google_reviews(url):\n", + " # Where to store the scraped data\n", + " reviews = []\n", + "\n", + " async with async_playwright() as p:\n", + " # Initialize a new Playwright instance\n", + " browser = await p.chromium.launch(\n", + " headless=True # Set to False if you want to see the browser in action\n", + " )\n", + " context = await browser.new_context()\n", + " page = await context.new_page()\n", + "\n", + " # The URL of the Google Maps reviews page\n", + "\n", + " # Navigate to the target Google Maps page\n", + " print(\"Navigating to Google Maps page...\")\n", + " await page.goto(url)\n", + "\n", + " # Wait for initial reviews to load\n", + " print(\"Waiting for initial reviews to load...\")\n", + " review_html_elements = page.locator(\"div[data-review-id][jsaction]\")\n", + " await review_html_elements.first.wait_for(state=\"visible\", timeout=10000)\n", + " \n", + " # Scroll through the reviews panel to lazy load all reviews\n", + " total_reviews = await 
scroll_reviews_panel(page, max_scrolls=100)\n", + " \n", + " print(f\"\\nStarting to scrape {total_reviews} reviews...\")\n", + "\n", + " # Get all review elements after scrolling\n", + " review_html_elements = page.locator(\"div[data-review-id][jsaction]\")\n", + " all_reviews = await review_html_elements.all()\n", + " \n", + " # Iterate over the elements and scrape data from each of them\n", + " for idx, review_html_element in enumerate(all_reviews, 1):\n", + " try:\n", + " # Scraping logic\n", + "\n", + " stars_element = review_html_element.locator(\"[aria-label*=\\\"star\\\"]\")\n", + " stars_label = await stars_element.get_attribute(\"aria-label\")\n", + "\n", + " # Extract the review score from the stars label\n", + " stars = None\n", + " for i in range(1, 6):\n", + " if stars_label and str(i) in stars_label:\n", + " stars = i\n", + " break\n", + "\n", + " # Get the next sibling of the previous element with an XPath expression\n", + " time_sibling = stars_element.locator(\"xpath=following-sibling::span\")\n", + " time = await time_sibling.text_content()\n", + "\n", + " # Select the \"More\" button and if it is present, click it\n", + " more_element = review_html_element.locator(\"button[aria-label=\\\"See more\\\"]\").first\n", + " if await more_element.is_visible():\n", + " await more_element.click()\n", + " await asyncio.sleep(0.3) # Brief wait for text expansion\n", + "\n", + " text_element = review_html_element.locator(\"div[tabindex=\\\"-1\\\"][id][lang]\")\n", + " text = await text_element.text_content()\n", + "\n", + " reviews.append(str(stars) + \" Stars: \\n\" +\"Reviewed On:\" + time + \"\\n\"+ text)\n", + " \n", + " if idx % 10 == 0:\n", + " print(f\"Scraped {idx}/{total_reviews} reviews...\")\n", + " \n", + " except Exception as e:\n", + " print(f\"Error scraping review {idx}: {str(e)}\")\n", + " continue\n", + "\n", + " print(f\"\\nSuccessfully scraped {len(reviews)} reviews!\")\n", + "\n", + " # Close the browser and release its resources\n", + 
" await browser.close()\n", + "\n", + " return \"\\n\".join(reviews)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cb160d5f", + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + "You are an expert assistant that analyzes google reviews,\n", + "and provides a summary and centiment of the reviews.\n", + "Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "69e08d4b", + "metadata": {}, + "outputs": [], + "source": [ + "# Define our user prompt\n", + "\n", + "user_prompt_prefix = \"\"\"\n", + "Here are the reviews of a google map location/business.\n", + "Provide a short summary of the reviews and the sentiment of the reviews.\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d710972d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def prepare_message(reviews):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt_prefix + reviews}\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cb51f436", + "metadata": {}, + "outputs": [], + "source": [ + "async def summarize(url):\n", + " openai = OpenAI()\n", + " reviews = await scrape_google_reviews(url)\n", + " response = openai.chat.completions.create(\n", + " model = \"gpt-4.1-mini\",\n", + " messages = prepare_message(reviews)\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2f09e2d2", + "metadata": {}, + "outputs": [], + "source": [ + "async def display_summary(url):\n", + " summary = await summarize(url)\n", + " display(Markdown(summary))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca7995c9", + "metadata": {}, + "outputs": [], + "source": [ + "url = 
\"https://www.google.com/maps/place/Grace+Home+Nursing+%26+Assisted+Living/@12.32184,75.0853037,17z/data=!4m8!3m7!1s0x3ba47da1be6a0279:0x9e73181ab0827f7e!8m2!3d12.32184!4d75.0853037!9m1!1b1!16s%2Fg%2F11qjl430n_?entry=ttu&g_ep=EgoyMDI1MTAyMC4wIKXMDSoASAFQAw%3D%3D\"\n", + "await display_summary(url)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summary.jpg b/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summary.jpg new file mode 100644 index 0000000..43a7891 Binary files /dev/null and b/week1/community-contributions/week1-google-map-review-summarizer/google-map-review-summary.jpg differ