Add a Python script for automated website content analysis and SEO extraction

2025-02-27 14:18:14 +00:00
parent 8ffd3a523f
commit c70c6c4f84
1 changed files with 176 additions and 0 deletions
--- a/week1/community-contributions/day-1-marketing_insights_scraper.py
+++ b/week1/community-contributions/day-1-marketing_insights_scraper.py
@@ -0,0 +1,176 @@
+import os
+import time
+import pandas as pd
+import re
+from dotenv import load_dotenv
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from openai import OpenAI
+from openpyxl import load_workbook
+from openpyxl.styles import Font, Alignment
+
+# Load environment variables
+load_dotenv(override=True)
+api_key = os.getenv('OPENAI_API_KEY')
+
+# Validate API Key
+if not api_key:
+    raise ValueError("No API key was found - please check your .env file.")
+
+# Initialize OpenAI client
+openai = OpenAI()
+
+# Set up Selenium WebDriver
+chrome_options = Options()
+chrome_options.add_argument("--headless")
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--disable-dev-shm-usage")
+
+class Website:
+    """Scrapes and processes website content using Selenium."""
+
+    def __init__(self, url: str):
+        self.url = url
+        self.text = "No content extracted."
+
+        service = Service(executable_path="/opt/homebrew/bin/chromedriver")
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+
+        try:
+            driver.get(url)
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            body_element = driver.find_element(By.TAG_NAME, "body")
+            self.text = body_element.text.strip() if body_element else "No content extracted."
+        except Exception as e:
+            print(f"Error fetching website: {e}")
+        finally:
+            driver.quit()
+
+    def summarized_text(self, max_length=1500):
+        return self.text[:max_length] + ("..." if len(self.text) > max_length else "")
+
+def clean_text(text):
+    """
+    Cleans extracted text by removing markdown-style formatting.
+    """
+    text = re.sub(r"###*\s*", "", text)
+    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
+    return text.strip()
+
+# Aspect-specific prompts for concise output
+aspect_prompts = {
+    "Marketing Strategies": "Summarize the core marketing strategies used on this website in in under 30 words. Do not include a title or introduction.",
+    "SEO Keywords": "List only the most relevant SEO keywords from this website, separated by commas. Do not include a title or introduction.",
+    "User Engagement Tactics": "List key engagement tactics used on this website (e.g., interactive features, user incentives, social proof). Keep responses to 3-5 bullet points. Do not include a title or introduction.",
+    "Call-to-Action Phrases": "List only the most common Call-to-Action phrases used on this website, separated by commas. Do not include a title or introduction.",
+    "Branding Elements": "Summarize the brand's tone, style, and positioning in under 30 words.  Do not include a title or introduction.",
+    "Competitor Comparison": "Briefly describe how this website differentiates itself from competitors in under 30 words.  Do not include a title or introduction.",
+    "Product Descriptions": "List the most important features or benefits of the products/services described on this website in under 30 words.  Do not include a title or introduction.",
+    "Customer Reviews Sentiment": "Summarize the overall sentiment of customer reviews in oin under 30 words, highlighting common themes.  Do not include a title or introduction.",
+    "Social Media Strategy": "List key social media strategies used on this website, separated by commas. Do not include a title or introduction."
+}
+
+
+def summarize(url: str) -> dict:
+    """
+    Fetches a website, extracts relevant content, and generates a separate summary for each aspect.
+
+    :param url: The website URL to analyze.
+    :return: A dictionary containing extracted information.
+    """
+    website = Website(url)
+
+    if not website.text or website.text == "No content extracted.":
+        return {"URL": url, "Error": "Failed to extract content"}
+
+    extracted_data = {"URL": url}
+
+    for aspect, prompt in aspect_prompts.items():
+        try:
+            formatted_prompt = f"{prompt} \n\nContent:\n{website.summarized_text()}"
+            response = openai.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are an expert at extracting structured information from website content."},
+                    {"role": "user", "content": formatted_prompt}
+                ]
+            )
+
+            extracted_data[aspect] = clean_text(response.choices[0].message.content)
+
+        except Exception as e:
+            extracted_data[aspect] = f"Error generating summary: {e}"
+
+    return extracted_data
+
+def save_to_excel(data_list: list, filename="website_analysis.xlsx"):
+    """
+    Saves extracted information to an Excel file with proper formatting.
+
+    :param data_list: A list of dictionaries containing extracted website details.
+    :param filename: The name of the Excel file to save data.
+    """
+    df = pd.DataFrame(data_list)
+
+    df.to_excel(filename, index=False)
+
+    wb = load_workbook(filename)
+    ws = wb.active
+
+    # Auto-adjust column widths
+    for col in ws.columns:
+        max_length = 0
+        col_letter = col[0].column_letter
+        for cell in col:
+            try:
+                if cell.value:
+                    max_length = max(max_length, len(str(cell.value)))
+            except:
+                pass
+        ws.column_dimensions[col_letter].width = min(max_length + 2, 50)
+
+    # Format headers
+    for cell in ws[1]:
+        cell.font = Font(bold=True)
+        cell.alignment = Alignment(horizontal="center", vertical="center")
+
+    # Wrap text for extracted content
+    for row in ws.iter_rows(min_row=2):
+        for cell in row:
+            cell.alignment = Alignment(wrap_text=True, vertical="top")
+
+    wb.save(filename)
+    print(f"Data saved to {filename} with improved formatting.")
+
+# Websites to process: the __main__ block passes each URL, in this order,
+# to summarize() and writes one spreadsheet row per entry.
+websites = [
+    "https://www.udacity.com/",
+    "https://www.coursera.org",
+    "https://www.udemy.com",
+    "https://www.edx.org",
+    "https://www.freecodecamp.org/",
+    "https://www.datacamp.com/",
+    "https://www.w3schools.com/",
+    "https://www.futurelearn.com/",
+    "https://codefirstgirls.com/",
+    "https://www.linkedin.com/learning",
+]
+
+if __name__ == "__main__":
+    print("\nProcessing websites...\n")
+    extracted_data_list = []
+
+    for site in websites:
+        print(f"Extracting data from {site}...")
+        extracted_data = summarize(site)
+        extracted_data_list.append(extracted_data)
+
+    save_to_excel(extracted_data_list)
+    print("\nAll websites processed successfully!")