Add a Python script for automated website content analysis and SEO extraction

2025-02-27 14:18:14 +00:00
parent 8ffd3a523f
commit c70c6c4f84
1 changed files with 176 additions and 0 deletions
--- a/week1/community-contributions/day-1-marketing_insights_scraper.py
+++ b/week1/community-contributions/day-1-marketing_insights_scraper.py
@@ -0,0 +1,176 @@
 import os
 import time
 import pandas as pd
 import re
 from dotenv import load_dotenv
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from openai import OpenAI
 from openpyxl import load_workbook
 from openpyxl.styles import Font, Alignment
 # Read settings from the .env file. override=True is passed so that
 # load_dotenv may replace variables already present in the environment.
 load_dotenv(override=True)
 api_key = os.environ.get("OPENAI_API_KEY")
 # Stop immediately when the key is missing or empty.
 if not api_key:
    raise ValueError("No API key was found - please check your .env file.")
 # OpenAI client for this module, constructed without explicit arguments.
 openai = OpenAI()
 # Chrome flags applied to the Options object used when starting WebDriver.
 chrome_options = Options()
 for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
 ):
    chrome_options.add_argument(_chrome_flag)
 del _chrome_flag  # keep the loop variable out of the module namespace
 class Website:
    """Scrape the visible text of a web page with headless Chrome.

    Creating an instance loads ``url`` in a Selenium-driven Chrome session,
    waits for the ``<body>`` element and stores its stripped text in
    ``self.text``. Any failure, including a failure to start the browser,
    is printed and leaves ``self.text`` at ``"No content extracted."``.
    """

    # Preferred chromedriver binary; used only when the file exists.
    CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"
    # Seconds to wait for the <body> element to be present.
    BODY_WAIT_SECONDS = 10

    def __init__(self, url: str):
        """
        Fetch ``url`` and extract the text of its body element.
        :param url: Address of the page to load.
        """
        self.url = url
        self.text = "No content extracted."
        # Created inside the try block so that a browser start-up failure is
        # reported like any other fetch error instead of escaping __init__.
        driver = None
        try:
            driver = webdriver.Chrome(
                service=self._make_service(), options=chrome_options
            )
            driver.get(url)
            # until() hands back the located element, so no second lookup
            # is needed.
            body_element = WebDriverWait(driver, self.BODY_WAIT_SECONDS).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            self.text = body_element.text.strip()
        except Exception as e:
            print(f"Error fetching website: {e}")
        finally:
            # Only quit a browser that was actually started.
            if driver is not None:
                driver.quit()

    @classmethod
    def _make_service(cls):
        """Return a chromedriver Service, preferring CHROMEDRIVER_PATH."""
        if os.path.exists(cls.CHROMEDRIVER_PATH):
            return Service(executable_path=cls.CHROMEDRIVER_PATH)
        # No binary at the preferred location: let Selenium locate the
        # driver with its default behaviour.
        return Service()

    def summarized_text(self, max_length=1500):
        """
        Return the extracted text limited to ``max_length`` characters.
        :param max_length: Number of characters kept before truncation.
        :return: The text unchanged when it fits, otherwise the first
            ``max_length`` characters followed by "...".
        """
        if len(self.text) > max_length:
            return self.text[:max_length] + "..."
        return self.text
 def clean_text(text):
    """
    Strip markdown-style markers from a piece of text.

    Runs of two or more ``#`` characters are removed together with any
    whitespace that follows them, ``**...**`` pairs on a single line are
    replaced by their inner text, and the result is trimmed of leading and
    trailing whitespace.
    :param text: The string to clean.
    :return: The cleaned string.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        (r"###*\s*", ""),
        (r"\*\*(.*?)\*\*", r"\1"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
 # Maps each analysis aspect (used as the result key / spreadsheet column)
 # to the instruction sent to the model for that aspect. Every prompt asks
 # for a short answer without a title or introduction.
 aspect_prompts = {
    "Marketing Strategies": "Summarize the core marketing strategies used on this website in under 30 words. Do not include a title or introduction.",
    "SEO Keywords": "List only the most relevant SEO keywords from this website, separated by commas. Do not include a title or introduction.",
    "User Engagement Tactics": "List key engagement tactics used on this website (e.g., interactive features, user incentives, social proof). Keep responses to 3-5 bullet points. Do not include a title or introduction.",
    "Call-to-Action Phrases": "List only the most common Call-to-Action phrases used on this website, separated by commas. Do not include a title or introduction.",
    "Branding Elements": "Summarize the brand's tone, style, and positioning in under 30 words.  Do not include a title or introduction.",
    "Competitor Comparison": "Briefly describe how this website differentiates itself from competitors in under 30 words.  Do not include a title or introduction.",
    "Product Descriptions": "List the most important features or benefits of the products/services described on this website in under 30 words.  Do not include a title or introduction.",
    "Customer Reviews Sentiment": "Summarize the overall sentiment of customer reviews in under 30 words, highlighting common themes.  Do not include a title or introduction.",
    "Social Media Strategy": "List key social media strategies used on this website, separated by commas. Do not include a title or introduction."
 }
 def summarize(url: str) -> dict:
    """
    Fetches a website and asks the model for one short answer per aspect.

    The page is scraped once; each entry of ``aspect_prompts`` is then sent
    to the model together with the same truncated page text. A failure for
    one aspect is recorded in that aspect's value and does not stop the
    remaining aspects.
    :param url: The website URL to analyze.
    :return: A dictionary with the key "URL" plus either one key per aspect,
        or the key "Error" when no page content could be extracted.
    """
    website = Website(url)
    if not website.text or website.text == "No content extracted.":
        return {"URL": url, "Error": "Failed to extract content"}
    # The page excerpt and the system message are the same for every aspect,
    # so they are built once instead of on each loop iteration.
    page_excerpt = website.summarized_text()
    system_message = {
        "role": "system",
        "content": "You are an expert at extracting structured information from website content.",
    }
    extracted_data = {"URL": url}
    for aspect, prompt in aspect_prompts.items():
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    system_message,
                    {"role": "user", "content": f"{prompt} \n\nContent:\n{page_excerpt}"},
                ],
            )
            answer = response.choices[0].message.content
            # A reply without text would otherwise fail inside clean_text
            # with an unhelpful regex TypeError; report it explicitly.
            if answer is None:
                raise ValueError("the model returned no text content")
            extracted_data[aspect] = clean_text(answer)
        except Exception as e:
            # Keep going: the error text is stored in place of the summary.
            extracted_data[aspect] = f"Error generating summary: {e}"
    return extracted_data
 def save_to_excel(data_list: list, filename="website_analysis.xlsx"):
    """
    Saves extracted information to an Excel file and formats the sheet.

    The data is first written with pandas, then the workbook is reopened
    with openpyxl to size the columns, style the header row and enable text
    wrapping for the data rows, and finally saved back to the same file.
    :param data_list: A list of dictionaries containing extracted website details.
    :param filename: The name of the Excel file to save data.
    """
    pd.DataFrame(data_list).to_excel(filename, index=False)
    wb = load_workbook(filename)
    ws = wb.active
    # Size each column to its longest cell text (plus padding), capped at 50.
    for col in ws.columns:
        longest = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,
        )
        ws.column_dimensions[col[0].column_letter].width = min(longest + 2, 50)
    # Header row: bold and centered.
    for cell in ws[1]:
        cell.font = Font(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")
    # Data rows: wrap long text and anchor it to the top of the cell.
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(wrap_text=True, vertical="top")
    wb.save(filename)
    print(f"Data saved to {filename} with improved formatting.")
 # Websites analyzed, in order, when the file is run as a script.
 websites = [
    "https://www.udacity.com/",
    "https://www.coursera.org",
    "https://www.udemy.com",
    "https://www.edx.org",
    "https://www.freecodecamp.org/",
    "https://www.datacamp.com/",
    "https://www.w3schools.com/",
    "https://www.futurelearn.com/",
    "https://codefirstgirls.com/",
    "https://www.linkedin.com/learning",
 ]


 def main():
    """
    Analyze every URL in ``websites`` and write the results to Excel.

    After saving, the closing message reports how many sites had a failed
    page extraction or a failed aspect summary, instead of always claiming
    success.
    """
    print("\nProcessing websites...\n")
    extracted_data_list = []
    for site in websites:
        print(f"Extracting data from {site}...")
        extracted_data_list.append(summarize(site))
    save_to_excel(extracted_data_list)
    # summarize() marks a failed extraction with an "Error" key and a failed
    # aspect with a value starting "Error generating summary:".
    failed_sites = [
        entry["URL"]
        for entry in extracted_data_list
        if "Error" in entry
        or any(
            str(value).startswith("Error generating summary:")
            for value in entry.values()
        )
    ]
    if failed_sites:
        print(
            f"\nProcessed {len(extracted_data_list)} websites; "
            f"{len(failed_sites)} had errors: {', '.join(failed_sites)}"
        )
    else:
        print("\nAll websites processed successfully!")


 if __name__ == "__main__":
    main()