Add a Python script for automated website content analysis & SEO extraction
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
import os
|
||||
import time
|
||||
import pandas as pd
|
||||
import re
|
||||
from dotenv import load_dotenv
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from openai import OpenAI
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.styles import Font, Alignment
|
||||
|
||||
# Read settings from the .env file; override=True is passed so values from the
# file take precedence over variables already present in the environment.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')

# Stop immediately when the key is missing or empty.
if not api_key:
    raise ValueError("No API key was found - please check your .env file.")

# The client is built without arguments (presumably it picks the key up from
# the environment populated above -- confirm against the openai package docs).
openai = OpenAI()

# Chrome flags shared by every Selenium session this script starts.
chrome_options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(_flag)
|
||||
|
||||
class Website:
    """Scrapes and processes website content using Selenium.

    Attributes:
        url: The address that was requested.
        text: Stripped text of the page's ``<body>`` element, or
            ``NO_CONTENT`` when nothing could be extracted.
    """

    # Placeholder stored in ``text`` whenever extraction yields nothing.
    NO_CONTENT = "No content extracted."
    # chromedriver location used when the caller does not supply one.
    DEFAULT_DRIVER_PATH = "/opt/homebrew/bin/chromedriver"

    def __init__(self, url: str, *, driver_path: str = DEFAULT_DRIVER_PATH,
                 timeout: float = 10):
        """Load ``url`` in Chrome and capture the text of its ``<body>``.

        :param url: The page to load.
        :param driver_path: Path to the chromedriver executable.
        :param timeout: Seconds to wait for the ``<body>`` element to appear.

        Errors raised while loading or reading the page are printed and leave
        ``text`` at the placeholder. Errors raised while starting the driver
        are not caught here and propagate to the caller.
        """
        self.url = url
        self.text = self.NO_CONTENT

        service = Service(executable_path=driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            driver.get(url)
            # until() hands back the located element, so a second
            # find_element() lookup is unnecessary.
            body_element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # An element object is always truthy; the meaningful check is
            # whether the body actually produced any text.
            self.text = body_element.text.strip() or self.NO_CONTENT
        except Exception as e:
            print(f"Error fetching website: {e}")
        finally:
            # Always release the browser process, even after a failure.
            driver.quit()

    def summarized_text(self, max_length=1500):
        """Return the first ``max_length`` characters of ``text``.

        ``"..."`` is appended when ``text`` is longer than ``max_length``.
        """
        return self.text[:max_length] + ("..." if len(self.text) > max_length else "")
|
||||
|
||||
def clean_text(text):
    """
    Strip markdown heading and bold markers from ``text``.

    Runs of two or more ``#`` characters (together with any whitespace that
    follows them) are deleted, ``**bold**`` spans are reduced to their inner
    text, and the result is trimmed of leading and trailing whitespace.
    """
    substitutions = (
        (r"###*\s*", ""),
        (r"\*\*(.*?)\*\*", r"\1"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
||||
|
||||
# Aspect-specific prompts for concise output.
# Each key becomes a column heading in the results; each value is the
# instruction sent to the model ahead of the scraped page content.
aspect_prompts = {
    "Marketing Strategies": "Summarize the core marketing strategies used on this website in under 30 words. Do not include a title or introduction.",
    "SEO Keywords": "List only the most relevant SEO keywords from this website, separated by commas. Do not include a title or introduction.",
    "User Engagement Tactics": "List key engagement tactics used on this website (e.g., interactive features, user incentives, social proof). Keep responses to 3-5 bullet points. Do not include a title or introduction.",
    "Call-to-Action Phrases": "List only the most common Call-to-Action phrases used on this website, separated by commas. Do not include a title or introduction.",
    "Branding Elements": "Summarize the brand's tone, style, and positioning in under 30 words. Do not include a title or introduction.",
    "Competitor Comparison": "Briefly describe how this website differentiates itself from competitors in under 30 words. Do not include a title or introduction.",
    "Product Descriptions": "List the most important features or benefits of the products/services described on this website in under 30 words. Do not include a title or introduction.",
    "Customer Reviews Sentiment": "Summarize the overall sentiment of customer reviews in under 30 words, highlighting common themes. Do not include a title or introduction.",
    "Social Media Strategy": "List key social media strategies used on this website, separated by commas. Do not include a title or introduction.",
}
|
||||
|
||||
|
||||
def summarize(url: str) -> dict:
    """
    Scrape ``url`` and query the model once per entry in ``aspect_prompts``.

    :param url: The website URL to analyze.
    :return: ``{"URL": url, "Error": "Failed to extract content"}`` when no
        page text was obtained; otherwise a dictionary holding ``"URL"`` plus
        one key per aspect, whose value is the cleaned model reply or an
        error string if that request raised.
    """
    page = Website(url)

    # Guard: nothing usable came back from the scrape.
    extraction_failed = (not page.text) or page.text == "No content extracted."
    if extraction_failed:
        return {"URL": url, "Error": "Failed to extract content"}

    results = {"URL": url}
    system_message = {
        "role": "system",
        "content": "You are an expert at extracting structured information from website content.",
    }

    for aspect_name, instruction in aspect_prompts.items():
        try:
            user_content = f"{instruction} \n\nContent:\n{page.summarized_text()}"
            completion = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    system_message,
                    {"role": "user", "content": user_content},
                ],
            )
            reply = completion.choices[0].message.content
            results[aspect_name] = clean_text(reply)
        except Exception as err:
            # A failure on one aspect is recorded in place so the remaining
            # aspects are still attempted.
            results[aspect_name] = f"Error generating summary: {err}"

    return results
|
||||
|
||||
def save_to_excel(data_list: list, filename="website_analysis.xlsx"):
    """
    Saves extracted information to an Excel file with proper formatting.

    The rows are first written with pandas; the workbook is then reopened
    with openpyxl to size the columns, bold and centre the header row, and
    wrap the text of the data cells.

    :param data_list: A list of dictionaries containing extracted website details.
    :param filename: The name of the Excel file to save data.
    """
    df = pd.DataFrame(data_list)
    df.to_excel(filename, index=False)

    wb = load_workbook(filename)
    ws = wb.active

    # Auto-adjust column widths: longest cell text plus 2, capped at 50.
    for col in ws.columns:
        # str() and len() cannot fail here, so no exception handling is
        # needed; default=0 covers a column with no populated cells.
        longest = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,
        )
        ws.column_dimensions[col[0].column_letter].width = min(longest + 2, 50)

    # Format headers
    for cell in ws[1]:
        cell.font = Font(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    # Wrap text for extracted content
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(wrap_text=True, vertical="top")

    wb.save(filename)
    print(f"Data saved to {filename} with improved formatting.")
|
||||
|
||||
# Websites to process: each URL is passed to summarize() in this order when
# the script is run directly.
websites = [
    "https://www.udacity.com/",
    "https://www.coursera.org",
    "https://www.udemy.com",
    "https://www.edx.org",
    "https://www.freecodecamp.org/",
    "https://www.datacamp.com/",
    "https://www.w3schools.com/",
    "https://www.futurelearn.com/",
    "https://codefirstgirls.com/",
    "https://www.linkedin.com/learning",
]
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: analyse every configured site, write one workbook,
    # then report the outcome.
    print("\nProcessing websites...\n")
    extracted_data_list = []

    for site in websites:
        print(f"Extracting data from {site}...")
        extracted_data = summarize(site)
        extracted_data_list.append(extracted_data)

    save_to_excel(extracted_data_list)

    # summarize() marks a site it could not scrape with an "Error" key, so
    # only claim success when no row carries one.
    failed_sites = [row["URL"] for row in extracted_data_list if "Error" in row]
    if failed_sites:
        print(
            f"\nProcessed {len(extracted_data_list)} websites; "
            f"content extraction failed for {len(failed_sites)}: "
            f"{', '.join(failed_sites)}"
        )
    else:
        print("\nAll websites processed successfully!")
|
||||
Reference in New Issue
Block a user