1678 lines
67 KiB
Python
1678 lines
67 KiB
Python
import sys
|
|
import json
|
|
from urllib.parse import urlparse
|
|
from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
|
QHBoxLayout, QLabel, QLineEdit, QSpinBox, QPushButton,
|
|
QTextEdit, QTableWidget, QTableWidgetItem, QTabWidget,
|
|
QProgressBar, QComboBox, QMessageBox, QSplitter,
|
|
QGroupBox, QGridLayout, QHeaderView, QFrame, QScrollArea,
|
|
QSystemTrayIcon, QStyle, QAction, QMenu, QTreeWidget, QTreeWidgetItem,
|
|
QListWidget, QListWidgetItem, QSizePolicy, QAbstractItemView)
|
|
from PyQt5.QtCore import QThread, pyqtSignal, Qt, QTimer, QUrl
|
|
from PyQt5.QtGui import QFont, QIcon, QPalette, QColor, QPixmap
|
|
try:
|
|
from PyQt5.QtWebEngineWidgets import QWebEngineView
|
|
WEB_ENGINE_AVAILABLE = True
|
|
print("PyQtWebEngine successfully imported - Visual preview enabled")
|
|
except ImportError as e:
|
|
WEB_ENGINE_AVAILABLE = False
|
|
print(f"PyQtWebEngine not available: {e}")
|
|
print("Visual preview will be disabled. Install with: pip install PyQtWebEngine")
|
|
import module
|
|
import re
|
|
import webbrowser
|
|
import os
|
|
try:
|
|
from openai import OpenAI
|
|
OPENAI_AVAILABLE = True
|
|
except ImportError:
|
|
OPENAI_AVAILABLE = False
|
|
from datetime import datetime
|
|
from dotenv import load_dotenv
|
|
import markdown
|
|
|
|
# Load environment variables from .env file
|
|
load_dotenv()
|
|
|
|
class ScrapingThread(QThread):
|
|
"""Thread for running web scraping operations"""
|
|
progress_updated = pyqtSignal(str)
|
|
scraping_complete = pyqtSignal(list)
|
|
error_occurred = pyqtSignal(str)
|
|
|
|
def __init__(self, url, max_depth):
|
|
super().__init__()
|
|
self.url = url
|
|
self.max_depth = max_depth
|
|
self.scraper = module.WebScraper()
|
|
self._stop_requested = False
|
|
|
|
def stop(self):
|
|
"""Request graceful stop of the scraping process"""
|
|
self._stop_requested = True
|
|
if hasattr(self.scraper, 'stop_scraping'):
|
|
self.scraper.stop_scraping()
|
|
|
|
def run(self):
|
|
try:
|
|
self.progress_updated.emit("Starting web scraping...")
|
|
|
|
# Reset scraper state for new crawl
|
|
self.scraper.reset()
|
|
|
|
def progress_callback(website):
|
|
if self._stop_requested:
|
|
return # Stop processing if requested
|
|
if website:
|
|
self.progress_updated.emit(f"Scraped: {website.title} (depth {website.depth})")
|
|
|
|
# Start scraping with progress callback
|
|
websites = self.scraper.crawl_website(self.url, self.max_depth, progress_callback)
|
|
|
|
# Check if stop was requested
|
|
if self._stop_requested:
|
|
self.progress_updated.emit("Scraping stopped by user.")
|
|
return
|
|
|
|
# Emit final progress
|
|
self.progress_updated.emit(f"Scraping complete! Found {len(websites)} websites.")
|
|
self.scraping_complete.emit(websites)
|
|
|
|
except Exception as e:
|
|
if not self._stop_requested: # Only emit error if not stopped by user
|
|
self.error_occurred.emit(str(e))
|
|
|
|
class ModernButton(QPushButton):
|
|
"""Custom modern button with hover effects"""
|
|
def __init__(self, text, primary=False):
|
|
super().__init__(text)
|
|
self.primary = primary
|
|
self.setMinimumHeight(40)
|
|
self.setFont(QFont("Segoe UI", 10, QFont.Weight.Medium))
|
|
self.setCursor(Qt.CursorShape.PointingHandCursor)
|
|
self.update_style()
|
|
|
|
def update_style(self):
|
|
if self.primary:
|
|
self.setStyleSheet("""
|
|
QPushButton {
|
|
background: #3b82f6;
|
|
border: none;
|
|
color: white;
|
|
padding: 12px 24px;
|
|
border-radius: 6px;
|
|
font-weight: 600;
|
|
}
|
|
QPushButton:hover {
|
|
background: #2563eb;
|
|
}
|
|
QPushButton:pressed {
|
|
background: #1d4ed8;
|
|
}
|
|
QPushButton:disabled {
|
|
background: #9ca3af;
|
|
color: #f3f4f6;
|
|
}
|
|
""")
|
|
else:
|
|
self.setStyleSheet("""
|
|
QPushButton {
|
|
background: white;
|
|
border: 1px solid #d1d5db;
|
|
color: #374151;
|
|
padding: 10px 20px;
|
|
border-radius: 6px;
|
|
font-weight: 500;
|
|
}
|
|
QPushButton:hover {
|
|
border-color: #3b82f6;
|
|
color: #3b82f6;
|
|
background: #f8fafc;
|
|
}
|
|
QPushButton:pressed {
|
|
background: #f1f5f9;
|
|
}
|
|
QPushButton:disabled {
|
|
background: #f9fafb;
|
|
border-color: #e5e7eb;
|
|
color: #9ca3af;
|
|
}
|
|
""")
|
|
|
|
class ModernLineEdit(QLineEdit):
|
|
"""Custom modern input field"""
|
|
def __init__(self, placeholder=""):
|
|
super().__init__()
|
|
self.setPlaceholderText(placeholder)
|
|
self.setMinimumHeight(40)
|
|
self.setFont(QFont("Segoe UI", 10))
|
|
self.setStyleSheet("""
|
|
QLineEdit {
|
|
border: 1px solid #d1d5db;
|
|
border-radius: 6px;
|
|
padding: 8px 12px;
|
|
background: white;
|
|
color: #374151;
|
|
font-size: 14px;
|
|
}
|
|
QLineEdit:focus {
|
|
border-color: #3b82f6;
|
|
outline: none;
|
|
}
|
|
QLineEdit::placeholder {
|
|
color: #9ca3af;
|
|
}
|
|
""")
|
|
|
|
class ModernSpinBox(QSpinBox):
|
|
"""Custom modern spin box"""
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.setMinimumHeight(40)
|
|
self.setFont(QFont("Segoe UI", 10))
|
|
self.setStyleSheet("""
|
|
QSpinBox {
|
|
border: 1px solid #d1d5db;
|
|
border-radius: 6px;
|
|
padding: 8px 12px;
|
|
background: white;
|
|
color: #374151;
|
|
font-size: 14px;
|
|
}
|
|
QSpinBox:focus {
|
|
border-color: #3b82f6;
|
|
}
|
|
QSpinBox::up-button, QSpinBox::down-button {
|
|
border: none;
|
|
background: #f9fafb;
|
|
border-radius: 3px;
|
|
margin: 2px;
|
|
}
|
|
QSpinBox::up-button:hover, QSpinBox::down-button:hover {
|
|
background: #f3f4f6;
|
|
}
|
|
""")
|
|
|
|
class ChatBubbleWidget(QWidget):
|
|
def __init__(self, message, timestamp, role):
|
|
super().__init__()
|
|
layout = QVBoxLayout(self)
|
|
layout.setContentsMargins(0, 0, 0, 0)
|
|
layout.setSpacing(2)
|
|
# Bubble
|
|
if role == "ai":
|
|
html = markdown.markdown(message)
|
|
bubble = QLabel(html)
|
|
bubble.setTextFormat(Qt.TextFormat.RichText)
|
|
else:
|
|
bubble = QLabel(message)
|
|
bubble.setTextFormat(Qt.TextFormat.PlainText)
|
|
bubble.setWordWrap(True)
|
|
bubble.setTextInteractionFlags(Qt.TextInteractionFlag.TextSelectableByMouse)
|
|
bubble.setFont(QFont("Segoe UI", 11))
|
|
bubble.setSizePolicy(QSizePolicy.Preferred, QSizePolicy.Maximum)
|
|
bubble.setMinimumWidth(800)
|
|
bubble.setMaximumWidth(1200)
|
|
bubble.adjustSize()
|
|
# Timestamp
|
|
ts = QLabel(("🤖 " if role == "ai" else "") + timestamp)
|
|
ts.setFont(QFont("Segoe UI", 8))
|
|
ts.setStyleSheet("color: #9ca3af;")
|
|
if role == "user":
|
|
bubble.setStyleSheet("background: #2563eb; color: white; border-radius: 16px; padding: 10px 16px; margin-left: 40px;")
|
|
layout.setAlignment(Qt.AlignmentFlag.AlignRight)
|
|
ts.setAlignment(Qt.AlignmentFlag.AlignRight)
|
|
else:
|
|
bubble.setStyleSheet("background: #f3f4f6; color: #1e293b; border-radius: 16px; padding: 10px 16px; margin-right: 40px;")
|
|
layout.setAlignment(Qt.AlignmentFlag.AlignLeft)
|
|
ts.setAlignment(Qt.AlignmentFlag.AlignLeft)
|
|
layout.addWidget(bubble)
|
|
layout.addWidget(ts)
|
|
|
|
class WebScraperApp(QMainWindow):
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.websites = []
|
|
self.scraper = module.WebScraper()
|
|
self.init_ui()
|
|
|
|
def init_ui(self):
|
|
self.setWindowTitle("Web Scraper & Data Analyzer")
|
|
self.setGeometry(100, 100, 1400, 900)
|
|
self.setMinimumSize(1200, 800) # Set minimum size to prevent geometry issues
|
|
|
|
# Set clean, minimal styling
|
|
self.setStyleSheet("""
|
|
QMainWindow {
|
|
background: #1e293b;
|
|
}
|
|
QTabWidget::pane {
|
|
border: none;
|
|
background: white;
|
|
border-radius: 8px;
|
|
margin: 8px 8px 8px 8px;
|
|
padding-top: 8px;
|
|
}
|
|
QTabBar::tab {
|
|
background: #475569;
|
|
color: #e2e8f0;
|
|
padding: 12px 20px;
|
|
margin-right: 4px;
|
|
border-top-left-radius: 8px;
|
|
border-top-right-radius: 8px;
|
|
font-weight: 600;
|
|
font-size: 14px;
|
|
min-width: 120px;
|
|
margin-bottom: 8px;
|
|
}
|
|
QTabBar::tab:selected {
|
|
background: white;
|
|
color: #1e293b;
|
|
border-bottom: none;
|
|
margin-bottom: 8px;
|
|
}
|
|
QTabBar::tab:hover:!selected {
|
|
background: #64748b;
|
|
color: #f1f5f9;
|
|
}
|
|
QTabBar::tab:first {
|
|
margin-left: 8px;
|
|
}
|
|
QTabBar::tab:last {
|
|
margin-right: 8px;
|
|
}
|
|
QGroupBox {
|
|
font-weight: 600;
|
|
font-size: 14px;
|
|
border: 2px solid #e2e8f0;
|
|
border-radius: 8px;
|
|
margin-top: 16px;
|
|
padding-top: 16px;
|
|
background: #f8fafc;
|
|
}
|
|
QGroupBox::title {
|
|
subcontrol-origin: margin;
|
|
left: 16px;
|
|
|
|
color: #1e293b;
|
|
background: #f8fafc;
|
|
}
|
|
QTableWidget {
|
|
border: 2px solid #e2e8f0;
|
|
border-radius: 8px;
|
|
background: white;
|
|
gridline-color: #f1f5f9;
|
|
alternate-background-color: #f8fafc;
|
|
selection-background-color: #dbeafe;
|
|
selection-color: #1e293b;
|
|
}
|
|
QTableWidget::item {
|
|
padding: 8px 4px;
|
|
border: none;
|
|
min-height: 20px;
|
|
}
|
|
QTableWidget::item:selected {
|
|
background: #dbeafe;
|
|
color: #1e293b;
|
|
}
|
|
QHeaderView::section {
|
|
background: #e2e8f0;
|
|
padding: 12px 8px;
|
|
border: none;
|
|
border-right: 1px solid #cbd5e1;
|
|
border-bottom: 1px solid #cbd5e1;
|
|
font-weight: 600;
|
|
color: #1e293b;
|
|
}
|
|
QHeaderView::section:vertical {
|
|
background: #f8fafc;
|
|
padding: 8px 4px;
|
|
border: none;
|
|
border-bottom: 1px solid #e2e8f0;
|
|
font-weight: 500;
|
|
color: #64748b;
|
|
min-width: 40px;
|
|
}
|
|
QProgressBar {
|
|
border: 2px solid #e2e8f0;
|
|
border-radius: 6px;
|
|
text-align: center;
|
|
background: #f1f5f9;
|
|
}
|
|
QProgressBar::chunk {
|
|
background: #3b82f6;
|
|
border-radius: 5px;
|
|
}
|
|
QTextEdit {
|
|
border: 2px solid #e2e8f0;
|
|
border-radius: 6px;
|
|
padding: 12px;
|
|
background: white;
|
|
color: #1e293b;
|
|
font-family: 'Segoe UI', sans-serif;
|
|
}
|
|
QComboBox {
|
|
border: 2px solid #d1d5db;
|
|
border-radius: 6px;
|
|
padding: 8px 12px;
|
|
background: white;
|
|
color: #1e293b;
|
|
font-size: 14px;
|
|
min-height: 40px;
|
|
}
|
|
QComboBox:focus {
|
|
border-color: #3b82f6;
|
|
}
|
|
QComboBox::drop-down {
|
|
border: none;
|
|
width: 30px;
|
|
}
|
|
QComboBox::down-arrow {
|
|
image: none;
|
|
border-left: 5px solid transparent;
|
|
border-right: 5px solid transparent;
|
|
border-top: 5px solid #6b7280;
|
|
margin-right: 10px;
|
|
}
|
|
QLabel {
|
|
color: #1e293b;
|
|
font-weight: 500;
|
|
font-size: 14px;
|
|
}
|
|
""")
|
|
|
|
# System tray icon for notifications
|
|
|
|
self.tray_icon = QSystemTrayIcon(self)
|
|
self.tray_icon.setIcon(self.style().standardIcon(QStyle.StandardPixmap.SP_ComputerIcon))
|
|
self.tray_icon.setVisible(True)
|
|
|
|
# Create central widget and main layout
|
|
central_widget = QWidget()
|
|
self.setCentralWidget(central_widget)
|
|
main_layout = QVBoxLayout(central_widget)
|
|
main_layout.setContentsMargins(16, 16, 16, 16)
|
|
main_layout.setSpacing(12)
|
|
|
|
# Create header
|
|
header = self.create_header()
|
|
main_layout.addWidget(header)
|
|
|
|
# Add proper spacing after header
|
|
spacer = QWidget()
|
|
spacer.setFixedHeight(12)
|
|
main_layout.addWidget(spacer)
|
|
|
|
# Create tab widget with proper margins
|
|
self.tab_widget = QTabWidget()
|
|
self.tab_widget.setStyleSheet("""
|
|
QTabWidget {
|
|
margin-top: 0px;
|
|
background: transparent;
|
|
}
|
|
QTabWidget::pane {
|
|
border: none;
|
|
background: white;
|
|
border-radius: 8px;
|
|
margin: 4px 8px 8px 8px;
|
|
padding-top: 4px;
|
|
}
|
|
QTabBar {
|
|
background: transparent;
|
|
spacing: 0px;
|
|
}
|
|
QTabBar::tab {
|
|
background: #475569;
|
|
color: #e2e8f0;
|
|
padding: 12px 20px;
|
|
margin-right: 4px;
|
|
border-top-left-radius: 8px;
|
|
border-top-right-radius: 8px;
|
|
font-weight: 600;
|
|
font-size: 14px;
|
|
min-width: 120px;
|
|
margin-bottom: 4px;
|
|
}
|
|
QTabBar::tab:selected {
|
|
background: white;
|
|
color: #1e293b;
|
|
border-bottom: none;
|
|
margin-bottom: 4px;
|
|
}
|
|
QTabBar::tab:hover:!selected {
|
|
background: #64748b;
|
|
color: #f1f5f9;
|
|
}
|
|
QTabBar::tab:first {
|
|
margin-left: 8px;
|
|
}
|
|
QTabBar::tab:last {
|
|
margin-right: 8px;
|
|
}
|
|
""")
|
|
main_layout.addWidget(self.tab_widget)
|
|
|
|
# Create tabs
|
|
self.create_scraping_tab()
|
|
self.create_data_tab()
|
|
self.create_analysis_tab()
|
|
self.create_sitemap_tab()
|
|
self.create_ai_tab()
|
|
|
|
def create_header(self):
|
|
"""Create a clean header with help button only (no theme toggle)"""
|
|
header_widget = QWidget()
|
|
header_widget.setStyleSheet("""
|
|
QWidget {
|
|
background: #0f172a;
|
|
border-radius: 12px;
|
|
margin: 4px 4px 8px 4px;
|
|
}
|
|
""")
|
|
header_layout = QHBoxLayout(header_widget)
|
|
header_layout.setContentsMargins(24, 20, 24, 20)
|
|
header_layout.setSpacing(16)
|
|
|
|
# Title
|
|
title_label = QLabel("Web Scraper & Data Analyzer")
|
|
title_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #f8fafc;
|
|
font-size: 28px;
|
|
font-weight: 800;
|
|
font-family: 'Segoe UI', sans-serif;
|
|
}
|
|
""")
|
|
|
|
# Subtitle
|
|
subtitle_label = QLabel("Modern web scraping with intelligent data analysis")
|
|
subtitle_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #cbd5e1;
|
|
font-size: 16px;
|
|
font-weight: 500;
|
|
font-family: 'Segoe UI', sans-serif;
|
|
}
|
|
""")
|
|
|
|
# Help button
|
|
help_button = ModernButton("Help")
|
|
help_button.clicked.connect(self.show_help)
|
|
|
|
# Right side info
|
|
info_widget = QWidget()
|
|
info_layout = QVBoxLayout(info_widget)
|
|
info_layout.setAlignment(Qt.AlignmentFlag.AlignRight)
|
|
info_layout.setSpacing(4)
|
|
|
|
version_label = QLabel("v2.0")
|
|
version_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #94a3b8;
|
|
font-size: 14px;
|
|
font-weight: 600;
|
|
background: #1e293b;
|
|
padding: 6px 12px;
|
|
border-radius: 6px;
|
|
border: 1px solid #334155;
|
|
}
|
|
""")
|
|
|
|
info_layout.addWidget(version_label)
|
|
|
|
header_layout.addWidget(title_label)
|
|
header_layout.addStretch()
|
|
header_layout.addWidget(subtitle_label)
|
|
header_layout.addStretch()
|
|
header_layout.addWidget(help_button)
|
|
header_layout.addWidget(info_widget)
|
|
|
|
return header_widget
|
|
|
|
def create_scraping_tab(self):
|
|
"""Create the web scraping configuration tab"""
|
|
scraping_widget = QWidget()
|
|
main_layout = QVBoxLayout(scraping_widget)
|
|
main_layout.setContentsMargins(16, 16, 16, 16)
|
|
main_layout.setSpacing(16)
|
|
|
|
# Create scroll area
|
|
scroll_area = QScrollArea()
|
|
scroll_area.setWidgetResizable(True)
|
|
scroll_area.setStyleSheet("QScrollArea { border: none; }")
|
|
scroll_area.setHorizontalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAsNeeded)
|
|
scroll_area.setVerticalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAsNeeded)
|
|
|
|
# Create content widget for scrolling
|
|
content_widget = QWidget()
|
|
layout = QVBoxLayout(content_widget)
|
|
layout.setSpacing(16)
|
|
layout.setContentsMargins(0, 0, 0, 0)
|
|
|
|
# Input group
|
|
input_group = QGroupBox("Scraping Configuration")
|
|
input_layout = QGridLayout(input_group)
|
|
input_layout.setSpacing(12)
|
|
|
|
# URL input
|
|
input_layout.addWidget(QLabel("Website URL:"), 0, 0)
|
|
self.url_input = ModernLineEdit("https://example.com")
|
|
input_layout.addWidget(self.url_input, 0, 1)
|
|
|
|
# Depth input
|
|
input_layout.addWidget(QLabel("Max Depth (1-100):"), 1, 0)
|
|
self.depth_input = ModernSpinBox()
|
|
self.depth_input.setRange(1, 100)
|
|
self.depth_input.setValue(3)
|
|
input_layout.addWidget(self.depth_input, 1, 1)
|
|
|
|
# Control buttons
|
|
button_layout = QHBoxLayout()
|
|
button_layout.setSpacing(8)
|
|
|
|
self.start_button = ModernButton("Start Scraping", primary=True)
|
|
self.start_button.clicked.connect(self.start_scraping)
|
|
button_layout.addWidget(self.start_button)
|
|
|
|
self.stop_button = ModernButton("Stop")
|
|
self.stop_button.clicked.connect(self.stop_scraping)
|
|
self.stop_button.setEnabled(False)
|
|
button_layout.addWidget(self.stop_button)
|
|
|
|
input_layout.addLayout(button_layout, 2, 0, 1, 2)
|
|
layout.addWidget(input_group)
|
|
|
|
# Progress group
|
|
progress_group = QGroupBox("Progress")
|
|
progress_layout = QVBoxLayout(progress_group)
|
|
progress_layout.setSpacing(8)
|
|
|
|
self.progress_bar = QProgressBar()
|
|
self.progress_bar.setVisible(False)
|
|
self.progress_bar.setMinimumHeight(20)
|
|
progress_layout.addWidget(self.progress_bar)
|
|
|
|
self.status_label = QLabel("Ready to start scraping...")
|
|
self.status_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #374151;
|
|
font-size: 14px;
|
|
padding: 8px;
|
|
background: #f8fafc;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #3b82f6;
|
|
}
|
|
""")
|
|
self.status_label.setWordWrap(True) # Enable word wrapping
|
|
progress_layout.addWidget(self.status_label)
|
|
|
|
layout.addWidget(progress_group)
|
|
|
|
# Results preview
|
|
results_group = QGroupBox("Scraping Results")
|
|
results_layout = QVBoxLayout(results_group)
|
|
|
|
self.results_text = QTextEdit()
|
|
self.results_text.setReadOnly(True)
|
|
self.results_text.setMinimumHeight(80) # Reduced minimum height for more compact output
|
|
results_layout.addWidget(self.results_text)
|
|
|
|
layout.addWidget(results_group)
|
|
|
|
# Set the content widget in the scroll area
|
|
scroll_area.setWidget(content_widget)
|
|
main_layout.addWidget(scroll_area)
|
|
|
|
self.tab_widget.addTab(scraping_widget, "Web Scraping")
|
|
|
|
def create_data_tab(self):
|
|
"""Create the data viewing and filtering tab"""
|
|
data_widget = QWidget()
|
|
layout = QVBoxLayout(data_widget)
|
|
layout.setSpacing(16)
|
|
|
|
# Search and filter controls
|
|
controls_group = QGroupBox("Search & Filter")
|
|
controls_layout = QHBoxLayout(controls_group)
|
|
controls_layout.setSpacing(12)
|
|
|
|
controls_layout.addWidget(QLabel("Search:"))
|
|
self.search_input = ModernLineEdit("Enter search term...")
|
|
self.search_input.textChanged.connect(self.filter_data)
|
|
controls_layout.addWidget(self.search_input)
|
|
|
|
controls_layout.addWidget(QLabel("Domain:"))
|
|
self.domain_filter = QComboBox()
|
|
self.domain_filter.currentTextChanged.connect(self.filter_data)
|
|
controls_layout.addWidget(self.domain_filter)
|
|
|
|
self.export_button = ModernButton("Export Data")
|
|
self.export_button.clicked.connect(self.export_data)
|
|
controls_layout.addWidget(self.export_button)
|
|
|
|
# Sitemap button
|
|
self.sitemap_button = ModernButton("Generate Sitemap.xml")
|
|
self.sitemap_button.clicked.connect(self.generate_sitemap)
|
|
controls_layout.addWidget(self.sitemap_button)
|
|
|
|
layout.addWidget(controls_group)
|
|
|
|
# Data table
|
|
self.data_table = QTableWidget()
|
|
self.data_table.setColumnCount(6)
|
|
self.data_table.setHorizontalHeaderLabels([
|
|
"Title", "URL", "Depth", "Links", "Words", "Load Time"
|
|
])
|
|
|
|
# Set table properties to fill available width
|
|
header = self.data_table.horizontalHeader()
|
|
header.setStretchLastSection(False) # Don't stretch the last section
|
|
|
|
# Set resize modes to make table fill width properly
|
|
header.setSectionResizeMode(0, QHeaderView.Stretch) # Title - stretch to fill
|
|
header.setSectionResizeMode(1, QHeaderView.Stretch) # URL - stretch to fill
|
|
header.setSectionResizeMode(2, QHeaderView.Fixed) # Depth - fixed
|
|
header.setSectionResizeMode(3, QHeaderView.Fixed) # Links - fixed
|
|
header.setSectionResizeMode(4, QHeaderView.Fixed) # Words - fixed
|
|
header.setSectionResizeMode(5, QHeaderView.Fixed) # Load Time - fixed
|
|
|
|
# Set fixed column widths for non-stretching columns
|
|
self.data_table.setColumnWidth(2, 80) # Depth
|
|
self.data_table.setColumnWidth(3, 80) # Links
|
|
self.data_table.setColumnWidth(4, 80) # Words
|
|
self.data_table.setColumnWidth(5, 100) # Load Time
|
|
|
|
# Set row height to prevent index cutoff
|
|
self.data_table.verticalHeader().setDefaultSectionSize(40) # Increased row height
|
|
self.data_table.verticalHeader().setMinimumSectionSize(35) # Minimum row height
|
|
|
|
# Enable word wrapping for title and URL columns
|
|
self.data_table.setWordWrap(True)
|
|
|
|
# Connect double-click signal
|
|
self.data_table.cellDoubleClicked.connect(self.show_content_preview)
|
|
|
|
layout.addWidget(self.data_table)
|
|
|
|
self.tab_widget.addTab(data_widget, "Data View")
|
|
|
|
def create_analysis_tab(self):
|
|
"""Create the data analysis tab"""
|
|
analysis_widget = QWidget()
|
|
layout = QVBoxLayout(analysis_widget)
|
|
layout.setSpacing(16)
|
|
|
|
# Create scroll area for better layout
|
|
scroll_area = QScrollArea()
|
|
scroll_area.setWidgetResizable(True)
|
|
scroll_area.setStyleSheet("QScrollArea { border: none; }")
|
|
|
|
content_widget = QWidget()
|
|
content_layout = QVBoxLayout(content_widget)
|
|
content_layout.setSpacing(16)
|
|
|
|
# Statistics group
|
|
stats_group = QGroupBox("Statistics")
|
|
stats_layout = QGridLayout(stats_group)
|
|
stats_layout.setSpacing(12)
|
|
|
|
self.stats_labels = {}
|
|
stats_fields = [
|
|
("Total Pages", "Total Pages"),
|
|
("Total Links", "Total Links"),
|
|
("Total Words", "Total Words"),
|
|
("Average Load Time", "Average Load Time"),
|
|
("Max Depth Reached", "Max Depth Reached")
|
|
]
|
|
|
|
for i, (label_text, field) in enumerate(stats_fields):
|
|
stats_layout.addWidget(QLabel(f"{label_text}:"), i, 0)
|
|
label = QLabel("0")
|
|
label.setStyleSheet("""
|
|
QLabel {
|
|
font-weight: 700;
|
|
color: #3b82f6;
|
|
font-size: 16px;
|
|
padding: 8px 12px;
|
|
background: #eff6ff;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #3b82f6;
|
|
}
|
|
""")
|
|
self.stats_labels[field] = label
|
|
stats_layout.addWidget(label, i, 1)
|
|
|
|
content_layout.addWidget(stats_group)
|
|
|
|
# Domain breakdown
|
|
domain_group = QGroupBox("Domain Breakdown")
|
|
domain_layout = QVBoxLayout(domain_group)
|
|
|
|
self.domain_text = QTextEdit()
|
|
self.domain_text.setReadOnly(True)
|
|
self.domain_text.setMaximumHeight(150)
|
|
domain_layout.addWidget(self.domain_text)
|
|
|
|
content_layout.addWidget(domain_group)
|
|
|
|
# Content preview
|
|
content_preview_group = QGroupBox("Content Preview")
|
|
content_preview_layout = QVBoxLayout(content_preview_group)
|
|
|
|
# Create splitter for text and visual preview
|
|
preview_splitter = QSplitter(Qt.Orientation.Horizontal)
|
|
|
|
# Text preview
|
|
text_preview_widget = QWidget()
|
|
text_preview_layout = QVBoxLayout(text_preview_widget)
|
|
text_preview_layout.setContentsMargins(0, 0, 0, 0)
|
|
|
|
text_label = QLabel("Text Content:")
|
|
text_label.setStyleSheet("font-weight: 600; margin-bottom: 8px;")
|
|
text_preview_layout.addWidget(text_label)
|
|
|
|
self.content_text = QTextEdit()
|
|
self.content_text.setReadOnly(True)
|
|
self.content_text.setMaximumHeight(400)
|
|
self.content_text.setFont(QFont("Segoe UI", 12))
|
|
self.content_text.setStyleSheet("""
|
|
QTextEdit {
|
|
font-size: 12px;
|
|
line-height: 1.4;
|
|
padding: 16px;
|
|
}
|
|
""")
|
|
text_preview_layout.addWidget(self.content_text)
|
|
|
|
# Visual HTML preview
|
|
visual_preview_widget = QWidget()
|
|
visual_preview_layout = QVBoxLayout(visual_preview_widget)
|
|
visual_preview_layout.setContentsMargins(0, 0, 0, 0)
|
|
|
|
visual_label = QLabel("Visual Preview:")
|
|
visual_label.setStyleSheet("font-weight: 600; margin-bottom: 8px;")
|
|
visual_preview_layout.addWidget(visual_label)
|
|
|
|
if WEB_ENGINE_AVAILABLE:
|
|
self.web_view = QWebEngineView()
|
|
self.web_view.setMinimumHeight(400)
|
|
self.web_view.setMaximumHeight(400)
|
|
visual_preview_layout.addWidget(self.web_view)
|
|
else:
|
|
self.web_view = QLabel("Visual preview not available\nInstall PyQtWebEngine for HTML rendering")
|
|
self.web_view.setStyleSheet("color: #6b7280; padding: 20px; text-align: center;")
|
|
self.web_view.setMinimumHeight(400)
|
|
self.web_view.setMaximumHeight(400)
|
|
visual_preview_layout.addWidget(self.web_view)
|
|
|
|
# Add widgets to splitter
|
|
preview_splitter.addWidget(text_preview_widget)
|
|
preview_splitter.addWidget(visual_preview_widget)
|
|
preview_splitter.setSizes([400, 600]) # Set initial split ratio
|
|
|
|
content_preview_layout.addWidget(preview_splitter)
|
|
|
|
content_layout.addWidget(content_preview_group)
|
|
|
|
scroll_area.setWidget(content_widget)
|
|
layout.addWidget(scroll_area)
|
|
|
|
self.tab_widget.addTab(analysis_widget, "Analysis")
|
|
|
|
def create_sitemap_tab(self):
|
|
"""Create the visual sitemap tab with a tree widget and export button"""
|
|
sitemap_widget = QWidget()
|
|
layout = QVBoxLayout(sitemap_widget)
|
|
layout.setSpacing(16)
|
|
|
|
# Export button
|
|
self.export_sitemap_button = ModernButton("Export Sitemap (JSON)")
|
|
self.export_sitemap_button.clicked.connect(self.export_sitemap_json)
|
|
layout.addWidget(self.export_sitemap_button)
|
|
|
|
self.sitemap_tree = QTreeWidget()
|
|
self.sitemap_tree.setHeaderLabels(["Page Title", "URL"])
|
|
self.sitemap_tree.setColumnWidth(0, 350)
|
|
self.sitemap_tree.setColumnWidth(1, 600)
|
|
self.sitemap_tree.itemDoubleClicked.connect(self.open_url_in_browser)
|
|
layout.addWidget(self.sitemap_tree)
|
|
|
|
self.tab_widget.addTab(sitemap_widget, "Sitemap")
|
|
|
|
def create_ai_tab(self):
|
|
"""Create a simplified, modern AI Analysis tab with a chat interface and compact quick actions, using more curves to match the app style."""
|
|
ai_widget = QWidget()
|
|
layout = QVBoxLayout(ai_widget)
|
|
layout.setSpacing(8)
|
|
layout.setContentsMargins(16, 16, 16, 16)
|
|
|
|
hint_label = QLabel("💡 Ask questions about your scraped websites below.")
|
|
hint_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #64748b;
|
|
font-size: 13px;
|
|
padding: 4px 0 8px 0;
|
|
}
|
|
""")
|
|
layout.addWidget(hint_label)
|
|
|
|
# --- Chat area ---
|
|
self.ai_chat_history = QListWidget()
|
|
self.ai_chat_history.setStyleSheet("""
|
|
QListWidget {
|
|
background: #f8fafc;
|
|
border: 1.5px solid #e2e8f0;
|
|
border-radius: 22px;
|
|
font-size: 15px;
|
|
color: #1e293b;
|
|
padding: 12px;
|
|
font-family: 'Segoe UI', sans-serif;
|
|
}
|
|
""")
|
|
self.ai_chat_history.setSpacing(6)
|
|
self.ai_chat_history.setMinimumHeight(300)
|
|
self.ai_chat_history.setResizeMode(QListWidget.Adjust)
|
|
self.ai_chat_history.setVerticalScrollMode(QAbstractItemView.ScrollPerPixel)
|
|
layout.addWidget(self.ai_chat_history, stretch=1)
|
|
self.chat_messages = [] # Store (role, message, timestamp) tuples
|
|
self.render_chat_history()
|
|
|
|
# --- Quick action buttons ---
|
|
quick_actions_widget = QWidget()
|
|
quick_actions_layout = QHBoxLayout(quick_actions_widget)
|
|
quick_actions_layout.setSpacing(8)
|
|
quick_actions_layout.setContentsMargins(0, 0, 0, 0)
|
|
quick_questions = [
|
|
"Analyze the website structure",
|
|
"Find key content themes",
|
|
"Suggest SEO improvements",
|
|
"Compare page performance"
|
|
]
|
|
for question in quick_questions:
|
|
quick_btn = QPushButton(question)
|
|
quick_btn.setFont(QFont("Segoe UI", 10))
|
|
quick_btn.setCursor(Qt.CursorShape.PointingHandCursor)
|
|
quick_btn.clicked.connect(lambda _, q=question: self.quick_question(q))
|
|
quick_btn.setStyleSheet("""
|
|
QPushButton {
|
|
background: #e0e7ef;
|
|
border: none;
|
|
color: #374151;
|
|
padding: 8px 22px;
|
|
border-radius: 22px;
|
|
font-weight: 500;
|
|
font-size: 13px;
|
|
box-shadow: 0 2px 8px rgba(59, 130, 246, 0.04);
|
|
}
|
|
QPushButton:hover {
|
|
background: #3b82f6;
|
|
color: white;
|
|
}
|
|
QPushButton:pressed {
|
|
background: #2563eb;
|
|
color: white;
|
|
}
|
|
""")
|
|
quick_actions_layout.addWidget(quick_btn)
|
|
layout.addWidget(quick_actions_widget)
|
|
|
|
# --- Input area ---
|
|
input_container = QWidget()
|
|
input_layout = QHBoxLayout(input_container)
|
|
input_layout.setContentsMargins(0, 0, 0, 0)
|
|
input_layout.setSpacing(8)
|
|
self.ai_input = QLineEdit()
|
|
self.ai_input.setPlaceholderText("Type your question and press Enter...")
|
|
self.ai_input.setMinimumHeight(44)
|
|
self.ai_input.setFont(QFont("Segoe UI", 12))
|
|
self.ai_input.returnPressed.connect(self.send_ai_message)
|
|
self.ai_input.setStyleSheet("""
|
|
QLineEdit {
|
|
border: 1.5px solid #e2e8f0;
|
|
border-radius: 22px;
|
|
padding: 10px 20px;
|
|
background: white;
|
|
color: #1e293b;
|
|
font-size: 14px;
|
|
}
|
|
QLineEdit:focus {
|
|
border-color: #3b82f6;
|
|
outline: none;
|
|
}
|
|
QLineEdit::placeholder {
|
|
color: #9ca3af;
|
|
}
|
|
""")
|
|
self.ai_send_button = QPushButton("Send")
|
|
self.ai_send_button.setMinimumHeight(44)
|
|
self.ai_send_button.setMinimumWidth(80)
|
|
self.ai_send_button.setFont(QFont("Segoe UI", 12, QFont.Weight.Medium))
|
|
self.ai_send_button.setCursor(Qt.CursorShape.PointingHandCursor)
|
|
self.ai_send_button.clicked.connect(self.send_ai_message)
|
|
self.ai_send_button.setStyleSheet("""
|
|
QPushButton {
|
|
background: #3b82f6;
|
|
border: none;
|
|
color: white;
|
|
padding: 10px 28px;
|
|
border-radius: 22px;
|
|
font-weight: 600;
|
|
font-size: 15px;
|
|
box-shadow: 0 2px 8px rgba(59, 130, 246, 0.08);
|
|
}
|
|
QPushButton:hover {
|
|
background: #2563eb;
|
|
}
|
|
QPushButton:pressed {
|
|
background: #1d4ed8;
|
|
}
|
|
QPushButton:disabled {
|
|
background: #9ca3af;
|
|
color: #f3f4f6;
|
|
}
|
|
""")
|
|
input_layout.addWidget(self.ai_input, stretch=1)
|
|
input_layout.addWidget(self.ai_send_button)
|
|
layout.addWidget(input_container)
|
|
|
|
self.tab_widget.addTab(ai_widget, "AI Analysis")
|
|
ai_tab_index = self.tab_widget.count() - 1
|
|
self.set_ai_tab_gradient(ai_tab_index)
|
|
|
|
def render_chat_history(self):
|
|
self.ai_chat_history.clear()
|
|
for role, msg, timestamp in self.chat_messages:
|
|
item = QListWidgetItem()
|
|
bubble = ChatBubbleWidget(msg, timestamp, role)
|
|
bubble.adjustSize()
|
|
item.setSizeHint(bubble.sizeHint())
|
|
self.ai_chat_history.addItem(item)
|
|
self.ai_chat_history.setItemWidget(item, bubble)
|
|
self.ai_chat_history.scrollToBottom()
|
|
|
|
def send_ai_message(self):
|
|
user_msg = self.ai_input.text().strip()
|
|
if not user_msg:
|
|
return
|
|
timestamp = datetime.now().strftime("%H:%M")
|
|
self.chat_messages.append(("user", user_msg, timestamp))
|
|
self.render_chat_history()
|
|
self.ai_input.clear()
|
|
# Show thinking indicator as AI message
|
|
self.chat_messages.append(("ai", "<i>🤔 Analyzing your question...</i>", timestamp))
|
|
self.render_chat_history()
|
|
ai_context = self.get_ai_context(user_msg)
|
|
QTimer.singleShot(100, lambda: self._do_ai_response_openrouter(user_msg, ai_context))
|
|
|
|
def _do_ai_response_openrouter(self, user_msg, ai_context):
|
|
if OPENAI_AVAILABLE:
|
|
try:
|
|
client = OpenAI(
|
|
base_url="https://openrouter.ai/api/v1",
|
|
api_key=os.environ.get("OPENROUTER_API_KEY"),
|
|
)
|
|
system_prompt = """You are an expert website analyst and AI assistant specializing in web scraping analysis. Your role is to:\n\n1. **Analyze website content** - Provide insights about the scraped websites\n2. **Identify patterns** - Find common themes, structures, and content types\n3. **Offer recommendations** - Suggest improvements for SEO, content, or structure\n4. **Answer questions** - Respond to specific queries about the websites\n5. **Provide actionable insights** - Give practical advice based on the data\n\n**Response Guidelines:**\n- Be professional yet conversational\n- Use clear, structured responses with bullet points when appropriate\n- Reference specific websites by title when relevant\n- Provide specific examples from the content\n- Suggest actionable next steps when possible\n- Use markdown formatting for better readability\n\n**Context:** You have access to scraped website data including titles, URLs, content previews, and metadata."""
|
|
user_prompt = f"""# Website Analysis Request\n\n## User Question\n{user_msg}\n\n## Available Website Data\n{ai_context}\n\n## Instructions\nPlease provide a comprehensive analysis based on the user's question. Use the website data above to support your response. If the question is about specific aspects (SEO, content, structure, etc.), focus your analysis accordingly.\n\n**Format your response with:**\n- Clear headings and structure\n- Specific examples from the websites\n- Actionable insights and recommendations\n- Professional, helpful tone"""
|
|
completion = client.chat.completions.create(
|
|
extra_headers={
|
|
"HTTP-Referer": "http://localhost:8000",
|
|
"X-Title": "Web Scraper & Data Analyzer - AI Analysis",
|
|
},
|
|
extra_body={},
|
|
model="deepseek/deepseek-r1-0528-qwen3-8b:free",
|
|
messages=[
|
|
{"role": "system", "content": system_prompt},
|
|
{"role": "user", "content": user_prompt}
|
|
],
|
|
temperature=0.7,
|
|
max_tokens=2000
|
|
)
|
|
try:
|
|
answer = completion.choices[0].message.content
|
|
if answer is not None:
|
|
answer = answer.strip()
|
|
else:
|
|
answer = "❌ **AI Analysis Error**\n\nNo response content received from the AI model."
|
|
except (AttributeError, IndexError, KeyError):
|
|
answer = "❌ **AI Analysis Error**\n\nUnexpected response format from the AI model."
|
|
if hasattr(self, "ai_stats_label"):
|
|
self.ai_stats_label.setText(f"Analyzed {len(self.websites)} websites")
|
|
except Exception as e:
|
|
answer = f"❌ **AI Analysis Error**\n\nI encountered an error while analyzing your request: `{str(e)}`\n\nPlease try again or check your internet connection."
|
|
else:
|
|
if ai_context == "No data available. Please scrape some websites first.":
|
|
answer = "📊 **No Data Available**\n\nPlease scrape some websites first to enable AI analysis."
|
|
else:
|
|
answer = f"🤖 **AI Analysis Preview**\n\nI have analyzed {len(self.websites)} websites. Your question: '{user_msg}'\n\n*(This is a placeholder response. Install the 'openai' package for real AI analysis.)*"
|
|
# Remove the last AI thinking message
|
|
if self.chat_messages and self.chat_messages[-1][1].startswith("<i>🤔"):
|
|
self.chat_messages.pop()
|
|
timestamp = datetime.now().strftime("%H:%M")
|
|
self.chat_messages.append(("ai", answer, timestamp))
|
|
self.render_chat_history()
|
|
|
|
def open_url_in_browser(self, item, column):
|
|
url = item.data(1, Qt.ItemDataRole.DisplayRole)
|
|
if url:
|
|
webbrowser.open(url)
|
|
|
|
def get_icon(self, is_root=False):
|
|
|
|
if is_root:
|
|
return self.style().standardIcon(QStyle.StandardPixmap.SP_DesktopIcon)
|
|
else:
|
|
return self.style().standardIcon(QStyle.StandardPixmap.SP_DirIcon)
|
|
"""Build and display the sitemap tree from crawled data, with icons and tooltips"""
|
|
self.sitemap_tree.clear()
|
|
if not self.websites:
|
|
return
|
|
url_to_website = {w.url: w for w in self.websites}
|
|
children_map = {w.url: [] for w in self.websites}
|
|
for w in self.websites:
|
|
for link in w.links:
|
|
if link in url_to_website:
|
|
children_map[w.url].append(link)
|
|
root_url = self.websites[0].url
|
|
def add_items(parent_item, url, visited, depth):
|
|
if url in visited:
|
|
return
|
|
visited.add(url)
|
|
website = url_to_website[url]
|
|
item = QTreeWidgetItem([website.title, website.url])
|
|
item.setIcon(0, self.get_icon(is_root=False))
|
|
tooltip = f"<b>Title:</b> {website.title}<br>"
|
|
tooltip += f"<b>URL:</b> {website.url}<br>"
|
|
tooltip += f"<b>Depth:</b> {website.depth}<br>"
|
|
tooltip += f"<b>Outgoing Links:</b> {len(website.links)}"
|
|
item.setToolTip(0, tooltip)
|
|
item.setToolTip(1, tooltip)
|
|
parent_item.addChild(item)
|
|
for child_url in children_map[url]:
|
|
add_items(item, child_url, visited, depth+1)
|
|
root_website = url_to_website[root_url]
|
|
root_item = QTreeWidgetItem([root_website.title, root_website.url])
|
|
root_item.setIcon(0, self.get_icon(is_root=True))
|
|
tooltip = f"<b>Title:</b> {root_website.title}<br>"
|
|
tooltip += f"<b>URL:</b> {root_website.url}<br>"
|
|
tooltip += f"<b>Depth:</b> {root_website.depth}<br>"
|
|
tooltip += f"<b>Outgoing Links:</b> {len(root_website.links)}"
|
|
root_item.setToolTip(0, tooltip)
|
|
root_item.setToolTip(1, tooltip)
|
|
self.sitemap_tree.addTopLevelItem(root_item)
|
|
visited = set([root_url])
|
|
for child_url in children_map[root_url]:
|
|
add_items(root_item, child_url, visited, 1)
|
|
self.sitemap_tree.expandToDepth(1)
|
|
|
|
def export_sitemap_json(self):
|
|
"""Export the sitemap tree as a JSON file (preserving hierarchy)"""
|
|
if not self.websites:
|
|
QMessageBox.warning(self, "Error", "No sitemap data to export.")
|
|
return
|
|
def build_tree(item):
|
|
data = {
|
|
'title': item.text(0),
|
|
'url': item.text(1),
|
|
'children': [build_tree(item.child(i)) for i in range(item.childCount())]
|
|
}
|
|
return data
|
|
root = self.sitemap_tree.topLevelItem(0)
|
|
if not root:
|
|
QMessageBox.warning(self, "Error", "No sitemap data to export.")
|
|
return
|
|
sitemap_data = build_tree(root)
|
|
try:
|
|
with open('sitemap_tree.json', 'w', encoding='utf-8') as f:
|
|
json.dump(sitemap_data, f, indent=2, ensure_ascii=False)
|
|
QMessageBox.information(self, "Success", "Sitemap exported to 'sitemap_tree.json'")
|
|
except Exception as e:
|
|
QMessageBox.critical(self, "Error", f"Failed to export sitemap: {e}")
|
|
|
|
def is_valid_url(self, url):
|
|
"""Check if the URL is valid (basic check for scheme and domain)"""
|
|
try:
|
|
parsed = urlparse(url)
|
|
return all([parsed.scheme in ("http", "https"), parsed.netloc])
|
|
except Exception:
|
|
return False
|
|
|
|
def start_scraping(self):
|
|
"""Start the web scraping process"""
|
|
url = self.url_input.text().strip()
|
|
if not url:
|
|
QMessageBox.warning(self, "Error", "Please enter a valid URL")
|
|
return
|
|
|
|
if not url.startswith(('http://', 'https://')):
|
|
url = 'https://' + url
|
|
|
|
# Validate URL format
|
|
if not self.is_valid_url(url):
|
|
QMessageBox.warning(self, "Invalid URL", "Please enter a valid website URL (e.g. https://example.com)")
|
|
return
|
|
|
|
max_depth = self.depth_input.value()
|
|
|
|
# Update UI
|
|
self.start_button.setEnabled(False)
|
|
self.stop_button.setEnabled(True)
|
|
self.progress_bar.setVisible(True)
|
|
self.progress_bar.setRange(0, 0) # Indeterminate progress
|
|
self.status_label.setText("Scraping in progress...")
|
|
self.status_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #1e40af;
|
|
font-size: 14px;
|
|
padding: 8px;
|
|
background: #eff6ff;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #3b82f6;
|
|
}
|
|
""")
|
|
|
|
# Start scraping thread
|
|
self.scraping_thread = ScrapingThread(url, max_depth)
|
|
self.scraping_thread.progress_updated.connect(self.update_progress)
|
|
self.scraping_thread.scraping_complete.connect(self.scraping_finished)
|
|
self.scraping_thread.error_occurred.connect(self.scraping_error)
|
|
self.scraping_thread.start()
|
|
|
|
def stop_scraping(self):
|
|
"""Stop the scraping process"""
|
|
if hasattr(self, 'scraping_thread') and self.scraping_thread.isRunning():
|
|
# Use graceful stop instead of forceful termination
|
|
self.scraping_thread.stop()
|
|
|
|
# Wait for the thread to finish gracefully (with timeout)
|
|
if not self.scraping_thread.wait(5000): # Wait up to 5 seconds
|
|
# If it doesn't stop gracefully, then force terminate
|
|
self.scraping_thread.terminate()
|
|
self.scraping_thread.wait(2000) # Wait up to 2 more seconds
|
|
|
|
self.start_button.setEnabled(True)
|
|
self.stop_button.setEnabled(False)
|
|
self.progress_bar.setVisible(False)
|
|
self.status_label.setText("Scraping stopped.")
|
|
self.status_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #92400e;
|
|
font-size: 14px;
|
|
padding: 8px;
|
|
background: #fffbeb;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #f59e0b;
|
|
}
|
|
""")
|
|
|
|
def update_progress(self, message):
|
|
"""Update progress message"""
|
|
self.status_label.setText(message)
|
|
self.results_text.append(message)
|
|
|
|
def show_help(self):
|
|
"""Show a help/info dialog with usage instructions (no theme switch info)"""
|
|
help_text = (
|
|
"<h2>Web Scraper & Data Analyzer - Help</h2>"
|
|
"<ul>"
|
|
"<li><b>Enter a valid website URL</b> and set the max depth, then click <b>Start Scraping</b>.</li>"
|
|
"<li>View and filter scraped data in the <b>Data View</b> tab.</li>"
|
|
"<li>Analyze statistics and preview content in the <b>Analysis</b> tab.</li>"
|
|
"<li>Export data to JSON or generate a <b>sitemap.xml</b> from the Data View tab.</li>"
|
|
"<li>Get desktop notifications when scraping completes or on errors.</li>"
|
|
"</ul>"
|
|
"<p>For more info, see the README or contact support.</p>"
|
|
)
|
|
QMessageBox.information(self, "Help / Info", help_text)
|
|
|
|
def scraping_finished(self, websites):
|
|
"""Handle scraping completion"""
|
|
self.websites = websites
|
|
self.scraper.websites = websites
|
|
|
|
# Update UI
|
|
self.start_button.setEnabled(True)
|
|
self.stop_button.setEnabled(False)
|
|
self.progress_bar.setVisible(False)
|
|
self.status_label.setText(f"Scraping complete! Found {len(websites)} websites.")
|
|
self.status_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #166534;
|
|
font-size: 14px;
|
|
padding: 8px;
|
|
background: #f0fdf4;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #22c55e;
|
|
}
|
|
""")
|
|
|
|
# Update data view
|
|
self.update_data_table()
|
|
self.update_analysis()
|
|
self.update_sitemap_tree()
|
|
|
|
# Switch to data tab
|
|
self.tab_widget.setCurrentIndex(1)
|
|
|
|
# Show desktop notification
|
|
self.tray_icon.showMessage(
|
|
"Web Scraper",
|
|
f"Scraping complete! Found {len(websites)} websites.",
|
|
QSystemTrayIcon.MessageIcon(1), # 1 = Information
|
|
5000
|
|
)
|
|
|
|
def scraping_error(self, error_message):
|
|
"""Handle scraping errors"""
|
|
QMessageBox.critical(self, "Error", f"Scraping failed: {error_message}")
|
|
self.start_button.setEnabled(True)
|
|
self.stop_button.setEnabled(False)
|
|
self.progress_bar.setVisible(False)
|
|
self.status_label.setText("Scraping failed.")
|
|
self.status_label.setStyleSheet("""
|
|
QLabel {
|
|
color: #991b1b;
|
|
font-size: 14px;
|
|
padding: 8px;
|
|
background: #fef2f2;
|
|
border-radius: 6px;
|
|
border-left: 3px solid #ef4444;
|
|
}
|
|
""")
|
|
|
|
# Show desktop notification
|
|
self.tray_icon.showMessage(
|
|
"Web Scraper",
|
|
f"Scraping failed: {error_message}",
|
|
QSystemTrayIcon.MessageIcon(3),
|
|
5000
|
|
)
|
|
|
|
def update_data_table(self):
|
|
"""Update the data table with scraped websites"""
|
|
self.data_table.setRowCount(len(self.websites))
|
|
for row, website in enumerate(self.websites):
|
|
self.data_table.setRowHeight(row, 40)
|
|
title_item = QTableWidgetItem(website.title)
|
|
title_item.setTextAlignment(Qt.AlignmentFlag.AlignTop | Qt.AlignmentFlag.AlignLeft)
|
|
url_item = QTableWidgetItem(website.url)
|
|
url_item.setTextAlignment(Qt.AlignmentFlag.AlignTop | Qt.AlignmentFlag.AlignLeft)
|
|
depth_item = QTableWidgetItem(str(website.depth))
|
|
depth_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
|
|
links_item = QTableWidgetItem(str(len(website.links)))
|
|
links_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
|
|
words_item = QTableWidgetItem(str(website.get_word_count()))
|
|
words_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
|
|
load_time = f"{website.load_time:.2f}s" if website.load_time else "N/A"
|
|
load_time_item = QTableWidgetItem(load_time)
|
|
load_time_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
|
|
self.data_table.setItem(row, 0, title_item)
|
|
self.data_table.setItem(row, 1, url_item)
|
|
self.data_table.setItem(row, 2, depth_item)
|
|
self.data_table.setItem(row, 3, links_item)
|
|
self.data_table.setItem(row, 4, words_item)
|
|
self.data_table.setItem(row, 5, load_time_item)
|
|
# Update domain filter
|
|
domains = list(set(w.get_normalized_domain() for w in self.websites))
|
|
self.domain_filter.clear()
|
|
self.domain_filter.addItem("All Domains")
|
|
self.domain_filter.addItems(domains)
|
|
# Update content preview with first website
|
|
if self.websites:
|
|
first_website = self.websites[0]
|
|
content_preview = first_website.get_text_preview(800)
|
|
self.content_text.setText(content_preview)
|
|
|
|
# Also update visual preview for first website
|
|
if WEB_ENGINE_AVAILABLE and hasattr(self, 'web_view'):
|
|
try:
|
|
html_content = first_website.content
|
|
if html_content and html_content.strip():
|
|
full_html = f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>{first_website.title}</title>
|
|
<style>
|
|
body {{
|
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
line-height: 1.6;
|
|
margin: 20px;
|
|
color: #333;
|
|
}}
|
|
img {{ max-width: 100%; height: auto; }}
|
|
a {{ color: #3b82f6; text-decoration: none; }}
|
|
a:hover {{ text-decoration: underline; }}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
{html_content}
|
|
</body>
|
|
</html>
|
|
"""
|
|
self.web_view.setHtml(full_html, QUrl(first_website.url))
|
|
else:
|
|
self.web_view.setHtml("""
|
|
<html>
|
|
<body style="font-family: Arial, sans-serif; padding: 20px; color: #666;">
|
|
<h3>No HTML Content Available</h3>
|
|
<p>This page doesn't have HTML content to display in the visual preview.</p>
|
|
</body>
|
|
</html>
|
|
""")
|
|
except Exception as e:
|
|
self.web_view.setHtml(f"""
|
|
<html>
|
|
<body style="font-family: Arial, sans-serif; padding: 20px; color: #dc2626;">
|
|
<h3>Error Loading Preview</h3>
|
|
<p>Failed to load the visual preview:</p>
|
|
<p><strong>{str(e)}</strong></p>
|
|
<p>This might be due to:</p>
|
|
<ul>
|
|
<li>Invalid HTML content</li>
|
|
<li>Missing resources (images, CSS, etc.)</li>
|
|
<li>Security restrictions</li>
|
|
</ul>
|
|
</body>
|
|
</html>
|
|
""")
|
|
|
|
def filter_data(self):
|
|
"""Filter the data table based on search and domain filters"""
|
|
search_term = self.search_input.text().lower()
|
|
selected_domain = self.domain_filter.currentText()
|
|
|
|
for row in range(self.data_table.rowCount()):
|
|
website = self.websites[row]
|
|
|
|
# Check search term
|
|
matches_search = (search_term in website.title.lower() or
|
|
search_term in website.url.lower() or
|
|
website.search_content(search_term))
|
|
|
|
# Check domain filter
|
|
matches_domain = (selected_domain == "All Domains" or
|
|
website.get_normalized_domain() == selected_domain)
|
|
|
|
# Show/hide row
|
|
self.data_table.setRowHidden(row, not (matches_search and matches_domain))
|
|
|
|
def update_analysis(self):
|
|
"""Update the analysis tab with enhanced statistics"""
|
|
if not self.websites:
|
|
return
|
|
|
|
stats = self.scraper.get_statistics()
|
|
|
|
# Update statistics labels
|
|
self.stats_labels["Total Pages"].setText(str(stats['total_pages']))
|
|
self.stats_labels["Total Links"].setText(str(stats['total_links']))
|
|
self.stats_labels["Total Words"].setText(str(stats['total_words']))
|
|
self.stats_labels["Average Load Time"].setText(f"{stats['avg_load_time']:.2f}s")
|
|
self.stats_labels["Max Depth Reached"].setText(str(stats['max_depth_reached']))
|
|
|
|
# Update domain breakdown with enhanced information
|
|
domain_text = "Domain Breakdown:\n\n"
|
|
|
|
# Show visited URLs count
|
|
domain_text += f"📊 Total URLs Checked: {stats.get('visited_urls_count', 0)}\n"
|
|
domain_text += f"🎯 Starting Domain: {stats.get('start_domain', 'N/A')}\n\n"
|
|
|
|
# Show domain page counts
|
|
if stats.get('domain_page_counts'):
|
|
domain_text += "📈 Pages per Domain:\n"
|
|
for domain, count in stats['domain_page_counts'].items():
|
|
domain_text += f" • {domain}: {count} pages\n"
|
|
domain_text += "\n"
|
|
|
|
# Show final domain breakdown
|
|
domain_text += "🏠 Final Domain Distribution:\n"
|
|
for domain, count in stats['domains'].items():
|
|
domain_text += f" • {domain}: {count} pages\n"
|
|
|
|
self.domain_text.setText(domain_text)
|
|
|
|
def export_data(self):
|
|
"""Export scraped data to JSON file"""
|
|
if not self.websites:
|
|
QMessageBox.warning(self, "Error", "No data to export")
|
|
return
|
|
|
|
try:
|
|
data = []
|
|
for website in self.websites:
|
|
website_data = {
|
|
'title': website.title,
|
|
'url': website.url,
|
|
'depth': website.depth,
|
|
'links': website.links,
|
|
'word_count': website.get_word_count(),
|
|
'load_time': website.load_time,
|
|
'domain': website.get_domain(),
|
|
'normalized_domain': website.get_normalized_domain(),
|
|
'timestamp': website.timestamp.isoformat()
|
|
}
|
|
data.append(website_data)
|
|
|
|
with open('scraped_data.json', 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
|
|
QMessageBox.information(self, "Success", "Data exported to 'scraped_data.json'")
|
|
|
|
except Exception as e:
|
|
QMessageBox.critical(self, "Error", f"Failed to export data: {e}")
|
|
|
|
def show_content_preview(self, row, column):
|
|
"""Show content preview for the selected website"""
|
|
if row < len(self.websites):
|
|
website = self.websites[row]
|
|
|
|
# Update text preview with more content
|
|
content_preview = website.get_text_preview(1000) # Increased from 500
|
|
self.content_text.setText(content_preview)
|
|
|
|
# Update visual HTML preview
|
|
if WEB_ENGINE_AVAILABLE and hasattr(self, 'web_view'):
|
|
try:
|
|
# Get the HTML content
|
|
html_content = website.content
|
|
if html_content and html_content.strip():
|
|
# Create a complete HTML document with proper encoding
|
|
full_html = f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>{website.title}</title>
|
|
<style>
|
|
body {{
|
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
line-height: 1.6;
|
|
margin: 20px;
|
|
color: #333;
|
|
}}
|
|
img {{ max-width: 100%; height: auto; }}
|
|
a {{ color: #3b82f6; text-decoration: none; }}
|
|
a:hover {{ text-decoration: underline; }}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
{html_content}
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
# Load the HTML content
|
|
self.web_view.setHtml(full_html, QUrl(website.url))
|
|
else:
|
|
# Show a message if no HTML content
|
|
self.web_view.setHtml("""
|
|
<html>
|
|
<body style="font-family: Arial, sans-serif; padding: 20px; color: #666;">
|
|
<h3>No HTML Content Available</h3>
|
|
<p>This page doesn't have HTML content to display in the visual preview.</p>
|
|
<p>Check the text preview tab for the extracted content.</p>
|
|
</body>
|
|
</html>
|
|
""")
|
|
except Exception as e:
|
|
# Show error message in the web view
|
|
error_html = f"""
|
|
<html>
|
|
<body style="font-family: Arial, sans-serif; padding: 20px; color: #dc2626;">
|
|
<h3>Error Loading Preview</h3>
|
|
<p>Failed to load the visual preview:</p>
|
|
<p><strong>{str(e)}</strong></p>
|
|
<p>This might be due to:</p>
|
|
<ul>
|
|
<li>Invalid HTML content</li>
|
|
<li>Missing resources (images, CSS, etc.)</li>
|
|
<li>Security restrictions</li>
|
|
</ul>
|
|
</body>
|
|
</html>
|
|
"""
|
|
self.web_view.setHtml(error_html)
|
|
else:
|
|
# Fallback for when PyQtWebEngine is not available
|
|
if hasattr(self, 'web_view'):
|
|
self.web_view.setText("Visual preview not available\nInstall PyQtWebEngine for HTML rendering")
|
|
|
|
def generate_sitemap(self):
|
|
"""Generate sitemap.xml from crawled URLs"""
|
|
if not self.websites:
|
|
QMessageBox.warning(self, "Error", "No data to generate sitemap.")
|
|
return
|
|
try:
|
|
urls = [w.url for w in self.websites]
|
|
sitemap = [
|
|
'<?xml version="1.0" encoding="UTF-8"?>',
|
|
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
|
|
]
|
|
for url in urls:
|
|
sitemap.append(" <url>")
|
|
sitemap.append(f" <loc>{url}</loc>")
|
|
sitemap.append(" </url>")
|
|
sitemap.append("</urlset>")
|
|
with open("sitemap.xml", "w", encoding="utf-8") as f:
|
|
f.write("\n".join(sitemap))
|
|
QMessageBox.information(self, "Sitemap Generated", "sitemap.xml has been created in the current directory.")
|
|
self.tray_icon.showMessage(
|
|
"Web Scraper",
|
|
"sitemap.xml has been generated.",
|
|
QSystemTrayIcon.MessageIcon(1),
|
|
4000
|
|
)
|
|
except Exception as e:
|
|
QMessageBox.critical(self, "Error", f"Failed to generate sitemap: {e}")
|
|
self.tray_icon.showMessage(
|
|
"Web Scraper",
|
|
f"Failed to generate sitemap: {e}",
|
|
QSystemTrayIcon.MessageIcon(3),
|
|
4000
|
|
)
|
|
|
|
def update_sitemap_tree(self):
|
|
"""Build and display the sitemap tree from crawled data, with icons and tooltips."""
|
|
self.sitemap_tree.clear()
|
|
if not self.websites:
|
|
return
|
|
url_to_website = {w.url: w for w in self.websites}
|
|
children_map = {w.url: [] for w in self.websites}
|
|
for w in self.websites:
|
|
for link in w.links:
|
|
if link in url_to_website:
|
|
children_map[w.url].append(link)
|
|
root_url = self.websites[0].url
|
|
def add_items(parent_item, url, visited, depth):
|
|
if url in visited:
|
|
return
|
|
visited.add(url)
|
|
website = url_to_website[url]
|
|
item = QTreeWidgetItem([website.title, website.url])
|
|
item.setIcon(0, self.get_icon(is_root=False))
|
|
tooltip = f"<b>Title:</b> {website.title}<br>"
|
|
tooltip += f"<b>URL:</b> {website.url}<br>"
|
|
tooltip += f"<b>Depth:</b> {website.depth}<br>"
|
|
tooltip += f"<b>Outgoing Links:</b> {len(website.links)}"
|
|
item.setToolTip(0, tooltip)
|
|
item.setToolTip(1, tooltip)
|
|
parent_item.addChild(item)
|
|
for child_url in children_map[url]:
|
|
add_items(item, child_url, visited, depth+1)
|
|
root_website = url_to_website[root_url]
|
|
root_item = QTreeWidgetItem([root_website.title, root_website.url])
|
|
root_item.setIcon(0, self.get_icon(is_root=True))
|
|
tooltip = f"<b>Title:</b> {root_website.title}<br>"
|
|
tooltip += f"<b>URL:</b> {root_website.url}<br>"
|
|
tooltip += f"<b>Depth:</b> {root_website.depth}<br>"
|
|
tooltip += f"<b>Outgoing Links:</b> {len(root_website.links)}"
|
|
root_item.setToolTip(0, tooltip)
|
|
root_item.setToolTip(1, tooltip)
|
|
self.sitemap_tree.addTopLevelItem(root_item)
|
|
visited = set([root_url])
|
|
for child_url in children_map[root_url]:
|
|
add_items(root_item, child_url, visited, 1)
|
|
self.sitemap_tree.expandToDepth(1)
|
|
|
|
def set_ai_tab_gradient(self, tab_index):
|
|
"""Apply premium gradient styling to the AI tab header"""
|
|
gradient_css = """
|
|
QTabBar::tab:nth-child({}) {{
|
|
background: qlineargradient(x1:0, y1:0, x2:1, y2:0,
|
|
stop:0 #667eea, stop:0.5 #764ba2, stop:1 #f093fb);
|
|
color: white;
|
|
font-weight: 700;
|
|
border: 2px solid #667eea;
|
|
border-bottom: none;
|
|
padding: 14px 24px;
|
|
font-size: 15px;
|
|
}}
|
|
QTabBar::tab:nth-child({}):selected {{
|
|
background: qlineargradient(x1:0, y1:0, x2:1, y2:0,
|
|
stop:0 #f093fb, stop:0.5 #764ba2, stop:1 #667eea);
|
|
color: white;
|
|
font-weight: 800;
|
|
border-bottom: none;
|
|
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
|
|
}}
|
|
QTabBar::tab:nth-child({}):hover:!selected {{
|
|
background: qlineargradient(x1:0, y1:0, x2:1, y2:0,
|
|
stop:0 #5a67d8, stop:0.5 #6b46c1, stop:1 #e879f9);
|
|
}}
|
|
""".format(tab_index+1, tab_index+1, tab_index+1)
|
|
self.tab_widget.tabBar().setStyleSheet(self.tab_widget.tabBar().styleSheet() + gradient_css)
|
|
|
|
def quick_question(self, question):
|
|
"""Handle quick question button clicks by sending the question as if typed by the user."""
|
|
self.ai_input.setText(question)
|
|
self.send_ai_message()
|
|
|
|
def get_ai_context(self, user_msg=None):
|
|
"""Return a string summary of the scraped websites for AI analysis. If no data, return a message indicating no data is available."""
|
|
if not self.websites:
|
|
return "No data available. Please scrape some websites first."
|
|
# Summarize up to 5 websites for context
|
|
context_lines = []
|
|
for i, w in enumerate(self.websites[:5]):
|
|
context_lines.append(f"{i+1}. Title: {w.title}\n URL: {w.url}\n Preview: {w.get_text_preview(120)}")
|
|
context = "\n".join(context_lines)
|
|
return context
|
|
|
|
def main():
|
|
app = QApplication(sys.argv)
|
|
app.setStyle('Fusion') # Use Fusion style for modern look
|
|
|
|
# Set application icon and properties
|
|
app.setApplicationName("Web Scraper & Data Analyzer")
|
|
app.setApplicationVersion("2.0")
|
|
|
|
window = WebScraperApp()
|
|
window.show()
|
|
|
|
sys.exit(app.exec_())
|
|
|
|
if __name__ == '__main__':
|
|
main() |