"""Multithreaded web scraper built on urllib, html.parser, and ThreadPoolExecutor."""

import urllib.request
import urllib.error
import html.parser
import re
import time
import threading
from datetime import datetime
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed


class HTMLParser(html.parser.HTMLParser):
    """Custom HTML parser to extract title, links, and text content"""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.links = []
        self.text_content = []
        self.in_title = False
        self.in_body = False
        self.current_tag = ""

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag.lower()

        if tag.lower() == 'title':
            self.in_title = True
        elif tag.lower() == 'body':
            self.in_body = True
        elif tag.lower() == 'a':
            # Extract href attribute
            for attr, value in attrs:
                if attr.lower() == 'href' and value:
                    self.links.append(value)

    def handle_endtag(self, tag):
        if tag.lower() == 'title':
            self.in_title = False
        elif tag.lower() == 'body':
            self.in_body = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        elif self.in_body and self.current_tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'li']:
            # Clean the text data
            cleaned_data = re.sub(r'\s+', ' ', data.strip())
            if cleaned_data:
                self.text_content.append(cleaned_data)

    def get_text(self):
        """Return all extracted text content as a single string"""
        return ' '.join(self.text_content)

    def get_clean_text(self, max_length=500):
        """Return cleaned text content with length limit"""
        text = self.get_text()
        # Remove extra whitespace and limit length
        text = re.sub(r'\s+', ' ', text.strip())
        if len(text) > max_length:
            text = text[:max_length] + "..."
        return text
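

# Usage sketch (illustrative only; the HTML snippet below is a made-up example):
#
#     parser = HTMLParser()
#     parser.feed('<html><head><title>Demo</title></head>'
#                 '<body><p>Hello</p><a href="/about">About</a></body></html>')
#     parser.title             # -> 'Demo'
#     parser.links             # -> ['/about']
#     parser.get_clean_text()  # -> 'Hello'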


class Website:
    """Class to store website data"""

    def __init__(self, title, url, content, depth, links=None, load_time=None):
        self.title = title or "No Title"
        self.url = url
        self.content = content
        self.depth = depth
        self.links = links or []
        self.load_time = load_time
        self.timestamp = datetime.now()

    def get_word_count(self):
        """Get word count from content"""
        if not self.content:
            return 0
        # Strip HTML tags and count the remaining words
        text_content = re.sub(r'<[^>]+>', '', self.content)
        words = text_content.split()
        return len(words)

    def get_domain(self):
        """Extract domain from URL"""
        try:
            parsed = urlparse(self.url)
            return parsed.netloc
        except Exception:
            return ""

    def get_normalized_domain(self):
        """Get domain without www prefix for consistent filtering"""
        domain = self.get_domain()
        if domain.startswith('www.'):
            return domain[4:]
        return domain

    def search_content(self, query):
        """Search for query in content (case-insensitive substring match)"""
        if not self.content or not query:
            return False
        return query.lower() in self.content.lower()

    def get_text_preview(self, max_length=200):
        """Get a text preview of the content"""
        if not self.content:
            return "No content available"

        # Strip HTML tags and collapse whitespace
        text_content = re.sub(r'<[^>]+>', '', self.content)
        text_content = re.sub(r'\s+', ' ', text_content.strip())

        if len(text_content) > max_length:
            return text_content[:max_length] + "..."
        return text_content
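

# Illustrative sketch (made-up values): a Website object wraps one scraped page.
#
#     site = Website(title='Example', url='https://example.com',
#                    content='<html><body><p>Hello world</p></body></html>', depth=0)
#     site.get_word_count()    # -> 2
#     site.get_domain()        # -> 'example.com'
#     site.get_text_preview()  # -> 'Hello world'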


class WebScraper:
    """Web scraper with multithreading support and robust duplicate detection"""

    def __init__(self):
        self.websites = []
        self.visited_urls = set()
        self.visited_domains = set()  # Track visited domains
        self.start_domain = None  # Store the starting domain
        self.lock = threading.Lock()
        self.max_workers = 10  # Number of concurrent threads
        # No page limits - unlimited crawling
        self.domain_page_counts = {}  # Track page count per domain (for statistics only)
        self._stop_requested = False  # Flag to stop scraping

    def normalize_url(self, url):
        """Normalize URL to handle www prefixes and remove fragments"""
        if not url:
            return url

        # Remove fragments (#) to prevent duplicate content
        if '#' in url:
            url = url.split('#')[0]

        # Remove trailing slashes for consistency
        url = url.rstrip('/')

        # Remove www prefix for consistent domain handling
        if url.startswith('https://www.'):
            return url.replace('https://www.', 'https://', 1)
        elif url.startswith('http://www.'):
            return url.replace('http://www.', 'http://', 1)
        return url
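
    # Behaviour sketch for normalize_url (assumed example inputs):
    #   normalize_url('https://www.example.com/page/#top')  -> 'https://example.com/page'
    #   normalize_url('http://www.example.com/')             -> 'http://example.com'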

    def get_domain_from_url(self, url):
        """Extract and normalize domain from URL"""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc
            if domain.startswith('www.'):
                return domain[4:]
            return domain
        except Exception:
            return ""

    def should_skip_url(self, url, current_depth):
        """Check if URL should be skipped based on various criteria"""
        normalized_url = self.normalize_url(url)

        # Skip if already visited
        if normalized_url in self.visited_urls:
            return True, "Already visited"

        # Skip if not a valid HTTP/HTTPS URL
        if not normalized_url.startswith(('http://', 'https://')):
            return True, "Not HTTP/HTTPS URL"

        # Get domain
        domain = self.get_domain_from_url(normalized_url)
        if not domain:
            return True, "Invalid domain"

        # No domain page limits and no external-domain depth limits - crawl freely
        return False, "OK"

    def scrape_url(self, url, depth):
        """Scrape a single URL with error handling and rate limiting"""
        try:
            # Check if stop was requested
            if self._stop_requested:
                return None

            # Check if URL should be skipped
            should_skip, reason = self.should_skip_url(url, depth)
            if should_skip:
                print(f"Skipping {url}: {reason}")
                return None

            # Normalize URL
            normalized_url = self.normalize_url(url)

            # Mark as visited and update domain count (for statistics only)
            with self.lock:
                # Re-check under the lock so two threads cannot claim the same URL
                if normalized_url in self.visited_urls:
                    return None
                self.visited_urls.add(normalized_url)
                domain = self.get_domain_from_url(normalized_url)
                if domain:
                    self.domain_page_counts[domain] = self.domain_page_counts.get(domain, 0) + 1

            # Add small delay to prevent overwhelming servers
            time.sleep(0.1)

            start_time = time.time()

            # Create request with headers
            req = urllib.request.Request(
                normalized_url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    # Request an uncompressed body: the response is decoded below as
                    # plain UTF-8 text, so gzip/deflate responses would be garbled.
                    'Accept-Encoding': 'identity',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                }
            )

            # Fetch the page with timeout
            with urllib.request.urlopen(req, timeout=15) as response:
                # Check content type
                content_type = response.headers.get('content-type', '').lower()
                if 'text/html' not in content_type and 'application/xhtml' not in content_type:
                    print(f"Skipping {url}: Not HTML content ({content_type})")
                    return None

                html_content = response.read().decode('utf-8', errors='ignore')

            load_time = time.time() - start_time

            # Skip if content is too small (likely error page)
            if len(html_content) < 100:
                print(f"Skipping {url}: Content too small ({len(html_content)} chars)")
                return None

            # Parse HTML
            parser = HTMLParser()
            parser.feed(html_content)

            # Extract links and normalize them with duplicate detection
            links = []
            base_url = normalized_url
            seen_links = set()  # Track links within this page to avoid duplicates

            for link in parser.links:
                try:
                    absolute_url = urljoin(base_url, link)
                    normalized_link = self.normalize_url(absolute_url)

                    # Skip if already seen in this page or should be skipped
                    if normalized_link in seen_links:
                        continue
                    seen_links.add(normalized_link)

                    should_skip, reason = self.should_skip_url(normalized_link, depth + 1)
                    if should_skip:
                        continue

                    # Only include http/https links and filter out common non-content URLs
                    if (normalized_link.startswith(('http://', 'https://')) and
                            not any(skip in normalized_link.lower() for skip in [
                                'mailto:', 'tel:', 'javascript:', 'data:', 'file:',
                                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar',
                                '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.ico',
                                '.css', '.js', '.xml', '.json', '.txt', '.log'
                            ])):
                        links.append(normalized_link)
                except Exception:
                    continue

            # Create Website object
            website = Website(
                title=parser.title,
                url=normalized_url,
                content=html_content,
                depth=depth,
                links=links,
                load_time=load_time
            )

            return website

        except urllib.error.HTTPError as e:
            print(f"HTTP Error scraping {url}: {e.code} - {e.reason}")
            return None
        except urllib.error.URLError as e:
            print(f"URL Error scraping {url}: {e.reason}")
            return None
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            return None

    def crawl_website(self, start_url, max_depth=3, progress_callback=None):
        """Crawl website with multithreading support and no page limits"""
        if not start_url.startswith(('http://', 'https://')):
            start_url = 'https://' + start_url

        # Initialize tracking
        self.websites = []
        self.visited_urls = set()
        self.visited_domains = set()
        self.domain_page_counts = {}
        self.start_domain = self.get_domain_from_url(start_url)
        self._stop_requested = False  # Reset stop flag

        print(f"Starting crawl from: {start_url}")
        print(f"Starting domain: {self.start_domain}")
        print(f"Max depth: {max_depth}")
        print("Unlimited crawling - no page limits")

        # Start with the initial URL
        urls_to_scrape = [(start_url, 0)]
        max_depth_reached = 0
        consecutive_empty_levels = 0
        max_consecutive_empty = 3  # Stop if 3 consecutive levels have no new URLs
        total_pages_scraped = 0

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for current_depth in range(max_depth + 1):
                # Check if stop was requested
                if self._stop_requested:
                    print("Scraping stopped by user request")
                    break

                if not urls_to_scrape:
                    print(f"Stopping at depth {current_depth}: No more URLs to scrape")
                    break

                # Check if we've reached too many consecutive empty levels
                if consecutive_empty_levels >= max_consecutive_empty:
                    print(f"Stopping at depth {current_depth}: {max_consecutive_empty} consecutive empty levels")
                    break

                print(f"Scraping depth {current_depth} with {len(urls_to_scrape)} URLs")

                # Submit all URLs at current depth for concurrent scraping
                future_to_url = {
                    executor.submit(self.scrape_url, url, depth): url
                    for url, depth in urls_to_scrape
                }

                # Collect results and prepare next level
                urls_to_scrape = []
                level_results = 0

                for future in as_completed(future_to_url):
                    # Check if stop was requested
                    if self._stop_requested:
                        print("Stopping processing of current level")
                        break

                    website = future.result()
                    if website:
                        with self.lock:
                            self.websites.append(website)
                        level_results += 1
                        total_pages_scraped += 1

                        # Emit progress if callback provided
                        if progress_callback:
                            progress_callback(website)

                        # Add links for next depth level (no limits)
                        if current_depth < max_depth:
                            for link in website.links:
                                should_skip, reason = self.should_skip_url(link, current_depth + 1)
                                if not should_skip:
                                    urls_to_scrape.append((link, current_depth + 1))

                # Check if stop was requested after processing level
                if self._stop_requested:
                    break

                # Update depth tracking
                if level_results > 0:
                    max_depth_reached = current_depth
                    consecutive_empty_levels = 0
                else:
                    consecutive_empty_levels += 1

                # Only stop if we've reached the actual max depth
                if current_depth >= max_depth:
                    print(f"Reached maximum depth: {max_depth}")
                    break

                # Print progress summary
                print(f"Depth {current_depth} completed: {level_results} pages, Total: {len(self.websites)}")
                if self.domain_page_counts:
                    print(f"Domain breakdown: {dict(self.domain_page_counts)}")

        print(f"Crawling completed. Max depth reached: {max_depth_reached}, Total pages: {len(self.websites)}")
        print(f"Visited URLs: {len(self.visited_urls)}")
        print(f"Domain breakdown: {dict(self.domain_page_counts)}")
        return self.websites

    def reset(self):
        """Reset the scraper state for a new crawl"""
        self.websites = []
        self.visited_urls = set()
        self.visited_domains = set()
        self.domain_page_counts = {}
        self.start_domain = None
        self._stop_requested = False  # Reset stop flag

    def get_statistics(self):
        """Get scraping statistics with enhanced tracking information"""
        if not self.websites:
            return {
                'total_pages': 0,
                'total_links': 0,
                'total_words': 0,
                'avg_load_time': 0,
                'max_depth_reached': 0,
                'domains': {},
                'visited_urls_count': 0,
                'domain_page_counts': {},
                'start_domain': self.start_domain
            }

        total_pages = len(self.websites)
        total_links = sum(len(w.links) for w in self.websites)
        total_words = sum(w.get_word_count() for w in self.websites)

        load_times = [w.load_time for w in self.websites if w.load_time]
        avg_load_time = sum(load_times) / len(load_times) if load_times else 0

        max_depth_reached = max(w.depth for w in self.websites)

        # Count pages per domain
        domains = {}
        for website in self.websites:
            domain = website.get_normalized_domain()
            domains[domain] = domains.get(domain, 0) + 1

        return {
            'total_pages': total_pages,
            'total_links': total_links,
            'total_words': total_words,
            'avg_load_time': avg_load_time,
            'max_depth_reached': max_depth_reached,
            'domains': domains,
            'visited_urls_count': len(self.visited_urls),
            'domain_page_counts': dict(self.domain_page_counts),
            'start_domain': self.start_domain
        }

    def filter_by_domain(self, domain):
        """Filter websites by domain (with or without a leading www.)"""
        # Strip the www. prefix directly; normalize_url expects a full URL,
        # not a bare domain, so it is not used here.
        normalized_domain = domain[4:] if domain.startswith('www.') else domain
        return [w for w in self.websites if w.get_normalized_domain() == normalized_domain]

    def search_websites(self, query):
        """Search websites by query"""
        return [w for w in self.websites if w.search_content(query)]

    def stop_scraping(self):
        """Request graceful stop of the scraping process"""
        self._stop_requested = True
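

# Illustrative usage sketch: crawl a single page and print summary statistics.
# The URL below is a placeholder assumption; substitute a site you are
# permitted to crawl. Network access is required.
if __name__ == "__main__":
    scraper = WebScraper()
    # Depth 0 fetches only the start page; raise max_depth to follow links.
    pages = scraper.crawl_website("https://example.com", max_depth=0)
    for page in pages:
        print(page.title, "-", page.url, f"({page.get_word_count()} words)")
    stats = scraper.get_statistics()
    print("Pages:", stats['total_pages'], "| Avg load time:",
          round(stats['avg_load_time'], 2), "s")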