#!/usr/bin/env python3
"""
Simple test script to verify the web scraping functionality.
"""

import module
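
# NOTE: "module" is a placeholder for the scraper implementation under test.
# Judging by the run hint at the bottom of this file, the real import is
# probably something like `import web_scraper_app as module`; adjust it to
# match your project layout.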


def test_basic_scraping():
    """Test basic scraping functionality."""
    print("Testing basic web scraping...")

    # Create a scraper instance
    scraper = module.WebScraper()

    # Test with a simple website (httpbin.org is a safe test site)
    test_url = "https://httpbin.org/html"

    print(f"Scraping {test_url} with depth 1...")

    try:
        # Scrape with depth 1 to keep the run fast
        websites = scraper.crawl_website(test_url, max_depth=1)

        print(f"Successfully scraped {len(websites)} websites")

        if websites:
            # Show details of the first website
            first_site = websites[0]
            print("\nFirst website:")
            print(f"  Title: {first_site.title}")
            print(f"  URL: {first_site.url}")
            print(f"  Depth: {first_site.depth}")
            print(f"  Links found: {len(first_site.links)}")
            print(f"  Word count: {first_site.get_word_count()}")

            # Show aggregate statistics
            stats = scraper.get_statistics()
            print("\nStatistics:")
            print(f"  Total pages: {stats['total_pages']}")
            print(f"  Total links: {stats['total_links']}")
            print(f"  Total words: {stats['total_words']}")
            print(f"  Average load time: {stats['avg_load_time']:.2f}s")

            return True
        else:
            print("No websites were scraped")
            return False

    except Exception as e:
        print(f"Error during scraping: {e}")
        return False
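

# For reference, the WebScraper surface these tests rely on, written out as a
# skeleton. This documents the assumed interface only; it is not the real
# implementation:
class _WebScraperInterface:
    websites: list  # Website objects collected so far

    def crawl_website(self, url, max_depth=1):
        """Crawl `url` up to `max_depth` and return a list of Website objects."""

    def get_statistics(self):
        """Return a dict with 'total_pages', 'total_links', 'total_words',
        and 'avg_load_time' keys."""

    def normalize_url(self, url):
        """Return `url` in canonical form (see test_url_normalization)."""

    def filter_by_domain(self, domain):
        """Return collected websites whose domain matches `domain`,
        ignoring any 'www.' prefix."""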


def test_website_class():
    """Test the Website class functionality."""
    print("\nTesting Website class...")

    # Create a test website
    website = module.Website(
        title="Test Website",
        url="https://example.com",
        content="<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>",
        depth=0,
        links=["https://example.com/page1", "https://example.com/page2"]
    )

    # Exercise the accessor methods
    print(f"Website title: {website.title}")
    print(f"Website URL: {website.url}")
    print(f"Word count: {website.get_word_count()}")
    print(f"Domain: {website.get_domain()}")
    print(f"Normalized domain: {website.get_normalized_domain()}")
    print(f"Search for 'test': {website.search_content('test')}")
    print(f"Search for 'nonexistent': {website.search_content('nonexistent')}")

    return True
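

# For reference, the Website interface exercised above, sketched as a small
# dataclass. The helper behaviors (HTML-stripped word count, case-insensitive
# search) are assumptions for illustration; module.Website may differ:
import re
from dataclasses import dataclass, field
from urllib.parse import urlparse


@dataclass
class _ReferenceWebsite:
    title: str
    url: str
    content: str
    depth: int
    links: list = field(default_factory=list)

    def get_word_count(self):
        # Strip HTML tags, then count whitespace-separated words
        text = re.sub(r"<[^>]+>", " ", self.content)
        return len(text.split())

    def get_domain(self):
        return urlparse(self.url).netloc

    def get_normalized_domain(self):
        # Drop a leading 'www.' so www.example.com and example.com compare equal
        domain = self.get_domain()
        return domain[4:] if domain.startswith("www.") else domain

    def search_content(self, term):
        # Case-insensitive substring match
        return term.lower() in self.content.lower()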


def test_html_parser():
    """Test the HTML parser functionality."""
    print("\nTesting HTML Parser...")

    parser = module.HTMLParser()
    test_html = """
    <html>
    <head><title>Test Page</title></head>
    <body>
        <h1>Welcome</h1>
        <p>This is a <a href="https://example.com">link</a> to example.com</p>
        <p>Here's another <a href="/relative-link">relative link</a></p>
    </body>
    </html>
    """

    parser.feed(test_html)
    print(f"Title extracted: {parser.title}")
    print(f"Links found: {parser.links}")
    print(f"Text content length: {len(parser.get_text())}")

    return True
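

# For reference, the interface exercised above (feed(), .title, .links,
# get_text()) can be sketched on top of the standard library's
# html.parser.HTMLParser. An illustrative stand-in, not necessarily how
# module.HTMLParser is implemented:
from html.parser import HTMLParser as _StdHTMLParser


class _ReferenceHTMLParser(_StdHTMLParser):
    """Minimal parser collecting the page title, links, and visible text."""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.links = []
        self._in_title = False
        self._text_parts = []

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "a":
            # Record href values as written (relative links included)
            href = dict(attrs).get("href")
            if href:
                self.links.append(href)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        self._text_parts.append(data)

    def get_text(self):
        return "".join(self._text_parts).strip()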


def test_url_normalization():
    """Test URL normalization to handle www. prefixes."""
    print("\nTesting URL Normalization...")

    scraper = module.WebScraper()

    # URLs with and without a www. prefix
    test_urls = [
        "https://www.example.com/page",
        "https://example.com/page",
        "http://www.test.com/path?param=value#fragment",
        "http://test.com/path?param=value#fragment"
    ]

    print("URL Normalization Results:")
    for url in test_urls:
        normalized = scraper.normalize_url(url)
        print(f"  Original: {url}")
        print(f"  Normalized: {normalized}")
        print()

    # Test domain filtering
    print("Domain Filtering Test:")
    test_websites = [
        module.Website("Site 1", "https://www.example.com", "content", 0),
        module.Website("Site 2", "https://example.com", "content", 0),
        module.Website("Site 3", "https://www.test.com", "content", 0)
    ]

    scraper.websites = test_websites

    # Filter by each domain, both with and without the www. prefix
    domains_to_test = ["example.com", "www.example.com", "test.com", "www.test.com"]

    for domain in domains_to_test:
        filtered = scraper.filter_by_domain(domain)
        print(f"  Filter '{domain}': {len(filtered)} results")
        for site in filtered:
            print(f"    - {site.title} ({site.url})")

    return True
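

# For illustration, a normalize_url with the www-stripping behavior tested
# above might look like this sketch; whether module.WebScraper also drops
# query strings or fragments is not pinned down by these tests:
from urllib.parse import urlparse, urlunparse


def _reference_normalize_url(url):
    """Return `url` with a leading 'www.' removed from the host."""
    parts = urlparse(url)
    host = parts.netloc[4:] if parts.netloc.startswith("www.") else parts.netloc
    return urlunparse(parts._replace(netloc=host))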


if __name__ == "__main__":
    print("Web Scraper Test Suite")
    print("=" * 50)

    # Test the HTML parser
    test_html_parser()

    # Test the Website class
    test_website_class()

    # Test URL normalization
    test_url_normalization()

    # Test basic scraping (uncomment to exercise live scraping;
    # note: this requires an internet connection)
    # test_basic_scraping()

    print("\nTests completed!")
    print("\nTo run the full application:")
    print("  python web_scraper_app.py")