# test_staticfile.py
  1. """
  2. Tests for the staticfile plugin.
  3. Tests the real staticfile hook with actual URLs to verify
  4. static file detection and download.
  5. """
  6. import json
  7. import shutil
  8. import subprocess
  9. import sys
  10. import tempfile
  11. import time
  12. from pathlib import Path
  13. import pytest
  14. from django.test import TestCase
  15. # Import chrome test helpers
  16. sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
  17. from chrome_test_helpers import (
  18. chrome_session,
  19. get_test_env,
  20. get_plugin_dir,
  21. get_hook_script,
  22. )
  23. def chrome_available() -> bool:
  24. """Check if Chrome/Chromium is available."""
  25. for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
  26. if shutil.which(name):
  27. return True
  28. return False
  29. # Get the path to the staticfile hook
  30. PLUGIN_DIR = get_plugin_dir(__file__)
  31. STATICFILE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_staticfile.*')
  32. class TestStaticfilePlugin(TestCase):
  33. """Test the staticfile plugin."""
  34. def test_staticfile_hook_exists(self):
  35. """Staticfile hook script should exist."""
  36. self.assertIsNotNone(STATICFILE_HOOK, "Staticfile hook not found in plugin directory")
  37. self.assertTrue(STATICFILE_HOOK.exists(), f"Hook not found: {STATICFILE_HOOK}")
  38. class TestStaticfileWithChrome(TestCase):
  39. """Integration tests for staticfile plugin with Chrome."""
  40. def setUp(self):
  41. """Set up test environment."""
  42. self.temp_dir = Path(tempfile.mkdtemp())
  43. def tearDown(self):
  44. """Clean up."""
  45. shutil.rmtree(self.temp_dir, ignore_errors=True)
  46. def test_staticfile_skips_html_pages(self):
  47. """Staticfile hook should skip HTML pages (not static files)."""
  48. test_url = 'https://example.com' # HTML page, not a static file
  49. snapshot_id = 'test-staticfile-snapshot'
  50. try:
  51. with chrome_session(
  52. self.temp_dir,
  53. crawl_id='test-staticfile-crawl',
  54. snapshot_id=snapshot_id,
  55. test_url=test_url,
  56. navigate=True,
  57. timeout=30,
  58. ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
  59. # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
  60. # Run staticfile hook with the active Chrome session (background hook)
  61. result = subprocess.Popen(
  62. ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
  63. cwd=str(snapshot_chrome_dir),
  64. stdout=subprocess.PIPE,
  65. stderr=subprocess.PIPE,
  66. text=True,
  67. env=env
  68. )
  69. # Allow it to run briefly, then terminate (background hook)
  70. time.sleep(3)
  71. if result.poll() is None:
  72. result.terminate()
  73. try:
  74. stdout, stderr = result.communicate(timeout=5)
  75. except subprocess.TimeoutExpired:
  76. result.kill()
  77. stdout, stderr = result.communicate()
  78. else:
  79. stdout, stderr = result.communicate()
  80. # Verify hook ran without crash
  81. self.assertNotIn('Traceback', stderr)
  82. # Parse JSONL output to verify it recognized HTML as non-static
  83. for line in stdout.split('\n'):
  84. line = line.strip()
  85. if line.startswith('{'):
  86. try:
  87. record = json.loads(line)
  88. if record.get('type') == 'ArchiveResult':
  89. # HTML pages should be skipped
  90. if record.get('status') == 'skipped':
  91. self.assertIn('Not a static file', record.get('output_str', ''))
  92. break
  93. except json.JSONDecodeError:
  94. continue
  95. except RuntimeError:
  96. raise
  97. if __name__ == '__main__':
  98. pytest.main([__file__, '-v'])