test_parse_dom_outlinks.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. """
  2. Tests for the parse_dom_outlinks plugin.
  3. Tests the real DOM outlinks hook with an actual URL to verify
  4. link extraction and categorization.
  5. """
  6. import json
  7. import shutil
  8. import subprocess
  9. import sys
  10. import tempfile
  11. from pathlib import Path
  12. import pytest
  13. from django.test import TestCase
  14. # Import chrome test helpers
  15. sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
  16. from chrome_test_helpers import (
  17. chrome_session,
  18. get_test_env,
  19. get_plugin_dir,
  20. get_hook_script,
  21. )
  22. def chrome_available() -> bool:
  23. """Check if Chrome/Chromium is available."""
  24. for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']:
  25. if shutil.which(name):
  26. return True
  27. return False
  28. # Get the path to the parse_dom_outlinks hook
  29. PLUGIN_DIR = get_plugin_dir(__file__)
  30. OUTLINKS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_parse_dom_outlinks.*')
  31. class TestParseDomOutlinksPlugin(TestCase):
  32. """Test the parse_dom_outlinks plugin."""
  33. def test_outlinks_hook_exists(self):
  34. """DOM outlinks hook script should exist."""
  35. self.assertIsNotNone(OUTLINKS_HOOK, "DOM outlinks hook not found in plugin directory")
  36. self.assertTrue(OUTLINKS_HOOK.exists(), f"Hook not found: {OUTLINKS_HOOK}")
  37. class TestParseDomOutlinksWithChrome(TestCase):
  38. """Integration tests for parse_dom_outlinks plugin with Chrome."""
  39. def setUp(self):
  40. """Set up test environment."""
  41. self.temp_dir = Path(tempfile.mkdtemp())
  42. def tearDown(self):
  43. """Clean up."""
  44. shutil.rmtree(self.temp_dir, ignore_errors=True)
  45. def test_outlinks_extracts_links_from_page(self):
  46. """DOM outlinks hook should extract and categorize links from page."""
  47. test_url = 'https://example.com'
  48. snapshot_id = 'test-outlinks-snapshot'
  49. try:
  50. with chrome_session(
  51. self.temp_dir,
  52. crawl_id='test-outlinks-crawl',
  53. snapshot_id=snapshot_id,
  54. test_url=test_url,
  55. navigate=True,
  56. timeout=30,
  57. ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
  58. # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
  59. # Run outlinks hook with the active Chrome session
  60. result = subprocess.run(
  61. ['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
  62. cwd=str(snapshot_chrome_dir),
  63. capture_output=True,
  64. text=True,
  65. timeout=60,
  66. env=env
  67. )
  68. # Check for output file
  69. outlinks_output = snapshot_chrome_dir / 'outlinks.json'
  70. outlinks_data = None
  71. json_error = None
  72. # Try parsing from file first
  73. if outlinks_output.exists():
  74. with open(outlinks_output) as f:
  75. try:
  76. outlinks_data = json.load(f)
  77. except json.JSONDecodeError as e:
  78. json_error = str(e)
  79. # Verify hook ran successfully
  80. self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
  81. self.assertNotIn('Traceback', result.stderr)
  82. # Verify we got outlinks data with expected categories
  83. self.assertIsNotNone(outlinks_data, f"No outlinks data found - file missing or invalid JSON: {json_error}")
  84. self.assertIn('url', outlinks_data, f"Missing url: {outlinks_data}")
  85. self.assertIn('hrefs', outlinks_data, f"Missing hrefs: {outlinks_data}")
  86. # example.com has at least one link (to iana.org)
  87. self.assertIsInstance(outlinks_data['hrefs'], list)
  88. except RuntimeError:
  89. raise
  90. if __name__ == '__main__':
  91. pytest.main([__file__, '-v'])