# test_dom.py
  1. """
  2. Integration tests for dom plugin
  3. Tests verify:
  4. 1. Hook script exists
  5. 2. Dependencies installed via chrome validation hooks
  6. 3. Verify deps with abx-pkg
  7. 4. DOM extraction works on https://example.com
  8. 5. JSONL output is correct
  9. 6. Filesystem output contains actual page content
  10. 7. Config options work
  11. """
  12. import json
  13. import os
  14. import subprocess
  15. import sys
  16. import tempfile
  17. from pathlib import Path
  18. import pytest
  19. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  20. get_test_env,
  21. get_plugin_dir,
  22. get_hook_script,
  23. run_hook_and_parse,
  24. LIB_DIR,
  25. NODE_MODULES_DIR,
  26. PLUGINS_ROOT,
  27. chrome_session,
  28. )
  29. PLUGIN_DIR = get_plugin_dir(__file__)
  30. DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
  31. NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
  32. TEST_URL = 'https://example.com'
  33. def test_hook_script_exists():
  34. """Verify on_Snapshot hook exists."""
  35. assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
  36. def test_verify_deps_with_abx_pkg():
  37. """Verify dependencies are available via abx-pkg after hook installation."""
  38. from abx_pkg import Binary, EnvProvider, BinProviderOverrides
  39. EnvProvider.model_rebuild()
  40. # Verify node is available
  41. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  42. node_loaded = node_binary.load()
  43. assert node_loaded and node_loaded.abspath, "Node.js required for dom plugin"
  44. def test_extracts_dom_from_example_com():
  45. """Test full workflow: extract DOM from real example.com via hook."""
  46. # Prerequisites checked by earlier test
  47. with tempfile.TemporaryDirectory() as tmpdir:
  48. tmpdir = Path(tmpdir)
  49. with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
  50. dom_dir = snapshot_chrome_dir.parent / 'dom'
  51. dom_dir.mkdir(exist_ok=True)
  52. # Run DOM extraction hook
  53. result = subprocess.run(
  54. ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
  55. cwd=dom_dir,
  56. capture_output=True,
  57. text=True,
  58. timeout=120,
  59. env=env
  60. )
  61. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  62. # Parse clean JSONL output
  63. result_json = None
  64. for line in result.stdout.strip().split('\n'):
  65. line = line.strip()
  66. if line.startswith('{'):
  67. try:
  68. record = json.loads(line)
  69. if record.get('type') == 'ArchiveResult':
  70. result_json = record
  71. break
  72. except json.JSONDecodeError:
  73. pass
  74. assert result_json, "Should have ArchiveResult JSONL output"
  75. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  76. # Verify filesystem output (hook writes directly to working dir)
  77. dom_file = dom_dir / 'output.html'
  78. assert dom_file.exists(), f"output.html not created. Files: {list(tmpdir.iterdir())}"
  79. # Verify HTML content contains REAL example.com text
  80. html_content = dom_file.read_text(errors='ignore')
  81. assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
  82. assert '<html' in html_content.lower(), "Missing <html> tag"
  83. assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
  84. assert ('this domain' in html_content.lower() or
  85. 'illustrative examples' in html_content.lower()), \
  86. "Missing example.com description text"
  87. def test_config_save_dom_false_skips():
  88. """Test that DOM_ENABLED=False exits without emitting JSONL."""
  89. import os
  90. with tempfile.TemporaryDirectory() as tmpdir:
  91. tmpdir = Path(tmpdir)
  92. env = os.environ.copy()
  93. env['DOM_ENABLED'] = 'False'
  94. result = subprocess.run(
  95. ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
  96. cwd=tmpdir,
  97. capture_output=True,
  98. text=True,
  99. env=env,
  100. timeout=30
  101. )
  102. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  103. # Feature disabled - temporary failure, should NOT emit JSONL
  104. assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  105. # Should NOT emit any JSONL
  106. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  107. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  108. def test_staticfile_present_skips():
  109. """Test that dom skips when staticfile already downloaded."""
  110. with tempfile.TemporaryDirectory() as tmpdir:
  111. tmpdir = Path(tmpdir)
  112. # Create directory structure like real ArchiveBox:
  113. # tmpdir/
  114. # staticfile/ <- staticfile extractor output
  115. # dom/ <- dom extractor runs here, looks for ../staticfile
  116. staticfile_dir = tmpdir / 'staticfile'
  117. staticfile_dir.mkdir()
  118. (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
  119. dom_dir = tmpdir / 'dom'
  120. dom_dir.mkdir()
  121. result = subprocess.run(
  122. ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=teststatic'],
  123. cwd=dom_dir, # Run from dom subdirectory
  124. capture_output=True,
  125. text=True,
  126. timeout=30
  127. ,
  128. env=get_test_env())
  129. assert result.returncode == 0, "Should exit 0 when permanently skipping"
  130. # Permanent skip - should emit ArchiveResult with status='skipped'
  131. result_json = None
  132. for line in result.stdout.strip().split('\n'):
  133. line = line.strip()
  134. if line.startswith('{'):
  135. try:
  136. record = json.loads(line)
  137. if record.get('type') == 'ArchiveResult':
  138. result_json = record
  139. break
  140. except json.JSONDecodeError:
  141. pass
  142. assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
  143. assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
  144. assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"
  145. if __name__ == '__main__':
  146. pytest.main([__file__, '-v'])