test_mercury.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. """
  2. Integration tests for mercury plugin
  3. Tests verify:
  4. 1. Hook script exists
  5. 2. Dependencies installed via validation hooks
  6. 3. Verify deps with abx-pkg
  7. 4. Mercury extraction works on https://example.com
  8. 5. JSONL output is correct
  9. 6. Filesystem output contains extracted content
  10. 7. Config options work
  11. """
  12. import json
  13. import subprocess
  14. import sys
  15. import tempfile
  16. from pathlib import Path
  17. import pytest
  18. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  19. get_plugin_dir,
  20. get_hook_script,
  21. PLUGINS_ROOT,
  22. )
# Plugin directory derived from this test file's path
# (presumably the mercury plugin directory — confirm against get_plugin_dir).
PLUGIN_DIR = get_plugin_dir(__file__)
# Hook script looked up inside PLUGIN_DIR by glob pattern; the tests run it
# as a subprocess with the current Python interpreter.
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
# URL handed to the hook through its --url argument in every test.
TEST_URL = 'https://example.com'
  26. def test_hook_script_exists():
  27. """Verify on_Snapshot hook exists."""
  28. assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
  29. def test_verify_deps_with_abx_pkg():
  30. """Verify postlight-parser is available via abx-pkg."""
  31. from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
  32. # Verify postlight-parser is available
  33. mercury_binary = Binary(
  34. name='postlight-parser',
  35. binproviders=[NpmProvider(), EnvProvider()],
  36. overrides={'npm': {'packages': ['@postlight/parser']}}
  37. )
  38. mercury_loaded = mercury_binary.load()
  39. # If validate hook found it (exit 0), this should succeed
  40. # If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
  41. if mercury_loaded and mercury_loaded.abspath:
  42. assert True, "postlight-parser is available"
  43. else:
  44. pass
  45. def test_extracts_with_mercury_parser():
  46. """Test full workflow: extract with postlight-parser from real HTML via hook."""
  47. # Prerequisites checked by earlier test
  48. with tempfile.TemporaryDirectory() as tmpdir:
  49. tmpdir = Path(tmpdir)
  50. # Create HTML source that mercury can parse
  51. (tmpdir / 'singlefile').mkdir()
  52. (tmpdir / 'singlefile' / 'singlefile.html').write_text(
  53. '<html><head><title>Test Article</title></head><body>'
  54. '<article><h1>Example Article</h1><p>This is test content for mercury parser.</p></article>'
  55. '</body></html>'
  56. )
  57. # Run mercury extraction hook
  58. result = subprocess.run(
  59. [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
  60. cwd=tmpdir,
  61. capture_output=True,
  62. text=True,
  63. timeout=60
  64. )
  65. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  66. # Parse clean JSONL output
  67. result_json = None
  68. for line in result.stdout.strip().split('\n'):
  69. line = line.strip()
  70. if line.startswith('{'):
  71. pass
  72. try:
  73. record = json.loads(line)
  74. if record.get('type') == 'ArchiveResult':
  75. result_json = record
  76. break
  77. except json.JSONDecodeError:
  78. pass
  79. assert result_json, "Should have ArchiveResult JSONL output"
  80. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  81. # Verify filesystem output (hook writes to current directory)
  82. output_file = tmpdir / 'content.html'
  83. assert output_file.exists(), "content.html not created"
  84. content = output_file.read_text()
  85. assert len(content) > 0, "Output should not be empty"
  86. def test_config_save_mercury_false_skips():
  87. """Test that MERCURY_ENABLED=False exits without emitting JSONL."""
  88. import os
  89. with tempfile.TemporaryDirectory() as tmpdir:
  90. env = os.environ.copy()
  91. env['MERCURY_ENABLED'] = 'False'
  92. result = subprocess.run(
  93. [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  94. cwd=tmpdir,
  95. capture_output=True,
  96. text=True,
  97. env=env,
  98. timeout=30
  99. )
  100. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  101. # Feature disabled - temporary failure, should NOT emit JSONL
  102. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  103. # Should NOT emit any JSONL
  104. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  105. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  106. def test_fails_gracefully_without_html():
  107. """Test that mercury works even without HTML source (fetches URL directly)."""
  108. with tempfile.TemporaryDirectory() as tmpdir:
  109. result = subprocess.run(
  110. [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  111. cwd=tmpdir,
  112. capture_output=True,
  113. text=True,
  114. timeout=30
  115. )
  116. # Mercury fetches URL directly with postlight-parser, doesn't need HTML source
  117. # Parse clean JSONL output
  118. result_json = None
  119. for line in result.stdout.strip().split('\n'):
  120. line = line.strip()
  121. if line.startswith('{'):
  122. try:
  123. record = json.loads(line)
  124. if record.get('type') == 'ArchiveResult':
  125. result_json = record
  126. break
  127. except json.JSONDecodeError:
  128. pass
  129. # Mercury should succeed or fail based on network, not based on HTML source
  130. assert result_json, "Should emit ArchiveResult"
  131. assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}"
# Allow running this module directly as a script: hands this file to pytest
# with verbose output.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])