test_pdf.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. """
  2. Integration tests for pdf plugin
  3. Tests verify:
  5. 1. Hook script exists
  6. 2. Dependencies installed via chrome validation hooks
  7. 3. Verify deps with abx-pkg
  8. 4. PDF extraction works on https://example.com
  9. 5. JSONL output is correct
  10. 6. Filesystem output is valid PDF file
  11. 7. Config options work
  12. """
  13. import json
  14. import os
  15. import subprocess
  16. import sys
  17. import tempfile
  18. from pathlib import Path
  19. import pytest
  20. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  21. get_test_env,
  22. get_plugin_dir,
  23. get_hook_script,
  24. run_hook_and_parse,
  25. LIB_DIR,
  26. NODE_MODULES_DIR,
  27. PLUGINS_ROOT,
  28. chrome_session,
  29. )
# Plugin directory for this test module, resolved by the shared chrome test
# helper from this file's path (presumably the pdf plugin root; confirm
# against get_plugin_dir).
PLUGIN_DIR = get_plugin_dir(__file__)
# Snapshot hook under test, looked up inside PLUGIN_DIR with the pattern
# 'on_Snapshot__*_pdf.*'. The tests below execute it with `node`.
PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
# Path to the npm provider install hook; not referenced by the tests visible
# in this file.
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
# URL passed to the hook via --url in every subprocess invocation below.
TEST_URL = 'https://example.com'
  34. def test_hook_script_exists():
  35. """Verify on_Snapshot hook exists."""
  36. assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"
  37. def test_verify_deps_with_abx_pkg():
  38. """Verify dependencies are available via abx-pkg after hook installation."""
  39. from abx_pkg import Binary, EnvProvider, BinProviderOverrides
  40. EnvProvider.model_rebuild()
  41. # Verify node is available
  42. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  43. node_loaded = node_binary.load()
  44. assert node_loaded and node_loaded.abspath, "Node.js required for pdf plugin"
  45. def test_extracts_pdf_from_example_com():
  46. """Test full workflow: extract PDF from real example.com via hook."""
  47. # Prerequisites checked by earlier test
  48. with tempfile.TemporaryDirectory() as tmpdir:
  49. tmpdir = Path(tmpdir)
  50. with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
  51. pdf_dir = snapshot_chrome_dir.parent / 'pdf'
  52. pdf_dir.mkdir(exist_ok=True)
  53. # Run PDF extraction hook
  54. result = subprocess.run(
  55. ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
  56. cwd=pdf_dir,
  57. capture_output=True,
  58. text=True,
  59. timeout=120,
  60. env=env
  61. )
  62. # Parse clean JSONL output (hook might fail due to network issues)
  63. result_json = None
  64. for line in result.stdout.strip().split('\n'):
  65. line = line.strip()
  66. if line.startswith('{'):
  67. pass
  68. try:
  69. record = json.loads(line)
  70. if record.get('type') == 'ArchiveResult':
  71. result_json = record
  72. break
  73. except json.JSONDecodeError:
  74. pass
  75. assert result_json, "Should have ArchiveResult JSONL output"
  76. # Skip verification if network failed
  77. if result_json['status'] != 'succeeded':
  78. pass
  79. if 'TIMED_OUT' in result_json.get('output_str', '') or 'timeout' in result_json.get('output_str', '').lower():
  80. pass
  81. pytest.fail(f"Extraction failed: {result_json}")
  82. assert result.returncode == 0, f"Should exit 0 on success: {result.stderr}"
  83. # Verify filesystem output (hook writes to current directory)
  84. pdf_file = pdf_dir / 'output.pdf'
  85. assert pdf_file.exists(), "output.pdf not created"
  86. # Verify file is valid PDF
  87. file_size = pdf_file.stat().st_size
  88. assert file_size > 500, f"PDF too small: {file_size} bytes"
  89. assert file_size < 10 * 1024 * 1024, f"PDF suspiciously large: {file_size} bytes"
  90. # Check PDF magic bytes
  91. pdf_data = pdf_file.read_bytes()
  92. assert pdf_data[:4] == b'%PDF', "Should be valid PDF file"
  93. def test_config_save_pdf_false_skips():
  94. """Test that PDF_ENABLED=False exits without emitting JSONL."""
  95. import os
  96. with tempfile.TemporaryDirectory() as tmpdir:
  97. tmpdir = Path(tmpdir)
  98. env = get_test_env()
  99. env['PDF_ENABLED'] = 'False'
  100. result = subprocess.run(
  101. ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
  102. cwd=tmpdir,
  103. capture_output=True,
  104. text=True,
  105. env=env,
  106. timeout=30
  107. )
  108. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  109. # Feature disabled - temporary failure, should NOT emit JSONL
  110. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  111. # Should NOT emit any JSONL
  112. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  113. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  114. def test_reports_missing_chrome():
  115. """Test that script reports error when Chrome session is missing."""
  116. import os
  117. with tempfile.TemporaryDirectory() as tmpdir:
  118. tmpdir = Path(tmpdir)
  119. env = get_test_env()
  120. pdf_dir = tmpdir / 'snapshot' / 'pdf'
  121. pdf_dir.mkdir(parents=True, exist_ok=True)
  122. result = subprocess.run(
  123. ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
  124. cwd=pdf_dir,
  125. capture_output=True,
  126. text=True,
  127. env=env,
  128. timeout=30
  129. )
  130. assert result.returncode != 0, "Should fail without shared Chrome session"
  131. combined = result.stdout + result.stderr
  132. assert 'chrome session' in combined.lower() or 'chrome plugin' in combined.lower()
  133. def test_runs_with_shared_chrome_session():
  134. """Test that PDF hook completes when shared Chrome session is available."""
  135. with tempfile.TemporaryDirectory() as tmpdir:
  136. tmpdir = Path(tmpdir)
  137. with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
  138. pdf_dir = snapshot_chrome_dir.parent / 'pdf'
  139. pdf_dir.mkdir(exist_ok=True)
  140. result = subprocess.run(
  141. ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
  142. cwd=pdf_dir,
  143. capture_output=True,
  144. text=True,
  145. env=env,
  146. timeout=30
  147. )
  148. # Should complete (success or fail, but not hang)
  149. assert result.returncode in (0, 1), "Should complete without hanging"
  150. if __name__ == '__main__':
  151. pytest.main([__file__, '-v'])