2
0

test_infiniscroll.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
"""
Integration tests for infiniscroll plugin

Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. INFINISCROLL_ENABLED=False skips without JSONL
5. Fails gracefully when no chrome session exists
6. Full integration test: scrolls page and outputs stats
7. Config options work (scroll limit, timeout)
"""
  12. import json
  13. import os
  14. import re
  15. import subprocess
  16. import time
  17. import tempfile
  18. from pathlib import Path
  19. import pytest
  20. # Import shared Chrome test helpers
  21. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  22. get_test_env,
  23. chrome_session,
  24. )
# Directory two levels above this test module (the parent of the folder
# that contains this file).
PLUGIN_DIR = Path(__file__).parent.parent
# First entry in PLUGIN_DIR whose name matches the on_Snapshot infiniscroll
# hook pattern, or None when the glob yields nothing.
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
# URL handed to the hook via --url and to chrome_session as test_url.
# NOTE(review): this is an external https site, so the tests presumably need
# network access — confirm.
TEST_URL = 'https://www.singsing.movie/'
  28. def test_hook_script_exists():
  29. """Verify on_Snapshot hook exists."""
  30. assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
  31. assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}"
  32. def test_verify_deps_with_abx_pkg():
  33. """Verify dependencies are available via abx-pkg after hook installation."""
  34. from abx_pkg import Binary, EnvProvider, BinProviderOverrides
  35. EnvProvider.model_rebuild()
  36. # Verify node is available
  37. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  38. node_loaded = node_binary.load()
  39. assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin"
  40. def test_config_infiniscroll_disabled_skips():
  41. """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL."""
  42. with tempfile.TemporaryDirectory() as tmpdir:
  43. tmpdir = Path(tmpdir)
  44. env = get_test_env()
  45. env['INFINISCROLL_ENABLED'] = 'False'
  46. result = subprocess.run(
  47. ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
  48. cwd=tmpdir,
  49. capture_output=True,
  50. text=True,
  51. env=env,
  52. timeout=30
  53. )
  54. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  55. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  56. # Should NOT emit any JSONL
  57. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  58. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
  59. def test_fails_gracefully_without_chrome_session():
  60. """Test that hook fails gracefully when no chrome session exists."""
  61. with tempfile.TemporaryDirectory() as tmpdir:
  62. tmpdir = Path(tmpdir)
  63. infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll'
  64. infiniscroll_dir.mkdir(parents=True, exist_ok=True)
  65. result = subprocess.run(
  66. ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
  67. cwd=infiniscroll_dir,
  68. capture_output=True,
  69. text=True,
  70. env=get_test_env(),
  71. timeout=30
  72. )
  73. # Should fail (exit 1) when no chrome session
  74. assert result.returncode != 0, "Should fail when no chrome session exists"
  75. # Error could be about chrome/CDP not found, or puppeteer module missing
  76. err_lower = result.stderr.lower()
  77. assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \
  78. f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
  79. def test_scrolls_page_and_outputs_stats():
  80. """Integration test: scroll page and verify JSONL output format."""
  81. with tempfile.TemporaryDirectory() as tmpdir:
  82. with chrome_session(
  83. Path(tmpdir),
  84. crawl_id='test-infiniscroll',
  85. snapshot_id='snap-infiniscroll',
  86. test_url=TEST_URL,
  87. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  88. # Create infiniscroll output directory (sibling to chrome)
  89. infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
  90. infiniscroll_dir.mkdir()
  91. # Run infiniscroll hook
  92. env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test
  93. env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling
  94. env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test
  95. result = subprocess.run(
  96. ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
  97. cwd=str(infiniscroll_dir),
  98. capture_output=True,
  99. text=True,
  100. timeout=60,
  101. env=env
  102. )
  103. assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
  104. # Parse JSONL output
  105. result_json = None
  106. for line in result.stdout.strip().split('\n'):
  107. line = line.strip()
  108. if line.startswith('{'):
  109. try:
  110. record = json.loads(line)
  111. if record.get('type') == 'ArchiveResult':
  112. result_json = record
  113. break
  114. except json.JSONDecodeError:
  115. pass
  116. assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
  117. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  118. # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
  119. output_str = result_json.get('output_str', '')
  120. assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
  121. assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
  122. assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
  123. # Verify no files created in output directory
  124. output_files = list(infiniscroll_dir.iterdir())
  125. assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
  126. def test_config_scroll_limit_honored():
  127. """Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
  128. with tempfile.TemporaryDirectory() as tmpdir:
  129. with chrome_session(
  130. Path(tmpdir),
  131. crawl_id='test-scroll-limit',
  132. snapshot_id='snap-limit',
  133. test_url=TEST_URL,
  134. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  135. infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
  136. infiniscroll_dir.mkdir()
  137. # Set scroll limit to 2 (use env from setup_chrome_session)
  138. env['INFINISCROLL_SCROLL_LIMIT'] = '2'
  139. env['INFINISCROLL_SCROLL_DELAY'] = '500'
  140. env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in
  141. result = subprocess.run(
  142. ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
  143. cwd=str(infiniscroll_dir),
  144. capture_output=True,
  145. text=True,
  146. timeout=60,
  147. env=env
  148. )
  149. assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"
  150. # Parse output and verify scroll count
  151. result_json = None
  152. for line in result.stdout.strip().split('\n'):
  153. if line.strip().startswith('{'):
  154. try:
  155. record = json.loads(line)
  156. if record.get('type') == 'ArchiveResult':
  157. result_json = record
  158. break
  159. except json.JSONDecodeError:
  160. pass
  161. assert result_json is not None, "Should have JSONL output"
  162. output_str = result_json.get('output_str', '')
  163. # Verify output format and that it completed (scroll limit enforced internally)
  164. assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
  165. assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
  166. def test_config_timeout_honored():
  167. """Test that INFINISCROLL_TIMEOUT config is respected."""
  168. with tempfile.TemporaryDirectory() as tmpdir:
  169. with chrome_session(
  170. Path(tmpdir),
  171. crawl_id='test-timeout',
  172. snapshot_id='snap-timeout',
  173. test_url=TEST_URL,
  174. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  175. infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
  176. infiniscroll_dir.mkdir()
  177. # Set very short timeout (use env from setup_chrome_session)
  178. env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds
  179. env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger
  180. env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit
  181. env['INFINISCROLL_MIN_HEIGHT'] = '100000'
  182. start_time = time.time()
  183. result = subprocess.run(
  184. ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
  185. cwd=str(infiniscroll_dir),
  186. capture_output=True,
  187. text=True,
  188. timeout=30,
  189. env=env
  190. )
  191. elapsed = time.time() - start_time
  192. # Should complete within reasonable time (timeout + buffer)
  193. assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
  194. assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
  195. if __name__ == '__main__':
  196. pytest.main([__file__, '-v'])