test_title.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. """
  2. Integration tests for title plugin
  3. Tests verify:
  4. 1. Plugin script exists
  5. 2. Node.js is available
  6. 3. Title extraction works for real example.com
  7. 4. Output file contains actual page title
  8. 5. Handles various title sources (<title>, og:title, twitter:title)
  9. 6. Config options work (TITLE_TIMEOUT)
  10. """
  11. import json
  12. import shutil
  13. import subprocess
  14. import tempfile
  15. from pathlib import Path
  16. import pytest
  17. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  18. get_plugin_dir,
  19. get_hook_script,
  20. parse_jsonl_output,
  21. get_test_env,
  22. chrome_session,
  23. CHROME_NAVIGATE_HOOK,
  24. )
  25. PLUGIN_DIR = get_plugin_dir(__file__)
  26. TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
  27. TEST_URL = 'https://example.com'
  28. def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
  29. nav_result = subprocess.run(
  30. ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
  31. cwd=str(snapshot_chrome_dir),
  32. capture_output=True,
  33. text=True,
  34. timeout=120,
  35. env=env,
  36. )
  37. result = subprocess.run(
  38. ['node', str(TITLE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
  39. cwd=title_dir,
  40. capture_output=True,
  41. text=True,
  42. timeout=60,
  43. env=env,
  44. )
  45. return nav_result, result
  46. def test_hook_script_exists():
  47. """Verify hook script exists."""
  48. assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}"
  49. def test_extracts_title_from_example_com():
  50. """Test full workflow: extract title from real example.com."""
  51. # Check node is available
  52. if not shutil.which('node'):
  53. pass
  54. with tempfile.TemporaryDirectory() as tmpdir:
  55. tmpdir = Path(tmpdir)
  56. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  57. title_dir = snapshot_chrome_dir.parent / 'title'
  58. title_dir.mkdir(exist_ok=True)
  59. nav_result, result = run_title_capture(
  60. title_dir,
  61. snapshot_chrome_dir,
  62. env,
  63. TEST_URL,
  64. 'test789',
  65. )
  66. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  67. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  68. # Parse clean JSONL output
  69. result_json = None
  70. for line in result.stdout.strip().split('\n'):
  71. line = line.strip()
  72. if line.startswith('{'):
  73. pass
  74. try:
  75. record = json.loads(line)
  76. if record.get('type') == 'ArchiveResult':
  77. result_json = record
  78. break
  79. except json.JSONDecodeError:
  80. pass
  81. assert result_json, "Should have ArchiveResult JSONL output"
  82. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  83. # Verify output file exists (hook writes to current directory)
  84. title_file = title_dir / 'title.txt'
  85. assert title_file.exists(), "title.txt not created"
  86. # Verify title contains REAL example.com title
  87. title_text = title_file.read_text().strip()
  88. assert len(title_text) > 0, "Title should not be empty"
  89. assert 'example' in title_text.lower(), "Title should contain 'example'"
  90. # example.com has title "Example Domain"
  91. assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
  92. def test_fails_without_chrome_session():
  93. """Test that title plugin fails when chrome session is missing."""
  94. if not shutil.which('node'):
  95. pass
  96. with tempfile.TemporaryDirectory() as tmpdir:
  97. tmpdir = Path(tmpdir)
  98. title_dir = tmpdir / 'snapshot' / 'title'
  99. title_dir.mkdir(parents=True, exist_ok=True)
  100. # Run title extraction
  101. result = subprocess.run(
  102. ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
  103. cwd=title_dir,
  104. capture_output=True,
  105. text=True,
  106. timeout=60,
  107. env=get_test_env(),
  108. )
  109. assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
  110. assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
  111. def test_config_timeout_honored():
  112. """Test that TITLE_TIMEOUT config is respected."""
  113. if not shutil.which('node'):
  114. pass
  115. with tempfile.TemporaryDirectory() as tmpdir:
  116. tmpdir = Path(tmpdir)
  117. # Set very short timeout (but example.com should still succeed)
  118. import os
  119. env_override = os.environ.copy()
  120. env_override['TITLE_TIMEOUT'] = '5'
  121. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  122. title_dir = snapshot_chrome_dir.parent / 'title'
  123. title_dir.mkdir(exist_ok=True)
  124. env.update(env_override)
  125. nav_result, result = run_title_capture(
  126. title_dir,
  127. snapshot_chrome_dir,
  128. env,
  129. TEST_URL,
  130. 'testtimeout',
  131. )
  132. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  133. # Should complete (success or fail, but not hang)
  134. assert result.returncode in (0, 1), "Should complete without hanging"
  135. def test_handles_https_urls():
  136. """Test that HTTPS URLs work correctly."""
  137. if not shutil.which('node'):
  138. pass
  139. with tempfile.TemporaryDirectory() as tmpdir:
  140. tmpdir = Path(tmpdir)
  141. with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  142. title_dir = snapshot_chrome_dir.parent / 'title'
  143. title_dir.mkdir(exist_ok=True)
  144. nav_result, result = run_title_capture(
  145. title_dir,
  146. snapshot_chrome_dir,
  147. env,
  148. 'https://example.org',
  149. 'testhttps',
  150. )
  151. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  152. if result.returncode == 0:
  153. # Hook writes to current directory
  154. output_title_file = title_dir / 'title.txt'
  155. if output_title_file.exists():
  156. title_text = output_title_file.read_text().strip()
  157. assert len(title_text) > 0, "Title should not be empty"
  158. assert 'example' in title_text.lower()
  159. def test_handles_404_gracefully():
  160. """Test that title plugin handles 404 pages.
  161. Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
  162. with the generic "Example Domain" title.
  163. """
  164. if not shutil.which('node'):
  165. pass
  166. with tempfile.TemporaryDirectory() as tmpdir:
  167. tmpdir = Path(tmpdir)
  168. with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (
  169. _process,
  170. _pid,
  171. snapshot_chrome_dir,
  172. env,
  173. ):
  174. title_dir = snapshot_chrome_dir.parent / 'title'
  175. title_dir.mkdir(exist_ok=True)
  176. nav_result, result = run_title_capture(
  177. title_dir,
  178. snapshot_chrome_dir,
  179. env,
  180. 'https://example.com/nonexistent-page-404',
  181. 'test404',
  182. )
  183. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  184. # May succeed or fail depending on server behavior
  185. # example.com returns "Example Domain" even for 404s
  186. assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
  187. def test_handles_redirects():
  188. """Test that title plugin handles redirects correctly."""
  189. if not shutil.which('node'):
  190. pass
  191. with tempfile.TemporaryDirectory() as tmpdir:
  192. tmpdir = Path(tmpdir)
  193. with chrome_session(tmpdir, test_url='http://example.com', navigate=False) as (
  194. _process,
  195. _pid,
  196. snapshot_chrome_dir,
  197. env,
  198. ):
  199. title_dir = snapshot_chrome_dir.parent / 'title'
  200. title_dir.mkdir(exist_ok=True)
  201. # http://example.com redirects to https://example.com
  202. nav_result, result = run_title_capture(
  203. title_dir,
  204. snapshot_chrome_dir,
  205. env,
  206. 'http://example.com',
  207. 'testredirect',
  208. )
  209. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  210. # Should succeed and follow redirect
  211. if result.returncode == 0:
  212. # Hook writes to current directory
  213. output_title_file = title_dir / 'title.txt'
  214. if output_title_file.exists():
  215. title_text = output_title_file.read_text().strip()
  216. assert 'example' in title_text.lower()
  217. if __name__ == '__main__':
  218. pytest.main([__file__, '-v'])