# test_headers.py
  1. """
  2. Integration tests for headers plugin
  3. Tests verify:
  4. pass
  5. 1. Plugin script exists and is executable
  6. 2. Node.js is available
  7. 3. Headers extraction works for real example.com
  8. 4. Output JSON contains actual HTTP headers
  9. 5. Config options work (TIMEOUT, USER_AGENT)
  10. """
  11. import json
  12. import shutil
  13. import subprocess
  14. import tempfile
  15. import time
  16. from pathlib import Path
  17. import pytest
  18. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  19. CHROME_NAVIGATE_HOOK,
  20. get_test_env,
  21. chrome_session,
  22. )
# Plugin root directory (this file lives in <plugin>/tests/).
PLUGIN_DIR = Path(__file__).parent.parent
# The on_Snapshot headers hook script; None when no matching file is found.
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
# Real URL used by the live-network integration tests below.
TEST_URL = 'https://example.com'
  26. def normalize_root_url(url: str) -> str:
  27. return url.rstrip('/')
  28. def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id):
  29. hook_proc = subprocess.Popen(
  30. ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
  31. cwd=headers_dir,
  32. stdout=subprocess.PIPE,
  33. stderr=subprocess.PIPE,
  34. text=True,
  35. env=env,
  36. )
  37. nav_result = subprocess.run(
  38. ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
  39. cwd=snapshot_chrome_dir,
  40. capture_output=True,
  41. text=True,
  42. timeout=120,
  43. env=env,
  44. )
  45. headers_file = headers_dir / 'headers.json'
  46. for _ in range(60):
  47. if headers_file.exists() and headers_file.stat().st_size > 0:
  48. break
  49. time.sleep(1)
  50. if hook_proc.poll() is None:
  51. hook_proc.terminate()
  52. try:
  53. stdout, stderr = hook_proc.communicate(timeout=5)
  54. except subprocess.TimeoutExpired:
  55. hook_proc.kill()
  56. stdout, stderr = hook_proc.communicate()
  57. else:
  58. stdout, stderr = hook_proc.communicate()
  59. return hook_proc.returncode, stdout, stderr, nav_result, headers_file
  60. def test_hook_script_exists():
  61. """Verify hook script exists."""
  62. assert HEADERS_HOOK.exists(), f"Hook script not found: {HEADERS_HOOK}"
  63. def test_node_is_available():
  64. """Test that Node.js is available on the system."""
  65. result = subprocess.run(
  66. ['which', 'node'],
  67. capture_output=True,
  68. text=True
  69. )
  70. if result.returncode != 0:
  71. pass
  72. binary_path = result.stdout.strip()
  73. assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
  74. # Test that node is executable and get version
  75. result = subprocess.run(
  76. ['node', '--version'],
  77. capture_output=True,
  78. text=True,
  79. timeout=10
  80. ,
  81. env=get_test_env())
  82. assert result.returncode == 0, f"node not executable: {result.stderr}"
  83. assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
  84. def test_extracts_headers_from_example_com():
  85. """Test full workflow: extract headers from real example.com."""
  86. # Check node is available
  87. if not shutil.which('node'):
  88. pass
  89. with tempfile.TemporaryDirectory() as tmpdir:
  90. tmpdir = Path(tmpdir)
  91. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  92. headers_dir = snapshot_chrome_dir.parent / 'headers'
  93. headers_dir.mkdir(exist_ok=True)
  94. result = run_headers_capture(
  95. headers_dir,
  96. snapshot_chrome_dir,
  97. env,
  98. TEST_URL,
  99. 'test789',
  100. )
  101. hook_code, stdout, stderr, nav_result, headers_file = result
  102. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  103. assert hook_code == 0, f"Extraction failed: {stderr}"
  104. # Parse clean JSONL output
  105. result_json = None
  106. for line in stdout.strip().split('\n'):
  107. line = line.strip()
  108. if line.startswith('{'):
  109. pass
  110. try:
  111. record = json.loads(line)
  112. if record.get('type') == 'ArchiveResult':
  113. result_json = record
  114. break
  115. except json.JSONDecodeError:
  116. pass
  117. assert result_json, "Should have ArchiveResult JSONL output"
  118. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  119. # Verify output file exists (hook writes to current directory)
  120. assert headers_file.exists(), "headers.json not created"
  121. # Verify headers JSON contains REAL example.com response
  122. headers_data = json.loads(headers_file.read_text())
  123. assert 'url' in headers_data, "Should have url field"
  124. assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}"
  125. assert 'status' in headers_data, "Should have status field"
  126. assert headers_data['status'] in [200, 301, 302], \
  127. f"Should have valid HTTP status, got {headers_data['status']}"
  128. assert 'request_headers' in headers_data, "Should have request_headers field"
  129. assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict"
  130. assert 'response_headers' in headers_data, "Should have response_headers field"
  131. assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict"
  132. assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty"
  133. assert 'headers' in headers_data, "Should have headers field"
  134. assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
  135. # Verify common HTTP headers are present
  136. headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()}
  137. assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
  138. "Should have at least one common HTTP header"
  139. assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \
  140. "Response headers should include :status pseudo header"
  141. def test_headers_output_structure():
  142. """Test that headers plugin produces correctly structured output."""
  143. if not shutil.which('node'):
  144. pass
  145. with tempfile.TemporaryDirectory() as tmpdir:
  146. tmpdir = Path(tmpdir)
  147. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  148. headers_dir = snapshot_chrome_dir.parent / 'headers'
  149. headers_dir.mkdir(exist_ok=True)
  150. result = run_headers_capture(
  151. headers_dir,
  152. snapshot_chrome_dir,
  153. env,
  154. TEST_URL,
  155. 'testformat',
  156. )
  157. hook_code, stdout, stderr, nav_result, headers_file = result
  158. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  159. assert hook_code == 0, f"Extraction failed: {stderr}"
  160. # Parse clean JSONL output
  161. result_json = None
  162. for line in stdout.strip().split('\n'):
  163. line = line.strip()
  164. if line.startswith('{'):
  165. pass
  166. try:
  167. record = json.loads(line)
  168. if record.get('type') == 'ArchiveResult':
  169. result_json = record
  170. break
  171. except json.JSONDecodeError:
  172. pass
  173. assert result_json, "Should have ArchiveResult JSONL output"
  174. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  175. # Verify output structure
  176. assert headers_file.exists(), "Output headers.json not created"
  177. output_data = json.loads(headers_file.read_text())
  178. # Verify all required fields are present
  179. assert 'url' in output_data, "Output should have url field"
  180. assert 'status' in output_data, "Output should have status field"
  181. assert 'request_headers' in output_data, "Output should have request_headers field"
  182. assert 'response_headers' in output_data, "Output should have response_headers field"
  183. assert 'headers' in output_data, "Output should have headers field"
  184. # Verify data types
  185. assert isinstance(output_data['status'], int), "Status should be integer"
  186. assert isinstance(output_data['request_headers'], dict), "Request headers should be dict"
  187. assert isinstance(output_data['response_headers'], dict), "Response headers should be dict"
  188. assert isinstance(output_data['headers'], dict), "Headers should be dict"
  189. # Verify example.com returns expected headers
  190. assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL)
  191. assert output_data['status'] in [200, 301, 302]
  192. def test_fails_without_chrome_session():
  193. """Test that headers plugin fails when chrome session is missing."""
  194. if not shutil.which('node'):
  195. pass
  196. with tempfile.TemporaryDirectory() as tmpdir:
  197. tmpdir = Path(tmpdir)
  198. # Run headers extraction
  199. result = subprocess.run(
  200. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
  201. cwd=tmpdir,
  202. capture_output=True,
  203. text=True,
  204. timeout=60
  205. ,
  206. env=get_test_env())
  207. assert result.returncode != 0, "Should fail without chrome session"
  208. assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
  209. def test_config_timeout_honored():
  210. """Test that TIMEOUT config is respected."""
  211. if not shutil.which('node'):
  212. pass
  213. with tempfile.TemporaryDirectory() as tmpdir:
  214. tmpdir = Path(tmpdir)
  215. # Set very short timeout (but example.com should still succeed)
  216. import os
  217. env_override = os.environ.copy()
  218. env_override['TIMEOUT'] = '5'
  219. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  220. headers_dir = snapshot_chrome_dir.parent / 'headers'
  221. headers_dir.mkdir(exist_ok=True)
  222. env.update(env_override)
  223. result = run_headers_capture(
  224. headers_dir,
  225. snapshot_chrome_dir,
  226. env,
  227. TEST_URL,
  228. 'testtimeout',
  229. )
  230. # Should complete (success or fail, but not hang)
  231. hook_code, _stdout, _stderr, nav_result, _headers_file = result
  232. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  233. assert hook_code in (0, 1), "Should complete without hanging"
  234. def test_config_user_agent():
  235. """Test that USER_AGENT config is used."""
  236. if not shutil.which('node'):
  237. pass
  238. with tempfile.TemporaryDirectory() as tmpdir:
  239. tmpdir = Path(tmpdir)
  240. # Set custom user agent
  241. import os
  242. env_override = os.environ.copy()
  243. env_override['USER_AGENT'] = 'TestBot/1.0'
  244. with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  245. headers_dir = snapshot_chrome_dir.parent / 'headers'
  246. headers_dir.mkdir(exist_ok=True)
  247. env.update(env_override)
  248. result = run_headers_capture(
  249. headers_dir,
  250. snapshot_chrome_dir,
  251. env,
  252. TEST_URL,
  253. 'testua',
  254. )
  255. # Should succeed (example.com doesn't block)
  256. hook_code, stdout, _stderr, nav_result, _headers_file = result
  257. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  258. if hook_code == 0:
  259. # Parse clean JSONL output
  260. result_json = None
  261. for line in stdout.strip().split('\n'):
  262. line = line.strip()
  263. if line.startswith('{'):
  264. pass
  265. try:
  266. record = json.loads(line)
  267. if record.get('type') == 'ArchiveResult':
  268. result_json = record
  269. break
  270. except json.JSONDecodeError:
  271. pass
  272. assert result_json, "Should have ArchiveResult JSONL output"
  273. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  274. def test_handles_https_urls():
  275. """Test that HTTPS URLs work correctly."""
  276. if not shutil.which('node'):
  277. pass
  278. with tempfile.TemporaryDirectory() as tmpdir:
  279. tmpdir = Path(tmpdir)
  280. with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  281. headers_dir = snapshot_chrome_dir.parent / 'headers'
  282. headers_dir.mkdir(exist_ok=True)
  283. result = run_headers_capture(
  284. headers_dir,
  285. snapshot_chrome_dir,
  286. env,
  287. 'https://example.org',
  288. 'testhttps',
  289. )
  290. hook_code, _stdout, _stderr, nav_result, headers_file = result
  291. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  292. if hook_code == 0:
  293. if headers_file.exists():
  294. output_data = json.loads(headers_file.read_text())
  295. assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org')
  296. assert output_data['status'] in [200, 301, 302]
  297. def test_handles_404_gracefully():
  298. """Test that headers plugin handles 404s gracefully."""
  299. if not shutil.which('node'):
  300. pass
  301. with tempfile.TemporaryDirectory() as tmpdir:
  302. tmpdir = Path(tmpdir)
  303. with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
  304. headers_dir = snapshot_chrome_dir.parent / 'headers'
  305. headers_dir.mkdir(exist_ok=True)
  306. result = run_headers_capture(
  307. headers_dir,
  308. snapshot_chrome_dir,
  309. env,
  310. 'https://example.com/nonexistent-page-404',
  311. 'test404',
  312. )
  313. # May succeed or fail depending on server behavior
  314. # If it succeeds, verify 404 status is captured
  315. hook_code, _stdout, _stderr, nav_result, headers_file = result
  316. assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
  317. if hook_code == 0:
  318. if headers_file.exists():
  319. output_data = json.loads(headers_file.read_text())
  320. assert output_data['status'] == 404, "Should capture 404 status"
# Allow running this file directly: `python test_headers.py`
if __name__ == '__main__':
    pytest.main([__file__, '-v'])