# test_headers.py — integration tests for the headers plugin
  1. """
  2. Integration tests for headers plugin
  3. Tests verify:
  4. pass
  5. 1. Plugin script exists and is executable
  6. 2. Node.js is available
  7. 3. Headers extraction works for real example.com
  8. 4. Output JSON contains actual HTTP headers
  9. 5. HTTP fallback works correctly
  10. 6. Config options work (TIMEOUT, USER_AGENT)
  11. """
  12. import json
  13. import shutil
  14. import subprocess
  15. import tempfile
  16. from pathlib import Path
  17. import pytest
  18. PLUGIN_DIR = Path(__file__).parent.parent
  19. HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
  20. TEST_URL = 'https://example.com'
  21. def test_hook_script_exists():
  22. """Verify hook script exists."""
  23. assert HEADERS_HOOK.exists(), f"Hook script not found: {HEADERS_HOOK}"
  24. def test_node_is_available():
  25. """Test that Node.js is available on the system."""
  26. result = subprocess.run(
  27. ['which', 'node'],
  28. capture_output=True,
  29. text=True
  30. )
  31. if result.returncode != 0:
  32. pass
  33. binary_path = result.stdout.strip()
  34. assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
  35. # Test that node is executable and get version
  36. result = subprocess.run(
  37. ['node', '--version'],
  38. capture_output=True,
  39. text=True,
  40. timeout=10
  41. )
  42. assert result.returncode == 0, f"node not executable: {result.stderr}"
  43. assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
  44. def test_extracts_headers_from_example_com():
  45. """Test full workflow: extract headers from real example.com."""
  46. # Check node is available
  47. if not shutil.which('node'):
  48. pass
  49. with tempfile.TemporaryDirectory() as tmpdir:
  50. tmpdir = Path(tmpdir)
  51. # Run headers extraction
  52. result = subprocess.run(
  53. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
  54. cwd=tmpdir,
  55. capture_output=True,
  56. text=True,
  57. timeout=60
  58. )
  59. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  60. # Parse clean JSONL output
  61. result_json = None
  62. for line in result.stdout.strip().split('\n'):
  63. line = line.strip()
  64. if line.startswith('{'):
  65. pass
  66. try:
  67. record = json.loads(line)
  68. if record.get('type') == 'ArchiveResult':
  69. result_json = record
  70. break
  71. except json.JSONDecodeError:
  72. pass
  73. assert result_json, "Should have ArchiveResult JSONL output"
  74. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  75. # Verify output file exists (hook writes to current directory)
  76. headers_file = tmpdir / 'headers.json'
  77. assert headers_file.exists(), "headers.json not created"
  78. # Verify headers JSON contains REAL example.com response
  79. headers_data = json.loads(headers_file.read_text())
  80. assert 'url' in headers_data, "Should have url field"
  81. assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"
  82. assert 'status' in headers_data, "Should have status field"
  83. assert headers_data['status'] in [200, 301, 302], \
  84. f"Should have valid HTTP status, got {headers_data['status']}"
  85. assert 'headers' in headers_data, "Should have headers field"
  86. assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
  87. assert len(headers_data['headers']) > 0, "Headers dict should not be empty"
  88. # Verify common HTTP headers are present
  89. headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()}
  90. assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
  91. "Should have at least one common HTTP header"
  92. def test_headers_output_structure():
  93. """Test that headers plugin produces correctly structured output."""
  94. if not shutil.which('node'):
  95. pass
  96. with tempfile.TemporaryDirectory() as tmpdir:
  97. tmpdir = Path(tmpdir)
  98. # Run headers extraction against real example.com
  99. result = subprocess.run(
  100. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testformat'],
  101. cwd=tmpdir,
  102. capture_output=True,
  103. text=True,
  104. timeout=60
  105. )
  106. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  107. # Parse clean JSONL output
  108. result_json = None
  109. for line in result.stdout.strip().split('\n'):
  110. line = line.strip()
  111. if line.startswith('{'):
  112. pass
  113. try:
  114. record = json.loads(line)
  115. if record.get('type') == 'ArchiveResult':
  116. result_json = record
  117. break
  118. except json.JSONDecodeError:
  119. pass
  120. assert result_json, "Should have ArchiveResult JSONL output"
  121. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  122. # Verify output structure
  123. output_headers_file = tmpdir / 'headers.json'
  124. assert output_headers_file.exists(), "Output headers.json not created"
  125. output_data = json.loads(output_headers_file.read_text())
  126. # Verify all required fields are present
  127. assert 'url' in output_data, "Output should have url field"
  128. assert 'status' in output_data, "Output should have status field"
  129. assert 'headers' in output_data, "Output should have headers field"
  130. # Verify data types
  131. assert isinstance(output_data['status'], int), "Status should be integer"
  132. assert isinstance(output_data['headers'], dict), "Headers should be dict"
  133. # Verify example.com returns expected headers
  134. assert output_data['url'] == TEST_URL
  135. assert output_data['status'] in [200, 301, 302]
  136. def test_falls_back_to_http_when_chrome_unavailable():
  137. """Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
  138. if not shutil.which('node'):
  139. pass
  140. with tempfile.TemporaryDirectory() as tmpdir:
  141. tmpdir = Path(tmpdir)
  142. # Don't create chrome directory - force HTTP fallback
  143. # Run headers extraction
  144. result = subprocess.run(
  145. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
  146. cwd=tmpdir,
  147. capture_output=True,
  148. text=True,
  149. timeout=60
  150. )
  151. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  152. # Parse clean JSONL output
  153. result_json = None
  154. for line in result.stdout.strip().split('\n'):
  155. line = line.strip()
  156. if line.startswith('{'):
  157. pass
  158. try:
  159. record = json.loads(line)
  160. if record.get('type') == 'ArchiveResult':
  161. result_json = record
  162. break
  163. except json.JSONDecodeError:
  164. pass
  165. assert result_json, "Should have ArchiveResult JSONL output"
  166. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  167. # Verify output exists and has real HTTP headers
  168. output_headers_file = tmpdir / 'headers.json'
  169. assert output_headers_file.exists(), "Output headers.json not created"
  170. output_data = json.loads(output_headers_file.read_text())
  171. assert output_data['url'] == TEST_URL
  172. assert output_data['status'] in [200, 301, 302]
  173. assert isinstance(output_data['headers'], dict)
  174. assert len(output_data['headers']) > 0
  175. def test_config_timeout_honored():
  176. """Test that TIMEOUT config is respected."""
  177. if not shutil.which('node'):
  178. pass
  179. with tempfile.TemporaryDirectory() as tmpdir:
  180. tmpdir = Path(tmpdir)
  181. # Set very short timeout (but example.com should still succeed)
  182. import os
  183. env = os.environ.copy()
  184. env['TIMEOUT'] = '5'
  185. result = subprocess.run(
  186. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
  187. cwd=tmpdir,
  188. capture_output=True,
  189. text=True,
  190. env=env,
  191. timeout=30
  192. )
  193. # Should complete (success or fail, but not hang)
  194. assert result.returncode in (0, 1), "Should complete without hanging"
  195. def test_config_user_agent():
  196. """Test that USER_AGENT config is used."""
  197. if not shutil.which('node'):
  198. pass
  199. with tempfile.TemporaryDirectory() as tmpdir:
  200. tmpdir = Path(tmpdir)
  201. # Set custom user agent
  202. import os
  203. env = os.environ.copy()
  204. env['USER_AGENT'] = 'TestBot/1.0'
  205. result = subprocess.run(
  206. ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
  207. cwd=tmpdir,
  208. capture_output=True,
  209. text=True,
  210. env=env,
  211. timeout=60
  212. )
  213. # Should succeed (example.com doesn't block)
  214. if result.returncode == 0:
  215. # Parse clean JSONL output
  216. result_json = None
  217. for line in result.stdout.strip().split('\n'):
  218. line = line.strip()
  219. if line.startswith('{'):
  220. pass
  221. try:
  222. record = json.loads(line)
  223. if record.get('type') == 'ArchiveResult':
  224. result_json = record
  225. break
  226. except json.JSONDecodeError:
  227. pass
  228. assert result_json, "Should have ArchiveResult JSONL output"
  229. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  230. def test_handles_https_urls():
  231. """Test that HTTPS URLs work correctly."""
  232. if not shutil.which('node'):
  233. pass
  234. with tempfile.TemporaryDirectory() as tmpdir:
  235. tmpdir = Path(tmpdir)
  236. result = subprocess.run(
  237. ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
  238. cwd=tmpdir,
  239. capture_output=True,
  240. text=True,
  241. timeout=60
  242. )
  243. if result.returncode == 0:
  244. output_headers_file = tmpdir / 'headers.json'
  245. if output_headers_file.exists():
  246. output_data = json.loads(output_headers_file.read_text())
  247. assert output_data['url'] == 'https://example.org'
  248. assert output_data['status'] in [200, 301, 302]
  249. def test_handles_404_gracefully():
  250. """Test that headers plugin handles 404s gracefully."""
  251. if not shutil.which('node'):
  252. pass
  253. with tempfile.TemporaryDirectory() as tmpdir:
  254. tmpdir = Path(tmpdir)
  255. result = subprocess.run(
  256. ['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
  257. cwd=tmpdir,
  258. capture_output=True,
  259. text=True,
  260. timeout=60
  261. )
  262. # May succeed or fail depending on server behavior
  263. # If it succeeds, verify 404 status is captured
  264. if result.returncode == 0:
  265. output_headers_file = tmpdir / 'headers.json'
  266. if output_headers_file.exists():
  267. output_data = json.loads(output_headers_file.read_text())
  268. assert output_data['status'] == 404, "Should capture 404 status"
# Allow running this test module directly: `python test_headers.py`
if __name__ == '__main__':
    pytest.main([__file__, '-v'])