# Source file: test_wget.py (15 KB)
"""
Integration tests for wget plugin

Tests verify:
    1. Validate hook checks for wget binary
    2. Verify deps with abx-pkg
    3. Config options work (WGET_ENABLED, WGET_SAVE_WARC, etc.)
    4. Extraction works against real example.com
    5. Output files contain actual page content
    6. Skip cases work (WGET_ENABLED=False, staticfile present)
    7. Failure cases handled (404, network errors)
"""
  13. import json
  14. import os
  15. import shutil
  16. import subprocess
  17. import sys
  18. import tempfile
  19. import uuid
  20. from pathlib import Path
  21. import pytest
  22. PLUGIN_DIR = Path(__file__).parent.parent
  23. PLUGINS_ROOT = PLUGIN_DIR.parent
  24. WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.*'))
  25. BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Binary__install_using_brew_provider.py'
  26. APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Binary__install_using_apt_provider.py'
  27. TEST_URL = 'https://example.com'
  28. def test_hook_script_exists():
  29. """Verify hook script exists."""
  30. assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
  31. def test_verify_deps_with_abx_pkg():
  32. """Verify wget is available via abx-pkg."""
  33. from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
  34. wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
  35. wget_loaded = wget_binary.load()
  36. if wget_loaded and wget_loaded.abspath:
  37. assert True, "wget is available"
  38. else:
  39. pass
  40. def test_reports_missing_dependency_when_not_installed():
  41. """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
  42. with tempfile.TemporaryDirectory() as tmpdir:
  43. tmpdir = Path(tmpdir)
  44. # Run with empty PATH so binary won't be found
  45. env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
  46. result = subprocess.run(
  47. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
  48. cwd=tmpdir,
  49. capture_output=True,
  50. text=True,
  51. env=env
  52. )
  53. # Missing binary is a transient error - should exit 1 with no JSONL
  54. assert result.returncode == 1, "Should exit 1 when dependency missing"
  55. # Should NOT emit JSONL (transient error - will be retried)
  56. jsonl_lines = [line for line in result.stdout.strip().split('\n')
  57. if line.strip().startswith('{')]
  58. assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)"
  59. # Should log error to stderr
  60. assert 'wget' in result.stderr.lower() or 'error' in result.stderr.lower(), \
  61. "Should report error in stderr"
  62. def test_can_install_wget_via_provider():
  63. """Test that wget can be installed via brew/apt provider hooks."""
  64. # Determine which provider to use
  65. if shutil.which('brew'):
  66. provider_hook = BREW_HOOK
  67. provider_name = 'brew'
  68. elif shutil.which('apt-get'):
  69. provider_hook = APT_HOOK
  70. provider_name = 'apt'
  71. else:
  72. pass
  73. assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
  74. # Test installation via provider hook
  75. binary_id = str(uuid.uuid4())
  76. machine_id = str(uuid.uuid4())
  77. result = subprocess.run(
  78. [
  79. sys.executable,
  80. str(provider_hook),
  81. '--binary-id', binary_id,
  82. '--machine-id', machine_id,
  83. '--name', 'wget',
  84. '--binproviders', 'apt,brew,env'
  85. ],
  86. capture_output=True,
  87. text=True,
  88. timeout=300 # Installation can take time
  89. )
  90. # Should succeed (wget installs successfully or is already installed)
  91. assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"
  92. # Should output Binary JSONL record
  93. assert 'Binary' in result.stdout or 'wget' in result.stderr, \
  94. f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"
  95. # Parse JSONL if present
  96. if result.stdout.strip():
  97. pass
  98. for line in result.stdout.strip().split('\n'):
  99. pass
  100. try:
  101. record = json.loads(line)
  102. if record.get('type') == 'Binary':
  103. assert record['name'] == 'wget'
  104. assert record['binprovider'] in ['brew', 'apt']
  105. assert record['abspath'], "Should have binary path"
  106. assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
  107. break
  108. except json.JSONDecodeError:
  109. continue
  110. # Verify wget is now available
  111. result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
  112. assert result.returncode == 0, "wget should be available after installation"
  113. def test_archives_example_com():
  114. """Test full workflow: ensure wget installed then archive example.com."""
  115. # First ensure wget is installed via provider
  116. if shutil.which('brew'):
  117. provider_hook = BREW_HOOK
  118. elif shutil.which('apt-get'):
  119. provider_hook = APT_HOOK
  120. else:
  121. pass
  122. # Run installation (idempotent - will succeed if already installed)
  123. install_result = subprocess.run(
  124. [
  125. sys.executable,
  126. str(provider_hook),
  127. '--dependency-id', str(uuid.uuid4()),
  128. '--bin-name', 'wget',
  129. '--bin-providers', 'apt,brew,env'
  130. ],
  131. capture_output=True,
  132. text=True,
  133. timeout=300
  134. )
  135. if install_result.returncode != 0:
  136. pass
  137. # Now test archiving
  138. with tempfile.TemporaryDirectory() as tmpdir:
  139. tmpdir = Path(tmpdir)
  140. # Run wget extraction
  141. result = subprocess.run(
  142. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
  143. cwd=tmpdir,
  144. capture_output=True,
  145. text=True,
  146. timeout=120
  147. )
  148. assert result.returncode == 0, f"Extraction failed: {result.stderr}"
  149. # Parse clean JSONL output
  150. result_json = None
  151. for line in result.stdout.strip().split('\n'):
  152. line = line.strip()
  153. if line.startswith('{'):
  154. pass
  155. try:
  156. record = json.loads(line)
  157. if record.get('type') == 'ArchiveResult':
  158. result_json = record
  159. break
  160. except json.JSONDecodeError:
  161. pass
  162. assert result_json, "Should have ArchiveResult JSONL output"
  163. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  164. # Verify files were downloaded
  165. downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
  166. assert len(downloaded_files) > 0, "No HTML files downloaded"
  167. # Find main HTML file (should contain example.com)
  168. main_html = None
  169. for html_file in downloaded_files:
  170. content = html_file.read_text(errors='ignore')
  171. if 'example domain' in content.lower():
  172. main_html = html_file
  173. break
  174. assert main_html is not None, "Could not find main HTML file with example.com content"
  175. # Verify HTML content contains REAL example.com text
  176. html_content = main_html.read_text(errors='ignore')
  177. assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
  178. assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
  179. assert ('this domain' in html_content.lower() or
  180. 'illustrative examples' in html_content.lower()), \
  181. "Missing example.com description text"
  182. assert ('iana' in html_content.lower() or
  183. 'more information' in html_content.lower()), \
  184. "Missing IANA reference"
  185. def test_config_save_wget_false_skips():
  186. """Test that WGET_ENABLED=False exits without emitting JSONL."""
  187. with tempfile.TemporaryDirectory() as tmpdir:
  188. tmpdir = Path(tmpdir)
  189. # Set WGET_ENABLED=False
  190. env = os.environ.copy()
  191. env['WGET_ENABLED'] = 'False'
  192. result = subprocess.run(
  193. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  194. cwd=tmpdir,
  195. capture_output=True,
  196. text=True,
  197. env=env,
  198. timeout=30
  199. )
  200. # Should exit 0 when feature disabled
  201. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  202. # Feature disabled - no JSONL emission, just logs to stderr
  203. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  204. # Should NOT emit any JSONL
  205. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  206. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  207. def test_config_save_warc():
  208. """Test that WGET_SAVE_WARC=True creates WARC files."""
  209. # Ensure wget is available
  210. if not shutil.which('wget'):
  211. pass
  212. with tempfile.TemporaryDirectory() as tmpdir:
  213. tmpdir = Path(tmpdir)
  214. # Set WGET_SAVE_WARC=True explicitly
  215. env = os.environ.copy()
  216. env['WGET_SAVE_WARC'] = 'True'
  217. result = subprocess.run(
  218. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
  219. cwd=tmpdir,
  220. capture_output=True,
  221. text=True,
  222. env=env,
  223. timeout=120
  224. )
  225. if result.returncode == 0:
  226. # Look for WARC files in warc/ subdirectory
  227. warc_dir = tmpdir / 'warc'
  228. if warc_dir.exists():
  229. warc_files = list(warc_dir.rglob('*'))
  230. warc_files = [f for f in warc_files if f.is_file()]
  231. assert len(warc_files) > 0, "WARC file not created when WGET_SAVE_WARC=True"
  232. def test_staticfile_present_skips():
  233. """Test that wget skips when staticfile already downloaded."""
  234. with tempfile.TemporaryDirectory() as tmpdir:
  235. tmpdir = Path(tmpdir)
  236. # Create directory structure like real ArchiveBox:
  237. # tmpdir/
  238. # staticfile/ <- staticfile extractor output
  239. # wget/ <- wget extractor runs here, looks for ../staticfile
  240. staticfile_dir = tmpdir / 'staticfile'
  241. staticfile_dir.mkdir()
  242. (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
  243. wget_dir = tmpdir / 'wget'
  244. wget_dir.mkdir()
  245. result = subprocess.run(
  246. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
  247. cwd=wget_dir, # Run from wget subdirectory
  248. capture_output=True,
  249. text=True,
  250. timeout=30
  251. )
  252. # Should skip with permanent skip JSONL
  253. assert result.returncode == 0, "Should exit 0 when permanently skipping"
  254. # Parse clean JSONL output
  255. result_json = None
  256. for line in result.stdout.strip().split('\n'):
  257. line = line.strip()
  258. if line.startswith('{'):
  259. pass
  260. try:
  261. record = json.loads(line)
  262. if record.get('type') == 'ArchiveResult':
  263. result_json = record
  264. break
  265. except json.JSONDecodeError:
  266. pass
  267. assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
  268. assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
  269. assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"
  270. def test_handles_404_gracefully():
  271. """Test that wget fails gracefully on 404."""
  272. if not shutil.which('wget'):
  273. pass
  274. with tempfile.TemporaryDirectory() as tmpdir:
  275. tmpdir = Path(tmpdir)
  276. # Try to download non-existent page
  277. result = subprocess.run(
  278. [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
  279. cwd=tmpdir,
  280. capture_output=True,
  281. text=True,
  282. timeout=60
  283. )
  284. # Should fail
  285. assert result.returncode != 0, "Should fail on 404"
  286. combined = result.stdout + result.stderr
  287. assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
  288. "Should report 404 or no files downloaded"
  289. def test_config_timeout_honored():
  290. """Test that WGET_TIMEOUT config is respected."""
  291. if not shutil.which('wget'):
  292. pass
  293. with tempfile.TemporaryDirectory() as tmpdir:
  294. tmpdir = Path(tmpdir)
  295. # Set very short timeout
  296. env = os.environ.copy()
  297. env['WGET_TIMEOUT'] = '5'
  298. # This should still succeed for example.com (it's fast)
  299. result = subprocess.run(
  300. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
  301. cwd=tmpdir,
  302. capture_output=True,
  303. text=True,
  304. env=env,
  305. timeout=30
  306. )
  307. # Verify it completed (success or fail, but didn't hang)
  308. assert result.returncode in (0, 1), "Should complete (success or fail)"
  309. def test_config_user_agent():
  310. """Test that WGET_USER_AGENT config is used."""
  311. if not shutil.which('wget'):
  312. pass
  313. with tempfile.TemporaryDirectory() as tmpdir:
  314. tmpdir = Path(tmpdir)
  315. # Set custom user agent
  316. env = os.environ.copy()
  317. env['WGET_USER_AGENT'] = 'TestBot/1.0'
  318. result = subprocess.run(
  319. [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
  320. cwd=tmpdir,
  321. capture_output=True,
  322. text=True,
  323. env=env,
  324. timeout=120
  325. )
  326. # Should succeed (example.com doesn't block)
  327. if result.returncode == 0:
  328. # Parse clean JSONL output
  329. result_json = None
  330. for line in result.stdout.strip().split('\n'):
  331. line = line.strip()
  332. if line.startswith('{'):
  333. pass
  334. try:
  335. record = json.loads(line)
  336. if record.get('type') == 'ArchiveResult':
  337. result_json = record
  338. break
  339. except json.JSONDecodeError:
  340. pass
  341. assert result_json, "Should have ArchiveResult JSONL output"
  342. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  343. if __name__ == '__main__':
  344. pytest.main([__file__, '-v'])