# test_worker_config_propagation.py
"""
Integration test for config propagation through the worker hierarchy.

Tests that config is properly merged and passed through:

    Parent CLI/Orchestrator
    └── CrawlWorker subprocess (via Process.env)
        └── SnapshotWorker subprocess (via Process.env)
            └── Hook subprocess (via Process.env)

Config priority order (highest to lowest):
    1. Snapshot.config (JSON field)
    2. Crawl.config (JSON field)
    3. User.config (JSON field)
    4. Environment variables (os.environ + Process.env)
    5. Config file (ArchiveBox.conf)
    6. Plugin defaults (config.json)
    7. Core defaults
"""
  17. import os
  18. import json
  19. import tempfile
  20. import subprocess
  21. import time
  22. from pathlib import Path
  23. def test_config_propagation_through_worker_hierarchy():
  24. """
  25. Integration test: Verify config is properly merged at every level.
  26. Test flow:
  27. 1. Create test archive with custom config in ArchiveBox.conf
  28. 2. Set custom env vars before spawning worker
  29. 3. Create Crawl with custom crawl.config JSON field
  30. 4. Create Snapshot with custom snapshot.config JSON field
  31. 5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
  32. 6. Verify worker received merged config from all sources
  33. 7. Verify hook subprocess also received correct config
  34. """
  35. with tempfile.TemporaryDirectory() as tmpdir:
  36. data_dir = Path(tmpdir) / 'test_archive'
  37. data_dir.mkdir()
  38. print(f"\n{'='*80}")
  39. print(f"Test: Config Propagation Through Worker Hierarchy")
  40. print(f"DATA_DIR: {data_dir}")
  41. print(f"{'='*80}\n")
  42. # Step 1: Initialize archive
  43. print("Step 1: Initialize archive")
  44. result = subprocess.run(
  45. ['python', '-m', 'archivebox', 'init'],
  46. cwd=str(data_dir),
  47. env={
  48. **os.environ,
  49. 'DATA_DIR': str(data_dir),
  50. 'USE_COLOR': 'False',
  51. },
  52. capture_output=True,
  53. timeout=60,
  54. )
  55. assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
  56. print(f"✓ Archive initialized\n")
  57. # Step 2: Write custom config to ArchiveBox.conf
  58. print("Step 2: Write custom config to ArchiveBox.conf")
  59. config_file = data_dir / 'ArchiveBox.conf'
  60. config_file.write_text("""
  61. [GENERAL]
  62. # Custom timeout in config file
  63. TIMEOUT = 999
  64. [ARCHIVING_CONFIG]
  65. # Enable all plugins for proper testing
  66. SAVE_WGET = True
  67. SAVE_WARC = True
  68. SAVE_PDF = True
  69. SAVE_DOM = True
  70. SAVE_SINGLEFILE = True
  71. SAVE_READABILITY = True
  72. SAVE_MERCURY = True
  73. SAVE_HTMLTOTEXT = True
  74. SAVE_GIT = True
  75. SAVE_MEDIA = True
  76. SAVE_ARCHIVE_DOT_ORG = True
  77. SAVE_TITLE = True
  78. SAVE_FAVICON = True
  79. SAVE_SCREENSHOT = True
  80. """)
  81. print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
  82. # Step 2.5: Set Machine.config values
  83. print("Step 2.5: Set Machine.config with custom binary path")
  84. set_machine_config_script = f"""
  85. import os
  86. os.environ['DATA_DIR'] = '{data_dir}'
  87. from archivebox.config.django import setup_django
  88. setup_django()
  89. from archivebox.machine.models import Machine
  90. machine = Machine.current()
  91. machine.config = {{
  92. 'CUSTOM_MACHINE_KEY': 'from_machine_config',
  93. 'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path
  94. }}
  95. machine.save()
  96. print(f"Machine {{machine.hostname}} config updated")
  97. """
  98. result = subprocess.run(
  99. ['python', '-c', set_machine_config_script],
  100. cwd=str(data_dir.parent),
  101. env={
  102. **os.environ,
  103. 'DATA_DIR': str(data_dir),
  104. 'USE_COLOR': 'False',
  105. },
  106. capture_output=True,
  107. timeout=30,
  108. )
  109. assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
  110. print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
  111. # Step 3: Create Crawl via Django ORM with custom crawl.config
  112. print("Step 3: Create Crawl with custom crawl.config JSON")
  113. create_crawl_script = f"""
  114. import os
  115. os.environ['DATA_DIR'] = '{data_dir}'
  116. from archivebox.config.django import setup_django
  117. setup_django()
  118. from django.utils import timezone
  119. from archivebox.crawls.models import Crawl
  120. # Create crawl with custom config
  121. crawl = Crawl.objects.create(
  122. status='queued',
  123. retry_at=timezone.now(),
  124. urls='https://example.com',
  125. config={{
  126. 'TIMEOUT': 777, # Crawl-level override (higher priority than file)
  127. 'CUSTOM_CRAWL_KEY': 'from_crawl_json',
  128. }}
  129. )
  130. print(crawl.id)
  131. """
  132. result = subprocess.run(
  133. ['python', '-c', create_crawl_script],
  134. cwd=str(data_dir.parent),
  135. env={
  136. **os.environ,
  137. 'DATA_DIR': str(data_dir),
  138. 'USE_COLOR': 'False',
  139. },
  140. capture_output=True,
  141. timeout=30,
  142. )
  143. assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
  144. # Extract UUID from output (last line should be the UUID)
  145. crawl_id = result.stdout.decode().strip().split('\n')[-1]
  146. print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
  147. # Step 4: Create Snapshot with custom snapshot.config
  148. print("Step 4: Create Snapshot with custom snapshot.config JSON")
  149. create_snapshot_script = f"""
  150. import os
  151. os.environ['DATA_DIR'] = '{data_dir}'
  152. from archivebox.config.django import setup_django
  153. setup_django()
  154. from django.utils import timezone
  155. from archivebox.core.models import Snapshot
  156. from archivebox.crawls.models import Crawl
  157. crawl = Crawl.objects.get(id='{crawl_id}')
  158. snapshot = Snapshot.objects.create(
  159. url='https://example.com',
  160. crawl=crawl,
  161. status='queued',
  162. retry_at=timezone.now(),
  163. config={{
  164. 'TIMEOUT': 555, # Snapshot-level override (highest priority)
  165. 'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
  166. 'SAVE_SCREENSHOT': True, # Keep screenshot enabled
  167. 'SAVE_WGET': False, # But disable wget as a test of per-snapshot override
  168. }}
  169. )
  170. print(snapshot.id)
  171. """
  172. result = subprocess.run(
  173. ['python', '-c', create_snapshot_script],
  174. cwd=str(data_dir.parent),
  175. env={
  176. **os.environ,
  177. 'DATA_DIR': str(data_dir),
  178. 'USE_COLOR': 'False',
  179. },
  180. capture_output=True,
  181. timeout=30,
  182. )
  183. assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
  184. # Extract UUID from output (last line should be the UUID)
  185. snapshot_id = result.stdout.decode().strip().split('\n')[-1]
  186. print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
  187. # Step 5: Run SnapshotWorker with additional env var
  188. print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
  189. result = subprocess.run(
  190. ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
  191. cwd=str(data_dir),
  192. env={
  193. **os.environ,
  194. 'DATA_DIR': str(data_dir),
  195. 'USE_COLOR': 'False',
  196. 'ENV_VAR_KEY': 'from_environment', # Environment variable
  197. },
  198. capture_output=True,
  199. timeout=120,
  200. )
  201. stdout = result.stdout.decode()
  202. stderr = result.stderr.decode()
  203. print("\n--- SnapshotWorker stdout ---")
  204. print(stdout)
  205. print("\n--- SnapshotWorker stderr ---")
  206. print(stderr)
  207. print("--- End output ---\n")
  208. # Step 6: Verify config was properly merged
  209. print("Step 6: Verify config merging")
  210. # Check that SnapshotWorker ran successfully
  211. assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
  212. # Verify config by checking stderr debug output and ArchiveResults in database
  213. print("\n--- Verifying config propagation ---\n")
  214. # Check for config debug messages in stderr
  215. assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
  216. "Expected debug output not found in stderr"
  217. print("✓ Config debug output found in stderr")
  218. # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
  219. verify_precedence_script = f"""
  220. import os
  221. os.environ['DATA_DIR'] = '{data_dir}'
  222. from archivebox.config.django import setup_django
  223. setup_django()
  224. from archivebox.core.models import Snapshot
  225. from archivebox.config.configset import get_config
  226. snapshot = Snapshot.objects.get(id='{snapshot_id}')
  227. # Test precedence by getting config at different levels
  228. print("\\nTesting config precedence order:")
  229. # 1. Just defaults (lowest priority)
  230. config_defaults = get_config()
  231. print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")
  232. # 2. With machine config
  233. from archivebox.machine.models import Machine
  234. machine = Machine.current()
  235. config_machine = get_config(machine=machine)
  236. custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
  237. print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")
  238. # 3. With crawl config
  239. config_crawl = get_config(crawl=snapshot.crawl)
  240. print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
  241. assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"
  242. # 4. With snapshot config (highest priority)
  243. config_snapshot = get_config(snapshot=snapshot)
  244. print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
  245. assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"
  246. # Verify snapshot config overrides crawl config
  247. assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
  248. assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
  249. assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"
  250. print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
  251. """
  252. result = subprocess.run(
  253. ['python', '-c', verify_precedence_script],
  254. cwd=str(data_dir.parent),
  255. env={
  256. **os.environ,
  257. 'DATA_DIR': str(data_dir),
  258. 'USE_COLOR': 'False',
  259. },
  260. capture_output=True,
  261. timeout=30,
  262. )
  263. print(result.stdout.decode())
  264. if result.returncode != 0:
  265. print("\nPrecedence verification error:")
  266. print(result.stderr.decode())
  267. assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"
  268. # Verify config values were actually used by checking ArchiveResults
  269. verify_script = f"""
  270. import os
  271. os.environ['DATA_DIR'] = '{data_dir}'
  272. from archivebox.config.django import setup_django
  273. setup_django()
  274. from archivebox.core.models import Snapshot, ArchiveResult
  275. from archivebox.config.configset import get_config
  276. snapshot = Snapshot.objects.get(id='{snapshot_id}')
  277. print(f"Snapshot status: {{snapshot.status}}")
  278. print(f"Snapshot URL: {{snapshot.url}}")
  279. # Check that snapshot reached sealed state
  280. assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
  281. # Verify all config sources are present in merged config
  282. print("\\nVerifying config merge priority:")
  283. config = get_config(snapshot=snapshot)
  284. # 1. Snapshot.config (highest priority)
  285. timeout = config.get('TIMEOUT')
  286. print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
  287. assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
  288. wget_enabled = config.get('SAVE_WGET')
  289. print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
  290. assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
  291. custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
  292. print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
  293. assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
  294. # 2. Crawl.config
  295. custom_crawl = config.get('CUSTOM_CRAWL_KEY')
  296. print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
  297. assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
  298. # 6. Machine.config
  299. custom_machine = config.get('CUSTOM_MACHINE_KEY')
  300. print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
  301. assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
  302. wget_binary = config.get('WGET_BINARY')
  303. print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
  304. # Note: This might be overridden by environment or other sources, just check it's present
  305. assert wget_binary is not None, f"WGET_BINARY should be present"
  306. # Check ArchiveResults to verify plugins actually ran with correct config
  307. results = ArchiveResult.objects.filter(snapshot=snapshot)
  308. print(f"\\nArchiveResults created: {{results.count()}}")
  309. for ar in results.order_by('plugin'):
  310. print(f" {{ar.plugin}}: {{ar.status}}")
  311. # Verify SAVE_WGET=False was respected (should have no wget result)
  312. wget_results = results.filter(plugin='wget')
  313. print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
  314. assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
  315. # Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
  316. screenshot_results = results.filter(plugin='screenshot')
  317. print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
  318. assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
  319. print("\\n✓ All config sources correctly merged:")
  320. print(" - Snapshot.config overrides (highest priority)")
  321. print(" - Crawl.config values present")
  322. print(" - Machine.config values present")
  323. print(" - File config values present")
  324. print("✓ Config priority order verified")
  325. print("✓ Snapshot successfully sealed")
  326. """
  327. result = subprocess.run(
  328. ['python', '-c', verify_script],
  329. cwd=str(data_dir.parent),
  330. env={
  331. **os.environ,
  332. 'DATA_DIR': str(data_dir),
  333. 'USE_COLOR': 'False',
  334. },
  335. capture_output=True,
  336. timeout=30,
  337. )
  338. print(result.stdout.decode())
  339. if result.returncode != 0:
  340. print("\nVerification error:")
  341. print(result.stderr.decode())
  342. assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
  343. print("\n" + "="*80)
  344. print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
  345. print("="*80 + "\n")
def test_config_environment_variable_parsing():
    """
    Test that Process._build_env() correctly serializes config values,
    and get_config() correctly parses them back from environment.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()
        print(f"\n{'='*80}")
        print(f"Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")
        # Initialize archive via the real CLI so migrations/config are in place
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        # Test various data types in config.
        # NOTE(review): in this f-string template, single braces like {data_dir}
        # are filled in by THIS test process; doubled braces {{...}} are left
        # for the inner `python -c` script to evaluate at runtime.
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine
# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")
# Create a test Process with various config types
process = Process.objects.create(
machine=Machine.current(),
process_type=Process.TypeChoices.WORKER,
pwd='{data_dir}',
cmd=['test'],
env={{
'STRING_VAL': 'hello',
'INT_VAL': 123,
'FLOAT_VAL': 45.67,
'BOOL_TRUE': True,
'BOOL_FALSE': False,
'LIST_VAL': ['a', 'b', 'c'],
'DICT_VAL': {{'key': 'value'}},
'NONE_VAL': None,
}},
)
# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
print("\\n✓ All environment values correctly serialized as strings")
# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
import json
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val
# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
print("\\n✓ All config values correctly parsed from environment")
"""
        # Run the type round-trip script in a clean subprocess so the
        # os.environ mutations it makes don't leak into this test process
        result = subprocess.run(
            ['python', '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        # Surface stderr even on success: setup_django() may log warnings here
        if result.stderr:
            print("Script stderr:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
        print("\n" + "="*80)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print("="*80 + "\n")
def test_parent_environment_preserved_in_hooks():
    """
    Test that parent environment variables are preserved in hook execution.

    This test catches the bug where we built env=os.environ.copy() but then
    clobbered it with process.env={}, losing all parent environment.

    Also verifies:
    - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules
    - LIB_BIN_DIR is correctly derived and added to PATH
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()
        print(f"\n{'='*80}")
        print(f"Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")
        # Initialize archive via the real CLI entrypoint
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print(f"✓ Archive initialized\n")
        # Create snapshot (no custom config needed — this test only cares
        # about env propagation, not config merging)
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(
urls='https://example.com',
status='queued',
retry_at=timezone.now()
)
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status='queued',
retry_at=timezone.now()
)
print(snapshot.id)
"""
        result = subprocess.run(
            ['python', '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Last stdout line is the snapshot UUID (earlier lines may be log noise)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")
        # Run SnapshotWorker with a custom parent environment variable that
        # should survive all the way down into the hook subprocess env
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
                'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
                'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
            },
            capture_output=True,
            timeout=120,
        )
        stdout = result.stdout.decode()
        stderr = result.stderr.decode()
        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr.split('\n')[:50]))
        print("--- End stderr ---\n")
        # Verify hooks ran by checking Process records stored in the DB —
        # their recorded env is what the hook subprocess actually received
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
process_type=Process.TypeChoices.HOOK,
pwd__contains=str(snapshot.id)
).order_by('-created_at')
print(f"Found {{hook_processes.count()}} hook processes")
if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)
# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")
# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f" TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"
# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f" LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"
# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f" LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"
# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."
# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f" NODE_PATH: {{node_path}}")
print(f" NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"
print("\\n✓ All environment checks passed")
"""
        result = subprocess.run(
            ['python', '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"
        print("\n" + "="*80)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print(" - Custom parent env vars reach hooks")
        print(" - LIB_DIR propagated correctly")
        print(" - LIB_BIN_DIR derived and added to PATH")
        print(" - NODE_PATH/NODE_MODULES_DIR set when available")
        print("="*80 + "\n")
  620. def test_config_auto_fetch_relationships():
  621. """
  622. Test that get_config() auto-fetches related objects from relationships.
  623. Verifies:
  624. - snapshot auto-fetched from archiveresult.snapshot
  625. - crawl auto-fetched from snapshot.crawl
  626. - user auto-fetched from crawl.created_by
  627. """
  628. with tempfile.TemporaryDirectory() as tmpdir:
  629. data_dir = Path(tmpdir) / 'test_archive'
  630. data_dir.mkdir()
  631. print(f"\n{'='*80}")
  632. print(f"Test: Config Auto-Fetch Relationships")
  633. print(f"DATA_DIR: {data_dir}")
  634. print(f"{'='*80}\n")
  635. # Initialize archive
  636. print("Step 1: Initialize archive")
  637. result = subprocess.run(
  638. ['python', '-m', 'archivebox', 'init'],
  639. cwd=str(data_dir),
  640. env={
  641. **os.environ,
  642. 'DATA_DIR': str(data_dir),
  643. 'USE_COLOR': 'False',
  644. },
  645. capture_output=True,
  646. timeout=60,
  647. )
  648. assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
  649. print(f"✓ Archive initialized\n")
  650. # Create objects with config at each level
  651. print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
  652. create_objects_script = f"""
  653. import os
  654. os.environ['DATA_DIR'] = '{data_dir}'
  655. from archivebox.config.django import setup_django
  656. setup_django()
  657. from django.utils import timezone
  658. from archivebox.crawls.models import Crawl
  659. from archivebox.core.models import Snapshot, ArchiveResult
  660. from archivebox.config.configset import get_config
  661. # Create crawl with config
  662. crawl = Crawl.objects.create(
  663. urls='https://example.com',
  664. status='queued',
  665. retry_at=timezone.now(),
  666. config={{
  667. 'CRAWL_KEY': 'from_crawl',
  668. 'TIMEOUT': 777,
  669. }}
  670. )
  671. # Create snapshot with config
  672. snapshot = Snapshot.objects.create(
  673. url='https://example.com',
  674. crawl=crawl,
  675. status='queued',
  676. retry_at=timezone.now(),
  677. config={{
  678. 'SNAPSHOT_KEY': 'from_snapshot',
  679. 'TIMEOUT': 555,
  680. }}
  681. )
  682. # Create ArchiveResult
  683. ar = ArchiveResult.objects.create(
  684. snapshot=snapshot,
  685. plugin='test',
  686. hook_name='test_hook',
  687. status=ArchiveResult.StatusChoices.STARTED
  688. )
  689. print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")
  690. # Test 1: Auto-fetch crawl from snapshot
  691. print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
  692. config = get_config(snapshot=snapshot)
  693. assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
  694. assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
  695. assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
  696. print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
  697. print("✓ Both snapshot.config and crawl.config values present")
  698. # Test 2: Auto-fetch snapshot from archiveresult
  699. print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
  700. config_from_ar = get_config(archiveresult=ar)
  701. assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
  702. assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
  703. assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
  704. print("✓ Auto-fetched snapshot from ar.snapshot")
  705. print("✓ Auto-fetched crawl from snapshot.crawl")
  706. # Test 3: Precedence without auto-fetch (explicit crawl only)
  707. print("\\nTest 3: get_config(crawl=crawl) without snapshot")
  708. config_crawl_only = get_config(crawl=crawl)
  709. assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
  710. assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
  711. assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
  712. print("✓ Crawl-only config has TIMEOUT=777")
  713. print("✓ No snapshot config values present")
  714. print("\\n✓ All auto-fetch tests passed")
  715. """
  716. result = subprocess.run(
  717. ['python', '-c', create_objects_script],
  718. cwd=str(data_dir.parent),
  719. env={
  720. **os.environ,
  721. 'DATA_DIR': str(data_dir),
  722. 'USE_COLOR': 'False',
  723. },
  724. capture_output=True,
  725. timeout=30,
  726. )
  727. print(result.stdout.decode())
  728. if result.returncode != 0:
  729. print("\nAuto-fetch test error:")
  730. print(result.stderr.decode())
  731. assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}"
  732. print("\n" + "="*80)
  733. print("✓ TEST PASSED: Config auto-fetches related objects correctly")
  734. print(" - archiveresult → snapshot → crawl → user")
  735. print(" - Precedence preserved during auto-fetch")
  736. print("="*80 + "\n")
  737. def test_config_precedence_with_environment_vars():
  738. """
  739. Test that config precedence order is correct when environment vars are set.
  740. Documented order (highest to lowest):
  741. 1. snapshot.config
  742. 2. crawl.config
  743. 3. user.config
  744. 4. persona config
  745. 5. environment variables <-- LOWER priority than snapshot/crawl
  746. 6. machine.config
  747. 7. config file
  748. 8. plugin defaults
  749. 9. core defaults
  750. This test verifies snapshot.config overrides environment variables.
  751. """
  752. with tempfile.TemporaryDirectory() as tmpdir:
  753. data_dir = Path(tmpdir) / 'test_archive'
  754. data_dir.mkdir()
  755. print(f"\n{'='*80}")
  756. print(f"Test: Config Precedence with Environment Variables")
  757. print(f"DATA_DIR: {data_dir}")
  758. print(f"{'='*80}\n")
  759. # Initialize
  760. result = subprocess.run(
  761. ['python', '-m', 'archivebox', 'init'],
  762. cwd=str(data_dir),
  763. env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
  764. capture_output=True,
  765. timeout=60,
  766. )
  767. assert result.returncode == 0
  768. print("✓ Archive initialized\n")
  769. # Test with environment variable set
  770. print("Step 1: Test with TIMEOUT=999 in environment")
  771. test_script = f"""
  772. import os
  773. os.environ['DATA_DIR'] = '{data_dir}'
  774. os.environ['TIMEOUT'] = '999' # Set env var
  775. from archivebox.config.django import setup_django
  776. setup_django()
  777. from django.utils import timezone
  778. from archivebox.crawls.models import Crawl
  779. from archivebox.core.models import Snapshot
  780. from archivebox.config.configset import get_config
  781. # Create crawl with TIMEOUT=777
  782. crawl = Crawl.objects.create(
  783. urls='https://example.com',
  784. status='queued',
  785. retry_at=timezone.now(),
  786. config={{'TIMEOUT': 777}}
  787. )
  788. # Create snapshot with TIMEOUT=555
  789. snapshot = Snapshot.objects.create(
  790. url='https://example.com',
  791. crawl=crawl,
  792. status='queued',
  793. retry_at=timezone.now(),
  794. config={{'TIMEOUT': 555}}
  795. )
  796. # Get config with all sources
  797. config = get_config(snapshot=snapshot)
  798. print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
  799. print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
  800. print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
  801. print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")
  802. # Snapshot should override both crawl AND environment
  803. expected = 555
  804. actual = config.get('TIMEOUT')
  805. if actual != expected:
  806. print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
  807. print(f" Snapshot.config should have highest priority!")
  808. import sys
  809. sys.exit(1)
  810. print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
  811. """
  812. result = subprocess.run(
  813. ['python', '-c', test_script],
  814. cwd=str(data_dir.parent),
  815. capture_output=True,
  816. timeout=30,
  817. )
  818. print(result.stdout.decode())
  819. if result.returncode != 0:
  820. print("\nPrecedence bug detected:")
  821. print(result.stderr.decode())
  822. assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"
  823. print("\n" + "="*80)
  824. print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
  825. print("="*80 + "\n")
  826. def test_new_environment_variables_added():
  827. """
  828. Test that NEW environment variables (not in defaults) are added to config.
  829. This is important for worker subprocesses that receive config via Process.env.
  830. When Worker.start() creates a subprocess, it serializes config to Process.env.
  831. The subprocess must be able to read those values back via get_config().
  832. """
  833. with tempfile.TemporaryDirectory() as tmpdir:
  834. data_dir = Path(tmpdir) / 'test_archive'
  835. data_dir.mkdir()
  836. print(f"\n{'='*80}")
  837. print(f"Test: New Environment Variables Added to Config")
  838. print(f"DATA_DIR: {data_dir}")
  839. print(f"{'='*80}\n")
  840. # Initialize
  841. result = subprocess.run(
  842. ['python', '-m', 'archivebox', 'init'],
  843. cwd=str(data_dir),
  844. env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
  845. capture_output=True,
  846. timeout=60,
  847. )
  848. assert result.returncode == 0
  849. print("✓ Archive initialized\n")
  850. print("Step 1: Test that new uppercase env vars are added to config")
  851. test_script = f"""
  852. import os
  853. os.environ['DATA_DIR'] = '{data_dir}'
  854. os.environ['NEW_CUSTOM_VAR'] = 'custom_value' # Not in defaults
  855. os.environ['ANOTHER_VAR'] = 'another_value'
  856. os.environ['lowercase_var'] = 'should_be_ignored' # Lowercase should be ignored
  857. from archivebox.config.django import setup_django
  858. setup_django()
  859. from archivebox.config.configset import get_config
  860. config = get_config()
  861. # Check uppercase vars are added
  862. new_var = config.get('NEW_CUSTOM_VAR')
  863. another_var = config.get('ANOTHER_VAR')
  864. lowercase_var = config.get('lowercase_var')
  865. print(f"NEW_CUSTOM_VAR: {{new_var}}")
  866. print(f"ANOTHER_VAR: {{another_var}}")
  867. print(f"lowercase_var: {{lowercase_var}}")
  868. assert new_var == 'custom_value', f"Expected 'custom_value', got {{new_var}}"
  869. assert another_var == 'another_value', f"Expected 'another_value', got {{another_var}}"
  870. assert lowercase_var is None, f"Lowercase vars should be ignored, got {{lowercase_var}}"
  871. print("\\n✓ New uppercase environment variables added to config")
  872. print("✓ Lowercase environment variables ignored")
  873. """
  874. result = subprocess.run(
  875. ['python', '-c', test_script],
  876. cwd=str(data_dir.parent),
  877. capture_output=True,
  878. timeout=30,
  879. )
  880. print(result.stdout.decode())
  881. if result.returncode != 0:
  882. print("\nTest error:")
  883. print(result.stderr.decode())
  884. assert result.returncode == 0, f"Test failed: {result.stderr.decode()}"
  885. print("\n" + "="*80)
  886. print("✓ TEST PASSED: New environment variables correctly added to config")
  887. print("="*80 + "\n")
  888. if __name__ == '__main__':
  889. # Run as standalone script
  890. test_config_propagation_through_worker_hierarchy()
  891. test_config_environment_variable_parsing()
  892. test_parent_environment_preserved_in_hooks()
  893. test_config_auto_fetch_relationships()
  894. test_config_precedence_with_environment_vars()
  895. test_new_environment_variables_added()