""" Integration test for config propagation through worker hierarchy. Tests that config is properly merged and passed through: Parent CLI/Orchestrator └── CrawlWorker subprocess (via Process.env) └── SnapshotWorker subprocess (via Process.env) └── Hook subprocess (via Process.env) Config priority order (highest to lowest): 1. Snapshot.config (JSON field) 2. Crawl.config (JSON field) 3. User.config (JSON field) 4. Environment variables (os.environ + Process.env) 5. Config file (ArchiveBox.conf) 6. Plugin defaults (config.json) 7. Core defaults """ import os import json import tempfile import subprocess import time from pathlib import Path def test_config_propagation_through_worker_hierarchy(): """ Integration test: Verify config is properly merged at every level. Test flow: 1. Create test archive with custom config in ArchiveBox.conf 2. Set custom env vars before spawning worker 3. Create Crawl with custom crawl.config JSON field 4. Create Snapshot with custom snapshot.config JSON field 5. Spawn SnapshotWorker via archivebox run --snapshot-id=... 6. Verify worker received merged config from all sources 7. Verify hook subprocess also received correct config """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: Config Propagation Through Worker Hierarchy") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Step 1: Initialize archive print("Step 1: Initialize archive") result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" print(f"✓ Archive initialized\n") # Step 2: Write custom config to ArchiveBox.conf print("Step 2: Write custom config to ArchiveBox.conf") config_file = data_dir / 'ArchiveBox.conf' config_file.write_text(""" [GENERAL] # Custom timeout in config file TIMEOUT = 999 [ARCHIVING_CONFIG] # Enable all plugins for proper testing SAVE_WGET = True SAVE_WARC = True SAVE_PDF = True SAVE_DOM = True SAVE_SINGLEFILE = True SAVE_READABILITY = True SAVE_MERCURY = True SAVE_HTMLTOTEXT = True SAVE_GIT = True SAVE_MEDIA = True SAVE_ARCHIVE_DOT_ORG = True SAVE_TITLE = True SAVE_FAVICON = True SAVE_SCREENSHOT = True """) print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n") # Step 2.5: Set Machine.config values print("Step 2.5: Set Machine.config with custom binary path") set_machine_config_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from archivebox.machine.models import Machine machine = Machine.current() machine.config = {{ 'CUSTOM_MACHINE_KEY': 'from_machine_config', 'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path }} machine.save() print(f"Machine {{machine.hostname}} config updated") """ result = subprocess.run( ['python', '-c', set_machine_config_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}" print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n") # Step 3: Create Crawl via Django ORM with custom crawl.config print("Step 3: Create Crawl with custom crawl.config JSON") create_crawl_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from django.utils import timezone from archivebox.crawls.models import Crawl # Create crawl with custom config crawl = Crawl.objects.create( status='queued', retry_at=timezone.now(), urls='https://example.com', config={{ 'TIMEOUT': 777, # Crawl-level override (higher priority than file) 'CUSTOM_CRAWL_KEY': 'from_crawl_json', }} ) print(crawl.id) """ result = subprocess.run( ['python', '-c', create_crawl_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}" # Extract UUID from output (last line should be the UUID) crawl_id = result.stdout.decode().strip().split('\n')[-1] print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n") # Step 4: Create Snapshot with custom snapshot.config print("Step 4: Create Snapshot with custom snapshot.config JSON") create_snapshot_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from django.utils import timezone from archivebox.core.models import Snapshot from archivebox.crawls.models import Crawl crawl = Crawl.objects.get(id='{crawl_id}') snapshot = Snapshot.objects.create( url='https://example.com', crawl=crawl, status='queued', retry_at=timezone.now(), config={{ 'TIMEOUT': 555, # Snapshot-level override (highest priority) 'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json', 'SAVE_SCREENSHOT': True, # Keep screenshot enabled 'SAVE_WGET': False, # But disable wget as a test of per-snapshot override }} ) print(snapshot.id) """ result = subprocess.run( ['python', '-c', create_snapshot_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}" # Extract UUID from output (last line should be the UUID) snapshot_id = result.stdout.decode().strip().split('\n')[-1] print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n") # Step 5: Run SnapshotWorker with additional env var print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment") result = subprocess.run( ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', 'ENV_VAR_KEY': 'from_environment', # Environment variable }, capture_output=True, timeout=120, ) stdout = result.stdout.decode() stderr = result.stderr.decode() print("\n--- SnapshotWorker stdout ---") print(stdout) print("\n--- SnapshotWorker stderr ---") print(stderr) print("--- End output ---\n") # Step 6: Verify config was properly merged print("Step 6: Verify config merging") # Check that SnapshotWorker ran successfully assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}" # Verify config by checking stderr debug output and ArchiveResults in database print("\n--- Verifying config propagation ---\n") # Check for config debug messages in stderr assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \ "Expected debug output not found in stderr" print("✓ Config debug output found in stderr") # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults verify_precedence_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from archivebox.core.models import Snapshot from archivebox.config.configset import get_config snapshot = Snapshot.objects.get(id='{snapshot_id}') # Test precedence by getting config at different levels print("\\nTesting config precedence order:") # 1. Just defaults (lowest priority) config_defaults = get_config() print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}") # 2. With machine config from archivebox.machine.models import Machine machine = Machine.current() config_machine = get_config(machine=machine) custom_machine = config_machine.get('CUSTOM_MACHINE_KEY') print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}") # 3. With crawl config config_crawl = get_config(crawl=snapshot.crawl) print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)") assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}" # 4. With snapshot config (highest priority) config_snapshot = get_config(snapshot=snapshot) print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)") assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}" # Verify snapshot config overrides crawl config assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present" assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present" assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present" print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults") """ result = subprocess.run( ['python', '-c', verify_precedence_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nPrecedence verification error:") print(result.stderr.decode()) assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}" # Verify config values were actually used by checking ArchiveResults verify_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from archivebox.core.models import Snapshot, ArchiveResult from archivebox.config.configset import get_config snapshot = Snapshot.objects.get(id='{snapshot_id}') print(f"Snapshot status: {{snapshot.status}}") print(f"Snapshot URL: {{snapshot.url}}") # Check that snapshot reached sealed state assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}" # Verify all config sources are present in merged config print("\\nVerifying config merge priority:") config = get_config(snapshot=snapshot) # 1. Snapshot.config (highest priority) timeout = config.get('TIMEOUT') print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)") assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}" wget_enabled = config.get('SAVE_WGET') print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)") assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}" custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY') print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)") assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}" # 2. Crawl.config custom_crawl = config.get('CUSTOM_CRAWL_KEY') print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)") assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}" # 6. Machine.config custom_machine = config.get('CUSTOM_MACHINE_KEY') print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)") assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}" wget_binary = config.get('WGET_BINARY') print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)") # Note: This might be overridden by environment or other sources, just check it's present assert wget_binary is not None, f"WGET_BINARY should be present" # Check ArchiveResults to verify plugins actually ran with correct config results = ArchiveResult.objects.filter(snapshot=snapshot) print(f"\\nArchiveResults created: {{results.count()}}") for ar in results.order_by('plugin'): print(f" {{ar.plugin}}: {{ar.status}}") # Verify SAVE_WGET=False was respected (should have no wget result) wget_results = results.filter(plugin='wget') print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)") assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results" # Verify SAVE_SCREENSHOT=True was respected (should have screenshot result) screenshot_results = results.filter(plugin='screenshot') print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)") assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results" print("\\n✓ All config sources correctly merged:") print(" - Snapshot.config overrides (highest priority)") print(" - Crawl.config values present") print(" - Machine.config values present") print(" - File config values present") print("✓ Config priority order verified") print("✓ Snapshot successfully sealed") """ result = subprocess.run( ['python', '-c', verify_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nVerification error:") print(result.stderr.decode()) assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: Config properly propagated through worker hierarchy") print("="*80 + "\n") def test_config_environment_variable_parsing(): """ Test that Process._build_env() correctly serializes config values, and get_config() correctly parses them back from environment. """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: Config Environment Variable Parsing") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Initialize archive result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" # Test various data types in config test_config_types_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from archivebox.config.configset import get_config from archivebox.machine.models import Process, Machine # Test get_config() with no overrides (baseline) config = get_config() print(f"Baseline config keys: {{len(config)}}") # Create a test Process with various config types process = Process.objects.create( machine=Machine.current(), process_type=Process.TypeChoices.WORKER, pwd='{data_dir}', cmd=['test'], env={{ 'STRING_VAL': 'hello', 'INT_VAL': 123, 'FLOAT_VAL': 45.67, 'BOOL_TRUE': True, 'BOOL_FALSE': False, 'LIST_VAL': ['a', 'b', 'c'], 'DICT_VAL': {{'key': 'value'}}, 'NONE_VAL': None, }}, ) # Test _build_env() serialization env = process._build_env() print(f"\\nSerialized environment:") print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})") print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})") print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})") print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})") print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})") print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})") print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})") print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)") # Verify all are strings (required by subprocess.Popen) assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str" assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str" assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str" assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str" assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str" assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str" assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str" print("\\n✓ All environment values correctly serialized as strings") # Now test that get_config() can parse them back # Simulate subprocess by setting os.environ import json for key, val in env.items(): if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']: os.environ[key] = val # Get config again - should parse from environment config = get_config() print(f"\\nParsed from environment:") print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})") print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})") print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})") print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})") print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})") print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})") print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})") print("\\n✓ All config values correctly parsed from environment") """ result = subprocess.run( ['python', '-c', test_config_types_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.stderr: print("Script stderr:") print(result.stderr.decode()) assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: Config serialization and parsing works correctly") print("="*80 + "\n") def test_parent_environment_preserved_in_hooks(): """ Test that parent environment variables are preserved in hook execution. This test catches the bug where we built env=os.environ.copy() but then clobbered it with process.env={}, losing all parent environment. Also verifies: - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules - LIB_BIN_DIR is correctly derived and added to PATH """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: Parent Environment Preserved in Hooks") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Initialize archive print("Step 1: Initialize archive") result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" print(f"✓ Archive initialized\n") # Create snapshot print("Step 2: Create Snapshot") create_snapshot_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from django.utils import timezone from archivebox.core.models import Snapshot from archivebox.crawls.models import Crawl crawl = Crawl.objects.create( urls='https://example.com', status='queued', retry_at=timezone.now() ) snapshot = Snapshot.objects.create( url='https://example.com', crawl=crawl, status='queued', retry_at=timezone.now() ) print(snapshot.id) """ result = subprocess.run( ['python', '-c', create_snapshot_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}" snapshot_id = result.stdout.decode().strip().split('\n')[-1] print(f"✓ Created snapshot {snapshot_id}\n") # Run SnapshotWorker with custom parent environment variable print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process") result = subprocess.run( ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', 'TEST_PARENT_ENV_VAR': 'preserved_from_parent', # This should reach the hook 'PLUGINS': 'favicon', # Use existing plugin (favicon is simple and fast) }, capture_output=True, timeout=120, ) stdout = result.stdout.decode() stderr = result.stderr.decode() print("\n--- SnapshotWorker stderr (first 50 lines) ---") print('\n'.join(stderr.split('\n')[:50])) print("--- End stderr ---\n") # Verify hooks ran by checking Process records print("Step 4: Verify environment variables in hook Process records") verify_env_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from archivebox.machine.models import Process from archivebox.core.models import Snapshot import json snapshot = Snapshot.objects.get(id='{snapshot_id}') # Find hook processes for this snapshot hook_processes = Process.objects.filter( process_type=Process.TypeChoices.HOOK, pwd__contains=str(snapshot.id) ).order_by('-created_at') print(f"Found {{hook_processes.count()}} hook processes") if hook_processes.count() == 0: print("ERROR: No hook processes found!") import sys sys.exit(1) # Check the first hook process environment hook_process = hook_processes.first() print(f"\\nChecking hook: {{hook_process.cmd}}") print(f"Hook env keys: {{len(hook_process.env)}} total") # Verify TEST_PARENT_ENV_VAR was preserved test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR') print(f" TEST_PARENT_ENV_VAR: {{test_parent}}") assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}" # Verify LIB_DIR is set lib_dir = hook_process.env.get('LIB_DIR') print(f" LIB_DIR: {{lib_dir}}") assert lib_dir is not None, "LIB_DIR not set" # Verify LIB_BIN_DIR is derived lib_bin_dir = hook_process.env.get('LIB_BIN_DIR') print(f" LIB_BIN_DIR: {{lib_bin_dir}}") if lib_dir: assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR" assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}" # Verify LIB_BIN_DIR is in PATH path = hook_process.env.get('PATH') if lib_bin_dir: assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..." # Verify NODE_PATH is set node_path = hook_process.env.get('NODE_PATH') node_modules_dir = hook_process.env.get('NODE_MODULES_DIR') print(f" NODE_PATH: {{node_path}}") print(f" NODE_MODULES_DIR: {{node_modules_dir}}") if node_path: # Should also have NODE_MODULES_DIR for backwards compatibility assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH" print("\\n✓ All environment checks passed") """ result = subprocess.run( ['python', '-c', verify_env_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nVerification error:") print(result.stderr.decode()) assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: Parent environment preserved in hooks") print(" - Custom parent env vars reach hooks") print(" - LIB_DIR propagated correctly") print(" - LIB_BIN_DIR derived and added to PATH") print(" - NODE_PATH/NODE_MODULES_DIR set when available") print("="*80 + "\n") def test_config_auto_fetch_relationships(): """ Test that get_config() auto-fetches related objects from relationships. Verifies: - snapshot auto-fetched from archiveresult.snapshot - crawl auto-fetched from snapshot.crawl - user auto-fetched from crawl.created_by """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: Config Auto-Fetch Relationships") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Initialize archive print("Step 1: Initialize archive") result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=60, ) assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" print(f"✓ Archive initialized\n") # Create objects with config at each level print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level") create_objects_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' from archivebox.config.django import setup_django setup_django() from django.utils import timezone from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot, ArchiveResult from archivebox.config.configset import get_config # Create crawl with config crawl = Crawl.objects.create( urls='https://example.com', status='queued', retry_at=timezone.now(), config={{ 'CRAWL_KEY': 'from_crawl', 'TIMEOUT': 777, }} ) # Create snapshot with config snapshot = Snapshot.objects.create( url='https://example.com', crawl=crawl, status='queued', retry_at=timezone.now(), config={{ 'SNAPSHOT_KEY': 'from_snapshot', 'TIMEOUT': 555, }} ) # Create ArchiveResult ar = ArchiveResult.objects.create( snapshot=snapshot, plugin='test', hook_name='test_hook', status=ArchiveResult.StatusChoices.STARTED ) print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}") # Test 1: Auto-fetch crawl from snapshot print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl") config = get_config(snapshot=snapshot) assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}" assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}" assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}" print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)") print("✓ Both snapshot.config and crawl.config values present") # Test 2: Auto-fetch snapshot from archiveresult print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl") config_from_ar = get_config(archiveresult=ar) assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}" assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot" assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl" print("✓ Auto-fetched snapshot from ar.snapshot") print("✓ Auto-fetched crawl from snapshot.crawl") # Test 3: Precedence without auto-fetch (explicit crawl only) print("\\nTest 3: get_config(crawl=crawl) without snapshot") config_crawl_only = get_config(crawl=crawl) assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}" assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl' assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config" print("✓ Crawl-only config has TIMEOUT=777") print("✓ No snapshot config values present") print("\\n✓ All auto-fetch tests passed") """ result = subprocess.run( ['python', '-c', create_objects_script], cwd=str(data_dir.parent), env={ **os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False', }, capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nAuto-fetch test error:") print(result.stderr.decode()) assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: Config auto-fetches related objects correctly") print(" - archiveresult → snapshot → crawl → user") print(" - Precedence preserved during auto-fetch") print("="*80 + "\n") def test_config_precedence_with_environment_vars(): """ Test that config precedence order is correct when environment vars are set. Documented order (highest to lowest): 1. snapshot.config 2. crawl.config 3. user.config 4. persona config 5. environment variables <-- LOWER priority than snapshot/crawl 6. machine.config 7. config file 8. plugin defaults 9. core defaults This test verifies snapshot.config overrides environment variables. """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: Config Precedence with Environment Variables") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Initialize result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'}, capture_output=True, timeout=60, ) assert result.returncode == 0 print("✓ Archive initialized\n") # Test with environment variable set print("Step 1: Test with TIMEOUT=999 in environment") test_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' os.environ['TIMEOUT'] = '999' # Set env var from archivebox.config.django import setup_django setup_django() from django.utils import timezone from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot from archivebox.config.configset import get_config # Create crawl with TIMEOUT=777 crawl = Crawl.objects.create( urls='https://example.com', status='queued', retry_at=timezone.now(), config={{'TIMEOUT': 777}} ) # Create snapshot with TIMEOUT=555 snapshot = Snapshot.objects.create( url='https://example.com', crawl=crawl, status='queued', retry_at=timezone.now(), config={{'TIMEOUT': 555}} ) # Get config with all sources config = get_config(snapshot=snapshot) print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}") print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}") print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}") print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}") # Snapshot should override both crawl AND environment expected = 555 actual = config.get('TIMEOUT') if actual != expected: print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}") print(f" Snapshot.config should have highest priority!") import sys sys.exit(1) print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)") """ result = subprocess.run( ['python', '-c', test_script], cwd=str(data_dir.parent), capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nPrecedence bug detected:") print(result.stderr.decode()) assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: Snapshot config correctly overrides environment variables") print("="*80 + "\n") def test_new_environment_variables_added(): """ Test that NEW environment variables (not in defaults) are added to config. This is important for worker subprocesses that receive config via Process.env. When Worker.start() creates a subprocess, it serializes config to Process.env. The subprocess must be able to read those values back via get_config(). """ with tempfile.TemporaryDirectory() as tmpdir: data_dir = Path(tmpdir) / 'test_archive' data_dir.mkdir() print(f"\n{'='*80}") print(f"Test: New Environment Variables Added to Config") print(f"DATA_DIR: {data_dir}") print(f"{'='*80}\n") # Initialize result = subprocess.run( ['python', '-m', 'archivebox', 'init'], cwd=str(data_dir), env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'}, capture_output=True, timeout=60, ) assert result.returncode == 0 print("✓ Archive initialized\n") print("Step 1: Test that new uppercase env vars are added to config") test_script = f""" import os os.environ['DATA_DIR'] = '{data_dir}' os.environ['NEW_CUSTOM_VAR'] = 'custom_value' # Not in defaults os.environ['ANOTHER_VAR'] = 'another_value' os.environ['lowercase_var'] = 'should_be_ignored' # Lowercase should be ignored from archivebox.config.django import setup_django setup_django() from archivebox.config.configset import get_config config = get_config() # Check uppercase vars are added new_var = config.get('NEW_CUSTOM_VAR') another_var = config.get('ANOTHER_VAR') lowercase_var = config.get('lowercase_var') print(f"NEW_CUSTOM_VAR: {{new_var}}") print(f"ANOTHER_VAR: {{another_var}}") print(f"lowercase_var: {{lowercase_var}}") assert new_var == 'custom_value', f"Expected 'custom_value', got {{new_var}}" assert another_var == 'another_value', f"Expected 'another_value', got {{another_var}}" assert lowercase_var is None, f"Lowercase vars should be ignored, got {{lowercase_var}}" print("\\n✓ New uppercase environment variables added to config") print("✓ Lowercase environment variables ignored") """ result = subprocess.run( ['python', '-c', test_script], cwd=str(data_dir.parent), capture_output=True, timeout=30, ) print(result.stdout.decode()) if result.returncode != 0: print("\nTest error:") print(result.stderr.decode()) assert result.returncode == 0, f"Test failed: {result.stderr.decode()}" print("\n" + "="*80) print("✓ TEST PASSED: New environment variables correctly added to config") print("="*80 + "\n") if __name__ == '__main__': # Run as standalone script test_config_propagation_through_worker_hierarchy() test_config_environment_variable_parsing() test_parent_environment_preserved_in_hooks() test_config_auto_fetch_relationships() test_config_precedence_with_environment_vars() test_new_environment_variables_added()