| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069 |
- """
- Integration test for config propagation through worker hierarchy.
- Tests that config is properly merged and passed through:
- Parent CLI/Orchestrator
- └── CrawlWorker subprocess (via Process.env)
- └── SnapshotWorker subprocess (via Process.env)
- └── Hook subprocess (via Process.env)
- Config priority order (highest to lowest):
- 1. Snapshot.config (JSON field)
- 2. Crawl.config (JSON field)
- 3. User.config (JSON field)
- 4. Persona config
- 5. Environment variables (os.environ + Process.env)
- 6. Machine.config
- 7. Config file (ArchiveBox.conf)
- 8. Plugin defaults (config.json)
- 9. Core defaults
- """
- import os
- import json
- import tempfile
- import subprocess
- import time
- from pathlib import Path
def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: verify config is properly merged at every level.

    Test flow:
    1. Initialize a test archive in a temporary directory
    2. Write custom config to ArchiveBox.conf and to Machine.config
    3. Create a Crawl with a custom crawl.config JSON field
    4. Create a Snapshot with a custom snapshot.config JSON field
    5. Run the SnapshotWorker via `archivebox run --snapshot-id ...` with an
       extra environment variable set
    6. Verify get_config() merges the sources with the expected precedence
    7. Verify the ArchiveResults in the database reflect the merged config

    Every step runs in its own subprocess; a non-zero exit code from any of
    them (except where noted) fails the test with that subprocess's stderr.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        def run_child(args, *, cwd, timeout, extra_env=None):
            """Run `args` with DATA_DIR pointed at the test archive and capture its output."""
            return subprocess.run(
                args,
                cwd=str(cwd),
                env={
                    **os.environ,
                    'DATA_DIR': str(data_dir),
                    'USE_COLOR': 'False',
                    **(extra_env or {}),
                },
                capture_output=True,
                timeout=timeout,
            )

        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = run_child(['python', '-m', 'archivebox', 'init'], cwd=data_dir, timeout=60)
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        config_file.write_text("""
[GENERAL]
# Custom timeout in config file
TIMEOUT = 999
[ARCHIVING_CONFIG]
# Enable all plugins for proper testing
SAVE_WGET = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_READABILITY = True
SAVE_MERCURY = True
SAVE_HTMLTOTEXT = True
SAVE_GIT = True
SAVE_MEDIA = True
SAVE_ARCHIVE_DOT_ORG = True
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        set_machine_config_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine
machine = Machine.current()
machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path
}}
machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
        result = run_child(['python', '-c', set_machine_config_script], cwd=data_dir.parent, timeout=30)
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
# Create crawl with custom config
crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777, # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
)
print(crawl.id)
"""
        result = run_child(['python', '-c', create_crawl_script], cwd=data_dir.parent, timeout=30)
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")

        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id='{crawl_id}')
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555, # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True, # Keep screenshot enabled
        'SAVE_WGET': False, # But disable wget as a test of per-snapshot override
    }}
)
print(snapshot.id)
"""
        result = run_child(['python', '-c', create_snapshot_script], cwd=data_dir.parent, timeout=30)
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")

        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = run_child(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=data_dir,
            timeout=120,
            extra_env={'ENV_VAR_KEY': 'from_environment'},  # Environment variable
        )
        stdout = result.stdout.decode()
        stderr = result.stderr.decode()
        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")

        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")
        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

        # Verify config by checking stderr debug output and ArchiveResults in database
        print("\n--- Verifying config propagation ---\n")
        # Check for config debug messages in stderr
        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
            "Expected debug output not found in stderr"
        print("✓ Config debug output found in stderr")

        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
        verify_precedence_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Test precedence by getting config at different levels
print("\\nTesting config precedence order:")
# 1. Just defaults (lowest priority)
config_defaults = get_config()
print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")
# 2. With machine config
from archivebox.machine.models import Machine
machine = Machine.current()
config_machine = get_config(machine=machine)
custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")
# 3. With crawl config
config_crawl = get_config(crawl=snapshot.crawl)
print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"
# 4. With snapshot config (highest priority)
config_snapshot = get_config(snapshot=snapshot)
print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"
# Verify snapshot config overrides crawl config
assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"
print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
        result = run_child(['python', '-c', verify_precedence_script], cwd=data_dir.parent, timeout=30)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence verification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"

        # Verify config values were actually used by checking ArchiveResults.
        # NOTE: this is an f-string, so every brace that belongs to the child
        # script's own f-strings must be doubled. Single braces would be
        # evaluated here, in the parent, where names such as `timeout` or
        # `wget_enabled` do not exist, raising NameError before the child runs.
        verify_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
snapshot = Snapshot.objects.get(id='{snapshot_id}')
print(f"Snapshot status: {{snapshot.status}}")
print(f"Snapshot URL: {{snapshot.url}}")
# Check that snapshot reached sealed state
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
# Verify all config sources are present in merged config
print("\\nVerifying config merge priority:")
config = get_config(snapshot=snapshot)
# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
wget_enabled = config.get('SAVE_WGET')
print(f" 1. Snapshot.config: SAVE_WGET={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"
# Check ArchiveResults to verify plugins actually ran with correct config
results = ArchiveResult.objects.filter(snapshot=snapshot)
print(f"\\nArchiveResults created: {{results.count()}}")
for ar in results.order_by('plugin'):
    print(f" {{ar.plugin}}: {{ar.status}}")
# Verify SAVE_WGET=False was respected (should have no wget result)
wget_results = results.filter(plugin='wget')
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
screenshot_results = results.filter(plugin='screenshot')
print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
print("\\n✓ All config sources correctly merged:")
print(" - Snapshot.config overrides (highest priority)")
print(" - Crawl.config values present")
print(" - Machine.config values present")
print(" - File config values present")
print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
        result = run_child(['python', '-c', verify_script], cwd=data_dir.parent, timeout=30)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
def test_config_environment_variable_parsing():
    """
    Round-trip check for config values carried through the process environment.

    Initializes a throwaway archive, then runs a helper script in a subprocess.
    The script creates a Process record whose env holds values of several
    Python types, asserts that Process._build_env() renders the non-None ones
    as str, copies those strings into os.environ, and prints what get_config()
    reports for them afterwards.
    """
    rule = '=' * 80
    with tempfile.TemporaryDirectory() as scratch:
        data_dir = Path(scratch) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by both subprocesses spawned below.
        child_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        print(f"\n{rule}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{rule}\n")

        # Initialize archive
        init_proc = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert init_proc.returncode == 0, f"Init failed: {init_proc.stderr.decode()}"

        # Helper script exercising various data types in config
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine
# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")
# Create a test Process with various config types
process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd='{data_dir}',
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
)
# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
print("\\n✓ All environment values correctly serialized as strings")
# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
import json
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val
# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
print("\\n✓ All config values correctly parsed from environment")
"""
        types_proc = subprocess.run(
            ['python', '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        print(types_proc.stdout.decode())
        # Show the script's stderr whenever it wrote any, not only on failure.
        if types_proc.stderr:
            print("Script stderr:")
            print(types_proc.stderr.decode())
        assert types_proc.returncode == 0, f"Type parsing test failed: {types_proc.stderr.decode()}"

        print("\n" + rule)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print(rule + "\n")
def test_parent_environment_preserved_in_hooks():
    """
    Check that variables from the parent environment survive into hook runs.

    Guards against the regression where env=os.environ.copy() was built and
    then overwritten by process.env={}, dropping the whole parent environment.

    The verification script inspects the env recorded on the newest hook
    Process for the snapshot and additionally checks that:
    - LIB_DIR is set and LIB_BIN_DIR is present, ends with /bin, and is in PATH
    - NODE_MODULES_DIR equals NODE_PATH whenever NODE_PATH is set
    """
    rule = '=' * 80
    with tempfile.TemporaryDirectory() as scratch:
        data_dir = Path(scratch) / 'test_archive'
        data_dir.mkdir()

        # Baseline environment handed to every subprocess in this test.
        child_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        print(f"\n{rule}")
        print("Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{rule}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        init_proc = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert init_proc.returncode == 0, f"Init failed: {init_proc.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Create snapshot
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now()
)
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now()
)
print(snapshot.id)
"""
        create_proc = subprocess.run(
            ['python', '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        assert create_proc.returncode == 0, f"Create snapshot failed: {create_proc.stderr.decode()}"
        # The script prints the snapshot id last, so take the final output line.
        snapshot_id = create_proc.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")

        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        worker_env = {
            **child_env,
            'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
            'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
        }
        worker_proc = subprocess.run(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env=worker_env,
            capture_output=True,
            timeout=120,
        )
        # stdout is decoded but not displayed; only stderr is printed below.
        stdout = worker_proc.stdout.decode()
        stderr = worker_proc.stderr.decode()
        stderr_head = stderr.split('\n')[:50]
        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr_head))
        print("--- End stderr ---\n")

        # Verify hooks ran by checking Process records
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
    process_type=Process.TypeChoices.HOOK,
    pwd__contains=str(snapshot.id)
).order_by('-created_at')
print(f"Found {{hook_processes.count()}} hook processes")
if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)
# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")
# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f" TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"
# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f" LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"
# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f" LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"
# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."
# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f" NODE_PATH: {{node_path}}")
print(f" NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"
print("\\n✓ All environment checks passed")
"""
        verify_proc = subprocess.run(
            ['python', '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        print(verify_proc.stdout.decode())
        if verify_proc.returncode != 0:
            print("\nVerification error:")
            print(verify_proc.stderr.decode())
        assert verify_proc.returncode == 0, f"Environment verification failed: {verify_proc.stderr.decode()}"

        print("\n" + rule)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print(" - Custom parent env vars reach hooks")
        print(" - LIB_DIR propagated correctly")
        print(" - LIB_BIN_DIR derived and added to PATH")
        print(" - NODE_PATH/NODE_MODULES_DIR set when available")
        print(rule + "\n")
def test_config_auto_fetch_relationships():
    """
    Check that get_config() pulls in config from related objects on its own.

    A single helper script creates a Crawl, a Snapshot on that crawl and an
    ArchiveResult on that snapshot, then asserts that:
    - get_config(snapshot=...) includes crawl.config, with snapshot.config
      winning for the shared TIMEOUT key
    - get_config(archiveresult=...) yields the same snapshot and crawl values
    - get_config(crawl=...) contains no snapshot-level keys
    """
    rule = '=' * 80
    with tempfile.TemporaryDirectory() as scratch:
        data_dir = Path(scratch) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by both subprocesses spawned below.
        child_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        print(f"\n{rule}")
        print("Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{rule}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        init_proc = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert init_proc.returncode == 0, f"Init failed: {init_proc.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Create objects with config at each level
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
        create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
# Create crawl with config
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{
        'CRAWL_KEY': 'from_crawl',
        'TIMEOUT': 777,
    }}
)
# Create snapshot with config
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'SNAPSHOT_KEY': 'from_snapshot',
        'TIMEOUT': 555,
    }}
)
# Create ArchiveResult
ar = ArchiveResult.objects.create(
    snapshot=snapshot,
    plugin='test',
    hook_name='test_hook',
    status=ArchiveResult.StatusChoices.STARTED
)
print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")
# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")
# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")
# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")
print("\\n✓ All auto-fetch tests passed")
"""
        objects_proc = subprocess.run(
            ['python', '-c', create_objects_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        print(objects_proc.stdout.decode())
        if objects_proc.returncode != 0:
            print("\nAuto-fetch test error:")
            print(objects_proc.stderr.decode())
        assert objects_proc.returncode == 0, f"Auto-fetch test failed: {objects_proc.stderr.decode()}"

        print("\n" + rule)
        print("✓ TEST PASSED: Config auto-fetches related objects correctly")
        print(" - archiveresult → snapshot → crawl → user")
        print(" - Precedence preserved during auto-fetch")
        print(rule + "\n")
def test_config_precedence_with_environment_vars():
    """
    Test that config precedence order is correct when environment vars are set.

    Documented order (highest to lowest):
    1. snapshot.config
    2. crawl.config
    3. user.config
    4. persona config
    5. environment variables <-- LOWER priority than snapshot/crawl
    6. machine.config
    7. config file
    8. plugin defaults
    9. core defaults

    This test verifies snapshot.config overrides environment variables.

    Raises:
        AssertionError: if `archivebox init` exits non-zero, or if the child
            script (which exits 1 when the merged TIMEOUT is not 555) fails.
        subprocess.TimeoutExpired: if either subprocess exceeds its timeout.
    """
    # Imported locally so this function does not depend on a module-level
    # `import sys` being present in the file.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize the archive with the same interpreter that is running this
        # test, so the subprocess sees the same installed packages.
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Test with environment variable set
        print("Step 1: Test with TIMEOUT=999 in environment")

        # The script is generated with an f-string: doubled braces become
        # literal braces in the child, and `\\n` becomes a `\n` escape there.
        # DATA_DIR is embedded via repr() so backslashes or quotes in the
        # temporary path cannot corrupt the generated string literal.
        test_script = f"""
import os
os.environ['DATA_DIR'] = {str(data_dir)!r}
os.environ['TIMEOUT'] = '999'  # Set env var

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 777}}
)

# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 555}}
)

# Get config with all sources
config = get_config(snapshot=snapshot)

print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")

# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
    print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
    print(f" Snapshot.config should have highest priority!")
    import sys
    sys.exit(1)

print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""

        # No explicit env here: the child inherits this process's environment
        # and sets DATA_DIR / TIMEOUT itself at the top of the script.
        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence bug detected:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
        print("="*80 + "\n")
def test_new_environment_variables_added():
    """
    Test that NEW environment variables (not in defaults) are added to config.

    This is important for worker subprocesses that receive config via Process.env.
    When Worker.start() creates a subprocess, it serializes config to Process.env.
    The subprocess must be able to read those values back via get_config().

    Raises:
        AssertionError: if `archivebox init` exits non-zero, or if the child
            script's assertions about the merged config fail.
        subprocess.TimeoutExpired: if either subprocess exceeds its timeout.
    """
    # Imported locally so this function does not depend on a module-level
    # `import sys` being present in the file.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: New Environment Variables Added to Config")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize the archive with the same interpreter that is running this
        # test, so the subprocess sees the same installed packages.
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        print("Step 1: Test that new uppercase env vars are added to config")

        # The script is generated with an f-string: doubled braces become
        # literal braces in the child, and `\\n` becomes a `\n` escape there.
        # DATA_DIR is embedded via repr() so backslashes or quotes in the
        # temporary path cannot corrupt the generated string literal.
        test_script = f"""
import os
os.environ['DATA_DIR'] = {str(data_dir)!r}
os.environ['NEW_CUSTOM_VAR'] = 'custom_value'  # Not in defaults
os.environ['ANOTHER_VAR'] = 'another_value'
os.environ['lowercase_var'] = 'should_be_ignored'  # Lowercase should be ignored

from archivebox.config.django import setup_django
setup_django()

from archivebox.config.configset import get_config

config = get_config()

# Check uppercase vars are added
new_var = config.get('NEW_CUSTOM_VAR')
another_var = config.get('ANOTHER_VAR')
lowercase_var = config.get('lowercase_var')

print(f"NEW_CUSTOM_VAR: {{new_var}}")
print(f"ANOTHER_VAR: {{another_var}}")
print(f"lowercase_var: {{lowercase_var}}")

assert new_var == 'custom_value', f"Expected 'custom_value', got {{new_var}}"
assert another_var == 'another_value', f"Expected 'another_value', got {{another_var}}"
assert lowercase_var is None, f"Lowercase vars should be ignored, got {{lowercase_var}}"

print("\\n✓ New uppercase environment variables added to config")
print("✓ Lowercase environment variables ignored")
"""

        # No explicit env here: the child inherits this process's environment
        # and sets the variables under test itself at the top of the script.
        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nTest error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: New environment variables correctly added to config")
        print("="*80 + "\n")
if __name__ == '__main__':
    # Standalone entry point: run each test function of this module in the
    # listed order. An exception from any test propagates and stops the run.
    standalone_tests = (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
        test_parent_environment_preserved_in_hooks,
        test_config_auto_fetch_relationships,
        test_config_precedence_with_environment_vars,
        test_new_environment_variables_added,
    )
    for run_test in standalone_tests:
        run_test()
|