on_Crawl__10_wget_install.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. #!/usr/bin/env python3
  2. """
  3. Emit wget Binary dependency for the crawl.
  4. """
  5. import json
  6. import os
  7. import sys
  8. # Read config from environment (already validated by JSONSchema)
  9. def get_env(name: str, default: str = '') -> str:
  10. return os.environ.get(name, default).strip()
  11. def get_env_bool(name: str, default: bool = False) -> bool:
  12. val = get_env(name, '').lower()
  13. if val in ('true', '1', 'yes', 'on'):
  14. return True
  15. if val in ('false', '0', 'no', 'off'):
  16. return False
  17. return default
  18. def get_env_int(name: str, default: int = 0) -> int:
  19. try:
  20. return int(get_env(name, str(default)))
  21. except ValueError:
  22. return default
  23. def output_binary(name: str, binproviders: str):
  24. """Output Binary JSONL record for a dependency."""
  25. machine_id = os.environ.get('MACHINE_ID', '')
  26. record = {
  27. 'type': 'Binary',
  28. 'name': name,
  29. 'binproviders': binproviders,
  30. 'machine_id': machine_id,
  31. }
  32. print(json.dumps(record))
  33. def output_machine_config(config: dict):
  34. """Output Machine config JSONL patch."""
  35. if not config:
  36. return
  37. record = {
  38. 'type': 'Machine',
  39. 'config': config,
  40. }
  41. print(json.dumps(record))
  42. def main():
  43. warnings = []
  44. errors = []
  45. # Get config values
  46. wget_enabled = get_env_bool('WGET_ENABLED', True)
  47. wget_save_warc = get_env_bool('WGET_SAVE_WARC', True)
  48. wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
  49. wget_binary = get_env('WGET_BINARY', 'wget')
  50. # Compute derived values (USE_WGET for backward compatibility)
  51. use_wget = wget_enabled
  52. # Validate timeout with warning (not error)
  53. if use_wget and wget_timeout < 20:
  54. warnings.append(
  55. f"WGET_TIMEOUT={wget_timeout} is very low. "
  56. "wget may fail to archive sites if set to less than ~20 seconds. "
  57. "Consider setting WGET_TIMEOUT=60 or higher."
  58. )
  59. if use_wget:
  60. output_binary(name='wget', binproviders='apt,brew,pip,env')
  61. # Output computed config patch as JSONL
  62. output_machine_config({
  63. 'USE_WGET': use_wget,
  64. 'WGET_BINARY': wget_binary,
  65. })
  66. for warning in warnings:
  67. print(f"WARNING:{warning}", file=sys.stderr)
  68. for error in errors:
  69. print(f"ERROR:{error}", file=sys.stderr)
  70. # Exit with error if any hard errors
  71. sys.exit(1 if errors else 0)
  72. if __name__ == '__main__':
  73. main()