on_Crawl__35_readability_install.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. #!/usr/bin/env python3
  2. """
  3. Emit readability-extractor Binary dependency for the crawl.
  4. """
  5. import json
  6. import os
  7. import sys
  8. def get_env(name: str, default: str = '') -> str:
  9. return os.environ.get(name, default).strip()
  10. def get_env_bool(name: str, default: bool = False) -> bool:
  11. val = get_env(name, '').lower()
  12. if val in ('true', '1', 'yes', 'on'):
  13. return True
  14. if val in ('false', '0', 'no', 'off'):
  15. return False
  16. return default
  17. def output_binary(name: str, binproviders: str):
  18. """Output Binary JSONL record for a dependency."""
  19. machine_id = os.environ.get('MACHINE_ID', '')
  20. record = {
  21. 'type': 'Binary',
  22. 'name': name,
  23. 'binproviders': binproviders,
  24. 'overrides': {
  25. 'npm': {
  26. 'packages': ['https://github.com/ArchiveBox/readability-extractor'],
  27. },
  28. },
  29. 'machine_id': machine_id,
  30. }
  31. print(json.dumps(record))
  32. def main():
  33. readability_enabled = get_env_bool('READABILITY_ENABLED', True)
  34. if not readability_enabled:
  35. sys.exit(0)
  36. output_binary(name='readability-extractor', binproviders='npm,env')
  37. sys.exit(0)
  38. if __name__ == '__main__':
  39. main()