archivebox_worker.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. #!/usr/bin/env python3
# Explicitly set the package this module reports itself as belonging to.
# NOTE(review): presumably so the module resolves as part of `archivebox.cli`
# even when the file is executed directly as a script -- confirm.
__package__ = 'archivebox.cli'
# Command string for this module. It is not referenced anywhere in this file;
# presumably read by ArchiveBox's CLI tooling -- verify against callers.
__command__ = 'archivebox worker'
  4. import sys
  5. import rich_click as click
  6. from archivebox.misc.util import docstring
  7. def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
  8. """
  9. Start a worker process to process items from the queue.
  10. Worker types:
  11. - crawl: Process Crawl objects (parse seeds, create snapshots)
  12. - snapshot: Process Snapshot objects (create archive results)
  13. - archiveresult: Process ArchiveResult objects (run plugins)
  14. Workers poll the database for queued items, claim them atomically,
  15. and spawn subprocess tasks to handle each item.
  16. """
  17. from archivebox.workers.worker import get_worker_class
  18. WorkerClass = get_worker_class(worker_type)
  19. # Build kwargs
  20. kwargs = {'daemon': daemon}
  21. if plugin and worker_type == 'archiveresult':
  22. kwargs['extractor'] = plugin # internal field still called extractor
  23. # Create and run worker
  24. worker_instance = WorkerClass(**kwargs)
  25. worker_instance.runloop()
  26. @click.command()
  27. @click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
  28. @click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
  29. @click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
  30. @docstring(worker.__doc__)
  31. def main(worker_type: str, daemon: bool, plugin: str | None):
  32. """Start an ArchiveBox worker process"""
  33. worker(worker_type, daemon=daemon, plugin=plugin)
  34. if __name__ == '__main__':
  35. main()