software
/
archival.ArchiveBox
mirror of https://github.com/ArchiveBox/ArchiveBox.git


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
							#!/usr/bin/env python3

# Explicit module metadata.
# NOTE(review): __command__ looks like the CLI invocation this module
# implements; confirm where (if anywhere) it is consumed.
__package__ = 'archivebox.cli'
__command__ = 'archivebox worker'

import sys

import rich_click as click

from archivebox.misc.util import docstring


def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
    """
    Start a worker process to process items from the queue.

    Worker types:
        - crawl: Process Crawl objects (parse seeds, create snapshots)
        - snapshot: Process Snapshot objects (create archive results)
        - archiveresult: Process ArchiveResult objects (run plugins)

    Workers poll the database for queued items, claim them atomically,
    and spawn subprocess tasks to handle each item.
    """
    # NOTE: the docstring above is passed to `@docstring(...)` on `main`, so it
    # should be treated as user-facing text, not just developer notes.

    # Imported lazily inside the function rather than at module import time.
    from archivebox.workers.worker import get_worker_class

    worker_cls = get_worker_class(worker_type)

    # Constructor arguments: `daemon` is always forwarded; the plugin filter is
    # forwarded only for archiveresult workers and only when a plugin was given.
    init_kwargs = {'daemon': daemon}
    if worker_type == 'archiveresult' and plugin:
        # The worker's keyword is still named `extractor` internally.
        init_kwargs['extractor'] = plugin

    # Instantiate the worker and hand control to its run loop.
    worker_cls(**init_kwargs).runloop()


@click.command()
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
@docstring(worker.__doc__)
def main(worker_type: str, daemon: bool, plugin: str | None):
    """Start an ArchiveBox worker process"""
    # CLI entry point: click parses the argument/options declared above and
    # this function forwards them unchanged to `worker`.
    worker(
        worker_type=worker_type,
        daemon=daemon,
        plugin=plugin,
    )


if __name__ == '__main__':
    # Invoke the click command when this file is executed directly as a script.
    main()