| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
- #!/usr/bin/env python3
- __package__ = 'archivebox.cli'
- __command__ = 'archivebox schedule'
- import sys
- import argparse
- from pathlib import Path
- from typing import Optional, List, IO
- from archivebox.misc.util import docstring
- from archivebox.config import DATA_DIR
- from archivebox.misc.logging_util import SmartFormatter, reject_stdin
- from archivebox.config.common import ARCHIVING_CONFIG
- # @enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             tag: str='',
             depth: int=0,
             overwrite: bool=False,
             update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
             import_path: Optional[str]=None,
             out_dir: Path=DATA_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""
    # NOTE: the docstring above is reused verbatim as the CLI description by
    # main(), so it is intentionally kept short; details are documented here.
    #
    # Parameters:
    #   add / every  -- create a new cron job; `every` falls back to 'day' and
    #                   may be minute/hour/day/month/year or a cron expression.
    #   show         -- print the existing ArchiveBox cron jobs, then exit 0.
    #   clear        -- remove every job tagged CRON_COMMENT, then exit 0.
    #   foreground   -- iterate cron.run_scheduler() in this process.
    #   run_all      -- run each existing job once, immediately.
    #   quiet        -- suppress the storage-space warning.
    #   tag, depth, overwrite, update, import_path
    #                -- turned into flags for the scheduled `archivebox add`;
    #                   without import_path the job runs `archivebox update`.
    #   out_dir      -- directory the scheduled command `cd`s into first.
    #
    # Raises SystemExit:
    #   0 after `clear` or `show`; 1 for an invalid `every`, when
    #   foreground/run_all is requested with no scheduled jobs, or on Ctrl-C.
    #
    # NOTE(review): check_data_folder, CONSTANTS, CronTab, CronSlices,
    # dedupe_cron_jobs, CRON_COMMENT, stderr and SHELL_CONFIG are not imported
    # anywhere in the visible file -- confirm they are brought into scope,
    # otherwise this function fails with NameError.
    import shlex

    check_data_folder()
    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
    from archivebox.config.permissions import USER

    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    if clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if every or add:
        every = every or 'day'

        def quoted(value) -> str:
            # The job is stored as a single shell command line, so every
            # user-influenced token must be escaped for the shell (spaces,
            # quotes, `$`, backticks, ...), not merely wrapped in quotes.
            return shlex.quote(str(value))

        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY.load().abspath),
            *([
                'add',
                *(['--overwrite'] if overwrite else []),
                *(['--update'] if update else []),
                *([quoted(f'--tag={tag}')] if tag else []),
                quoted(f'--depth={depth}'),
                quoted(import_path),
            ] if import_path else ['update']),
            '>>',
            quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'month', 'year'):
            # e.g. every='day' -> new_job.every().day()
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr(' It must be one of minute/hour/day/month/year')
            stderr(' or a quoted cron-format schedule like:')
            stderr(' archivebox schedule --every=day --depth=1 https://example.com/some/rss/feed.xml')
            stderr(' archivebox schedule --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
        # Mark the job that was just added with a leading '>'.
        print('\n'.join(f' > {job}' if str(job) == str(new_job) else f' {job}' for job in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
            stderr(' Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr(' Make sure you have enough storage space available to hold all the data.')
            stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
            stderr('')
    elif show:
        if existing_jobs:
            print('\n'.join(str(job) for job in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
            stderr(' To schedule a new job, run:')
            stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    # Re-read the crontab so a job written above is visible to the runners below.
    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if not existing_jobs:
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
            stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))

        if run_all:
            try:
                for job in existing_jobs:
                    # Show the `cd ...` prefix, then the archivebox subcommand
                    # without its log redirection, before running the job.
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
                    sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                    sys.stdout.flush()
                    job.run()
                    # '\r' rewrites the pending line with a completion mark.
                    sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)

        if foreground:
            try:
                for job in existing_jobs:
                    print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                raise SystemExit(1)

    # if CAN_UPGRADE:
    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
@docstring(schedule.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox schedule`: parses `args`, rejects piped
    # stdin via reject_stdin(), and forwards the parsed options to schedule().
    #
    #   args  -- argument list without the program name (None -> no arguments)
    #   stdin -- stream handed to reject_stdin()
    #   pwd   -- when given, used as schedule()'s out_dir instead of DATA_DIR
    #
    # No literal docstring is written here because the @docstring decorator is
    # applied with schedule.__doc__ (presumably it sets __doc__ -- confirm).
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=schedule.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help=("Don't warn about storage space."),
    )
    # Only one of these "mode" flags may be given per invocation.
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--add', # '-a',
        action='store_true',
        help='Add a new scheduled ArchiveBox update job to cron',
    )
    parser.add_argument(
        '--every', # '-e',
        type=str,
        default=None,
        help='Run ArchiveBox once every [timeperiod] (minute/hour/day/month/year or cron format e.g. "0 0 * * *")',
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--depth', # '-d',
        type=int,
        choices=[0, 1],
        default=0,
        help='Depth to archive to [0] or 1, see "add" command help for more info',
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
    )
    parser.add_argument(
        '--update',
        action='store_true',
        help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
    )
    group.add_argument(
        '--clear', # '-c'
        action='store_true',
        help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
    )
    group.add_argument(
        '--show', # '-s'
        action='store_true',
        help=("Print a list of currently active ArchiveBox cron jobs"),
    )
    group.add_argument(
        '--foreground', '-f',
        action='store_true',
        help=("Launch ArchiveBox scheduler as a long-running foreground task "
              "instead of using cron."),
    )
    # --run-all is deliberately NOT part of the exclusive group: its help text
    # and schedule() both support combining it with --foreground, which the
    # group would make impossible. Its remaining conflicts are checked below.
    parser.add_argument(
        '--run-all', # '-a',
        action='store_true',
        help=("Run all the scheduled jobs once immediately, independent of "
              "their configured schedules, can be used together with --foreground"),
    )
    parser.add_argument(
        'import_path',
        nargs='?',
        type=str,
        default=None,
        help=("Check this path and import any new links on every run "
              "(can be either local file or remote URL)"),
    )
    command = parser.parse_args(args or ())

    # Keep rejecting the combinations the exclusive group used to forbid,
    # except --run-all together with --foreground.
    if command.run_all:
        for conflicting in ('add', 'clear', 'show'):
            if getattr(command, conflicting):
                parser.error(f'argument --run-all: not allowed with argument --{conflicting}')

    reject_stdin(__command__, stdin)

    schedule(
        add=command.add,
        show=command.show,
        clear=command.clear,
        foreground=command.foreground,
        run_all=command.run_all,
        quiet=command.quiet,
        every=command.every,
        tag=command.tag,
        depth=command.depth,
        overwrite=command.overwrite,
        update=command.update,
        import_path=command.import_path,
        out_dir=Path(pwd) if pwd else DATA_DIR,
    )
if __name__ == '__main__':
    # Executed as a script: pass the CLI arguments (without the program name)
    # and the process's stdin through to main().
    main(
        stdin=sys.stdin,
        args=sys.argv[1:],
    )
|