archivebox_schedule.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. import sys
  4. from pathlib import Path
  5. import rich_click as click
  6. from rich import print
  7. from archivebox.misc.util import enforce_types, docstring
  8. from archivebox.config import DATA_DIR, CONSTANTS
  9. from archivebox.config.common import ARCHIVING_CONFIG
  10. from archivebox.config.permissions import USER
# Comment string attached to every cron entry that schedule() creates; the same
# string is used to look those entries up again and to remove them on --clear.
CRON_COMMENT = 'ArchiveBox'
  12. @enforce_types
  13. def schedule(add: bool=False,
  14. show: bool=False,
  15. clear: bool=False,
  16. foreground: bool=False,
  17. run_all: bool=False,
  18. quiet: bool=False,
  19. every: str | None=None,
  20. tag: str='',
  21. depth: int | str=0,
  22. overwrite: bool=False,
  23. update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
  24. import_path: str | None=None,
  25. out_dir: Path=DATA_DIR) -> None:
  26. """Set ArchiveBox to regularly import URLs at specific times using cron"""
  27. depth = int(depth)
  28. import shutil
  29. from crontab import CronTab, CronSlices
  30. from archivebox.misc.system import dedupe_cron_jobs
  31. # Find the archivebox binary path
  32. ARCHIVEBOX_ABSPATH = shutil.which('archivebox') or sys.executable.replace('python', 'archivebox')
  33. Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
  34. cron = CronTab(user=True)
  35. cron = dedupe_cron_jobs(cron)
  36. if clear:
  37. print(cron.remove_all(comment=CRON_COMMENT))
  38. cron.write()
  39. raise SystemExit(0)
  40. existing_jobs = list(cron.find_comment(CRON_COMMENT))
  41. if every or add:
  42. every = every or 'day'
  43. quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
  44. cmd = [
  45. 'cd',
  46. quoted(out_dir),
  47. '&&',
  48. quoted(ARCHIVEBOX_ABSPATH),
  49. *([
  50. 'add',
  51. *(['--overwrite'] if overwrite else []),
  52. *(['--update'] if update else []),
  53. *([f'--tag={tag}'] if tag else []),
  54. f'--depth={depth}',
  55. f'"{import_path}"',
  56. ] if import_path else ['update']),
  57. '>>',
  58. quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
  59. '2>&1',
  60. ]
  61. new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
  62. if every in ('minute', 'hour', 'day', 'month', 'year'):
  63. set_every = getattr(new_job.every(), every)
  64. set_every()
  65. elif CronSlices.is_valid(every):
  66. new_job.setall(every)
  67. else:
  68. print('[red]\\[X] Got invalid timeperiod for cron task.[/red]')
  69. print(' It must be one of minute/hour/day/month')
  70. print(' or a quoted cron-format schedule like:')
  71. print(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
  72. print(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
  73. raise SystemExit(1)
  74. cron = dedupe_cron_jobs(cron)
  75. print(cron)
  76. cron.write()
  77. total_runs = sum(j.frequency_per_year() for j in cron)
  78. existing_jobs = list(cron.find_command('archivebox'))
  79. print()
  80. print('[green]\\[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).[/green]'.format(USER, len(existing_jobs)))
  81. print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
  82. if total_runs > 60 and not quiet:
  83. print()
  84. print('[yellow]\\[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.[/yellow]'.format(total_runs))
  85. print(' Congrats on being an enthusiastic internet archiver! 👌')
  86. print()
  87. print(' [violet]Make sure you have enough storage space available to hold all the data.[/violet]')
  88. print(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
  89. print()
  90. elif show:
  91. if existing_jobs:
  92. print('\n'.join(str(cmd) for cmd in existing_jobs))
  93. else:
  94. print('[red]\\[X] There are no ArchiveBox cron jobs scheduled for your user ({}).[/red]'.format(USER))
  95. print(' To schedule a new job, run:')
  96. print(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
  97. raise SystemExit(0)
  98. if foreground or run_all:
  99. if not existing_jobs:
  100. print('[red]\\[X] You must schedule some jobs first before running in foreground mode.[/red]')
  101. print(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
  102. raise SystemExit(1)
  103. print('[green]\\[*] Running {} ArchiveBox jobs in foreground task scheduler...[/green]'.format(len(existing_jobs)))
  104. if run_all:
  105. try:
  106. for job in existing_jobs:
  107. sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
  108. sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
  109. sys.stdout.flush()
  110. job.run()
  111. sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n')
  112. except KeyboardInterrupt:
  113. print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)')
  114. raise SystemExit(1)
  115. if foreground:
  116. try:
  117. for job in existing_jobs:
  118. print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
  119. for result in cron.run_scheduler():
  120. print(result)
  121. except KeyboardInterrupt:
  122. print('\n[green]\\[√] Stopped.[/green] (Ctrl+C)')
  123. raise SystemExit(1)
  124. @click.command()
  125. @click.option('--quiet', '-q', is_flag=True, help="Don't warn about storage space")
  126. @click.option('--add', is_flag=True, help='Add a new scheduled ArchiveBox update job to cron')
  127. @click.option('--every', type=str, help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")')
  128. @click.option('--tag', '-t', default='', help='Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3')
  129. @click.option('--depth', type=click.Choice(['0', '1']), default='0', help='Depth to archive to [0] or 1')
  130. @click.option('--overwrite', is_flag=True, help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots')
  131. @click.option('--update', is_flag=True, help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults')
  132. @click.option('--clear', is_flag=True, help='Stop all ArchiveBox scheduled runs (remove cron jobs)')
  133. @click.option('--show', is_flag=True, help='Print a list of currently active ArchiveBox cron jobs')
  134. @click.option('--foreground', '-f', is_flag=True, help='Launch ArchiveBox scheduler as a long-running foreground task instead of using cron')
  135. @click.option('--run-all', is_flag=True, help='Run all the scheduled jobs once immediately, independent of their configured schedules')
  136. @click.argument('import_path', required=False)
  137. @docstring(schedule.__doc__)
  138. def main(**kwargs):
  139. """Set ArchiveBox to regularly import URLs at specific times using cron"""
  140. schedule(**kwargs)
# Allow running this module directly as a script: invoke the click entry point.
if __name__ == '__main__':
    main()