archivebox_schedule.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox schedule'
  4. import sys
  5. import argparse
  6. from pathlib import Path
  7. from typing import Optional, List, IO
  8. from archivebox.misc.util import docstring
  9. from archivebox.config import DATA_DIR
  10. from archivebox.misc.logging_util import SmartFormatter, reject_stdin
  11. from archivebox.config.common import ARCHIVING_CONFIG
  12. # @enforce_types
  13. def schedule(add: bool=False,
  14. show: bool=False,
  15. clear: bool=False,
  16. foreground: bool=False,
  17. run_all: bool=False,
  18. quiet: bool=False,
  19. every: Optional[str]=None,
  20. tag: str='',
  21. depth: int=0,
  22. overwrite: bool=False,
  23. update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
  24. import_path: Optional[str]=None,
  25. out_dir: Path=DATA_DIR):
  26. """Set ArchiveBox to regularly import URLs at specific times using cron"""
  27. check_data_folder()
  28. from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
  29. from archivebox.config.permissions import USER
  30. Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
  31. cron = CronTab(user=True)
  32. cron = dedupe_cron_jobs(cron)
  33. if clear:
  34. print(cron.remove_all(comment=CRON_COMMENT))
  35. cron.write()
  36. raise SystemExit(0)
  37. existing_jobs = list(cron.find_comment(CRON_COMMENT))
  38. if every or add:
  39. every = every or 'day'
  40. quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s)
  41. cmd = [
  42. 'cd',
  43. quoted(out_dir),
  44. '&&',
  45. quoted(ARCHIVEBOX_BINARY.load().abspath),
  46. *([
  47. 'add',
  48. *(['--overwrite'] if overwrite else []),
  49. *(['--update'] if update else []),
  50. *([f'--tag={tag}'] if tag else []),
  51. f'--depth={depth}',
  52. f'"{import_path}"',
  53. ] if import_path else ['update']),
  54. '>>',
  55. quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
  56. '2>&1',
  57. ]
  58. new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
  59. if every in ('minute', 'hour', 'day', 'month', 'year'):
  60. set_every = getattr(new_job.every(), every)
  61. set_every()
  62. elif CronSlices.is_valid(every):
  63. new_job.setall(every)
  64. else:
  65. stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
  66. stderr(' It must be one of minute/hour/day/month')
  67. stderr(' or a quoted cron-format schedule like:')
  68. stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
  69. stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml')
  70. raise SystemExit(1)
  71. cron = dedupe_cron_jobs(cron)
  72. cron.write()
  73. total_runs = sum(j.frequency_per_year() for j in cron)
  74. existing_jobs = list(cron.find_comment(CRON_COMMENT))
  75. print()
  76. print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
  77. print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
  78. if total_runs > 60 and not quiet:
  79. stderr()
  80. stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
  81. stderr(' Congrats on being an enthusiastic internet archiver! 👌')
  82. stderr()
  83. stderr(' Make sure you have enough storage space available to hold all the data.')
  84. stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
  85. stderr('')
  86. elif show:
  87. if existing_jobs:
  88. print('\n'.join(str(cmd) for cmd in existing_jobs))
  89. else:
  90. stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
  91. stderr(' To schedule a new job, run:')
  92. stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
  93. raise SystemExit(0)
  94. cron = CronTab(user=True)
  95. cron = dedupe_cron_jobs(cron)
  96. existing_jobs = list(cron.find_comment(CRON_COMMENT))
  97. if foreground or run_all:
  98. if not existing_jobs:
  99. stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
  100. stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
  101. raise SystemExit(1)
  102. print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
  103. if run_all:
  104. try:
  105. for job in existing_jobs:
  106. sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n')
  107. sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
  108. sys.stdout.flush()
  109. job.run()
  110. sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n')
  111. except KeyboardInterrupt:
  112. print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
  113. raise SystemExit(1)
  114. if foreground:
  115. try:
  116. for job in existing_jobs:
  117. print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}')
  118. for result in cron.run_scheduler():
  119. print(result)
  120. except KeyboardInterrupt:
  121. print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
  122. raise SystemExit(1)
  123. # if CAN_UPGRADE:
  124. # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
  125. @docstring(schedule.__doc__)
  126. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  127. parser = argparse.ArgumentParser(
  128. prog=__command__,
  129. description=schedule.__doc__,
  130. add_help=True,
  131. formatter_class=SmartFormatter,
  132. )
  133. parser.add_argument(
  134. '--quiet', '-q',
  135. action='store_true',
  136. help=("Don't warn about storage space."),
  137. )
  138. group = parser.add_mutually_exclusive_group()
  139. group.add_argument(
  140. '--add', # '-a',
  141. action='store_true',
  142. help='Add a new scheduled ArchiveBox update job to cron',
  143. )
  144. parser.add_argument(
  145. '--every', # '-e',
  146. type=str,
  147. default=None,
  148. help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. "0 0 * * *")',
  149. )
  150. parser.add_argument(
  151. '--tag', '-t',
  152. type=str,
  153. default='',
  154. help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
  155. )
  156. parser.add_argument(
  157. '--depth', # '-d',
  158. type=int,
  159. choices=[0, 1],
  160. default=0,
  161. help='Depth to archive to [0] or 1, see "add" command help for more info',
  162. )
  163. parser.add_argument(
  164. '--overwrite',
  165. action='store_true',
  166. help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
  167. )
  168. parser.add_argument(
  169. '--update',
  170. action='store_true',
  171. help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
  172. )
  173. group.add_argument(
  174. '--clear', # '-c'
  175. action='store_true',
  176. help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
  177. )
  178. group.add_argument(
  179. '--show', # '-s'
  180. action='store_true',
  181. help=("Print a list of currently active ArchiveBox cron jobs"),
  182. )
  183. group.add_argument(
  184. '--foreground', '-f',
  185. action='store_true',
  186. help=("Launch ArchiveBox scheduler as a long-running foreground task "
  187. "instead of using cron."),
  188. )
  189. group.add_argument(
  190. '--run-all', # '-a',
  191. action='store_true',
  192. help=("Run all the scheduled jobs once immediately, independent of "
  193. "their configured schedules, can be used together with --foreground"),
  194. )
  195. parser.add_argument(
  196. 'import_path',
  197. nargs='?',
  198. type=str,
  199. default=None,
  200. help=("Check this path and import any new links on every run "
  201. "(can be either local file or remote URL)"),
  202. )
  203. command = parser.parse_args(args or ())
  204. reject_stdin(__command__, stdin)
  205. schedule(
  206. add=command.add,
  207. show=command.show,
  208. clear=command.clear,
  209. foreground=command.foreground,
  210. run_all=command.run_all,
  211. quiet=command.quiet,
  212. every=command.every,
  213. tag=command.tag,
  214. depth=command.depth,
  215. overwrite=command.overwrite,
  216. update=command.update,
  217. import_path=command.import_path,
  218. out_dir=Path(pwd) if pwd else DATA_DIR,
  219. )
  220. if __name__ == '__main__':
  221. main(args=sys.argv[1:], stdin=sys.stdin)