2
0

archive.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  #!/usr/bin/env python3
  """
  ArchiveBox command line application.

  ./archive and ./bin/archivebox both point to this file,
  but you can also run it directly using `python3 archive.py`

  Usage & Documentation:
      https://github.com/pirate/ArchiveBox/wiki
  """

  import os
  import sys

  from links import links_after_timestamp
  from index import write_links_index, load_links_index
  from archive_methods import archive_link
  from config import (
      ARCHIVE_DIR,
      ONLY_NEW,
      OUTPUT_DIR,
      GIT_SHA,
  )
  from util import (
      check_dependencies,
      save_remote_source,
      save_stdin_source,
  )
  from logs import (
      log_archiving_started,
      log_archiving_paused,
      log_archiving_finished,
  )

  # Module metadata.
  # NOTE(review): the address inside __AUTHOR__ looks like it was replaced by an
  # email-obfuscation placeholder when this file was copied; confirm the real
  # address against the upstream repository before publishing.
  __AUTHOR__ = 'Nick Sweeting <[email protected]>'
  # The version string is whatever config exposes as GIT_SHA.
  __VERSION__ = GIT_SHA
  __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
  __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
  def print_help():
      """Print the ArchiveBox description, documentation link and usage examples.

      Everything is written to stdout with ``print``; nothing is returned.
      The usage examples cover the three invocation styles handled by the CLI:
      links piped via stdin, a local file or remote feed URL, and a numeric
      timestamp to resume from.
      """
      print('ArchiveBox: The self-hosted internet archive.\n')
      print("Documentation:")
      print(" https://github.com/pirate/ArchiveBox/wiki\n")
      print("Usage:")
      # The example URL previously read 'https://examplecom' (missing dot).
      print(" echo 'https://example.com' | ./bin/archivebox\n")
      print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
      print(" ./bin/archivebox https://example.com/feed.rss\n")
      print(" ./bin/archivebox 15109948213.123\n")
  def main(*args):
      """Parse the command line, collect the import source, and run the archiver.

      Args:
          *args: The full argv vector, program name first (the script entry
              point calls ``main(*sys.argv)``). At most one real argument is
              accepted after the program name:
                * a string made only of digits and dots is treated as a
                  timestamp to resume archiving from;
                * anything else is treated as an import path (a local file, or
                  an http/https/ftp URL that is downloaded first).

      Raises:
          SystemExit: with code 0 after printing help (help flag given, or more
              than one argument passed); with code 1 when links are supplied
              both via stdin and as a path argument.
      """
      if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
          print_help()
          raise SystemExit(0)

      ### Handle CLI arguments
      #     ./archive bookmarks.html
      #     ./archive 1523422111.234
      import_path, resume = None, None
      if len(args) == 2:
          # A purely numeric argument (dots allowed) is a timestamp to resume
          # from; any other string is a path/URL to import links from.
          if args[1].replace('.', '').isdigit():
              resume = args[1]
          else:
              import_path = args[1]

      ### Set up output folder
      # exist_ok avoids the check-then-create race of exists() + makedirs().
      os.makedirs(OUTPUT_DIR, exist_ok=True)

      ### Handle ingesting urls piped in through stdin
      # (e.g. if user does cat example_urls.txt | ./archive)
      if not sys.stdin.isatty():
          stdin_raw_text = sys.stdin.read()
          if stdin_raw_text and import_path:
              print(
                  '[X] You should pass either a path as an argument, '
                  'or pass a list of links via stdin, but not both.\n'
              )
              print_help()
              raise SystemExit(1)

          # Only switch to the stdin source when something was actually piped
          # in. A non-interactive but empty stdin (cron, `< /dev/null`, CI)
          # must not overwrite an import path given on the command line.
          if stdin_raw_text:
              import_path = save_stdin_source(stdin_raw_text)

      ### Handle ingesting urls from a remote file/feed
      # (e.g. if an RSS feed URL is used as the import path)
      if import_path and import_path.startswith(('http://', 'https://', 'ftp://')):
          import_path = save_remote_source(import_path)

      ### Run the main archive update process
      update_archive_data(import_path=import_path, resume=resume)
  def update_archive_data(import_path=None, resume=None):
      """The main ArchiveBox entry point. Everything starts here.

      Loads the link index, merges in links from ``import_path``, archives each
      selected link, then rewrites the index.

      Args:
          import_path: Forwarded to ``load_links_index`` as the source of new
              links; ``None`` means no new source is supplied.
          resume: Forwarded to ``links_after_timestamp`` and
              ``log_archiving_started``; presumably a timestamp string that
              selects where to restart (verify against those helpers).

      Raises:
          SystemExit: with code 0 when archiving is interrupted with Ctrl-C,
              after the paused state has been logged.
      """
      check_dependencies()

      # Step 1: Load list of links from the existing index
      #         merge in and dedupe new links from import_path
      all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

      # Step 2: Write updated index with deduped old and new links back to disk
      write_links_index(out_dir=OUTPUT_DIR, links=all_links)

      # Step 3: Run the archive methods for each link
      links = new_links if ONLY_NEW else all_links
      log_archiving_started(len(links), resume)

      # Falsy placeholders so the KeyboardInterrupt handler below has defined
      # values even when the interrupt arrives before the first link is reached
      # (`link and link['timestamp']` then evaluates to 0).
      idx, link = 0, 0
      try:
          for idx, link in enumerate(links_after_timestamp(links, resume)):
              link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
              archive_link(link_dir, link)

      except KeyboardInterrupt:
          log_archiving_paused(len(links), idx, link and link['timestamp'])
          raise SystemExit(0)

      except BaseException:
          # Emit a bare newline before the traceback, then let the original
          # exception propagate unchanged.
          print()
          raise

      log_archiving_finished(len(links))

      # Step 4: Re-write links index with updated titles, icons, and resources
      all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
      write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
  # Script entry point: hand the full argv (program name included) to main().
  if __name__ == '__main__':
      main(*sys.argv)