archive.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #!/usr/bin/env python3
  2. """
ArchiveBox command line application.
./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`
Usage & Documentation:
https://github.com/pirate/ArchiveBox/wiki
  8. """
  9. import os
  10. import sys
  11. from typing import List
  12. from schema import Link
  13. from links import links_after_timestamp
  14. from index import write_links_index, load_links_index
  15. from archive_methods import archive_link
  16. from config import (
  17. ARCHIVE_DIR,
  18. ONLY_NEW,
  19. OUTPUT_DIR,
  20. GIT_SHA,
  21. )
  22. from util import (
  23. save_remote_source,
  24. save_stdin_source,
  25. )
  26. from logs import (
  27. log_archiving_started,
  28. log_archiving_paused,
  29. log_archiving_finished,
  30. )
  31. __AUTHOR__ = 'Nick Sweeting <[email protected]>'
  32. __VERSION__ = GIT_SHA[:9]
  33. __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
  34. __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
  35. def print_help():
  36. print('ArchiveBox: The self-hosted internet archive.\n')
  37. print("Documentation:")
  38. print(" https://github.com/pirate/ArchiveBox/wiki\n")
  39. print("UI Usage:")
  40. print(" Open output/index.html to view your archive.\n")
  41. print("CLI Usage:")
  42. print(" echo 'https://example.com' | ./archive\n")
  43. print(" ./archive ~/Downloads/bookmarks_export.html\n")
  44. print(" ./archive https://example.com/feed.rss\n")
  45. print(" ./archive 15109948213.123\n")
  46. def main(*args) -> List[Link]:
  47. if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
  48. print_help()
  49. raise SystemExit(0)
  50. if set(args).intersection(('--version', 'version')):
  51. print('ArchiveBox version {}'.format(__VERSION__))
  52. raise SystemExit(0)
  53. ### Handle CLI arguments
  54. # ./archive bookmarks.html
  55. # ./archive 1523422111.234
  56. import_path, resume = None, None
  57. if len(args) == 2:
  58. # if the argument is a string, it's a import_path file to import
  59. # if it's a number, it's a timestamp to resume archiving from
  60. if args[1].replace('.', '').isdigit():
  61. import_path, resume = None, args[1]
  62. else:
  63. import_path, resume = args[1], None
  64. ### Set up output folder
  65. if not os.path.exists(OUTPUT_DIR):
  66. os.makedirs(OUTPUT_DIR)
  67. ### Handle ingesting urls piped in through stdin
  68. # (.e.g if user does cat example_urls.txt | ./archive)
  69. if not sys.stdin.isatty():
  70. stdin_raw_text = sys.stdin.read()
  71. if stdin_raw_text and import_path:
  72. print(
  73. '[X] You should pass either a path as an argument, '
  74. 'or pass a list of links via stdin, but not both.\n'
  75. )
  76. print_help()
  77. raise SystemExit(1)
  78. import_path = save_stdin_source(stdin_raw_text)
  79. ### Handle ingesting urls from a remote file/feed
  80. # (e.g. if an RSS feed URL is used as the import path)
  81. if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
  82. import_path = save_remote_source(import_path)
  83. ### Run the main archive update process
  84. return update_archive_data(import_path=import_path, resume=resume)
  85. def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
  86. """The main ArchiveBox entrancepoint. Everything starts here."""
  87. # Step 1: Load list of links from the existing index
  88. # merge in and dedupe new links from import_path
  89. all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
  90. # Step 2: Write updated index with deduped old and new links back to disk
  91. write_links_index(out_dir=OUTPUT_DIR, links=all_links)
  92. # Step 3: Run the archive methods for each link
  93. links = new_links if ONLY_NEW else all_links
  94. log_archiving_started(len(links), resume)
  95. idx, link = 0, {'timestamp': 0}
  96. try:
  97. for idx, link in enumerate(links_after_timestamp(links, resume)):
  98. link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
  99. archive_link(link_dir, link)
  100. except KeyboardInterrupt:
  101. log_archiving_paused(len(links), idx, link['timestamp'])
  102. raise SystemExit(0)
  103. except:
  104. print()
  105. raise
  106. log_archiving_finished(len(links))
  107. # Step 4: Re-write links index with updated titles, icons, and resources
  108. all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
  109. write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
  110. return all_links
  111. if __name__ == '__main__':
  112. main(*sys.argv)