archive.py
#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

import os
import sys

from datetime import datetime
from subprocess import run

from parse import parse_links
from links import validate_links
from archive_methods import archive_links, _RESULTS_TOTALS
from index import (
    write_links_index,
    write_link_index,
    parse_json_links_index,
    parse_json_link_index,
)
from config import (
    ONLY_NEW,
    OUTPUT_PERMISSIONS,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    TIMEOUT,
    SHOW_PROGRESS,
    GIT_SHA,
)
from util import (
    download_url,
    save_source,
    progress,
    cleanup_archive,
    pretty_path,
    migrate_data,
)

__AUTHOR__ = 'Nick Sweeting <[email protected]>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print("Usage:")
    print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print(" ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print(" echo 'https://example.com' | ./bin/archivebox\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        if SHOW_PROGRESS:
            print()

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(' > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    return all_links, new_links
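
# load_links() returns (all_links, new_links): the full deduped index plus only the
# links added by the current import; ONLY_NEW (used below) picks which set gets archived.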


def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""
    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    # loop over links and archive them
    archive_links(archive_path, links, source=source, resume=resume)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    # 'succeded' (sic) matches the key name defined in archive_methods._RESULTS_TOTALS
    print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))


if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
        print_help()
        raise SystemExit(0)

    migrate_data()

    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from

    stdin_raw_text = []
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)

    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR
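    # note: the else clause of the for loop above only runs when no existing
    # folder was found, so out_dir falls back to OUTPUT_DIR in that case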

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    elif stdin_raw_text:
        source = save_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)

    # Step 3: Verify folder structure is 1:1 with index
    # cleanup_archive(out_dir, links)

    # Step 4: Run the archive methods for each link
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)
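
# Example invocations (mirroring print_help above; the bare-timestamp form follows from
# the argc == 2 branch that treats an all-digit argument as a resume point):
#   ./bin/archivebox ~/Downloads/bookmarks_export.html   # import a bookmarks export file
#   ./bin/archivebox https://example.com/feed.rss        # download a feed URL and import its links
#   echo 'https://example.com' | ./bin/archivebox        # read links to import from stdin
#   ./bin/archivebox <timestamp>                         # resume a previous run from that timestamp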