archive.py

#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

import os
import sys

from datetime import datetime
from subprocess import run

from parse import parse_links
from links import validate_links
from archive_methods import archive_links, _RESULTS_TOTALS
from index import (
    write_links_index,
    write_link_index,
    parse_json_links_index,
    parse_json_link_index,
)
from config import (
    ONLY_NEW,
    OUTPUT_PERMISSIONS,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    TIMEOUT,
    SHOW_PROGRESS,
    GIT_SHA,
)
from util import (
    download_url,
    save_source,
    progress,
    cleanup_archive,
    pretty_path,
    migrate_data,
)

__AUTHOR__ = 'Nick Sweeting <[email protected]>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print("Usage:")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print("    echo 'https://example.com' | ./bin/archivebox\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        existing_links = validate_links(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        if SHOW_PROGRESS:
            print()

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(' > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    return all_links, new_links


def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    # loop over links and archive them
    archive_links(archive_path, links, source=source, resume=resume)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))  # 'succeded' matches the key name defined in archive_methods
    print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
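

# Command-line entrypoint: parse the arguments (an import file/URL, an optional
# resume timestamp, or links piped via stdin), pick the output folder, then
# load and dedupe the links, rewrite the index, and run the archive methods.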
if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
        print_help()
        raise SystemExit(0)

    migrate_data()

    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from

    stdin_raw_text = []
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)

    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)

    # Pick the output folder: use the first existing dir out of OUTPUT_DIR and
    # the older output folder names; the for...else falls back to OUTPUT_DIR
    # when none of them exist yet.
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    elif stdin_raw_text:
        source = save_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)

    # Step 3: Verify folder structure is 1:1 with index
    # cleanup_archive(out_dir, links)

    # Step 4: Run the archive methods for each link
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)