#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

import os
import sys

from datetime import datetime

from peekable import Peekable

from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
    write_links_index,
    parse_json_links_index,
)
from config import (
    ARCHIVE_DIR,
    ONLY_NEW,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    GIT_SHA,
)
from util import (
    check_dependencies,
    download_url,
    save_source,
    pretty_path,
    migrate_data,
    check_links_structure,
)

__AUTHOR__ = 'Nick Sweeting <[email protected]>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print("Usage:")
    print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print(" ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print(" echo 'https://example.com' | ./bin/archivebox\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(' > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    return all_links, new_links
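
# Illustrative sketch only: the link objects passed around above are plain dicts.
# This file only accesses 'timestamp' directly (it doubles as the per-link archive
# folder name below); the other fields are assumptions about what the parsers and
# index store.
#
#   example_link = {
#       'url': 'https://example.com',
#       'timestamp': '1523422341.0',
#       'title': 'Example Domain',
#   }
#
#   all_links, new_links = load_links(archive_path=OUTPUT_DIR, import_path='bookmarks_export.html')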

def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)
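    # Note (assumption about the local peekable helper): Peekable is taken to wrap a
    # generator so that .peek(default) returns the upcoming item without consuming it,
    # falling back to the default (0 here) if the generator is already exhausted.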
    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx+1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print(' Continue where you left off by running:')
        print(' {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
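
# Hypothetical usage sketch (values are examples only): resuming an interrupted run
# from a given bookmark timestamp, as suggested by the "Continue where you left off"
# message above.
#
#   links, _ = load_links(archive_path=OUTPUT_DIR)
#   update_archive(OUTPUT_DIR, links, source=None, resume='1523422341', append=True)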

if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
        print_help()
        raise SystemExit(0)

    migrate_data()

    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
    stdin_raw_text = ''

    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)

    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)
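
    # How the branches above map argv onto (source, resume); paths and timestamps
    # shown are illustrative only:
    #   ./bin/archivebox                                -> source=None, resume=None
    #   ./bin/archivebox bookmarks_export.html          -> source='bookmarks_export.html', resume=None
    #   ./bin/archivebox 1523422341.5                   -> source=None, resume='1523422341.5'
    #   ./bin/archivebox bookmarks_export.html 1523422341.5 -> source='bookmarks_export.html', resume='1523422341.5'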

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    elif stdin_raw_text:
        source = save_source(stdin_raw_text)
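    # Assumption about the helpers above: both download_url and save_source are taken
    # to write their input out to a local file and return its path, so from this point
    # on `source` is either None or a path readable by the link parsers.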

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)

    # Step 3: Verify folder structure is 1:1 with index
    # cleanup_archive(out_dir, links)

    # Step 4: Run the archive methods for each link
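    # ONLY_NEW (from config) appears to decide whether only the links just imported
    # are fetched, or whether every link already in the index gets re-archived too.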
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)

    # Step 5: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links(archive_path=out_dir)
    write_links_index(out_dir=out_dir, links=all_links)