#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

import os
import sys

from datetime import datetime

from peekable import Peekable

from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
    write_links_index,
    parse_json_links_index,
)
from config import (
    ARCHIVE_DIR,
    ONLY_NEW,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    GIT_SHA,
)
from util import (
    check_dependencies,
    download_url,
    save_source,
    pretty_path,
    migrate_data,
    check_links_structure,
)

__AUTHOR__ = 'Nick Sweeting <[email protected]>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print("Usage:")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print("    echo 'https://example.com' | ./bin/archivebox\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    return all_links, new_links
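
# A rough usage sketch (not part of the original source): merging the existing
# archive index with a freshly exported bookmarks file. OUTPUT_DIR comes from
# config.py; the export path mirrors the one shown in print_help().
#
#   all_links, new_links = load_links(
#       archive_path=OUTPUT_DIR,                          # read existing index.json, if any
#       import_path='~/Downloads/bookmarks_export.html',  # parse + validate the new export
#   )
#   # all_links: validated union of existing + imported links
#   # new_links: only the validated links parsed from the import file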


def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)
    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))  # note: counter key is spelled 'succeded' in archive_methods
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
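
# A rough sketch (not in the original source) of resuming a paused run using
# the timestamp printed by the pause handler above. The '1544243416' value is
# a hypothetical link timestamp; links_after_timestamp() (from links.py) is
# expected to skip every link that comes before it.
#
#   all_links, _ = load_links(archive_path=OUTPUT_DIR)
#   update_archive(OUTPUT_DIR, all_links, source=None, resume='1544243416', append=True)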


if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
        print_help()
        raise SystemExit(0)

    migrate_data()

    source = sys.argv[1] if argc > 1 else None    # path of links file to import
    resume = sys.argv[2] if argc > 2 else None    # timestamp to resume downloading from

    stdin_raw_text = ''
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)

    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)
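
    # How argv maps to (source, resume) above (timestamps are hypothetical examples):
    #   ./bin/archivebox                                   -> source=None, resume=None
    #   ./bin/archivebox bookmarks_export.html             -> source='bookmarks_export.html', resume=None
    #   ./bin/archivebox 1544243416.0                      -> source=None, resume='1544243416.0'
    #   ./bin/archivebox bookmarks_export.html 1544243416  -> source='bookmarks_export.html', resume='1544243416'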

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        # for/else: runs only if no existing folder was found above
        out_dir = OUTPUT_DIR

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    elif stdin_raw_text:
        source = save_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)

    # Step 3: Verify folder structure is 1:1 with index
    # cleanup_archive(out_dir, links)

    # Step 4: Run the archive methods for each link
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)

    # Step 5: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links(archive_path=out_dir)
    write_links_index(out_dir=out_dir, links=all_links)