Sfoglia il codice sorgente

move dependency checking into config file

Nick Sweeting 6 anni fa
parent
commit
4c499d77b6
4 file modificati con 146 aggiunte e 138 eliminazioni
  1. +12 -8
      archivebox/archive.py
  2. +3 -3
      archivebox/archive_methods.py
  3. +111 -40
      archivebox/config.py
  4. +20 -87
      archivebox/util.py

+ 12 - 8
archivebox/archive.py

@@ -22,7 +22,6 @@ from config import (
     GIT_SHA,
 )
 from util import (
-    check_dependencies,
     save_remote_source,
     save_stdin_source,
 )
@@ -33,7 +32,7 @@ from logs import (
 )
 
 __AUTHOR__ = 'Nick Sweeting <[email protected]>'
-__VERSION__ = GIT_SHA
+__VERSION__ = GIT_SHA[:9]
 __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
 
@@ -42,11 +41,13 @@ def print_help():
     print('ArchiveBox: The self-hosted internet archive.\n')
     print("Documentation:")
     print("    https://github.com/pirate/ArchiveBox/wiki\n")
-    print("Usage:")
-    print("    echo 'https://examplecom' | ./bin/archivebox\n")
-    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
-    print("    ./bin/archivebox https://example.com/feed.rss\n")
-    print("    ./bin/archivebox 15109948213.123\n")
+    print("UI Usage:")
+    print("    Open output/index.html to view your archive.\n")
+    print("CLI Usage:")
+    print("    echo 'https://example.com' | ./archive\n")
+    print("    ./archive ~/Downloads/bookmarks_export.html\n")
+    print("    ./archive https://example.com/feed.rss\n")
+    print("    ./archive 15109948213.123\n")
 
 
 def main(*args):
@@ -54,6 +55,10 @@ def main(*args):
         print_help()
         raise SystemExit(0)
 
+    if set(args).intersection(('--version', 'version')):
+        print('ArchiveBox version {}'.format(__VERSION__))
+        raise SystemExit(0)
+
     ### Handle CLI arguments
     #     ./archive bookmarks.html
     #     ./archive 1523422111.234
@@ -95,7 +100,6 @@ def main(*args):
 
 def update_archive_data(import_path=None, resume=None):
     """The main ArchiveBox entrancepoint. Everything starts here."""
-    check_dependencies()
 
     # Step 1: Load list of links from the existing index
     #         merge in and dedupe new links from import_path

+ 3 - 3
archivebox/archive_methods.py

@@ -297,7 +297,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT):
 
     output = 'output.pdf'
     cmd = [
-        *chrome_args(timeout=timeout),
+        *chrome_args(TIMEOUT=timeout),
         '--print-to-pdf',
         link['url'],
     ]
@@ -339,7 +339,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
 
     output = 'screenshot.png'
     cmd = [
-        *chrome_args(timeout=timeout),
+        *chrome_args(TIMEOUT=timeout),
         '--screenshot',
         link['url'],
     ]
@@ -382,7 +382,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT):
     output = 'output.html'
     output_path = os.path.join(link_dir, output)
     cmd = [
-        *chrome_args(timeout=timeout),
+        *chrome_args(TIMEOUT=timeout),
         '--dump-dom',
         link['url']
     ]

+ 111 - 40
archivebox/config.py

@@ -1,8 +1,9 @@
 import os
+import re
 import sys
 import shutil
 
-from subprocess import run, PIPE
+from subprocess import run, PIPE, DEVNULL
 
 # ******************************************************************************
 # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
@@ -68,7 +69,6 @@ SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
 PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
 TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
 
-
 CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
@@ -137,44 +137,115 @@ if not USE_COLOR:
     # dont show colors if USE_COLOR is False
     ANSI = {k: '' for k in ANSI.keys()}
 
-### Confirm Environment Setup
-GIT_SHA = 'unknown'
-try:
-    GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
-except Exception:
-    print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
-
-CHROME_VERSION = 'unknown'
-try:
-    chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
-    CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
-except Exception:
-    if USE_CHROME:
-        print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
 
-WGET_VERSION = 'unknown'
-try:
-    wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
-    WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
-except Exception:
-    if USE_WGET:
-        print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
-
-WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
+### Confirm Environment Setup
 
 try:
-    COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
-except Exception:
-    print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
-    raise
-
-if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
-    print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
-    print('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
-    print('')
-    print('    Confirm that it\'s fixed by opening a new shell and running:')
-    print('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8')
-    print('')
-    print('    Alternatively, run this script with:')
-    print('        env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
-
+    ### Check Python environment
+    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
+    if python_vers < 3.5:
+        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+        print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
+        raise SystemExit(1)
+
+    if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
+        print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
+        print('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
+        print('')
+        print('    Confirm that it\'s fixed by opening a new shell and running:')
+        print('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8')
+        print('')
+        print('    Alternatively, run this script with:')
+        print('        env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
+
+    ### Get code version by parsing git log
+    GIT_SHA = 'unknown'
+    try:
+        GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
+    except Exception:
+        print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
+
+    ### Get absolute path for cookies file
+    try:
+        COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
+    except Exception:
+        print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
+        raise
+
+    ### Make sure curl is installed
+    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+        if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+            print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+    ### Make sure wget is installed and calculate version
+    if FETCH_WGET or FETCH_WARC:
+        if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+            print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+        WGET_VERSION = 'unknown'
+        try:
+            wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
+            WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
+        except Exception:
+            if USE_WGET:
+                print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
+
+        WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
+
+    ### Make sure chrome is installed and calculate version
+    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
+        if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
+            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+            print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
+        try:
+            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+            version_str = result.stdout.decode('utf-8')
+            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
+            version = [l for l in version_lines if l.isdigit()][-1]
+            if int(version) < 59:
+                print(version_lines)
+                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+        except (IndexError, TypeError, OSError):
+            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+            print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+        CHROME_VERSION = 'unknown'
+        try:
+            chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
+            CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
+        except Exception:
+            if USE_CHROME:
+                print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
+
+    ### Make sure git is installed
+    if FETCH_GIT:
+        if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
+            print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+    ### Make sure youtube-dl is installed
+    if FETCH_MEDIA:
+        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+            print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+            raise SystemExit(1)
+
+except KeyboardInterrupt:
+    raise SystemExit(1)

+ 20 - 87
archivebox/util.py

@@ -25,11 +25,6 @@ from config import (
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
-    CURL_BINARY,
-    WGET_BINARY,
-    CHROME_BINARY,
-    GIT_BINARY,
-    YOUTUBEDL_BINARY,
     FETCH_TITLE,
     FETCH_FAVICON,
     FETCH_WGET,
@@ -124,70 +119,6 @@ def check_links_structure(links):
     if links:
         check_link_structure(links[0])
 
-def check_dependencies():
-    """Check that all necessary dependencies are installed, and have valid versions"""
-
-    try:
-        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-        if python_vers < 3.5:
-            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
-            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
-            raise SystemExit(1)
-
-        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
-                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-
-        if FETCH_WGET or FETCH_WARC:
-            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
-                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-
-        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
-            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
-                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
-                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-
-            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-            try:
-                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-                version_str = result.stdout.decode('utf-8')
-                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
-                version = [l for l in version_lines if l.isdigit()][-1]
-                if int(version) < 59:
-                    print(version_lines)
-                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
-                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                    raise SystemExit(1)
-            except (IndexError, TypeError, OSError):
-                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
-                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-
-        if FETCH_GIT:
-            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
-                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
-                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-
-        if FETCH_MEDIA:
-            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
-                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-                print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
-                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
-                raise SystemExit(1)
-    except (KeyboardInterrupt, Exception):
-        raise SystemExit(1)
-
 def check_url_parsing_invariants():
     """Check that plain text regex URL parsing works as expected"""
 
@@ -284,7 +215,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
 
         match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception as err:
+    except Exception as err:  # noqa
         # print('[!] Failed to fetch title because of {}: {}'.format(
         #     err.__class__.__name__,
         #     err,
@@ -560,10 +491,13 @@ def progress_bar(seconds, prefix):
         pass
 
 class TimedProgress:
+    """Show a progress bar and measure elapsed time until .end() is called"""
+
     def __init__(self, seconds, prefix=''):
         if SHOW_PROGRESS:
             self.p = Process(target=progress_bar, args=(seconds, prefix))
             self.p.start()
+
         self.stats = {
             'start_ts': datetime.now(),
             'end_ts': None,
@@ -571,7 +505,7 @@ class TimedProgress:
         }
 
     def end(self):
-        """immediately finish progress and clear the progressbar line"""
+        """immediately end progress, clear the progressbar line, and save end_ts"""
 
         end_ts = datetime.now()
         self.stats.update({
@@ -591,6 +525,8 @@ class TimedProgress:
             sys.stdout.flush()
 
 def download_url(url, timeout=TIMEOUT):
+    """Download the contents of a remote url and return the text"""
+
     req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
 
     if CHECK_SSL_VALIDITY:
@@ -615,34 +551,31 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))
 
 
-def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
-               headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
-               check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
-               resolution=RESOLUTION, timeout=TIMEOUT):
+def chrome_args(**options):
     """helper to build up a chrome shell command with arguments"""
 
-    cmd_args = [binary]
+    cmd_args = [options['CHROME_BINARY']]
 
-    if headless:
+    if options['HEADLESS']:
         cmd_args += ('--headless',)
     
-    if not sandbox:
+    if not options['CHROME_SANDBOX']:
         # dont use GPU or sandbox when running inside docker container
         cmd_args += ('--no-sandbox', '--disable-gpu')
 
-    if not check_ssl_validity:
+    if not options['CHECK_SSL_VALIDITY']:
         cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
 
-    if user_agent:
-        cmd_args += ('--user-agent={}'.format(user_agent),)
+    if options['CHROME_USER_AGENT']:
+        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
 
-    if resolution:
-        cmd_args += ('--window-size={}'.format(RESOLUTION),)
+    if options['RESOLUTION']:
+        cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
 
-    if timeout:
-        cmd_args += ('--timeout={}'.format((timeout) * 1000),)
+    if options['TIMEOUT']:
+        cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)
 
-    if user_data_dir:
-        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
+    if options['CHROME_USER_DATA_DIR']:
+        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
     
     return cmd_args