Explorar o código

add new config and dependency options

Nick Sweeting hai 6 anos
pai
achega
d689264365
Modificáronse 3 ficheiros con 78 adicións e 36 borrados
  1. 23 7
      archivebox/archive_methods.py
  2. 11 8
      archivebox/config.py
  3. 44 21
      archivebox/util.py

+ 23 - 7
archivebox/archive_methods.py

@@ -11,6 +11,10 @@ from peekable import Peekable
 from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
+    CURL_BINARY,
+    GIT_BINARY,
+    WGET_BINARY,
+    YOUTUBEDL_BINARY,
     CHROME_BINARY,
     FETCH_FAVICON,
     FETCH_TITLE,
@@ -37,6 +41,7 @@ from config import (
     GIT_SHA,
 )
 from util import (
+    without_hash,
     check_dependencies,
     fetch_page_title,
     progress,
@@ -214,7 +219,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
 
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     CMD = [
-        'wget',
+        WGET_BINARY,
         # '--server-response',  # print headers for better error parsing
         '--no-verbose',
         '--adjust-extension',
@@ -417,7 +422,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
 
     success = False
     CMD = [
-        'curl',
+        CURL_BINARY,
         '--location',
         '--head',
         '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
@@ -481,8 +486,9 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         return {'output': 'favicon.ico', 'status': 'skipped'}
 
     CMD = [
-        'curl',
+        CURL_BINARY,
         '--max-time', str(timeout),
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
         'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
     ]
     fout = open('{}/favicon.ico'.format(link_dir), 'w')
@@ -542,7 +548,7 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
 
     os.makedirs(output, exist_ok=True)
     CMD = [
-        'youtube-dl',
+        YOUTUBEDL_BINARY,
         '--write-description',
         '--write-info-json',
         '--write-annotations',
@@ -552,12 +558,15 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
         '--no-check-certificate',
         '--user-agent',
         '--all-subs',
-        '-x',
-        '-k',
+        '--extract-audio',
+        '--keep-video',
+        '--ignore-errors',
+        '--geo-bypass',
         '--audio-format', 'mp3',
         '--audio-quality', '320K',
         '--embed-thumbnail',
         '--add-metadata',
+        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
         link['url'],
     ]
 
@@ -605,7 +614,14 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
     if os.path.exists(os.path.join(link_dir, 'git')):
         return {'output': 'git', 'status': 'skipped'}
 
-    CMD = ['git', 'clone', '--mirror', '--recursive', link['url'].split('#')[0], 'git']
+    CMD = [
+        GIT_BINARY,
+        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
+        'clone',
+        '--mirror',
+        '--recursive',
+        without_hash(link['url']),
+    ]
     output = 'git'
 
     end = progress(timeout, prefix='      ')

+ 11 - 8
archivebox/config.py

@@ -11,12 +11,13 @@ from subprocess import run, PIPE
 # ******************************************************************************
 
 IS_TTY = sys.stdout.isatty()
-ONLY_NEW =               os.getenv('ONLY_NEW',               'False'            ).lower() == 'true'
 USE_COLOR =              os.getenv('USE_COLOR',              str(IS_TTY)        ).lower() == 'true'
 SHOW_PROGRESS =          os.getenv('SHOW_PROGRESS',          str(IS_TTY)        ).lower() == 'true'
-OUTPUT_PERMISSIONS =     os.getenv('OUTPUT_PERMISSIONS',     '755'              )
+ONLY_NEW =               os.getenv('ONLY_NEW',               'False'            ).lower() == 'true'
 MEDIA_TIMEOUT =          int(os.getenv('MEDIA_TIMEOUT',      '3600'))
 TIMEOUT =                int(os.getenv('TIMEOUT',            '60'))
+OUTPUT_PERMISSIONS =     os.getenv('OUTPUT_PERMISSIONS',     '755'              )
+FOOTER_INFO =            os.getenv('FOOTER_INFO',            'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.',)
 
 FETCH_WGET =             os.getenv('FETCH_WGET',             'True'             ).lower() == 'true'
 FETCH_WGET_REQUISITES =  os.getenv('FETCH_WGET_REQUISITES',  'True'             ).lower() == 'true'
@@ -33,13 +34,15 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
 RESOLUTION =             os.getenv('RESOLUTION',             '1440,2000'        )
 GIT_DOMAINS =            os.getenv('GIT_DOMAINS',            'github.com,bitbucket.org,gitlab.com').split(',')
-COOKIES_FILE =           os.getenv('COOKIES_FILE',           None)
 WGET_USER_AGENT =        os.getenv('WGET_USER_AGENT',        'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
+COOKIES_FILE =           os.getenv('COOKIES_FILE',           None)
 CHROME_USER_DATA_DIR =   os.getenv('CHROME_USER_DATA_DIR',   None)
 
-CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)  # change to google-chrome browser if using google-chrome
-WGET_BINARY =            os.getenv('WGET_BINARY',            'wget'             )
-FOOTER_INFO =            os.getenv('FOOTER_INFO',            'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.',)
+CURL_BINARY =            os.getenv('CURL_BINARY',            'curl')
+GIT_BINARY =             os.getenv('GIT_BINARY',             'git')
+WGET_BINARY =            os.getenv('WGET_BINARY',            'wget')
+YOUTUBEDL_BINARY =       os.getenv('YOUTUBEDL_BINARY',       'youtube-dl')
+CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)
 
 ### Paths
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
@@ -101,7 +104,7 @@ if not USE_COLOR:
 ### Confirm Environment Setup
 GIT_SHA = 'unknown'
 try:
-    GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
+    GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
 except Exception:
     print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
 
@@ -115,7 +118,7 @@ except Exception:
 
 WGET_VERSION = 'unknown'
 try:
-    wget_vers_str = run(["wget", "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
+    wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
     WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
 except Exception:
     if USE_WGET:

+ 44 - 21
archivebox/util.py

@@ -14,23 +14,30 @@ from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, C
 from multiprocessing import Process
 
 from config import (
+    ANSI,
     IS_TTY,
-    OUTPUT_PERMISSIONS,
+    TERM_WIDTH,
     REPO_DIR,
-    SOURCES_DIR,
     OUTPUT_DIR,
+    SOURCES_DIR,
     ARCHIVE_DIR,
+    OUTPUT_PERMISSIONS,
     TIMEOUT,
-    TERM_WIDTH,
     SHOW_PROGRESS,
-    ANSI,
+    CHECK_SSL_VALIDITY,
+    CURL_BINARY,
+    WGET_BINARY,
     CHROME_BINARY,
+    GIT_BINARY,
+    YOUTUBEDL_BINARY,
+    FETCH_TITLE,
+    FETCH_FAVICON,
     FETCH_WGET,
+    FETCH_WARC,
     FETCH_PDF,
     FETCH_SCREENSHOT,
     FETCH_DOM,
-    FETCH_FAVICON,
-    FETCH_TITLE,
+    FETCH_GIT,
     FETCH_MEDIA,
     SUBMIT_ARCHIVE_DOT_ORG,
 )
@@ -64,6 +71,20 @@ def check_dependencies():
         print('    See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
         raise SystemExit(1)
 
+    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox for help.')
+            raise SystemExit(1)
+
+    if FETCH_WGET or FETCH_WARC:
+        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
+            print('    See https://github.com/pirate/ArchiveBox for help.')
+            raise SystemExit(1)
+
     if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
         if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
             print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
@@ -88,24 +109,17 @@ def check_dependencies():
             print('    See https://github.com/pirate/ArchiveBox for help.')
             raise SystemExit(1)
 
-    if FETCH_WGET:
-        if run(['which', 'wget'], stdout=DEVNULL).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', 'curl'], stdout=DEVNULL).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
+    if FETCH_GIT:
+        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
             print('    See https://github.com/pirate/ArchiveBox for help.')
             raise SystemExit(1)
 
     if FETCH_MEDIA:
-        if run(['which', 'youtube-dl'], stdout=DEVNULL).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
+        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
             print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
             print('    See https://github.com/pirate/ArchiveBox for help.')
             raise SystemExit(1)
 
@@ -246,8 +260,17 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()
 
-        html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
-        match = re.search(HTML_TITLE_REGEX, html_content)
+        if CHECK_SSL_VALIDITY:
+            html_content = urlopen(url, timeout=timeout)
+        else:
+            try:
+                import ssl
+                insecure = ssl._create_unverified_context()
+                html_content = urlopen(url, timeout=timeout, context=insecure)
+            except ImportError:
+                html_content = urlopen(url, timeout=timeout)
+
+        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
         return match.group(1).strip() if match else None
     except Exception:
         return None