Nick Sweeting hai 5 anos
pai
achega
5c2bbe7efe

+ 1 - 1
archivebox/config/__init__.py

@@ -78,7 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
 
-        'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'}
+        'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
         'WGET_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
         'CHROME_USER_AGENT':        {'type': str,   'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
 

+ 6 - 5
archivebox/extractors/archive_org.py

@@ -6,18 +6,18 @@ from typing import Optional, List, Dict, Tuple
 from collections import defaultdict
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, DEVNULL, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
 )
 from ..config import (
-    VERSION,
     TIMEOUT,
+    CHECK_SSL_VALIDITY,
     SAVE_ARCHIVE_DOT_ORG,
     CURL_BINARY,
     CURL_VERSION,
-    CHECK_SSL_VALIDITY
+    CURL_USER_AGENT,
 )
 from ..cli.logging import TimedProgress
 
@@ -45,17 +45,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=T
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     cmd = [
         CURL_BINARY,
+        '--silent',
         '--location',
         '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
         '--max-time', str(timeout),
+        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         submit_url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout)
         content_location, errors = parse_archive_dot_org_response(result.stdout)
         if content_location:
             archive_org_url = 'https://web.archive.org{}'.format(content_location[0])

+ 2 - 2
archivebox/extractors/dom.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
@@ -47,7 +47,7 @@ def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
     timer = TimedProgress(timeout, prefix='      ')
     try:
         with open(output_path, 'w+') as f:
-            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
+            result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout)
 
         if result.returncode:
             hints = result.stderr.decode()

+ 3 - 3
archivebox/extractors/favicon.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
-from ..system import chmod_file, run, PIPE
+from ..system import chmod_file, run
 from ..util import enforce_types, domain
 from ..config import (
     TIMEOUT,
@@ -38,14 +38,14 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
         '--max-time', str(timeout),
         '--location',
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else [],
+        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
     ]
     status = 'pending'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        run(cmd, cwd=out_dir, timeout=timeout)
         chmod_file(output, cwd=out_dir)
         status = 'succeeded'
     except Exception as err:

+ 2 - 2
archivebox/extractors/git.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
@@ -64,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        result = run(cmd, cwd=output_path, timeout=timeout + 1)
 
         if result.returncode == 128:
             # ignore failed re-download when the folder already exists

+ 2 - 2
archivebox/extractors/media.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
@@ -66,7 +66,7 @@ def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEO
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        result = run(cmd, cwd=output_path, timeout=timeout + 1)
         chmod_file(output, cwd=out_dir)
         if result.returncode:
             if (b'ERROR: Unsupported URL' in result.stderr

+ 2 - 2
archivebox/extractors/pdf.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
@@ -45,7 +45,7 @@ def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout)
 
         if result.returncode:
             hints = (result.stderr or result.stdout).decode()

+ 2 - 2
archivebox/extractors/screenshot.py

@@ -5,7 +5,7 @@ import os
 from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE, chmod_file
+from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
@@ -45,7 +45,7 @@ def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout)
 
         if result.returncode:
             hints = (result.stderr or result.stdout).decode()

+ 7 - 0
archivebox/extractors/title.py

@@ -12,9 +12,11 @@ from ..util import (
 )
 from ..config import (
     TIMEOUT,
+    CHECK_SSL_VALIDITY,
     SAVE_TITLE,
     CURL_BINARY,
     CURL_VERSION,
+    CURL_USER_AGENT,
 )
 from ..cli.logging import TimedProgress
 
@@ -44,6 +46,11 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
     output: ArchiveOutput = None
     cmd = [
         CURL_BINARY,
+        '--silent',
+        '--max-time', str(timeout),
+        '--location',
+        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         link.url,
         '|',
         'grep',

+ 2 - 2
archivebox/extractors/wget.py

@@ -7,7 +7,7 @@ from typing import Optional
 from datetime import datetime
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
-from ..system import run, PIPE
+from ..system import run
 from ..util import (
     enforce_types,
     is_static_file,
@@ -81,7 +81,7 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout)
         output = wget_output_path(link)
 
         # parse out number of files downloaded from last line of stderr:

+ 17 - 42
archivebox/system.py

@@ -4,69 +4,44 @@ __package__ = 'archivebox'
 import os
 import shutil
 
-import json as pyjson
+from json import dump
+from pathlib import Path
 from typing import Optional, Union, Set, Tuple
+from subprocess import run as subprocess_run
 
 from crontab import CronTab
-from atomicwrites import atomic_write as awrite
-
-from subprocess import (
-    Popen,
-    PIPE,
-    DEVNULL, 
-    CompletedProcess,
-    TimeoutExpired,
-    CalledProcessError,
-)
+from atomicwrites import atomic_write as lib_atomic_write
 
 from .util import enforce_types, ExtendedEncoder
 from .config import OUTPUT_PERMISSIONS
 
 
 
-def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
+def run(*args, input=None, capture_output=True, text=True, timeout=None, check=False, **kwargs):
     """Patched of subprocess.run to fix blocking io making timeout=innefective"""
 
     if input is not None:
         if 'stdin' in kwargs:
             raise ValueError('stdin and input arguments may not both be used.')
-        kwargs['stdin'] = PIPE
 
     if capture_output:
         if ('stdout' in kwargs) or ('stderr' in kwargs):
             raise ValueError('stdout and stderr arguments may not be used '
                              'with capture_output.')
-        kwargs['stdout'] = PIPE
-        kwargs['stderr'] = PIPE
-
-    with Popen(*popenargs, **kwargs) as process:
-        try:
-            stdout, stderr = process.communicate(input, timeout=timeout)
-        except TimeoutExpired:
-            process.kill()
-            try:
-                stdout, stderr = process.communicate(input, timeout=2)
-            except:
-                pass
-            raise TimeoutExpired(popenargs[0][0], timeout)
-        except BaseException:
-            process.kill()
-            # We don't call process.wait() as .__exit__ does that for us.
-            raise 
-        retcode = process.poll()
-        if check and retcode:
-            raise CalledProcessError(retcode, process.args,
-                                     output=stdout, stderr=stderr)
-    return CompletedProcess(process.args, retcode, stdout, stderr)
-
-
-def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
+
+    return subprocess_run(*args, input=input, capture_output=capture_output, text=text, timeout=timeout, check=check, **kwargs)
+
+@enforce_types
+def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     
-    with awrite(path, overwrite=overwrite) as f:
+    mode = 'wb+' if isinstance(contents, bytes) else 'w'
+
+    # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
+    with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
         if isinstance(contents, dict):
-            pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
-        else:
+            dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
+        elif isinstance(contents, (bytes, str)):
             f.write(contents)
 
 @enforce_types
@@ -76,7 +51,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
     if not os.path.exists(os.path.join(cwd, path)):
         raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
 
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, timeout=timeout)
     if chmod_result.returncode == 1:
         print('     ', chmod_result.stderr.decode())
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))