소스 검색

Merge branch 'dev' into DanielBatteryStapler-patch-1

Nick Sweeting 2년 전
부모
커밋
110a22ee32
9개의 변경된 파일, 739개의 추가 그리고 268개의 삭제
  1. 2 5
      archivebox/config.py
  2. 2 0
      archivebox/extractors/dom.py
  3. 2 0
      archivebox/extractors/pdf.py
  4. 2 0
      archivebox/extractors/screenshot.py
  5. 1 1
      archivebox/logging_util.py
  6. 12 0
      archivebox/util.py
  7. 1 1
      bin/docker_entrypoint.sh
  8. 715 260
      package-lock.json
  9. 2 1
      package.json

+ 2 - 5
archivebox/config.py

@@ -159,10 +159,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--write-thumbnail',
                                                                 '--no-call-home',
                                                                 '--write-sub',
-                                                                '--all-subs',
-                                                                # There are too many of these and youtube
-                                                                # throttles you with HTTP error 429
-                                                                #'--write-auto-subs',
+                                                                '--write-auto-subs',
                                                                 '--convert-subs=srt',
                                                                 '--yes-playlist',
                                                                 '--continue',
@@ -175,7 +172,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--ignore-errors',
                                                                 '--geo-bypass',
                                                                 '--add-metadata',
-                                                                '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
+                                                                '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
                                                                 ]},
 
 

+ 2 - 0
archivebox/extractors/dom.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     is_static_file,
     chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
     TIMEOUT,
@@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     except Exception as err:
         status = 'failed'
         output = err
+        chrome_cleanup()
     finally:
         timer.end()
 

+ 2 - 0
archivebox/extractors/pdf.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     is_static_file,
     chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
     TIMEOUT,
@@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     except Exception as err:
         status = 'failed'
         output = err
+        chrome_cleanup()
     finally:
         timer.end()
 

+ 2 - 0
archivebox/extractors/screenshot.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     is_static_file,
     chrome_args,
+    chrome_cleanup,
 )
 from ..config import (
     TIMEOUT,
@@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     except Exception as err:
         status = 'failed'
         output = err
+        chrome_cleanup()
     finally:
         timer.end()
 

+ 1 - 1
archivebox/logging_util.py

@@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
 
             hints = (
                 '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-                for line in hints[:5] if line.strip()
+                for line in list(hints)[:5] if line.strip()
             )
 
 

+ 12 - 0
archivebox/util.py

@@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout
 
 from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
+from os.path import lexists
+from os import remove as remove_file
 
 try:
     import chardet
@@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
     
     return cmd_args
 
+def chrome_cleanup():
+    """
+    Cleans up any state or runtime files that chrome leaves behind when killed by
+    a timeout or other error
+    """
+
+    from .config import IN_DOCKER
+    
+    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
+        remove_file("/home/archivebox/.config/chromium/SingletonLock")
 
 def ansi_to_html(text):
     """

+ 1 - 1
bin/docker_entrypoint.sh

@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 
 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"

파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 715 - 260
package-lock.json


+ 2 - 1
package.json

@@ -7,7 +7,8 @@
   "license": "MIT",
   "dependencies": {
     "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
+    "playwright": "^1.37.1",
     "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
-    "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
+    "single-file-cli": "^1.0.63"
   }
 }

이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.