2
0
Эх сурвалжийг харах

Merge branch 'dev' into DanielBatteryStapler-patch-1

Nick Sweeting 2 жил өмнө
parent
commit
110a22ee32

+ 2 - 5
archivebox/config.py

@@ -159,10 +159,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--write-thumbnail',
                                                                 '--write-thumbnail',
                                                                 '--no-call-home',
                                                                 '--no-call-home',
                                                                 '--write-sub',
                                                                 '--write-sub',
-                                                                '--all-subs',
-                                                                # There are too many of these and youtube
-                                                                # throttles you with HTTP error 429
-                                                                #'--write-auto-subs',
+                                                                '--write-auto-subs',
                                                                 '--convert-subs=srt',
                                                                 '--convert-subs=srt',
                                                                 '--yes-playlist',
                                                                 '--yes-playlist',
                                                                 '--continue',
                                                                 '--continue',
@@ -175,7 +172,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--ignore-errors',
                                                                 '--ignore-errors',
                                                                 '--geo-bypass',
                                                                 '--geo-bypass',
                                                                 '--add-metadata',
                                                                 '--add-metadata',
-                                                                '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
+                                                                '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
                                                                 ]},
                                                                 ]},
 
 
 
 

+ 2 - 0
archivebox/extractors/dom.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     enforce_types,
     is_static_file,
     is_static_file,
     chrome_args,
     chrome_args,
+    chrome_cleanup,
 )
 )
 from ..config import (
 from ..config import (
     TIMEOUT,
     TIMEOUT,
@@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     except Exception as err:
     except Exception as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err
+        chrome_cleanup()
     finally:
     finally:
         timer.end()
         timer.end()
 
 

+ 2 - 0
archivebox/extractors/pdf.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     enforce_types,
     is_static_file,
     is_static_file,
     chrome_args,
     chrome_args,
+    chrome_cleanup,
 )
 )
 from ..config import (
 from ..config import (
     TIMEOUT,
     TIMEOUT,
@@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     except Exception as err:
     except Exception as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err
+        chrome_cleanup()
     finally:
     finally:
         timer.end()
         timer.end()
 
 

+ 2 - 0
archivebox/extractors/screenshot.py

@@ -9,6 +9,7 @@ from ..util import (
     enforce_types,
     enforce_types,
     is_static_file,
     is_static_file,
     chrome_args,
     chrome_args,
+    chrome_cleanup,
 )
 )
 from ..config import (
 from ..config import (
     TIMEOUT,
     TIMEOUT,
@@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     except Exception as err:
     except Exception as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err
+        chrome_cleanup()
     finally:
     finally:
         timer.end()
         timer.end()
 
 

+ 1 - 1
archivebox/logging_util.py

@@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
 
 
             hints = (
             hints = (
                 '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
                 '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-                for line in hints[:5] if line.strip()
+                for line in list(hints)[:5] if line.strip()
             )
             )
 
 
 
 

+ 12 - 0
archivebox/util.py

@@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout
 
 
 from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
 from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
+from os.path import lexists
+from os import remove as remove_file
 
 
 try:
 try:
     import chardet
     import chardet
@@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
     
     
     return cmd_args
     return cmd_args
 
 
+def chrome_cleanup():
+    """
+    Cleans up any state or runtime files that chrome leaves behind when killed by
+    a timeout or other error
+    """
+
+    from .config import IN_DOCKER
+    
+    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
+        remove_file("/home/archivebox/.config/chromium/SingletonLock")
 
 
 def ansi_to_html(text):
 def ansi_to_html(text):
     """
     """

+ 1 - 1
bin/docker_entrypoint.sh

@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 
 
 DATA_DIR="${DATA_DIR:-/data}"
 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 715 - 260
package-lock.json


+ 2 - 1
package.json

@@ -7,7 +7,8 @@
   "license": "MIT",
   "license": "MIT",
   "dependencies": {
   "dependencies": {
     "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
     "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
+    "playwright": "^1.37.1",
     "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
     "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
-    "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
+    "single-file-cli": "^1.0.63"
   }
   }
 }
 }

Энэ ялгаанд хэт олон файл өөрчлөгдсөн тул зарим файлыг харуулаагүй болно