Procházet zdrojové kódy

fix: Improve headers handling

Cristian před 5 roky
rodič
revize
62ed11a5ca

+ 1 - 0
archivebox/config/__init__.py

@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_PDF':                 {'type': bool,  'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT':          {'type': bool,  'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM':                 {'type': bool,  'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS':             {'type': bool,  'default': True, 'aliases': ('FETCH_HEADERS',)},
         'SAVE_WARC':                {'type': bool,  'default': True, 'aliases': ('FETCH_WARC',)},
         'SAVE_GIT':                 {'type': bool,  'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA':               {'type': bool,  'default': True, 'aliases': ('FETCH_MEDIA',)},

+ 2 - 1
archivebox/extractors/headers.py

@@ -16,6 +16,7 @@ from ..config import (
     CURL_USER_AGENT,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
+    SAVE_HEADERS
 )
 from ..logging_util import TimedProgress
 
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
 
     output = Path(out_dir or link.link_dir) / 'headers.json'
-    return not output.exists()
+    return not output.exists() and SAVE_HEADERS
 
 
 @enforce_types

+ 4 - 0
archivebox/util.py

@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            allow_redirects=True
         )
+        if response.status_code >= 400:
+            raise RequestException
     except RequestException:
         response = requests.get(
             url,
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            stream=True
         )
     
     return pyjson.dumps(dict(response.headers), indent=4)

+ 1 - 0
tests/fixtures.py

@@ -20,6 +20,7 @@ def disable_extractors_dict():
         "SAVE_PDF": "false",
         "SAVE_SCREENSHOT": "false",
         "SAVE_DOM": "false",
+        "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
         "SAVE_ARCHIVE_DOT_ORG": "false"

+ 21 - 1
tests/mock_server/server.py

@@ -2,7 +2,7 @@ from os.path import abspath
 from os import getcwd
 from pathlib import Path
 
-from bottle import route, run, static_file, response
+from bottle import route, run, static_file, response, redirect
 
 @route("/")
 def index():
@@ -30,5 +30,25 @@ def static_path_with_headers(filename):
     response.add_header("Content-Style-Type", "text/css")
     return response
 
+@route("/static/400/<filename>", method="HEAD")
+def static_400(filename):
+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
+    response = static_file(filename, root=template_path)
+    response.status = 400
+    response.add_header("Status-Code", "400")
+    return response
+
+@route("/static/400/<filename>", method="GET")
+def static_200(filename):
+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
+    response = static_file(filename, root=template_path)
+    response.add_header("Status-Code", "200")
+    return response
+
+@route("/redirect/headers/<filename>")
+def redirect_to_static(filename):
+    redirect(f"/static/headers/$filename")
+
+
 def start():
     """Run the bottle mock server on localhost, port 8080."""
     run(port=8080, host='localhost')

+ 31 - 1
tests/test_extractors.py

@@ -71,7 +71,15 @@ def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, d
     assert "> singlefile" not in output_str
     assert "> readability" not in output_str
 
-def test_headers(tmp_path, process, disable_extractors_dict):
+def test_headers_ignored(tmp_path, process, disable_extractors_dict):
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json"
+    assert not output_file.exists()
+
+def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -83,3 +91,25 @@ def test_headers(tmp_path, process, disable_extractors_dict):
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
     assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json" 
+    with open(output_file) as f:
+        headers = pyjson.load(f)
+    assert headers['Content-Language'] == 'en'
+    assert headers['Content-Script-Type'] == 'text/javascript'
+    assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json" 
+    with open(output_file) as f:
+        headers = pyjson.load(f)
+    assert headers["Status-Code"] == "200"