před 5 roky · 62ed11a5ca
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
 
				         'SAVE_PDF':                 {'type': bool,  'default': True, 'aliases': ('FETCH_PDF',)},
			
 
				         'SAVE_SCREENSHOT':          {'type': bool,  'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
			
 
				         'SAVE_DOM':                 {'type': bool,  'default': True, 'aliases': ('FETCH_DOM',)},
			
 
				+        'SAVE_HEADERS':             {'type': bool,  'default': True, 'aliases': ('FETCH_HEADERS',)},
			
 
				         'SAVE_WARC':                {'type': bool,  'default': True, 'aliases': ('FETCH_WARC',)},
			
 
				         'SAVE_GIT':                 {'type': bool,  'default': True, 'aliases': ('FETCH_GIT',)},
			
 
				         'SAVE_MEDIA':               {'type': bool,  'default': True, 'aliases': ('FETCH_MEDIA',)},
			
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -16,6 +16,7 @@ from ..config import (
 
				     CURL_USER_AGENT,
			
 
				     CURL_VERSION,
			
 
				     CHECK_SSL_VALIDITY,
			
 
				+    SAVE_HEADERS
			
 
				 )
			
 
				 from ..logging_util import TimedProgress
			
 
				 
			
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
 
				     out_dir = out_dir or link.link_dir
			
 
				 
			
 
				     output = Path(out_dir or link.link_dir) / 'headers.json'
			
 
				-    return not output.exists()
			
 
				+    return not output.exists() and SAVE_HEADERS
			
 
				 
			
 
				 
			
 
				 @enforce_types
			
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
 
				             headers={'User-Agent': WGET_USER_AGENT},
			
 
				             verify=CHECK_SSL_VALIDITY,
			
 
				             timeout=timeout,
			
 
				+            allow_redirects=True
			
 
				         )
			
 
				+        if response.status_code >= 400:
			
 
				+            raise RequestException
			
 
				     except RequestException:
			
 
				         response = requests.get(
			
 
				             url,
			
 
				             headers={'User-Agent': WGET_USER_AGENT},
			
 
				             verify=CHECK_SSL_VALIDITY,
			
 
				             timeout=timeout,
			
 
				+            stream=True
			
 
				         )
			
 
				     
			
 
				     return pyjson.dumps(dict(response.headers), indent=4)
			
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -20,6 +20,7 @@ def disable_extractors_dict():
 
				         "SAVE_PDF": "false",
			
 
				         "SAVE_SCREENSHOT": "false",
			
 
				         "SAVE_DOM": "false",
			
 
				+        "SAVE_HEADERS": "false",
			
 
				         "USE_GIT": "false",
			
 
				         "SAVE_MEDIA": "false",
			
 
				         "SAVE_ARCHIVE_DOT_ORG": "false"
			
--- a/tests/mock_server/server.py
+++ b/tests/mock_server/server.py
@@ -2,7 +2,7 @@ from os.path import abspath
 
				 from os import getcwd
			
 
				 from pathlib import Path
			
 
				 
			
 
				-from bottle import route, run, static_file, response
			
 
				+from bottle import route, run, static_file, response, redirect
			
 
				 
			
 
				 @route("/")
			
 
				 def index():
			
@@ -30,5 +30,25 @@ def static_path_with_headers(filename):
 
				     response.add_header("Content-Style-Type", "text/css")
			
 
				     return response
			
 
				 
			
 
				+@route("/static/400/<filename>", method="HEAD")  # HEAD-only handler for this path
			
 
				+def static_400(filename):  # serves a template file but reports HTTP status 400
			
 
				+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")  # resolved relative to the current working directory
			
 
				+    response = static_file(filename, root=template_path)  # local name shadows the imported bottle `response` within this function
			
 
				+    response.status = 400  # override the status of the static-file response
			
 
				+    response.add_header("Status-Code", "400")  # marker header identifying this handler's response
			
 
				+    return response
			
 
				+
			
 
				+@route("/static/400/<filename>", method="GET")  # GET handler for the same path as the HEAD route
			
 
				+def static_200(filename):  # serves the template file without overriding the status
			
 
				+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")  # resolved relative to the current working directory
			
 
				+    response = static_file(filename, root=template_path)  # local name shadows the imported bottle `response` within this function
			
 
				+    response.add_header("Status-Code", "200")  # marker header identifying this handler's response
			
 
				+    return response
			
 
				+
			
 
				+@route("/redirect/headers/<filename>")
			
 
				+def redirect_to_static(filename):  # redirects the client to /static/headers/<filename>
			
 
				+    redirect(f"/static/headers/{filename}")  # braces interpolate the captured path segment; "$name" is not f-string syntax
			
 
				+
			
 
				+
			
 
				 def start():
			
 
				     run(host='localhost', port=8080)
			
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -71,7 +71,15 @@ def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, d
 
				     assert "> singlefile" not in output_str
			
 
				     assert "> readability" not in output_str
			
 
				 
			
 
				-def test_headers(tmp_path, process, disable_extractors_dict):
			
 
				+def test_headers_ignored(tmp_path, process, disable_extractors_dict):  # runs with the fixture env unchanged
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]  # first path matched under archive/
			
 
				+    output_file = archived_item_path / "headers.json"
			
 
				+    assert not output_file.exists()  # no headers.json should have been written
			
 
				+
			
 
				+def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
			
 
				+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
			
 
				     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
			
 
				                                   capture_output=True, env=disable_extractors_dict)
			
 
				     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
			
@@ -83,3 +91,25 @@ def test_headers(tmp_path, process, disable_extractors_dict):
 
				     assert headers['Content-Language'] == 'en'
			
 
				     assert headers['Content-Script-Type'] == 'text/javascript'
			
 
				     assert headers['Content-Style-Type'] == 'text/css'
			
 
				+
			
 
				+def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):  # adds a URL served by the /redirect/headers/ route
			
 
				+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})  # turn the headers extractor on for this run
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]  # first path matched under archive/
			
 
				+    output_file = archived_item_path / "headers.json" 
			
 
				+    with open(output_file) as f:
			
 
				+        headers = pyjson.load(f)
			
 
				+    assert headers['Content-Language'] == 'en'  # the saved headers are expected to carry these three values
			
 
				+    assert headers['Content-Script-Type'] == 'text/javascript'
			
 
				+    assert headers['Content-Style-Type'] == 'text/css'
			
 
				+
			
 
				+def test_headers_400_plus(tmp_path, process, disable_extractors_dict):  # adds a URL whose HEAD route answers with status 400
			
 
				+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})  # turn the headers extractor on for this run
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]  # first path matched under archive/
			
 
				+    output_file = archived_item_path / "headers.json" 
			
 
				+    with open(output_file) as f:
			
 
				+        headers = pyjson.load(f)
			
 
				+    assert headers["Status-Code"] == "200"  # "200" is set only by the mock server's GET handler, so the saved headers must come from a GET response