5 ani în urmă · 2a68af1b94
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -37,7 +37,7 @@ def get_html(link: Link, path: Path) -> str:
 
				             with open(abs_path / source, "r") as f:
			
 
				                 document = f.read()
			
 
				                 break
			
 
				-        except FileNotFoundError:
			
 
				+        except (FileNotFoundError, TypeError):
			
 
				             continue
			
 
				     if document is None:
			
 
				         return download_url(link.url)
			
@@ -51,6 +51,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
 
				         return False
			
 
				 
			
 
				     output = Path(out_dir or link.link_dir) / 'readability.json'
			
 
				+    print(output, SAVE_READABILITY)
			
 
				     return SAVE_READABILITY and (not output.exists())
			
 
				 
			
 
				 
			
@@ -63,8 +64,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
				     output = str(output_folder)
			
 
				 
			
 
				     document = get_html(link, out_dir)
			
 
				-    temp_doc = NamedTemporaryFile()
			
 
				+    temp_doc = NamedTemporaryFile(delete=False)
			
 
				     temp_doc.write(document.encode("utf-8"))
			
 
				+    temp_doc.close()
			
 
				     # Readability Docs: https://github.com/mozilla/readability
			
 
				     cmd = [
			
 
				         READABILITY_BINARY,
			
@@ -101,7 +103,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
				         output = err
			
 
				     finally:
			
 
				         timer.end()
			
 
				-        temp_doc.close()
			
 
				 
			
 
				     return ArchiveResult(
			
 
				         cmd=cmd,
			
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -21,3 +21,35 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
 
				     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
			
 
				     output_file = archived_item_path / "singlefile.html" 
			
 
				     assert output_file.exists()
			
 
				+
			
 
				+def test_readability_works(tmp_path, process, disable_extractors_dict):
			
 
				+    disable_extractors_dict.update({"USE_READABILITY": "true"})
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
			
 
				+    output_file = archived_item_path / "readability" / "content.html"
			
 
				+    assert output_file.exists()
			
 
				+
			
 
				+def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
			
 
				+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
			
 
				+    output_file = archived_item_path / "readability" / "content.html"
			
 
				+    assert output_file.exists()
			
 
				+
			
 
				+def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
			
 
				+    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"})
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
			
 
				+    output_file = archived_item_path / "readability" / "content.html"
			
 
				+    assert output_file.exists()
			
 
				+
			
 
				+def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
			
 
				+    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"})
			
 
				+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
			
 
				+                                  capture_output=True, env=disable_extractors_dict)
			
 
				+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
			
 
				+    output_file = archived_item_path / "readability" / "content.html"
			
 
				+    assert output_file.exists()