|
|
@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
|
|
|
|
|
out_dir = Path(out_dir or link.link_dir)
|
|
|
output_folder = out_dir.absolute() / "readability"
|
|
|
- output = str(output_folder)
|
|
|
+ output = "readability"
|
|
|
|
|
|
# Readability Docs: https://github.com/mozilla/readability
|
|
|
|
|
|
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
|
|
temp_doc.write(document.encode("utf-8"))
|
|
|
temp_doc.close()
|
|
|
|
|
|
+ if not document or len(document) < 10:
|
|
|
+ raise ArchiveError('Readability could not find HTML to parse for article text')
|
|
|
+
|
|
|
cmd = [
|
|
|
DEPENDENCIES['READABILITY_BINARY']['path'],
|
|
|
- temp_doc.name
|
|
|
+ temp_doc.name,
|
|
|
]
|
|
|
|
|
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
|
|
- result_json = json.loads(result.stdout)
|
|
|
+ try:
|
|
|
+ result_json = json.loads(result.stdout)
|
|
|
+ except json.JSONDecodeError:
|
|
|
+ raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
|
|
|
+
|
|
|
output_folder.mkdir(exist_ok=True)
|
|
|
readability_content = result_json.pop("textContent")
|
|
|
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
|
|
|
@@ -122,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
|
|
cmd_version=READABILITY_VERSION,
|
|
|
output=output,
|
|
|
status=status,
|
|
|
- index_texts= [readability_content] if readability_content else [],
|
|
|
+ index_texts=[readability_content] if readability_content else [],
|
|
|
**timer.stats,
|
|
|
)
|