Browse Source

Improve error handling in the readability and mercury extractors, and make their output paths relative

Nick Sweeting 4 years ago
parent
commit
acb932ba12
2 changed files with 18 additions and 5 deletions
  1. +7 -1
      archivebox/extractors/mercury.py
  2. +11 -4
      archivebox/extractors/readability.py

+ 7 - 1
archivebox/extractors/mercury.py

@@ -54,7 +54,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "mercury"
     output_folder = out_dir.absolute() / "mercury"
-    output = str(output_folder)
+    output = "mercury"
 
 
     status = 'succeeded'
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     timer = TimedProgress(timeout, prefix='      ')
@@ -73,6 +73,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
             raise ShellError(cmd, result)
         
         
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article text from the URL')
+
         atomic_write(str(output_folder / "content.txt"), article_text["content"])
         atomic_write(str(output_folder / "content.txt"), article_text["content"])
 
 
         # Get HTML version of article
         # Get HTML version of article
@@ -86,6 +89,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
             raise ShellError(cmd, result)
 
 
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
         atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
         atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
         atomic_write(str(output_folder / "article.json"), article_json)
         atomic_write(str(output_folder / "article.json"), article_json)
 
 

+ 11 - 4
archivebox/extractors/readability.py

@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "readability"
     output_folder = out_dir.absolute() / "readability"
-    output = str(output_folder)
+    output = "readability"
 
 
     # Readability Docs: https://github.com/mozilla/readability
     # Readability Docs: https://github.com/mozilla/readability
 
 
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         temp_doc.write(document.encode("utf-8"))
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
         temp_doc.close()
 
 
+        if not document or len(document) < 10:
+            raise ArchiveError('Readability could not find HTML to parse for article text')
+
         cmd = [
         cmd = [
             DEPENDENCIES['READABILITY_BINARY']['path'],
             DEPENDENCIES['READABILITY_BINARY']['path'],
-            temp_doc.name
+            temp_doc.name,
         ]
         ]
 
 
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            result_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
+
         output_folder.mkdir(exist_ok=True)
         output_folder.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent") 
         readability_content = result_json.pop("textContent") 
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -122,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         cmd_version=READABILITY_VERSION,
         output=output,
         output=output,
         status=status,
         status=status,
-        index_texts= [readability_content] if readability_content else [],
+        index_texts=[readability_content] if readability_content else [],
         **timer.stats,  
         **timer.stats,  
     )
     )