Browse Source

prefer dom dump to singlefile for generating readability output

Nick Sweeting 1 year ago
parent
commit
db2984e47b
1 changed file with 3 additions and 1 deletion
  1. 3 1
      archivebox/extractors/title.py

+ 3 - 1
archivebox/extractors/title.py

@@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """
     canonical = link.canonical_outputs()
     abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+
+    # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
+    sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
     document = None
     for source in sources:
         try: