# readability.py
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from tempfile import NamedTemporaryFile
  4. from typing import Optional
  5. import json
  6. from django.db.models import Model
  7. from ..index.schema import ArchiveResult, ArchiveError
  8. from ..system import run, atomic_write
  9. from ..util import (
  10. enforce_types,
  11. download_url,
  12. is_static_file,
  13. )
  14. from ..config import (
  15. TIMEOUT,
  16. CURL_BINARY,
  17. SAVE_READABILITY,
  18. DEPENDENCIES,
  19. READABILITY_VERSION,
  20. )
  21. from ..logging_util import TimedProgress
  22. @enforce_types
  23. def get_html(snapshot: Model, path: Path) -> str:
  24. """
  25. Try to find wget, singlefile and then dom files.
  26. If none is found, download the url again.
  27. """
  28. canonical = snapshot.canonical_outputs()
  29. abs_path = path.absolute()
  30. sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
  31. document = None
  32. for source in sources:
  33. try:
  34. with open(abs_path / source, "r") as f:
  35. document = f.read()
  36. break
  37. except (FileNotFoundError, TypeError):
  38. continue
  39. if document is None:
  40. return download_url(snapshot.url)
  41. else:
  42. return document
  43. # output = 'readability/'
  44. @enforce_types
  45. def should_save_readability(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
  46. out_dir = out_dir or snapshot.link_dir
  47. if is_static_file(snapshot.url):
  48. return False
  49. output = Path(out_dir or snapshot.snapshot_dir) / 'readability'
  50. if not overwrite and output.exists():
  51. return False
  52. return SAVE_READABILITY and READABILITY_VERSION
  53. @enforce_types
  54. def save_readability(snapshot: Model, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  55. """download reader friendly version using @mozilla/readability"""
  56. out_dir = Path(out_dir or snapshot.snapshot_dir)
  57. output_folder = out_dir.absolute() / "readability"
  58. output = str(output_folder)
  59. # Readability Docs: https://github.com/mozilla/readability
  60. status = 'succeeded'
  61. # fake command to show the user so they have something to try debugging if get_html fails
  62. cmd = [
  63. CURL_BINARY,
  64. snapshot.url
  65. ]
  66. readability_content = None
  67. timer = TimedProgress(timeout, prefix=' ')
  68. try:
  69. document = get_html(snapshot, out_dir)
  70. temp_doc = NamedTemporaryFile(delete=False)
  71. temp_doc.write(document.encode("utf-8"))
  72. temp_doc.close()
  73. cmd = [
  74. DEPENDENCIES['READABILITY_BINARY']['path'],
  75. temp_doc.name
  76. ]
  77. result = run(cmd, cwd=out_dir, timeout=timeout)
  78. result_json = json.loads(result.stdout)
  79. output_folder.mkdir(exist_ok=True)
  80. readability_content = result_json.pop("textContent")
  81. atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
  82. atomic_write(str(output_folder / "content.txt"), readability_content)
  83. atomic_write(str(output_folder / "article.json"), result_json)
  84. # parse out number of files downloaded from last line of stderr:
  85. # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
  86. output_tail = [
  87. line.strip()
  88. for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
  89. if line.strip()
  90. ]
  91. hints = (
  92. 'Got readability response code: {}.'.format(result.returncode),
  93. *output_tail,
  94. )
  95. # Check for common failure cases
  96. if (result.returncode > 0):
  97. raise ArchiveError('Readability was not able to archive the page', hints)
  98. except (Exception, OSError) as err:
  99. status = 'failed'
  100. output = err
  101. finally:
  102. timer.end()
  103. return ArchiveResult(
  104. cmd=cmd,
  105. pwd=str(out_dir),
  106. cmd_version=READABILITY_VERSION,
  107. output=output,
  108. status=status,
  109. index_texts= [readability_content] if readability_content else [],
  110. **timer.stats,
  111. )