mercury.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from subprocess import CompletedProcess
  4. from typing import Optional, List
  5. import json
  6. from ..index.schema import Link, ArchiveResult, ArchiveError
  7. from ..system import run, atomic_write
  8. from ..util import (
  9. enforce_types,
  10. is_static_file,
  11. dedupe,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. SAVE_MERCURY,
  16. DEPENDENCIES,
  17. MERCURY_VERSION,
  18. MERCURY_ARGS,
  19. MERCURY_EXTRA_ARGS,
  20. )
  21. from ..logging_util import TimedProgress
  22. @enforce_types
  23. def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
  24. # parse out last line of stderr
  25. return ArchiveError(
  26. f'Got {cmd[0]} response code: {result.returncode}).',
  27. " ".join(
  28. line.strip()
  29. for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
  30. if line.strip()
  31. ),
  32. )
  33. @enforce_types
  34. def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  35. if is_static_file(link.url):
  36. return False
  37. out_dir = out_dir or Path(link.link_dir)
  38. if not overwrite and (out_dir / 'mercury').exists():
  39. return False
  40. return SAVE_MERCURY
  41. @enforce_types
  42. def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  43. """download reader friendly version using @postlight/mercury-parser"""
  44. out_dir = Path(out_dir or link.link_dir)
  45. output_folder = out_dir.absolute() / "mercury"
  46. output = "mercury"
  47. status = 'succeeded'
  48. timer = TimedProgress(timeout, prefix=' ')
  49. try:
  50. output_folder.mkdir(exist_ok=True)
  51. # later options take precedence
  52. options = [
  53. *MERCURY_ARGS,
  54. *MERCURY_EXTRA_ARGS,
  55. ]
  56. # By default, get plain text version of article
  57. cmd = [
  58. DEPENDENCIES['MERCURY_BINARY']['path'],
  59. link.url,
  60. *dedupe(options)
  61. ]
  62. result = run(cmd, cwd=out_dir, timeout=timeout)
  63. try:
  64. article_text = json.loads(result.stdout)
  65. except json.JSONDecodeError:
  66. raise ShellError(cmd, result)
  67. if article_text.get('failed'):
  68. raise ArchiveError('Mercury was not able to get article text from the URL')
  69. atomic_write(str(output_folder / "content.txt"), article_text["content"])
  70. # Get HTML version of article
  71. cmd = [
  72. DEPENDENCIES['MERCURY_BINARY']['path'],
  73. link.url
  74. ]
  75. result = run(cmd, cwd=out_dir, timeout=timeout)
  76. try:
  77. article_json = json.loads(result.stdout)
  78. except json.JSONDecodeError:
  79. raise ShellError(cmd, result)
  80. if article_text.get('failed'):
  81. raise ArchiveError('Mercury was not able to get article HTML from the URL')
  82. atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
  83. atomic_write(str(output_folder / "article.json"), article_json)
  84. # Check for common failure cases
  85. if (result.returncode > 0):
  86. raise ShellError(cmd, result)
  87. except (ArchiveError, Exception, OSError) as err:
  88. status = 'failed'
  89. output = err
  90. finally:
  91. timer.end()
  92. return ArchiveResult(
  93. cmd=cmd,
  94. pwd=str(out_dir),
  95. cmd_version=MERCURY_VERSION,
  96. output=output,
  97. status=status,
  98. **timer.stats,
  99. )