mercury.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from subprocess import CompletedProcess
  4. from typing import Optional, List
  5. import json
  6. from ..index.schema import Link, ArchiveResult, ArchiveError
  7. from ..system import run, atomic_write
  8. from ..util import (
  9. enforce_types,
  10. is_static_file,
  11. dedupe,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. SAVE_MERCURY,
  16. DEPENDENCIES,
  17. MERCURY_VERSION,
  18. MERCURY_ARGS,
  19. MERCURY_EXTRA_ARGS,
  20. )
  21. from ..logging_util import TimedProgress
  22. def get_output_path():
  23. return 'mercury/'
  24. def get_embed_path(archiveresult=None):
  25. return get_output_path() + 'content.html'
  26. @enforce_types
  27. def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
  28. # parse out last line of stderr
  29. return ArchiveError(
  30. f'Got {cmd[0]} response code: {result.returncode}).',
  31. " ".join(
  32. line.strip()
  33. for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
  34. if line.strip()
  35. ),
  36. )
  37. @enforce_types
  38. def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  39. if is_static_file(link.url):
  40. return False
  41. out_dir = out_dir or Path(link.link_dir)
  42. if not overwrite and (out_dir / get_output_path()).exists():
  43. return False
  44. return SAVE_MERCURY
  45. @enforce_types
  46. def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  47. """download reader friendly version using @postlight/mercury-parser"""
  48. out_dir = Path(out_dir or link.link_dir)
  49. output_folder = out_dir.absolute() / get_output_path()
  50. output = get_output_path()
  51. status = 'succeeded'
  52. timer = TimedProgress(timeout, prefix=' ')
  53. try:
  54. output_folder.mkdir(exist_ok=True)
  55. # later options take precedence
  56. options = [
  57. *MERCURY_ARGS,
  58. *MERCURY_EXTRA_ARGS,
  59. ]
  60. # By default, get plain text version of article
  61. cmd = [
  62. DEPENDENCIES['MERCURY_BINARY']['path'],
  63. link.url,
  64. *dedupe(options)
  65. ]
  66. result = run(cmd, cwd=out_dir, timeout=timeout)
  67. try:
  68. article_text = json.loads(result.stdout)
  69. except json.JSONDecodeError:
  70. raise ShellError(cmd, result)
  71. if article_text.get('failed'):
  72. raise ArchiveError('Mercury was not able to get article text from the URL')
  73. atomic_write(str(output_folder / "content.txt"), article_text["content"])
  74. # Get HTML version of article
  75. cmd = [
  76. DEPENDENCIES['MERCURY_BINARY']['path'],
  77. link.url
  78. ]
  79. result = run(cmd, cwd=out_dir, timeout=timeout)
  80. try:
  81. article_json = json.loads(result.stdout)
  82. except json.JSONDecodeError:
  83. raise ShellError(cmd, result)
  84. if article_text.get('failed'):
  85. raise ArchiveError('Mercury was not able to get article HTML from the URL')
  86. atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
  87. atomic_write(str(output_folder / "article.json"), article_json)
  88. # Check for common failure cases
  89. if (result.returncode > 0):
  90. raise ShellError(cmd, result)
  91. except (ArchiveError, Exception, OSError) as err:
  92. status = 'failed'
  93. output = err
  94. finally:
  95. timer.end()
  96. return ArchiveResult(
  97. cmd=cmd,
  98. pwd=str(out_dir),
  99. cmd_version=MERCURY_VERSION,
  100. output=output,
  101. status=status,
  102. **timer.stats,
  103. )