mercury.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from subprocess import CompletedProcess
  4. from typing import Optional, List
  5. import json
  6. from ..index.schema import Link, ArchiveResult, ArchiveError
  7. from ..system import run, atomic_write
  8. from ..util import (
  9. enforce_types,
  10. is_static_file,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_MERCURY,
  15. DEPENDENCIES,
  16. MERCURY_VERSION,
  17. )
  18. from ..logging_util import TimedProgress
  19. @enforce_types
  20. def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
  21. # parse out last line of stderr
  22. return ArchiveError(
  23. f'Got {cmd[0]} response code: {result.returncode}).',
  24. " ".join(
  25. line.strip()
  26. for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
  27. if line.strip()
  28. ),
  29. )
  30. @enforce_types
  31. def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  32. if is_static_file(link.url):
  33. return False
  34. out_dir = out_dir or Path(link.link_dir)
  35. if not overwrite and (out_dir / 'mercury').exists():
  36. return False
  37. return SAVE_MERCURY
  38. @enforce_types
  39. def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  40. """download reader friendly version using @postlight/mercury-parser"""
  41. out_dir = Path(out_dir or link.link_dir)
  42. output_folder = out_dir.absolute() / "mercury"
  43. output = "mercury"
  44. status = 'succeeded'
  45. timer = TimedProgress(timeout, prefix=' ')
  46. try:
  47. output_folder.mkdir(exist_ok=True)
  48. # Get plain text version of article
  49. cmd = [
  50. DEPENDENCIES['MERCURY_BINARY']['path'],
  51. link.url,
  52. "--format=text"
  53. ]
  54. result = run(cmd, cwd=out_dir, timeout=timeout)
  55. try:
  56. article_text = json.loads(result.stdout)
  57. except json.JSONDecodeError:
  58. raise ShellError(cmd, result)
  59. if article_text.get('failed'):
  60. raise ArchiveError('Mercury was not able to get article text from the URL')
  61. atomic_write(str(output_folder / "content.txt"), article_text["content"])
  62. # Get HTML version of article
  63. cmd = [
  64. DEPENDENCIES['MERCURY_BINARY']['path'],
  65. link.url
  66. ]
  67. result = run(cmd, cwd=out_dir, timeout=timeout)
  68. try:
  69. article_json = json.loads(result.stdout)
  70. except json.JSONDecodeError:
  71. raise ShellError(cmd, result)
  72. if article_text.get('failed'):
  73. raise ArchiveError('Mercury was not able to get article HTML from the URL')
  74. atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
  75. atomic_write(str(output_folder / "article.json"), article_json)
  76. # Check for common failure cases
  77. if (result.returncode > 0):
  78. raise ShellError(cmd, result)
  79. except (ArchiveError, Exception, OSError) as err:
  80. status = 'failed'
  81. output = err
  82. finally:
  83. timer.end()
  84. return ArchiveResult(
  85. cmd=cmd,
  86. pwd=str(out_dir),
  87. cmd_version=MERCURY_VERSION,
  88. output=output,
  89. status=status,
  90. **timer.stats,
  91. )