singlefile.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. import json
  5. from ..index.schema import Link, ArchiveResult, ArchiveError
  6. from ..system import run, chmod_file
  7. from ..util import (
  8. enforce_types,
  9. is_static_file,
  10. chrome_args,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_SINGLEFILE,
  15. DEPENDENCIES,
  16. SINGLEFILE_VERSION,
  17. SINGLEFILE_ARGS,
  18. CHROME_BINARY,
  19. )
  20. from ..logging_util import TimedProgress
  21. @enforce_types
  22. def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  23. if is_static_file(link.url):
  24. return False
  25. out_dir = out_dir or Path(link.link_dir)
  26. if not overwrite and (out_dir / 'singlefile.html').exists():
  27. return False
  28. return SAVE_SINGLEFILE
  29. @enforce_types
  30. def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  31. """download full site using single-file"""
  32. out_dir = out_dir or Path(link.link_dir)
  33. output = "singlefile.html"
  34. browser_args = chrome_args(CHROME_TIMEOUT=0)
  35. # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
  36. browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
  37. options = [
  38. *SINGLEFILE_ARGS,
  39. '--browser-executable-path={}'.format(CHROME_BINARY),
  40. browser_args,
  41. ]
  42. # Deduplicate options (single-file doesn't like when you use the same option two times)
  43. #
  44. # NOTE: Options names that come first clobber conflicting names that come later
  45. # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
  46. # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
  47. # kind of like the ergonomic principle of lexical scope in programming languages.
  48. seen_option_names = []
  49. def test_seen(argument):
  50. option_name = argument.split("=")[0]
  51. if option_name in seen_option_names:
  52. return False
  53. else:
  54. seen_option_names.append(option_name)
  55. return True
  56. deduped_options = list(filter(test_seen, options))
  57. cmd = [
  58. DEPENDENCIES['SINGLEFILE_BINARY']['path'],
  59. *deduped_options,
  60. link.url,
  61. output,
  62. ]
  63. status = 'succeeded'
  64. timer = TimedProgress(timeout, prefix=' ')
  65. try:
  66. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  67. # parse out number of files downloaded from last line of stderr:
  68. # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
  69. output_tail = [
  70. line.strip()
  71. for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
  72. if line.strip()
  73. ]
  74. hints = (
  75. 'Got single-file response code: {}.'.format(result.returncode),
  76. *output_tail,
  77. )
  78. # Check for common failure cases
  79. if (result.returncode > 0) or not (out_dir / output).is_file():
  80. raise ArchiveError('SingleFile was not able to archive the page', hints)
  81. chmod_file(output, cwd=str(out_dir))
  82. except (Exception, OSError) as err:
  83. status = 'failed'
  84. # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
  85. cmd[2] = browser_args.replace('"', "\\\"")
  86. output = err
  87. finally:
  88. timer.end()
  89. return ArchiveResult(
  90. cmd=cmd,
  91. pwd=str(out_dir),
  92. cmd_version=SINGLEFILE_VERSION,
  93. output=output,
  94. status=status,
  95. **timer.stats,
  96. )