singlefile.py

__package__ = 'archivebox.extractors'

from pathlib import Path
from typing import Optional
import json

from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
    dedupe,
)
from ..config import (
    TIMEOUT,
    SAVE_SINGLEFILE,
    DEPENDENCIES,
    SINGLEFILE_VERSION,
    SINGLEFILE_ARGS,
    SINGLEFILE_EXTRA_ARGS,
    CHROME_BINARY,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


def get_output_path():
    return 'singlefile.html'


@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    if is_static_file(link.url):
        return False

    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and (out_dir / get_output_path()).exists():
        return False

    return SAVE_SINGLEFILE


@enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using single-file"""

    out_dir = out_dir or Path(link.link_dir)
    output = get_output_path()

    browser_args = chrome_args(CHROME_TIMEOUT=0)

    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
    browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
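    # chrome_args() puts the browser binary path first, so [1:] keeps only the
    # flags; the binary itself is passed separately via --browser-executable-path below.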
    # later options take precedence
    options = [
        '--browser-executable-path={}'.format(CHROME_BINARY),
        *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
        browser_args,
        *SINGLEFILE_ARGS,
        *SINGLEFILE_EXTRA_ARGS,
    ]
    cmd = [
        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
        *dedupe(options),
        link.url,
        output,
    ]
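    # Illustrative only (actual paths and flags depend on local config and on
    # how dedupe() resolves conflicting options), the assembled command looks like:
    #   single-file --browser-executable-path=/usr/bin/chromium \
    #       '--browser-args=["--headless", ...]' \
    #       https://example.com/page singlefile.html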

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    result = None
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)

        # keep the last few non-empty lines of stdout/stderr to use as hints
        # if the archive attempt turns out to have failed
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
            if line.strip()
        ]
        hints = (
            'Got single-file response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0) or not (out_dir / output).is_file():
            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
        chmod_file(output, cwd=str(out_dir))
    except (Exception, OSError) as err:
        status = 'failed'
        # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
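        # note: this assumes the JSON --browser-args entry sits at cmd[2]; when
        # COOKIES_FILE is configured, --browser-cookies-file precedes it and the
        # JSON args land at cmd[3] instead.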
        cmd[2] = browser_args.replace('"', "\\\"")
        if result:
            err.hints = (result.stdout + result.stderr).decode().split('\n')
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=SINGLEFILE_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
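
# Rough usage sketch (illustrative; the real call site is the extractor pipeline,
# and the exact arguments shown here are assumptions):
#
#   snapshot_dir = Path(link.link_dir)
#   if should_save_singlefile(link, out_dir=snapshot_dir, overwrite=False):
#       result = save_singlefile(link, out_dir=snapshot_dir, timeout=TIMEOUT)
#       print(result.status, result.output)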