singlefile.py

__package__ = 'archivebox.extractors'

from pathlib import Path
from typing import Optional
import json

from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
    dedupe,
)
from ..config import (
    TIMEOUT,
    SAVE_SINGLEFILE,
    DEPENDENCIES,
    SINGLEFILE_VERSION,
    SINGLEFILE_ARGS,
    SINGLEFILE_EXTRA_ARGS,
    CHROME_BINARY,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


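# Gate the SingleFile extractor: skip URLs that point directly at a static file,
# skip snapshots that already have a singlefile.html (unless overwriting),
# otherwise defer to the SAVE_SINGLEFILE config flag.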
@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    if is_static_file(link.url):
        return False

    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and (out_dir / 'singlefile.html').exists():
        return False

    return SAVE_SINGLEFILE


@enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using single-file"""

    out_dir = out_dir or Path(link.link_dir)
    output = "singlefile.html"

    browser_args = chrome_args(CHROME_TIMEOUT=0)
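
    # chrome_args() returns the Chrome binary followed by its flags; only the
    # flags (element 1 onward) are serialized as JSON for SingleFile, since the
    # binary is passed separately via --browser-executable-path below.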
    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
    browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
    # later options take precedence
    options = [
        '--browser-executable-path={}'.format(CHROME_BINARY),
        *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
        browser_args,
        *SINGLEFILE_ARGS,
        *SINGLEFILE_EXTRA_ARGS,
    ]
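    # dedupe() resolves conflicting flags in favor of later entries, so
    # SINGLEFILE_EXTRA_ARGS can override anything set above.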
    cmd = [
        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
        *dedupe(options),
        link.url,
        output,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    result = None
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
            if line.strip()
        ]
        hints = (
            'Got single-file response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0) or not (out_dir / output).is_file():
            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
        chmod_file(output, cwd=str(out_dir))
    except (Exception, OSError) as err:
        status = 'failed'
        # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
        cmd[2] = browser_args.replace('"', "\\\"")
        # result is None if run() itself raised (e.g. on a timeout) before returning
        if result:
            err.hints = (result.stdout + result.stderr).decode().split('\n')
        output = err
    finally:
        timer.end()

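    # `output` is the saved filename on success, or the exception on failure.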
    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=SINGLEFILE_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )