singlefile.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. import json
  5. from django.db.models import Model
  6. from ..index.schema import ArchiveResult, ArchiveError
  7. from ..system import run, chmod_file
  8. from ..util import (
  9. enforce_types,
  10. is_static_file,
  11. chrome_args,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. SAVE_SINGLEFILE,
  16. DEPENDENCIES,
  17. SINGLEFILE_VERSION,
  18. CHROME_BINARY,
  19. )
  20. from ..logging_util import TimedProgress
  21. @enforce_types
  22. def should_save_singlefile(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
  23. out_dir = out_dir or Path(snapshot.snapshot_dir)
  24. if is_static_file(snapshot.url):
  25. return False
  26. out_dir = out_dir or Path(link.link_dir)
  27. if not overwrite and (out_dir / 'singlefile.html').exists():
  28. return False
  29. return SAVE_SINGLEFILE
  30. @enforce_types
  31. def save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  32. """download full site using single-file"""
  33. out_dir = out_dir or Path(snapshot.snapshot_dir)
  34. output = "singlefile.html"
  35. browser_args = chrome_args(TIMEOUT=0)
  36. # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
  37. browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
  38. cmd = [
  39. DEPENDENCIES['SINGLEFILE_BINARY']['path'],
  40. '--browser-executable-path={}'.format(CHROME_BINARY),
  41. browser_args,
  42. snapshot.url,
  43. output
  44. output,
  45. ]
  46. status = 'succeeded'
  47. timer = TimedProgress(timeout, prefix=' ')
  48. try:
  49. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  50. # parse out number of files downloaded from last line of stderr:
  51. # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
  52. output_tail = [
  53. line.strip()
  54. for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
  55. if line.strip()
  56. ]
  57. hints = (
  58. 'Got single-file response code: {}.'.format(result.returncode),
  59. *output_tail,
  60. )
  61. # Check for common failure cases
  62. if (result.returncode > 0) or not (out_dir / output).is_file():
  63. raise ArchiveError('SingleFile was not able to archive the page', hints)
  64. chmod_file(output, cwd=str(out_dir))
  65. except (Exception, OSError) as err:
  66. status = 'failed'
  67. # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
  68. cmd[2] = browser_args.replace('"', "\\\"")
  69. output = err
  70. finally:
  71. timer.end()
  72. return ArchiveResult(
  73. cmd=cmd,
  74. pwd=str(out_dir),
  75. cmd_version=SINGLEFILE_VERSION,
  76. output=output,
  77. status=status,
  78. **timer.stats,
  79. )