dom.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file, atomic_write
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. chrome_cleanup,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_DOM,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  @enforce_types
  def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
      """Decide whether a DOM dump should be produced for `link`.

      Returns False when the link URL is a static file, and also when
      `overwrite` is falsy and an `output.html` larger than one byte already
      exists in the output directory. Otherwise returns the SAVE_DOM config
      value.

      `out_dir` defaults to `link.link_dir` when not given.
      """
      if is_static_file(link.url):
          return False

      target_dir = out_dir or Path(link.link_dir)
      existing = target_dir / 'output.html'

      # A file of at most one byte is treated as missing, so the dump is redone.
      already_saved = (
          not overwrite
          and existing.exists()
          and existing.stat().st_size > 1
      )
      return False if already_saved else SAVE_DOM
  @enforce_types
  def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
      """Print the HTML of a site to a file using chrome --dump-dom.

      Runs the command `chrome_args()` + `--dump-dom` + the link URL with the
      output directory as working directory, and writes the command's stdout
      to `output.html` in that directory.

      Args:
          link: the Link whose `url` is dumped.
          out_dir: directory to run in and write to; defaults to `link.link_dir`.
          timeout: passed both to `run` and to the TimedProgress timer.

      Returns:
          An ArchiveResult describing the run. On success `output` is
          'output.html' and `status` is 'succeeded'. If anything inside the
          try block raises an Exception, it is caught rather than propagated:
          `status` is 'failed' and `output` is the caught exception.
      """
      out_dir = out_dir or Path(link.link_dir)
      output: ArchiveOutput = 'output.html'
      output_path = out_dir / output
      cmd = [
          *chrome_args(),
          '--dump-dom',
          link.url,
      ]
      status = 'succeeded'
      timer = TimedProgress(timeout, prefix=' ')
      try:
          result = run(cmd, cwd=str(out_dir), timeout=timeout)
          # NOTE(review): stdout is written before the exit code is checked, so
          # whatever chrome printed stays on disk even when the run is reported
          # as failed. Presumably deliberate best-effort -- confirm.
          atomic_write(output_path, result.stdout)

          if result.returncode:
              # Decode leniently: undecodable stderr bytes would otherwise raise
              # UnicodeDecodeError here and hide the real failure and its hints.
              hints = result.stderr.decode(errors='replace')
              raise ArchiveError('Failed to save DOM', hints)

          chmod_file(output, cwd=str(out_dir))
      except Exception as err:
          status = 'failed'
          output = err
          chrome_cleanup()
      finally:
          # Always stop the timer so timer.stats is populated for the result.
          timer.end()

      return ArchiveResult(
          cmd=cmd,
          pwd=str(out_dir),
          cmd_version=CHROME_VERSION,
          output=output,
          status=status,
          **timer.stats,
      )