dom.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file, atomic_write
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. chrome_cleanup,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_DOM,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  18. def get_output_path():
  19. return 'output.html'
  20. @enforce_types
  21. def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  22. if is_static_file(link.url):
  23. return False
  24. out_dir = out_dir or Path(link.link_dir)
  25. if not overwrite and (out_dir / get_output_path()).exists():
  26. if (out_dir / get_output_path()).stat().st_size > 1:
  27. return False
  28. return SAVE_DOM
  29. @enforce_types
  30. def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  31. """print HTML of site to file using chrome --dump-html"""
  32. out_dir = out_dir or Path(link.link_dir)
  33. output: ArchiveOutput = get_output_path()
  34. output_path = out_dir / output
  35. cmd = [
  36. *chrome_args(),
  37. '--dump-dom',
  38. link.url
  39. ]
  40. status = 'succeeded'
  41. timer = TimedProgress(timeout, prefix=' ')
  42. try:
  43. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  44. atomic_write(output_path, result.stdout)
  45. if result.returncode:
  46. hints = result.stderr.decode()
  47. raise ArchiveError('Failed to save DOM', hints)
  48. chmod_file(output, cwd=str(out_dir))
  49. except Exception as err:
  50. status = 'failed'
  51. output = err
  52. chrome_cleanup()
  53. finally:
  54. timer.end()
  55. return ArchiveResult(
  56. cmd=cmd,
  57. pwd=str(out_dir),
  58. cmd_version=CHROME_VERSION,
  59. output=output,
  60. status=status,
  61. **timer.stats,
  62. )