# dom.py
  1. __package__ = 'archivebox.extractors'
  2. import os
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. )
  11. from ..config import (
  12. TIMEOUT,
  13. SAVE_DOM,
  14. CHROME_VERSION,
  15. )
  16. from ..cli.logging import TimedProgress
  17. @enforce_types
  18. def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
  19. out_dir = out_dir or link.link_dir
  20. if is_static_file(link.url):
  21. return False
  22. if os.path.exists(os.path.join(out_dir, 'output.html')):
  23. return False
  24. return SAVE_DOM
  25. @enforce_types
  26. def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  27. """print HTML of site to file using chrome --dump-html"""
  28. out_dir = out_dir or link.link_dir
  29. output: ArchiveOutput = 'output.html'
  30. output_path = os.path.join(out_dir, str(output))
  31. cmd = [
  32. *chrome_args(TIMEOUT=timeout),
  33. '--dump-dom',
  34. link.url
  35. ]
  36. status = 'succeeded'
  37. timer = TimedProgress(timeout, prefix=' ')
  38. try:
  39. with open(output_path, 'w+') as f:
  40. result = run(cmd, stdout=f, cwd=out_dir, timeout=timeout)
  41. if result.returncode:
  42. hints = result.stderr.decode()
  43. raise ArchiveError('Failed to save DOM', hints)
  44. chmod_file(output, cwd=out_dir)
  45. except Exception as err:
  46. status = 'failed'
  47. output = err
  48. finally:
  49. timer.end()
  50. return ArchiveResult(
  51. cmd=cmd,
  52. pwd=out_dir,
  53. cmd_version=CHROME_VERSION,
  54. output=output,
  55. status=status,
  56. **timer.stats,
  57. )