screenshot.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. chrome_cleanup,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_SCREENSHOT,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  18. @enforce_types
  19. def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  20. if is_static_file(link.url):
  21. return False
  22. out_dir = out_dir or Path(link.link_dir)
  23. if not overwrite and (out_dir / 'screenshot.png').exists():
  24. return False
  25. return SAVE_SCREENSHOT
  26. @enforce_types
  27. def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  28. """take screenshot of site using chrome --headless"""
  29. out_dir = out_dir or Path(link.link_dir)
  30. output: ArchiveOutput = 'screenshot.png'
  31. cmd = [
  32. *chrome_args(),
  33. '--screenshot',
  34. link.url,
  35. ]
  36. status = 'succeeded'
  37. timer = TimedProgress(timeout, prefix=' ')
  38. try:
  39. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  40. if result.returncode:
  41. hints = (result.stderr or result.stdout).decode()
  42. raise ArchiveError('Failed to save screenshot', hints)
  43. chmod_file(output, cwd=str(out_dir))
  44. except Exception as err:
  45. status = 'failed'
  46. output = err
  47. chrome_cleanup()
  48. finally:
  49. timer.end()
  50. return ArchiveResult(
  51. cmd=cmd,
  52. pwd=str(out_dir),
  53. cmd_version=CHROME_VERSION,
  54. output=output,
  55. status=status,
  56. **timer.stats,
  57. )