pdf.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. chrome_cleanup,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_PDF,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  18. @enforce_types
  19. def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  20. if is_static_file(link.url):
  21. return False
  22. out_dir = out_dir or Path(link.link_dir)
  23. if not overwrite and (out_dir / 'output.pdf').exists():
  24. return False
  25. return SAVE_PDF
  26. @enforce_types
  27. def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  28. """print PDF of site to file using chrome --headless"""
  29. out_dir = out_dir or Path(link.link_dir)
  30. output: ArchiveOutput = 'output.pdf'
  31. cmd = [
  32. *chrome_args(),
  33. '--print-to-pdf',
  34. link.url,
  35. ]
  36. status = 'succeeded'
  37. timer = TimedProgress(timeout, prefix=' ')
  38. try:
  39. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  40. if result.returncode:
  41. hints = (result.stderr or result.stdout).decode()
  42. raise ArchiveError('Failed to save PDF', hints)
  43. chmod_file('output.pdf', cwd=str(out_dir))
  44. except Exception as err:
  45. status = 'failed'
  46. output = err
  47. chrome_cleanup()
  48. finally:
  49. timer.end()
  50. return ArchiveResult(
  51. cmd=cmd,
  52. pwd=str(out_dir),
  53. cmd_version=CHROME_VERSION,
  54. output=output,
  55. status=status,
  56. **timer.stats,
  57. )