pdf.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. __package__ = 'archivebox.extractors'
  2. import os
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. )
  11. from ..config import (
  12. TIMEOUT,
  13. SAVE_PDF,
  14. CHROME_VERSION,
  15. )
  16. from ..cli.logging import TimedProgress
  17. @enforce_types
  18. def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
  19. out_dir = out_dir or link.link_dir
  20. if is_static_file(link.url):
  21. return False
  22. if os.path.exists(os.path.join(out_dir, 'output.pdf')):
  23. return False
  24. return SAVE_PDF
  25. @enforce_types
  26. def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  27. """print PDF of site to file using chrome --headless"""
  28. out_dir = out_dir or link.link_dir
  29. output: ArchiveOutput = 'output.pdf'
  30. cmd = [
  31. *chrome_args(TIMEOUT=timeout),
  32. '--print-to-pdf',
  33. link.url,
  34. ]
  35. status = 'succeeded'
  36. timer = TimedProgress(timeout, prefix=' ')
  37. try:
  38. result = run(cmd, cwd=out_dir, timeout=timeout)
  39. if result.returncode:
  40. hints = (result.stderr or result.stdout).decode()
  41. raise ArchiveError('Failed to save PDF', hints)
  42. chmod_file('output.pdf', cwd=out_dir)
  43. except Exception as err:
  44. status = 'failed'
  45. output = err
  46. finally:
  47. timer.end()
  48. return ArchiveResult(
  49. cmd=cmd,
  50. pwd=out_dir,
  51. cmd_version=CHROME_VERSION,
  52. output=output,
  53. status=status,
  54. **timer.stats,
  55. )