pdf.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. chrome_args,
  10. chrome_cleanup,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_PDF,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  18. def get_output_path():
  19. return 'output.pdf'
  20. @enforce_types
  21. def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  22. if is_static_file(link.url):
  23. return False
  24. out_dir = out_dir or Path(link.link_dir)
  25. if not overwrite and (out_dir / get_output_path()).exists():
  26. return False
  27. return SAVE_PDF
  28. @enforce_types
  29. def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  30. """print PDF of site to file using chrome --headless"""
  31. out_dir = out_dir or Path(link.link_dir)
  32. output: ArchiveOutput = get_output_path()
  33. cmd = [
  34. *chrome_args(),
  35. '--print-to-pdf',
  36. link.url,
  37. ]
  38. status = 'succeeded'
  39. timer = TimedProgress(timeout, prefix=' ')
  40. try:
  41. result = run(cmd, cwd=str(out_dir), timeout=timeout)
  42. if result.returncode:
  43. hints = (result.stderr or result.stdout).decode()
  44. raise ArchiveError('Failed to save PDF', hints)
  45. chmod_file(get_output_path(), cwd=str(out_dir))
  46. except Exception as err:
  47. status = 'failed'
  48. output = err
  49. chrome_cleanup()
  50. finally:
  51. timer.end()
  52. return ArchiveResult(
  53. cmd=cmd,
  54. pwd=str(out_dir),
  55. cmd_version=CHROME_VERSION,
  56. output=output,
  57. status=status,
  58. **timer.stats,
  59. )