pdf.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from django.db.models import Model
  5. from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
  6. from ..system import run, chmod_file
  7. from ..util import (
  8. enforce_types,
  9. is_static_file,
  10. chrome_args,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. SAVE_PDF,
  15. CHROME_VERSION,
  16. )
  17. from ..logging_util import TimedProgress
  18. # output = 'output.pdf'
  @enforce_types
  def should_save_pdf(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
      """Decide whether the PDF extractor should run for ``snapshot``.

      Args:
          snapshot: The snapshot being archived; its ``url`` and
              ``snapshot_dir`` attributes are read.
          overwrite: When falsy, an existing ``output.pdf`` in the output
              directory causes the extractor to be skipped.
          out_dir: Directory to check for existing output. Defaults to
              ``Path(snapshot.snapshot_dir)`` when not given.

      Returns:
          ``False`` if the URL is a static file, or if ``output.pdf`` already
          exists and ``overwrite`` is not set; otherwise the value of the
          ``SAVE_PDF`` config flag.
      """
      out_dir = out_dir or Path(snapshot.snapshot_dir)

      # Static files are skipped outright by this extractor.
      if is_static_file(snapshot.url):
          return False

      # Keep a previously generated PDF unless the caller asked to overwrite.
      if not overwrite and (out_dir / 'output.pdf').exists():
          return False

      return SAVE_PDF
  @enforce_types
  def save_pdf(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
      """Print the snapshot's URL to ``output.pdf`` via Chrome's ``--print-to-pdf``.

      Args:
          snapshot: The snapshot being archived; its ``url`` and
              ``snapshot_dir`` attributes are read.
          out_dir: Working directory for the command. Defaults to
              ``Path(snapshot.snapshot_dir)`` when not given.
          timeout: Passed to ``chrome_args`` (as ``TIMEOUT``), to the progress
              timer, and to ``run``.

      Returns:
          An ``ArchiveResult`` describing the attempt. On success ``output`` is
          ``'output.pdf'`` and ``status`` is ``'succeeded'``; if anything raises,
          ``output`` is the caught exception and ``status`` is ``'failed'``.
      """
      out_dir = out_dir or Path(snapshot.snapshot_dir)
      workdir = str(out_dir)
      pdf_name = 'output.pdf'

      output: ArchiveOutput = pdf_name
      cmd = [*chrome_args(TIMEOUT=timeout), '--print-to-pdf', snapshot.url]
      status = 'succeeded'
      progress = TimedProgress(timeout, prefix=' ')

      try:
          proc = run(cmd, cwd=workdir, timeout=timeout)
          if proc.returncode:
              # Prefer stderr for the failure hints, falling back to stdout.
              raw_hints = proc.stderr or proc.stdout
              raise ArchiveError('Failed to save PDF', raw_hints.decode())
          chmod_file(pdf_name, cwd=workdir)
      except Exception as err:
          # Failures are reported through the result instead of propagating.
          status = 'failed'
          output = err
      finally:
          # Always stop the progress timer, whether or not the command worked.
          progress.end()

      return ArchiveResult(
          cmd=cmd,
          pwd=workdir,
          cmd_version=CHROME_VERSION,
          output=output,
          status=status,
          **progress.stats,
      )