favicon.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput
  5. from ..system import chmod_file, run
  6. from ..util import (
  7. enforce_types,
  8. domain,
  9. dedupe,
  10. )
  11. from ..config import CONFIG
  12. from ..logging_util import TimedProgress
  13. @enforce_types
  14. def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite: bool=False) -> bool:
  15. assert link.link_dir
  16. out_dir = Path(out_dir or link.link_dir)
  17. if not overwrite and (out_dir / 'favicon.ico').exists():
  18. return False
  19. return CONFIG.SAVE_FAVICON
  20. @enforce_types
  21. def get_output_path():
  22. return 'favicon.ico'
  23. @enforce_types
  24. def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
  25. """download site favicon from google's favicon api"""
  26. out_dir = Path(out_dir or link.link_dir)
  27. assert out_dir.exists()
  28. output: ArchiveOutput = 'favicon.ico'
  29. # later options take precedence
  30. options = [
  31. *CONFIG.CURL_ARGS,
  32. *CONFIG.CURL_EXTRA_ARGS,
  33. '--max-time', str(timeout),
  34. '--output', str(output),
  35. *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
  36. *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
  37. ]
  38. cmd = [
  39. CONFIG.CURL_BINARY,
  40. *dedupe(options),
  41. CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
  42. ]
  43. status = 'failed'
  44. timer = TimedProgress(timeout, prefix=' ')
  45. try:
  46. run(cmd, cwd=str(out_dir), timeout=timeout)
  47. chmod_file(output, cwd=str(out_dir))
  48. status = 'succeeded'
  49. except Exception as err:
  50. output = err
  51. finally:
  52. timer.end()
  53. return ArchiveResult(
  54. cmd=cmd,
  55. pwd=str(out_dir),
  56. cmd_version=CONFIG.CURL_VERSION,
  57. output=output,
  58. status=status,
  59. **timer.stats,
  60. )