git.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from archivebox.misc.system import run, chmod_file
  5. from archivebox.misc.util import (
  6. enforce_types,
  7. is_static_file,
  8. domain,
  9. extension,
  10. without_query,
  11. without_fragment,
  12. )
  13. from ..logging_util import TimedProgress
  14. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  15. from archivebox.plugins_extractor.git.config import GIT_CONFIG
  16. from archivebox.plugins_extractor.git.binaries import GIT_BINARY
  17. def get_output_path():
  18. return 'git/'
  19. def get_embed_path(archiveresult=None):
  20. if not archiveresult:
  21. return get_output_path()
  22. try:
  23. return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
  24. except IndexError:
  25. pass
  26. return get_output_path()
  27. @enforce_types
  28. def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  29. if is_static_file(link.url):
  30. return False
  31. out_dir = out_dir or Path(link.link_dir)
  32. if not overwrite and (out_dir / get_output_path()).exists():
  33. return False
  34. is_clonable_url = (
  35. (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
  36. or (extension(link.url) == 'git')
  37. )
  38. if not is_clonable_url:
  39. return False
  40. return GIT_CONFIG.SAVE_GIT
  41. @enforce_types
  42. def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
  43. """download full site using git"""
  44. git_binary = GIT_BINARY.load()
  45. assert git_binary.abspath and git_binary.version
  46. out_dir = out_dir or Path(link.link_dir)
  47. output: ArchiveOutput = get_output_path()
  48. output_path = out_dir / output
  49. output_path.mkdir(exist_ok=True)
  50. cmd = [
  51. str(git_binary.abspath),
  52. 'clone',
  53. *GIT_CONFIG.GIT_ARGS,
  54. *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
  55. without_query(without_fragment(link.url)),
  56. ]
  57. status = 'succeeded'
  58. timer = TimedProgress(timeout, prefix=' ')
  59. try:
  60. result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
  61. if result.returncode == 128:
  62. # ignore failed re-download when the folder already exists
  63. pass
  64. elif result.returncode > 0:
  65. hints = 'Got git response code: {}.'.format(result.returncode)
  66. raise ArchiveError('Failed to save git clone', hints)
  67. chmod_file(output, cwd=str(out_dir))
  68. except Exception as err:
  69. status = 'failed'
  70. output = err
  71. finally:
  72. timer.end()
  73. return ArchiveResult(
  74. cmd=cmd,
  75. pwd=str(out_dir),
  76. cmd_version=str(git_binary.version),
  77. output=output,
  78. status=status,
  79. **timer.stats,
  80. )