git.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. domain,
  10. extension,
  11. without_query,
  12. without_fragment,
  13. )
  14. from ..config import (
  15. TIMEOUT,
  16. SAVE_GIT,
  17. GIT_BINARY,
  18. GIT_ARGS,
  19. GIT_VERSION,
  20. GIT_DOMAINS,
  21. CHECK_SSL_VALIDITY
  22. )
  23. from ..logging_util import TimedProgress
  24. def get_output_path():
  25. return 'git/'
  26. def get_embed_path(archiveresult=None):
  27. if not archiveresult:
  28. return get_output_path()
  29. try:
  30. return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
  31. except IndexError:
  32. pass
  33. return get_output_path()
  34. @enforce_types
  35. def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  36. if is_static_file(link.url):
  37. return False
  38. out_dir = out_dir or Path(link.link_dir)
  39. if not overwrite and (out_dir / get_output_path()).exists():
  40. return False
  41. is_clonable_url = (
  42. (domain(link.url) in GIT_DOMAINS)
  43. or (extension(link.url) == 'git')
  44. )
  45. if not is_clonable_url:
  46. return False
  47. return SAVE_GIT
  48. @enforce_types
  49. def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  50. """download full site using git"""
  51. out_dir = out_dir or Path(link.link_dir)
  52. output: ArchiveOutput = get_output_path()
  53. output_path = out_dir / output
  54. output_path.mkdir(exist_ok=True)
  55. cmd = [
  56. GIT_BINARY,
  57. 'clone',
  58. *GIT_ARGS,
  59. *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
  60. without_query(without_fragment(link.url)),
  61. ]
  62. status = 'succeeded'
  63. timer = TimedProgress(timeout, prefix=' ')
  64. try:
  65. result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
  66. if result.returncode == 128:
  67. # ignore failed re-download when the folder already exists
  68. pass
  69. elif result.returncode > 0:
  70. hints = 'Got git response code: {}.'.format(result.returncode)
  71. raise ArchiveError('Failed to save git clone', hints)
  72. chmod_file(output, cwd=str(out_dir))
  73. except Exception as err:
  74. status = 'failed'
  75. output = err
  76. finally:
  77. timer.end()
  78. return ArchiveResult(
  79. cmd=cmd,
  80. pwd=str(out_dir),
  81. cmd_version=GIT_VERSION,
  82. output=output,
  83. status=status,
  84. **timer.stats,
  85. )