2
0

git.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. __package__ = 'archivebox.extractors'
  2. import os
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. domain,
  10. extension,
  11. without_query,
  12. without_fragment,
  13. )
  14. from ..config import (
  15. TIMEOUT,
  16. SAVE_GIT,
  17. GIT_BINARY,
  18. GIT_VERSION,
  19. GIT_DOMAINS,
  20. CHECK_SSL_VALIDITY
  21. )
  22. from ..cli.logging import TimedProgress
  23. @enforce_types
  24. def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
  25. out_dir = out_dir or link.link_dir
  26. if is_static_file(link.url):
  27. return False
  28. if os.path.exists(os.path.join(out_dir, 'git')):
  29. return False
  30. is_clonable_url = (
  31. (domain(link.url) in GIT_DOMAINS)
  32. or (extension(link.url) == 'git')
  33. )
  34. if not is_clonable_url:
  35. return False
  36. return SAVE_GIT
  37. @enforce_types
  38. def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  39. """download full site using git"""
  40. out_dir = out_dir or link.link_dir
  41. output: ArchiveOutput = 'git'
  42. output_path = os.path.join(out_dir, str(output))
  43. os.makedirs(output_path, exist_ok=True)
  44. cmd = [
  45. GIT_BINARY,
  46. 'clone',
  47. '--mirror',
  48. '--recursive',
  49. *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
  50. without_query(without_fragment(link.url)),
  51. ]
  52. status = 'succeeded'
  53. timer = TimedProgress(timeout, prefix=' ')
  54. try:
  55. result = run(cmd, cwd=output_path, timeout=timeout + 1)
  56. if result.returncode == 128:
  57. # ignore failed re-download when the folder already exists
  58. pass
  59. elif result.returncode > 0:
  60. hints = 'Got git response code: {}.'.format(result.returncode)
  61. raise ArchiveError('Failed to save git clone', hints)
  62. chmod_file(output, cwd=out_dir)
  63. except Exception as err:
  64. status = 'failed'
  65. output = err
  66. finally:
  67. timer.end()
  68. return ArchiveResult(
  69. cmd=cmd,
  70. pwd=out_dir,
  71. cmd_version=GIT_VERSION,
  72. output=output,
  73. status=status,
  74. **timer.stats,
  75. )