git.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from django.db.models import Model
  5. from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
  6. from ..system import run, chmod_file
  7. from ..util import (
  8. enforce_types,
  9. is_static_file,
  10. domain,
  11. extension,
  12. without_query,
  13. without_fragment,
  14. )
  15. from ..config import (
  16. TIMEOUT,
  17. SAVE_GIT,
  18. GIT_BINARY,
  19. GIT_ARGS,
  20. GIT_VERSION,
  21. GIT_DOMAINS,
  22. CHECK_SSL_VALIDITY
  23. )
  24. from ..logging_util import TimedProgress
  25. # output = 'git/'
  26. # @contents = output.glob('*.*')
  27. # @exists = self.contents.exists()
  28. # @size => get_size(self.contents)
  29. # @num_files => len(self.contents)
  30. @enforce_types
  31. def should_save_git(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
  32. out_dir = out_dir or snapshot.snapshot_dir
  33. if is_static_file(snapshot.url):
  34. return False
  35. out_dir = out_dir or Path(link.link_dir)
  36. if not overwrite and (out_dir / 'git').exists():
  37. return False
  38. is_clonable_url = (
  39. (domain(snapshot.url) in GIT_DOMAINS)
  40. or (extension(snapshot.url) == 'git')
  41. )
  42. if not is_clonable_url:
  43. return False
  44. return SAVE_GIT
  45. @enforce_types
  46. def save_git(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  47. """download full site using git"""
  48. out_dir = out_dir or Path(snapshot.snapshot_dir)
  49. output: ArchiveOutput = 'git'
  50. output_path = out_dir / output
  51. output_path.mkdir(exist_ok=True)
  52. cmd = [
  53. GIT_BINARY,
  54. 'clone',
  55. *GIT_ARGS,
  56. *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
  57. without_query(without_fragment(snapshot.url)),
  58. ]
  59. status = 'succeeded'
  60. timer = TimedProgress(timeout, prefix=' ')
  61. try:
  62. result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
  63. if result.returncode == 128:
  64. # ignore failed re-download when the folder already exists
  65. pass
  66. elif result.returncode > 0:
  67. hints = 'Got git response code: {}.'.format(result.returncode)
  68. raise ArchiveError('Failed to save git clone', hints)
  69. chmod_file(output, cwd=str(out_dir))
  70. except Exception as err:
  71. status = 'failed'
  72. output = err
  73. finally:
  74. timer.end()
  75. return ArchiveResult(
  76. cmd=cmd,
  77. pwd=str(out_dir),
  78. cmd_version=GIT_VERSION,
  79. output=output,
  80. status=status,
  81. **timer.stats,
  82. )