git.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. domain,
  10. extension,
  11. without_query,
  12. without_fragment,
  13. )
  14. from ..config import CONFIG
  15. from ..logging_util import TimedProgress
  def get_output_path():
      """Return the relative folder name that holds this extractor's output."""
      return 'git/'
  def get_embed_path(archiveresult=None):
      """Return the relative path to embed for a git archive result.

      Without an ``archiveresult`` this is just the extractor's output folder.
      Otherwise it is the output folder joined with the name of the first
      entry globbed inside ``archiveresult.snapshot_dir / <output folder>``,
      followed by a trailing slash. When that folder yields no entries the
      plain output folder is returned instead.
      """
      base = get_output_path()
      if not archiveresult:
          return base

      # Only the first globbed entry matters; None marks "nothing found".
      first_entry = next((archiveresult.snapshot_dir / base).glob('*'), None)
      if first_entry is None:
          return base

      return base + first_entry.name + '/'
  @enforce_types
  def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
      """Decide whether the git extractor should run for ``link``.

      Returns False when:
        * ``is_static_file`` reports the URL as a static file,
        * ``overwrite`` is falsy and the git output folder already exists
          under ``out_dir`` (defaulting to ``link.link_dir``), or
        * the URL's domain is not in ``CONFIG.GIT_DOMAINS`` and its
          extension is not ``'git'``.

      In every other case the value of ``CONFIG.SAVE_GIT`` is returned.
      """
      url = link.url
      if is_static_file(url):
          return False

      snapshot_dir = out_dir or Path(link.link_dir)
      if not overwrite:
          # Existing output is only a reason to skip when not overwriting.
          if (snapshot_dir / get_output_path()).exists():
              return False

      # Not clonable: neither a known git host nor a ``.git``-style URL.
      if domain(url) not in CONFIG.GIT_DOMAINS and extension(url) != 'git':
          return False

      return CONFIG.SAVE_GIT
  @enforce_types
  def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
      """download full site using git

      Runs ``git clone`` on ``link.url`` (with query string and fragment
      stripped) from inside the git output folder under ``out_dir`` and
      reports the outcome as an ArchiveResult.

      Args:
          link: the Link whose URL is cloned.
          out_dir: snapshot folder to clone into; defaults to ``link.link_dir``.
          timeout: seconds shown on the progress timer; the subprocess itself
              is given ``timeout + 1`` seconds.

      Returns:
          An ArchiveResult with ``status`` ``'succeeded'`` and ``output`` set
          to the relative output folder, or ``status`` ``'failed'`` and
          ``output`` set to the exception that was caught.
      """
      out_dir = out_dir or Path(link.link_dir)
      output: ArchiveOutput = get_output_path()
      output_path = out_dir / output
      output_path.mkdir(exist_ok=True)

      cmd = [
          CONFIG.GIT_BINARY,
          'clone',
          *CONFIG.GIT_ARGS,
          *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
          without_query(without_fragment(link.url)),
      ]
      status = 'succeeded'
      timer = TimedProgress(timeout, prefix=' ')
      try:
          result = run(cmd, cwd=str(output_path), timeout=timeout + 1)

          # Any non-zero code is a failure, including negative codes
          # (process killed by a signal), which a `> 0` check would miss.
          if result.returncode != 0:
              # 128 is git's generic fatal-error status, so on its own it does
              # not prove "destination already exists". Only tolerate it as a
              # failed re-download when an earlier clone left content in the
              # output folder; otherwise the clone genuinely failed.
              failed_redownload = (
                  result.returncode == 128
                  and any(output_path.iterdir())
              )
              if not failed_redownload:
                  hints = 'Got git response code: {}.'.format(result.returncode)
                  raise ArchiveError('Failed to save git clone', hints)

          chmod_file(output, cwd=str(out_dir))
      except Exception as err:
          # Extractor boundary: record the error in the result instead of
          # propagating it to the caller.
          status = 'failed'
          output = err
      finally:
          timer.end()

      return ArchiveResult(
          cmd=cmd,
          pwd=str(out_dir),
          cmd_version=CONFIG.GIT_VERSION,
          output=output,
          status=status,
          **timer.stats,
      )