git.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. __package__ = 'archivebox.extractors'
  2. import os
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. domain,
  10. extension,
  11. without_query,
  12. without_fragment,
  13. )
  14. from ..config import (
  15. TIMEOUT,
  16. SAVE_GIT,
  17. GIT_BINARY,
  18. GIT_VERSION,
  19. GIT_DOMAINS,
  20. CHECK_SSL_VALIDITY
  21. )
  22. from ..logging import TimedProgress
  23. @enforce_types
  24. def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
  25. out_dir = out_dir or link.link_dir
  26. if is_static_file(link.url):
  27. return False
  28. if os.path.exists(os.path.join(out_dir, 'git')):
  29. return False
  30. is_clonable_url = (
  31. (domain(link.url) in GIT_DOMAINS)
  32. or (extension(link.url) == 'git')
  33. )
  34. if not is_clonable_url:
  35. return False
  36. return SAVE_GIT
  37. @enforce_types
  38. def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  39. """download full site using git"""
  40. out_dir = out_dir or link.link_dir
  41. output: ArchiveOutput = 'git'
  42. output_path = os.path.join(out_dir, str(output))
  43. os.makedirs(output_path, exist_ok=True)
  44. cmd = [
  45. GIT_BINARY,
  46. 'clone',
  47. '--recursive',
  48. *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
  49. without_query(without_fragment(link.url)),
  50. ]
  51. status = 'succeeded'
  52. timer = TimedProgress(timeout, prefix=' ')
  53. try:
  54. result = run(cmd, cwd=output_path, timeout=timeout + 1)
  55. if result.returncode == 128:
  56. # ignore failed re-download when the folder already exists
  57. pass
  58. elif result.returncode > 0:
  59. hints = 'Got git response code: {}.'.format(result.returncode)
  60. raise ArchiveError('Failed to save git clone', hints)
  61. chmod_file(output, cwd=out_dir)
  62. except Exception as err:
  63. status = 'failed'
  64. output = err
  65. finally:
  66. timer.end()
  67. return ArchiveResult(
  68. cmd=cmd,
  69. pwd=out_dir,
  70. cmd_version=GIT_VERSION,
  71. output=output,
  72. status=status,
  73. **timer.stats,
  74. )