__package__ = 'archivebox.extractors'

import os
import re

from typing import Optional
from datetime import datetime

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run
from ..util import (
    enforce_types,
    is_static_file,
    without_scheme,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
)
from ..config import (
    TIMEOUT,
    SAVE_WGET,
    SAVE_WARC,
    WGET_BINARY,
    WGET_VERSION,
    RESTRICT_FILE_NAMES,
    CHECK_SSL_VALIDITY,
    SAVE_WGET_REQUISITES,
    WGET_AUTO_COMPRESSION,
    WGET_USER_AGENT,
    COOKIES_FILE,
)
from ..cli.logging import TimedProgress


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or link.link_dir
    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
        return False

    return SAVE_WGET


@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
    if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]
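
    # Illustrative assembled invocation (a sketch only: which flags appear
    # depends on the config values above, and the warc timestamp will differ):
    #
    #   wget --no-verbose --adjust-extension --convert-links --force-directories \
    #        --backup-converted --span-hosts --no-parent -e robots=off --timeout=60 \
    #        --warc-file=warc/1594146965 --page-requisites --user-agent='...' \
    #        'https://example.com'
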
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
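        # e.g. 'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'.split(' ', 2)
        # yields ['Downloaded:', '76', 'files, 4.0M in 1.6s (2.52 MB/s)'],
        # so index [1] is the file count as a string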

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)

        # chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
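
# Hypothetical usage sketch (comment only; `load_link_from_index` is an
# assumed helper for illustration, not part of this module):
#
#   link = load_link_from_index('https://example.com')
#   if should_save_wget(link):
#       result = save_wget(link, out_dir=link.link_dir, timeout=60)
#       print(result.status, result.output)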


@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different from the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, or extensions like shtml / htm / php / etc.

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget directly,
    # instead of trying to reverse-engineer how it's calculated we just look
    # in the output folder and read the filename wget used from the filesystem.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )
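    # e.g. for link.url = 'https://example.com/abc/test?v=1' (hypothetical values):
    #   full_path  == 'abc/test'
    #   search_dir == '<link_dir>/example.com/abc/test'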

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                # match any *.html, *.htm, *.shtml, etc. files in this dir
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    return None
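
# Illustrative walk of the fallback loop above (hypothetical filesystem state):
#   search_dir = '<link_dir>/example.com/abc/test'   -> does not exist on disk
#   search_dir = '<link_dir>/example.com/abc'        -> exists, contains 'test?v=1.html'
#   returns 'example.com/abc/test?v=1.html'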