wget.py

__package__ = 'archivebox.extractors'

import re

from pathlib import Path
from typing import Optional
from datetime import datetime, timezone

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
    dedupe,
)
from ..config import (
    WGET_ARGS,
    WGET_EXTRA_ARGS,
    TIMEOUT,
    SAVE_WGET,
    SAVE_WARC,
    WGET_BINARY,
    WGET_VERSION,
    RESTRICT_FILE_NAMES,
    CHECK_SSL_VALIDITY,
    SAVE_WGET_REQUISITES,
    WGET_AUTO_COMPRESSION,
    WGET_USER_AGENT,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


def get_output_path():
    # TODO: actually save output into this folder, instead of into {domain}/**/index.html
    return 'wget/'


def get_embed_path(archiveresult=None):
    if not archiveresult:
        return get_output_path()

    link = archiveresult.snapshot.as_link()
    return wget_output_path(link)


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

    return SAVE_WGET


@enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or Path(link.link_dir)
    if SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    # later options take precedence
    options = [
        *WGET_ARGS,
        *WGET_EXTRA_ARGS,
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        # '--server-response',  # print headers for better error parsing
    ]
    cmd = [
        WGET_BINARY,
        *dedupe(options),
        link.url,
    ]
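
    # Purely illustrative (an assumption about typical config, not executed code): with the
    # stock WGET_ARGS defaults and the toggles above enabled, dedupe(options) yields a
    # command roughly like:
    #   wget --no-verbose --adjust-extension --convert-links --force-directories \
    #        --backup-converted --span-hosts --no-parent -e robots=off \
    #        --timeout=60 --restrict-file-names=windows --warc-file=warc/<timestamp> \
    #        --page-requisites --compression=auto --user-agent='ArchiveBox/...' <url>
    # Exact flags depend on your WGET_ARGS/WGET_EXTRA_ARGS config; because later options
    # take precedence, WGET_EXTRA_ARGS can override any of the defaults above.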

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
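        # e.g. "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)".split(' ', 2)[1] == '76',
        # so files_downloaded is 76 for that summary line, and 0 if wget printed no summary at all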
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)

        if (out_dir / output).exists():
            chmod_file(output, cwd=str(out_dir))
        else:
            print(f' {out_dir}/{output}')
            raise ArchiveError('Failed to find wget output after running', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
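

# A minimal usage sketch (hypothetical values; normally the extractor pipeline in
# archivebox.extractors calls this for each Link pulled from the index):
#
#   if should_save_wget(link, out_dir=Path(link.link_dir)):
#       result = save_wget(link, out_dir=Path(link.link_dir), timeout=TIMEOUT)
#       if result.status == 'succeeded':
#           print('wget output saved to', Path(link.link_dir) / str(result.output))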


@enforce_types
def unsafe_wget_output_path(link: Link) -> Optional[str]:
    # There used to be a bunch of complex reverse-engineering path mapping logic here,
    # but it was removed in favor of just walking through the output folder recursively to try to find the
    # html file that wget produced. It's *much much much* slower than deriving it statically, and is currently
    # one of the main bottlenecks of ArchiveBox's performance (the output data is often on a slow HDD or network mount).
    # But it's STILL better than trying to figure out URL -> html filepath mappings ourselves from first principles.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
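    # Worked example (hypothetical link_dir): for https://example.com:8080/abc/test?v=1
    # the walk starts at <link_dir>/example.com+8080/abc/test and climbs one directory per
    # iteration (.../abc, then .../example.com+8080), looking for an html file at each level.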

    for _ in range(4):
        try:
            if search_dir.exists():
                if search_dir.is_dir():
                    html_files = [
                        f for f in search_dir.iterdir()
                        if re.search(r".+\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                    ]
                    if html_files:
                        return str(html_files[0].relative_to(link.link_dir))

                    # sometimes wget'd URLs have no ext and return non-html content
                    # e.g. /some/example/rss/all  -> some RSS XML content
                    #      /some/other/url.o4g    -> some binary with an unrecognized ext
                    # test this with: archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                    for file_present in search_dir.iterdir():
                        if file_present.name == last_part_of_url:
                            return str(file_present.relative_to(link.link_dir))
        except OSError:
            # OSError 36 and others can happen here, caused by trying to check for impossible paths
            # (paths derived from URLs can often contain illegal unicode characters or be too long,
            # causing the OS / filesystem to reject trying to open them with a system-level error)
            pass

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isn't an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
    if files_within:
        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

    # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
    # that it's better we just pretend it doesn't exist
    # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
    return None


@enforce_types
def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on: wget --adjust-extension (-E), --restrict-file-names=windows|unix|ascii, --convert-links

    WARNING: this function is extremely error prone because mapping URLs to filesystem paths deterministically
    is basically impossible. Every OS and filesystem has different requirements on what special characters are
    allowed, and URLs are *full* of all kinds of special characters, illegal unicode, and generally unsafe strings
    that you don't want anywhere near your filesystem. Also URLs can be obscenely long, but most filesystems don't
    accept paths longer than 250 characters. On top of all that, this function only exists to try to reverse engineer
    wget's approach to solving this problem, so this is a shittier, less tested version of their already insanely
    complicated attempt to do this. Here be dragons:
    - https://github.com/ArchiveBox/ArchiveBox/issues/549
    - https://github.com/ArchiveBox/ArchiveBox/issues/1373
    - https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
    - and probably many more that I didn't realize were caused by this...

    The only constructive thing we could possibly do to this function is to figure out how to remove it.
    Preach loudly to anyone who will listen: never attempt to map URLs to filesystem paths,
    and pray you never have to deal with the aftermath of someone else's attempt to do so...
    """
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html@v=zzVa_tX1OiI.html
    #
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc@v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html@v=zzVa_tX1OiI.html
    #
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test@v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
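    # Note: the '@' substituted for '?' and the '+' substituted for ':' above come from wget's
    # --restrict-file-names=windows quoting, and the appended .html comes from --adjust-extension,
    # so with different WGET_ARGS the on-disk names can differ from these examples.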

    cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'

    if not nocache:
        from django.core.cache import cache
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
    # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
    # 4 characters, paths with multiple extensions, etc. the list goes on...

    output_path = None
    try:
        output_path = unsafe_wget_output_path(link)
    except Exception as err:
        pass  # better to pretend it just failed to download than expose gnarly OSErrors to users

    # check for unprintable unicode characters
    # https://github.com/ArchiveBox/ArchiveBox/issues/1373
    if output_path:
        safe_path = output_path.encode('utf-8', 'replace').decode()
        if output_path != safe_path:
            # contains unprintable unicode characters that will break other parts of archivebox
            # better to pretend it doesn't exist and fallback to parent dir than crash archivebox
            output_path = None

    # check for a path that is just too long to safely handle across different OS's
    # https://github.com/ArchiveBox/ArchiveBox/issues/549
    if output_path and len(output_path) > 250:
        output_path = None
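    # (250 is an assumed safe margin: common filesystems cap individual filenames at ~255 bytes
    # and Windows historically caps whole paths at 260 characters)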

    if output_path:
        if not nocache:
            cache.set(cache_key, output_path)
        return output_path

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    # fallback to just the domain dir without port
    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
    if search_dir.is_dir():
        return domain(link.url).split(":", 1)[0]

    return None