wget.py

__package__ = 'archivebox.extractors'

import re

from pathlib import Path
from typing import Optional
from datetime import datetime, timezone

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
    dedupe,
)
from ..config import (
    WGET_ARGS,
    WGET_EXTRA_ARGS,
    TIMEOUT,
    SAVE_WGET,
    SAVE_WARC,
    WGET_BINARY,
    WGET_VERSION,
    RESTRICT_FILE_NAMES,
    CHECK_SSL_VALIDITY,
    SAVE_WGET_REQUISITES,
    WGET_AUTO_COMPRESSION,
    WGET_USER_AGENT,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

    return SAVE_WGET


@enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or Path(link.link_dir)
    if SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
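        # NOTE (assumption): this timestamp path is only the prefix passed to --warc-file
        # below; wget itself appends the .warc.gz (or .warc) extension when writing the archive.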

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    # later options take precedence
    options = [
        *WGET_ARGS,
        *WGET_EXTRA_ARGS,
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        # '--server-response',  # print headers for better error parsing
    ]
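
    # NOTE (assumption): dedupe() below is expected to collapse repeated flags so that
    # options appearing later in the list (e.g. WGET_EXTRA_ARGS entries) win over the
    # earlier defaults from WGET_ARGS, matching the "later options take precedence" note above.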
    cmd = [
        WGET_BINARY,
        *dedupe(options),
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )
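
        # NOTE (assumption): these hint strings are attached to any ArchiveError raised below
        # so wget's summary lines can be surfaced alongside the failure reason in the CLI output.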

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)

        if (out_dir / output).exists():
            chmod_file(output, cwd=str(out_dir))
        else:
            print(f' {out_dir}/{output}')
            raise ArchiveError('Failed to find wget output after running', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
    #
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, or extensions like shtml / htm / php / etc.
    #
    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget,
    # we just look in the output folder and read the filename wget used from the filesystem,
    # instead of trying to reverse-engineer how wget calculates it.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
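    # Illustration (hypothetical values, assuming path()/domain() parse the URL like urllib):
    # for link.url = 'https://example.com:8080/abc/test?v=1#top', full_path would be 'abc/test'
    # and the starting search_dir would be <link_dir>/example.com+8080/abc/test,
    # which is then walked upward by the loop below.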
    for _ in range(4):
        if search_dir.exists():
            if search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                ]
                if html_files:
                    return str(html_files[0].relative_to(link.link_dir))

                # sometimes wget'd URLs have no ext and return non-html
                # e.g. /some/example/rss/all -> some RSS XML content
                #      /some/other/url.o4g   -> some binary unrecognized ext
                # test this with: archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                for file_present in search_dir.iterdir():
                    if file_present.name == last_part_of_url:
                        return str(file_present.relative_to(link.link_dir))

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isn't an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
    if files_within:
        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    # fallback to just the domain dir without port
    search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
    if search_dir.is_dir():
        return domain(link.url).split(":", 1)[0]

    return None
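

# Usage sketch (hypothetical, for illustration only; in ArchiveBox these extractors are
# normally driven by the archiving pipeline rather than called directly). Given an existing
# Link instance `link`:
#
#     if should_save_wget(link, out_dir=Path(link.link_dir)):
#         result = save_wget(link, out_dir=Path(link.link_dir), timeout=TIMEOUT)
#         print(result.status, result.output)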