wget.py

__package__ = 'archivebox.extractors'

import re

from pathlib import Path
from typing import Optional
from datetime import datetime, timezone

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    without_fragment,
    without_query,
    path,
    domain,
    urldecode,
)
from ..config import (
    WGET_ARGS,
    TIMEOUT,
    SAVE_WGET,
    SAVE_WARC,
    WGET_BINARY,
    WGET_VERSION,
    RESTRICT_FILE_NAMES,
    CHECK_SSL_VALIDITY,
    SAVE_WGET_REQUISITES,
    WGET_AUTO_COMPRESSION,
    WGET_USER_AGENT,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

    return SAVE_WGET


@enforce_types
def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or Path(link.link_dir)
    if SAVE_WARC:
        warc_dir = out_dir / "warc"
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        *WGET_ARGS,
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]
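    # For reference, with typical defaults this assembles a command roughly like
    # (an illustrative assumption: the exact flags come from WGET_ARGS and the
    # config values above, so your build may differ):
    #   wget --no-verbose --adjust-extension --convert-links --force-directories \
    #        --page-requisites --timeout=60 --warc-file=<out_dir>/warc/<timestamp> <url>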

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )
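        # e.g. the split above turns 'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)' into '76'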
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)

        if (out_dir / output).exists():
            chmod_file(output, cwd=str(out_dir))
        else:
            print(f' {out_dir}/{output}')
            raise ArchiveError('Failed to find wget output after running', hints)

    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
    #
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget ahead of time,
    # we just look in the output folder and read back the filename wget actually used,
    # rather than trying to reverse-engineer how wget calculates it.
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
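
    # the computed path may not exist exactly as expected (e.g. the last URL segment
    # may have been saved as a file or renamed by --adjust-extension), so walk up
    # at most 4 directory levels looking for the file wget actually wrote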
    for _ in range(4):
        if search_dir.exists():
            if search_dir.is_dir():
                html_files = [
                    f for f in search_dir.iterdir()
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
                ]
                if html_files:
                    return str(html_files[0].relative_to(link.link_dir))

                # sometimes wget'd URLs have no ext and return non-html content
                # e.g. /some/example/rss/all  -> some RSS XML content
                #      /some/other/url.o4g    -> some binary with an unrecognized ext
                # test this with: archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
                for file_present in search_dir.iterdir():
                    if file_present.name == last_part_of_url:
                        return str((search_dir / file_present).relative_to(link.link_dir))

        # Move up one directory level
        search_dir = search_dir.parent

        if str(search_dir) == link.link_dir:
            break

    # check for literally any file present that isn't an empty folder
    domain_dir = Path(domain(link.url).replace(":", "+"))
    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
    if files_within:
        return str((domain_dir / files_within[-1]).relative_to(link.link_dir))

    # fallback to just the domain dir
    search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
    if search_dir.is_dir():
        return domain(link.url).replace(":", "+")

    return None
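

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption for readability, not part of the
# upstream module): roughly how a caller inside archivebox.extractors might
# drive this extractor for a single Link pulled from the index.
def _example_usage(link: Link) -> None:
    out_dir = Path(link.link_dir)
    if should_save_wget(link, out_dir=out_dir, overwrite=False):
        result = save_wget(link, out_dir=out_dir, timeout=TIMEOUT)
        # on success, result.output is the path to the saved HTML file relative
        # to out_dir; on failure it holds the exception that was raised
        print(result.status, result.output)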