# util.py

import os
import re
import sys
import time

from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote
from decimal import Decimal
from datetime import datetime
from multiprocessing import Process
from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
from config import (
    ANSI,
    TERM_WIDTH,
    REPO_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CURL_BINARY,
    WGET_BINARY,
    CHROME_BINARY,
    GIT_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_TITLE,
    FETCH_FAVICON,
    FETCH_WGET,
    FETCH_WARC,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    ARCHIVE_DIR_NAME,
)


### Parsing Helpers

# URL parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # unique base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]
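
# Illustrative behavior of the helpers above (assuming standard urlparse
# semantics; these examples are not executed as part of this module):
#   scheme('https://example.com/a/b.html?q=1#frag')      -> 'https'
#   without_scheme('https://example.com/a/b.html?q=1')   -> 'example.com/a/b.html?q=1'
#   basename('https://example.com/a/b.html')             -> 'b.html'
#   extension('https://example.com/a/b.html')            -> 'html'
#   base_url('https://example.com/a/b.html')             -> 'example.com/a/b.html'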

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed percent-encoded bytes
    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
    re.IGNORECASE,
)

HTML_TITLE_REGEX = re.compile(
    r'<title>'      # start matching text after the <title> tag
    r'(.[^<>]+)',   # capture everything up to the next < or >
    re.IGNORECASE,
)
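
# Example usage of the two regexes above (illustrative, not executed here):
#   re.findall(URL_REGEX, 'see https://example.com/page?id=1 for details')
#   -> ['https://example.com/page?id=1']
#   match = re.search(HTML_TITLE_REGEX, '<html><head><title>Example Page</title></head>')
#   match.group(1)  -> 'Example Page'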


### Checks & Tests

def check_link_structure(link):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str)
    assert len(link['url']) > 2
    assert len(re.findall(URL_REGEX, link['url'])) == 1


def check_links_structure(links):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(links, list)
    if links:
        check_link_structure(links[0])


def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""

    # compare version_info tuples instead of floats so e.g. 3.10 doesn't parse as 3.1
    python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
    if sys.version_info[:2] < (3, 5):
        print('{}[X] Python version is not new enough: {} (>=3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
        print('    See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
        raise SystemExit(1)

    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_WGET or FETCH_WARC:
        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

        # parse the major version out of `chrome --version` output, e.g.
        # "Google Chrome 61.0.3114.0 canary" or "Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04"
        try:
            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
            version_str = result.stdout.decode('utf-8')
            version_lines = re.sub(r'(Google Chrome|Chromium) (\d+?)\.(\d+?)\.(\d+?).*?$', r'\2', version_str).split('\n')
            version = [l for l in version_lines if l.isdigit()][-1]
            if int(version) < 59:
                print(version_lines)
                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                print('    See https://github.com/pirate/ArchiveBox for help.')
                raise SystemExit(1)
        except (IndexError, TypeError, OSError):
            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_GIT:
        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_MEDIA:
        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
            print('    See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)


def check_url_parsing():
    """Check that plain text regex URL parsing works as expected"""

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com

    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12


### Random Helpers

def save_stdin_source(raw_text):
    """save text piped in on stdin to output/sources/stdin-<timestamp>.txt"""
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    return source_path


def save_remote_source(url, timeout=TIMEOUT):
    """download a given url's content into output/sources/domain-<timestamp>.txt"""
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))

    print('{}[*] [{}] Downloading {}{}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        ANSI['reset'],
    ))
    end = progress(TIMEOUT, prefix='    ')
    try:
        downloaded_xml = download_url(url, timeout=timeout)
        end()
    except Exception as e:
        end()
        print('{}[!] Failed to download {}{}\n'.format(
            ANSI['red'],
            url,
            ANSI['reset'],
        ))
        print('    ', e)
        raise SystemExit(1)

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)

    print('    > {}'.format(pretty_path(source_path)))

    return source_path


def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        if progress:
            sys.stdout.write('.')
            sys.stdout.flush()

        html = download_url(url, timeout=timeout)

        match = re.search(HTML_TITLE_REGEX, html)
        return match.group(1).strip() if match else None
    except Exception as err:
        # the title is optional, so don't bother the user if fetching it fails
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None


def wget_output_path(link, look_in=None):
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """

    # if we have it stored, always prefer the actual output path to the computed one
    if link.get('latest', {}).get('wget'):
        return link['latest']['wget']

    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

    if link['type'] in ('PDF', 'image'):
        return urlencode(base_url(link['url']))

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # instead of trying to emulate it here we just look in the output folder
    # to see which html file wget actually created as the output
    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
    look_in = look_in or os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)

    if look_in and os.path.exists(look_in):
        html_files = [
            f for f in os.listdir(look_in)
            if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', f, re.I | re.M)
        ]
        if html_files:
            return urlencode(os.path.join(*wget_folder, html_files[0]))

    return None

    # If finding the actual output file didn't work, fall back to the buggy
    # implementation of the wget .html appending algorithm:
    # split_url = link['url'].split('#', 1)
    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
    # if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', split_url[0], re.I | re.M):
    #     # already ends in .html
    #     return urlencode(base_url(link['url']))
    # else:
    #     # .html needs to be appended
    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
    #     if without_scheme.endswith('/'):
    #         if query:
    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
    #     else:
    #         if query:
    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
    #         elif '/' in without_scheme:
    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
    #         return urlencode(base_url(link['url']) + '/index.html')
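
# Example (hypothetical link dict; the result depends on which html file wget
# actually wrote inside the archive folder):
#   link = {'url': 'https://example.com/page', 'timestamp': '1541234567', 'type': None}
#   wget_output_path(link)  # -> 'example.com/page.html' if that file exists, else None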


### String Manipulation & Logging Helpers

def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content


def pretty_path(path):
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
    return path.replace(REPO_DIR + '/', '')


def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
    """quote the arguments that contain whitespace in a command so the user can
    copy-paste the printed string to re-run the cmd directly
    """

    quoted_cmd = ' '.join(
        '"{}"'.format(arg) if ' ' in arg else arg
        for arg in cmd
    )

    output_lines = [
        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
        '  {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
        'Run to see full output:',
        '    cd {};'.format(pwd),
        '    {}'.format(quoted_cmd),
    ]

    return '\n'.join(
        '{}{}'.format(prefix, line)
        for line in output_lines
        if line
    )
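
# Example usage (illustrative values):
#   try:
#       run(['wget', '--mirror', 'https://example.com'], timeout=1)
#   except Exception as err:
#       print(print_error_hints(
#           cmd=['wget', '--mirror', 'https://example.com'],
#           pwd='/path/to/output/archive/1541234567',
#           err=err,
#           hints='Try increasing the TIMEOUT config value.',
#       ))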


### Link Helpers

def merge_links(a, b):
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
    earlier = lambda key: a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
    link = {
        'timestamp': earlier('timestamp'),
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'tags': longer('tags'),
        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
    }
    link['type'] = get_link_type(link)
    return link
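
# For example, merging the same bookmark imported from two different sources
# (hypothetical data) keeps the earlier timestamp and prefers the title that
# isn't just a copy of the url:
#   a = {'timestamp': '1541234567', 'url': 'https://example.com', 'title': 'Example Site', 'tags': '', 'sources': ['ril_export.html']}
#   b = {'timestamp': '1541239999', 'url': 'https://example.com', 'title': 'https://example.com', 'tags': 'tech', 'sources': ['pinboard_export.json']}
#   merged = merge_links(a, b)
#   merged['timestamp'] -> '1541234567', merged['title'] -> 'Example Site', merged['tags'] -> 'tech'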


def get_link_type(link):
    """Certain types of links need to be handled specially, this figures out when that's the case"""

    if extension(link['url']) == 'pdf':
        return 'PDF'
    elif extension(link['url']) in ('png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
        return 'image'
    elif 'wikipedia.org' in domain(link['url']).lower():
        return 'wiki'
    elif 'youtube.com' in domain(link['url']).lower():
        return 'youtube'
    elif 'soundcloud.com' in domain(link['url']).lower():
        return 'soundcloud'
    elif 'youku.com' in domain(link['url']).lower():
        return 'youku'
    elif 'vimeo.com' in domain(link['url']).lower():
        return 'vimeo'
    return None
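
# e.g. (illustrative):
#   get_link_type({'url': 'https://example.com/file.pdf'})        -> 'PDF'
#   get_link_type({'url': 'https://en.wikipedia.org/wiki/Cats'})  -> 'wiki'
#   get_link_type({'url': 'https://example.com/article'})         -> None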


def derived_link_info(link):
    """extend link info with the archive urls and other derived data"""

    url = link['url']
    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')

    extended_info = {
        **link,
        'link_dir': '{}/{}'.format(ARCHIVE_DIR_NAME, link['timestamp']),
        'bookmarked_date': to_date_str(link['timestamp']),
        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
        'domain': domain(url),
        'path': path(url),
        'basename': basename(url),
        'base_url': base_url(url),
    }

    # Archive method output URLs
    extended_info = {
        **extended_info,
        'index_url': 'index.html',
        'favicon_url': 'favicon.ico',
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
        'archive_url': wget_output_path(link) or 'index.html',
        'warc_url': 'warc',
        'pdf_url': 'output.pdf',
        'screenshot_url': 'screenshot.png',
        'dom_url': 'output.html',
        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
        'git_url': 'git',
        'media_url': 'media',
    }

    # PDFs and images are handled slightly differently:
    # the wget, screenshot, & pdf urls all point to the same file
    if link['type'] in ('PDF', 'image'):
        extended_info.update({
            'title': basename(link['url']),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
            'dom_url': base_url(url),
        })

    return extended_info


### Python / System Helpers

def progress(seconds=TIMEOUT, prefix=''):
    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
    returns end() function to instantly finish the progress
    """

    if not SHOW_PROGRESS:
        return lambda: None

    def progress_bar(seconds, prefix):
        """show timer in the form of progress bar, with percentage and seconds remaining"""
        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
        try:
            for s in range(seconds * chunks):
                progress = s / chunks / seconds * 100
                bar_width = round(progress / (100 / chunks))

                # ████████████████████           0.9% (1/60sec)
                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                    prefix,
                    ANSI['green'],
                    (chunk * bar_width).ljust(chunks),
                    ANSI['reset'],
                    round(progress, 1),
                    round(s / chunks),
                    seconds,
                ))
                sys.stdout.flush()
                time.sleep(1 / chunks)

            # ██████████████████████████████████ 100.0% (60/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
                prefix,
                ANSI['red'],
                chunk * chunks,
                ANSI['reset'],
                100.0,
                seconds,
                seconds,
            ))
            sys.stdout.flush()
        except KeyboardInterrupt:
            print()

    p = Process(target=progress_bar, args=(seconds, prefix))
    p.start()

    def end():
        """immediately finish progress and clear the progressbar line"""
        nonlocal p
        if p is not None:  # protect from double termination
            p.terminate()
            p = None
        sys.stdout.write('\r{}{}\r'.format(' ' * TERM_WIDTH, ANSI['reset']))  # clear the whole terminal line
        sys.stdout.flush()

    return end
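
# Example usage (a sketch): render a bar for up to TIMEOUT seconds while a slow
# call runs, then clear it as soon as the call finishes or fails:
#   end = progress(TIMEOUT, prefix='    ')
#   try:
#       result = some_slow_call()  # hypothetical long-running function
#       end()
#   except Exception:
#       end()
#       raise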


def download_url(url, timeout=TIMEOUT):
    """download the contents of a remote url, decoded using the response charset (or utf-8)"""
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'
    return resp.read().decode(encoding)
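
# e.g. html = download_url('https://example.com', timeout=30)
# (honors CHECK_SSL_VALIDITY, so certificate errors only raise when it's enabled)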


def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print('    ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
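
# e.g. chmod_file('output.pdf', cwd='/path/to/output/archive/1541234567')
# (hypothetical paths; applies OUTPUT_PERMISSIONS recursively to the given path)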


def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run to work around blocking io that made timeout= ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                # after killing, give the process 2 more seconds to flush any remaining output
                stdout, stderr = process.communicate(input, timeout=2)
            except Exception:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise

        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)

    return CompletedProcess(process.args, retcode, stdout, stderr)
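
# Example usage (illustrative): same interface as subprocess.run, but per the
# docstring above the timeout stays effective even with blocked output pipes:
#   result = run(['wget', '--version'], stdout=PIPE, timeout=10)
#   print(result.returncode, result.stdout.decode().splitlines()[0])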