archive_methods.py

import os

from functools import wraps
from collections import defaultdict
from datetime import datetime

from index import (
    wget_output_path,
    parse_json_link_index,
    write_link_index,
    patch_index_title_hack,
)
from config import (
    OUTPUT_DIR,
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    CHROME_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    SUBMIT_ARCHIVE_DOT_ORG,
    COOKIES_FILE,
    WGET_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_SANDBOX,
    TIMEOUT,
    MEDIA_TIMEOUT,
    ANSI,
    ARCHIVE_DIR,
    GIT_DOMAINS,
    GIT_SHA,
)
from util import (
    domain,
    without_fragment,
    fetch_page_title,
    progress,
    chmod_file,
    pretty_path,
    check_link_structure,
    run, PIPE, DEVNULL,
)

_RESULTS_TOTALS = {   # globals are bad, mmkay
    'skipped': 0,
    'succeded': 0,
    'failed': 0,
}

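# archive_link() is the entry point for a single link: it runs every enabled
# fetch_* method below in order, merges their results back into the link dict,
# and re-writes the per-link JSON index when it's done.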
def archive_link(link_dir, link, overwrite=True):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    check_link_structure(link)

    try:
        update_existing = os.path.exists(link_dir)
        if update_existing:
            link = {
                **parse_json_link_index(link_dir),
                **link,
            }
        else:
            os.makedirs(link_dir)

        print_link_status_line(link_dir, link, update_existing)

        if FETCH_FAVICON:
            link = fetch_favicon(link_dir, link, overwrite=overwrite)

        if FETCH_TITLE:
            link = fetch_title(link_dir, link, overwrite=overwrite)

        if FETCH_WGET:
            link = fetch_wget(link_dir, link, overwrite=overwrite)

        if FETCH_PDF:
            link = fetch_pdf(link_dir, link, overwrite=overwrite)

        if FETCH_SCREENSHOT:
            link = fetch_screenshot(link_dir, link, overwrite=overwrite)

        if FETCH_DOM:
            link = fetch_dom(link_dir, link, overwrite=overwrite)

        if SUBMIT_ARCHIVE_DOT_ORG:
            link = archive_dot_org(link_dir, link, overwrite=overwrite)

        if FETCH_GIT:
            link = fetch_git(link_dir, link, overwrite=overwrite)

        if FETCH_MEDIA:
            link = fetch_media(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)
    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

    return link


def print_link_status_line(link_dir, link, update_existing):
    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
        symbol='*' if update_existing else '+',
        symbol_color=ANSI['black' if update_existing else 'green'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **{**link, 'title': link['title'] or link['url']},
        **ANSI,
    ))

    print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
    # if link['type']:
    #     print(' i {}'.format(link['type']))


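# Every fetch_* method below is wrapped by attach_result_to_link(method):
# the wrapper skips the fetch if link['latest'][method] already has output
# (unless overwrite=True), times the run, appends a history entry with
# status skipped/succeded/failed, and bumps the matching _RESULTS_TOTALS counter.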
def attach_result_to_link(method):
    """
    Instead of returning a result={output:'...', status:'success'} object,
    attach that result to the link's history & latest fields, then return
    the updated link object.
    """
    def decorator(fetch_func):
        @wraps(fetch_func)
        def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
            # initialize methods and history json field on link
            link['latest'] = link.get('latest') or {}
            link['latest'][method] = link['latest'].get(method) or None
            link['history'] = link.get('history') or {}
            link['history'][method] = link['history'].get(method) or []

            start_ts = datetime.now().timestamp()

            # if a valid method output is already present, don't run the fetch function
            if link['latest'][method] and not overwrite:
                print(' √ {}'.format(method))
                result = None
            else:
                print(' > {}'.format(method))
                result = fetch_func(link_dir, link, **kwargs)

            end_ts = datetime.now().timestamp()
            duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]

            # append a history item recording fail/success
            history_entry = {
                'timestamp': str(start_ts).split('.')[0],
            }
            if result is None:
                history_entry['status'] = 'skipped'
            elif isinstance(result.get('output'), Exception):
                history_entry['status'] = 'failed'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
            else:
                history_entry['status'] = 'succeded'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
                link['latest'][method] = result['output']

            _RESULTS_TOTALS[history_entry['status']] += 1

            return link
        return timed_fetch_func
    return decorator


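# Each fetcher below returns a {'cmd': ..., 'output': ...} dict; 'output' is a
# relative path or URL string on success, or the raised Exception on failure,
# which the decorator above records as a 'failed' history entry.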
@attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, domain(link['url']))
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print(' Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        # to let the user copy-paste the command and run it safely we have
        # to quote some of the arguments that could have spaces in them
        quoted_cmd = ' '.join(CMD)
        if WGET_USER_AGENT:
            quoted_cmd = quoted_cmd.replace(WGET_USER_AGENT, '"{}"'.format(WGET_USER_AGENT))
        if COOKIES_FILE:
            quoted_cmd = quoted_cmd.replace(COOKIES_FILE, '"{}"'.format(COOKIES_FILE))

        print(' {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(quoted_cmd))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


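# fetch_pdf, fetch_screenshot, and fetch_dom all drive headless Chrome via
# chrome_headless() below; note Chrome's --timeout flag is in milliseconds,
# hence the * 1000.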
@attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # output.pdf
        end()
        if result.returncode:
            print(' ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # screenshot.png
        end()
        if result.returncode:
            print(' ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')
    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)  # output.html
        end()
        if result.returncode:
            print(' ', result.stderr.decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


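# archive_dot_org() asks the Wayback Machine to save the page by requesting
# https://web.archive.org/save/<url> with curl --head, then reads the archived
# URL back out of the Content-Location response header.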
@attach_result_to_link('archive_org')
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url"""

    path = os.path.join(link_dir, 'archive.org.txt')
    if os.path.exists(path):
        with open(path, 'r') as f:
            archive_org_url = f.read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])

    success = False
    CMD = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
        '--max-time', str(timeout),
        '--get',
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        submit_url,
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)  # archive.org.txt
        end()

        # Parse archive.org response headers:
        # lowercase all the header names and store in dict
        headers = defaultdict(list)
        for header in result.stdout.splitlines():
            if b':' not in header or not header.strip():
                continue
            name, val = header.decode().split(':', 1)
            headers[name.lower().strip()].append(val.strip())

        # Get successful archive url in "content-location" header or any errors
        content_location = headers['content-location']
        errors = headers['x-archive-wayback-runtime-error']

        if content_location:
            saved_url = 'https://web.archive.org{}'.format(content_location[0])
            success = True
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            output = submit_url
            # raise Exception('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
        elif errors:
            raise Exception(', '.join(errors))
        else:
            raise Exception('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' {}'.format(' '.join(CMD)))
        output = e

    if success:
        with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
            f.write(saved_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = saved_url

    return {
        'cmd': CMD,
        'output': output,
    }


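# The favicon comes from Google's public favicon endpoint rather than the
# origin site, so it can usually be fetched even when the page itself is slow
# or unreachable.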
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return {'output': 'favicon.ico', 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    fout = open('{}/favicon.ico'.format(link_dir), 'w')
    end = progress(timeout, prefix=' ')
    try:
        run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout)  # favicon.ico
        fout.close()
        end()
        chmod_file('favicon.ico', cwd=link_dir)
        output = 'favicon.ico'
    except Exception as e:
        fout.close()
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
    """try to guess the page's title from its content"""

    # if link already has valid title, skip it
    if link['title'] and not link['title'].lower().startswith('http'):
        return {'output': link['title'], 'status': 'skipped'}

    title = None   # stays None if the fetch below fails
    end = progress(timeout, prefix=' ')
    try:
        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
        end()
        output = title
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    # titles should show up in the global index immediately for better UX,
    # do a hacky immediate replacement to add them in as we're archiving
    # TODO: figure out how to do this without gnarly string replacement
    if title:
        link['title'] = title
        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),
        'output': output,
    }


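# fetch_media uses MEDIA_TIMEOUT instead of TIMEOUT since playlists and large
# videos can take far longer than a page fetch; common "this isn't a media
# page" youtube-dl errors are swallowed rather than reported as failures.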
@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    output = os.path.join(link_dir, 'media')
    already_done = os.path.exists(output)   # and os.listdir(output)
    if already_done and not overwrite:
        return {'output': 'media', 'status': 'skipped'}

    os.makedirs(output, exist_ok=True)
    CMD = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1)  # audio/audio.mp3
        chmod_file('media', cwd=link_dir)
        output = 'media'
        end()
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                    or b'HTTP Error 404' in result.stderr
                    or b'HTTP Error 403' in result.stderr
                    or b'URL could be a direct video link' in result.stderr
                    or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                print(' got youtubedl response code {}:'.format(result.returncode))
                print(result.stderr)
                raise Exception('Failed to download media')
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


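# fetch_git only runs for links whose domain is in GIT_DOMAINS, whose URL ends
# in .git, or whose type is 'git'; everything else is skipped by the early
# return below.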
@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """download full site using git"""

    if not (domain(link['url']) in GIT_DOMAINS
            or link['url'].endswith('.git')
            or link['type'] == 'git'):
        return

    if os.path.exists(os.path.join(link_dir, 'git')):
        return {'output': 'git', 'status': 'skipped'}

    CMD = [
        GIT_BINARY,
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        'clone',
        '--mirror',
        '--recursive',
        without_fragment(link['url']),
    ]
    output = 'git'
    end = progress(timeout, prefix=' ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # git/<reponame>
        end()
        if result.returncode > 0:
            print(' got git response code {}:'.format(result.returncode))
            raise Exception('Failed git download')
    except Exception as e:
        end()
        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print(' Run to see full output:')
        print(' cd {};'.format(link_dir))
        print(' {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


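# Shared helper that builds the base headless-Chrome argv used by the PDF,
# screenshot, and DOM fetchers; if no CHROME_USER_DATA_DIR is configured it
# falls back to the default macOS Chrome profile directory when present.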
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
    args = [binary, '--headless']   # '--disable-gpu'
    if not CHROME_SANDBOX:
        args.append('--no-sandbox')
    default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome')
    if user_data_dir:
        args.append('--user-data-dir={}'.format(user_data_dir))
    elif os.path.exists(default_profile):
        args.append('--user-data-dir={}'.format(default_profile))
    return args