import os

from functools import wraps
from collections import defaultdict
from datetime import datetime

from index import (
    wget_output_path,
    parse_json_link_index,
    write_link_index,
    patch_index_title_hack,
)
from config import (
    OUTPUT_DIR,
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    CHROME_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    SUBMIT_ARCHIVE_DOT_ORG,
    COOKIES_FILE,
    WGET_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_SANDBOX,
    TIMEOUT,
    MEDIA_TIMEOUT,
    ANSI,
    ARCHIVE_DIR,
    GIT_DOMAINS,
    GIT_SHA,
)
from util import (
    without_fragment,
    fetch_page_title,
    progress,
    chmod_file,
    pretty_path,
    check_link_structure,
    run, PIPE, DEVNULL,
)


_RESULTS_TOTALS = {   # globals are bad, mmkay
    'skipped': 0,
    'succeeded': 0,
    'failed': 0,
}


def archive_link(link_dir, link, overwrite=True):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    check_link_structure(link)

    try:
        update_existing = os.path.exists(link_dir)
        if update_existing:
            link = {
                **parse_json_link_index(link_dir),
                **link,
            }
        else:
            os.makedirs(link_dir)

        print_link_status_line(link_dir, link, update_existing)

        if FETCH_FAVICON:
            link = fetch_favicon(link_dir, link, overwrite=overwrite)

        if FETCH_TITLE:
            link = fetch_title(link_dir, link, overwrite=overwrite)

        if FETCH_WGET:
            link = fetch_wget(link_dir, link, overwrite=overwrite)

        if FETCH_PDF:
            link = fetch_pdf(link_dir, link, overwrite=overwrite)

        if FETCH_SCREENSHOT:
            link = fetch_screenshot(link_dir, link, overwrite=overwrite)

        if FETCH_DOM:
            link = fetch_dom(link_dir, link, overwrite=overwrite)

        if SUBMIT_ARCHIVE_DOT_ORG:
            link = archive_dot_org(link_dir, link, overwrite=overwrite)

        if FETCH_GIT:
            link = fetch_git(link_dir, link, overwrite=overwrite)

        if FETCH_MEDIA:
            link = fetch_media(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)
    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

    return link


def print_link_status_line(link_dir, link, update_existing):
    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
        symbol='*' if update_existing else '+',
        symbol_color=ANSI['black' if update_existing else 'green'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **{**link, 'title': link['title'] or link['url']},
        **ANSI,
    ))

    print('    > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
    # if link['type']:
    #     print('      i {}'.format(link['type']))


def attach_result_to_link(method):
    """
    Instead of returning a result={output:'...', status:'success'} object,
    attach that result to the link's history & latest fields, then return
    the updated link object.
    """
    def decorator(fetch_func):
        @wraps(fetch_func)
        def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
            # initialize methods and history json field on link
            link['latest'] = link.get('latest') or {}
            link['latest'][method] = link['latest'].get(method) or None
            link['history'] = link.get('history') or {}
            link['history'][method] = link['history'].get(method) or []

            start_ts = datetime.now().timestamp()

            # if a valid method output is already present, don't run the fetch function
            if link['latest'][method] and not overwrite:
                print('    √ {}'.format(method))
                result = None
            else:
                print('    > {}'.format(method))
                result = fetch_func(link_dir, link, **kwargs)

            end_ts = datetime.now().timestamp()
            duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]

            # append a history item recording fail/success
            history_entry = {
                'timestamp': str(start_ts).split('.')[0],
            }
            if result is None:
                history_entry['status'] = 'skipped'
            elif isinstance(result.get('output'), Exception):
                history_entry['status'] = 'failed'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
            else:
                history_entry['status'] = 'succeeded'
                history_entry['duration'] = duration
                history_entry.update(result or {})
                link['history'][method].append(history_entry)
                link['latest'][method] = result['output']

            _RESULTS_TOTALS[history_entry['status']] += 1

            return link
        return timed_fetch_func
    return decorator
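

# Illustrative sketch of the data this decorator produces (field values below are
# made up for clarity, they are not real output). After a fetch method runs, the
# link dict carries entries shaped roughly like:
#
#   link['latest']['wget']  == 'example.com/index.html'
#   link['history']['wget'] == [{
#       'timestamp': '1547250000',
#       'status': 'succeeded',              # or 'failed'
#       'duration': '1523',                 # milliseconds, as a string
#       'cmd': ['wget', '--no-verbose', ...],
#       'output': 'example.com/index.html',
#   }]
#
# Skipped runs only increment _RESULTS_TOTALS['skipped']; they are not appended
# to the history list.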


@attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
    """download full site using wget"""

    domain_dir = os.path.join(link_dir, link['domain'])
    existing_file = wget_output_path(link)
    if os.path.exists(domain_dir) and existing_file:
        return {'output': existing_file, 'status': 'skipped'}

    if warc:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    CMD = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if warc else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if warc else ()),
        *(('--page-requisites',) if requisites else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # index.html
        end()
        output = wget_output_path(link, look_in=domain_dir)

        output_tail = ['          ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]

        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            print('        Got wget response code {}:'.format(result.returncode))
            print('\n'.join(output_tail))
            if b'403: Forbidden' in result.stderr:
                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
            if b'404: Not Found' in result.stderr:
                raise Exception('404 Not Found')
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise Exception('500 Internal Server Error')
            raise Exception('Got an error from the server')
    except Exception as e:
        end()
        # to let the user copy-paste the command and run it safely, we have
        # to quote some of the arguments that could have spaces in them
        quoted_cmd = ' '.join(CMD)
        quoted_cmd = quoted_cmd.replace(WGET_USER_AGENT, '"{}"'.format(WGET_USER_AGENT))
        if COOKIES_FILE:
            quoted_cmd = quoted_cmd.replace(COOKIES_FILE, '"{}"'.format(COOKIES_FILE))
        print('        {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(quoted_cmd))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print PDF of site to file using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return {'output': 'output.pdf', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--print-to-pdf',
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # output.pdf
        end()
        if result.returncode:
            print('        ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to print PDF')
        chmod_file('output.pdf', cwd=link_dir)
        output = 'output.pdf'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
    """take screenshot of site using chrome --headless"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return {'output': 'screenshot.png', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--screenshot',
        '--window-size={}'.format(resolution),
        '--hide-scrollbars',
        '--timeout={}'.format(timeout * 1000),
        *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
        # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)  # screenshot.png
        end()
        if result.returncode:
            print('        ', (result.stderr or result.stdout).decode())
            raise Exception('Failed to take screenshot')
        chmod_file('screenshot.png', cwd=link_dir)
        output = 'screenshot.png'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
    """print HTML of site to file using chrome --dump-dom"""

    if link['type'] in ('PDF', 'image'):
        return {'output': wget_output_path(link)}

    output_path = os.path.join(link_dir, 'output.html')

    if os.path.exists(output_path):
        return {'output': 'output.html', 'status': 'skipped'}

    CMD = [
        *chrome_headless(user_data_dir=user_data_dir),
        '--dump-dom',
        '--timeout={}'.format(timeout * 1000),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        with open(output_path, 'w+') as f:
            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)  # output.html
        end()
        if result.returncode:
            print('        ', result.stderr.decode())
            raise Exception('Failed to fetch DOM')
        chmod_file('output.html', cwd=link_dir)
        output = 'output.html'
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('archive_org')
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """submit site to archive.org for archiving via their service, save returned archive url"""

    path = os.path.join(link_dir, 'archive.org.txt')
    if os.path.exists(path):
        archive_org_url = open(path, 'r').read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])

    success = False
    CMD = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
        '--max-time', str(timeout),
        '--get',
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        submit_url,
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)  # archive.org.txt
        end()

        # Parse archive.org response headers
        headers = defaultdict(list)

        # lowercase all the header names and store in dict
        for header in result.stdout.splitlines():
            if b':' not in header or not header.strip():
                continue
            name, val = header.decode().split(':', 1)
            headers[name.lower().strip()].append(val.strip())

        # Get successful archive url in "content-location" header or any errors
        content_location = headers['content-location']
        errors = headers['x-archive-wayback-runtime-error']

        if content_location:
            saved_url = 'https://web.archive.org{}'.format(content_location[0])
            success = True
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            output = submit_url
            # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
        elif errors:
            raise Exception(', '.join(errors))
        else:
            raise Exception('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            {}'.format(' '.join(CMD)))
        output = e

    if success:
        with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
            f.write(saved_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = saved_url

    return {
        'cmd': CMD,
        'output': output,
    }
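

# For reference, a successful Archive.org save typically answers with a header such as
#   Content-Location: /web/20190401000000/https://example.com/
# which the parsing above turns into
#   https://web.archive.org/web/20190401000000/https://example.com/
# (the timestamp and URL shown here are illustrative placeholders, not real output).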


@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """download site favicon from google's favicon api"""

    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return {'output': 'favicon.ico', 'status': 'skipped'}

    CMD = [
        CURL_BINARY,
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
    ]
    fout = open('{}/favicon.ico'.format(link_dir), 'w')
    end = progress(timeout, prefix='      ')
    try:
        run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout)  # favicon.ico
        fout.close()
        end()
        chmod_file('favicon.ico', cwd=link_dir)
        output = 'favicon.ico'
    except Exception as e:
        fout.close()
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
    """try to guess the page's title from its content"""

    # if link already has valid title, skip it
    if link['title'] and not link['title'].lower().startswith('http'):
        return {'output': link['title'], 'status': 'skipped'}

    end = progress(timeout, prefix='      ')
    title = None  # stays None if fetching raises, so the index patch below is skipped
    try:
        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
        end()
        output = title
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        output = e

    # titles should show up in the global index immediately for better UX,
    # do a hacky immediate replacement to add them in as we're archiving
    # TODO: figure out how to do this without gnarly string replacement
    if title:
        link['title'] = title
        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),
        'output': output,
    }


@attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    # import ipdb; ipdb.set_trace()
    output = os.path.join(link_dir, 'media')
    already_done = os.path.exists(output)  # and os.listdir(output)
    if already_done and not overwrite:
        return {'output': 'media', 'status': 'skipped'}

    os.makedirs(output, exist_ok=True)
    CMD = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1)  # audio/audio.mp3
        chmod_file('media', cwd=link_dir)
        output = 'media'
        end()
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                print('        got youtubedl response code {}:'.format(result.returncode))
                print(result.stderr)
                raise Exception('Failed to download media')
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


@attach_result_to_link('git')
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """download full site using git"""

    if not (link['domain'] in GIT_DOMAINS
            or link['url'].endswith('.git')
            or link['type'] == 'git'):
        return

    if os.path.exists(os.path.join(link_dir, 'git')):
        return {'output': 'git', 'status': 'skipped'}

    CMD = [
        GIT_BINARY,
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        'clone',
        '--mirror',
        '--recursive',
        without_fragment(link['url']),
    ]
    output = 'git'

    end = progress(timeout, prefix='      ')
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # git/<reponame>
        end()

        if result.returncode > 0:
            print('        got git response code {}:'.format(result.returncode))
            raise Exception('Failed git download')
    except Exception as e:
        end()
        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
        print('        Run to see full output:')
        print('            cd {};'.format(link_dir))
        print('            {}'.format(' '.join(CMD)))
        output = e

    return {
        'cmd': CMD,
        'output': output,
    }


def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
    args = [binary, '--headless']  # '--disable-gpu'
    if not CHROME_SANDBOX:
        args.append('--no-sandbox')
    default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome')
    if user_data_dir:
        args.append('--user-data-dir={}'.format(user_data_dir))
    elif os.path.exists(default_profile):
        args.append('--user-data-dir={}'.format(default_profile))
    return args
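

# Rough usage sketch (illustrative only): archive_link expects a link dict shaped
# like the ones produced elsewhere in the codebase, e.g.
#
#   link = {
#       'url': 'https://example.com',
#       'domain': 'example.com',
#       'timestamp': '1547250000',
#       'title': 'Example Domain',
#       'type': None,
#   }
#   link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
#   link = archive_link(link_dir, link)
#
# The exact field set comes from the index/parser modules; the values above are
# made up for illustration.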