archive_methods.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657
  1. import os
  2. import re
  3. import sys
  4. from functools import wraps
  5. from collections import defaultdict
  6. from datetime import datetime
  7. from peekable import Peekable
  8. from index import wget_output_path, parse_json_link_index, write_link_index
  9. from links import links_after_timestamp
  10. from config import (
  11. CURL_BINARY,
  12. GIT_BINARY,
  13. WGET_BINARY,
  14. YOUTUBEDL_BINARY,
  15. CHROME_BINARY,
  16. FETCH_FAVICON,
  17. FETCH_TITLE,
  18. FETCH_WGET,
  19. FETCH_WGET_REQUISITES,
  20. FETCH_PDF,
  21. FETCH_SCREENSHOT,
  22. FETCH_DOM,
  23. FETCH_WARC,
  24. FETCH_GIT,
  25. FETCH_MEDIA,
  26. RESOLUTION,
  27. CHECK_SSL_VALIDITY,
  28. SUBMIT_ARCHIVE_DOT_ORG,
  29. COOKIES_FILE,
  30. WGET_USER_AGENT,
  31. CHROME_USER_DATA_DIR,
  32. CHROME_SANDBOX,
  33. TIMEOUT,
  34. MEDIA_TIMEOUT,
  35. ANSI,
  36. ARCHIVE_DIR,
  37. GIT_DOMAINS,
  38. GIT_SHA,
  39. )
  40. from util import (
  41. without_hash,
  42. check_dependencies,
  43. fetch_page_title,
  44. progress,
  45. chmod_file,
  46. pretty_path,
  47. run, PIPE, DEVNULL
  48. )
  49. _RESULTS_TOTALS = { # globals are bad, mmkay
  50. 'skipped': 0,
  51. 'succeded': 0,
  52. 'failed': 0,
  53. }
  54. def archive_links(archive_path, links, source=None, resume=None):
  55. check_dependencies()
  56. to_archive = Peekable(links_after_timestamp(links, resume))
  57. idx, link = 0, to_archive.peek(0)
  58. try:
  59. for idx, link in enumerate(to_archive):
  60. link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
  61. archive_link(link_dir, link)
  62. except (KeyboardInterrupt, SystemExit, Exception) as e:
  63. print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
  64. **ANSI,
  65. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  66. idx=idx+1,
  67. timestamp=link['timestamp'],
  68. total=len(links),
  69. ))
  70. print(' Continue where you left off by running:')
  71. print(' {} {}'.format(
  72. pretty_path(sys.argv[0]),
  73. link['timestamp'],
  74. ))
  75. if not isinstance(e, KeyboardInterrupt):
  76. raise e
  77. raise SystemExit(1)
  78. def archive_link(link_dir, link, overwrite=True):
  79. """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
  80. try:
  81. update_existing = os.path.exists(link_dir)
  82. if update_existing:
  83. link = {
  84. **parse_json_link_index(link_dir),
  85. **link,
  86. }
  87. else:
  88. os.makedirs(link_dir)
  89. log_link_archive(link_dir, link, update_existing)
  90. if FETCH_FAVICON:
  91. link = fetch_favicon(link_dir, link, overwrite=overwrite)
  92. if FETCH_TITLE:
  93. link = fetch_title(link_dir, link, overwrite=overwrite)
  94. if FETCH_WGET:
  95. link = fetch_wget(link_dir, link, overwrite=overwrite)
  96. if FETCH_PDF:
  97. link = fetch_pdf(link_dir, link, overwrite=overwrite)
  98. if FETCH_SCREENSHOT:
  99. link = fetch_screenshot(link_dir, link, overwrite=overwrite)
  100. if FETCH_DOM:
  101. link = fetch_dom(link_dir, link, overwrite=overwrite)
  102. if SUBMIT_ARCHIVE_DOT_ORG:
  103. link = archive_dot_org(link_dir, link, overwrite=overwrite)
  104. if FETCH_GIT:
  105. link = fetch_git(link_dir, link, overwrite=overwrite)
  106. if FETCH_MEDIA:
  107. link = fetch_media(link_dir, link, overwrite=overwrite)
  108. write_link_index(link_dir, link)
  109. except Exception as err:
  110. print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
  111. return link
  112. def log_link_archive(link_dir, link, update_existing):
  113. print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
  114. symbol='*' if update_existing else '+',
  115. symbol_color=ANSI['black' if update_existing else 'green'],
  116. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  117. **{**link, 'title': link['title'] or link['url']},
  118. **ANSI,
  119. ))
  120. print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
  121. # if link['type']:
  122. # print(' i {}'.format(link['type']))
  123. def attach_result_to_link(method):
  124. """
  125. Instead of returning a result={output:'...', status:'success'} object,
  126. attach that result to the links's history & latest fields, then return
  127. the updated link object.
  128. """
  129. def decorator(fetch_func):
  130. @wraps(fetch_func)
  131. def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
  132. # initialize methods and history json field on link
  133. link['latest'] = link.get('latest') or {}
  134. link['latest'][method] = link['latest'].get(method) or None
  135. link['history'] = link.get('history') or {}
  136. link['history'][method] = link['history'].get(method) or []
  137. start_ts = datetime.now().timestamp()
  138. # if a valid method output is already present, dont run the fetch function
  139. if link['latest'][method] and not overwrite:
  140. print(' √ {}'.format(method))
  141. result = None
  142. else:
  143. print(' > {}'.format(method))
  144. result = fetch_func(link_dir, link, **kwargs)
  145. end_ts = datetime.now().timestamp()
  146. duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
  147. # append a history item recording fail/success
  148. history_entry = {
  149. 'timestamp': str(start_ts).split('.')[0],
  150. }
  151. if result is None:
  152. history_entry['status'] = 'skipped'
  153. elif isinstance(result.get('output'), Exception):
  154. history_entry['status'] = 'failed'
  155. history_entry['duration'] = duration
  156. history_entry.update(result or {})
  157. link['history'][method].append(history_entry)
  158. else:
  159. history_entry['status'] = 'succeded'
  160. history_entry['duration'] = duration
  161. history_entry.update(result or {})
  162. link['history'][method].append(history_entry)
  163. link['latest'][method] = result['output']
  164. _RESULTS_TOTALS[history_entry['status']] += 1
  165. return link
  166. return timed_fetch_func
  167. return decorator
  168. @attach_result_to_link('wget')
  169. def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
  170. """download full site using wget"""
  171. domain_dir = os.path.join(link_dir, link['domain'])
  172. existing_file = wget_output_path(link)
  173. if os.path.exists(domain_dir) and existing_file:
  174. return {'output': existing_file, 'status': 'skipped'}
  175. if warc:
  176. warc_dir = os.path.join(link_dir, 'warc')
  177. os.makedirs(warc_dir, exist_ok=True)
  178. warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
  179. # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
  180. CMD = [
  181. WGET_BINARY,
  182. # '--server-response', # print headers for better error parsing
  183. '--no-verbose',
  184. '--adjust-extension',
  185. '--convert-links',
  186. '--force-directories',
  187. '--backup-converted',
  188. '--span-hosts',
  189. '--no-parent',
  190. '-e', 'robots=off',
  191. '--restrict-file-names=unix',
  192. '--timeout={}'.format(timeout),
  193. *(() if warc else ('--timestamping',)),
  194. *(('--warc-file={}'.format(warc_path),) if warc else ()),
  195. *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
  196. *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
  197. *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
  198. *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
  199. link['url'],
  200. ]
  201. end = progress(timeout, prefix=' ')
  202. try:
  203. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # index.html
  204. end()
  205. output = wget_output_path(link, look_in=domain_dir)
  206. output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
  207. # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
  208. files_downloaded = (
  209. int(output_tail[-1].strip().split(' ', 2)[1] or 0)
  210. if 'Downloaded:' in output_tail[-1]
  211. else 0
  212. )
  213. # Check for common failure cases
  214. if result.returncode > 0 and files_downloaded < 1:
  215. print(' Got wget response code {}:'.format(result.returncode))
  216. print('\n'.join(output_tail))
  217. if b'403: Forbidden' in result.stderr:
  218. raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
  219. if b'404: Not Found' in result.stderr:
  220. raise Exception('404 Not Found')
  221. if b'ERROR 500: Internal Server Error' in result.stderr:
  222. raise Exception('500 Internal Server Error')
  223. raise Exception('Got an error from the server')
  224. except Exception as e:
  225. end()
  226. # to let the user copy-paste the command and run it safely we have
  227. # to quote some of the arguments that could have spaces in them
  228. quoted_cmd = ' '.join(CMD)
  229. quoted_cmd = quoted_cmd.replace(WGET_USER_AGENT, '"{}"'.format(WGET_USER_AGENT))
  230. if COOKIES_FILE:
  231. quoted_cmd = quoted_cmd.replace(COOKIES_FILE, '"{}"'.format(COOKIES_FILE))
  232. print(' {}Some resources were skipped: {}{}'.format(ANSI['lightyellow'], e, ANSI['reset']))
  233. print(' Run to see full output:')
  234. print(' cd {};'.format(link_dir))
  235. print(' {}'.format(quoted_cmd))
  236. output = e
  237. return {
  238. 'cmd': CMD,
  239. 'output': output,
  240. }
  241. @attach_result_to_link('pdf')
  242. def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
  243. """print PDF of site to file using chrome --headless"""
  244. if link['type'] in ('PDF', 'image'):
  245. return {'output': wget_output_path(link)}
  246. if os.path.exists(os.path.join(link_dir, 'output.pdf')):
  247. return {'output': 'output.pdf', 'status': 'skipped'}
  248. CMD = [
  249. *chrome_headless(user_data_dir=user_data_dir),
  250. '--print-to-pdf',
  251. '--hide-scrollbars',
  252. '--timeout={}'.format((timeout) * 1000),
  253. *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
  254. link['url']
  255. ]
  256. end = progress(timeout, prefix=' ')
  257. try:
  258. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # output.pdf
  259. end()
  260. if result.returncode:
  261. print(' ', (result.stderr or result.stdout).decode())
  262. raise Exception('Failed to print PDF')
  263. chmod_file('output.pdf', cwd=link_dir)
  264. output = 'output.pdf'
  265. except Exception as e:
  266. end()
  267. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  268. print(' Run to see full output:')
  269. print(' cd {};'.format(link_dir))
  270. print(' {}'.format(' '.join(CMD)))
  271. output = e
  272. return {
  273. 'cmd': CMD,
  274. 'output': output,
  275. }
  276. @attach_result_to_link('screenshot')
  277. def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
  278. """take screenshot of site using chrome --headless"""
  279. if link['type'] in ('PDF', 'image'):
  280. return {'output': wget_output_path(link)}
  281. if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
  282. return {'output': 'screenshot.png', 'status': 'skipped'}
  283. CMD = [
  284. *chrome_headless(user_data_dir=user_data_dir),
  285. '--screenshot',
  286. '--window-size={}'.format(resolution),
  287. '--hide-scrollbars',
  288. '--timeout={}'.format((timeout) * 1000),
  289. *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
  290. # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
  291. link['url'],
  292. ]
  293. end = progress(timeout, prefix=' ')
  294. try:
  295. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) # sreenshot.png
  296. end()
  297. if result.returncode:
  298. print(' ', (result.stderr or result.stdout).decode())
  299. raise Exception('Failed to take screenshot')
  300. chmod_file('screenshot.png', cwd=link_dir)
  301. output = 'screenshot.png'
  302. except Exception as e:
  303. end()
  304. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  305. print(' Run to see full output:')
  306. print(' cd {};'.format(link_dir))
  307. print(' {}'.format(' '.join(CMD)))
  308. output = e
  309. return {
  310. 'cmd': CMD,
  311. 'output': output,
  312. }
  313. @attach_result_to_link('dom')
  314. def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
  315. """print HTML of site to file using chrome --dump-html"""
  316. if link['type'] in ('PDF', 'image'):
  317. return {'output': wget_output_path(link)}
  318. output_path = os.path.join(link_dir, 'output.html')
  319. if os.path.exists(output_path):
  320. return {'output': 'output.html', 'status': 'skipped'}
  321. CMD = [
  322. *chrome_headless(user_data_dir=user_data_dir),
  323. '--dump-dom',
  324. '--timeout={}'.format((timeout) * 1000),
  325. link['url']
  326. ]
  327. end = progress(timeout, prefix=' ')
  328. try:
  329. with open(output_path, 'w+') as f:
  330. result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) # output.html
  331. end()
  332. if result.returncode:
  333. print(' ', (result.stderr).decode())
  334. raise Exception('Failed to fetch DOM')
  335. chmod_file('output.html', cwd=link_dir)
  336. output = 'output.html'
  337. except Exception as e:
  338. end()
  339. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  340. print(' Run to see full output:')
  341. print(' cd {};'.format(link_dir))
  342. print(' {}'.format(' '.join(CMD)))
  343. output = e
  344. return {
  345. 'cmd': CMD,
  346. 'output': output,
  347. }
  348. @attach_result_to_link('archive_org')
  349. def archive_dot_org(link_dir, link, timeout=TIMEOUT):
  350. """submit site to archive.org for archiving via their service, save returned archive url"""
  351. path = os.path.join(link_dir, 'archive.org.txt')
  352. if os.path.exists(path):
  353. archive_org_url = open(path, 'r').read().strip()
  354. return {'output': archive_org_url, 'status': 'skipped'}
  355. submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
  356. success = False
  357. CMD = [
  358. CURL_BINARY,
  359. '--location',
  360. '--head',
  361. '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
  362. '--max-time', str(timeout),
  363. '--get',
  364. *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
  365. submit_url,
  366. ]
  367. end = progress(timeout, prefix=' ')
  368. try:
  369. result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) # archive.org.txt
  370. end()
  371. # Parse archive.org response headers
  372. headers = defaultdict(list)
  373. # lowercase all the header names and store in dict
  374. for header in result.stdout.splitlines():
  375. if b':' not in header or not header.strip():
  376. continue
  377. name, val = header.decode().split(':', 1)
  378. headers[name.lower().strip()].append(val.strip())
  379. # Get successful archive url in "content-location" header or any errors
  380. content_location = headers['content-location']
  381. errors = headers['x-archive-wayback-runtime-error']
  382. if content_location:
  383. saved_url = 'https://web.archive.org{}'.format(content_location[0])
  384. success = True
  385. elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
  386. output = submit_url
  387. # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
  388. elif errors:
  389. raise Exception(', '.join(errors))
  390. else:
  391. raise Exception('Failed to find "content-location" URL header in Archive.org response.')
  392. except Exception as e:
  393. end()
  394. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  395. print(' Run to see full output:')
  396. print(' {}'.format(' '.join(CMD)))
  397. output = e
  398. if success:
  399. with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
  400. f.write(saved_url)
  401. chmod_file('archive.org.txt', cwd=link_dir)
  402. output = saved_url
  403. return {
  404. 'cmd': CMD,
  405. 'output': output,
  406. }
  407. @attach_result_to_link('favicon')
  408. def fetch_favicon(link_dir, link, timeout=TIMEOUT):
  409. """download site favicon from google's favicon api"""
  410. if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
  411. return {'output': 'favicon.ico', 'status': 'skipped'}
  412. CMD = [
  413. CURL_BINARY,
  414. '--max-time', str(timeout),
  415. *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
  416. 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
  417. ]
  418. fout = open('{}/favicon.ico'.format(link_dir), 'w')
  419. end = progress(timeout, prefix=' ')
  420. try:
  421. run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout) # favicon.ico
  422. fout.close()
  423. end()
  424. chmod_file('favicon.ico', cwd=link_dir)
  425. output = 'favicon.ico'
  426. except Exception as e:
  427. fout.close()
  428. end()
  429. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  430. print(' Run to see full output:')
  431. print(' {}'.format(' '.join(CMD)))
  432. output = e
  433. return {
  434. 'cmd': CMD,
  435. 'output': output,
  436. }
  437. @attach_result_to_link('title')
  438. def fetch_title(link_dir, link, timeout=TIMEOUT):
  439. """try to guess the page's title from its content"""
  440. # if link already has valid title, skip it
  441. if link['title'] and not link['title'].lower().startswith('http'):
  442. return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
  443. end = progress(timeout, prefix=' ')
  444. try:
  445. title = fetch_page_title(link['url'], timeout=timeout, progress=False)
  446. end()
  447. output = title
  448. except Exception as e:
  449. end()
  450. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  451. output = e
  452. return {
  453. 'cmd': 'fetch_page_title("{}")'.format(link['url']),
  454. 'output': output,
  455. }
  456. @attach_result_to_link('media')
  457. def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
  458. """Download playlists or individual video, audio, and subtitles using youtube-dl"""
  459. # import ipdb; ipdb.set_trace()
  460. output = os.path.join(link_dir, 'media')
  461. already_done = os.path.exists(output) # and os.listdir(output)
  462. if already_done and not overwrite:
  463. return {'output': 'media', 'status': 'skipped'}
  464. os.makedirs(output, exist_ok=True)
  465. CMD = [
  466. YOUTUBEDL_BINARY,
  467. '--write-description',
  468. '--write-info-json',
  469. '--write-annotations',
  470. '--yes-playlist',
  471. '--write-thumbnail',
  472. '--no-call-home',
  473. '--no-check-certificate',
  474. '--user-agent',
  475. '--all-subs',
  476. '--extract-audio',
  477. '--keep-video',
  478. '--ignore-errors',
  479. '--geo-bypass',
  480. '--audio-format', 'mp3',
  481. '--audio-quality', '320K',
  482. '--embed-thumbnail',
  483. '--add-metadata',
  484. *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
  485. link['url'],
  486. ]
  487. end = progress(timeout, prefix=' ')
  488. try:
  489. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1) # audio/audio.mp3
  490. chmod_file('media', cwd=link_dir)
  491. output = 'media'
  492. end()
  493. if result.returncode:
  494. if (b'ERROR: Unsupported URL' in result.stderr
  495. or b'HTTP Error 404' in result.stderr
  496. or b'HTTP Error 403' in result.stderr
  497. or b'URL could be a direct video link' in result.stderr
  498. or b'Unable to extract container ID' in result.stderr):
  499. # These happen too frequently on non-media pages to warrant printing to console
  500. pass
  501. else:
  502. print(' got youtubedl response code {}:'.format(result.returncode))
  503. print(result.stderr)
  504. raise Exception('Failed to download media')
  505. except Exception as e:
  506. end()
  507. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  508. print(' Run to see full output:')
  509. print(' cd {};'.format(link_dir))
  510. print(' {}'.format(' '.join(CMD)))
  511. output = e
  512. return {
  513. 'cmd': CMD,
  514. 'output': output,
  515. }
  516. @attach_result_to_link('git')
  517. def fetch_git(link_dir, link, timeout=TIMEOUT):
  518. """download full site using git"""
  519. if not (link['domain'] in GIT_DOMAINS
  520. or link['url'].endswith('.git')
  521. or link['type'] == 'git'):
  522. return
  523. if os.path.exists(os.path.join(link_dir, 'git')):
  524. return {'output': 'git', 'status': 'skipped'}
  525. CMD = [
  526. GIT_BINARY,
  527. *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
  528. 'clone',
  529. '--mirror',
  530. '--recursive',
  531. without_hash(link['url']),
  532. ]
  533. output = 'git'
  534. end = progress(timeout, prefix=' ')
  535. try:
  536. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # git/<reponame>
  537. end()
  538. if result.returncode > 0:
  539. print(' got git response code {}:'.format(result.returncode))
  540. raise Exception('Failed git download')
  541. except Exception as e:
  542. end()
  543. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  544. print(' Run to see full output:')
  545. print(' cd {};'.format(link_dir))
  546. print(' {}'.format(' '.join(CMD)))
  547. output = e
  548. return {
  549. 'cmd': CMD,
  550. 'output': output,
  551. }
  552. def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
  553. args = [binary, '--headless'] # '--disable-gpu'
  554. if not CHROME_SANDBOX:
  555. args.append('--no-sandbox')
  556. default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome')
  557. if user_data_dir:
  558. args.append('--user-data-dir={}'.format(user_data_dir))
  559. elif os.path.exists(default_profile):
  560. args.append('--user-data-dir={}'.format(default_profile))
  561. return args