util.py

import os
import re
import sys
import time

from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote
from decimal import Decimal
from datetime import datetime
from multiprocessing import Process

from stdlib_patches import run, PIPE, DEVNULL

from config import (
    ANSI,
    TERM_WIDTH,
    REPO_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    CURL_BINARY,
    WGET_BINARY,
    CHROME_BINARY,
    GIT_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_TITLE,
    FETCH_FAVICON,
    FETCH_WGET,
    FETCH_WARC,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    ARCHIVE_DIR_NAME,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_HEADLESS,
    CHROME_SANDBOX,
)

### Parsing Helpers

# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
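
# A few illustrative examples of the helpers above (the URL is hypothetical,
# the outputs are just what urlparse produces for it):
#
#   scheme('https://example.com/a/b.html?c=1#d')     -> 'https'
#   domain('https://example.com/a/b.html?c=1#d')     -> 'example.com'
#   basename('https://example.com/a/b.html?c=1#d')   -> 'b.html'
#   extension('https://example.com/a/b.html?c=1#d')  -> 'html'
#   base_url('https://example.com/a/b.html?c=1#d')   -> 'example.com/a/b.html?c=1#d'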

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'    # start matching text after <title> tag
    r'(.[^<>]+)',    # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}

### Checks & Tests

def check_link_structure(link):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str)
    assert len(link['url']) > 2
    assert len(re.findall(URL_REGEX, link['url'])) == 1

def check_links_structure(links):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(links, list)
    if links:
        check_link_structure(links[0])

def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""

    try:
        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
        if python_vers < 3.5:
            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
            raise SystemExit(1)

        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_WGET or FETCH_WARC:
            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
            try:
                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
                version_str = result.stdout.decode('utf-8')
                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
                version = [l for l in version_lines if l.isdigit()][-1]
                if int(version) < 59:
                    print(version_lines)
                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                    raise SystemExit(1)
            except (IndexError, TypeError, OSError):
                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_GIT:
            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_MEDIA:
            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
                print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)
    except (KeyboardInterrupt, Exception):
        raise SystemExit(1)

def check_url_parsing_invariants():
    """Check that plain text regex URL parsing works as expected"""

    # this is last-line-of-defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12

### Random Helpers

def save_stdin_source(raw_text):
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    return source_path
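
# Note: pretty_path() is called by save_remote_source() below but is not defined
# anywhere in this listing. A minimal sketch, assuming it simply strips the repo
# prefix using the imported (and otherwise unused) REPO_DIR, as the name suggests:
def pretty_path(path):
    """convert an absolute path under the repo into a shorter, readable relative form"""
    return path.replace(REPO_DIR + '/', '')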

def save_remote_source(url, timeout=TIMEOUT):
    """download a given url's content into output/sources/domain-<timestamp>.txt"""

    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))

    print('{}[*] [{}] Downloading {}{}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        ANSI['reset'],
    ))
    end = progress(TIMEOUT, prefix='    ')
    try:
        downloaded_xml = download_url(url, timeout=timeout)
        end()
    except Exception as e:
        end()
        print('{}[!] Failed to download {}{}\n'.format(
            ANSI['red'],
            url,
            ANSI['reset'],
        ))
        print('    ', e)
        raise SystemExit(1)

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)

    print('    > {}'.format(pretty_path(source_path)))

    return source_path

def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        if progress:
            sys.stdout.write('.')
            sys.stdout.flush()

        html = download_url(url, timeout=timeout)

        match = re.search(HTML_TITLE_REGEX, html)
        return match.group(1).strip() if match else None
    except Exception as err:
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None

def wget_output_path(link):
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """

    # if we have it stored, always prefer the actual output path to the computed one
    if link.get('latest', {}).get('wget'):
        return link['latest']['wget']

    if is_static_file(link['url']):
        return urlencode(without_scheme(without_fragment(link['url'])))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, or extensions like shtml / htm.
    #
    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget itself,
    # instead of trying to reverse-engineer how it calculates the filename,
    # we just look in the output folder and read the filename wget used from the filesystem.
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    full_path = without_fragment(without_query(path(link['url']))).strip('/')
    search_dir = os.path.join(
        link_dir,
        domain(link['url']),
        full_path,
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
                    return urlencode(os.path.join(path_from_link_dir, html_files[0]))

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link_dir:
            break

    return None

    # If finding the actual output file didn't work, fall back to the buggy
    # implementation of the wget .html appending algorithm:
    # split_url = link['url'].split('#', 1)
    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
    #
    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
    #     # already ends in .html
    #     return urlencode(base_url(link['url']))
    # else:
    #     # .html needs to be appended
    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
    #     if without_scheme.endswith('/'):
    #         if query:
    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
    #     else:
    #         if query:
    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
    #         elif '/' in without_scheme:
    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
    #         return urlencode(base_url(link['url']) + '/index.html')
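
# Illustrative (hypothetical) example of the lookup above, assuming wget has already
# written its output under output/archive/<timestamp>/ for this link:
#
#   wget_output_path({'url': 'https://example.com/abc', 'timestamp': '1544077777'})
#   # -> 'example.com/abc.html'   (whatever filename wget actually left on disk)
#   # -> None                     (if no matching .html file can be found)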

### String Manipulation & Logging Helpers

def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content

### Link Helpers

def merge_links(a, b):
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
    earlier = lambda key: a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
    return {
        'url': url,
        'timestamp': earlier('timestamp'),
        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
        'tags': longer('tags'),
        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
    }
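
# Illustrative example with two hypothetical duplicate links (not from the original
# module): the earlier timestamp, the cleaner title, and the union of sources win.
#
#   merge_links(
#       {'url': 'https://example.com', 'timestamp': '1544077777', 'title': 'https://example.com', 'tags': '', 'sources': ['a.txt']},
#       {'url': 'https://example.com', 'timestamp': '1544077700', 'title': 'Example Domain', 'tags': 'docs', 'sources': ['b.txt']},
#   )
#   # -> {'url': 'https://example.com', 'timestamp': '1544077700', 'title': 'Example Domain',
#   #     'tags': 'docs', 'sources': ['a.txt', 'b.txt']}   (source order may vary)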

def is_static_file(url):
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """

    # TODO: the proper way is with MIME type detection, not using extension
    return extension(url) in STATICFILE_EXTENSIONS

def derived_link_info(link):
    """extend link info with the archive urls and other derived data"""

    url = link['url']
    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')

    extended_info = {
        **link,
        'link_dir': '{}/{}'.format(ARCHIVE_DIR_NAME, link['timestamp']),
        'bookmarked_date': to_date_str(link['timestamp']),
        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
        'domain': domain(url),
        'path': path(url),
        'basename': basename(url),
        'extension': extension(url),
        'base_url': base_url(url),
        'is_static': is_static_file(url),
        'is_archived': os.path.exists(os.path.join(
            ARCHIVE_DIR,
            link['timestamp'],
            domain(url),
        )),
        'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
    }

    # Archive Method Output URLs
    extended_info.update({
        'index_url': 'index.html',
        'favicon_url': 'favicon.ico',
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
        'archive_url': wget_output_path(link),
        'warc_url': 'warc',
        'pdf_url': 'output.pdf',
        'screenshot_url': 'screenshot.png',
        'dom_url': 'output.html',
        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
        'git_url': 'git',
        'media_url': 'media',
    })

    # static binary files like PDF and images are handled slightly differently.
    # they're just downloaded once and aren't archived separately multiple times,
    # so the wget, screenshot, & pdf urls should all point to the same file
    if is_static_file(url):
        extended_info.update({
            'title': basename(url),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
            'dom_url': base_url(url),
        })

    return extended_info
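
# Illustrative (hypothetical) example of the static-file special case described above:
#
#   derived_link_info({'url': 'https://example.com/paper.pdf', 'timestamp': '1544077777'})
#   # includes 'is_static': True, 'title': 'paper.pdf', and archive_url / pdf_url /
#   # screenshot_url / dom_url all pointing at the same file: 'example.com/paper.pdf'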

### Python / System Helpers

def progress(seconds=TIMEOUT, prefix=''):
    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
    returns end() function to instantly finish the progress
    """

    if not SHOW_PROGRESS:
        return lambda: None

    def progress_bar(seconds, prefix):
        """show timer in the form of progress bar, with percentage and seconds remaining"""
        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
        try:
            for s in range(seconds * chunks):
                progress = s / chunks / seconds * 100
                bar_width = round(progress/(100/chunks))

                # ████████████████████           0.9% (1/60sec)
                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                    prefix,
                    ANSI['green'],
                    (chunk * bar_width).ljust(chunks),
                    ANSI['reset'],
                    round(progress, 1),
                    round(s/chunks),
                    seconds,
                ))
                sys.stdout.flush()
                time.sleep(1 / chunks)

            # ██████████████████████████████████ 100.0% (60/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
                prefix,
                ANSI['red'],
                chunk * chunks,
                ANSI['reset'],
                100.0,
                seconds,
                seconds,
            ))
            sys.stdout.flush()
        except KeyboardInterrupt:
            print()
            pass

    p = Process(target=progress_bar, args=(seconds, prefix))
    p.start()

    def end():
        """immediately finish progress and clear the progressbar line"""

        # protect from double termination
        #if p is None or not hasattr(p, 'kill'):
        #    return
        nonlocal p
        if p is not None:
            p.terminate()
        p = None

        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
        sys.stdout.flush()

    return end
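
# Typical usage (this is the same pattern used by save_remote_source() above):
#
#   end = progress(TIMEOUT, prefix='    ')
#   try:
#       result = download_url(url)
#       end()
#   except Exception:
#       end()
#       raise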

def download_url(url, timeout=TIMEOUT):
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'
    return resp.read().decode(encoding)

def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print('    ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR

def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
                headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
                check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
                resolution=RESOLUTION, timeout=TIMEOUT):
    """helper to build up a chrome shell command with arguments"""

    global CACHED_USER_DATA_DIR
    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
    cmd_args = [binary]

    if headless:
        cmd_args += ('--headless',)

    if not sandbox:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not check_ssl_validity:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += ('--user-agent={}'.format(user_agent),)

    if resolution:
        cmd_args += ('--window-size={}'.format(resolution),)

    if timeout:
        cmd_args += ('--timeout={}'.format(timeout * 1000),)

    # Find chrome user data directory
    default_profile_paths = (
        '~/.config/chromium',
        '~/.config/google-chrome',
        '~/.config/google-chrome-beta',
        '~/.config/google-chrome-unstable',
        '~/Library/Application Support/Chromium',
        '~/Library/Application Support/Google/Chrome',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Chromium/User Data',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/AppData/Local/Google/Chrome SxS/User Data',
    )
    if user_data_dir:
        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
    else:
        for profile_path in default_profile_paths:
            full_path = os.path.expanduser(profile_path)
            if os.path.exists(full_path):
                CACHED_USER_DATA_DIR = full_path
                cmd_args.append('--user-data-dir={}'.format(full_path))
                break

    return cmd_args
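
# Illustrative usage (a sketch, not necessarily how the rest of the codebase calls it):
# build the shared flag list once, then append the per-task flags and URL before running.
#
#   cmd = chrome_args(timeout=60) + ['--screenshot', 'https://example.com']
#   result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)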