util.py

import os
import re
import sys
import time

from typing import List, Dict, Any, Optional, Union

from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote
from decimal import Decimal
from datetime import datetime
from multiprocessing import Process
from subprocess import (
    Popen,
    PIPE,
    DEVNULL,
    CompletedProcess,
    TimeoutExpired,
    CalledProcessError,
)

from config import (
    ANSI,
    TERM_WIDTH,
    SOURCES_DIR,
    ARCHIVE_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    FETCH_TITLE,
    ARCHIVE_DIR_NAME,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_OPTIONS,
    PYTHON_PATH,
)
from logs import pretty_path

### Parsing Helpers

# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
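
# Illustrative sketch (not part of the original module) of what these helpers return
# for a hypothetical URL, based on standard urllib.parse behavior:
#   url = 'https://example.com/path/page.html?k=v#frag'
#   scheme(url)     -> 'https'
#   domain(url)     -> 'example.com'
#   path(url)       -> '/path/page.html'
#   basename(url)   -> 'page.html'
#   extension(url)  -> 'html'
#   base_url(url)   -> 'example.com/path/page.html?k=v#frag'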

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'                     # start matching text after <title> tag
    r'(.[^<>]+)',                     # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
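
# A rough illustration (not from the original source) of how these regexes behave:
#   re.findall(URL_REGEX, 'see https://example.com/a?b=1 and <http://foo.example>')
#       -> ['https://example.com/a?b=1', 'http://foo.example']
#   re.search(HTML_TITLE_REGEX, '<html><title>My Page</title></html>').group(1)
#       -> 'My Page'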

STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}

Link = Dict[str, Any]

### Checks & Tests

def check_link_structure(link: Link) -> None:
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str)
    assert len(link['url']) > 2
    assert len(re.findall(URL_REGEX, link['url'])) == 1
    if 'history' in link:
        assert isinstance(link['history'], dict), 'history must be a Dict'
        for key, val in link['history'].items():
            assert isinstance(key, str)
            assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])


def check_links_structure(links: List[Link]) -> None:
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(links, list)
    if links:
        check_link_structure(links[0])
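
# A hypothetical link dict that satisfies these invariants (for illustration only):
#   check_link_structure({
#       'url': 'https://example.com/page',
#       'timestamp': '1554984697.0',
#       'history': {'wget': []},
#   })  # passes silently; a malformed dict raises AssertionError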

def check_url_parsing_invariants() -> None:
    """Check that plain text regex URL parsing works as expected"""

    # this is last-line-of-defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12

### Random Helpers

def save_stdin_source(raw_text: str) -> str:
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    return source_path

def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
    """download a given url's content into output/sources/domain-<timestamp>.txt"""
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))

    print('{}[*] [{}] Downloading {}{}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        ANSI['reset'],
    ))
    timer = TimedProgress(timeout, prefix=' ')
    try:
        downloaded_xml = download_url(url, timeout=timeout)
        timer.end()
    except Exception as e:
        timer.end()
        print('{}[!] Failed to download {}{}\n'.format(
            ANSI['red'],
            url,
            ANSI['reset'],
        ))
        print(' ', e)
        raise SystemExit(1)

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)

    print(' > {}'.format(pretty_path(source_path)))
    return source_path

def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        if progress:
            sys.stdout.write('.')
            sys.stdout.flush()

        html = download_url(url, timeout=timeout)

        match = re.search(HTML_TITLE_REGEX, html)
        return match.group(1).strip() if match else None
    except Exception as err:  # noqa
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None
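
# Sketch of expected behavior (network-dependent, assuming FETCH_TITLE is enabled):
#   fetch_page_title('https://example.com', progress=False)       -> 'Example Domain'
#   fetch_page_title('https://thishostdoesnotexist.invalid')      -> None  (errors are swallowed)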

def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    if is_static_file(link['url']):
        return without_scheme(without_fragment(link['url']))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget,
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder and read the filename wget used from the filesystem
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    full_path = without_fragment(without_query(path(link['url']))).strip('/')
    search_dir = os.path.join(
        link_dir,
        domain(link['url']),
        full_path,
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link_dir:
            break

    return None
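
# Illustrative only (the return value depends entirely on what wget actually wrote to disk):
#   for link = {'url': 'https://example.com/abc/', 'timestamp': '1554984697'},
#   if wget saved <ARCHIVE_DIR>/1554984697/example.com/abc/index.html,
#   wget_output_path(link) returns 'example.com/abc/index.html' (relative to the link's folder).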

def read_js_script(script_name: str) -> str:
    script_path = os.path.join(PYTHON_PATH, 'scripts', script_name)

    with open(script_path, 'r') as f:
        return f.read().split('// INFO BELOW HERE')[0].strip()

### String Manipulation & Logging Helpers

def str_between(string: str, start: str, end: str=None) -> str:
    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content

### Link Helpers

def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
    earlier = lambda key: a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
    return {
        'url': url,
        'timestamp': earlier('timestamp'),
        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
        'tags': longer('tags'),
        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
    }
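
# Rough example of the merge behavior (hypothetical input dicts, not from the original source):
#   a = {'url': 'https://example.com/page', 'timestamp': '1554984695.0', 'title': 'Example Page',
#        'tags': '', 'sources': ['pocket.html']}
#   b = {'url': 'https://example.com/page', 'timestamp': '1554984699.0', 'title': None,
#        'tags': 'docs', 'sources': ['pinboard.json']}
#   merge_links(a, b) keeps the earlier timestamp '1554984695.0', the non-URL title
#   'Example Page', the non-empty tags 'docs', and the union of both sources lists
#   (order not guaranteed, since it goes through a set).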

def is_static_file(url: str) -> bool:
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """
    # TODO: the proper way is with MIME type detection, not using extension
    return extension(url) in STATICFILE_EXTENSIONS
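
# For example (based purely on the extension whitelist above):
#   is_static_file('https://example.com/report.pdf')  -> True
#   is_static_file('https://example.com/about.html')  -> False  (html pages get fully archived)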

def derived_link_info(link: Link) -> dict:
    """extend link info with the archive urls and other derived data"""
    url = link['url']
    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')

    extended_info = {
        **link,
        'link_dir': '{}/{}'.format(ARCHIVE_DIR_NAME, link['timestamp']),
        'bookmarked_date': to_date_str(link['timestamp']),
        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
        'domain': domain(url),
        'path': path(url),
        'basename': basename(url),
        'extension': extension(url),
        'base_url': base_url(url),
        'is_static': is_static_file(url),
        'is_archived': os.path.exists(os.path.join(
            ARCHIVE_DIR,
            link['timestamp'],
            domain(url),
        )),
        'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
    }

    # Archive Method Output URLs
    extended_info.update({
        'index_url': 'index.html',
        'favicon_url': 'favicon.ico',
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
        'archive_url': wget_output_path(link),
        'warc_url': 'warc',
        'pdf_url': 'output.pdf',
        'screenshot_url': 'screenshot.png',
        'dom_url': 'output.html',
        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
        'git_url': 'git',
        'media_url': 'media',
    })

    # static binary files like PDF and images are handled slightly differently.
    # they're just downloaded once and aren't archived separately multiple times,
    # so the wget, screenshot, & pdf urls should all point to the same file
    if is_static_file(url):
        extended_info.update({
            'title': basename(url),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
            'dom_url': base_url(url),
        })

    return extended_info
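
# Illustrative output for a hypothetical link (filesystem-dependent fields will vary):
#   info = derived_link_info({'url': 'https://example.com/page', 'timestamp': '1554984697'})
#   info['domain']           -> 'example.com'
#   info['link_dir']         -> '<ARCHIVE_DIR_NAME>/1554984697'
#   info['bookmarked_date']  -> e.g. '2019-04-11 12:11' (exact value depends on the local timezone)
#   info['is_archived'], info['archive_url'], info['num_outputs'] depend on what is already on disk.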

def latest_output(link: Link, status: str=None) -> Dict[str, Optional[str]]:
    """get the latest output that each archive method produced for link"""
    latest = {
        'title': None,
        'favicon': None,
        'wget': None,
        'warc': None,
        'pdf': None,
        'screenshot': None,
        'dom': None,
        'git': None,
        'media': None,
        'archive_org': None,
    }
    for archive_method in latest.keys():
        # get most recent successful result in history for each archive method
        history = link.get('history', {}).get(archive_method) or []
        history = filter(lambda result: result['output'], reversed(history))
        if status is not None:
            history = filter(lambda result: result['status'] == status, history)

        history = list(history)
        if history:
            latest[archive_method] = history[0]['output']

    return latest

### Python / System Helpers

def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run to fix blocking io making timeout ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise

        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)

    return CompletedProcess(process.args, retcode, stdout, stderr)
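
# Minimal usage sketch (hypothetical command, mirrors subprocess.run semantics):
#   result = run(['echo', 'hello'], capture_output=True, timeout=5)
#   result.returncode  -> 0
#   result.stdout      -> b'hello\n'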

class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        if SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {
            'start_ts': datetime.now(),
            'end_ts': None,
            'duration': None,
        }

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""
        end_ts = datetime.now()
        self.stats.update({
            'end_ts': end_ts,
            'duration': (end_ts - self.stats['start_ts']).seconds,
        })

        if SHOW_PROGRESS:
            # protect from double termination
            #if p is None or not hasattr(p, 'kill'):
            #    return
            if self.p is not None:
                self.p.terminate()
            self.p = None

            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))  # clear whole terminal line
            sys.stdout.flush()
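
# Typical usage sketch (the prefix and timeout values here are arbitrary):
#   timer = TimedProgress(TIMEOUT, prefix='    ')
#   try:
#       ...  # the slow work being timed
#   finally:
#       timer.end()
#   timer.stats['duration']  -> elapsed whole seconds as an int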

def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining"""
    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH() - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for s in range(seconds * chunks):
            chunks = TERM_WIDTH() - len(prefix) - 20
            progress = s / chunks / seconds * 100
            bar_width = round(progress/(100/chunks))

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(progress, 1),
                round(s/chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except KeyboardInterrupt:
        print()
        pass

def download_url(url: str, timeout: int=TIMEOUT) -> str:
    """Download the contents of a remote url and return the text"""
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'
    return resp.read().decode(encoding)
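
# Rough usage sketch (network-dependent, output will vary):
#   html = download_url('https://example.com', timeout=30)
#   '<title>' in html  -> True for most html pages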

def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
    """chmod -R <permissions> <cwd>/<path>"""
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""
    options = {**CHROME_OPTIONS, **options}
    cmd_args = [options['CHROME_BINARY']]

    if options['CHROME_HEADLESS']:
        cmd_args += ('--headless',)

    if not options['CHROME_SANDBOX']:
        # dont use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if options['CHROME_USER_AGENT']:
        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)

    if options['RESOLUTION']:
        cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if options['TIMEOUT']:
        cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

    return cmd_args
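
# Illustrative result, assuming a CHROME_OPTIONS config roughly like
# {'CHROME_BINARY': 'chromium-browser', 'CHROME_HEADLESS': True, 'CHROME_SANDBOX': True,
#  'CHECK_SSL_VALIDITY': True, 'CHROME_USER_AGENT': None, 'RESOLUTION': '1440,900',
#  'TIMEOUT': 60, 'CHROME_USER_DATA_DIR': None} (the real keys/values come from config.py):
#   chrome_args()  -> ['chromium-browser', '--headless', '--window-size=1440,900', '--timeout=60000']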