util.py

import os
import re
import sys
import json
import time
import shutil

from json import JSONEncoder
from typing import List, Optional, Any, Union
from inspect import signature
from functools import wraps
from hashlib import sha256
from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from multiprocessing import Process
from subprocess import (
    Popen,
    PIPE,
    DEVNULL,
    CompletedProcess,
    TimeoutExpired,
    CalledProcessError,
)

from base32_crockford import encode as base32_encode  # type: ignore

from .schema import Link
from .config import (
    ANSI,
    TERM_WIDTH,
    SOURCES_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    FETCH_TITLE,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_OPTIONS,
)
from .logs import pretty_path

### Parsing Helpers

# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
# strip the www. before stripping the scheme, since without_www needs '://' present to match
fuzzy_url = lambda url: without_trailing_slash(without_scheme(without_www(url.lower())))
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)

short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
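
# A few illustrative results for the helpers above (example URLs invented,
# not from the source):
#   >>> scheme('https://example.com/a/b.html?c=1#d')
#   'https'
#   >>> without_scheme('https://example.com/a/b.html')
#   'example.com/a/b.html'
#   >>> extension('https://example.com/photo.JPG')
#   'jpg'
#   >>> fuzzy_url('https://www.Example.com/path/')
#   'example.com/path'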

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>"\'\s]+',           # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'    # start matching text after <title> tag
    r'(.[^<>]+)',    # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}

### Checks & Tests

def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints
    """
    # TODO: check return type as well

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        sig = signature(func)

        def check_argument_type(arg_key, arg_val):
            try:
                annotation = sig.parameters[arg_key].annotation
            except KeyError:
                annotation = None

            if annotation is not None and annotation.__class__ is type:
                if not isinstance(arg_val, annotation):
                    raise TypeError(
                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                            func.__name__,
                            arg_key,
                            annotation.__name__,
                            type(arg_val).__name__,
                            arg_key,
                            str(arg_val)[:64],
                        )
                    )

        # check args
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
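
# A minimal sketch of the decorator's behavior (hypothetical function,
# not from the source):
#   >>> @enforce_types
#   ... def greet(name: str) -> str:
#   ...     return 'hello ' + name
#   >>> greet(42)
#   Traceback (most recent call last):
#       ...
#   TypeError: greet(..., name: str) got unexpected int argument name=42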

def check_url_parsing_invariants() -> None:
    """Check that plain text regex URL parsing works as expected"""

    # this is last-line-of-defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive
    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12

### Random Helpers

@enforce_types
def handle_stdin_import(raw_text: str) -> str:
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
    atomic_write(raw_text, source_path)
    return source_path

@enforce_types
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
    """download a given url's content (or copy a local file) into output/sources/<name>-<timestamp>.txt"""

    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            path,
            ANSI['reset'],
        ))
        timer = TimedProgress(timeout, prefix='    ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print('    ', e)
            raise SystemExit(1)
    else:
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(raw_source_text, source_path)

    print('    > {}'.format(pretty_path(source_path)))

    return source_path

@enforce_types
def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
    """Attempt to guess a page's title by downloading the html"""

    if not FETCH_TITLE:
        return None

    try:
        html = download_url(url, timeout=timeout)
        match = re.search(HTML_TITLE_REGEX, html)
        return htmldecode(match.group(1).strip()) if match else None
    except Exception as err:  # noqa
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None

@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget,
    # in order to avoid having to reverse-engineer how it calculates the path,
    # we just look in the output folder and read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    return None

### String Manipulation & Logging Helpers

@enforce_types
def str_between(string: str, start: str, end: Optional[str]=None) -> str:
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""

    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content

@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings"""

    if date is None:
        return None

    if isinstance(date, datetime):
        return date

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        if date.replace('.', '').isdigit():
            # this is a brittle attempt at unix timestamp parsing (which is
            # notoriously hard to do). It may lead to dates being off by
            # anything from hours to decades, depending on which app, OS,
            # and system time configuration was used for the original timestamp
            # more info: https://github.com/pirate/ArchiveBox/issues/119

            # Note: always always always store the original timestamp string
            # somewhere independently of the parsed datetime, so that later
            # bugs don't repeatedly misparse and rewrite increasingly worse dates.
            # the correct date can always be re-derived from the timestamp str
            timestamp = float(date)

            EARLIEST_POSSIBLE = 473403600.0   # 1985
            LATEST_POSSIBLE = 1735707600.0    # 2025

            if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
                # number is seconds
                return datetime.fromtimestamp(timestamp)
            elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
                # number is milliseconds
                return datetime.fromtimestamp(timestamp / 1000)
            elif EARLIEST_POSSIBLE * 1000 * 1000 < timestamp < LATEST_POSSIBLE * 1000 * 1000:
                # number is microseconds
                return datetime.fromtimestamp(timestamp / (1000 * 1000))
            else:
                # fall through to the end and raise a parsing-failed error;
                # we don't want to even attempt parsing timestamp strings that
                # aren't within these ranges
                pass

        if '-' in date:
            try:
                return datetime.fromisoformat(date)
            except Exception:
                try:
                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
                except Exception:
                    pass

    raise ValueError('Tried to parse invalid date! {}'.format(date))
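
# Illustrative calls (example values assumed; fromtimestamp() results depend
# on the local timezone, so no exact output is shown for the numeric forms):
#   >>> parse_date('2019-05-03 16:15')
#   datetime.datetime(2019, 5, 3, 16, 15)
#   parse_date('1556900000')     # in seconds range      -> fromtimestamp(ts)
#   parse_date('1556900000000')  # in milliseconds range -> fromtimestamp(ts / 1000)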

### Link Helpers

@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    url = a.url if len(a.url) > len(b.url) else b.url

    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=len)
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    tags_set = (
        set(tag.strip() for tag in (a.tags or '').split(','))
        | set(tag.strip() for tag in (b.tags or '').split(','))
    )
    tags = ','.join(tags_set) or None

    sources = list(set(a.sources + b.sources))

    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )
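
# Hypothetical usage (field values invented; assumes only the Link fields
# used in the constructor call above):
#   a = Link(url='https://example.com/page', timestamp='1556900000',
#            title='Example Page', tags='news', sources=['feed.txt'], history={})
#   b = Link(url='https://example.com/page/', timestamp='1556900001',
#            title=None, tags='tech', sources=['bookmarks.html'], history={})
#   merge_links(a, b)  # keeps the longer url, the earlier timestamp,
#                      # and the union of the tags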

@enforce_types
def is_static_file(url: str) -> bool:
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """
    # TODO: the proper way is with MIME type detection, not using extension
    return extension(url) in STATICFILE_EXTENSIONS
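
# e.g. (example URLs invented for illustration):
#   >>> is_static_file('https://example.com/report.pdf')
#   True
#   >>> is_static_file('https://example.com/about.html')
#   False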

@enforce_types
def derived_link_info(link: Link) -> dict:
    """extend link info with the archive urls and other derived data"""

    info = link._asdict(extended=True)
    info.update(link.canonical_outputs())

    return info

### Python / System Helpers

def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run that fixes the blocking io that made timeout= ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except Exception:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise

        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)

    return CompletedProcess(process.args, retcode, stdout, stderr)
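
# Minimal usage sketch (example command invented for illustration):
#   result = run(['echo', 'hello'], capture_output=True, timeout=5)
#   assert result.returncode == 0
#   assert result.stdout == b'hello\n'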

class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        if SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {'start_ts': datetime.now(), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""

        end_ts = datetime.now()
        self.stats['end_ts'] = end_ts
        if SHOW_PROGRESS:
            # protect from double termination
            # if p is None or not hasattr(p, 'kill'):
            #     return
            if self.p is not None:
                self.p.terminate()
            self.p = None

            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))  # clear whole terminal line
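
# One reasonable usage pattern (similar to handle_file_import above;
# do_the_slow_work is a hypothetical placeholder):
#   timer = TimedProgress(TIMEOUT, prefix='    ')
#   try:
#       do_the_slow_work()
#   finally:
#       timer.end()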

@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining"""

    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH() - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for s in range(seconds * chunks):
            chunks = TERM_WIDTH() - len(prefix) - 20
            progress = s / chunks / seconds * 100
            bar_width = round(progress / (100 / chunks))

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(progress, 1),
                round(s / chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except KeyboardInterrupt:
        print()

@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
    """Download the contents of a remote url and return the text"""

    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'  # type: ignore
    return resp.read().decode(encoding)
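
# Example call (assumed URL; requires network access):
#   html = download_url('https://example.com', timeout=TIMEOUT)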

@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
    """chmod -R <permissions> <cwd>/<path>"""

    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print('    ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
    if os.path.exists(to_path):
        shutil.rmtree(to_path)
    shutil.copytree(from_path, to_path)

@enforce_types
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""

    options = {**CHROME_OPTIONS, **options}

    cmd_args = [options['CHROME_BINARY']]

    if options['CHROME_HEADLESS']:
        cmd_args += ('--headless',)

    if not options['CHROME_SANDBOX']:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if options['CHROME_USER_AGENT']:
        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)

    if options['RESOLUTION']:
        cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if options['TIMEOUT']:
        cmd_args += ('--timeout={}'.format(options['TIMEOUT'] * 1000),)

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

    return cmd_args
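
# Sketch of a resulting command (option values are illustrative assumptions;
# the actual defaults come from CHROME_OPTIONS in .config):
#   chrome_args(CHROME_BINARY='chromium-browser', CHROME_HEADLESS=True,
#               RESOLUTION='1440,900', TIMEOUT=60)
#   # -> ['chromium-browser', '--headless', ..., '--window-size=1440,900',
#   #     '--timeout=60000']   (note TIMEOUT is converted to milliseconds)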

class ExtendedEncoder(JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        cls_name = obj.__class__.__name__

        if hasattr(obj, '_asdict'):
            return obj._asdict()

        elif isinstance(obj, bytes):
            return obj.decode()

        elif isinstance(obj, datetime):
            return obj.isoformat()

        elif isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return JSONEncoder.default(self, obj)
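
# Example (values invented for illustration):
#   >>> json.dumps({'fetched_at': datetime(2019, 5, 3, 16, 15)}, cls=ExtendedEncoder)
#   '{"fetched_at": "2019-05-03T16:15:00"}'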

def atomic_write(contents: Union[dict, str], path: str) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename"""

    tmp_file = '{}.tmp'.format(path)
    try:
        with open(tmp_file, 'w+', encoding='utf-8') as f:
            if isinstance(contents, dict):
                json.dump(contents, f, indent=4, cls=ExtendedEncoder)
            else:
                f.write(contents)
            os.fsync(f.fileno())

        os.rename(tmp_file, path)
        chmod_file(path)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
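
# Usage sketch (paths invented for illustration):
#   atomic_write('one url per line\n', os.path.join(SOURCES_DIR, 'stdin-import.txt'))
#   atomic_write({'url': 'https://example.com'}, 'index.json')  # dicts are JSON-encoded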