import os
import re
import sys
import json
import time
import shutil

from json import JSONEncoder
from typing import List, Optional, Any, Union
from inspect import signature
from functools import wraps
from hashlib import sha256
from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from multiprocessing import Process
from subprocess import (
    Popen,
    PIPE,
    DEVNULL,
    CompletedProcess,
    TimeoutExpired,
    CalledProcessError,
)

from base32_crockford import encode as base32_encode  # type: ignore

from core.schema import Link
from core.config import (
    ANSI,
    TERM_WIDTH,
    SOURCES_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    FETCH_TITLE,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_OPTIONS,
    PYTHON_DIR,
)
from core.logs import pretty_path

### Parsing Helpers

# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # unique base url used to dedupe links

without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)

short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
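
# A minimal, hedged self-check for the URL helpers above. It is not called
# anywhere; the example.com URL and expected values are illustrative only:
def _demo_url_helpers() -> None:
    url = 'https://www.example.com/path/page.html?v=1#frag'
    assert scheme(url) == 'https'
    assert domain(url) == 'www.example.com'
    assert basename(url) == 'page.html'
    assert extension(url) == 'html'
    assert without_query(url) == 'https://www.example.com/path/page.html#frag'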

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>"\'\s]+',           # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'    # start matching text after <title> tag
    r'(.[^<>]+)',    # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}

### Checks & Tests

def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints
    """
    # TODO: check return type as well

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        sig = signature(func)

        def check_argument_type(arg_key, arg_val):
            try:
                annotation = sig.parameters[arg_key].annotation
            except KeyError:
                annotation = None

            if annotation is not None and annotation.__class__ is type:
                if not isinstance(arg_val, annotation):
                    raise TypeError(
                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                            func.__name__,
                            arg_key,
                            annotation.__name__,
                            type(arg_val).__name__,
                            arg_key,
                            str(arg_val)[:64],
                        )
                    )

        # check args
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
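
# A hedged sketch of how @enforce_types behaves; greet() is a hypothetical
# function used only for illustration and is never called by this module:
def _demo_enforce_types() -> None:
    @enforce_types
    def greet(name: str) -> str:
        return 'hello ' + name

    assert greet('world') == 'hello world'
    try:
        greet(42)  # type: ignore
    except TypeError:
        pass  # expected: int does not match the str annotation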

def check_url_parsing_invariants() -> None:
    """Check that plain text regex URL parsing works as expected"""

    # this is last-line-of-defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12

### Random Helpers

@enforce_types
def handle_stdin_import(raw_text: str) -> str:
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    atomic_write(raw_text, source_path)
    return source_path

@enforce_types
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
    """download a given url's content, or copy a local file's content,
    into output/sources/<name>-<timestamp>.txt
    """
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            path,
            ANSI['reset'],
        ))
        timer = TimedProgress(timeout, prefix=' ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print(' ', e)
            raise SystemExit(1)
    else:
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(raw_source_text, source_path)

    print(' > {}'.format(pretty_path(source_path)))

    return source_path

@enforce_types
def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        html = download_url(url, timeout=timeout)
        match = re.search(HTML_TITLE_REGEX, html)
        return htmldecode(match.group(1).strip()) if match else None
    except Exception as err:  # noqa
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None

@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """
    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget,
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder and read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    return None

@enforce_types
def read_js_script(script_name: str) -> str:
    script_path = os.path.join(PYTHON_DIR, 'scripts', script_name)

    with open(script_path, 'r') as f:
        return f.read().split('// INFO BELOW HERE')[0].strip()

### String Manipulation & Logging Helpers

@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content
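
# Hedged illustration of str_between(); the tag strings are arbitrary examples:
#   str_between('<abc>12345</def>', '<abc>', '</def>')  -> '12345'
#   str_between('<abc>12345</def>', '<abc>')            -> '12345</def>'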

@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings"""
    if date is None:
        return None

    if isinstance(date, datetime):
        return date

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        if date.replace('.', '').isdigit():
            # this is a brittle attempt at unix timestamp parsing (which is
            # notoriously hard to do). It may lead to dates being off by
            # anything from hours to decades, depending on which app, OS,
            # and system time configuration was used for the original timestamp
            # more info: https://github.com/pirate/ArchiveBox/issues/119

            # Note: always always always store the original timestamp string
            # somewhere independently of the parsed datetime, so that later
            # bugs don't repeatedly misparse and rewrite increasingly worse dates.
            # the correct date can always be re-derived from the timestamp str
            timestamp = float(date)

            EARLIEST_POSSIBLE = 473403600.0   # 1985
            LATEST_POSSIBLE = 1735707600.0    # 2025

            if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
                # number is seconds
                return datetime.fromtimestamp(timestamp)

            elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
                # number is milliseconds
                return datetime.fromtimestamp(timestamp / 1000)

            elif EARLIEST_POSSIBLE * 1000*1000 < timestamp < LATEST_POSSIBLE * 1000*1000:
                # number is microseconds
                return datetime.fromtimestamp(timestamp / (1000*1000))

            else:
                # continue to the end and raise a parsing failed error.
                # we don't want to even attempt parsing timestamp strings that
                # aren't within these ranges
                pass

        if '-' in date:
            try:
                return datetime.fromisoformat(date)
            except Exception:
                try:
                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
                except Exception:
                    pass

    raise ValueError('Tried to parse invalid date! {}'.format(date))
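
# A hedged self-check for parse_date(); not called anywhere, and the timestamps
# are illustrative. Seconds, milliseconds, and microseconds since the epoch
# should all resolve to the same moment:
def _demo_parse_date() -> None:
    assert parse_date('1546300800') == parse_date('1546300800000') == parse_date('1546300800000000')
    assert parse_date('2019-01-01 00:00') == datetime(2019, 1, 1, 0, 0)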

### Link Helpers

@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    url = a.url if len(a.url) > len(b.url) else b.url

    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=lambda t: len(t))
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    tags_set = (
        set(tag.strip() for tag in (a.tags or '').split(','))
        | set(tag.strip() for tag in (b.tags or '').split(','))
    )
    tags = ','.join(tags_set) or None

    sources = list(set(a.sources + b.sources))

    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )

@enforce_types
def is_static_file(url: str) -> bool:
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """
    # TODO: the proper way is with MIME type detection, not using extension
    return extension(url) in STATICFILE_EXTENSIONS
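
# A hedged sanity check for is_static_file(); the URLs are illustrative:
def _demo_is_static_file() -> None:
    assert is_static_file('https://example.com/report.pdf')
    assert not is_static_file('https://example.com/article.html')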

@enforce_types
def derived_link_info(link: Link) -> dict:
    """extend link info with the archive urls and other derived data"""
    info = link._asdict(extended=True)
    info.update(link.canonical_outputs())

    return info

### Python / System Helpers

def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run() to fix blocking io making timeout ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except Exception:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise

        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)

    return CompletedProcess(process.args, retcode, stdout, stderr)
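
# Hedged usage sketch: run() mirrors the subprocess.run() call shape, e.g.:
#   result = run(['echo', 'hello'], capture_output=True, timeout=5)
#   result.stdout  -> b'hello\n'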

class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        if SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {'start_ts': datetime.now(), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""

        end_ts = datetime.now()
        self.stats['end_ts'] = end_ts
        if SHOW_PROGRESS:
            # protect from double termination
            #if p is None or not hasattr(p, 'kill'):
            #    return
            if self.p is not None:
                self.p.terminate()
            self.p = None

            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))  # clear whole terminal line
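
# Hedged usage sketch, mirroring how handle_file_import() above uses it:
#   timer = TimedProgress(TIMEOUT, prefix=' ')
#   try:
#       ...  # do the slow work
#   finally:
#       timer.end()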

@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining"""
    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH() - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for s in range(seconds * chunks):
            chunks = TERM_WIDTH() - len(prefix) - 20
            progress = s / chunks / seconds * 100
            bar_width = round(progress/(100/chunks))

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(progress, 1),
                round(s/chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except KeyboardInterrupt:
        print()

@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
    """Download the contents of a remote url and return the text"""
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'  # type: ignore
    return resp.read().decode(encoding)

@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
    """chmod -R <permissions> <cwd>/<path>"""
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
    if os.path.exists(to_path):
        shutil.rmtree(to_path)

    shutil.copytree(from_path, to_path)

@enforce_types
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""

    options = {**CHROME_OPTIONS, **options}

    cmd_args = [options['CHROME_BINARY']]

    if options['CHROME_HEADLESS']:
        cmd_args += ('--headless',)

    if not options['CHROME_SANDBOX']:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if options['CHROME_USER_AGENT']:
        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)

    if options['RESOLUTION']:
        cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if options['TIMEOUT']:
        cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),)

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

    return cmd_args
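
# Hedged example of the resulting argv; the binary path and which flags appear
# depend entirely on your CHROME_OPTIONS config, so the exact output will vary:
#   chrome_args(CHROME_HEADLESS=True, RESOLUTION='1440,900')
#   -> ['chromium-browser', '--headless', ..., '--window-size=1440,900']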

class ExtendedEncoder(JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        cls_name = obj.__class__.__name__

        if hasattr(obj, '_asdict'):
            return obj._asdict()

        elif isinstance(obj, bytes):
            return obj.decode()

        elif isinstance(obj, datetime):
            return obj.isoformat()

        elif isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return JSONEncoder.default(self, obj)
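
# A hedged self-check for ExtendedEncoder (not called anywhere; the payload
# values are illustrative):
def _demo_extended_encoder() -> None:
    payload = {'when': datetime(2019, 1, 1), 'raw': b'abc'}
    encoded = json.dumps(payload, cls=ExtendedEncoder)
    assert json.loads(encoded) == {'when': '2019-01-01T00:00:00', 'raw': 'abc'}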

def atomic_write(contents: Union[dict, str], path: str) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename"""
    tmp_file = '{}.tmp'.format(path)  # assigned before the try so the finally block can always reference it
    try:
        with open(tmp_file, 'w+', encoding='utf-8') as f:
            if isinstance(contents, dict):
                json.dump(contents, f, indent=4, cls=ExtendedEncoder)
            else:
                f.write(contents)
            os.fsync(f.fileno())

        os.rename(tmp_file, path)
        chmod_file(path)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
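
# Hedged usage sketch for atomic_write(); the paths are illustrative:
#   atomic_write({'key': 'value'}, '/tmp/example.json')  # dicts are dumped as JSON via ExtendedEncoder
#   atomic_write('plain text', '/tmp/example.txt')       # strings are written as-is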