import os
import re
import sys
import time

from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote
from decimal import Decimal
from datetime import datetime
from multiprocessing import Process
from subprocess import (
    Popen,
    PIPE,
    DEVNULL,
    CompletedProcess,
    TimeoutExpired,
    CalledProcessError,
)

from config import (
    ANSI,
    TERM_WIDTH,
    SOURCES_DIR,
    ARCHIVE_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    CURL_BINARY,
    WGET_BINARY,
    CHROME_BINARY,
    GIT_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_TITLE,
    FETCH_FAVICON,
    FETCH_WGET,
    FETCH_WARC,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    ARCHIVE_DIR_NAME,
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CHROME_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_HEADLESS,
    CHROME_SANDBOX,
)
from logs import pretty_path


### Parsing Helpers

# URL parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
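
# Illustrative reference (not part of the original module) showing what the
# helpers above return for one sample URL:
#   url = 'https://example.com/path/page.html?v=1#section'
#   scheme(url)    == 'https'
#   domain(url)    == 'example.com'
#   path(url)      == '/path/page.html'
#   basename(url)  == 'page.html'
#   extension(url) == 'html'
#   query(url)     == 'v=1'
#   fragment(url)  == 'section'
#   base_url(url)  == 'example.com/path/page.html?v=1#section'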

URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed percent-encoded bytes
    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
    re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'                     # start matching text after the <title> tag
    r'(.[^<>]+)',                     # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}


### Checks & Tests

def check_link_structure(link):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str)
    assert len(link['url']) > 2
    assert len(re.findall(URL_REGEX, link['url'])) == 1
    if 'history' in link:
        assert isinstance(link['history'], dict), 'history must be a Dict'
        for key, val in link['history'].items():
            assert isinstance(key, str)
            assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
    if 'latest' in link:
        assert isinstance(link['latest'], dict), 'latest must be a Dict'
        for key, val in link['latest'].items():
            assert isinstance(key, str)
            assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])

def check_links_structure(links):
    """basic sanity check invariants to make sure the data is valid"""
    assert isinstance(links, list)
    if links:
        check_link_structure(links[0])

def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""
    try:
        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
        if python_vers < 3.5:
            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
            raise SystemExit(1)

        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_WGET or FETCH_WARC:
            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
            try:
                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
                version_str = result.stdout.decode('utf-8')
                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
                version = [l for l in version_lines if l.isdigit()][-1]
                if int(version) < 59:
                    print(version_lines)
                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                    raise SystemExit(1)
            except (IndexError, TypeError, OSError):
                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_GIT:
            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)

        if FETCH_MEDIA:
            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
                print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)
    except (KeyboardInterrupt, Exception):
        raise SystemExit(1)

def check_url_parsing_invariants():
    """Check that plain text regex URL parsing works as expected"""

    # this is a last line of defense to make sure the URL_REGEX isn't
    # misbehaving, as the consequences could be disastrous and lead to many
    # incorrect/badly parsed links being added to the archive
    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12


### Random Helpers

def save_stdin_source(raw_text):
    """save text piped in via stdin to output/sources/stdin-<timestamp>.txt"""
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    return source_path

def save_remote_source(url, timeout=TIMEOUT):
    """download a given url's content into output/sources/domain-<timestamp>.txt"""
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))

    print('{}[*] [{}] Downloading {}{}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        ANSI['reset'],
    ))
    timer = TimedProgress(timeout, prefix='    ')
    try:
        downloaded_xml = download_url(url, timeout=timeout)
        timer.end()
    except Exception as e:
        timer.end()
        print('{}[!] Failed to download {}{}\n'.format(
            ANSI['red'],
            url,
            ANSI['reset'],
        ))
        print('    ', e)
        raise SystemExit(1)

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)

    print('    > {}'.format(pretty_path(source_path)))

    return source_path

def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        if progress:
            sys.stdout.write('.')
            sys.stdout.flush()

        html = download_url(url, timeout=timeout)

        match = re.search(HTML_TITLE_REGEX, html)
        return match.group(1).strip() if match else None
    except Exception as err:
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None

def wget_output_path(link):
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)
    """

    # if we have it stored, always prefer the actual output path to the computed one
    if link.get('latest', {}).get('wget'):
        return link['latest']['wget']

    if is_static_file(link['url']):
        return without_scheme(without_fragment(link['url']))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > output/archive/<timestamp>/example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > output/archive/<timestamp>/example.com/abc.html
    #    https://example.com/abc/
    #       > output/archive/<timestamp>/example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > output/archive/<timestamp>/example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments, or extensions like shtml / htm / php / etc.

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # and there's no way to get the computed output path from wget itself,
    # we avoid reverse-engineering how it's calculated and instead just look in
    # the output folder and read the filename wget actually used from the filesystem.
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    full_path = without_fragment(without_query(path(link['url']))).strip('/')
    search_dir = os.path.join(
        link_dir,
        domain(link['url']),
        full_path,
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link_dir:
            break

    return None
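
# Worked example of the fallback search above (illustrative, using a made-up
# timestamp): for link['url'] == 'https://example.com/abc/test?v=1' and
# link['timestamp'] == '1544666668', search_dir starts at
# <ARCHIVE_DIR>/1544666668/example.com/abc/test and, if no .html file is found
# there, walks up to .../abc and then .../example.com before giving up.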


### String Manipulation & Logging Helpers

def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content


### Link Helpers

def merge_links(a, b):
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
    earlier = lambda key: a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
    return {
        'url': url,
        'timestamp': earlier('timestamp'),
        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
        'tags': longer('tags'),
        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
    }
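
# Quick illustration (not from the original source) of how merge_links picks
# fields, assuming two hypothetical link dicts for the same page:
#   a = {'url': 'https://example.com/page', 'timestamp': '1544666668.0',
#        'title': 'https://example.com/page', 'tags': 'docs', 'sources': ['ril.html']}
#   b = {'url': 'https://example.com/page', 'timestamp': '1544666669.0',
#        'title': 'Example Page', 'tags': 'docs,example', 'sources': ['bookmarks.html']}
# merge_links(a, b) keeps the earlier timestamp ('1544666668.0'), the longer
# tags string ('docs,example'), the union of sources, and prefers the title
# without '://' in it ('Example Page') over the URL-like one.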

def is_static_file(url):
    """Certain URLs just point to a single static file, and
    don't need to be re-archived in many formats
    """

    # TODO: the proper way is with MIME type detection, not using extension
    return extension(url) in STATICFILE_EXTENSIONS
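
# For example (illustrative only): is_static_file('https://example.com/paper.pdf')
# is True because 'pdf' is in STATICFILE_EXTENSIONS, while
# is_static_file('https://example.com/about.html') is False since 'html' is
# deliberately excluded and always treated as a page to be rendered.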

def derived_link_info(link):
    """extend link info with the archive urls and other derived data"""

    url = link['url']
    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')

    extended_info = {
        **link,
        'link_dir': '{}/{}'.format(ARCHIVE_DIR_NAME, link['timestamp']),
        'bookmarked_date': to_date_str(link['timestamp']),
        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
        'domain': domain(url),
        'path': path(url),
        'basename': basename(url),
        'extension': extension(url),
        'base_url': base_url(url),
        'is_static': is_static_file(url),
        'is_archived': os.path.exists(os.path.join(
            ARCHIVE_DIR,
            link['timestamp'],
            domain(url),
        )),
        'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
    }

    # Archive Method Output URLs
    extended_info.update({
        'index_url': 'index.html',
        'favicon_url': 'favicon.ico',
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
        'archive_url': wget_output_path(link),
        'warc_url': 'warc',
        'pdf_url': 'output.pdf',
        'screenshot_url': 'screenshot.png',
        'dom_url': 'output.html',
        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
        'git_url': 'git',
        'media_url': 'media',
    })

    # static binary files like PDF and images are handled slightly differently.
    # they're just downloaded once and aren't archived separately multiple times,
    # so the wget, screenshot, & pdf urls should all point to the same file
    if is_static_file(url):
        extended_info.update({
            'title': basename(url),
            'archive_url': base_url(url),
            'pdf_url': base_url(url),
            'screenshot_url': base_url(url),
            'dom_url': base_url(url),
        })

    return extended_info


### Python / System Helpers

def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run that works around blocking I/O making timeout= ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise
        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)
        return CompletedProcess(process.args, retcode, stdout, stderr)
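
# Example usage (illustrative, mirroring how run() is called elsewhere in this
# file): check whether a binary exists and exits cleanly, with a timeout so a
# hung process can't stall the archiving run.
#
#   result = run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL, timeout=10)
#   if result.returncode:
#       print('curl is missing or broken')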

def progress_bar(seconds, prefix):
    """show timer in the form of progress bar, with percentage and seconds remaining"""
    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for s in range(seconds * chunks):
            progress = s / chunks / seconds * 100
            bar_width = round(progress/(100/chunks))

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(progress, 1),
                round(s/chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except KeyboardInterrupt:
        print()
        pass

class TimedProgress:
    """show a progress bar in a subprocess (when SHOW_PROGRESS is enabled) and record start/end timestamps"""

    def __init__(self, seconds, prefix=''):
        if SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()
        self.stats = {
            'start_ts': datetime.now(),
            'end_ts': None,
            'duration': None,
        }

    def end(self):
        """immediately finish progress and clear the progressbar line"""
        end_ts = datetime.now()
        self.stats.update({
            'end_ts': end_ts,
            'duration': (end_ts - self.stats['start_ts']).seconds,
        })

        if SHOW_PROGRESS:
            # protect from double termination
            #if p is None or not hasattr(p, 'kill'):
            #    return
            if self.p is not None:
                self.p.terminate()
            self.p = None

            sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
            sys.stdout.flush()
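
# Typical usage pattern (illustrative; save_remote_source above uses a similar
# try/except form):
#
#   timer = TimedProgress(TIMEOUT, prefix='    ')
#   try:
#       html = download_url('https://example.com', timeout=TIMEOUT)
#   finally:
#       timer.end()   # stop the bar and clear the line, even on failure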

def download_url(url, timeout=TIMEOUT):
    """download the contents of a remote url and return it as a string"""
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'
    return resp.read().decode(encoding)

def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""

    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print('    ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR

def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
                headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
                check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
                resolution=RESOLUTION, timeout=TIMEOUT):
    """helper to build up a chrome shell command with arguments"""

    global CACHED_USER_DATA_DIR
    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
    cmd_args = [binary]

    if headless:
        cmd_args += ('--headless',)

    if not sandbox:
        # don't use GPU or sandbox when running inside docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')

    if not check_ssl_validity:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += ('--user-agent={}'.format(user_agent),)

    if resolution:
        cmd_args += ('--window-size={}'.format(resolution),)

    if timeout:
        cmd_args += ('--timeout={}'.format(timeout * 1000),)

    # Find chrome user data directory
    default_profile_paths = (
        '~/.config/chromium',
        '~/Library/Application Support/Chromium',
        '~/AppData/Local/Chromium/User Data',
        '~/.config/google-chrome',
        '~/Library/Application Support/Google/Chrome',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/.config/google-chrome-beta',
        '~/.config/google-chrome-unstable',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Google/Chrome SxS/User Data',
    )
    if user_data_dir:
        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
    else:
        for path in default_profile_paths:
            full_path = os.path.expanduser(path)
            if os.path.exists(full_path):
                CACHED_USER_DATA_DIR = full_path
                cmd_args.append('--user-data-dir={}'.format(full_path))
                break

    return cmd_args
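
# Example of how the returned argument list might be used (illustrative only,
# not part of the original module; the --print-to-pdf flag and URL here are
# assumptions for the sake of the example):
#
#   cmd = [*chrome_args(timeout=60), '--print-to-pdf', 'https://example.com']
#   result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)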