__package__ = 'archivebox'

import re
import requests
import json as pyjson
import http.cookiejar

from typing import List, Optional, Any
from pathlib import Path
from inspect import signature
from functools import wraps
from hashlib import sha256
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout

from .vendor.base32_crockford import encode as base32_encode    # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

from os.path import lexists
from os import remove as remove_file

try:
    import chardet
    detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
    detect_encoding = lambda rawdata: "utf-8"

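# Usage sketch (illustrative): guess the charset of raw bytes, e.g.
# detect_encoding(requests.get(url).content); without chardet installed this
# always falls back to 'utf-8'.
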
### Parsing Helpers

# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)

short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()

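# A few illustrative examples (comments only, not executed):
#   scheme('https://example.com/a/b.pdf?c=d#e')    -> 'https'
#   domain('https://example.com/a/b.pdf?c=d#e')    -> 'example.com'
#   basename('https://example.com/a/b.pdf?c=d#e')  -> 'b.pdf'
#   extension('https://example.com/a/b.pdf')       -> 'pdf'
#   base_url('https://example.com/a/b.pdf')        -> 'example.com/a/b.pdf'
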
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
    r'(?=(' +
    r'http[s]?://' +           # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]' +     # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]' +     # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+' +   # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+' +       # stop parsing at these symbols
    r'))',
    re.IGNORECASE | re.UNICODE,
)

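# Note: the outer (?=(...)) lookahead makes every match zero-width, so re.findall()
# (which returns group 1) can yield overlapping candidates, e.g. a URL nested inside
# another URL's query string; see URL_REGEX_TESTS at the bottom of this file.
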
def parens_are_matched(string: str, open_char='(', close_char=')'):
    """check that all parentheses in a string are balanced and nested properly"""
    count = 0
    for c in string:
        if c == open_char:
            count += 1
        elif c == close_char:
            count -= 1
            if count < 0:
                return False
    return count == 0

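# e.g. parens_are_matched('/a(b)c') -> True; parens_are_matched('/a(b)c)') -> False
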
def fix_url_from_markdown(url_str: str) -> str:
    """
    cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
    helpful to fix URLs parsed from markdown e.g.
    input:  https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
    result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
    """
    trimmed_url = url_str

    # cut off one trailing character at a time
    # until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
    while not parens_are_matched(trimmed_url):
        trimmed_url = trimmed_url[:-1]

    # make sure trimmed url is still valid
    if re.findall(URL_REGEX, trimmed_url):
        return trimmed_url

    return url_str

def find_all_urls(urls_str: str):
    for url in re.findall(URL_REGEX, urls_str):
        yield fix_url_from_markdown(url)

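# Usage sketch (illustrative):
#   list(find_all_urls('see [docs](https://example.com/a_(b)) for more'))
#   -> ['https://example.com/a_(b)']
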
def is_static_file(url: str):
    # TODO: the proper way is with MIME type detection + ext, not only extension
    from .config import STATICFILE_EXTENSIONS

    return extension(url).lower() in STATICFILE_EXTENSIONS

def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints
    """
    # TODO: check return type as well

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        sig = signature(func)

        def check_argument_type(arg_key, arg_val):
            try:
                annotation = sig.parameters[arg_key].annotation
            except KeyError:
                annotation = None

            if annotation is not None and annotation.__class__ is type:
                if not isinstance(arg_val, annotation):
                    raise TypeError(
                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                            func.__name__,
                            arg_key,
                            annotation.__name__,
                            type(arg_val).__name__,
                            arg_key,
                            str(arg_val)[:64],
                        )
                    )

        # check args
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function

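# Usage sketch (illustrative):
#   @enforce_types
#   def greet(name: str) -> str:
#       return 'hi ' + name
#   greet(123)  # raises TypeError: greet(..., name: str) got unexpected int argument name=123
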
def docstring(text: Optional[str]):
    """attach the given docstring to the decorated function"""
    def decorator(func):
        if text:
            func.__doc__ = text
        return func
    return decorator

@enforce_types
def str_between(string: str, start: str, end: Optional[str]=None) -> str:
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""

    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content

@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings"""

    if date is None:
        return None

    if isinstance(date, datetime):
        if date.tzinfo is None:
            return date.replace(tzinfo=timezone.utc)

        assert date.tzinfo.utcoffset(datetime.now()).seconds == 0, 'Refusing to load a non-UTC date!'
        return date

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        return dateparser(date, settings={'TIMEZONE': 'UTC'}).replace(tzinfo=timezone.utc)

    raise ValueError('Tried to parse invalid date! {}'.format(date))

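# Usage sketch (illustrative; exact behavior depends on the dateparser library):
#   parse_date('2024-01-02 03:04')  -> datetime(2024, 1, 2, 3, 4, tzinfo=timezone.utc)
#   parse_date(1704164640)          -> the same moment, parsed as a unix timestamp
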
@enforce_types
def download_url(url: str, timeout: Optional[int]=None) -> str:
    """Download the contents of a remote url and return the text"""
    from .config import (
        TIMEOUT,
        CHECK_SSL_VALIDITY,
        WGET_USER_AGENT,
        COOKIES_FILE,
    )
    timeout = timeout or TIMEOUT

    session = requests.Session()

    if COOKIES_FILE and Path(COOKIES_FILE).is_file():
        cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        for cookie in cookie_jar:
            session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

    response = session.get(
        url,
        headers={'User-Agent': WGET_USER_AGENT},
        verify=CHECK_SSL_VALIDITY,
        timeout=timeout,
    )

    content_type = response.headers.get('Content-Type', '')
    encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)

    if encoding is not None:
        response.encoding = encoding

    try:
        return response.text
    except UnicodeDecodeError:
        # if response is non-text (e.g. an image or other binary file), just return the filename instead
        return url.rsplit('/', 1)[-1]

@enforce_types
def get_headers(url: str, timeout: Optional[int]=None) -> str:
    """Fetch the headers of a remote url and return them as a JSON string"""
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
    timeout = timeout or TIMEOUT

    try:
        response = requests.head(
            url,
            headers={'User-Agent': WGET_USER_AGENT},
            verify=CHECK_SSL_VALIDITY,
            timeout=timeout,
            allow_redirects=True,
        )
        if response.status_code >= 400:
            raise RequestException
    except ReadTimeout:
        raise
    except RequestException:
        # some servers don't support HEAD requests, fall back to a streamed GET
        response = requests.get(
            url,
            headers={'User-Agent': WGET_USER_AGENT},
            verify=CHECK_SSL_VALIDITY,
            timeout=timeout,
            stream=True,
        )

    return pyjson.dumps(
        {
            'URL': url,
            'Status-Code': response.status_code,
            'Elapsed': str(response.elapsed),   # timedelta is not JSON-serializable, so stringify it
            'Encoding': response.encoding,
            'Apparent-Encoding': response.apparent_encoding,
            **dict(response.headers),
        },
        indent=4,
    )

@enforce_types
def chrome_args(**options) -> List[str]:
    """helper to build up a chrome shell command with arguments"""

    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
    from .config import (
        CHROME_OPTIONS,
        CHROME_VERSION,
        CHROME_EXTRA_ARGS,
    )

    options = {**CHROME_OPTIONS, **options}

    if not options['CHROME_BINARY']:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    cmd_args = [options['CHROME_BINARY']]
    cmd_args += CHROME_EXTRA_ARGS

    if options['CHROME_HEADLESS']:
        chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
        if chrome_major_version >= 111:
            cmd_args += ("--headless=new",)
        else:
            cmd_args += ('--headless',)

    if not options['CHROME_SANDBOX']:
        # assume this means we are running inside a docker container
        # in docker, GPU support is limited, sandboxing is unnecessary,
        # and SHM is limited to 64MB by default (which is too low to be usable).
        cmd_args += (
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
            # "--password-store=basic",
        )

        # disable automatic updating when running headless, as there's no user to see the upgrade prompts
        cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)

    # set window size for screenshot/pdf/etc. rendering
    cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)

    if not options['CHECK_SSL_VALIDITY']:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if options['CHROME_USER_AGENT']:
        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)

    if options['CHROME_TIMEOUT']:
        cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)

    if options['CHROME_USER_DATA_DIR']:
        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
        cmd_args.append('--profile-directory=Default')

    return dedupe(cmd_args)

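# Usage sketch (illustrative; the binary path is hypothetical and the full flag
# set depends on CHROME_OPTIONS from .config):
#   chrome_args(CHROME_HEADLESS=True, RESOLUTION='1440,2000')
#   -> ['/usr/bin/chromium-browser', '--headless=new', ..., '--window-size=1440,2000']
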
def chrome_cleanup():
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error
    """
    from .config import IN_DOCKER

    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
        remove_file("/home/archivebox/.config/chromium/SingletonLock")

def ansi_to_html(text):
    """
    Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
    """
    from .config import COLOR_DICT

    TEMPLATE = '<span style="color: rgb{}"><br>'
    text = text.replace('[m', '</span>')

    def single_sub(match):
        argsdict = match.groupdict()
        if argsdict['arg_3'] is None:
            if argsdict['arg_2'] is None:
                _, color = 0, argsdict['arg_1']
            else:
                _, color = argsdict['arg_1'], argsdict['arg_2']
        else:
            _, color = argsdict['arg_3'], argsdict['arg_2']

        return TEMPLATE.format(COLOR_DICT[color][0])

    return COLOR_REGEX.sub(single_sub, text)

@enforce_types
def dedupe(options: List[str]) -> List[str]:
    """
    Deduplicates the given options. Options that come later clobber earlier
    conflicting options.
    """
    deduped = {}

    for option in options:
        deduped[option.split('=')[0]] = option

    return list(deduped.values())

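# e.g. dedupe(['--headless', '--timeout=60', '--timeout=120'])
#      -> ['--headless', '--timeout=120']  (keyed on the flag name before '=', later values win)
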
class AttributeDict(dict):
    """Helper to allow accessing dict values via Example.key or Example['key']"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Recursively convert nested dicts to AttributeDicts (optional):
        # for key, val in self.items():
        #     if isinstance(val, dict) and type(val) is not AttributeDict:
        #         self[key] = AttributeDict(val)

    def __getattr__(self, attr: str) -> Any:
        return dict.__getitem__(self, attr)

    def __setattr__(self, attr: str, value: Any) -> None:
        return dict.__setitem__(self, attr, value)

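# Usage sketch (illustrative):
#   config = AttributeDict({'TIMEOUT': 60})
#   config.TIMEOUT      -> 60  (same as config['TIMEOUT'])
#   config.RETRIES = 3  # same as config['RETRIES'] = 3
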
class ExtendedEncoder(pyjson.JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        cls_name = obj.__class__.__name__

        if hasattr(obj, '_asdict'):
            return obj._asdict()

        elif isinstance(obj, bytes):
            return obj.decode()

        elif isinstance(obj, datetime):
            return obj.isoformat()

        elif isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        elif isinstance(obj, Path):
            return str(obj)

        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return pyjson.JSONEncoder.default(self, obj)

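# Usage sketch (illustrative):
#   pyjson.dumps({'path': Path('/data'), 'when': datetime(2024, 1, 1)}, cls=ExtendedEncoder)
#   -> '{"path": "/data", "when": "2024-01-01T00:00:00"}'
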
### URL PARSING TESTS / ASSERTIONS

# they run at runtime because I like having them inline in this file,
# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
# and these assertions are basically instant, so not a big performance cost to do it on startup

assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'

URL_REGEX_TESTS = [
    ('https://example.com', ['https://example.com']),
    ('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
    ('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
    ('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),
    ('///a', []),
    ('http://', []),
    ('http://../', ['http://../']),
    ('http://-error-.invalid/', ['http://-error-.invalid/']),
    ('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
    ('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
    ('http://例子.测试', ['http://例子.测试']),
    ('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
    ('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
    ('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
    ('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
    ('http://us:pa@ex.com:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.com:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
    ('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
    ('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
    ('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
    ('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
    ('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
    ('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
    ('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
    ('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
    ('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]

for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = list(find_all_urls(urls_str))
    assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'