# archivebox/misc/util.py

__package__ = 'archivebox.misc'

import re
import requests
import json as pyjson
import http.cookiejar

from typing import List, Optional, Any
from pathlib import Path
from inspect import signature
from functools import wraps
from hashlib import sha256
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime, timezone

from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode  # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

try:
    import chardet
    detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
    detect_encoding = lambda rawdata: "utf-8"

from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG

from .logging import COLOR_DICT


### Parsing Helpers

# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)

short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
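
# Illustrative usage of a few of the helpers above (not part of the original module;
# expected values are assumptions based on how urllib.parse splits URLs):
#   >>> domain('https://www.example.com/a/b.html?c=1#d')
#   'www.example.com'
#   >>> extension('https://www.example.com/a/b.html?c=1#d')
#   'html'
#   >>> base_url('https://www.example.com/a/b.html?c=1#d')
#   'www.example.com/a/b.html?c=1#d'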

COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'          # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'    # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'    # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+'  # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'      # stop parsing at these symbols
    r'))',
    re.IGNORECASE | re.UNICODE,
)


def parens_are_matched(string: str, open_char='(', close_char=')'):
    """check that all parentheses in a string are balanced and nested properly"""
    count = 0
    for c in string:
        if c == open_char:
            count += 1
        elif c == close_char:
            count -= 1
            if count < 0:
                return False
    return count == 0


def fix_url_from_markdown(url_str: str) -> str:
    """
    Cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax,
    helpful to fix URLs parsed from markdown e.g.
        input:  https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
        result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def

    IMPORTANT ASSUMPTION: valid urls won't have unbalanced or incorrectly nested parentheses,
    e.g. this will fail if the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url'
    in that case it will return 'https://example.com/some_wei' (truncated up to the first unbalanced paren).
    This assumption is true 99.9999% of the time, and for the rare edge case the user can use the url_list parser.
    """
    trimmed_url = url_str

    # cut off one trailing character at a time
    # until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
    while not parens_are_matched(trimmed_url):
        trimmed_url = trimmed_url[:-1]

    # make sure trimmed url is still valid
    if re.findall(URL_REGEX, trimmed_url):
        return trimmed_url

    return url_str


def find_all_urls(urls_str: str):
    for url in re.findall(URL_REGEX, urls_str):
        yield fix_url_from_markdown(url)
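
# Illustrative usage (not part of the original module; output shown is an assumption
# based on URL_REGEX matching plus fix_url_from_markdown trimming trailing parens):
#   >>> list(find_all_urls('docs: [homepage](https://example.com/a_(b)) and http://foo.bar/baz'))
#   ['https://example.com/a_(b)', 'http://foo.bar/baz']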


def is_static_file(url: str):
    # TODO: the proper way is with MIME type detection + ext, not only extension
    return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
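
# Illustrative usage (not part of the original module; assumes 'pdf' is listed in
# CONSTANTS.STATICFILE_EXTENSIONS, which is defined in archivebox.config):
#   >>> is_static_file('https://example.com/report.pdf')
#   True
#   >>> is_static_file('https://example.com/about')
#   False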


def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints
    Simpler version of pydantic @validate_call decorator
    """
    # TODO: check return type as well

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        sig = signature(func)

        def check_argument_type(arg_key, arg_val):
            try:
                annotation = sig.parameters[arg_key].annotation
            except KeyError:
                annotation = None

            if annotation is not None and annotation.__class__ is type:
                if not isinstance(arg_val, annotation):
                    raise TypeError(
                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
                            func.__name__,
                            arg_key,
                            annotation.__name__,
                            type(arg_val).__name__,
                            arg_key,
                            str(arg_val)[:64],
                        )
                    )

        # check args
        for arg_val, arg_key in zip(args, sig.parameters):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
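
# Illustrative usage (not part of the original module; add_tag is a hypothetical function):
#   @enforce_types
#   def add_tag(snapshot_id: int, tag: str) -> str:
#       return f'{snapshot_id}:{tag}'
#
#   add_tag(1, 'news')      # ok
#   add_tag('1', 'news')    # raises TypeError: add_tag(..., snapshot_id: int) got unexpected str argument snapshot_id=1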


def docstring(text: Optional[str]):
    """attach the given docstring to the decorated function"""
    def decorator(func):
        if text:
            func.__doc__ = text
        return func
    return decorator
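
# Illustrative usage (not part of the original module; HELP_TEXT is a hypothetical constant):
# useful for sharing one help/docstring across several wrapper functions
#   HELP_TEXT = 'Add a new URL to the archive'
#
#   @docstring(HELP_TEXT)
#   def add(url: str): ...
#
#   assert add.__doc__ == HELP_TEXT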


@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""

    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content


@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
    """Parse unix timestamps, iso format, and human-readable strings"""

    if date is None:
        return None

    if isinstance(date, datetime):
        if date.tzinfo is None:
            return date.replace(tzinfo=timezone.utc)

        assert date.tzinfo.utcoffset(datetime.now()).seconds == 0, 'Refusing to load a non-UTC date!'
        return date

    if isinstance(date, (float, int)):
        date = str(date)

    if isinstance(date, str):
        return dateparser(date, settings={'TIMEZONE': 'UTC'}).astimezone(timezone.utc)

    raise ValueError('Tried to parse invalid date! {}'.format(date))
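
# Illustrative usage (not part of the original module; all of these return timezone-aware UTC datetimes):
#   parse_date(1723046400)                      # unix timestamp (ints/floats are stringified before parsing)
#   parse_date('2024-08-07 16:00:00')           # ISO-style string, interpreted as UTC
#   parse_date(datetime(2024, 8, 7, 16, 0))     # naive datetime, assumed to already be UTC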


@enforce_types
def download_url(url: str, timeout: int=None) -> str:
    """Download the contents of a remote url and return the text"""

    timeout = timeout or ARCHIVING_CONFIG.TIMEOUT

    session = requests.Session()

    if ARCHIVING_CONFIG.COOKIES_FILE and Path(ARCHIVING_CONFIG.COOKIES_FILE).is_file():
        cookie_jar = http.cookiejar.MozillaCookieJar(ARCHIVING_CONFIG.COOKIES_FILE)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        for cookie in cookie_jar:
            session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)

    response = session.get(
        url,
        headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
        verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
        timeout=timeout,
    )

    content_type = response.headers.get('Content-Type', '')
    encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)

    if encoding is not None:
        response.encoding = encoding

    try:
        return response.text
    except UnicodeDecodeError:
        # if response is non-text (e.g. an image or other binary file), just return the filename instead
        return url.rsplit('/', 1)[-1]
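
# Illustrative usage (not part of the original module): fetches the page body as decoded text
#   html_text = download_url('https://example.com', timeout=30)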


@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
    """Download the contents of a remote url and return the headers"""

    timeout = timeout or ARCHIVING_CONFIG.TIMEOUT

    try:
        response = requests.head(
            url,
            headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
            verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
            timeout=timeout,
            allow_redirects=True,
        )
        if response.status_code >= 400:
            raise RequestException
    except ReadTimeout:
        raise
    except RequestException:
        response = requests.get(
            url,
            headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
            verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
            timeout=timeout,
            stream=True,
        )

    return pyjson.dumps(
        {
            'URL': url,
            'Status-Code': response.status_code,
            'Elapsed': response.elapsed.total_seconds()*1000,
            'Encoding': str(response.encoding),
            'Apparent-Encoding': response.apparent_encoding,
            **dict(response.headers),
        },
        indent=4,
    )
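
# Illustrative usage (not part of the original module; the fixed keys come from the dict above,
# the remaining fields depend entirely on the server's response headers):
#   print(get_headers('https://example.com'))
#   # {
#   #     "URL": "https://example.com",
#   #     "Status-Code": 200,
#   #     "Elapsed": 123.4,
#   #     ...
#   # }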


@enforce_types
def ansi_to_html(text: str) -> str:
    """
    Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
    Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though.
    """
    TEMPLATE = '<span style="color: rgb{}"><br>'
    text = text.replace('[m', '</span>')

    def single_sub(match):
        argsdict = match.groupdict()
        if argsdict['arg_3'] is None:
            if argsdict['arg_2'] is None:
                _, color = 0, argsdict['arg_1']
            else:
                _, color = argsdict['arg_1'], argsdict['arg_2']
        else:
            _, color = argsdict['arg_3'], argsdict['arg_2']

        return TEMPLATE.format(COLOR_DICT[color][0])

    return COLOR_REGEX.sub(single_sub, text)


@enforce_types
def dedupe(options: List[str]) -> List[str]:
    """
    Deduplicates the given options. Options that come later clobber earlier
    conflicting options.
    """
    deduped = {}

    for option in options:
        deduped[option.split('=')[0]] = option

    return list(deduped.values())
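
# Illustrative usage (not part of the original module): later values win for the same option key
#   >>> dedupe(['--timeout=10', '--quiet', '--timeout=60'])
#   ['--timeout=60', '--quiet']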


class ExtendedEncoder(pyjson.JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """
    def default(self, obj):
        cls_name = obj.__class__.__name__

        if hasattr(obj, '_asdict'):
            return obj._asdict()

        elif isinstance(obj, bytes):
            return obj.decode()

        elif isinstance(obj, datetime):
            return obj.isoformat()

        elif isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        elif isinstance(obj, Path):
            return str(obj)

        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return pyjson.JSONEncoder.default(self, obj)
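
# Illustrative usage (not part of the original module; the example values are hypothetical):
# pass cls=ExtendedEncoder to json.dumps to serialize Paths, datetimes, Exceptions, etc.
#   pyjson.dumps(
#       {'path': Path('/data/archive'), 'added': datetime.now(timezone.utc), 'err': ValueError('bad url')},
#       cls=ExtendedEncoder,
#   )
#   # -> '{"path": "/data/archive", "added": "2024-...+00:00", "err": "ValueError: bad url"}'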


### URL PARSING TESTS / ASSERTIONS

# Check that plain text regex URL parsing works as expected
#   this is last-line-of-defense to make sure the URL_REGEX isn't
#   misbehaving due to some OS-level or environment-level quirks (e.g. regex engine / cpython / locale differences)
#   the consequences of bad URL parsing could be disastrous and lead to many
#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking

assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'

URL_REGEX_TESTS = [
    ('https://example.com', ['https://example.com']),
    ('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),

    ('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
    ('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),

    ('///a', []),
    ('http://', []),
    ('http://../', ['http://../']),
    ('http://-error-.invalid/', ['http://-error-.invalid/']),
    ('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
    ('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
    ('http://例子.测试', ['http://例子.测试']),
    ('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
    ('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
    ('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
    ('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
    ('http://us:[email protected]:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:[email protected]:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
    ('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
    ('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
    ('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
    ('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
    ('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),

    ('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
    ('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
    ('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
    ('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
    ('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]

for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = list(find_all_urls(urls_str))
    assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'

# More test cases
_test_url_strs = {
    'example.com': 0,
    '/example.com': 0,
    '//example.com': 0,
    ':/example.com': 0,
    '://example.com': 0,
    'htt://example8.com': 0,
    '/htt://example.com': 0,
    'https://example': 1,
    'https://localhost/2345': 1,
    'https://localhost:1234/123': 1,
    '://': 0,
    'https://': 0,
    'http://': 0,
    'ftp://': 0,
    'ftp://example.com': 0,
    'https://example.com': 1,
    'https://example.com/': 1,
    'https://a.example.com': 1,
    'https://a.example.com/': 1,
    'https://a.example.com/what/is/happening.html': 1,
    'https://a.example.com/what/ís/happening.html': 1,
    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
    '<test>http://example7.com</test>': 1,
    'https://<test>': 0,
    'https://[test]': 0,
    'http://"test"': 0,
    'http://\'test\'': 0,
    '[https://example8.com/what/is/this.php?what=1]': 1,
    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
    '<what>https://example10.com#and-thing=2 "</about>': 1,
    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
    '<or>http://examplehttp://15.badc</that>': 2,
    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}

for url_str, num_urls in _test_url_strs.items():
    assert len(list(find_all_urls(url_str))) == num_urls, (
        f'{url_str} does not contain {num_urls} urls')