// JsonParser.cs
  1. using System.Buffers;
  2. using System.Diagnostics.CodeAnalysis;
  3. using System.Globalization;
  4. using System.Runtime.CompilerServices;
  5. using System.Runtime.InteropServices;
  6. using System.Text;
  7. using Jint.Runtime;
  8. namespace Jint.Native.Json
  9. {
  10. public sealed class JsonParser
  11. {
  // Engine supplying the JSON options and the realm used when raising syntax errors.
  private readonly Engine _engine;

  // Maximum nesting depth of arrays/objects accepted by this parser.
  private readonly int _maxDepth;

  // Pre-allocated token instances that CreateToken hands out in round-robin order.
  private readonly Token[] _tokenBuffer;
  private int _tokenBufferIndex;

  private int _index; // position in the stream
  private int _length; // length of the stream
  private Token _lookahead = null!;
  private string _source = null!;

  /// <summary>
  /// Creates a new parser using the recursion depth specified in <see cref="Options.JsonOptions.MaxParseDepth"/>.
  /// </summary>
  public JsonParser(Engine engine)
      : this(engine, engine.Options.Json.MaxParseDepth)
  {
  }

  /// <summary>
  /// Creates a new parser with an explicit recursion depth limit.
  /// </summary>
  /// <param name="engine">The engine this parser belongs to.</param>
  /// <param name="maxDepth">Maximum nesting depth; must not be negative.</param>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxDepth"/> is negative.</exception>
  public JsonParser(Engine engine, int maxDepth)
  {
      if (maxDepth < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(maxDepth), "Max depth must be greater or equal to zero");
      }

      _engine = engine;
      _maxDepth = maxDepth;

      // Only two tokens are in use at any time (the lookahead and the one
      // currently being consumed). Five slots are allocated instead of two
      // as a safety margin against overwriting a token that is still in use.
      var tokens = new Token[5];
      for (var slot = 0; slot < tokens.Length; slot++)
      {
          tokens[slot] = new Token();
      }

      _tokenBuffer = tokens;
      _tokenBufferIndex = 0;
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is one of the ASCII digits '0' through '9'.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool IsDecimalDigit(char ch)
  {
      return ch is >= '0' and <= '9';
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is a lower-case hexadecimal letter ('a' through 'f').
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool IsLowerCaseHexAlpha(char ch)
  {
      return ch is >= 'a' and <= 'f';
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is an upper-case hexadecimal letter ('A' through 'F').
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool IsUpperCaseHexAlpha(char ch)
  {
      return ch is >= 'A' and <= 'F';
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is a hexadecimal digit: a decimal digit
  /// or a letter 'a'-'f' in either case.
  /// </summary>
  private static bool IsHexDigit(char ch)
  {
      if (IsDecimalDigit(ch))
      {
          return true;
      }

      return IsLowerCaseHexAlpha(ch) || IsUpperCaseHexAlpha(ch);
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is one of the four characters this parser
  /// skips between tokens: space, tab, line feed or carriage return.
  /// </summary>
  private static bool IsWhiteSpace(char ch)
  {
      return ch is ' ' or '\t' or '\n' or '\r';
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is a line feed, a carriage return,
  /// U+2028 or U+2029.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool IsLineTerminator(char ch)
  {
      return ch is '\n' or '\r' or '\u2028' or '\u2029';
  }
  /// <summary>
  /// Reads exactly four hexadecimal digits starting at the current position and
  /// returns the UTF-16 code unit they encode. The position is advanced past
  /// the digits that were consumed.
  /// </summary>
  /// <remarks>
  /// Raises a syntax error through <c>ThrowError</c> when a hexadecimal digit is
  /// missing, including when the input ends before all four digits were read.
  /// </remarks>
  private char ScanHexEscape()
  {
      int code = 0;
      for (int i = 0; i < 4; ++i)
      {
          // The bounds check has to be strict: _source[_length] does not exist,
          // so a truncated escape must be reported as a syntax error rather
          // than surfacing as an IndexOutOfRangeException.
          if (_index >= _length || !IsHexDigit(_source[_index]))
          {
              ThrowError(_index, Messages.ExpectedHexadecimalDigit);
          }

          char ch = _source[_index++];

          // Setting bit 0x20 folds 'A'-'F' onto 'a'-'f'.
          int digit = IsDecimalDigit(ch) ? ch - '0' : (ch | 0x20) - 'a' + 10;
          code = (code << 4) | digit;
      }

      return (char) code;
  }
  /// <summary>
  /// Advances the current position past any whitespace and returns the first
  /// non-whitespace character without consuming it.
  /// </summary>
  /// <returns>
  /// The character at the new position, or <see cref="char.MinValue"/> when the
  /// end of the source has been reached.
  /// </returns>
  private char ReadToNextSignificantCharacter()
  {
      for (; _index < _length; _index++)
      {
          var current = _source[_index];
          if (!IsWhiteSpace(current))
          {
              return current;
          }
      }

      return char.MinValue;
  }
  /// <summary>
  /// Takes the next token instance from the token buffer, overwrites all of its
  /// fields with the given values and returns it.
  /// </summary>
  private Token CreateToken(Tokens type, string text, char firstCharacter, JsValue value, in TextRange range)
  {
      var slot = _tokenBuffer[_tokenBufferIndex];

      // Move on to the next slot, wrapping around at the end of the buffer.
      _tokenBufferIndex = (_tokenBufferIndex + 1) % _tokenBuffer.Length;

      slot.Type = type;
      slot.Text = text;
      slot.FirstCharacter = firstCharacter;
      slot.Value = value;
      slot.Range = range;
      return slot;
  }
  /// <summary>
  /// Scans the single-character punctuator at the current position and advances
  /// past it. A character that is not a known punctuator raises a syntax error.
  /// </summary>
  private Token ScanPunctuator()
  {
      var begin = _index;
      var first = begin < _source.Length ? _source[begin] : char.MinValue;
      var text = ScanPunctuatorValue(begin, first);
      _index = begin + 1;
      return CreateToken(Tokens.Punctuator, text, first, JsValue.Undefined, new TextRange(begin, _index));
  }
  /// <summary>
  /// Maps a punctuator character to its string form.
  /// </summary>
  /// <param name="start">Position reported when the character is not a punctuator.</param>
  /// <param name="code">The character to map.</param>
  /// <returns>The one-character string for <paramref name="code"/>.</returns>
  private string ScanPunctuatorValue(int start, char code)
  {
      string? text = code switch
      {
          '.' => ".",
          ',' => ",",
          '{' => "{",
          '}' => "}",
          '[' => "[",
          ']' => "]",
          ':' => ":",
          _ => null,
      };

      if (text is null)
      {
          ThrowError(start, Messages.UnexpectedToken, code);
      }

      return text;
  }
  /// <summary>
  /// Scans a JSON number starting at the current position: an optional minus
  /// sign, an integer part, an optional fraction and an optional exponent.
  /// </summary>
  /// <remarks>
  /// Raises a syntax error through <c>ThrowError</c> when the integer part has a
  /// leading zero followed by another digit (with or without a minus sign),
  /// when the decimal point is not followed by a digit, or when the exponent
  /// has no digits.
  /// </remarks>
  /// <returns>A number token whose value is the parsed <see cref="JsNumber"/>.</returns>
  private Token ScanNumericLiteral()
  {
      using var sb = new ValueStringBuilder(stackalloc char[64]);
      var start = _index;
      var ch = _source.CharCodeAt(_index);
      var canBeInteger = true;

      // Optional leading minus sign.
      if (ch == '-')
      {
          sb.Append(ch);
          ch = _source.CharCodeAt(++_index);
      }

      if (ch != '.')
      {
          var firstDigit = ch;
          sb.Append(ch);
          ch = _source.CharCodeAt(++_index);

          // The check looks at the first digit itself rather than at the
          // builder length, so it also applies after a minus sign
          // ("-012" is as illegal as "012").
          if (firstDigit == '0')
          {
              // Zero takes the double path below, which keeps the sign of "-0".
              canBeInteger = false;

              if (IsDecimalDigit(ch))
              {
                  ThrowError(_index, Messages.UnexpectedToken, ch);
              }
          }

          while (IsDecimalDigit(ch = _source.CharCodeAt(_index)))
          {
              sb.Append(ch);
              _index++;
          }
      }

      if (ch == '.')
      {
          canBeInteger = false;
          sb.Append(ch);
          ch = _source.CharCodeAt(++_index);

          // JSON requires at least one digit after the decimal point.
          if (!IsDecimalDigit(ch))
          {
              ThrowError(_index, Messages.UnexpectedToken, ch);
          }

          while (IsDecimalDigit(ch = _source.CharCodeAt(_index)))
          {
              sb.Append(ch);
              _index++;
          }
      }

      if (ch is 'e' or 'E')
      {
          canBeInteger = false;
          sb.Append(ch);
          ch = _source.CharCodeAt(++_index);

          if (ch is '+' or '-')
          {
              sb.Append(ch);
              ch = _source.CharCodeAt(++_index);
          }

          // The exponent needs at least one digit.
          if (!IsDecimalDigit(ch))
          {
              ThrowError(_index, Messages.UnexpectedToken, _source.CharCodeAt(_index));
          }

          while (IsDecimalDigit(ch = _source.CharCodeAt(_index)))
          {
              sb.Append(ch);
              _index++;
          }
      }

      var number = sb.ToString();

      JsNumber value;
      if (canBeInteger && long.TryParse(number, NumberStyles.Integer, CultureInfo.InvariantCulture, out var longResult) && longResult != 0)
      {
          value = JsNumber.Create(longResult);
      }
      else
      {
          // Taken for zero, fractions, exponents and integers too large for a long.
          value = new JsNumber(double.Parse(number, NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, CultureInfo.InvariantCulture));
      }

      return CreateToken(Tokens.Number, number, '\0', value, new TextRange(start, _index));
  }
  /// <summary>
  /// Scans the literal <c>true</c> or <c>false</c> at the current position.
  /// Any other text raises a syntax error through <c>ThrowError</c>.
  /// </summary>
  private Token ScanBooleanLiteral()
  {
      var start = _index;

      // The first-character argument is the literal's first letter
      // ('t' / 'f'), not the escape sequences '\t' / '\f'.
      if (ConsumeMatch("true"))
      {
          return CreateToken(Tokens.BooleanLiteral, "true", 't', JsBoolean.True, new TextRange(start, _index));
      }

      if (ConsumeMatch("false"))
      {
          return CreateToken(Tokens.BooleanLiteral, "false", 'f', JsBoolean.False, new TextRange(start, _index));
      }

      ThrowError(start, Messages.UnexpectedTokenIllegal);
      return null!;
  }
  /// <summary>
  /// Checks whether the source contains <paramref name="text"/> at the current
  /// position and, if so, advances past it.
  /// </summary>
  /// <returns><c>true</c> when the text matched and was consumed; otherwise <c>false</c>.</returns>
  private bool ConsumeMatch(string text)
  {
      var wanted = text.Length;

      // Not enough characters left for the text to fit.
      if (_source.Length - _index < wanted)
      {
          return false;
      }

      if (!_source.AsSpan(_index, wanted).SequenceEqual(text.AsSpan()))
      {
          return false;
      }

      _index += wanted;
      return true;
  }
  /// <summary>
  /// Scans the literal <c>null</c> at the current position. Any other text
  /// raises a syntax error through <c>ThrowError</c>.
  /// </summary>
  private Token ScanNullLiteral()
  {
      var begin = _index;
      if (!ConsumeMatch("null"))
      {
          ThrowError(begin, Messages.UnexpectedTokenIllegal);
      }

      return CreateToken(Tokens.NullLiteral, "null", 'n', JsValue.Null, new TextRange(begin, _index));
  }
  /// <summary>
  /// Scans a string literal. The current position must be on the opening quote;
  /// on return it is just past the closing quote.
  /// </summary>
  /// <remarks>
  /// Escape sequences are decoded into the token's text and value. A syntax
  /// error is raised through <c>ThrowError</c> for a raw character at or below
  /// U+001F, an unknown escape sequence, or a string that is not closed before
  /// the end of the input. U+2028 and U+2029 are ordinary string content in
  /// JSON and are accepted.
  /// </remarks>
  private Token ScanStringLiteral(ref State state)
  {
      int start = _index;
      char quote = _source[_index++];
      bool terminated = false;

      using var sb = new ValueStringBuilder(stackalloc char[64]);
      while (_index < _length)
      {
          char ch = _source[_index++];
          if (ch == quote)
          {
              terminated = true;
              break;
          }

          // Raw control characters (line breaks included) are not allowed.
          if (ch <= 31)
          {
              ThrowError(_index - 1, Messages.InvalidCharacter);
          }

          if (ch != '\\')
          {
              sb.Append(ch);
              continue;
          }

          // Escape sequence; CharCodeAt returns '\0' at the end of the input,
          // which ends up in the default branch below.
          ch = _source.CharCodeAt(_index++);
          switch (ch)
          {
              case '"':
                  sb.Append('"');
                  break;
              case '\\':
                  sb.Append('\\');
                  break;
              case '/':
                  sb.Append('/');
                  break;
              case 'b':
                  sb.Append('\b');
                  break;
              case 'f':
                  sb.Append('\f');
                  break;
              case 'n':
                  sb.Append('\n');
                  break;
              case 'r':
                  sb.Append('\r');
                  break;
              case 't':
                  sb.Append('\t');
                  break;
              case 'u':
                  sb.Append(ScanHexEscape());
                  break;
              default:
                  ThrowError(_index - 1, Messages.UnexpectedToken, ch);
                  break;
          }
      }

      if (!terminated)
      {
          // unterminated string literal
          ThrowError(_index, Messages.UnexpectedEOS);
      }

      var value = sb.ToString();
      return CreateToken(Tokens.String, value, '\"', new JsString(value), new TextRange(start, _index));
  }
  /// <summary>
  /// Skips whitespace and scans the token that starts at the resulting position,
  /// choosing the scanner from the token's first character.
  /// </summary>
  /// <returns>The scanned token, or an EOF token when no input is left.</returns>
  private Token Advance(ref State state)
  {
      var ch = ReadToNextSignificantCharacter();
      switch (ch)
      {
          case char.MinValue:
              return CreateToken(Tokens.EOF, string.Empty, '\0', JsValue.Undefined, new TextRange(_index, _index));

          // Strings start with a double quote; single quotes are not allowed in JSON.
          case '"':
              return ScanStringLiteral(ref state);

          // A minus sign only starts a number when a digit follows it.
          case '-':
              return IsDecimalDigit(_source.CharCodeAt(_index + 1))
                  ? ScanNumericLiteral()
                  : ScanPunctuator();

          case 't':
          case 'f':
              return ScanBooleanLiteral();

          case 'n':
              return ScanNullLiteral();

          default:
              return IsDecimalDigit(ch) ? ScanNumericLiteral() : ScanPunctuator();
      }
  }
  /// <summary>
  /// Consumes the current lookahead token: returns it and scans the token that
  /// follows it as the new lookahead. The position is left at the end of the
  /// returned token.
  /// </summary>
  private Token Lex(ref State state)
  {
      var consumed = _lookahead;
      var resumeAt = consumed.Range.End;

      _index = resumeAt;
      _lookahead = Advance(ref state);

      // Scanning the lookahead moved the position; put it back.
      _index = resumeAt;
      return consumed;
  }
  /// <summary>
  /// Scans the token at the current position into the lookahead and restores
  /// the position afterwards, so nothing is consumed.
  /// </summary>
  private void Peek(ref State state)
  {
      var saved = _index;
      _lookahead = Advance(ref state);
      _index = saved;
  }
  /// <summary>
  /// Raises the "max depth reached" syntax error, reported at the start of <paramref name="token"/>.
  /// </summary>
  [DoesNotReturn]
  private void ThrowDepthLimitReached(Token token)
      => ThrowError(token.Range.Start, Messages.MaxDepthLevelReached);
  /// <summary>
  /// Raises a syntax error reported at the start position of <paramref name="token"/>.
  /// </summary>
  /// <param name="token">Token whose start position is reported.</param>
  /// <param name="messageFormat">Composite format string for the message.</param>
  /// <param name="arguments">Values for the placeholders in <paramref name="messageFormat"/>.</param>
  [DoesNotReturn]
  private void ThrowError(Token token, string messageFormat, params object[] arguments)
      => ThrowError(token.Range.Start, messageFormat, arguments);
  /// <summary>
  /// Formats the message with the invariant culture, appends the position and
  /// raises it as a syntax error in the engine's realm.
  /// </summary>
  /// <param name="position">Source position appended to the message.</param>
  /// <param name="messageFormat">Composite format string for the message.</param>
  /// <param name="arguments">Values for the placeholders in <paramref name="messageFormat"/>.</param>
  [DoesNotReturn]
  private void ThrowError(int position, string messageFormat, params object[] arguments)
  {
      var formatted = string.Format(CultureInfo.InvariantCulture, messageFormat, arguments);
      var withPosition = $"{formatted} at position {position}";
      ExceptionHelper.ThrowSyntaxError(_engine.Realm, withPosition);
  }
  /// <summary>
  /// Raises the syntax error that matches an unexpected <paramref name="token"/>.
  /// This method never returns.
  /// </summary>
  /// <param name="token">The token that was not expected at its position.</param>
  [DoesNotReturn]
  private void ThrowUnexpected(Token token)
  {
      var message = token.Type switch
      {
          Tokens.EOF => Messages.UnexpectedEOS,
          Tokens.Number => Messages.UnexpectedNumber,
          Tokens.String => Messages.UnexpectedString,
          // BooleanLiteral, NullLiteral, or Punctuator.
          _ => Messages.UnexpectedToken,
      };

      // Only UnexpectedToken has a placeholder; the other messages ignore
      // the extra argument, so their text is unchanged.
      ThrowError(token, message, token.Text);
  }
  /// <summary>
  /// Consumes the next token and requires it to be the punctuator
  /// <paramref name="value"/>; any other token raises a syntax error.
  /// </summary>
  private void Expect(ref State state, char value)
  {
      var token = Lex(ref state);
      var isExpected = token.Type == Tokens.Punctuator && token.FirstCharacter == value;
      if (!isExpected)
      {
          ThrowUnexpected(token);
      }
  }
  /// <summary>
  /// Tells whether the lookahead token is the punctuator <paramref name="value"/>.
  /// Nothing is consumed.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public bool Match(char value)
      => _lookahead.Type == Tokens.Punctuator && _lookahead.FirstCharacter == value;
  /// <summary>
  /// Parses a JSON array; the lookahead must be the opening '['.
  /// </summary>
  /// <remarks>
  /// Raises a syntax error when the nesting depth exceeds the configured
  /// maximum, when elements are not separated by ',' or when a ',' is directly
  /// followed by the closing ']' (trailing comma).
  /// </remarks>
  /// <returns>A new <see cref="JsArray"/> holding the parsed elements in order.</returns>
  private JsArray ParseJsonArray(ref State state)
  {
      if ((++state.CurrentDepth) > _maxDepth)
      {
          ThrowDepthLimitReached(_lookahead);
      }

      /*
       To speed up performance, the list allocation is deferred.
       Elements are first collected in an array rented from the
       .NET array pool.
       If the JSON array has fewer elements than that array can
       hold, the Jint array is constructed directly from the
       collected values.
       Once the rented array is full, its content is moved into a
       list and the array becomes an intermediate buffer that is
       flushed to the list every time it fills up again.
      */
      List<JsValue>? elements = null;
      Expect(ref state, '[');
      int bufferIndex = 0;
      JsArray? result = null;
      JsValue[] buffer = ArrayPool<JsValue>.Shared.Rent(16);
      try
      {
          while (!Match(']'))
          {
              buffer[bufferIndex++] = ParseJsonValue(ref state);

              if (!Match(']'))
              {
                  Expect(ref state, ',');

                  // JSON does not allow a trailing comma: after ',' another
                  // element has to follow.
                  if (Match(']'))
                  {
                      ThrowUnexpected(_lookahead);
                  }
              }

              if (bufferIndex >= buffer.Length)
              {
                  // The buffer is completely filled here, so all of it is copied.
                  if (elements is null)
                  {
                      elements = new List<JsValue>(buffer);
                  }
                  else
                  {
                      elements.AddRange(buffer);
                  }

                  bufferIndex = 0;
              }
          }

          // bufferIndex == 0 has two meanings:
          // * empty JSON array (elements is null)
          // * the buffer has just been flushed (elements is not null)
          if (bufferIndex > 0)
          {
              if (elements is null)
              {
                  // Everything fit into the buffer; build the array from a
                  // right-sized copy.
                  var data = new JsValue[bufferIndex];
                  System.Array.Copy(buffer, data, length: bufferIndex);
                  result = new JsArray(_engine, data);
              }
              else
              {
                  // Flush the remaining buffered items into the list.
                  for (var i = 0; i < bufferIndex; ++i)
                  {
                      elements.Add(buffer[i]);
                  }
              }
          }
          else if (elements is null)
          {
              // the JSON array did not have any elements
              // aka: []
              result = new JsArray(_engine);
          }
      }
      finally
      {
          // Clear the buffer so the shared pool does not keep the parsed
          // values reachable after this call.
          ArrayPool<JsValue>.Shared.Return(buffer, clearArray: true);
      }

      Expect(ref state, ']');
      state.CurrentDepth--;
      return result ?? new JsArray(_engine, elements!.ToArray());
  }
  /// <summary>
  /// Parses a JSON object; the lookahead must be the opening '{'.
  /// </summary>
  /// <remarks>
  /// Raises a syntax error when the nesting depth exceeds the configured
  /// maximum, when a property name is not a string, when ':' or ',' is missing,
  /// or when a ',' is directly followed by the closing '}' (trailing comma).
  /// </remarks>
  /// <returns>A new <see cref="JsObject"/> with one data property per member.</returns>
  private JsObject ParseJsonObject(ref State state)
  {
      if ((++state.CurrentDepth) > _maxDepth)
      {
          ThrowDepthLimitReached(_lookahead);
      }

      Expect(ref state, '{');
      var obj = new JsObject(_engine);
      while (!Match('}'))
      {
          if (_lookahead.Type != Tokens.String)
          {
              ThrowUnexpected(Lex(ref state));
          }

          // The token text is the decoded string. Raw control characters
          // were already rejected while the string was scanned, so a control
          // character found here can only come from a valid escape sequence
          // (e.g. "a\nb") and must not be treated as an error.
          var name = Lex(ref state).Text;

          Expect(ref state, ':');
          var value = ParseJsonValue(ref state);
          obj.FastSetDataProperty(name, value);

          if (!Match('}'))
          {
              Expect(ref state, ',');

              // JSON does not allow a trailing comma: after ',' another
              // member has to follow.
              if (Match('}'))
              {
                  ThrowUnexpected(_lookahead);
              }
          }
      }

      Expect(ref state, '}');
      state.CurrentDepth--;
      return obj;
  }
  /// <summary>
  /// Tells whether <paramref name="propertyName"/> contains a character at or
  /// below U+001F other than a tab.
  /// </summary>
  private static bool PropertyNameContainsInvalidCharacters(string propertyName)
  {
      for (var i = 0; i < propertyName.Length; i++)
      {
          var current = propertyName[i];

          // ' ' (U+0020) is the first character above the control range.
          if (current < ' ' && current != '\t')
          {
              return true;
          }
      }

      return false;
  }
  /// <summary>
  /// Parses the JSON value that starts at the lookahead token.
  /// For literals, strings and numbers the value was already computed when the
  /// token was scanned, so it is taken from the token instead of being parsed
  /// a second time.
  /// </summary>
  /// <returns>The parsed value; any token that cannot start a value raises a syntax error.</returns>
  private JsValue ParseJsonValue(ref State state)
  {
      switch (_lookahead.Type)
      {
          case Tokens.NullLiteral:
          case Tokens.BooleanLiteral:
          case Tokens.String:
          case Tokens.Number:
              return Lex(ref state).Value;

          case Tokens.Punctuator when _lookahead.FirstCharacter == '[':
              return ParseJsonArray(ref state);

          case Tokens.Punctuator when _lookahead.FirstCharacter == '{':
              return ParseJsonObject(ref state);

          default:
              // Any other punctuator, or the end of the input.
              ThrowUnexpected(Lex(ref state));
              // can't be reached
              return JsValue.Null;
      }
  }
  /// <summary>
  /// Parses <paramref name="code"/> as a single JSON value.
  /// </summary>
  /// <param name="code">The JSON text.</param>
  /// <returns>The parsed value.</returns>
  /// <remarks>
  /// A syntax error is raised when the text is not valid for this parser or
  /// when anything other than whitespace follows the value.
  /// </remarks>
  public JsValue Parse(string code)
  {
      _source = code;
      _index = 0;
      _length = _source.Length;
      _lookahead = null!;

      var state = new State();

      // Load the first token, then parse the value that starts with it.
      Peek(ref state);
      var parsed = ParseJsonValue(ref state);

      // Nothing but the end of the input may follow the value.
      Peek(ref state);
      if (_lookahead.Type != Tokens.EOF)
      {
          ThrowError(_lookahead, Messages.UnexpectedToken, _lookahead.Text);
      }

      return parsed;
  }
  /// <summary>
  /// Mutable state of one parse run, passed by reference through the parse methods.
  /// </summary>
  [StructLayout(LayoutKind.Auto)]
  private ref struct State
  {
      /// <summary>
      /// Number of arrays/objects currently being parsed (the recursion depth).
      /// </summary>
      public int CurrentDepth { get; set; }
  }
  /// <summary>
  /// The kinds of token produced by the scanner.
  /// </summary>
  private enum Tokens
  {
      // the literal null
      NullLiteral,
      // the literals true and false
      BooleanLiteral,
      // a double-quoted string
      String,
      // a numeric literal
      Number,
      // a single-character punctuator
      Punctuator,
      // end of the input
      EOF,
  }
  /// <summary>
  /// A scanned token. Instances are reused: the parser overwrites their fields
  /// each time a new token is created.
  /// </summary>
  private sealed class Token
  {
      // Kind of the token.
      public Tokens Type;

      // Character the parser compares against when matching punctuators.
      public char FirstCharacter;

      // Value carried by literal, string and number tokens.
      public JsValue Value = JsValue.Undefined;

      // Text of the token; for strings this is the decoded content.
      public string Text = null!;

      // Start and end position of the token in the source.
      public TextRange Range;
  }
  /// <summary>
  /// Start and end position of a token within the source text.
  /// </summary>
  [StructLayout(LayoutKind.Auto)]
  private readonly struct TextRange
  {
      /// <summary>Position of the token's first character.</summary>
      public int Start { get; }

      /// <summary>Position just after the token.</summary>
      public int End { get; }

      public TextRange(int start, int end)
      {
          Start = start;
          End = end;
      }
  }
  /// <summary>
  /// Message texts for the syntax errors raised by the parser. Only
  /// <see cref="UnexpectedToken"/> contains a format placeholder.
  /// </summary>
  private static class Messages
  {
      // Raw control character inside a string.
      public const string InvalidCharacter = "Invalid character in JSON";

      // Missing hexadecimal digit in a \u escape.
      public const string ExpectedHexadecimalDigit = "Expected hexadecimal digit in JSON";

      // {0} is the offending character or token text.
      public const string UnexpectedToken = "Unexpected token '{0}' in JSON";

      // Text that is neither a complete true/false nor a complete null.
      public const string UnexpectedTokenIllegal = "Unexpected token ILLEGAL in JSON";

      public const string UnexpectedNumber = "Unexpected number in JSON";
      public const string UnexpectedString = "Unexpected string in JSON";
      public const string UnexpectedEOS = "Unexpected end of JSON input";
      public const string MaxDepthLevelReached = "Max. depth level of JSON reached";
  }
  664. }
  /// <summary>
  /// String helpers used by the JSON parser.
  /// </summary>
  internal static class StringExtensions
  {
      /// <summary>
      /// Returns the character at <paramref name="index"/>, or
      /// <see cref="char.MinValue"/> (used as the "no character" value) when the
      /// index lies past the end of <paramref name="source"/>.
      /// </summary>
      internal static char CharCodeAt(this string source, int index)
      {
          return index < source.Length ? source[index] : char.MinValue;
      }
  }
  677. }