// Scanner.cs
  1. using Lua.Internal;
  2. using System.Globalization;
  3. using System.Text;
  4. using static System.Diagnostics.Debug;
  5. namespace Lua.CodeAnalysis.Compilation;
  6. using static Constants;
  7. internal struct Scanner
  8. {
  /// <summary>Lua state attached to this scanner; none of the scanning members in this struct read it.</summary>
  public LuaState L;

  /// <summary>Scratch buffer that collects the characters of the token currently being read.</summary>
  public StringBuilder Buffer;

  /// <summary>Character source; the scanner uses its <c>TryRead</c> and <c>Position</c> members.</summary>
  public TextReader R;

  /// <summary>Most recently read character, or <see cref="EndOfStream"/> once the reader is exhausted.</summary>
  public int Current;

  /// <summary>Line being scanned; incremented by <see cref="IncrementLineNumber"/>.</summary>
  public int LineNumber;

  /// <summary>Value of <see cref="LineNumber"/> captured by <see cref="Next"/> just before it fetched a new token.</summary>
  public int LastLine;

  /// <summary>Chunk name passed to <c>LuaDebug.WriteShortSource</c> when an error message is built.</summary>
  public string Source;

  /// <summary>One token of look-ahead; a <see cref="TkEos"/> token means nothing is buffered.</summary>
  public Token LookAheadToken;

  // Reader position recorded right after the latest line break; ScanError subtracts it to obtain a column.
  private int lastNewLinePos;

  /// <summary>The current token.</summary>
  public Token Token;

  /// <summary>Kind code of <see cref="Token"/>.</summary>
  public int T
  {
      get { return Token.T; }
  }
  // Token codes. Codes below FirstReserved stand for the character with that value;
  // codes from FirstReserved upwards index the token-name table as (code - FirstReserved).
  public const int FirstReserved = ushort.MaxValue + 257;

  // Value stored in Current once the reader has no more characters.
  public const int EndOfStream = -1;

  // Largest int. The earlier expression `int.MaxValue >> 1 + 1` parsed as
  // `int.MaxValue >> (1 + 1)` because + binds tighter than >>, giving 536870911.
  public const int MaxInt = int.MaxValue; // 2147483647

  // Reserved words, in the same order as the token-name table.
  public const int TkAnd = FirstReserved;
  public const int TkBreak = TkAnd + 1;
  public const int TkDo = TkBreak + 1;
  public const int TkElse = TkDo + 1;
  public const int TkElseif = TkElse + 1;
  public const int TkEnd = TkElseif + 1;
  public const int TkFalse = TkEnd + 1;
  public const int TkFor = TkFalse + 1;
  public const int TkFunction = TkFor + 1;
  public const int TkGoto = TkFunction + 1;
  public const int TkIf = TkGoto + 1;
  public const int TkIn = TkIf + 1;
  public const int TkLocal = TkIn + 1;
  public const int TkNil = TkLocal + 1;
  public const int TkNot = TkNil + 1;
  public const int TkOr = TkNot + 1;
  public const int TkRepeat = TkOr + 1;
  public const int TkReturn = TkRepeat + 1;
  public const int TkThen = TkReturn + 1;
  public const int TkTrue = TkThen + 1;
  public const int TkUntil = TkTrue + 1;
  public const int TkWhile = TkUntil + 1;

  // Multi-character operators and the non-keyword token kinds.
  public const int TkConcat = TkWhile + 1;
  public const int TkDots = TkConcat + 1;
  public const int TkEq = TkDots + 1;
  public const int TkGe = TkEq + 1;
  public const int TkLe = TkGe + 1;
  public const int TkNe = TkLe + 1;
  public const int TkDoubleColon = TkNe + 1;
  public const int TkEos = TkDoubleColon + 1;
  public const int TkNumber = TkEos + 1;
  public const int TkName = TkNumber + 1;
  public const int TkString = TkName + 1;

  // Number of reserved words (TkAnd through TkWhile inclusive).
  public const int ReservedCount = TkWhile - FirstReserved + 1;
  // Spelling of every token code at or above FirstReserved, indexed by (code - FirstReserved).
  // The order must match the Tk* constants: reserved words first, then operators, then placeholders.
  static readonly string[] tokens = new[]
  {
      // reserved words
      "and", "break", "do", "else", "elseif", "end",
      "false", "for", "function", "goto", "if", "in",
      "local", "nil", "not", "or", "repeat", "return",
      "then", "true", "until", "while",
      // multi-character operators
      "..", "...", "==", ">=", "<=", "~=", "::",
      // placeholders for non-keyword token kinds
      "<eof>", "<number>", "<name>", "<string>",
  };

  /// <summary>Read-only view over the token spelling table.</summary>
  public static ReadOnlySpan<string> Tokens
  {
      get { return tokens; }
  }
  /// <summary>Raises a scan error for <paramref name="message"/>, reported near the current token.</summary>
  public void SyntaxError(string message)
  {
      ScanError(message, Token.T);
  }

  /// <summary>
  /// Raises a syntax error of the form "X expected", where X is the spelling of <paramref name="t"/>.
  /// The parameter is a <c>char</c>, so only codes that fit in 16 bits can be passed without truncation.
  /// </summary>
  public void ErrorExpected(char t)
  {
      var expected = TokenToString(t);
      SyntaxError(expected + " expected");
  }

  /// <summary>Raises the "malformed number" scan error.</summary>
  public void NumberError()
  {
      ScanError("malformed number", TkNumber);
  }
  /// <summary>True when <paramref name="c"/> is a line feed or a carriage return.</summary>
  public static bool IsNewLine(int c)
  {
      return c == '\n' || c == '\r';
  }

  /// <summary>True when <paramref name="c"/> is an ASCII digit '0'..'9'.</summary>
  public static bool IsDecimal(int c)
  {
      return '0' <= c && c <= '9';
  }
  /// <summary>
  /// Renders <paramref name="t"/> for use in messages: names and strings yield their text,
  /// numbers their value, single-character tokens the character, tokens below <see cref="TkEos"/>
  /// their quoted spelling, and everything else the bare table entry.
  /// </summary>
  public static string TokenToString(Token t)
  {
      var kind = t.T;
      if (kind == TkName || kind == TkString)
      {
          return t.S;
      }

      if (kind == TkNumber)
      {
          return $"{t.N}";
      }

      if (kind < FirstReserved)
      {
          return $"{(char)kind}"; // TODO check for printable rune
      }

      var spelling = tokens[kind - FirstReserved];
      return kind < TkEos ? $"'{spelling}'" : spelling;
  }
  /// <summary>
  /// Renders the token code <paramref name="t"/> for use in messages. For name, string and number
  /// codes the text or value is taken from the scanner's current <see cref="Token"/>; other codes
  /// are rendered from the code alone.
  /// </summary>
  public string TokenToString(int t)
  {
      if (t == TkName || t == TkString)
      {
          return Token.S;
      }

      if (t == TkNumber)
      {
          return $"{Token.N}";
      }

      if (t < FirstReserved)
      {
          return $"{(char)t}"; // TODO check for printable rune
      }

      var spelling = tokens[t - FirstReserved];
      return t < TkEos ? $"'{spelling}'" : spelling;
  }
  /// <summary>
  /// Renders a token code without consulting any token payload: single-character codes yield
  /// the character, codes up to <see cref="TkString"/> yield the quoted table spelling.
  /// </summary>
  public static string TokenRuteToString(int t)
  {
      if (t < FirstReserved)
      {
          return $"{(char)t}"; // TODO check for printable rune
      }

      var spelling = tokens[t - FirstReserved];
      return t <= TkString ? $"'{spelling}'" : spelling;
  }
  /// <summary>
  /// Throws a <c>LuaCompileException</c> for <paramref name="message"/>. When
  /// <paramref name="token"/> is non-zero the message gets a " near X" suffix and the error is
  /// positioned at the current token; otherwise it is positioned at the reader.
  /// </summary>
  public void ScanError(string message, int token)
  {
      Span<char> chunkNameBuffer = stackalloc char[59];
      var written = LuaDebug.WriteShortSource(Source, chunkNameBuffer);
      var chunkName = chunkNameBuffer.Slice(0, written).ToString();

      var errorPos = R.Position;
      if (token != 0)
      {
          var near = TokenToString(token);
          message = $"{message} near {near}";
          errorPos = Token.Pos;
      }

      // Column is measured from the position recorded after the last line break.
      var column = errorPos - lastNewLinePos + 1;
      throw new LuaCompileException(chunkName, new SourcePosition(LineNumber, column), errorPos - 1, message);
  }
  /// <summary>
  /// Throws a <c>LuaCompileException</c> for <paramref name="message"/> positioned at the
  /// caller-supplied <paramref name="pos"/>. A non-zero <paramref name="token"/> adds a
  /// " near X" suffix to the message.
  /// </summary>
  public void ScanError(int pos, string message, int token)
  {
      Span<char> chunkNameBuffer = stackalloc char[59];
      var written = LuaDebug.WriteShortSource(Source, chunkNameBuffer);
      var chunkName = chunkNameBuffer.Slice(0, written).ToString();

      if (token != 0)
      {
          var near = TokenToString(token);
          message = $"{message} near {near}";
      }

      // Column is measured from the position recorded after the last line break.
      var column = pos - lastNewLinePos + 1;
      throw new LuaCompileException(chunkName, new SourcePosition(LineNumber, column), pos - 1, message);
  }
  /// <summary>
  /// Consumes the line break at <see cref="Current"/> (a two-character "\r\n" or "\n\r" pair
  /// counts as one break), records where the new line starts, and bumps the line counter.
  /// Raises a syntax error once the counter reaches <c>MaxLine</c>.
  /// </summary>
  public void IncrementLineNumber()
  {
      var first = Current;
      Assert(IsNewLine(first));
      Advance();

      // A different line-break character right after the first one belongs to the same break.
      var isPairedBreak = IsNewLine(Current) && Current != first;
      if (isPairedBreak)
      {
          Advance();
      }

      lastNewLinePos = R.Position;
      LineNumber++;
      if (LineNumber >= MaxLine)
      {
          SyntaxError("chunk has too many lines");
      }
  }
  /// <summary>Reads the next character into <see cref="Current"/>, or <see cref="EndOfStream"/> when none is left.</summary>
  public void Advance()
  {
      if (R.TryRead(out var next))
      {
          Current = next;
      }
      else
      {
          Current = EndOfStream;
      }
  }

  /// <summary>Appends <see cref="Current"/> to the buffer, then moves to the next character.</summary>
  public void SaveAndAdvance()
  {
      Buffer.Append((char)Current);
      Advance();
  }

  /// <summary>Moves to the next character, then appends <paramref name="c"/> to the buffer.</summary>
  public void AdvanceAndSave(int c)
  {
      Advance();
      Buffer.Append((char)c);
  }

  /// <summary>Appends <paramref name="c"/>, narrowed to a char, to the buffer.</summary>
  public void Save(int c)
  {
      var asChar = (char)c;
      Buffer.Append(asChar);
  }
  /// <summary>
  /// If <see cref="Current"/> is one of the characters in <paramref name="str"/> (and not NUL),
  /// saves it, advances and returns true; otherwise leaves the scanner untouched and returns false.
  /// </summary>
  public bool CheckNext(string str)
  {
      var matches = Current != 0 && str.Contains((char)Current);
      if (matches)
      {
          SaveAndAdvance();
      }

      return matches;
  }
  /// <summary>
  /// Consumes a long-bracket prefix: the bracket at <see cref="Current"/> followed by any run of '='.
  /// Returns the number of '=' when the run is closed by the same bracket character, and
  /// <c>-(count) - 1</c> otherwise. Every consumed character is saved to the buffer.
  /// </summary>
  public int SkipSeparator()
  {
      var bracket = Current;
      Assert(bracket is '[' or ']');
      SaveAndAdvance();

      var level = 0;
      while (Current == '=')
      {
          SaveAndAdvance();
          level++;
      }

      return Current == bracket ? level : -level - 1;
  }
  /// <summary>
  /// Reads the body of a long bracket whose opening prefix (with <paramref name="sep"/> '=' signs)
  /// has already been consumed up to its second '['. Returns the enclosed text for a long string,
  /// or "" when <paramref name="comment"/> is true. Raises a scan error if the input ends first.
  /// </summary>
  public string ReadMultiLine(bool comment, int sep)
  {
      SaveAndAdvance(); // second '[' of the opening bracket
      if (IsNewLine(Current))
      {
          // A line break directly after the opening bracket is not part of the content.
          IncrementLineNumber();
      }

      while (true)
      {
          var ch = Current;
          if (ch == EndOfStream)
          {
              ScanError(comment ? "unfinished long comment" : "unfinished long string", TkEos);
          }
          else if (ch == ']')
          {
              if (SkipSeparator() != sep)
              {
                  continue; // not the matching close; the skipped characters stay in the buffer
              }

              SaveAndAdvance(); // second ']' of the closing bracket
              var text = "";
              if (!comment)
              {
                  // Strip the opening "[=*[" and closing "]=*]" from the buffered text.
                  text = Buffer.ToString(2 + sep, Buffer.Length - (4 + 2 * sep));
              }

              Buffer.Clear();
              return text;
          }
          else if (ch == '\r' || ch == '\n')
          {
              // Any line-break form is stored as a single '\n'.
              Save('\n');
              IncrementLineNumber();
          }
          else
          {
              if (!comment)
              {
                  Save(ch);
              }

              Advance();
          }
      }
  }
  /// <summary>
  /// Saves and consumes a run of decimal digits starting at <see cref="Current"/>.
  /// Returns the first character that is not a digit (left unconsumed in <see cref="Current"/>).
  /// </summary>
  public int ReadDigits()
  {
      while (IsDecimal(Current))
      {
          SaveAndAdvance();
      }

      return Current;
  }
  /// <summary>True when <paramref name="c"/> is an ASCII hexadecimal digit (0-9, a-f, A-F).</summary>
  public static bool IsHexadecimal(int c)
  {
      var isDigit = c >= '0' && c <= '9';
      var isLower = c >= 'a' && c <= 'f';
      var isUpper = c >= 'A' && c <= 'F';
      return isDigit || isLower || isUpper;
  }
  /// <summary>
  /// Consumes a run of hexadecimal digits (without saving them) and folds them into
  /// <paramref name="x"/> as <c>x * 16 + digit</c> per digit.
  /// Returns the accumulated value, the first non-hex character, and the number of digits consumed.
  /// </summary>
  public (double n, int c, int i) ReadHexNumber(double x)
  {
      var value = x;
      var count = 0;
      while (true)
      {
          var ch = Current;
          int digit;
          if (ch >= '0' && ch <= '9')
          {
              digit = ch - '0';
          }
          else if (ch >= 'a' && ch <= 'f')
          {
              digit = ch - 'a' + 10;
          }
          else if (ch >= 'A' && ch <= 'F')
          {
              digit = ch - 'A' + 10;
          }
          else
          {
              // First non-hex character ends the run; it stays in Current.
              return (value, ch, count);
          }

          Advance();
          value = value * 16.0 + digit;
          count++;
      }
  }
  /// <summary>
  /// Reads a numeric literal whose first digit is at <see cref="Current"/> and returns it as a
  /// number token positioned at <paramref name="pos"/>. Handles hexadecimal literals
  /// ("0x", optional fraction, optional binary 'p' exponent) and decimal literals
  /// (optional fraction, optional 'e' exponent). Raises "malformed number" on invalid input.
  /// </summary>
  public Token ReadNumber(int pos)
  {
      var c = Current;
      Assert(IsDecimal(c));
      SaveAndAdvance();
      if (c == '0' && CheckNext("Xx")) // hexadecimal
      {
          Buffer.Clear();
          var exponent = 0;
          (var fraction, c, var i) = ReadHexNumber(0);
          if (c == '.')
          {
              Advance();
              // The digit count of the fractional part becomes the (negative) binary exponent below.
              (fraction, c, exponent) = ReadHexNumber(fraction);
          }

          if (i == 0 && exponent == 0)
          {
              NumberError(); // "0x" with no digits at all
          }

          exponent *= -4; // each fractional hex digit is worth 4 bits
          if (c is 'p' or 'P')
          {
              Advance();
              var negativeExponent = false;
              c = Current;
              if (c is '+' or '-')
              {
                  negativeExponent = c == '-';
                  Advance();
              }

              if (!IsDecimal(Current))
              {
                  NumberError();
              }

              _ = ReadDigits();
              var exponentText = Buffer.ToString();
              Buffer.Clear();
              if (!long.TryParse(exponentText, NumberStyles.Float, CultureInfo.InvariantCulture, out long e))
              {
                  NumberError();
              }
              else
              {
                  // Clamp before narrowing: an unchecked (int) cast of a value >= 2^31 would wrap
                  // to an unrelated exponent. Anything this large already saturates Math.Pow.
                  const int maxMagnitude = int.MaxValue / 2;
                  var magnitude = (int)Math.Min(e, maxMagnitude);
                  exponent += negativeExponent ? -magnitude : magnitude;
              }
          }

          return new(pos, fraction * Math.Pow(2, exponent));
      }

      c = ReadDigits();
      if (c == '.')
      {
          SaveAndAdvance();
          c = ReadDigits();
      }

      if (c is 'e' or 'E')
      {
          SaveAndAdvance();
          c = Current;
          if (c is '+' or '-')
          {
              SaveAndAdvance();
          }

          _ = ReadDigits();
      }

      var str = Buffer.ToString();
      Buffer.Clear(); // cleared up front so the buffer is empty on the error path too

      if (str.Length > 0 && str[0] == '0')
      {
          if (str.Length == 1)
          {
              return new(pos, 0d);
          }

          str = str.TrimStart('0');
          // Restore one zero when trimming removed every digit before '.', 'e', or the end.
          // The empty check matters for literals such as "00", which trim to "".
          if (str.Length == 0 || !IsDecimal(str[0]))
          {
              str = "0" + str;
          }
      }

      if (!double.TryParse(str, NumberStyles.Float, CultureInfo.InvariantCulture, out double f))
      {
          NumberError();
      }

      return new(pos, f);
  }
  // Single-character escapes: the character that follows a backslash mapped to the character it denotes.
  static readonly Dictionary<int, char> escapes = new Dictionary<int, char>
  {
      ['a'] = '\a',
      ['b'] = '\b',
      ['f'] = '\f',
      ['n'] = '\n',
      ['r'] = '\r',
      ['t'] = '\t',
      ['v'] = '\v',
      ['\\'] = '\\',
      ['"'] = '"',
      ['\''] = '\'',
  };
  /// <summary>
  /// Reports a bad escape sequence. Rebuilds the offending text as '\...' from the characters in
  /// <paramref name="c"/> (stopping at an <see cref="EndOfStream"/> entry), installs it as the
  /// current string token, and raises a scan error with <paramref name="message"/> at <paramref name="pos"/>.
  /// </summary>
  public void EscapeError(int pos, ReadOnlySpan<int> c, string message)
  {
      Buffer.Clear();
      Buffer.Append('\'').Append('\\');
      for (var index = 0; index < c.Length && c[index] != EndOfStream; index++)
      {
          Buffer.Append((char)c[index]);
      }

      Buffer.Append('\'');
      var offending = Buffer.ToString();
      Buffer.Clear();

      Token = new(pos, TkString, offending);
      ScanError(pos, message, TkString);
  }
  /// <summary>
  /// Reads the two hexadecimal digits that follow the character at <see cref="Current"/>
  /// (the 'x' of a "\x" escape) and returns their value. Raises an escape error on the
  /// first character that is not a hex digit.
  /// </summary>
  public int ReadHexEscape()
  {
      Advance(); // step past the character the escape started on

      // Characters seen so far, kept only for the error message.
      Span<int> seen = stackalloc int[3] { 'x', 0, 0 };
      var result = 0;
      var errorPos = R.Position;
      for (var index = 1; index < seen.Length; index++)
      {
          var ch = Current;
          seen[index] = ch;

          var digit = ch;
          if (ch >= '0' && ch <= '9')
          {
              digit = ch - '0';
          }
          else if (ch >= 'a' && ch <= 'f')
          {
              digit = ch - ('a' - 10);
          }
          else if (ch >= 'A' && ch <= 'F')
          {
              digit = ch - ('A' - 10);
          }
          else
          {
              EscapeError(errorPos, seen.Slice(0, index + 1), "hexadecimal digit expected");
          }

          Advance();
          errorPos = R.Position;
          result = (result << 4) + digit;
      }

      return result;
  }
  /// <summary>
  /// Reads up to three decimal digits starting at <see cref="Current"/> and returns their value.
  /// Raises an escape error when the value exceeds 255.
  /// </summary>
  public int ReadDecimalEscape()
  {
      // Digits seen so far, kept only for the error message.
      Span<int> digits = stackalloc int[3] { 0, 0, 0 };
      var value = 0;
      var errorPos = R.Position;
      var count = 0;
      while (count < digits.Length && IsDecimal(Current))
      {
          var ch = Current;
          digits[count] = ch;
          count++;
          value = 10 * value + ch - '0';
          Advance();
          errorPos = R.Position;
      }

      if (value > 255)
      {
          EscapeError(errorPos, digits, "decimal escape too large");
      }

      return value;
  }
  /// <summary>
  /// Reads a quoted string whose opening delimiter is at <see cref="Current"/> and returns it as a
  /// string token with escape sequences resolved. Raises a scan error if the input or the line
  /// ends before the closing delimiter, or on an invalid escape.
  /// </summary>
  public Token ReadString()
  {
      var pos = R.Position;
      var delimiter = Current;
      SaveAndAdvance(); // opening delimiter stays in the buffer until the final substring

      while (Current != delimiter)
      {
          if (Current == EndOfStream)
          {
              Token = new(R.Position - Buffer.Length, TkString, Buffer.ToString());
              ScanError(R.Position, "unfinished string", TkEos);
              continue;
          }

          if (Current == '\n' || Current == '\r')
          {
              Token = new(R.Position - Buffer.Length, TkString, Buffer.ToString());
              ScanError(R.Position, "unfinished string", TkString);
              continue;
          }

          if (Current != '\\')
          {
              SaveAndAdvance();
              continue;
          }

          // Escape sequence: look at the character after the backslash.
          Advance();
          var escaped = Current;
          if (escapes.TryGetValue(escaped, out var replacement))
          {
              AdvanceAndSave(replacement);
              continue;
          }

          switch (escaped)
          {
              case '\n':
              case '\r':
                  // Backslash followed by a line break stores a single '\n'.
                  IncrementLineNumber();
                  Save('\n');
                  break;
              case EndOfStream:
                  // Nothing to do here; the next loop iteration reports the unfinished string.
                  break;
              case 'x':
                  Save(ReadHexEscape());
                  break;
              case 'z':
                  // "\z" drops the following run of whitespace, line breaks included.
                  Advance();
                  while (IsWhiteSpace(Current))
                  {
                      if (IsNewLine(Current))
                      {
                          IncrementLineNumber();
                      }
                      else
                      {
                          Advance();
                      }
                  }

                  break;
              default:
                  if (IsDecimal(escaped))
                  {
                      Save(ReadDecimalEscape());
                  }
                  else
                  {
                      EscapeError(R.Position - 1, [escaped], "invalid escape sequence");
                  }

                  break;
          }
      }

      SaveAndAdvance(); // closing delimiter

      // Drop the two delimiters that bracket the buffered text.
      var str = Buffer.ToString(1, Buffer.Length - 2);
      Buffer.Clear();
      return new(pos, TkString, str);
  }
  /// <summary>
  /// True when <paramref name="s"/> is one of the reserved words ("and" through "while").
  /// Only the first <see cref="ReservedCount"/> entries of the token table are consulted, so
  /// operator spellings ("..", "==") and placeholders ("&lt;eof&gt;", "&lt;name&gt;") are not reported as reserved.
  /// </summary>
  public static bool IsReserved(string s)
  {
      foreach (var reserved in Tokens.Slice(0, ReservedCount))
      {
          if (s == reserved)
          {
              return true;
          }
      }

      return false;
  }
  /// <summary>
  /// Turns the buffered identifier into a token: the matching reserved-word token when the text
  /// is a reserved word, otherwise a name token. The buffer is cleared. The token position is
  /// the reader position minus the identifier length.
  /// </summary>
  public Token ReservedOrName()
  {
      var pos = R.Position - Buffer.Length;
      var str = Buffer.ToString();
      Buffer.Clear();

      // Only the reserved words are candidates; the rest of the table holds operators and placeholders.
      for (var i = 0; i < ReservedCount; i++)
      {
          if (str == Tokens[i])
          {
              return new(pos, i + FirstReserved, str);
          }
      }

      return new(pos, TkName, str);
  }
  /// <summary>
  /// Scans and returns the next token, skipping whitespace, NUL characters, line breaks and
  /// comments. Returns a <see cref="TkEos"/> token at end of input.
  /// The token start position is refreshed after every skipped construct (whitespace, line
  /// break, comment) so that it always refers to the first character of the returned token.
  /// </summary>
  public Token Scan()
  {
      const bool comment = true, str = false;
      var pos = R.Position;
      while (true)
      {
          var c = Current;
          switch (c)
          {
              case '\n':
              case '\r':
                  IncrementLineNumber();
                  pos = R.Position; // the next token starts on the new line
                  break;
              case ' ':
              case '\f':
              case '\t':
              case '\v':
              case 0:
                  Advance();
                  pos = R.Position;
                  break;
              case '-':
                  Advance();
                  if (Current != '-')
                  {
                      return new(pos, '-');
                  }

                  // "--" starts a comment.
                  Advance();
                  if (Current == '[')
                  {
                      var sep = SkipSeparator();
                      if (sep >= 0)
                      {
                          _ = ReadMultiLine(comment, sep);
                          pos = R.Position; // the next token starts after the comment
                          break;
                      }

                      Buffer.Clear(); // not a long bracket: discard what SkipSeparator saved
                  }

                  // Short comment: skip to the end of the line (the break itself is handled above).
                  while (!IsNewLine(Current) && Current != EndOfStream)
                  {
                      Advance();
                  }

                  pos = R.Position;
                  break;
              case '[':
              {
                  var sep = SkipSeparator();
                  if (sep >= 0)
                  {
                      return new(pos, TkString, ReadMultiLine(str, sep));
                  }

                  Buffer.Clear();
                  if (sep == -1)
                  {
                      return new(pos, '['); // a plain '[' with no '=' run
                  }

                  ScanError("invalid long string delimiter", TkString);
                  break;
              }
              case '=':
                  Advance();
                  if (Current != '=')
                  {
                      return new(pos, '=');
                  }

                  Advance();
                  return new(pos, TkEq);
              case '<':
                  Advance();
                  if (Current != '=')
                  {
                      return new(pos, '<');
                  }

                  Advance();
                  return new(pos, TkLe);
              case '>':
                  Advance();
                  if (Current != '=')
                  {
                      return new(pos, '>');
                  }

                  Advance();
                  return new(pos, TkGe);
              case '~':
                  Advance();
                  if (Current != '=')
                  {
                      return new(pos, '~');
                  }

                  Advance();
                  return new(pos, TkNe);
              case ':':
                  Advance();
                  if (Current != ':')
                  {
                      return new(pos, ':');
                  }

                  Advance();
                  return new(pos, TkDoubleColon);
              case '"':
              case '\'':
                  return ReadString();
              case EndOfStream:
                  return new(pos, TkEos);
              case '.':
                  SaveAndAdvance();
                  if (CheckNext("."))
                  {
                      var isDots = CheckNext(".");
                      Buffer.Clear();
                      return isDots ? new(pos, TkDots) : new(pos, TkConcat);
                  }

                  if (!IsDigit(Current))
                  {
                      Buffer.Clear();
                      return new(pos, '.');
                  }

                  // ".5"-style number: the '.' is already in the buffer.
                  return ReadNumber(pos);
              default:
              {
                  if (IsDigit(c))
                  {
                      return ReadNumber(pos);
                  }

                  if (IsLetter(c))
                  {
                      do
                      {
                          SaveAndAdvance();
                      }
                      while (IsLetter(Current) || IsDigit(Current));

                      return ReservedOrName();
                  }

                  // Any other character is a single-character token.
                  Advance();
                  return new(pos, c);
              }
          }
      }
  }
  /// <summary>
  /// Moves to the next token: takes the buffered look-ahead token when there is one
  /// (and empties the look-ahead slot), otherwise scans a fresh token.
  /// Records the line number in <see cref="LastLine"/> first.
  /// </summary>
  public void Next()
  {
      LastLine = LineNumber;
      if (LookAheadToken.T == TkEos)
      {
          Token = Scan();
          return;
      }

      Token = LookAheadToken;
      LookAheadToken = new(0, TkEos); // TkEos marks the slot as empty
  }
  /// <summary>
  /// Scans one token into the look-ahead slot and returns its kind code.
  /// The slot must be empty (hold a <see cref="TkEos"/> token) when this is called.
  /// </summary>
  public int LookAhead()
  {
      Assert(LookAheadToken.T == TkEos);
      var ahead = Scan();
      LookAheadToken = ahead;
      return ahead.T;
  }
  /// <summary>
  /// If the current token has kind <paramref name="t"/>, consumes it and returns true;
  /// otherwise returns false without consuming anything.
  /// </summary>
  public bool TestNext(int t)
  {
      if (Token.T != t)
      {
          return false;
      }

      Next();
      return true;
  }
  /// <summary>
  /// Raises an "X expected" syntax error unless the current token has kind <paramref name="t"/>.
  /// The message is built from the full int token code: narrowing it to <c>char</c> would
  /// truncate every code at or above <see cref="FirstReserved"/> and print an unrelated character.
  /// </summary>
  public void Check(int t)
  {
      if (Token.T != t)
      {
          SyntaxError(TokenToString(t) + " expected");
      }
  }
  /// <summary>
  /// Consumes the token <paramref name="what"/> that closes the construct opened by
  /// <paramref name="who"/> on line <paramref name="where"/>. When it is missing, raises
  /// "X expected", adding "(to close Y at line N)" if the opener was on a different line.
  /// Both messages are built from the full int token code; narrowing it to <c>char</c> would
  /// truncate every code at or above <see cref="FirstReserved"/>.
  /// </summary>
  public void CheckMatch(int what, int who, int where)
  {
      if (TestNext(what))
      {
          return;
      }

      if (where == LineNumber)
      {
          SyntaxError(TokenToString(what) + " expected");
      }
      else
      {
          SyntaxError($"{TokenToString(what)} expected (to close {TokenToString(who)} at line {where})");
      }
  }
  // True for space, tab, line feed, carriage return, form feed and vertical tab.
  static bool IsWhiteSpace(int c)
  {
      switch (c)
      {
          case ' ':
          case '\t':
          case '\n':
          case '\r':
          case '\f':
          case '\v':
              return true;
          default:
              return false;
      }
  }

  // True for the ASCII digits '0'..'9'.
  static bool IsDigit(int c)
  {
      return c >= '0' && c <= '9';
  }

  // True for '_' and the ASCII letters; this is the set of characters that may start a name.
  static bool IsLetter(int c)
  {
      if (c >= ushort.MaxValue)
      {
          return false;
      }

      return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  }
  699. }