// Lexer.cs
  1. using System.Runtime.CompilerServices;
  2. using Lua.Internal;
  3. namespace Lua.CodeAnalysis.Syntax;
  4. public ref struct Lexer
  5. {
  6. public required ReadOnlyMemory<char> Source { get; init; }
  7. public string? ChunkName { get; init; }
  8. SyntaxToken current;
  9. SourcePosition position = new(1, 0);
  10. int offset;
  11. public Lexer()
  12. {
  13. }
  14. public readonly SyntaxToken Current => current;
  15. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  16. void Advance(int count)
  17. {
  18. var span = Source.Span;
  19. for (int i = 0; i < count; i++)
  20. {
  21. if (offset >= span.Length)
  22. {
  23. LuaParseException.SyntaxError(ChunkName, position, null);
  24. }
  25. var c = span[offset];
  26. offset++;
  27. var isLF = c is '\n';
  28. var isCR = c is '\r' && (span.Length == offset || span[offset] is not '\n');
  29. if (isLF || isCR)
  30. {
  31. position.Column = 0;
  32. position.Line++;
  33. }
  34. else
  35. {
  36. position.Column++;
  37. }
  38. }
  39. }
  40. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  41. bool TryRead(int offset, out char value)
  42. {
  43. if (Source.Length <= offset)
  44. {
  45. value = default;
  46. return false;
  47. }
  48. value = Source.Span[offset];
  49. return true;
  50. }
  51. public bool MoveNext()
  52. {
  53. if (Source.Length <= offset) return false;
  54. var span = Source.Span;
  55. var startOffset = offset;
  56. var position = this.position;
  57. var c1 = span[offset];
  58. Advance(1);
  59. var c2 = span.Length == offset ? char.MinValue : span[offset];
  60. switch (c1)
  61. {
  62. case ' ':
  63. case '\t':
  64. return MoveNext();
  65. case '\n':
  66. current = SyntaxToken.EndOfLine(position);
  67. return true;
  68. case '\r':
  69. if (c2 == '\n') Advance(1);
  70. current = SyntaxToken.EndOfLine(position);
  71. return true;
  72. case '(':
  73. current = SyntaxToken.LParen(position);
  74. return true;
  75. case ')':
  76. current = SyntaxToken.RParen(position);
  77. return true;
  78. case '{':
  79. current = SyntaxToken.LCurly(position);
  80. return true;
  81. case '}':
  82. current = SyntaxToken.RCurly(position);
  83. return true;
  84. case ']':
  85. current = SyntaxToken.RSquare(position);
  86. return true;
  87. case '+':
  88. current = SyntaxToken.Addition(position);
  89. return true;
  90. case '-':
  91. // comment
  92. if (c2 == '-')
  93. {
  94. var pos = position;
  95. Advance(1);
  96. // block comment
  97. if (span.Length > offset + 1 && span[offset] is '[' && span[offset + 1] is '[' or '=')
  98. {
  99. Advance(1);
  100. (_, _, var isTerminated) = ReadUntilLongBracketEnd(ref span);
  101. if (!isTerminated) LuaParseException.UnfinishedLongComment(ChunkName, pos);
  102. }
  103. else // line comment
  104. {
  105. ReadUntilEOL(ref span, ref offset, out _);
  106. }
  107. return MoveNext();
  108. }
  109. else
  110. {
  111. current = SyntaxToken.Subtraction(position);
  112. return true;
  113. }
  114. case '*':
  115. current = SyntaxToken.Multiplication(position);
  116. return true;
  117. case '/':
  118. current = SyntaxToken.Division(position);
  119. return true;
  120. case '%':
  121. current = SyntaxToken.Modulo(position);
  122. return true;
  123. case '^':
  124. current = SyntaxToken.Exponentiation(position);
  125. return true;
  126. case '=':
  127. if (c2 == '=')
  128. {
  129. current = SyntaxToken.Equality(position);
  130. Advance(1);
  131. }
  132. else
  133. {
  134. current = SyntaxToken.Assignment(position);
  135. }
  136. return true;
  137. case '~':
  138. if (c2 == '=')
  139. {
  140. current = SyntaxToken.Inequality(position);
  141. Advance(1);
  142. }
  143. else
  144. {
  145. throw new LuaParseException(ChunkName, position, $"error: Invalid '~' token");
  146. }
  147. return true;
  148. case '>':
  149. if (c2 == '=')
  150. {
  151. current = SyntaxToken.GreaterThanOrEqual(position);
  152. Advance(1);
  153. }
  154. else
  155. {
  156. current = SyntaxToken.GreaterThan(position);
  157. }
  158. return true;
  159. case '<':
  160. if (c2 == '=')
  161. {
  162. current = SyntaxToken.LessThanOrEqual(position);
  163. Advance(1);
  164. }
  165. else
  166. {
  167. current = SyntaxToken.LessThan(position);
  168. }
  169. return true;
  170. case '.':
  171. if (c2 == '.')
  172. {
  173. var c3 = span.Length == (offset + 1) ? char.MinValue : span[offset + 1];
  174. if (c3 == '.')
  175. {
  176. // vararg
  177. current = SyntaxToken.VarArg(position);
  178. Advance(2);
  179. }
  180. else
  181. {
  182. // concat
  183. current = SyntaxToken.Concat(position);
  184. Advance(1);
  185. }
  186. return true;
  187. }
  188. if (!StringHelper.IsNumber(c2))
  189. {
  190. current = SyntaxToken.Dot(position);
  191. return true;
  192. }
  193. break;
  194. case '#':
  195. current = SyntaxToken.Length(position);
  196. return true;
  197. case ',':
  198. current = SyntaxToken.Comma(position);
  199. return true;
  200. case ';':
  201. current = SyntaxToken.SemiColon(position);
  202. return true;
  203. }
  204. // numeric literal
  205. if (c1 is '.' || StringHelper.IsNumber(c1))
  206. {
  207. if (c1 is '0' && c2 is 'x' or 'X') // hex 0x
  208. {
  209. Advance(1);
  210. if (span[offset] is '.') Advance(1);
  211. ReadDigit(ref span, ref offset, out var readCount);
  212. if (span.Length > offset && span[offset] is '.')
  213. {
  214. Advance(1);
  215. ReadDigit(ref span, ref offset, out _);
  216. }
  217. if (span.Length > offset && span[offset] is 'p' or 'P')
  218. {
  219. Advance(1);
  220. if (span[offset] is '-' or '+') Advance(1);
  221. ReadDigit(ref span, ref offset, out _);
  222. }
  223. if (readCount == 0)
  224. {
  225. throw new LuaParseException(ChunkName, this.position, $"error: Illegal hexadecimal number");
  226. }
  227. }
  228. else
  229. {
  230. ReadNumber(ref span, ref offset, out _);
  231. if (span.Length > offset && span[offset] is '.')
  232. {
  233. Advance(1);
  234. ReadNumber(ref span, ref offset, out _);
  235. }
  236. if (span.Length > offset && span[offset] is 'e' or 'E')
  237. {
  238. Advance(1);
  239. if (span[offset] is '-' or '+') Advance(1);
  240. ReadNumber(ref span, ref offset, out _);
  241. }
  242. }
  243. current = new(SyntaxTokenType.Number, Source[startOffset..offset], position);
  244. return true;
  245. }
  246. // label
  247. if (c1 is ':')
  248. {
  249. if (c2 is ':')
  250. {
  251. var stringStartOffset = offset + 1;
  252. Advance(2);
  253. var prevC = char.MinValue;
  254. while (span.Length > offset)
  255. {
  256. var c = span[offset];
  257. if (prevC == ':' && c == ':') break;
  258. Advance(1);
  259. prevC = c;
  260. }
  261. current = SyntaxToken.Label(Source[stringStartOffset..(offset - 1)], position);
  262. Advance(1);
  263. }
  264. else
  265. {
  266. current = SyntaxToken.Colon(position);
  267. }
  268. return true;
  269. }
  270. // short string literal
  271. if (c1 is '"' or '\'')
  272. {
  273. var quote = c1;
  274. var stringStartOffset = offset;
  275. var isTerminated = false;
  276. while (span.Length > offset)
  277. {
  278. var c = span[offset];
  279. if (c is '\n' or '\r')
  280. {
  281. break;
  282. }
  283. if (c is '\\')
  284. {
  285. Advance(1);
  286. if (span.Length <= offset) break;
  287. if (span[offset] == '\r')
  288. {
  289. if (span.Length<=offset +1) continue;
  290. if (span[offset+1] == '\n')Advance(1);
  291. }
  292. }
  293. else if (c == quote)
  294. {
  295. isTerminated = true;
  296. break;
  297. }
  298. Advance(1);
  299. }
  300. if (!isTerminated)
  301. {
  302. throw new LuaParseException(ChunkName, this.position, "error: Unterminated string");
  303. }
  304. current = SyntaxToken.String(Source[stringStartOffset..offset], position);
  305. Advance(1);
  306. return true;
  307. }
  308. // long string literal
  309. if (c1 is '[')
  310. {
  311. if (c2 is '[' or '=')
  312. {
  313. (var start, var end, var isTerminated) = ReadUntilLongBracketEnd(ref span);
  314. if (!isTerminated)
  315. {
  316. throw new LuaParseException(ChunkName, this.position, "error: Unterminated string");
  317. }
  318. current = SyntaxToken.RawString(Source[start..end], position);
  319. return true;
  320. }
  321. else
  322. {
  323. current = SyntaxToken.LSquare(position);
  324. return true;
  325. }
  326. }
  327. // identifier
  328. if (IsIdentifier(c1))
  329. {
  330. while (span.Length > offset && IsIdentifier(span[offset]))
  331. {
  332. Advance(1);
  333. }
  334. var identifier = Source[startOffset..offset];
  335. current = identifier.Span switch
  336. {
  337. Keywords.Nil => SyntaxToken.Nil(position),
  338. Keywords.True => SyntaxToken.True(position),
  339. Keywords.False => SyntaxToken.False(position),
  340. Keywords.And => SyntaxToken.And(position),
  341. Keywords.Or => SyntaxToken.Or(position),
  342. Keywords.Not => SyntaxToken.Not(position),
  343. Keywords.End => SyntaxToken.End(position),
  344. Keywords.Then => SyntaxToken.Then(position),
  345. Keywords.If => SyntaxToken.If(position),
  346. Keywords.ElseIf => SyntaxToken.ElseIf(position),
  347. Keywords.Else => SyntaxToken.Else(position),
  348. Keywords.Local => SyntaxToken.Local(position),
  349. Keywords.Return => SyntaxToken.Return(position),
  350. Keywords.Goto => SyntaxToken.Goto(position),
  351. Keywords.Do => SyntaxToken.Do(position),
  352. Keywords.In => SyntaxToken.In(position),
  353. Keywords.While => SyntaxToken.While(position),
  354. Keywords.Repeat => SyntaxToken.Repeat(position),
  355. Keywords.For => SyntaxToken.For(position),
  356. Keywords.Until => SyntaxToken.Until(position),
  357. Keywords.Break => SyntaxToken.Break(position),
  358. Keywords.Function => SyntaxToken.Function(position),
  359. _ => new(SyntaxTokenType.Identifier, identifier, position),
  360. };
  361. return true;
  362. }
  363. throw new LuaParseException(ChunkName, position, $"unexpected symbol near '{c1}'");
  364. }
  365. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  366. void ReadUntilEOL(ref ReadOnlySpan<char> span, ref int offset, out int readCount)
  367. {
  368. readCount = 0;
  369. var flag = true;
  370. while (flag)
  371. {
  372. if (span.Length <= offset) return;
  373. var c1 = span[offset];
  374. if (c1 is '\n')
  375. {
  376. flag = false;
  377. }
  378. else if (c1 is '\r')
  379. {
  380. var c2 = span.Length == offset + 1 ? char.MinValue : span[offset + 1];
  381. if (c2 is '\n')
  382. {
  383. Advance(1);
  384. readCount++;
  385. }
  386. flag = false;
  387. }
  388. Advance(1);
  389. readCount++;
  390. }
  391. }
  392. (int Start, int End, bool IsTerminated) ReadUntilLongBracketEnd(ref ReadOnlySpan<char> span)
  393. {
  394. var c = span[offset];
  395. var level = 0;
  396. while (c is '=')
  397. {
  398. level++;
  399. Advance(1);
  400. c = span[offset];
  401. }
  402. Advance(1);
  403. var startOffset = offset;
  404. var endOffset = 0;
  405. var isTerminated = false;
  406. var prevC = char.MinValue;
  407. while (span.Length > offset + level + 1)
  408. {
  409. var current = span[offset];
  410. // skip first newline
  411. if (offset == startOffset)
  412. {
  413. if (current == '\r')
  414. {
  415. startOffset += 2;
  416. Advance(span[offset + 1] == '\n' ? 2 : 1);
  417. continue;
  418. }
  419. else if (current == '\n')
  420. {
  421. startOffset++;
  422. Advance(1);
  423. continue;
  424. }
  425. }
  426. if (current is ']' && prevC is not '\\')
  427. {
  428. endOffset = offset;
  429. for (int i = 1; i <= level; i++)
  430. {
  431. if (span[offset + i] is not '=') goto CONTINUE;
  432. }
  433. if (span[offset + level + 1] is not ']') goto CONTINUE;
  434. Advance(level + 2);
  435. isTerminated = true;
  436. break;
  437. }
  438. CONTINUE:
  439. prevC = current;
  440. Advance(1);
  441. }
  442. return (startOffset, endOffset, isTerminated);
  443. }
  444. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  445. void ReadDigit(ref ReadOnlySpan<char> span, ref int offset, out int readCount)
  446. {
  447. readCount = 0;
  448. while (span.Length > offset && StringHelper.IsDigit(span[offset]))
  449. {
  450. Advance(1);
  451. readCount++;
  452. }
  453. }
  454. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  455. void ReadNumber(ref ReadOnlySpan<char> span, ref int offset, out int readCount)
  456. {
  457. readCount = 0;
  458. while (span.Length > offset && StringHelper.IsNumber(span[offset]))
  459. {
  460. Advance(1);
  461. readCount++;
  462. }
  463. }
  464. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  465. static bool IsIdentifier(char c)
  466. {
  467. return c == '_' ||
  468. ('A' <= c && c <= 'Z') ||
  469. ('a' <= c && c <= 'z') ||
  470. StringHelper.IsNumber(c);
  471. }
  472. }