// tokenizer.cpp
#define TOKEN_KINDS \
    TOKEN_KIND(Token_Invalid, "Invalid"), \
    TOKEN_KIND(Token_EOF, "EOF"), \
    \
    TOKEN_KIND(Token__LiteralBegin, "_LiteralBegin"), \
    TOKEN_KIND(Token_Identifier, "Identifier"), \
    TOKEN_KIND(Token_Integer, "Integer"), \
    TOKEN_KIND(Token_Float, "Float"), \
    TOKEN_KIND(Token_Rune, "Rune"), \
    TOKEN_KIND(Token_String, "String"), \
    TOKEN_KIND(Token__LiteralEnd, "_LiteralEnd"), \
    \
    TOKEN_KIND(Token__OperatorBegin, "_OperatorBegin"), \
    TOKEN_KIND(Token_Eq, "="), \
    TOKEN_KIND(Token_Not, "!"), \
    TOKEN_KIND(Token_Hash, "#"), \
    TOKEN_KIND(Token_At, "@"), \
    TOKEN_KIND(Token_Pointer, "^"), \
    TOKEN_KIND(Token_Add, "+"), \
    TOKEN_KIND(Token_Sub, "-"), \
    TOKEN_KIND(Token_Mul, "*"), \
    TOKEN_KIND(Token_Quo, "/"), \
    TOKEN_KIND(Token_Mod, "%"), \
    TOKEN_KIND(Token_And, "&"), \
    TOKEN_KIND(Token_Or, "|"), \
    TOKEN_KIND(Token_Xor, "~"), \
    TOKEN_KIND(Token_AndNot, "&~"), \
    TOKEN_KIND(Token_Shl, "<<"), \
    TOKEN_KIND(Token_Shr, ">>"), \
    \
    TOKEN_KIND(Token_as, "as"), \
    TOKEN_KIND(Token_transmute, "transmute"), \
    \
    TOKEN_KIND(Token__AssignOpBegin, "_AssignOpBegin"), \
    TOKEN_KIND(Token_AddEq, "+="), \
    TOKEN_KIND(Token_SubEq, "-="), \
    TOKEN_KIND(Token_MulEq, "*="), \
    TOKEN_KIND(Token_QuoEq, "/="), \
    TOKEN_KIND(Token_ModEq, "%="), \
    TOKEN_KIND(Token_AndEq, "&="), \
    TOKEN_KIND(Token_OrEq, "|="), \
    TOKEN_KIND(Token_XorEq, "~="), \
    TOKEN_KIND(Token_AndNotEq, "&~="), \
    TOKEN_KIND(Token_ShlEq, "<<="), \
    TOKEN_KIND(Token_ShrEq, ">>="), \
    TOKEN_KIND(Token__AssignOpEnd, "_AssignOpEnd"), \
    TOKEN_KIND(Token_Increment, "++"), \
    TOKEN_KIND(Token_Decrement, "--"), \
    TOKEN_KIND(Token_ArrowRight, "->"), \
    TOKEN_KIND(Token_ArrowLeft, "<-"), \
    \
    TOKEN_KIND(Token_CmpAnd, "&&"), \
    TOKEN_KIND(Token_CmpOr, "||"), \
    TOKEN_KIND(Token_CmpAndEq, "&&="), \
    TOKEN_KIND(Token_CmpOrEq, "||="), \
    \
    TOKEN_KIND(Token__ComparisonBegin, "_ComparisonBegin"), \
    TOKEN_KIND(Token_CmpEq, "=="), \
    TOKEN_KIND(Token_NotEq, "!="), \
    TOKEN_KIND(Token_Lt, "<"), \
    TOKEN_KIND(Token_Gt, ">"), \
    TOKEN_KIND(Token_LtEq, "<="), \
    TOKEN_KIND(Token_GtEq, ">="), \
    TOKEN_KIND(Token__ComparisonEnd, "_ComparisonEnd"), \
    \
    TOKEN_KIND(Token_OpenParen, "("), \
    TOKEN_KIND(Token_CloseParen, ")"), \
    TOKEN_KIND(Token_OpenBracket, "["), \
    TOKEN_KIND(Token_CloseBracket, "]"), \
    TOKEN_KIND(Token_OpenBrace, "{"), \
    TOKEN_KIND(Token_CloseBrace, "}"), \
    TOKEN_KIND(Token_Colon, ":"), \
    TOKEN_KIND(Token_Semicolon, ";"), \
    TOKEN_KIND(Token_Period, "."), \
    TOKEN_KIND(Token_Comma, ","), \
    TOKEN_KIND(Token_Ellipsis, ".."), \
    TOKEN_KIND(Token__OperatorEnd, "_OperatorEnd"), \
    \
    TOKEN_KIND(Token__KeywordBegin, "_KeywordBegin"), \
    TOKEN_KIND(Token_type, "type"), \
    TOKEN_KIND(Token_alias, "alias"), \
    TOKEN_KIND(Token_proc, "proc"), \
    TOKEN_KIND(Token_match, "match"), \
    TOKEN_KIND(Token_break, "break"), \
    TOKEN_KIND(Token_continue, "continue"), \
    TOKEN_KIND(Token_fallthrough, "fallthrough"), \
    TOKEN_KIND(Token_case, "case"), \
    TOKEN_KIND(Token_then, "then"), \
    TOKEN_KIND(Token_if, "if"), \
    TOKEN_KIND(Token_else, "else"), \
    TOKEN_KIND(Token_for, "for"), \
    TOKEN_KIND(Token_defer, "defer"), \
    TOKEN_KIND(Token_return, "return"), \
    TOKEN_KIND(Token_struct, "struct"), \
    TOKEN_KIND(Token_union, "union"), \
    TOKEN_KIND(Token_enum, "enum"), \
    TOKEN_KIND(Token__KeywordEnd, "_KeywordEnd"), \
    TOKEN_KIND(Token_Count, "")

enum TokenKind {
#define TOKEN_KIND(e, s) e
    TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
    TOKEN_KINDS
#undef TOKEN_KIND
};
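
// NOTE: TOKEN_KINDS is an "X macro": the list is expanded twice with two
// different definitions of TOKEN_KIND, once to declare the enum and once to
// build the parallel string table, so the two can never drift apart.
// A sketch of roughly what the preprocessor produces (abridged):
#if 0
enum TokenKind {
    Token_Invalid,   // from TOKEN_KIND(Token_Invalid, "Invalid")
    Token_EOF,       // from TOKEN_KIND(Token_EOF, "EOF")
    // ...
};
String const token_strings[] = {
    {cast(u8 *)"Invalid", 7},
    {cast(u8 *)"EOF", 3},
    // ...
};
#endif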
struct TokenPos {
    String file;
    isize line, column;
};

i32 token_pos_cmp(TokenPos a, TokenPos b) {
    if (a.line == b.line) {
        if (a.column == b.column) {
            isize min_len = gb_min(a.file.len, b.file.len);
            return gb_memcompare(a.file.text, b.file.text, min_len);
        }
        return (a.column < b.column) ? -1 : +1;
    }
    return (a.line < b.line) ? -1 : +1;
}

b32 token_pos_are_equal(TokenPos a, TokenPos b) {
    return token_pos_cmp(a, b) == 0;
}
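
// NOTE: token_pos_cmp orders positions by line, then column, and only falls
// back to comparing file names as a tie-break. A small illustrative sketch:
#if 0
TokenPos a = {{cast(u8 *)"x.odin", 6}, 3, 1};
TokenPos b = {{cast(u8 *)"x.odin", 6}, 3, 9};
GB_ASSERT(token_pos_cmp(a, b) < 0); // Same line, earlier column sorts first
#endif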
// NOTE(bill): Text is UTF-8, thus why u8 and not char
struct Token {
    TokenKind kind;
    String string;
    TokenPos pos;
};

Token empty_token = {Token_Invalid};

struct ErrorCollector {
    TokenPos prev;
    i64 count;
};

gb_no_inline void error(ErrorCollector *ec, Token token, char *fmt, ...) {
    ec->count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_are_equal(ec->prev, token.pos)) {
        ec->prev = token.pos;
        va_list va;
        va_start(va, fmt);
        gb_printf_err("%.*s(%td:%td) Error: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
        va_end(va);
    }
}

gb_no_inline void warning(Token token, char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    gb_printf_err("%.*s(%td:%td) Warning: %s\n",
                  LIT(token.pos.file), token.pos.line, token.pos.column,
                  gb_bprintf_va(fmt, va));
    va_end(va);
}
i32 token_precedence(Token t) {
    switch (t.kind) {
    case Token_CmpOr:
        return 1;
    case Token_CmpAnd:
        return 2;
    case Token_CmpEq:
    case Token_NotEq:
    case Token_Lt:
    case Token_Gt:
    case Token_LtEq:
    case Token_GtEq:
        return 3;
    case Token_Add:
    case Token_Sub:
    case Token_Or:
    case Token_Xor:
        return 4;
    case Token_Mul:
    case Token_Quo:
    case Token_Mod:
    case Token_And:
    case Token_AndNot:
    case Token_Shl:
    case Token_Shr:
        return 5;
    case Token_as:
    case Token_transmute:
        return 6;
    }
    return 0;
}
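
// NOTE: Higher precedence binds tighter; 0 means "not a binary operator".
// A parser would drive this with precedence climbing; a minimal sketch in
// which Parser, AstNode, parse_unary_expr, next_token and make_binary_node
// are all hypothetical names (call with min_prec = 1):
#if 0
AstNode *parse_binary_expr(Parser *p, i32 min_prec) {
    AstNode *left = parse_unary_expr(p);
    while (token_precedence(p->curr_token) >= min_prec) {
        Token op = p->curr_token;
        next_token(p);
        // Left-associative: parse the right side one level tighter
        AstNode *right = parse_binary_expr(p, token_precedence(op)+1);
        left = make_binary_node(p, op, left, right);
    }
    return left;
}
#endif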
gb_inline b32 token_is_literal(Token t) {
    return gb_is_between(t.kind, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline b32 token_is_operator(Token t) {
    return gb_is_between(t.kind, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline b32 token_is_keyword(Token t) {
    return gb_is_between(t.kind, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline b32 token_is_comparison(Token t) {
    return gb_is_between(t.kind, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline b32 token_is_shift(Token t) {
    return t.kind == Token_Shl || t.kind == Token_Shr;
}

gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }
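
// NOTE: The Token__*Begin/Token__*End kinds are sentinels that exist only so
// these range checks work; inserting a new kind between a Begin/End pair in
// TOKEN_KINDS automatically updates the matching predicate, e.g.
#if 0
Token tok = {Token_if};
GB_ASSERT(token_is_keyword(tok)); // Token__KeywordBegin < Token_if < Token__KeywordEnd
#endif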
enum TokenizerInitError {
    TokenizerInit_None,

    TokenizerInit_Invalid,
    TokenizerInit_NotExists,
    TokenizerInit_Permission,
    TokenizerInit_Empty,

    TokenizerInit_Count,
};

struct Tokenizer {
    String fullpath;
    u8 *start;
    u8 *end;

    Rune  curr_rune;  // current character
    u8 *  curr;       // character pos
    u8 *  read_curr;  // pos from start
    u8 *  line;       // current line pos
    isize line_count;

    isize error_count;
    gbArray(String) allocated_strings;
};
#define tokenizer_err(t, msg, ...) tokenizer_err_(t, __FUNCTION__, msg, ##__VA_ARGS__)
void tokenizer_err_(Tokenizer *t, char *function, char *msg, ...) {
    va_list va;
    // Column of the offending rune itself (t->curr), matching how token
    // columns are computed elsewhere; t->read_curr already points past it
    isize column = t->curr - t->line + 1;
    if (column < 1)
        column = 1;

#if 0
    gb_printf_err("%s()\n", function);
#endif
    // fullpath is a String, not a C string, so print it with %.*s/LIT
    gb_printf_err("%.*s(%td:%td) ", LIT(t->fullpath), t->line_count, column);
    va_start(va, msg);
    gb_printf_err_va(msg, va);
    va_end(va);
    gb_printf_err("\n");

    t->error_count++;
}
void advance_to_next_rune(Tokenizer *t) {
    if (t->read_curr < t->end) {
        Rune rune;
        isize width = 1;

        t->curr = t->read_curr;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }
        rune = *t->read_curr;
        if (rune == 0) {
            tokenizer_err(t, "Illegal character NUL");
        } else if (rune >= 0x80) { // not ASCII
            width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
            if (rune == GB_RUNE_INVALID && width == 1)
                tokenizer_err(t, "Illegal UTF-8 encoding");
            else if (rune == GB_RUNE_BOM && t->curr-t->start > 0)
                tokenizer_err(t, "Illegal byte order mark");
        }
        t->read_curr += width;
        t->curr_rune = rune;
    } else {
        t->curr = t->end;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }
        t->curr_rune = GB_RUNE_EOF;
    }
}
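
// NOTE: The cursor invariant: curr points at the first byte of curr_rune and
// read_curr at the byte after it, so read_curr - curr is the rune's width in
// bytes. Stepping over "a€b" (U+20AC is 3 bytes in UTF-8) therefore goes:
//   curr_rune 'a': curr = +0, read_curr = +1
//   curr_rune '€': curr = +1, read_curr = +4  (gb_utf8_decode gave width 3)
//   curr_rune 'b': curr = +4, read_curr = +5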
TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
    char *c_str = gb_alloc_array(gb_heap_allocator(), char, fullpath.len+1);
    memcpy(c_str, fullpath.text, fullpath.len);
    c_str[fullpath.len] = '\0';
    defer (gb_free(gb_heap_allocator(), c_str));

    gbFileContents fc = gb_file_read_contents(gb_heap_allocator(), true, c_str);
    gb_zero_item(t);
    if (fc.data != NULL) {
        t->start = cast(u8 *)fc.data;
        t->line = t->read_curr = t->curr = t->start;
        t->end = t->start + fc.size;
        t->fullpath = fullpath;
        t->line_count = 1;

        advance_to_next_rune(t);
        if (t->curr_rune == GB_RUNE_BOM)
            advance_to_next_rune(t); // Ignore BOM at file beginning

        gb_array_init(t->allocated_strings, gb_heap_allocator());
        return TokenizerInit_None;
    }

    // The read failed; reopen the file purely to diagnose why
    gbFile f = {};
    gbFileError err = gb_file_open(&f, c_str);
    defer (gb_file_close(&f));
    switch (err) {
    case gbFileError_Invalid:    return TokenizerInit_Invalid;
    case gbFileError_NotExists:  return TokenizerInit_NotExists;
    case gbFileError_Permission: return TokenizerInit_Permission;
    }
    if (gb_file_size(&f) == 0)
        return TokenizerInit_Empty;
    // Read failed for some other reason; do not report success
    return TokenizerInit_Invalid;
}
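
// NOTE: Typical driving loop for this API; a minimal sketch with error
// handling elided (tokenize_file is a hypothetical caller):
#if 0
void tokenize_file(String path) {
    Tokenizer t = {};
    if (init_tokenizer(&t, path) == TokenizerInit_None) {
        for (;;) {
            Token tok = tokenizer_get_token(&t);
            if (tok.kind == Token_EOF)
                break;
            print_token(tok);
        }
        destroy_tokenizer(&t);
    }
}
#endif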
gb_inline void destroy_tokenizer(Tokenizer *t) {
    if (t->start != NULL) {
        gb_free(gb_heap_allocator(), t->start);
    }
    if (t->allocated_strings != NULL) {
        gb_array_free(t->allocated_strings);
    }
}
void tokenizer_skip_whitespace(Tokenizer *t) {
    for (;;) {
        if (rune_is_whitespace(t->curr_rune)) {
            advance_to_next_rune(t);
        } else if (t->curr_rune == '/') {
            if (t->read_curr[0] == '/') { // Line comment //
                // Stop at EOF too, or a comment on the last line would loop forever
                while (t->curr_rune != '\n' && t->curr_rune >= 0)
                    advance_to_next_rune(t);
            } else if (t->read_curr[0] == '*') { // (Nested) Block comment /**/
                isize comment_scope = 1;
                advance_to_next_rune(t); // Skip the opening `/`
                advance_to_next_rune(t); // Skip the opening `*`
                while (comment_scope > 0) {
                    if (t->curr_rune < 0) { // Hit EOF inside the comment
                        tokenizer_err(t, "Block comment not terminated");
                        break;
                    }
                    // Peek one rune ahead so `/**/` and `*/` at the very end
                    // are matched correctly
                    if (t->curr_rune == '/' && t->read_curr[0] == '*') {
                        advance_to_next_rune(t);
                        comment_scope++;
                    } else if (t->curr_rune == '*' && t->read_curr[0] == '/') {
                        advance_to_next_rune(t);
                        comment_scope--;
                    }
                    advance_to_next_rune(t);
                }
            } else {
                break;
            }
        } else {
            break;
        }
    }
}
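
// NOTE: Unlike C, block comments nest here: comment_scope counts unmatched
// openers, so an input such as
//     /* outer /* inner */ still inside the outer comment */
// is skipped as a single comment; scanning only resumes once the final
// closer brings comment_scope back to 0.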
gb_inline i32 digit_value(Rune r) {
    if (gb_char_is_digit(cast(char)r))
        return r - '0';
    if (gb_is_between(cast(char)r, 'a', 'f'))
        return r - 'a' + 10;
    if (gb_is_between(cast(char)r, 'A', 'F'))
        return r - 'A' + 10;
    return 16; // NOTE(bill): Larger than highest possible
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
    // TODO(bill): Allow for underscores in numbers as a number separator
    // TODO(bill): Is this a good idea?
    // while (digit_value(t->curr_rune) < base || t->curr_rune == '_')
    while (digit_value(t->curr_rune) < base)
        advance_to_next_rune(t);
}
Token scan_number_to_token(Tokenizer *t, b32 seen_decimal_point) {
    Token token = {};
    u8 *start_curr = t->curr;

    token.kind = Token_Integer;
    token.string = make_string(start_curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr-t->line+1;

    if (seen_decimal_point) {
        // Include the already-consumed `.` in the token text
        start_curr--;
        token.string.text = start_curr;
        token.kind = Token_Float;
        scan_mantissa(t, 10);
        goto exponent;
    }

    if (t->curr_rune == '0') {
        u8 *prev = t->curr;
        advance_to_next_rune(t);
        if (t->curr_rune == 'b') { // Binary
            advance_to_next_rune(t);
            scan_mantissa(t, 2);
            if (t->curr - prev <= 2) // Just `0b` with no digits
                token.kind = Token_Invalid;
        } else if (t->curr_rune == 'o') { // Octal
            advance_to_next_rune(t);
            scan_mantissa(t, 8);
            if (t->curr - prev <= 2)
                token.kind = Token_Invalid;
        } else if (t->curr_rune == 'd') { // Decimal
            advance_to_next_rune(t);
            scan_mantissa(t, 10);
            if (t->curr - prev <= 2)
                token.kind = Token_Invalid;
        } else if (t->curr_rune == 'x') { // Hexadecimal
            advance_to_next_rune(t);
            scan_mantissa(t, 16);
            if (t->curr - prev <= 2)
                token.kind = Token_Invalid;
        } else {
            seen_decimal_point = false;
            scan_mantissa(t, 10);
            if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
                seen_decimal_point = true;
                goto fraction;
            }
        }
        token.string.len = t->curr - token.string.text;
        return token;
    }

    scan_mantissa(t, 10);

fraction:
    if (t->curr_rune == '.') {
        token.kind = Token_Float;
        advance_to_next_rune(t);
        scan_mantissa(t, 10);
    }

exponent:
    if (t->curr_rune == 'e' || t->curr_rune == 'E') {
        token.kind = Token_Float;
        advance_to_next_rune(t);
        if (t->curr_rune == '-' || t->curr_rune == '+')
            advance_to_next_rune(t);
        scan_mantissa(t, 10);
    }

    token.string.len = t->curr - token.string.text;
    return token;
}
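
// NOTE: Literal shapes accepted above (the leading `0` branch handles the
// based forms; the `<= 2` checks reject a bare prefix with no digits):
//   0b1010  binary    0o777  octal    0d42  decimal    0x1F  hexadecimal
//   123     integer   1.5    float    .5    float (via seen_decimal_point)
//   1e9 / 1.5e-3      floats with an exponent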
// Quote == " for string and ' for char
b32 scan_escape(Tokenizer *t, Rune quote) {
    isize len = 0;
    u32 base = 0, max = 0, x = 0;

    Rune r = t->curr_rune;
    if (r == 'a' ||
        r == 'b' ||
        r == 'f' ||
        r == 'n' ||
        r == 'r' ||
        r == 't' ||
        r == 'v' ||
        r == '\\' ||
        r == quote) {
        advance_to_next_rune(t);
        return true;
    } else if (gb_is_between(r, '0', '7')) { // Octal \ooo
        len = 3; base = 8; max = 255;
    } else if (r == 'x') { // Hex byte \xhh
        advance_to_next_rune(t);
        len = 2; base = 16; max = 255;
    } else if (r == 'u') { // Unicode \uhhhh
        advance_to_next_rune(t);
        len = 4; base = 16; max = GB_RUNE_MAX;
    } else if (r == 'U') { // Unicode \Uhhhhhhhh
        advance_to_next_rune(t);
        len = 8; base = 16; max = GB_RUNE_MAX;
    } else {
        if (t->curr_rune < 0)
            tokenizer_err(t, "Escape sequence was not terminated");
        else
            tokenizer_err(t, "Unknown escape sequence");
        return false;
    }

    // This loop only validates the digits; x accumulates the value but the
    // actual decoding into bytes happens later, in unquote_string
    // (note: max is set above but not yet checked against x)
    while (len --> 0) {
        u32 d = cast(u32)digit_value(t->curr_rune);
        if (d >= base) {
            if (t->curr_rune < 0)
                tokenizer_err(t, "Escape sequence was not terminated");
            else
                tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
            return false;
        }
        x = x*base + d;
        advance_to_next_rune(t);
    }

    return true;
}
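
// NOTE: Examples of what scan_escape accepts after a backslash:
//   \n \t \\ \"        single-character escapes (quote depends on context)
//   \101               octal, exactly 3 digits
//   \x41               hex byte, exactly 2 digits
//   \u00E9  \U0001F600 Unicode scalar values, 4 / 8 hex digits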
gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    }
    return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    } else if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    } else if (t->curr_rune == ch_d) {
        advance_to_next_rune(t);
        return d;
    }
    return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return sing_eq;
    } else if (t->curr_rune == sing_rune) {
        advance_to_next_rune(t);
        if (t->curr_rune == '=') {
            advance_to_next_rune(t);
            return dub_eq;
        }
        return dub;
    }
    return sing;
}
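
// NOTE: These helpers implement maximal munch for operator families once the
// first rune has been consumed; e.g. for `>` token_kind_dub_eq resolves:
//   ">"  -> Token_Gt      ">="  -> Token_GtEq
//   ">>" -> Token_Shr     ">>=" -> Token_ShrEq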
Token tokenizer_get_token(Tokenizer *t) {
    Token token = {};
    Rune curr_rune;

    tokenizer_skip_whitespace(t);

    token.string = make_string(t->curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr - t->line + 1;

    curr_rune = t->curr_rune;
    if (rune_is_letter(curr_rune)) {
        token.kind = Token_Identifier;
        while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune))
            advance_to_next_rune(t);
        token.string.len = t->curr - token.string.text;

        // NOTE(bill): All keywords are longer than 1 character,
        // so skip the lookup for single-character identifiers
        if (token.string.len > 1) {
            if (are_strings_equal(token.string, token_strings[Token_as])) {
                token.kind = Token_as;
            } else if (are_strings_equal(token.string, token_strings[Token_transmute])) {
                token.kind = Token_transmute;
            } else {
                for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
                    if (are_strings_equal(token.string, token_strings[k])) {
                        token.kind = cast(TokenKind)k;
                        break;
                    }
                }
            }
        }
    } else if (gb_is_between(curr_rune, '0', '9')) {
        token = scan_number_to_token(t, false);
    } else {
        advance_to_next_rune(t);
        switch (curr_rune) {
        case GB_RUNE_EOF:
            token.kind = Token_EOF;
            break;

        case '`': // Raw String Literal
        case '"': // String Literal
        {
            Rune quote = curr_rune;
            token.kind = Token_String;
            if (curr_rune == '"') {
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r == '\n' || r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote)
                        break;
                    if (r == '\\')
                        scan_escape(t, '"');
                }
            } else {
                // Raw strings may span newlines and have no escapes
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote)
                        break;
                }
            }
            token.string.len = t->curr - token.string.text;

            i32 success = unquote_string(gb_heap_allocator(), &token.string);
            if (success > 0) {
                if (success == 2) {
                    // A return of 2 appears to mean unquote_string allocated
                    // a new buffer; track it so destroy_tokenizer can free it
                    gb_array_append(t->allocated_strings, token.string);
                }
                return token;
            } else {
                tokenizer_err(t, "Invalid string literal");
            }
        } break;

        case '\'': { // Rune Literal
            b32 valid = true;
            isize len = 0;
            token.kind = Token_Rune;
            for (;;) {
                Rune r = t->curr_rune;
                if (r == '\n' || r < 0) {
                    if (valid)
                        tokenizer_err(t, "Rune literal not terminated");
                    break;
                }
                advance_to_next_rune(t);
                if (r == '\'')
                    break;
                len++;
                if (r == '\\') {
                    if (!scan_escape(t, '\''))
                        valid = false;
                }
            }
            if (valid && len != 1)
                tokenizer_err(t, "Illegal rune literal");
            token.string.len = t->curr - token.string.text;

            i32 success = unquote_string(gb_heap_allocator(), &token.string);
            if (success > 0) {
                if (success == 2) {
                    gb_array_append(t->allocated_strings, token.string);
                }
                return token;
            } else {
                tokenizer_err(t, "Invalid rune literal");
            }
        } break;

        case '.':
            token.kind = Token_Period; // Default
            if (gb_is_between(t->curr_rune, '0', '9')) { // Might be a number
                token = scan_number_to_token(t, true);
            } else if (t->curr_rune == '.') { // Could be an ellipsis
                advance_to_next_rune(t);
                token.kind = Token_Ellipsis;
            }
            break;

        case '#': token.kind = Token_Hash;         break;
        case '@': token.kind = Token_At;           break;
        case '^': token.kind = Token_Pointer;      break;
        case ';': token.kind = Token_Semicolon;    break;
        case ',': token.kind = Token_Comma;        break;
        case '(': token.kind = Token_OpenParen;    break;
        case ')': token.kind = Token_CloseParen;   break;
        case '[': token.kind = Token_OpenBracket;  break;
        case ']': token.kind = Token_CloseBracket; break;
        case '{': token.kind = Token_OpenBrace;    break;
        case '}': token.kind = Token_CloseBrace;   break;
        case ':': token.kind = Token_Colon;        break;

        case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;
        case '/': token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq); break;
        case '%': token.kind = token_kind_variant2(t, Token_Mod, Token_ModEq); break;
        case '=': token.kind = token_kind_variant2(t, Token_Eq,  Token_CmpEq); break;
        case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
        case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
        case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Increment); break;
        case '-': token.kind = token_kind_variant4(t, Token_Sub, Token_SubEq, '-', Token_Decrement, '>', Token_ArrowRight); break;

        case '<':
            if (t->curr_rune == '-') {
                advance_to_next_rune(t); // Consume the `-` of `<-`
                token.kind = Token_ArrowLeft;
            } else {
                token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
            }
            break;

        case '>':
            token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq);
            break;

        case '&':
            token.kind = Token_And;
            if (t->curr_rune == '~') {
                token.kind = Token_AndNot;
                advance_to_next_rune(t);
                if (t->curr_rune == '=') {
                    token.kind = Token_AndNotEq;
                    advance_to_next_rune(t);
                }
            } else {
                token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
            }
            break;

        case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

        default:
            if (curr_rune != GB_RUNE_BOM) {
                u8 str[4] = {};
                int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
                tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
            }
            token.kind = Token_Invalid;
            break;
        }
    }

    token.string.len = t->curr - token.string.text;
    return token;
}
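
// NOTE: A worked example of the scanner's output; for the input line
//     x += 0x1F // hex
// the resulting token stream is (kind, string):
//     (Token_Identifier, "x") (Token_AddEq, "+=") (Token_Integer, "0x1F") (Token_EOF, "")
// with the trailing line comment consumed by tokenizer_skip_whitespace.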