// tokenizer.cpp

#define TOKEN_KINDS \
    TOKEN_KIND(Token_Invalid, "Invalid"), \
    TOKEN_KIND(Token_EOF,     "EOF"), \
    TOKEN_KIND(Token_Comment, "Comment"), \
\
    TOKEN_KIND(Token__LiteralBegin, "_LiteralBegin"), \
    TOKEN_KIND(Token_Identifier, "Identifier"), \
    TOKEN_KIND(Token_Integer,    "Integer"), \
    TOKEN_KIND(Token_Float,      "Float"), \
    TOKEN_KIND(Token_Rune,       "Rune"), \
    TOKEN_KIND(Token_String,     "String"), \
    TOKEN_KIND(Token__LiteralEnd, "_LiteralEnd"), \
\
    TOKEN_KIND(Token__OperatorBegin, "_OperatorBegin"), \
    TOKEN_KIND(Token_Eq,      "="), \
    TOKEN_KIND(Token_Not,     "!"), \
    TOKEN_KIND(Token_Hash,    "#"), \
    TOKEN_KIND(Token_At,      "@"), \
    TOKEN_KIND(Token_Pointer, "^"), \
    TOKEN_KIND(Token_Maybe,   "?"), \
    TOKEN_KIND(Token_Add,     "+"), \
    TOKEN_KIND(Token_Sub,     "-"), \
    TOKEN_KIND(Token_Mul,     "*"), \
    TOKEN_KIND(Token_Quo,     "/"), \
    TOKEN_KIND(Token_Mod,     "%"), \
    TOKEN_KIND(Token_And,     "&"), \
    TOKEN_KIND(Token_Or,      "|"), \
    TOKEN_KIND(Token_Xor,     "~"), \
    TOKEN_KIND(Token_AndNot,  "&~"), \
    TOKEN_KIND(Token_Shl,     "<<"), \
    TOKEN_KIND(Token_Shr,     ">>"), \
\
    TOKEN_KIND(Token_as,         "as"), \
    TOKEN_KIND(Token_transmute,  "transmute"), \
    TOKEN_KIND(Token_down_cast,  "down_cast"), \
    TOKEN_KIND(Token_union_cast, "union_cast"), \
\
    TOKEN_KIND(Token_Prime,       "'"), \
    TOKEN_KIND(Token_DoublePrime, "''"), \
\
    TOKEN_KIND(Token__AssignOpBegin, "_AssignOpBegin"), \
    TOKEN_KIND(Token_AddEq,    "+="), \
    TOKEN_KIND(Token_SubEq,    "-="), \
    TOKEN_KIND(Token_MulEq,    "*="), \
    TOKEN_KIND(Token_QuoEq,    "/="), \
    TOKEN_KIND(Token_ModEq,    "%="), \
    TOKEN_KIND(Token_AndEq,    "&="), \
    TOKEN_KIND(Token_OrEq,     "|="), \
    TOKEN_KIND(Token_XorEq,    "~="), \
    TOKEN_KIND(Token_AndNotEq, "&~="), \
    TOKEN_KIND(Token_ShlEq,    "<<="), \
    TOKEN_KIND(Token_ShrEq,    ">>="), \
    TOKEN_KIND(Token__AssignOpEnd, "_AssignOpEnd"), \
    TOKEN_KIND(Token_Increment,  "++"), \
    TOKEN_KIND(Token_Decrement,  "--"), \
    TOKEN_KIND(Token_ArrowRight, "->"), \
    TOKEN_KIND(Token_ArrowLeft,  "<-"), \
\
    TOKEN_KIND(Token_CmpAnd,   "&&"), \
    TOKEN_KIND(Token_CmpOr,    "||"), \
    TOKEN_KIND(Token_CmpAndEq, "&&="), \
    TOKEN_KIND(Token_CmpOrEq,  "||="), \
\
    TOKEN_KIND(Token__ComparisonBegin, "_ComparisonBegin"), \
    TOKEN_KIND(Token_CmpEq, "=="), \
    TOKEN_KIND(Token_NotEq, "!="), \
    TOKEN_KIND(Token_Lt,    "<"), \
    TOKEN_KIND(Token_Gt,    ">"), \
    TOKEN_KIND(Token_LtEq,  "<="), \
    TOKEN_KIND(Token_GtEq,  ">="), \
    TOKEN_KIND(Token__ComparisonEnd, "_ComparisonEnd"), \
\
    TOKEN_KIND(Token_OpenParen,    "("), \
    TOKEN_KIND(Token_CloseParen,   ")"), \
    TOKEN_KIND(Token_OpenBracket,  "["), \
    TOKEN_KIND(Token_CloseBracket, "]"), \
    TOKEN_KIND(Token_OpenBrace,    "{"), \
    TOKEN_KIND(Token_CloseBrace,   "}"), \
    TOKEN_KIND(Token_Colon,     ":"), \
    TOKEN_KIND(Token_Semicolon, ";"), \
    TOKEN_KIND(Token_Period,    "."), \
    TOKEN_KIND(Token_Comma,     ","), \
    TOKEN_KIND(Token_Ellipsis,       ".."), \
    TOKEN_KIND(Token_RangeExclusive, "..<"), \
    TOKEN_KIND(Token__OperatorEnd, "_OperatorEnd"), \
\
    TOKEN_KIND(Token__KeywordBegin, "_KeywordBegin"), \
    TOKEN_KIND(Token_type,        "type"), \
    TOKEN_KIND(Token_proc,        "proc"), \
    TOKEN_KIND(Token_match,       "match"), \
    TOKEN_KIND(Token_break,       "break"), \
    TOKEN_KIND(Token_continue,    "continue"), \
    TOKEN_KIND(Token_fallthrough, "fallthrough"), \
    TOKEN_KIND(Token_case,        "case"), \
    TOKEN_KIND(Token_default,     "default"), \
    TOKEN_KIND(Token_then,        "then"), \
    TOKEN_KIND(Token_if,          "if"), \
    TOKEN_KIND(Token_else,        "else"), \
    TOKEN_KIND(Token_for,         "for"), \
    TOKEN_KIND(Token_range,       "range"), \
    TOKEN_KIND(Token_defer,       "defer"), \
    TOKEN_KIND(Token_return,      "return"), \
    TOKEN_KIND(Token_struct,      "struct"), \
    TOKEN_KIND(Token_union,       "union"), \
    TOKEN_KIND(Token_raw_union,   "raw_union"), \
    TOKEN_KIND(Token_enum,        "enum"), \
    TOKEN_KIND(Token_using,       "using"), \
    TOKEN_KIND(Token_asm,         "asm"), \
    TOKEN_KIND(Token_volatile,    "volatile"), \
    TOKEN_KIND(Token_atomic,      "atomic"), \
    TOKEN_KIND(Token_push_allocator, "push_allocator"), \
    TOKEN_KIND(Token_push_context,   "push_context"), \
    TOKEN_KIND(Token__KeywordEnd, "_KeywordEnd"), \
    TOKEN_KIND(Token_Count, "")
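
// The TOKEN_KINDS X-macro is expanded twice below: once to generate the
// TokenKind enum and once to generate the parallel token_strings[] table,
// so the enum values and their printable names cannot drift out of sync.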
enum TokenKind {
#define TOKEN_KIND(e, s) e
    TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
    TOKEN_KINDS
#undef TOKEN_KIND
};

struct TokenPos {
    String file;
    isize  line, column;
};
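
// Positions are ordered by line, then column, then file name, so diagnostics
// can be deduplicated and compared in source order.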
i32 token_pos_cmp(TokenPos a, TokenPos b) {
    if (a.line == b.line) {
        if (a.column == b.column) {
            isize min_len = gb_min(a.file.len, b.file.len);
            return gb_memcompare(a.file.text, b.file.text, min_len);
        }
        return (a.column < b.column) ? -1 : +1;
    }
    return (a.line < b.line) ? -1 : +1;
}

b32 token_pos_are_equal(TokenPos a, TokenPos b) {
    return token_pos_cmp(a, b) == 0;
}

// NOTE(bill): Text is UTF-8, thus why u8 and not char
struct Token {
    TokenKind kind;
    String    string;
    TokenPos  pos;
};

Token empty_token = {Token_Invalid};
Token blank_token = {Token_Identifier, {cast(u8 *)"_", 1}};

Token make_token_ident(String s) {
    Token t = {Token_Identifier};
    t.string = s;
    return t;
}

struct ErrorCollector {
    TokenPos prev;
    i64      count;
    i64      warning_count;
    gbMutex  mutex;
};

gb_global ErrorCollector global_error_collector;

void init_global_error_collector(void) {
    gb_mutex_init(&global_error_collector.mutex);
}
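
// The three reporters below are identical apart from the message prefix:
// each locks the collector, bumps the relevant counter, and suppresses the
// message if it occurs at the same position as the previous one.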
void warning(Token token, char *fmt, ...) {
    gb_mutex_lock(&global_error_collector.mutex);
    defer (gb_mutex_unlock(&global_error_collector.mutex));
    global_error_collector.warning_count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_are_equal(global_error_collector.prev, token.pos)) {
        va_list va;
        global_error_collector.prev = token.pos;
        va_start(va, fmt);
        gb_printf_err("%.*s(%td:%td) Warning: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
        va_end(va);
    }
}

void error(Token token, char *fmt, ...) {
    gb_mutex_lock(&global_error_collector.mutex);
    defer (gb_mutex_unlock(&global_error_collector.mutex));
    global_error_collector.count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_are_equal(global_error_collector.prev, token.pos)) {
        va_list va;
        global_error_collector.prev = token.pos;
        va_start(va, fmt);
        gb_printf_err("%.*s(%td:%td) %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
        va_end(va);
    }
}

void syntax_error(Token token, char *fmt, ...) {
    gb_mutex_lock(&global_error_collector.mutex);
    defer (gb_mutex_unlock(&global_error_collector.mutex));
    global_error_collector.count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_are_equal(global_error_collector.prev, token.pos)) {
        va_list va;
        global_error_collector.prev = token.pos;
        va_start(va, fmt);
        gb_printf_err("%.*s(%td:%td) Syntax Error: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
        va_end(va);
    }
}

void compiler_error(char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    gb_printf_err("Internal Compiler Error: %s\n",
                  gb_bprintf_va(fmt, va));
    va_end(va);
    gb_exit(1);
}

gb_inline b32 token_is_literal(Token t) {
    return gb_is_between(t.kind, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline b32 token_is_operator(Token t) {
    return gb_is_between(t.kind, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline b32 token_is_keyword(Token t) {
    return gb_is_between(t.kind, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline b32 token_is_comparison(Token t) {
    return gb_is_between(t.kind, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline b32 token_is_shift(Token t) {
    return t.kind == Token_Shl || t.kind == Token_Shr;
}

gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }

enum TokenizerInitError {
    TokenizerInit_None,

    TokenizerInit_Invalid,
    TokenizerInit_NotExists,
    TokenizerInit_Permission,
    TokenizerInit_Empty,

    TokenizerInit_Count,
};

struct Tokenizer {
    String fullpath;
    u8 *start;
    u8 *end;

    Rune  curr_rune;   // current character
    u8 *  curr;        // character pos
    u8 *  read_curr;   // pos from start
    u8 *  line;        // current line pos
    isize line_count;

    isize error_count;
    Array<String> allocated_strings;
};

void tokenizer_err(Tokenizer *t, char *msg, ...) {
    va_list va;
    isize column = t->read_curr - t->line + 1;
    if (column < 1) {
        column = 1;
    }

    gb_printf_err("%.*s(%td:%td) Syntax error: ", LIT(t->fullpath), t->line_count, column);
    va_start(va, msg);
    gb_printf_err_va(msg, va);
    va_end(va);
    gb_printf_err("\n");

    t->error_count++;
}
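
// Decodes the next rune from the buffer into t->curr_rune, advancing
// t->read_curr by the rune's UTF-8 width and tracking line starts so that
// column numbers can be computed as byte offsets from t->line.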
void advance_to_next_rune(Tokenizer *t) {
    if (t->read_curr < t->end) {
        Rune rune;
        isize width = 1;

        t->curr = t->read_curr;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }

        rune = *t->read_curr;
        if (rune == 0) {
            tokenizer_err(t, "Illegal character NUL");
        } else if (rune >= 0x80) { // not ASCII
            width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
            if (rune == GB_RUNE_INVALID && width == 1) {
                tokenizer_err(t, "Illegal UTF-8 encoding");
            } else if (rune == GB_RUNE_BOM && t->curr-t->start > 0) {
                tokenizer_err(t, "Illegal byte order mark");
            }
        }

        t->read_curr += width;
        t->curr_rune = rune;
    } else {
        t->curr = t->end;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }
        t->curr_rune = GB_RUNE_EOF;
    }
}
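
// Reads the entire file into memory (zero-terminated), skips a leading byte
// order mark, and primes the first rune. On failure the file is reopened
// just to classify the error for the caller.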
TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
    PROF_PROC();

    char *c_str = gb_alloc_array(gb_heap_allocator(), char, fullpath.len+1);
    memcpy(c_str, fullpath.text, fullpath.len);
    c_str[fullpath.len] = '\0';
    defer (gb_free(gb_heap_allocator(), c_str));

    gbFileContents fc = gb_file_read_contents(gb_heap_allocator(), true, c_str);
    gb_zero_item(t);
    if (fc.data != NULL) {
        t->start = cast(u8 *)fc.data;
        t->line = t->read_curr = t->curr = t->start;
        t->end = t->start + fc.size;
        t->fullpath = fullpath;
        t->line_count = 1;

        advance_to_next_rune(t);
        if (t->curr_rune == GB_RUNE_BOM) {
            advance_to_next_rune(t); // Ignore BOM at file beginning
        }

        array_init(&t->allocated_strings, gb_heap_allocator());
        return TokenizerInit_None;
    }

    gbFile f = {};
    gbFileError err = gb_file_open(&f, c_str);
    defer (gb_file_close(&f));

    switch (err) {
    case gbFileError_Invalid:    return TokenizerInit_Invalid;
    case gbFileError_NotExists:  return TokenizerInit_NotExists;
    case gbFileError_Permission: return TokenizerInit_Permission;
    }

    if (gb_file_size(&f) == 0) {
        return TokenizerInit_Empty;
    }

    return TokenizerInit_None;
}

gb_inline void destroy_tokenizer(Tokenizer *t) {
    if (t->start != NULL) {
        gb_free(gb_heap_allocator(), t->start);
    }
    for_array(i, t->allocated_strings) {
        gb_free(gb_heap_allocator(), t->allocated_strings[i].text);
    }
    array_free(&t->allocated_strings);
}

void tokenizer_skip_whitespace(Tokenizer *t) {
    while (rune_is_whitespace(t->curr_rune)) {
        advance_to_next_rune(t);
    }
}

gb_inline i32 digit_value(Rune r) {
    if (gb_char_is_digit(cast(char)r)) {
        return r - '0';
    }
    if (gb_is_between(cast(char)r, 'a', 'f')) {
        return r - 'a' + 10;
    }
    if (gb_is_between(cast(char)r, 'A', 'F')) {
        return r - 'A' + 10;
    }
    return 16; // NOTE(bill): Larger than highest possible digit value
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
    // TODO(bill): Allow for underscores in numbers as a number separator
    // TODO(bill): Is this a good idea?
    // while (digit_value(t->curr_rune) < base || t->curr_rune == '_')
    while (digit_value(t->curr_rune) < base) {
        advance_to_next_rune(t);
    }
}
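
// Scans an integer or float literal. Handles the 0b/0o/0d/0x base prefixes,
// an optional fraction, and an optional signed exponent; when called with
// seen_decimal_point set, the leading '.' has already been consumed by the
// caller.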
Token scan_number_to_token(Tokenizer *t, b32 seen_decimal_point) {
    Token token = {};
    token.kind = Token_Integer;
    token.string = make_string(t->curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr - t->line + 1;

    if (seen_decimal_point) {
        token.string.text--; // NOTE: Include the '.' already consumed by the caller
        token.kind = Token_Float;
        scan_mantissa(t, 10);
        goto exponent;
    }

    if (t->curr_rune == '0') {
        u8 *prev = t->curr;
        advance_to_next_rune(t);
        if (t->curr_rune == 'b') { // Binary
            advance_to_next_rune(t);
            scan_mantissa(t, 2);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'o') { // Octal
            advance_to_next_rune(t);
            scan_mantissa(t, 8);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'd') { // Decimal
            advance_to_next_rune(t);
            scan_mantissa(t, 10);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'x') { // Hexadecimal
            advance_to_next_rune(t);
            scan_mantissa(t, 16);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else {
            scan_mantissa(t, 10);
            if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
                seen_decimal_point = true;
                goto fraction;
            }
        }

        token.string.len = t->curr - token.string.text;
        return token;
    }

    scan_mantissa(t, 10);

fraction:
    if (t->curr_rune == '.') {
        token.kind = Token_Float;
        advance_to_next_rune(t);
        scan_mantissa(t, 10);
    }

exponent:
    if (t->curr_rune == 'e' || t->curr_rune == 'E') {
        token.kind = Token_Float;
        advance_to_next_rune(t);
        if (t->curr_rune == '-' || t->curr_rune == '+') {
            advance_to_next_rune(t);
        }
        scan_mantissa(t, 10);
    }

    token.string.len = t->curr - token.string.text;
    return token;
}
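
// Validates one escape sequence following a '\' inside a rune/string
// literal. Simple escapes are accepted directly; \ooo, \xHH, \uHHHH, and
// \UHHHHHHHH are checked digit by digit against their base. Note the
// decoded value is accumulated in `x` but never range-checked against `max`.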
// Quote == " for string
b32 scan_escape(Tokenizer *t, Rune quote) {
    isize len = 0;
    u32 base = 0, max = 0, x = 0;

    Rune r = t->curr_rune;
    if (r == 'a'  ||
        r == 'b'  ||
        r == 'f'  ||
        r == 'n'  ||
        r == 'r'  ||
        r == 't'  ||
        r == 'v'  ||
        r == '\\' ||
        r == quote) {
        advance_to_next_rune(t);
        return true;
    } else if (gb_is_between(r, '0', '7')) {
        len = 3; base = 8; max = 255;
    } else if (r == 'x') {
        advance_to_next_rune(t);
        len = 2; base = 16; max = 255;
    } else if (r == 'u') {
        advance_to_next_rune(t);
        len = 4; base = 16; max = GB_RUNE_MAX;
    } else if (r == 'U') {
        advance_to_next_rune(t);
        len = 8; base = 16; max = GB_RUNE_MAX;
    } else {
        if (t->curr_rune < 0) {
            tokenizer_err(t, "Escape sequence was not terminated");
        } else {
            tokenizer_err(t, "Unknown escape sequence");
        }
        return false;
    }

    while (len-- > 0) {
        u32 d = cast(u32)digit_value(t->curr_rune);
        if (d >= base) {
            if (t->curr_rune < 0) {
                tokenizer_err(t, "Escape sequence was not terminated");
            } else {
                tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
            }
            return false;
        }

        x = x*base + d;
        advance_to_next_rune(t);
    }

    return true;
}
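
// Maximal-munch helpers: each consumes the longest operator spelling that
// matches starting from the current rune. token_kind_dub_eq covers the
// family X, X=, XX, XX= (e.g. <, <=, <<, <<=) in one call.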
gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    }
    return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    } else if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    } else if (t->curr_rune == ch_d) {
        advance_to_next_rune(t);
        return d;
    }
    return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return sing_eq;
    } else if (t->curr_rune == sing_rune) {
        advance_to_next_rune(t);
        if (t->curr_rune == '=') {
            advance_to_next_rune(t);
            return dub_eq;
        }
        return dub;
    }
    return sing;
}
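
// The main entry point: skips whitespace, then dispatches on the first rune.
// Identifiers are checked against the keyword table, digits go through
// scan_number_to_token, and everything else is punctuation, a literal, or a
// comment.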
Token tokenizer_get_token(Tokenizer *t) {
    Token token = {};
    Rune curr_rune;

    tokenizer_skip_whitespace(t);

    token.string = make_string(t->curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr - t->line + 1;

    curr_rune = t->curr_rune;
    if (rune_is_letter(curr_rune)) {
        token.kind = Token_Identifier;
        while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) {
            advance_to_next_rune(t);
        }

        token.string.len = t->curr - token.string.text;

        // NOTE(bill): All keywords are > 1 character in length
        if (token.string.len > 1) {
            if (token.string == token_strings[Token_as]) {
                token.kind = Token_as;
            } else if (token.string == token_strings[Token_transmute]) {
                token.kind = Token_transmute;
            } else if (token.string == token_strings[Token_down_cast]) {
                token.kind = Token_down_cast;
            } else if (token.string == token_strings[Token_union_cast]) {
                token.kind = Token_union_cast;
            } else {
                for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
                    if (token.string == token_strings[k]) {
                        token.kind = cast(TokenKind)k;
                        break;
                    }
                }
            }
        }
    } else if (gb_is_between(curr_rune, '0', '9')) {
        token = scan_number_to_token(t, false);
    } else {
        advance_to_next_rune(t);
        switch (curr_rune) {
        case GB_RUNE_EOF:
            token.kind = Token_EOF;
            break;

        case '\'':
            token.kind = Token_Prime;
            if (t->curr_rune == '\'') {
                advance_to_next_rune(t);
                token.kind = Token_DoublePrime;
            }
            break;

        case '`': // Raw String Literal
        case '"': // String Literal
        {
            Rune quote = curr_rune;
            token.kind = Token_String;
            if (curr_rune == '"') {
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r == '\n' || r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote) {
                        break;
                    }
                    if (r == '\\') {
                        scan_escape(t, '"');
                    }
                }
            } else {
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote) {
                        break;
                    }
                }
            }
            token.string.len = t->curr - token.string.text;

            i32 success = unquote_string(gb_heap_allocator(), &token.string);
            if (success > 0) {
                if (success == 2) {
                    array_add(&t->allocated_strings, token.string);
                }
                return token;
            } else {
                tokenizer_err(t, "Invalid string literal");
            }
        } break;

        case '.':
            token.kind = Token_Period; // Default
            if (gb_is_between(t->curr_rune, '0', '9')) { // Might be a number
                token = scan_number_to_token(t, true);
            } else if (t->curr_rune == '.') { // Could be an ellipsis
                advance_to_next_rune(t);
                token.kind = Token_Ellipsis;
                if (t->curr_rune == '<') {
                    advance_to_next_rune(t);
                    token.kind = Token_RangeExclusive;
                }
            }
            break;

        case '#': token.kind = Token_Hash;         break;
        case '@': token.kind = Token_At;           break;
        case '^': token.kind = Token_Pointer;      break;
        case '?': token.kind = Token_Maybe;        break;
        case ';': token.kind = Token_Semicolon;    break;
        case ',': token.kind = Token_Comma;        break;
        case '(': token.kind = Token_OpenParen;    break;
        case ')': token.kind = Token_CloseParen;   break;
        case '[': token.kind = Token_OpenBracket;  break;
        case ']': token.kind = Token_CloseBracket; break;
        case '{': token.kind = Token_OpenBrace;    break;
        case '}': token.kind = Token_CloseBrace;   break;
        case ':': token.kind = Token_Colon;        break;

        case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;
        case '%': token.kind = token_kind_variant2(t, Token_Mod, Token_ModEq); break;
        case '=': token.kind = token_kind_variant2(t, Token_Eq,  Token_CmpEq); break;
        case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
        case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
        case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Increment); break;
        case '-': token.kind = token_kind_variant4(t, Token_Sub, Token_SubEq, '-', Token_Decrement, '>', Token_ArrowRight); break;

        case '/': {
            if (t->curr_rune == '/') { // Line comment
                // NOTE: Also stop at EOF so a comment on the last line of a
                // file with no trailing newline cannot loop forever.
                while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
                    advance_to_next_rune(t);
                }
                token.kind = Token_Comment;
            } else if (t->curr_rune == '*') { // Nestable block comment
                isize comment_scope = 1;
                advance_to_next_rune(t);
                while (comment_scope > 0) {
                    if (t->curr_rune == GB_RUNE_EOF) { // NOTE: Unterminated comment
                        tokenizer_err(t, "Comment not terminated");
                        break;
                    }
                    if (t->curr_rune == '/') {
                        advance_to_next_rune(t);
                        if (t->curr_rune == '*') {
                            advance_to_next_rune(t);
                            comment_scope++;
                        }
                    } else if (t->curr_rune == '*') {
                        advance_to_next_rune(t);
                        if (t->curr_rune == '/') {
                            advance_to_next_rune(t);
                            comment_scope--;
                        }
                    } else {
                        advance_to_next_rune(t);
                    }
                }
                token.kind = Token_Comment;
            } else {
                token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq);
            }
        } break;

        case '<':
            if (t->curr_rune == '-') {
                advance_to_next_rune(t); // NOTE: Consume the '-' so "<-" is one token
                token.kind = Token_ArrowLeft;
            } else {
                token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
            }
            break;

        case '>':
            token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq);
            break;

        case '&':
            token.kind = Token_And;
            if (t->curr_rune == '~') {
                token.kind = Token_AndNot;
                advance_to_next_rune(t);
                if (t->curr_rune == '=') {
                    token.kind = Token_AndNotEq;
                    advance_to_next_rune(t);
                }
            } else {
                token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
            }
            break;

        case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

        default:
            if (curr_rune != GB_RUNE_BOM) {
                u8 str[4] = {};
                int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
                tokenizer_err(t, "Illegal character: %.*s (%d)", len, str, curr_rune);
            }
            token.kind = Token_Invalid;
            break;
        }
    }

    token.string.len = t->curr - token.string.text;
    return token;
}
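
// A minimal usage sketch (not part of the original file): drive the
// tokenizer to EOF and print each token. The function name and the `path`
// argument are hypothetical; a caller is assumed to have built `path` as a
// String for an existing source file.
#if 0
void tokenize_file_example(String path) {
    Tokenizer t = {};
    if (init_tokenizer(&t, path) != TokenizerInit_None) {
        return; // Could not read the file
    }
    defer (destroy_tokenizer(&t));

    for (;;) {
        Token tok = tokenizer_get_token(&t);
        if (tok.kind == Token_EOF) {
            break;
        }
        print_token(tok);
    }
}
#endif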