// tokenizer.cpp

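// Token kinds are declared once in the TOKEN_KINDS X-macro table below and
// expanded twice: once into the TokenKind enum and once into the parallel
// token_strings array, so the two always stay in sync.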
#define TOKEN_KINDS \
	TOKEN_KIND(Token_Invalid, "Invalid"), \
	TOKEN_KIND(Token_EOF, "EOF"), \
	TOKEN_KIND(Token_Comment, "Comment"), \
\
	TOKEN_KIND(Token__LiteralBegin, "_LiteralBegin"), \
	TOKEN_KIND(Token_Ident, "identifier"), \
	TOKEN_KIND(Token_Integer, "integer"), \
	TOKEN_KIND(Token_Float, "float"), \
	TOKEN_KIND(Token_Imag, "imaginary"), \
	TOKEN_KIND(Token_Rune, "rune"), \
	TOKEN_KIND(Token_String, "string"), \
	TOKEN_KIND(Token__LiteralEnd, "_LiteralEnd"), \
\
	TOKEN_KIND(Token__OperatorBegin, "_OperatorBegin"), \
	TOKEN_KIND(Token_Eq, "="), \
	TOKEN_KIND(Token_Not, "!"), \
	TOKEN_KIND(Token_Hash, "#"), \
	TOKEN_KIND(Token_At, "@"), \
	TOKEN_KIND(Token_Dollar, "$"), \
	TOKEN_KIND(Token_Pointer, "^"), \
	TOKEN_KIND(Token_Question, "?"), \
	TOKEN_KIND(Token_Add, "+"), \
	TOKEN_KIND(Token_Sub, "-"), \
	TOKEN_KIND(Token_Mul, "*"), \
	TOKEN_KIND(Token_Quo, "/"), \
	TOKEN_KIND(Token_Mod, "%"), \
	TOKEN_KIND(Token_ModMod, "%%"), \
	TOKEN_KIND(Token_And, "&"), \
	TOKEN_KIND(Token_Or, "|"), \
	TOKEN_KIND(Token_Xor, "~"), \
	TOKEN_KIND(Token_AndNot, "&~"), \
	TOKEN_KIND(Token_Shl, "<<"), \
	TOKEN_KIND(Token_Shr, ">>"), \
\
	TOKEN_KIND(Token_CmpAnd, "&&"), \
	TOKEN_KIND(Token_CmpOr, "||"), \
\
	TOKEN_KIND(Token__AssignOpBegin, "_AssignOpBegin"), \
	TOKEN_KIND(Token_AddEq, "+="), \
	TOKEN_KIND(Token_SubEq, "-="), \
	TOKEN_KIND(Token_MulEq, "*="), \
	TOKEN_KIND(Token_QuoEq, "/="), \
	TOKEN_KIND(Token_ModEq, "%="), \
	TOKEN_KIND(Token_ModModEq, "%%="), \
	TOKEN_KIND(Token_AndEq, "&="), \
	TOKEN_KIND(Token_OrEq, "|="), \
	TOKEN_KIND(Token_XorEq, "~="), \
	TOKEN_KIND(Token_AndNotEq, "&~="), \
	TOKEN_KIND(Token_ShlEq, "<<="), \
	TOKEN_KIND(Token_ShrEq, ">>="), \
	TOKEN_KIND(Token_CmpAndEq, "&&="), \
	TOKEN_KIND(Token_CmpOrEq, "||="), \
	TOKEN_KIND(Token__AssignOpEnd, "_AssignOpEnd"), \
	TOKEN_KIND(Token_ArrowRight, "->"), \
	TOKEN_KIND(Token_ArrowLeft, "<-"), \
	TOKEN_KIND(Token_DoubleArrowRight, "=>"), \
	TOKEN_KIND(Token_Inc, "++"), \
	TOKEN_KIND(Token_Dec, "--"), \
	TOKEN_KIND(Token_Undef, "---"), \
\
	TOKEN_KIND(Token__ComparisonBegin, "_ComparisonBegin"), \
	TOKEN_KIND(Token_CmpEq, "=="), \
	TOKEN_KIND(Token_NotEq, "!="), \
	TOKEN_KIND(Token_Lt, "<"), \
	TOKEN_KIND(Token_Gt, ">"), \
	TOKEN_KIND(Token_LtEq, "<="), \
	TOKEN_KIND(Token_GtEq, ">="), \
	TOKEN_KIND(Token__ComparisonEnd, "_ComparisonEnd"), \
\
	TOKEN_KIND(Token_OpenParen, "("), \
	TOKEN_KIND(Token_CloseParen, ")"), \
	TOKEN_KIND(Token_OpenBracket, "["), \
	TOKEN_KIND(Token_CloseBracket, "]"), \
	TOKEN_KIND(Token_OpenBrace, "{"), \
	TOKEN_KIND(Token_CloseBrace, "}"), \
	TOKEN_KIND(Token_Colon, ":"), \
	TOKEN_KIND(Token_Semicolon, ";"), \
	TOKEN_KIND(Token_Period, "."), \
	TOKEN_KIND(Token_Comma, ","), \
	TOKEN_KIND(Token_Ellipsis, ".."), \
	TOKEN_KIND(Token_HalfClosed, "..<"), \
	TOKEN_KIND(Token_BackSlash, "\\"), \
	TOKEN_KIND(Token__OperatorEnd, "_OperatorEnd"), \
\
	TOKEN_KIND(Token__KeywordBegin, "_KeywordBegin"), \
	TOKEN_KIND(Token_import, "import"), \
	TOKEN_KIND(Token_import_load, "import_load"), \
	TOKEN_KIND(Token_foreign, "foreign"), \
	TOKEN_KIND(Token_foreign_library, "foreign_library"), \
	TOKEN_KIND(Token_foreign_system_library, "foreign_system_library"), \
	TOKEN_KIND(Token_type, "type"), \
	TOKEN_KIND(Token_when, "when"), \
	TOKEN_KIND(Token_if, "if"), \
	TOKEN_KIND(Token_else, "else"), \
	TOKEN_KIND(Token_for, "for"), \
	TOKEN_KIND(Token_in, "in"), \
	TOKEN_KIND(Token_match, "match"), \
	TOKEN_KIND(Token_case, "case"), \
	TOKEN_KIND(Token_break, "break"), \
	TOKEN_KIND(Token_continue, "continue"), \
	TOKEN_KIND(Token_fallthrough, "fallthrough"), \
	TOKEN_KIND(Token_defer, "defer"), \
	TOKEN_KIND(Token_do, "do"), \
	TOKEN_KIND(Token_return, "return"), \
	TOKEN_KIND(Token_proc, "proc"), \
	TOKEN_KIND(Token_macro, "macro"), \
	TOKEN_KIND(Token_struct, "struct"), \
	TOKEN_KIND(Token_union, "union"), \
	TOKEN_KIND(Token_raw_union, "raw_union"), \
	TOKEN_KIND(Token_enum, "enum"), \
	TOKEN_KIND(Token_bit_field, "bit_field"), \
	TOKEN_KIND(Token_vector, "vector"), \
	TOKEN_KIND(Token_static, "static"), \
	TOKEN_KIND(Token_dynamic, "dynamic"), \
	TOKEN_KIND(Token_map, "map"), \
	TOKEN_KIND(Token_using, "using"), \
	TOKEN_KIND(Token_context, "context"), \
	TOKEN_KIND(Token_push_context, "push_context"), \
	TOKEN_KIND(Token_push_allocator, "push_allocator"), \
	TOKEN_KIND(Token_size_of, "size_of"), \
	TOKEN_KIND(Token_align_of, "align_of"), \
	TOKEN_KIND(Token_offset_of, "offset_of"), \
	TOKEN_KIND(Token_type_of, "type_of"), \
	TOKEN_KIND(Token_asm, "asm"), \
	TOKEN_KIND(Token_yield, "yield"), \
	TOKEN_KIND(Token_await, "await"), \
	TOKEN_KIND(Token_atomic, "atomic"), \
	TOKEN_KIND(Token__KeywordEnd, "_KeywordEnd"), \
	TOKEN_KIND(Token_Count, "")

enum TokenKind {
#define TOKEN_KIND(e, s) e
	TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
	TOKEN_KINDS
#undef TOKEN_KIND
};

struct TokenPos {
	String file;
	isize  line;
	isize  column;
};

i32 token_pos_cmp(TokenPos a, TokenPos b) {
	if (a.line == b.line) {
		if (a.column == b.column) {
			isize min_len = gb_min(a.file.len, b.file.len);
			return gb_memcompare(a.file.text, b.file.text, min_len);
		}
		return (a.column < b.column) ? -1 : +1;
	}
	return (a.line < b.line) ? -1 : +1;
}

bool token_pos_eq(TokenPos a, TokenPos b) {
	return token_pos_cmp(a, b) == 0;
}

struct Token {
	TokenKind kind;
	String    string;
	TokenPos  pos;
};

Token empty_token = {Token_Invalid};
Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}};

Token make_token_ident(String s) {
	Token t = {Token_Ident, s};
	return t;
}

struct ErrorCollector {
	TokenPos prev;
	i64      count;
	i64      warning_count;
	gbMutex  mutex;
};

gb_global ErrorCollector global_error_collector;

void init_global_error_collector(void) {
	gb_mutex_init(&global_error_collector.mutex);
}

void warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate error, skip it
	if (!token_pos_eq(global_error_collector.prev, token.pos)) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (!token_pos_eq(global_error_collector.prev, token.pos)) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	} else if (token.pos.line == 0) {
		gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (!token_pos_eq(global_error_collector.prev, token.pos)) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Error: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	} else if (token.pos.line == 0) {
		gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate error, skip it
	if (!token_pos_eq(global_error_collector.prev, token.pos)) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	} else if (token.pos.line == 0) {
		gb_printf_err("Warning: %s\n", gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	warning_va(token, fmt, va);
	va_end(va);
}

void error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_va(token, fmt, va);
	va_end(va);
}

void syntax_error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_error_va(token, fmt, va);
	va_end(va);
}

void syntax_warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_warning_va(token, fmt, va);
	va_end(va);
}

void compiler_error(char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	gb_printf_err("Internal Compiler Error: %s\n",
	              gb_bprintf_va(fmt, va));
	va_end(va);
	gb_exit(1);
}

gb_inline bool token_is_literal(TokenKind t) {
	return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline bool token_is_operator(TokenKind t) {
	return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline bool token_is_keyword(TokenKind t) {
	return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline bool token_is_comparison(TokenKind t) {
	return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline bool token_is_shift(TokenKind t) {
	return t == Token_Shl || t == Token_Shr;
}

gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }

enum TokenizerInitError {
	TokenizerInit_None,
	TokenizerInit_Invalid,
	TokenizerInit_NotExists,
	TokenizerInit_Permission,
	TokenizerInit_Empty,
	TokenizerInit_Count,
};

struct TokenizerState {
	Rune  curr_rune;   // current character
	u8 *  curr;        // character pos
	u8 *  read_curr;   // pos from start
	u8 *  line;        // current line pos
	isize line_count;
};

struct Tokenizer {
	String fullpath;
	u8 *start;
	u8 *end;
	Rune  curr_rune;   // current character
	u8 *  curr;        // character pos
	u8 *  read_curr;   // pos from start
	u8 *  line;        // current line pos
	isize line_count;
	isize error_count;
	Array<String> allocated_strings;
};

TokenizerState save_tokenizer_state(Tokenizer *t) {
	TokenizerState state = {};
	state.curr_rune  = t->curr_rune;
	state.curr       = t->curr;
	state.read_curr  = t->read_curr;
	state.line       = t->line;
	state.line_count = t->line_count;
	return state;
}

void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) {
	t->curr_rune  = state->curr_rune;
	t->curr       = state->curr;
	t->read_curr  = state->read_curr;
	t->line       = state->line;
	t->line_count = state->line_count;
}

void tokenizer_err(Tokenizer *t, char *msg, ...) {
	va_list va;
	isize column = t->read_curr - t->line+1;
	if (column < 1) {
		column = 1;
	}

	gb_printf_err("%.*s(%td:%td) Syntax error: ", LIT(t->fullpath), t->line_count, column);
	va_start(va, msg);
	gb_printf_err_va(msg, va);
	va_end(va);
	gb_printf_err("\n");

	t->error_count++;
}
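
// Advances the tokenizer by one rune, decoding multi-byte UTF-8 sequences and
// tracking the current line start and line count; NUL bytes, invalid UTF-8,
// and a byte order mark that is not at the start of the file are reported as
// errors.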
void advance_to_next_rune(Tokenizer *t) {
	if (t->read_curr < t->end) {
		Rune rune;
		isize width = 1;

		t->curr = t->read_curr;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		rune = *t->read_curr;
		if (rune == 0) {
			tokenizer_err(t, "Illegal character NUL");
		} else if (rune >= 0x80) { // not ASCII
			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			if (rune == GB_RUNE_INVALID && width == 1)
				tokenizer_err(t, "Illegal UTF-8 encoding");
			else if (rune == GB_RUNE_BOM && t->curr-t->start > 0)
				tokenizer_err(t, "Illegal byte order mark");
		}
		t->read_curr += width;
		t->curr_rune = rune;
	} else {
		t->curr = t->end;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		t->curr_rune = GB_RUNE_EOF;
	}
}

TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
	TokenizerInitError err = TokenizerInit_None;

	char *c_str = gb_alloc_array(heap_allocator(), char, fullpath.len+1);
	gb_memcopy(c_str, fullpath.text, fullpath.len);
	c_str[fullpath.len] = '\0';

	// TODO(bill): Memory map rather than copy contents
	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);
	gb_zero_item(t);

	if (fc.data != nullptr) {
		t->start = cast(u8 *)fc.data;
		t->line = t->read_curr = t->curr = t->start;
		t->end = t->start + fc.size;
		t->fullpath = fullpath;
		t->line_count = 1;

		advance_to_next_rune(t);
		if (t->curr_rune == GB_RUNE_BOM) {
			advance_to_next_rune(t); // Ignore BOM at file beginning
		}

		array_init(&t->allocated_strings, heap_allocator());
	} else {
		gbFile f = {};
		gbFileError file_err = gb_file_open(&f, c_str);

		switch (file_err) {
		case gbFileError_Invalid:    err = TokenizerInit_Invalid;    break;
		case gbFileError_NotExists:  err = TokenizerInit_NotExists;  break;
		case gbFileError_Permission: err = TokenizerInit_Permission; break;
		}

		if (err == TokenizerInit_None && gb_file_size(&f) == 0) {
			err = TokenizerInit_Empty;
		}

		gb_file_close(&f);
	}

	gb_free(heap_allocator(), c_str);
	return err;
}

gb_inline void destroy_tokenizer(Tokenizer *t) {
	if (t->start != nullptr) {
		gb_free(heap_allocator(), t->start);
	}
	for_array(i, t->allocated_strings) {
		gb_free(heap_allocator(), t->allocated_strings[i].text);
	}
	array_free(&t->allocated_strings);
}

void tokenizer_skip_whitespace(Tokenizer *t) {
	while (t->curr_rune == ' '  ||
	       t->curr_rune == '\t' ||
	       t->curr_rune == '\n' ||
	       t->curr_rune == '\r') {
		advance_to_next_rune(t);
	}
}
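
// digit_value maps a rune to its numeric value for bases up to 16, returning
// 16 (larger than any valid digit) for non-digits; scan_mantissa consumes
// digits of the given base, allowing '_' as a separator.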
gb_inline i32 digit_value(Rune r) {
	if (gb_char_is_digit(cast(char)r)) {
		return r - '0';
	} else if (gb_is_between(cast(char)r, 'a', 'f')) {
		return r - 'a' + 10;
	} else if (gb_is_between(cast(char)r, 'A', 'F')) {
		return r - 'A' + 10;
	}
	return 16; // NOTE(bill): Larger than highest possible
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
	while (digit_value(t->curr_rune) < base || t->curr_rune == '_') {
		advance_to_next_rune(t);
	}
}
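
// Scans an integer, float, or imaginary literal. A leading '0' may introduce a
// base prefix: 0b (binary), 0o (octal), 0d (decimal), 0z (dozenal), 0x (hex);
// a trailing 'i' marks the literal as imaginary.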
Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) {
	Token token = {};
	token.kind = Token_Integer;
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = t->curr-t->line+1;

	if (seen_decimal_point) {
		token.kind = Token_Float;
		scan_mantissa(t, 10);
		goto exponent;
	}

	if (t->curr_rune == '0') {
		u8 *prev = t->curr;
		advance_to_next_rune(t);
		if (t->curr_rune == 'b') { // Binary
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'o') { // Octal
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'd') { // Decimal
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'z') { // Dozenal
			advance_to_next_rune(t);
			scan_mantissa(t, 12);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'x') { // Hexadecimal
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} /* else if (t->curr_rune == 'h') { // Hexadecimal Float
			token.kind = Token_Float;
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} */ else {
			seen_decimal_point = false;
			scan_mantissa(t, 10);
			if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
				seen_decimal_point = true;
				goto fraction;
			}
		}

		goto end;
	}

	scan_mantissa(t, 10);

fraction:
	if (t->curr_rune == '.') {
		// HACK(bill): This may be inefficient
		TokenizerState state = save_tokenizer_state(t);
		advance_to_next_rune(t);
		if (t->curr_rune == '.') {
			// TODO(bill): Clean up this shit
			restore_tokenizer_state(t, &state);
			goto end;
		}
		token.kind = Token_Float;
		scan_mantissa(t, 10);
	}

exponent:
	if (t->curr_rune == 'e' || t->curr_rune == 'E') {
		token.kind = Token_Float;
		advance_to_next_rune(t);
		if (t->curr_rune == '-' || t->curr_rune == '+') {
			advance_to_next_rune(t);
		}
		scan_mantissa(t, 10);
	}

	if (t->curr_rune == 'i') {
		token.kind = Token_Imag;
		advance_to_next_rune(t);
	}

end:
	token.string.len = t->curr - token.string.text;
	return token;
}
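
// Validates a single escape sequence after a '\' inside a rune or string
// literal; returns false (and reports an error) if the sequence is unknown,
// unterminated, or contains a digit outside the escape's base.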
// Quote == " for string
bool scan_escape(Tokenizer *t, Rune quote) {
	isize len = 0;
	u32 base = 0, max = 0, x = 0;

	Rune r = t->curr_rune;
	if (r == 'a'  ||
	    r == 'b'  ||
	    r == 'f'  ||
	    r == 'n'  ||
	    r == 'r'  ||
	    r == 't'  ||
	    r == 'v'  ||
	    r == '\\' ||
	    r == quote) {
		advance_to_next_rune(t);
		return true;
	} else if (gb_is_between(r, '0', '7')) {
		len = 3; base = 8; max = 255;
	} else if (r == 'x') {
		advance_to_next_rune(t);
		len = 2; base = 16; max = 255;
	} else if (r == 'u') {
		advance_to_next_rune(t);
		len = 4; base = 16; max = GB_RUNE_MAX;
	} else if (r == 'U') {
		advance_to_next_rune(t);
		len = 8; base = 16; max = GB_RUNE_MAX;
	} else {
		if (t->curr_rune < 0) {
			tokenizer_err(t, "Escape sequence was not terminated");
		} else {
			tokenizer_err(t, "Unknown escape sequence");
		}
		return false;
	}

	while (len --> 0) {
		u32 d = cast(u32)digit_value(t->curr_rune);
		if (d >= base) {
			if (t->curr_rune < 0) {
				tokenizer_err(t, "Escape sequence was not terminated");
			} else {
				tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
			}
			return false;
		}
		x = x*base + d;
		advance_to_next_rune(t);
	}

	return true;
}
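
// Helpers for operators that share a common prefix: each looks at the next
// rune to choose between the base operator, its '='-assignment form, and any
// doubled forms (e.g. '<' vs '<=' vs '<<' vs '<<=').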
gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	}
	return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	} else if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	} else if (t->curr_rune == ch_d) {
		advance_to_next_rune(t);
		return d;
	}
	return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return sing_eq;
	} else if (t->curr_rune == sing_rune) {
		advance_to_next_rune(t);
		if (t->curr_rune == '=') {
			advance_to_next_rune(t);
			return dub_eq;
		}
		return dub;
	}
	return sing;
}

void tokenizer__fle_update(Tokenizer *t) {
	t->curr_rune = '/';
	t->curr = t->curr-1;
	t->read_curr = t->curr+1;
	advance_to_next_rune(t);
}

// NOTE(bill): needed if comment is straight after a "semicolon"
bool tokenizer_find_line_end(Tokenizer *t) {
	while (t->curr_rune == '/' || t->curr_rune == '*') {
		if (t->curr_rune == '/') {
			tokenizer__fle_update(t);
			return true;
		}
		advance_to_next_rune(t);
		while (t->curr_rune >= 0) {
			Rune r = t->curr_rune;
			if (r == '\n') {
				tokenizer__fle_update(t);
				return true;
			}
			advance_to_next_rune(t);
			if (r == '*' && t->curr_rune == '/') {
				advance_to_next_rune(t);
				break;
			}
		}
		tokenizer_skip_whitespace(t);
		if (t->curr_rune < 0 || t->curr_rune == '\n') {
			tokenizer__fle_update(t);
			return true;
		}
		if (t->curr_rune != '/') {
			tokenizer__fle_update(t);
			return false;
		}
		advance_to_next_rune(t);
	}
	tokenizer__fle_update(t);
	return false;
}
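
// Produces the next token: skips whitespace, records the token's position,
// then dispatches on the first rune to scan identifiers/keywords, numbers,
// rune and string literals, comments, or operator/punctuation tokens.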
Token tokenizer_get_token(Tokenizer *t) {
	tokenizer_skip_whitespace(t);

	Token token = {};
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = t->curr - t->line + 1;

	Rune curr_rune = t->curr_rune;
	if (rune_is_letter(curr_rune)) {
		token.kind = Token_Ident;
		while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) {
			advance_to_next_rune(t);
		}
		token.string.len = t->curr - token.string.text;

		// NOTE(bill): All keywords are > 1
		if (token.string.len > 1) {
			for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
				if (token.string == token_strings[k]) {
					token.kind = cast(TokenKind)k;
					break;
				}
			}
		}
	} else if (gb_is_between(curr_rune, '0', '9')) {
		token = scan_number_to_token(t, false);
	} else {
		advance_to_next_rune(t);
		switch (curr_rune) {
		case GB_RUNE_EOF:
			token.kind = Token_EOF;
			break;

		case '\'': // Rune Literal
		{
			token.kind = Token_Rune;
			Rune quote = curr_rune;
			bool valid = true;
			i32 n = 0, success;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					tokenizer_err(t, "Rune literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == quote) {
					break;
				}
				n++;
				if (r == '\\') {
					if (!scan_escape(t, quote)) {
						valid = false;
					}
				}
			}
			// TODO(bill): Better Error Handling
			if (valid && n != 1) {
				tokenizer_err(t, "Invalid rune literal");
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid rune literal");
			}
		} break;

		case '`': // Raw String Literal
		case '"': // String Literal
		{
			i32 success;
			Rune quote = curr_rune;
			token.kind = Token_String;
			if (curr_rune == '"') {
				for (;;) {
					Rune r = t->curr_rune;
					if (r == '\n' || r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
					if (r == '\\') {
						scan_escape(t, quote);
					}
				}
			} else {
				for (;;) {
					Rune r = t->curr_rune;
					if (r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
				}
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid string literal");
			}
		} break;

		case '.':
			token.kind = Token_Period; // Default
			if (t->curr_rune == '.') { // Could be an ellipsis
				advance_to_next_rune(t);
				token.kind = Token_Ellipsis;
				if (t->curr_rune == '<') {
					advance_to_next_rune(t);
					token.kind = Token_HalfClosed;
				}
			}
			break;

		case '#':  token.kind = Token_Hash;         break;
		case '@':  token.kind = Token_At;           break;
		case '$':  token.kind = Token_Dollar;       break;
		case '?':  token.kind = Token_Question;     break;
		case '^':  token.kind = Token_Pointer;      break;
		case ';':  token.kind = Token_Semicolon;    break;
		case ',':  token.kind = Token_Comma;        break;
		case ':':  token.kind = Token_Colon;        break;
		case '(':  token.kind = Token_OpenParen;    break;
		case ')':  token.kind = Token_CloseParen;   break;
		case '[':  token.kind = Token_OpenBracket;  break;
		case ']':  token.kind = Token_CloseBracket; break;
		case '{':  token.kind = Token_OpenBrace;    break;
		case '}':  token.kind = Token_CloseBrace;   break;
		case '\\': token.kind = Token_BackSlash;    break;

		case 0x2260: token.kind = Token_NotEq; break; // '≠'
		case 0x2264: token.kind = Token_LtEq;  break; // '≤'
		case 0x2265: token.kind = Token_GtEq;  break; // '≥'

		case '%': token.kind = token_kind_dub_eq(t, '%', Token_Mod, Token_ModEq, Token_ModMod, Token_ModModEq); break;
		case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;

		case '=':
			token.kind = Token_Eq;
			if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_DoubleArrowRight;
			} else if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_CmpEq;
			}
			break;

		case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
		case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
		case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Inc); break;

		case '-':
			token.kind = Token_Sub;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_SubEq;
			} else if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_Dec;
				if (t->curr_rune == '-') {
					advance_to_next_rune(t);
					token.kind = Token_Undef;
				}
			} else if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowRight;
			}
			break;

		case '/': {
			if (t->curr_rune == '/') {
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
					advance_to_next_rune(t);
				}
				token.kind = Token_Comment;
			} else if (t->curr_rune == '*') {
				isize comment_scope = 1;
				advance_to_next_rune(t);
				while (comment_scope > 0) {
					if (t->curr_rune == GB_RUNE_EOF) {
						break;
					} else if (t->curr_rune == '/') {
						advance_to_next_rune(t);
						if (t->curr_rune == '*') {
							advance_to_next_rune(t);
							comment_scope++;
						}
					} else if (t->curr_rune == '*') {
						advance_to_next_rune(t);
						if (t->curr_rune == '/') {
							advance_to_next_rune(t);
							comment_scope--;
						}
					} else {
						advance_to_next_rune(t);
					}
				}
				token.kind = Token_Comment;
			} else {
				token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq);
			}
		} break;

		case '<':
			if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowLeft;
			} else {
				token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
			}
			break;

		case '>': token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq); break;

		case '&':
			token.kind = Token_And;
			if (t->curr_rune == '~') {
				token.kind = Token_AndNot;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token.kind = Token_AndNotEq;
					advance_to_next_rune(t);
				}
			} else {
				token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
			}
			break;

		case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

		default:
			if (curr_rune != GB_RUNE_BOM) {
				u8 str[4] = {};
				int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
				tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
			}
			token.kind = Token_Invalid;
			break;
		}
	}

	token.string.len = t->curr - token.string.text;
	return token;
}
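
// Illustrative only, not part of the original file: a minimal sketch of how a
// caller might drive the tokenizer using the routines defined above. The
// function name and the use of print_token are placeholders; real callers
// collect tokens for the parser instead. Guarded out so it never compiles in.
#if 0
void tokenize_file_example(String fullpath) {
	Tokenizer t = {};
	if (init_tokenizer(&t, fullpath) == TokenizerInit_None) {
		for (;;) {
			Token token = tokenizer_get_token(&t);
			if (token.kind == Token_EOF) {
				break;
			}
			print_token(token); // placeholder for whatever the caller does with each token
		}
		destroy_tokenizer(&t);
	}
}
#endif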