// tokenizer.cpp
#define TOKEN_KINDS \
	TOKEN_KIND(Token_Invalid, "Invalid"), \
	TOKEN_KIND(Token_EOF, "EOF"), \
	TOKEN_KIND(Token_Comment, "Comment"), \
\
	TOKEN_KIND(Token__LiteralBegin, ""), \
	TOKEN_KIND(Token_Ident, "identifier"), \
	TOKEN_KIND(Token_Integer, "integer"), \
	TOKEN_KIND(Token_Float, "float"), \
	TOKEN_KIND(Token_Imag, "imaginary"), \
	TOKEN_KIND(Token_Rune, "rune"), \
	TOKEN_KIND(Token_String, "string"), \
	TOKEN_KIND(Token__LiteralEnd, ""), \
\
	TOKEN_KIND(Token__OperatorBegin, ""), \
	TOKEN_KIND(Token_Eq, "="), \
	TOKEN_KIND(Token_Not, "!"), \
	TOKEN_KIND(Token_Hash, "#"), \
	TOKEN_KIND(Token_At, "@"), \
	TOKEN_KIND(Token_Dollar, "$"), \
	TOKEN_KIND(Token_Pointer, "^"), \
	TOKEN_KIND(Token_Question, "?"), \
	TOKEN_KIND(Token_Add, "+"), \
	TOKEN_KIND(Token_Sub, "-"), \
	TOKEN_KIND(Token_Mul, "*"), \
	TOKEN_KIND(Token_Quo, "/"), \
	TOKEN_KIND(Token_Mod, "%"), \
	TOKEN_KIND(Token_ModMod, "%%"), \
	TOKEN_KIND(Token_And, "&"), \
	TOKEN_KIND(Token_Or, "|"), \
	TOKEN_KIND(Token_Xor, "~"), \
	TOKEN_KIND(Token_AndNot, "&~"), \
	TOKEN_KIND(Token_Shl, "<<"), \
	TOKEN_KIND(Token_Shr, ">>"), \
\
	TOKEN_KIND(Token_CmpAnd, "&&"), \
	TOKEN_KIND(Token_CmpOr, "||"), \
\
	TOKEN_KIND(Token__AssignOpBegin, ""), \
	TOKEN_KIND(Token_AddEq, "+="), \
	TOKEN_KIND(Token_SubEq, "-="), \
	TOKEN_KIND(Token_MulEq, "*="), \
	TOKEN_KIND(Token_QuoEq, "/="), \
	TOKEN_KIND(Token_ModEq, "%="), \
	TOKEN_KIND(Token_ModModEq, "%%="), \
	TOKEN_KIND(Token_AndEq, "&="), \
	TOKEN_KIND(Token_OrEq, "|="), \
	TOKEN_KIND(Token_XorEq, "~="), \
	TOKEN_KIND(Token_AndNotEq, "&~="), \
	TOKEN_KIND(Token_ShlEq, "<<="), \
	TOKEN_KIND(Token_ShrEq, ">>="), \
	TOKEN_KIND(Token_CmpAndEq, "&&="), \
	TOKEN_KIND(Token_CmpOrEq, "||="), \
	TOKEN_KIND(Token__AssignOpEnd, ""), \
	TOKEN_KIND(Token_ArrowRight, "->"), \
	TOKEN_KIND(Token_ArrowLeft, "<-"), \
	TOKEN_KIND(Token_DoubleArrowRight, "=>"), \
	TOKEN_KIND(Token_Undef, "---"), \
\
	TOKEN_KIND(Token__ComparisonBegin, ""), \
	TOKEN_KIND(Token_CmpEq, "=="), \
	TOKEN_KIND(Token_NotEq, "!="), \
	TOKEN_KIND(Token_Lt, "<"), \
	TOKEN_KIND(Token_Gt, ">"), \
	TOKEN_KIND(Token_LtEq, "<="), \
	TOKEN_KIND(Token_GtEq, ">="), \
	TOKEN_KIND(Token__ComparisonEnd, ""), \
\
	TOKEN_KIND(Token_OpenParen, "("), \
	TOKEN_KIND(Token_CloseParen, ")"), \
	TOKEN_KIND(Token_OpenBracket, "["), \
	TOKEN_KIND(Token_CloseBracket, "]"), \
	TOKEN_KIND(Token_OpenBrace, "{"), \
	TOKEN_KIND(Token_CloseBrace, "}"), \
	TOKEN_KIND(Token_Colon, ":"), \
	TOKEN_KIND(Token_Semicolon, ";"), \
	TOKEN_KIND(Token_Period, "."), \
	TOKEN_KIND(Token_Comma, ","), \
	TOKEN_KIND(Token_Ellipsis, ".."), \
	TOKEN_KIND(Token_BackSlash, "\\"), \
	TOKEN_KIND(Token__OperatorEnd, ""), \
\
	TOKEN_KIND(Token__KeywordBegin, ""), \
	TOKEN_KIND(Token_import, "import"), \
	TOKEN_KIND(Token_export, "export"), \
	TOKEN_KIND(Token_foreign, "foreign"), \
	TOKEN_KIND(Token_package, "package"), \
	TOKEN_KIND(Token_typeid, "typeid"), \
	TOKEN_KIND(Token_when, "when"), \
	TOKEN_KIND(Token_if, "if"), \
	TOKEN_KIND(Token_else, "else"), \
	TOKEN_KIND(Token_for, "for"), \
	TOKEN_KIND(Token_switch, "switch"), \
	TOKEN_KIND(Token_in, "in"), \
	TOKEN_KIND(Token_do, "do"), \
	TOKEN_KIND(Token_case, "case"), \
	TOKEN_KIND(Token_break, "break"), \
	TOKEN_KIND(Token_continue, "continue"), \
	TOKEN_KIND(Token_fallthrough, "fallthrough"), \
	TOKEN_KIND(Token_defer, "defer"), \
	TOKEN_KIND(Token_return, "return"), \
	TOKEN_KIND(Token_proc, "proc"), \
	TOKEN_KIND(Token_macro, "macro"), \
	TOKEN_KIND(Token_struct, "struct"), \
	TOKEN_KIND(Token_union, "union"), \
	TOKEN_KIND(Token_enum, "enum"), \
	TOKEN_KIND(Token_bit_field, "bit_field"), \
	TOKEN_KIND(Token_bit_set, "bit_set"), \
	TOKEN_KIND(Token_map, "map"), \
	TOKEN_KIND(Token_static, "static"), \
	TOKEN_KIND(Token_dynamic, "dynamic"), \
	TOKEN_KIND(Token_auto_cast, "auto_cast"), \
	TOKEN_KIND(Token_cast, "cast"), \
	TOKEN_KIND(Token_transmute, "transmute"), \
	TOKEN_KIND(Token_distinct, "distinct"), \
	TOKEN_KIND(Token_opaque, "opaque"), \
	TOKEN_KIND(Token_using, "using"), \
	TOKEN_KIND(Token_inline, "inline"), \
	TOKEN_KIND(Token_no_inline, "no_inline"), \
	TOKEN_KIND(Token_context, "context"), \
	TOKEN_KIND(Token_size_of, "size_of"), \
	TOKEN_KIND(Token_align_of, "align_of"), \
	TOKEN_KIND(Token_offset_of, "offset_of"), \
	TOKEN_KIND(Token_type_of, "type_of"), \
	TOKEN_KIND(Token_const, "const"), \
	TOKEN_KIND(Token_asm, "asm"), \
	TOKEN_KIND(Token_yield, "yield"), \
	TOKEN_KIND(Token_await, "await"), \
	TOKEN_KIND(Token__KeywordEnd, ""), \
\
	TOKEN_KIND(Token_Count, "")

enum TokenKind {
#define TOKEN_KIND(e, s) e
	TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
	TOKEN_KINDS
#undef TOKEN_KIND
};
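
// The TOKEN_KINDS X-macro is expanded twice with two different TOKEN_KIND
// definitions, so the enum and its string table are generated from one list
// and can never drift apart. An illustrative sketch of the expansion:
//
//	enum TokenKind { Token_Invalid, Token_EOF, Token_Comment, /* ... */ };
//	String const token_strings[] = {{cast(u8 *)"Invalid", 7}, /* ... */};
//
// which makes token_strings[kind] the printable name of any TokenKind.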

struct TokenPos {
	String file;
	isize offset; // starting at 0
	isize line;   // starting at 1
	isize column; // starting at 1
};

i32 token_pos_cmp(TokenPos const &a, TokenPos const &b) {
	if (a.offset != b.offset) {
		return (a.offset < b.offset) ? -1 : +1;
	}
	if (a.line != b.line) {
		return (a.line < b.line) ? -1 : +1;
	}
	if (a.column != b.column) {
		return (a.column < b.column) ? -1 : +1;
	}
	return string_compare(a.file, b.file);
}

bool operator==(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) == 0; }
bool operator!=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) != 0; }
bool operator< (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <  0; }
bool operator<=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <= 0; }
bool operator> (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >  0; }
bool operator>=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >= 0; }

struct Token {
	TokenKind kind;
	String string;
	TokenPos pos;
};

Token empty_token = {Token_Invalid};
Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}};

Token make_token_ident(String s) {
	Token t = {Token_Ident, s};
	return t;
}

struct ErrorCollector {
	TokenPos prev;
	i64 count;
	i64 warning_count;
	gbMutex mutex;
};

gb_global ErrorCollector global_error_collector;

void init_global_error_collector(void) {
	gb_mutex_init(&global_error_collector.mutex);
}

void warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate warning, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Warning: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
	if (global_error_collector.count > 20) {
		gb_exit(1);
	}
}

void error_no_newline_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Error: %s", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) %s",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
	if (global_error_collector.count > 20) {
		gb_exit(1);
	}
}

void syntax_error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Syntax Error: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Error: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
	if (global_error_collector.count > 20) {
		gb_exit(1);
	}
}

void syntax_warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate warning, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Syntax Warning: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	warning_va(token, fmt, va);
	va_end(va);
}

void error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_va(token, fmt, va);
	va_end(va);
}

void error(TokenPos pos, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	Token token = {};
	token.pos = pos;
	error_va(token, fmt, va);
	va_end(va);
}

void syntax_error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_error_va(token, fmt, va);
	va_end(va);
}

void syntax_warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_warning_va(token, fmt, va);
	va_end(va);
}

void compiler_error(char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	gb_printf_err("Internal Compiler Error: %s\n",
	              gb_bprintf_va(fmt, va));
	va_end(va);
	gb_exit(1);
}
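
// Illustrative call sites for the reporting helpers above (the format
// strings here are hypothetical, not taken from real callers):
//
//	error(token, "undeclared name: %.*s", LIT(token.string));
//	syntax_error(token, "expected an operand");
//	compiler_error("unreachable case in %s", __func__);
//
// All of them funnel through the mutex-guarded global_error_collector, so
// reporting stays coherent when multiple threads emit diagnostics.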

gb_inline bool token_is_literal(TokenKind t) {
	return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline bool token_is_operator(TokenKind t) {
	return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline bool token_is_keyword(TokenKind t) {
	return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline bool token_is_comparison(TokenKind t) {
	return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline bool token_is_shift(TokenKind t) {
	return t == Token_Shl || t == Token_Shr;
}
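
// These classification helpers work because TOKEN_KINDS brackets each group
// with Token__XxxBegin/Token__XxxEnd sentinels, reducing "is this a keyword?"
// to an integer range test. A sketch of what that buys a caller:
//
//	if (token_is_keyword(token.kind)) { /* e.g. Token_if, Token_return */ }
//	if (token_is_literal(token.kind)) { /* Token_Ident .. Token_String */ }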

gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }

enum TokenizerInitError {
	TokenizerInit_None,
	TokenizerInit_Invalid,
	TokenizerInit_NotExists,
	TokenizerInit_Permission,
	TokenizerInit_Empty,
	TokenizerInit_Count,
};

struct TokenizerState {
	Rune curr_rune;  // current decoded rune
	u8 * curr;       // position of curr_rune
	u8 * read_curr;  // read-ahead position: next byte to decode
	u8 * line;       // start of the current line
	isize line_count;
};

struct Tokenizer {
	String fullpath;
	u8 *start;
	u8 *end;

	Rune curr_rune;  // current decoded rune
	u8 * curr;       // position of curr_rune
	u8 * read_curr;  // read-ahead position: next byte to decode
	u8 * line;       // start of the current line
	isize line_count;

	isize error_count;
	Array<String> allocated_strings;
};

TokenizerState save_tokenizer_state(Tokenizer *t) {
	TokenizerState state = {};
	state.curr_rune = t->curr_rune;
	state.curr = t->curr;
	state.read_curr = t->read_curr;
	state.line = t->line;
	state.line_count = t->line_count;
	return state;
}

void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) {
	t->curr_rune = state->curr_rune;
	t->curr = state->curr;
	t->read_curr = state->read_curr;
	t->line = state->line;
	t->line_count = state->line_count;
}
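
// Save/restore gives the scanner cheap backtracking: speculate past a rune,
// then rewind if the speculation fails. scan_number_to_token uses exactly
// this to keep ".." out of a number. An illustrative sketch:
//
//	TokenizerState state = save_tokenizer_state(t);
//	advance_to_next_rune(t);
//	if (t->curr_rune == '.') {
//		restore_tokenizer_state(t, &state); // "1..5" is a range, not a float
//	}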

void tokenizer_err(Tokenizer *t, char *msg, ...) {
	va_list va;
	isize column = t->read_curr - t->line + 1;
	if (column < 1) {
		column = 1;
	}

	Token token = {};
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = column;

	va_start(va, msg);
	syntax_error_va(token, msg, va);
	va_end(va);

	t->error_count++;
}

void advance_to_next_rune(Tokenizer *t) {
	if (t->read_curr < t->end) {
		Rune rune;
		isize width = 1;

		t->curr = t->read_curr;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		rune = *t->read_curr;
		if (rune == 0) {
			tokenizer_err(t, "Illegal character NUL");
		} else if (rune >= 0x80) { // not ASCII
			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			if (rune == GB_RUNE_INVALID && width == 1) {
				tokenizer_err(t, "Illegal UTF-8 encoding");
			} else if (rune == GB_RUNE_BOM && t->curr-t->start > 0) {
				tokenizer_err(t, "Illegal byte order mark");
			}
		}
		t->read_curr += width;
		t->curr_rune = rune;
	} else {
		t->curr = t->end;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		t->curr_rune = GB_RUNE_EOF;
	}
}

TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
	TokenizerInitError err = TokenizerInit_None;

	char *c_str = alloc_cstring(heap_allocator(), fullpath);
	defer (gb_free(heap_allocator(), c_str));

	// TODO(bill): Memory map rather than copy contents
	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);

	gb_zero_item(t);
	t->fullpath = fullpath;
	t->line_count = 1;

	if (fc.data != nullptr) {
		t->start = cast(u8 *)fc.data;
		t->line = t->read_curr = t->curr = t->start;
		t->end = t->start + fc.size;

		advance_to_next_rune(t);
		if (t->curr_rune == GB_RUNE_BOM) {
			advance_to_next_rune(t); // Ignore BOM at file beginning
		}

		array_init(&t->allocated_strings, heap_allocator());
	} else {
		gbFile f = {};
		gbFileError file_err = gb_file_open(&f, c_str);
		defer (gb_file_close(&f));

		switch (file_err) {
		case gbFileError_Invalid:    err = TokenizerInit_Invalid;    break;
		case gbFileError_NotExists:  err = TokenizerInit_NotExists;  break;
		case gbFileError_Permission: err = TokenizerInit_Permission; break;
		}

		if (err == TokenizerInit_None && gb_file_size(&f) == 0) {
			err = TokenizerInit_Empty;
		}
	}

	return err;
}

gb_inline void destroy_tokenizer(Tokenizer *t) {
	if (t->start != nullptr) {
		gb_free(heap_allocator(), t->start);
	}
	for_array(i, t->allocated_strings) {
		gb_free(heap_allocator(), t->allocated_strings[i].text);
	}
	array_free(&t->allocated_strings);
}

void tokenizer_skip_whitespace(Tokenizer *t) {
	while (t->curr_rune == ' ' ||
	       t->curr_rune == '\t' ||
	       t->curr_rune == '\n' ||
	       t->curr_rune == '\r') {
		advance_to_next_rune(t);
	}
}

gb_inline i32 digit_value(Rune r) {
	if (gb_char_is_digit(cast(char)r)) {
		return r - '0';
	} else if (gb_is_between(cast(char)r, 'a', 'f')) {
		return r - 'a' + 10;
	} else if (gb_is_between(cast(char)r, 'A', 'F')) {
		return r - 'A' + 10;
	}
	return 16; // NOTE(bill): Larger than highest possible
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
	while (digit_value(t->curr_rune) < base || t->curr_rune == '_') {
		advance_to_next_rune(t);
	}
}
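
// digit_value maps a rune to its numeric value and returns 16 (one past the
// largest hexadecimal digit) for everything else, so "digit_value(r) < base"
// is a single test for "r is a valid digit in this base". For example:
//
//	digit_value('7') == 7    // valid in bases  8, 10, 12, 16
//	digit_value('b') == 11   // valid in bases 12 and 16 only
//	digit_value('g') == 16   // valid in no supported base
//
// scan_mantissa also accepts '_' so grouped literals such as 1_000_000 scan
// as one token.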

Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) {
	Token token = {};
	token.kind = Token_Integer;
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = t->curr - t->line + 1;

	if (seen_decimal_point) {
		token.string.text -= 1;
		token.string.len += 1;
		token.pos.column -= 1;
		token.kind = Token_Float;
		scan_mantissa(t, 10);
		goto exponent;
	}

	if (t->curr_rune == '0') {
		u8 *prev = t->curr;
		advance_to_next_rune(t);
		if (t->curr_rune == 'b') { // Binary
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'o') { // Octal
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'd') { // Decimal
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'z') { // Dozenal
			advance_to_next_rune(t);
			scan_mantissa(t, 12);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'x') { // Hexadecimal
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'h') { // Hexadecimal Float
			token.kind = Token_Float;
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			} else {
				u8 *start = prev+2;
				isize n = t->curr - start;
				isize digit_count = 0;
				for (isize i = 0; i < n; i++) {
					if (start[i] != '_') {
						digit_count += 1;
					}
				}
				switch (digit_count) {
				case 8:
				case 16:
					break;
				default:
					tokenizer_err(t, "Invalid hexadecimal float, expected 8 or 16 digits, got %td", digit_count);
					break;
				}
			}
		} else {
			seen_decimal_point = false;
			scan_mantissa(t, 10);

			if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
				seen_decimal_point = true;
				goto fraction;
			}
		}

		goto end;
	}

	scan_mantissa(t, 10);

fraction:
	if (t->curr_rune == '.') {
		// HACK(bill): This may be inefficient
		TokenizerState state = save_tokenizer_state(t);
		advance_to_next_rune(t);
		if (t->curr_rune == '.') {
			// TODO(bill): Clean this up
			restore_tokenizer_state(t, &state);
			goto end;
		}
		token.kind = Token_Float;
		scan_mantissa(t, 10);
	}

exponent:
	if (t->curr_rune == 'e' || t->curr_rune == 'E') {
		token.kind = Token_Float;
		advance_to_next_rune(t);
		if (t->curr_rune == '-' || t->curr_rune == '+') {
			advance_to_next_rune(t);
		}
		scan_mantissa(t, 10);
	}

	if (t->curr_rune == 'i') {
		token.kind = Token_Imag;
		advance_to_next_rune(t);
	}

end:
	token.string.len = t->curr - token.string.text;
	return token;
}
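
// Literal forms this scanner accepts (examples, not exhaustive):
//
//	42                       Token_Integer (decimal)
//	0b1010  0o777  0d255     Token_Integer (binary, octal, decimal)
//	0z10b   0x7fff           Token_Integer (dozenal, hexadecimal)
//	0h3f800000               Token_Float   (raw bit pattern; 8 or 16 hex digits)
//	1.5  .5  2e10  6.02e+23  Token_Float
//	3i                       Token_Imag
//
// A lone "0" followed by an unrecognised letter falls through to the plain
// decimal path, and "1..5" backtracks so ".." is left for the next token.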

bool scan_escape(Tokenizer *t) {
	isize len = 0;
	u32 base = 0, max = 0, x = 0;

	Rune r = t->curr_rune;
	if (r == 'a' ||
	    r == 'b' ||
	    r == 'e' ||
	    r == 'f' ||
	    r == 'n' ||
	    r == 'r' ||
	    r == 't' ||
	    r == 'v' ||
	    r == '\\' ||
	    r == '\'' ||
	    r == '\"') {
		advance_to_next_rune(t);
		return true;
	} else if (gb_is_between(r, '0', '7')) {
		len = 3; base = 8; max = 255;
	} else if (r == 'x') {
		advance_to_next_rune(t);
		len = 2; base = 16; max = 255;
	} else if (r == 'u') {
		advance_to_next_rune(t);
		len = 4; base = 16; max = GB_RUNE_MAX;
	} else if (r == 'U') {
		advance_to_next_rune(t);
		len = 8; base = 16; max = GB_RUNE_MAX;
	} else {
		if (t->curr_rune < 0) {
			tokenizer_err(t, "Escape sequence was not terminated");
		} else {
			tokenizer_err(t, "Unknown escape sequence");
		}
		return false;
	}

	while (len-- > 0) {
		u32 d = cast(u32)digit_value(t->curr_rune);
		if (d >= base) {
			if (t->curr_rune < 0) {
				tokenizer_err(t, "Escape sequence was not terminated");
			} else {
				tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
			}
			return false;
		}
		x = x*base + d;
		advance_to_next_rune(t);
	}

	return true;
}
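
// Escape forms recognised above (illustrative):
//
//	\n \t \\ \' \"   single-character escapes (accepted immediately)
//	\101             octal,       exactly 3 digits
//	\x41             hexadecimal, exactly 2 digits
//	\u00e9           Unicode,     exactly 4 hex digits
//	\U0001f600       Unicode,     exactly 8 hex digits
//
// The numeric value is accumulated into x, but the caller only learns
// whether the escape was well-formed; the literal text is decoded for real
// later by unquote_string.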

gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	}
	return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	} else if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	} else if (t->curr_rune == ch_d) {
		advance_to_next_rune(t);
		return d;
	}
	return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return sing_eq;
	} else if (t->curr_rune == sing_rune) {
		advance_to_next_rune(t);
		if (t->curr_rune == '=') {
			advance_to_next_rune(t);
			return dub_eq;
		}
		return dub;
	}
	return sing;
}
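
// These helpers resolve the maximal-munch choices once the leading operator
// rune has been consumed. For example, after '<', token_kind_dub_eq covers
// the whole family in one call:
//
//	<   Token_Lt      <=   Token_LtEq
//	<<  Token_Shl     <<=  Token_ShlEq
//
// as in: token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);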

void tokenizer__fle_update(Tokenizer *t) {
	t->curr_rune = '/';
	t->curr = t->curr-1;
	t->read_curr = t->curr+1;
	advance_to_next_rune(t);
}

// NOTE(bill): needed if comment is straight after a "semicolon"
bool tokenizer_find_line_end(Tokenizer *t) {
	while (t->curr_rune == '/' || t->curr_rune == '*') {
		if (t->curr_rune == '/') {
			tokenizer__fle_update(t);
			return true;
		}

		advance_to_next_rune(t);
		while (t->curr_rune >= 0) {
			Rune r = t->curr_rune;
			if (r == '\n') {
				tokenizer__fle_update(t);
				return true;
			}
			advance_to_next_rune(t);
			if (r == '*' && t->curr_rune == '/') {
				advance_to_next_rune(t);
				break;
			}
		}

		tokenizer_skip_whitespace(t);
		if (t->curr_rune < 0 || t->curr_rune == '\n') {
			tokenizer__fle_update(t);
			return true;
		}
		if (t->curr_rune != '/') {
			tokenizer__fle_update(t);
			return false;
		}
		advance_to_next_rune(t);
	}

	tokenizer__fle_update(t);
	return false;
}

Token tokenizer_get_token(Tokenizer *t) {
	tokenizer_skip_whitespace(t);

	Token token = {};
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.offset = t->curr - t->start;
	token.pos.column = t->curr - t->line + 1;

	Rune curr_rune = t->curr_rune;
	if (rune_is_letter(curr_rune)) {
		token.kind = Token_Ident;
		while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) {
			advance_to_next_rune(t);
		}

		token.string.len = t->curr - token.string.text;

		// NOTE(bill): All keywords are > 1
		if (token.string.len > 1) {
			for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
				if (token.string == token_strings[k]) {
					token.kind = cast(TokenKind)k;
					break;
				}
			}
		}
	} else if (gb_is_between(curr_rune, '0', '9')) {
		token = scan_number_to_token(t, false);
	} else {
		advance_to_next_rune(t);
		switch (curr_rune) {
		case GB_RUNE_EOF:
			token.kind = Token_EOF;
			break;

		case '\'': // Rune Literal
		{
			token.kind = Token_Rune;
			Rune quote = curr_rune;
			bool valid = true;
			i32 n = 0, success;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					tokenizer_err(t, "Rune literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == quote) {
					break;
				}
				n++;
				if (r == '\\') {
					if (!scan_escape(t)) {
						valid = false;
					}
				}
			}

			// TODO(bill): Better Error Handling
			if (valid && n != 1) {
				tokenizer_err(t, "Invalid rune literal");
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid rune literal");
			}
		} break;

		case '`': // Raw String Literal
		case '"': // String Literal
		{
			i32 success;
			Rune quote = curr_rune;
			token.kind = Token_String;
			if (curr_rune == '"') {
				for (;;) {
					Rune r = t->curr_rune;
					if (r == '\n' || r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
					if (r == '\\') {
						scan_escape(t);
					}
				}
			} else {
				for (;;) {
					Rune r = t->curr_rune;
					if (r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
				}
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid string literal");
			}
		} break;

		case '.':
			if (t->curr_rune == '.') { // Could be an ellipsis
				advance_to_next_rune(t);
				token.kind = Token_Ellipsis;
			} else if ('0' <= t->curr_rune && t->curr_rune <= '9') {
				token = scan_number_to_token(t, true);
			} else {
				token.kind = Token_Period;
			}
			break;

		case '#': token.kind = Token_Hash;         break;
		case '@': token.kind = Token_At;           break;
		case '$': token.kind = Token_Dollar;       break;
		case '?': token.kind = Token_Question;     break;
		case '^': token.kind = Token_Pointer;      break;
		case ';': token.kind = Token_Semicolon;    break;
		case ',': token.kind = Token_Comma;        break;
		case ':': token.kind = Token_Colon;        break;
		case '(': token.kind = Token_OpenParen;    break;
		case ')': token.kind = Token_CloseParen;   break;
		case '[': token.kind = Token_OpenBracket;  break;
		case ']': token.kind = Token_CloseBracket; break;
		case '{': token.kind = Token_OpenBrace;    break;
		case '}': token.kind = Token_CloseBrace;   break;
		case '\\': token.kind = Token_BackSlash;   break;

		case 0x2260: token.kind = Token_NotEq; break; // '≠'
		case 0x2264: token.kind = Token_LtEq;  break; // '≤'
		case 0x2265: token.kind = Token_GtEq;  break; // '≥'

		case '%': token.kind = token_kind_dub_eq(t, '%', Token_Mod, Token_ModEq, Token_ModMod, Token_ModModEq); break;
		case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;
		case '=':
			token.kind = Token_Eq;
			if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_DoubleArrowRight;
			} else if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_CmpEq;
			}
			break;
		case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
		case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
		// case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Inc); break;
		case '+': token.kind = token_kind_variant2(t, Token_Add, Token_AddEq); break;
		case '-':
			token.kind = Token_Sub;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_SubEq;
			} else if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_Invalid;
				if (t->curr_rune == '-') {
					advance_to_next_rune(t);
					token.kind = Token_Undef;
				}
			} else if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowRight;
			}
			break;
		case '/': {
			if (t->curr_rune == '/') {
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
					advance_to_next_rune(t);
				}
				token.kind = Token_Comment;
			} else if (t->curr_rune == '*') {
				isize comment_scope = 1;
				advance_to_next_rune(t);
				while (comment_scope > 0) {
					if (t->curr_rune == GB_RUNE_EOF) {
						break;
					} else if (t->curr_rune == '/') {
						advance_to_next_rune(t);
						if (t->curr_rune == '*') {
							advance_to_next_rune(t);
							comment_scope++;
						}
					} else if (t->curr_rune == '*') {
						advance_to_next_rune(t);
						if (t->curr_rune == '/') {
							advance_to_next_rune(t);
							comment_scope--;
						}
					} else {
						advance_to_next_rune(t);
					}
				}
				token.kind = Token_Comment;
			} else {
				token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq);
			}
		} break;
		case '<':
			if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowLeft;
			} else {
				token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
			}
			break;
		case '>': token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq); break;
		case '&':
			token.kind = Token_And;
			if (t->curr_rune == '~') {
				token.kind = Token_AndNot;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token.kind = Token_AndNotEq;
					advance_to_next_rune(t);
				}
			} else {
				token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
			}
			break;
		case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

		default:
			if (curr_rune != GB_RUNE_BOM) {
				u8 str[4] = {};
				int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
				tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
			}
			token.kind = Token_Invalid;
			break;
		}
	}

	token.string.len = t->curr - token.string.text;

	return token;
}
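
// A minimal driver sketch for the API above (the file name and surrounding
// setup are hypothetical; error handling is elided):
//
//	Tokenizer t = {};
//	String path = make_string(cast(u8 *)"example.odin", 12);
//	if (init_tokenizer(&t, path) == TokenizerInit_None) {
//		for (;;) {
//			Token tok = tokenizer_get_token(&t);
//			if (tok.kind == Token_EOF) break;
//			print_token(tok);
//		}
//		destroy_tokenizer(&t);
//	}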