tokenizer.cpp
#define TOKEN_KINDS \
	TOKEN_KIND(Token_Invalid, "Invalid"), \
	TOKEN_KIND(Token_EOF, "EOF"), \
	TOKEN_KIND(Token_Comment, "Comment"), \
\
	TOKEN_KIND(Token__LiteralBegin, ""), \
	TOKEN_KIND(Token_Ident, "identifier"), \
	TOKEN_KIND(Token_Integer, "integer"), \
	TOKEN_KIND(Token_Float, "float"), \
	TOKEN_KIND(Token_Imag, "imaginary"), \
	TOKEN_KIND(Token_Rune, "rune"), \
	TOKEN_KIND(Token_String, "string"), \
	TOKEN_KIND(Token__LiteralEnd, ""), \
\
	TOKEN_KIND(Token__OperatorBegin, ""), \
	TOKEN_KIND(Token_Eq, "="), \
	TOKEN_KIND(Token_Not, "!"), \
	TOKEN_KIND(Token_Hash, "#"), \
	TOKEN_KIND(Token_At, "@"), \
	TOKEN_KIND(Token_Dollar, "$"), \
	TOKEN_KIND(Token_Pointer, "^"), \
	TOKEN_KIND(Token_Question, "?"), \
	TOKEN_KIND(Token_Add, "+"), \
	TOKEN_KIND(Token_Sub, "-"), \
	TOKEN_KIND(Token_Mul, "*"), \
	TOKEN_KIND(Token_Quo, "/"), \
	TOKEN_KIND(Token_Mod, "%"), \
	TOKEN_KIND(Token_ModMod, "%%"), \
	TOKEN_KIND(Token_And, "&"), \
	TOKEN_KIND(Token_Or, "|"), \
	TOKEN_KIND(Token_Xor, "~"), \
	TOKEN_KIND(Token_AndNot, "&~"), \
	TOKEN_KIND(Token_Shl, "<<"), \
	TOKEN_KIND(Token_Shr, ">>"), \
\
	TOKEN_KIND(Token_CmpAnd, "&&"), \
	TOKEN_KIND(Token_CmpOr, "||"), \
\
	TOKEN_KIND(Token__AssignOpBegin, ""), \
	TOKEN_KIND(Token_AddEq, "+="), \
	TOKEN_KIND(Token_SubEq, "-="), \
	TOKEN_KIND(Token_MulEq, "*="), \
	TOKEN_KIND(Token_QuoEq, "/="), \
	TOKEN_KIND(Token_ModEq, "%="), \
	TOKEN_KIND(Token_ModModEq, "%%="), \
	TOKEN_KIND(Token_AndEq, "&="), \
	TOKEN_KIND(Token_OrEq, "|="), \
	TOKEN_KIND(Token_XorEq, "~="), \
	TOKEN_KIND(Token_AndNotEq, "&~="), \
	TOKEN_KIND(Token_ShlEq, "<<="), \
	TOKEN_KIND(Token_ShrEq, ">>="), \
	TOKEN_KIND(Token_CmpAndEq, "&&="), \
	TOKEN_KIND(Token_CmpOrEq, "||="), \
	TOKEN_KIND(Token__AssignOpEnd, ""), \
	TOKEN_KIND(Token_ArrowRight, "->"), \
	TOKEN_KIND(Token_ArrowLeft, "<-"), \
	TOKEN_KIND(Token_DoubleArrowRight, "=>"), \
	TOKEN_KIND(Token_Undef, "---"), \
\
	TOKEN_KIND(Token__ComparisonBegin, ""), \
	TOKEN_KIND(Token_CmpEq, "=="), \
	TOKEN_KIND(Token_NotEq, "!="), \
	TOKEN_KIND(Token_Lt, "<"), \
	TOKEN_KIND(Token_Gt, ">"), \
	TOKEN_KIND(Token_LtEq, "<="), \
	TOKEN_KIND(Token_GtEq, ">="), \
	TOKEN_KIND(Token__ComparisonEnd, ""), \
\
	TOKEN_KIND(Token_OpenParen, "("), \
	TOKEN_KIND(Token_CloseParen, ")"), \
	TOKEN_KIND(Token_OpenBracket, "["), \
	TOKEN_KIND(Token_CloseBracket, "]"), \
	TOKEN_KIND(Token_OpenBrace, "{"), \
	TOKEN_KIND(Token_CloseBrace, "}"), \
	TOKEN_KIND(Token_Colon, ":"), \
	TOKEN_KIND(Token_Semicolon, ";"), \
	TOKEN_KIND(Token_Period, "."), \
	TOKEN_KIND(Token_Comma, ","), \
	TOKEN_KIND(Token_Ellipsis, "..."), \
	TOKEN_KIND(Token_HalfClosed, ".."), \
	TOKEN_KIND(Token_BackSlash, "\\"), \
	TOKEN_KIND(Token__OperatorEnd, ""), \
\
	TOKEN_KIND(Token__KeywordBegin, ""), \
	TOKEN_KIND(Token_import, "import"), \
	TOKEN_KIND(Token_export, "export"), \
	TOKEN_KIND(Token_foreign, "foreign"), \
	TOKEN_KIND(Token_package, "package"), \
	TOKEN_KIND(Token_type, "type"), \
	TOKEN_KIND(Token_when, "when"), \
	TOKEN_KIND(Token_if, "if"), \
	TOKEN_KIND(Token_else, "else"), \
	TOKEN_KIND(Token_for, "for"), \
	TOKEN_KIND(Token_switch, "switch"), \
	TOKEN_KIND(Token_in, "in"), \
	TOKEN_KIND(Token_do, "do"), \
	TOKEN_KIND(Token_case, "case"), \
	TOKEN_KIND(Token_break, "break"), \
	TOKEN_KIND(Token_continue, "continue"), \
	TOKEN_KIND(Token_fallthrough, "fallthrough"), \
	TOKEN_KIND(Token_defer, "defer"), \
	TOKEN_KIND(Token_return, "return"), \
	TOKEN_KIND(Token_proc, "proc"), \
	TOKEN_KIND(Token_macro, "macro"), \
	TOKEN_KIND(Token_struct, "struct"), \
	TOKEN_KIND(Token_union, "union"), \
	TOKEN_KIND(Token_enum, "enum"), \
	TOKEN_KIND(Token_bit_field, "bit_field"), \
	TOKEN_KIND(Token_map, "map"), \
	TOKEN_KIND(Token_static, "static"), \
	TOKEN_KIND(Token_dynamic, "dynamic"), \
	TOKEN_KIND(Token_auto_cast, "auto_cast"), \
	TOKEN_KIND(Token_cast, "cast"), \
	TOKEN_KIND(Token_transmute, "transmute"), \
	TOKEN_KIND(Token_distinct, "distinct"), \
	TOKEN_KIND(Token_using, "using"), \
	TOKEN_KIND(Token_inline, "inline"), \
	TOKEN_KIND(Token_no_inline, "no_inline"), \
	TOKEN_KIND(Token_context, "context"), \
	TOKEN_KIND(Token_size_of, "size_of"), \
	TOKEN_KIND(Token_align_of, "align_of"), \
	TOKEN_KIND(Token_offset_of, "offset_of"), \
	TOKEN_KIND(Token_type_of, "type_of"), \
	TOKEN_KIND(Token_const, "const"), \
	TOKEN_KIND(Token_asm, "asm"), \
	TOKEN_KIND(Token_yield, "yield"), \
	TOKEN_KIND(Token_await, "await"), \
	TOKEN_KIND(Token__KeywordEnd, ""), \
	TOKEN_KIND(Token_Count, "")

enum TokenKind {
#define TOKEN_KIND(e, s) e
	TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
	TOKEN_KINDS
#undef TOKEN_KIND
};
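
// NOTE: TOKEN_KINDS is an X-macro list: each use defines TOKEN_KIND to extract a
// different column, expands TOKEN_KINDS, then undefines it. The enum above keeps
// the first column and token_strings keeps the second, so the two stay in sync by
// construction and token_strings[kind] is always the printable name for a kind.
// Roughly, the enum expands to
//     enum TokenKind { Token_Invalid, Token_EOF, Token_Comment, /* ... */ Token_Count };
// and the string table expands to the matching {text, length} pairs.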
struct TokenPos {
	String file;
	isize line;
	isize column;
};

TokenPos token_pos(String file, isize line, isize column) {
	TokenPos pos = {file, line, column};
	return pos;
}

i32 token_pos_cmp(TokenPos const &a, TokenPos const &b) {
	if (a.line != b.line) {
		return (a.line < b.line) ? -1 : +1;
	}
	if (a.column != b.column) {
		return (a.column < b.column) ? -1 : +1;
	}
	return string_compare(a.file, b.file);
}

bool operator==(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) == 0; }
bool operator!=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) != 0; }
bool operator< (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <  0; }
bool operator<=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <= 0; }
bool operator> (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >  0; }
bool operator>=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >= 0; }
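
// NOTE: Positions compare by line, then column, then file name, so tokens sort in
// source order within a file. The error reporting below relies on the == overload
// to suppress a diagnostic emitted at exactly the same position as the previous one.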
struct Token {
	TokenKind kind;
	String string;
	TokenPos pos;
};

Token empty_token = {Token_Invalid};
Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}};

Token make_token_ident(String s) {
	Token t = {Token_Ident, s};
	return t;
}

struct ErrorCollector {
	TokenPos prev;
	i64 count;
	i64 warning_count;
	gbMutex mutex;
};

gb_global ErrorCollector global_error_collector;

void init_global_error_collector(void) {
	gb_mutex_init(&global_error_collector.mutex);
}

void warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate warning, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Warning: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void error_no_newline_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (token.pos.line == 0) {
		gb_printf_err("Error: %s", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) %s",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_error_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Error: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	} else if (token.pos.line == 0) {
		gb_printf_err("Syntax Error: %s\n", gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_warning_va(Token token, char *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	// NOTE(bill): Duplicate warning, skip it
	if (global_error_collector.prev != token.pos) {
		global_error_collector.prev = token.pos;
		gb_printf_err("%.*s(%td:%td) Syntax Warning: %s\n",
		              LIT(token.pos.file), token.pos.line, token.pos.column,
		              gb_bprintf_va(fmt, va));
	} else if (token.pos.line == 0) {
		gb_printf_err("Warning: %s\n", gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}

void warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	warning_va(token, fmt, va);
	va_end(va);
}

void error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_va(token, fmt, va);
	va_end(va);
}

void syntax_error(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_error_va(token, fmt, va);
	va_end(va);
}

void syntax_warning(Token token, char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_warning_va(token, fmt, va);
	va_end(va);
}

void compiler_error(char *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	gb_printf_err("Internal Compiler Error: %s\n",
	              gb_bprintf_va(fmt, va));
	va_end(va);
	gb_exit(1);
}
gb_inline bool token_is_literal(TokenKind t) {
	return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline bool token_is_operator(TokenKind t) {
	return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline bool token_is_keyword(TokenKind t) {
	return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline bool token_is_comparison(TokenKind t) {
	return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline bool token_is_shift(TokenKind t) {
	return t == Token_Shl || t == Token_Shr;
}
gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }

enum TokenizerInitError {
	TokenizerInit_None,
	TokenizerInit_Invalid,
	TokenizerInit_NotExists,
	TokenizerInit_Permission,
	TokenizerInit_Empty,
	TokenizerInit_Count,
};

struct TokenizerState {
	Rune curr_rune;  // current character
	u8 * curr;       // character pos
	u8 * read_curr;  // pos from start
	u8 * line;       // current line pos
	isize line_count;
};

struct Tokenizer {
	String fullpath;
	u8 *start;
	u8 *end;

	Rune curr_rune;  // current character
	u8 * curr;       // character pos
	u8 * read_curr;  // pos from start
	u8 * line;       // current line pos
	isize line_count;

	isize error_count;
	Array<String> allocated_strings;
};
TokenizerState save_tokenizer_state(Tokenizer *t) {
	TokenizerState state = {};
	state.curr_rune  = t->curr_rune;
	state.curr       = t->curr;
	state.read_curr  = t->read_curr;
	state.line       = t->line;
	state.line_count = t->line_count;
	return state;
}

void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) {
	t->curr_rune  = state->curr_rune;
	t->curr       = state->curr;
	t->read_curr  = state->read_curr;
	t->line       = state->line;
	t->line_count = state->line_count;
}
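
// NOTE: TokenizerState is a plain snapshot of the cursor fields, so speculative
// scanning is just save/advance/restore. scan_number_to_token uses it below to
// peek past a '.' and back out again when the '.' turns out to begin a ".."
// (Token_HalfClosed) rather than a fractional part.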
void tokenizer_err(Tokenizer *t, char *msg, ...) {
	va_list va;
	isize column = t->read_curr - t->line + 1;
	if (column < 1) {
		column = 1;
	}

	Token token = {};
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = column;

	va_start(va, msg);
	syntax_error_va(token, msg, va);
	va_end(va);

	t->error_count++;
}
void advance_to_next_rune(Tokenizer *t) {
	if (t->read_curr < t->end) {
		Rune rune;
		isize width = 1;

		t->curr = t->read_curr;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		rune = *t->read_curr;
		if (rune == 0) {
			tokenizer_err(t, "Illegal character NUL");
		} else if (rune >= 0x80) { // not ASCII
			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			if (rune == GB_RUNE_INVALID && width == 1) {
				tokenizer_err(t, "Illegal UTF-8 encoding");
			} else if (rune == GB_RUNE_BOM && t->curr-t->start > 0) {
				tokenizer_err(t, "Illegal byte order mark");
			}
		}
		t->read_curr += width;
		t->curr_rune = rune;
	} else {
		t->curr = t->end;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		t->curr_rune = GB_RUNE_EOF;
	}
}
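
// NOTE: The tokenizer keeps two cursors: `curr` points at the first byte of the
// rune currently held in `curr_rune`, while `read_curr` points at the byte after
// it (further ahead for multi-byte UTF-8 sequences). Token text is sliced out of
// the buffer using `curr`, and columns are computed as offsets from `line`, which
// is reset each time a '\n' is consumed.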
TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
	TokenizerInitError err = TokenizerInit_None;

	char *c_str = alloc_cstring(heap_allocator(), fullpath);
	defer (gb_free(heap_allocator(), c_str));

	// TODO(bill): Memory map rather than copy contents
	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);

	gb_zero_item(t);
	t->fullpath = fullpath;
	t->line_count = 1;

	if (fc.data != nullptr) {
		t->start = cast(u8 *)fc.data;
		t->line = t->read_curr = t->curr = t->start;
		t->end = t->start + fc.size;

		advance_to_next_rune(t);
		if (t->curr_rune == GB_RUNE_BOM) {
			advance_to_next_rune(t); // Ignore BOM at file beginning
		}

		array_init(&t->allocated_strings, heap_allocator());
	} else {
		gbFile f = {};
		gbFileError file_err = gb_file_open(&f, c_str);
		defer (gb_file_close(&f));

		switch (file_err) {
		case gbFileError_Invalid:    err = TokenizerInit_Invalid;    break;
		case gbFileError_NotExists:  err = TokenizerInit_NotExists;  break;
		case gbFileError_Permission: err = TokenizerInit_Permission; break;
		}
		if (err == TokenizerInit_None && gb_file_size(&f) == 0) {
			err = TokenizerInit_Empty;
		}
	}

	return err;
}

gb_inline void destroy_tokenizer(Tokenizer *t) {
	if (t->start != nullptr) {
		gb_free(heap_allocator(), t->start);
	}
	for_array(i, t->allocated_strings) {
		gb_free(heap_allocator(), t->allocated_strings[i].text);
	}
	array_free(&t->allocated_strings);
}
void tokenizer_skip_whitespace(Tokenizer *t) {
	while (t->curr_rune == ' ' ||
	       t->curr_rune == '\t' ||
	       t->curr_rune == '\n' ||
	       t->curr_rune == '\r') {
		advance_to_next_rune(t);
	}
}

gb_inline i32 digit_value(Rune r) {
	if (gb_char_is_digit(cast(char)r)) {
		return r - '0';
	} else if (gb_is_between(cast(char)r, 'a', 'f')) {
		return r - 'a' + 10;
	} else if (gb_is_between(cast(char)r, 'A', 'F')) {
		return r - 'A' + 10;
	}
	return 16; // NOTE(bill): Larger than highest possible
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
	while (digit_value(t->curr_rune) < base || t->curr_rune == '_') {
		advance_to_next_rune(t);
	}
}
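
// NOTE: digit_value returns 16 for anything that is not a hexadecimal digit, so
// `digit_value(r) < base` fails for non-digits in every base used here and also
// rejects digits that are out of range for the requested base. scan_mantissa
// therefore stops at the first rune that is neither a valid digit nor a '_'
// separator; for example, with base 2 it consumes "1010_1111" in full but stops
// at the '2' in "102".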
Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) {
	Token token = {};
	token.kind = Token_Integer;
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = t->curr-t->line+1;

	if (seen_decimal_point) {
		token.string.text -= 1;
		token.string.len  += 1;
		token.pos.column -= 1;
		token.kind = Token_Float;
		scan_mantissa(t, 10);
		goto exponent;
	}

	if (t->curr_rune == '0') {
		u8 *prev = t->curr;
		advance_to_next_rune(t);
		if (t->curr_rune == 'b') { // Binary
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'o') { // Octal
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'd') { // Decimal
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'z') { // Dozenal
			advance_to_next_rune(t);
			scan_mantissa(t, 12);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'x') { // Hexadecimal
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			}
		} else if (t->curr_rune == 'h') { // Hexadecimal Float
			token.kind = Token_Float;
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2) {
				token.kind = Token_Invalid;
			} else {
				u8 *start = prev+2;
				isize n = t->curr - start;
				isize digit_count = 0;
				for (isize i = 0; i < n; i++) {
					if (start[i] != '_') {
						digit_count += 1;
					}
				}
				switch (digit_count) {
				case 8:
				case 16:
					break;
				default:
					tokenizer_err(t, "Invalid hexadecimal float, expected 8 or 16 digits, got %td", digit_count);
					break;
				}
			}
		} else {
			seen_decimal_point = false;
			scan_mantissa(t, 10);

			if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
				seen_decimal_point = true;
				goto fraction;
			}
		}

		goto end;
	}

	scan_mantissa(t, 10);

fraction:
	if (t->curr_rune == '.') {
		// HACK(bill): This may be inefficient
		TokenizerState state = save_tokenizer_state(t);
		advance_to_next_rune(t);
		if (t->curr_rune == '.') {
			// TODO(bill): Clean up this shit
			restore_tokenizer_state(t, &state);
			goto end;
		}
		token.kind = Token_Float;
		scan_mantissa(t, 10);
	}

exponent:
	if (t->curr_rune == 'e' || t->curr_rune == 'E') {
		token.kind = Token_Float;
		advance_to_next_rune(t);
		if (t->curr_rune == '-' || t->curr_rune == '+') {
			advance_to_next_rune(t);
		}
		scan_mantissa(t, 10);
	}

	if (t->curr_rune == 'i') {
		token.kind = Token_Imag;
		advance_to_next_rune(t);
	}

end:
	token.string.len = t->curr - token.string.text;
	return token;
}
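
// NOTE: A leading '0' selects the radix from the rune that follows it: 0b binary,
// 0o octal, 0d explicit decimal, 0z dozenal (base 12), 0x hexadecimal, and 0h a
// hexadecimal bit-pattern float that must carry exactly 8 or 16 digits (presumably
// the raw bits of a 32- or 64-bit float). So "0b1010" scans as an integer,
// "1.5e3" and "0h3f800000" as floats, and "2i" as an imaginary literal; a bare
// prefix such as "0x" with no digits comes back as Token_Invalid.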
// Quote == " for string
bool scan_escape(Tokenizer *t, Rune quote) {
	isize len = 0;
	u32 base = 0, max = 0, x = 0;

	Rune r = t->curr_rune;
	if (r == 'a'  ||
	    r == 'b'  ||
	    r == 'f'  ||
	    r == 'n'  ||
	    r == 'r'  ||
	    r == 't'  ||
	    r == 'v'  ||
	    r == '\\' ||
	    r == quote) {
		advance_to_next_rune(t);
		return true;
	} else if (gb_is_between(r, '0', '7')) {
		len = 3; base = 8; max = 255;
	} else if (r == 'x') {
		advance_to_next_rune(t);
		len = 2; base = 16; max = 255;
	} else if (r == 'u') {
		advance_to_next_rune(t);
		len = 4; base = 16; max = GB_RUNE_MAX;
	} else if (r == 'U') {
		advance_to_next_rune(t);
		len = 8; base = 16; max = GB_RUNE_MAX;
	} else {
		if (t->curr_rune < 0) {
			tokenizer_err(t, "Escape sequence was not terminated");
		} else {
			tokenizer_err(t, "Unknown escape sequence");
		}
		return false;
	}

	while (len-- > 0) {
		u32 d = cast(u32)digit_value(t->curr_rune);
		if (d >= base) {
			if (t->curr_rune < 0) {
				tokenizer_err(t, "Escape sequence was not terminated");
			} else {
				tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
			}
			return false;
		}
		x = x*base + d;
		advance_to_next_rune(t);
	}

	return true;
}
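
// NOTE: For numeric escapes the loop above only validates digit count and base;
// e.g. "\u263A" consumes exactly four hex digits and accumulates x = 0x263A. The
// decoded value is not written back here; the surrounding literal appears to be
// unquoted later by unquote_string, so scan_escape only has to reject malformed
// sequences. The `max` bound is recorded but, as written, never checked in this
// function.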
gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	}
	return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	} else if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	} else if (t->curr_rune == ch_d) {
		advance_to_next_rune(t);
		return d;
	}
	return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return sing_eq;
	} else if (t->curr_rune == sing_rune) {
		advance_to_next_rune(t);
		if (t->curr_rune == '=') {
			advance_to_next_rune(t);
			return dub_eq;
		}
		return dub;
	}
	return sing;
}
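
// NOTE: These helpers are called after the first rune of an operator has already
// been consumed; they decide the final kind from what follows. For example,
// token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq) resolves
// "<" / "<=" / "<<" / "<<=" with at most two extra rune reads, while
// token_kind_variant2 handles the simple "op" vs "op=" pairs such as '*' and "*=".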
void tokenizer__fle_update(Tokenizer *t) {
	t->curr_rune = '/';
	t->curr = t->curr-1;
	t->read_curr = t->curr+1;
	advance_to_next_rune(t);
}

// NOTE(bill): needed if comment is straight after a "semicolon"
bool tokenizer_find_line_end(Tokenizer *t) {
	while (t->curr_rune == '/' || t->curr_rune == '*') {
		if (t->curr_rune == '/') {
			tokenizer__fle_update(t);
			return true;
		}
		advance_to_next_rune(t);
		while (t->curr_rune >= 0) {
			Rune r = t->curr_rune;
			if (r == '\n') {
				tokenizer__fle_update(t);
				return true;
			}
			advance_to_next_rune(t);
			if (r == '*' && t->curr_rune == '/') {
				advance_to_next_rune(t);
				break;
			}
		}

		tokenizer_skip_whitespace(t);
		if (t->curr_rune < 0 || t->curr_rune == '\n') {
			tokenizer__fle_update(t);
			return true;
		}
		if (t->curr_rune != '/') {
			tokenizer__fle_update(t);
			return false;
		}
		advance_to_next_rune(t);
	}

	tokenizer__fle_update(t);
	return false;
}
Token tokenizer_get_token(Tokenizer *t) {
	tokenizer_skip_whitespace(t);

	Token token = {};
	token.string = make_string(t->curr, 1);
	token.pos.file = t->fullpath;
	token.pos.line = t->line_count;
	token.pos.column = t->curr - t->line + 1;

	Rune curr_rune = t->curr_rune;
	if (rune_is_letter(curr_rune)) {
		token.kind = Token_Ident;
		while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) {
			advance_to_next_rune(t);
		}

		token.string.len = t->curr - token.string.text;

		// NOTE(bill): All keywords are > 1
		if (token.string.len > 1) {
			for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
				if (token.string == token_strings[k]) {
					token.kind = cast(TokenKind)k;
					break;
				}
			}
		}
	} else if (gb_is_between(curr_rune, '0', '9')) {
		token = scan_number_to_token(t, false);
	} else {
		advance_to_next_rune(t);
		switch (curr_rune) {
		case GB_RUNE_EOF:
			token.kind = Token_EOF;
			break;

		case '\'': // Rune Literal
		{
			token.kind = Token_Rune;
			Rune quote = curr_rune;
			bool valid = true;
			i32 n = 0, success;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					tokenizer_err(t, "Rune literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == quote) {
					break;
				}
				n++;
				if (r == '\\') {
					if (!scan_escape(t, quote)) {
						valid = false;
					}
				}
			}

			// TODO(bill): Better Error Handling
			if (valid && n != 1) {
				tokenizer_err(t, "Invalid rune literal");
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid rune literal");
			}
		} break;

		case '`': // Raw String Literal
		case '"': // String Literal
		{
			i32 success;
			Rune quote = curr_rune;
			token.kind = Token_String;
			if (curr_rune == '"') {
				for (;;) {
					Rune r = t->curr_rune;
					if (r == '\n' || r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
					if (r == '\\') {
						scan_escape(t, quote);
					}
				}
			} else {
				for (;;) {
					Rune r = t->curr_rune;
					if (r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
				}
			}
			token.string.len = t->curr - token.string.text;
			success = unquote_string(heap_allocator(), &token.string);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token.string);
				}
				return token;
			} else {
				tokenizer_err(t, "Invalid string literal");
			}
		} break;

		case '.':
			if (t->curr_rune == '.') { // Could be an ellipsis
				advance_to_next_rune(t);
				token.kind = Token_HalfClosed;
				if (t->curr_rune == '.') {
					advance_to_next_rune(t);
					token.kind = Token_Ellipsis;
				}
			} else if ('0' <= t->curr_rune && t->curr_rune <= '9') {
				token = scan_number_to_token(t, true);
			} else {
				token.kind = Token_Period;
			}
			break;

		case '#':  token.kind = Token_Hash;         break;
		case '@':  token.kind = Token_At;           break;
		case '$':  token.kind = Token_Dollar;       break;
		case '?':  token.kind = Token_Question;     break;
		case '^':  token.kind = Token_Pointer;      break;
		case ';':  token.kind = Token_Semicolon;    break;
		case ',':  token.kind = Token_Comma;        break;
		case ':':  token.kind = Token_Colon;        break;
		case '(':  token.kind = Token_OpenParen;    break;
		case ')':  token.kind = Token_CloseParen;   break;
		case '[':  token.kind = Token_OpenBracket;  break;
		case ']':  token.kind = Token_CloseBracket; break;
		case '{':  token.kind = Token_OpenBrace;    break;
		case '}':  token.kind = Token_CloseBrace;   break;
		case '\\': token.kind = Token_BackSlash;    break;

		case 0x2260: token.kind = Token_NotEq; break; // '≠'
		case 0x2264: token.kind = Token_LtEq;  break; // '≤'
		case 0x2265: token.kind = Token_GtEq;  break; // '≥'

		case '%': token.kind = token_kind_dub_eq(t, '%', Token_Mod, Token_ModEq, Token_ModMod, Token_ModModEq); break;
		case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;
		case '=':
			token.kind = Token_Eq;
			if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_DoubleArrowRight;
			} else if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_CmpEq;
			}
			break;
		case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
		case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
		// case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Inc); break;
		case '+': token.kind = token_kind_variant2(t, Token_Add, Token_AddEq); break;
		case '-':
			token.kind = Token_Sub;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token.kind = Token_SubEq;
			} else if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_Invalid;
				if (t->curr_rune == '-') {
					advance_to_next_rune(t);
					token.kind = Token_Undef;
				}
			} else if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowRight;
			}
			break;

		case '/': {
			if (t->curr_rune == '/') { // Line comment
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
					advance_to_next_rune(t);
				}
				token.kind = Token_Comment;
			} else if (t->curr_rune == '*') { // Block comment, nesting allowed
				isize comment_scope = 1;
				advance_to_next_rune(t);
				while (comment_scope > 0) {
					if (t->curr_rune == GB_RUNE_EOF) {
						break;
					} else if (t->curr_rune == '/') {
						advance_to_next_rune(t);
						if (t->curr_rune == '*') {
							advance_to_next_rune(t);
							comment_scope++;
						}
					} else if (t->curr_rune == '*') {
						advance_to_next_rune(t);
						if (t->curr_rune == '/') {
							advance_to_next_rune(t);
							comment_scope--;
						}
					} else {
						advance_to_next_rune(t);
					}
				}
				token.kind = Token_Comment;
			} else {
				token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq);
			}
		} break;

		case '<':
			if (t->curr_rune == '-') {
				advance_to_next_rune(t);
				token.kind = Token_ArrowLeft;
			} else {
				token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
			}
			break;
		case '>': token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq); break;

		case '&':
			token.kind = Token_And;
			if (t->curr_rune == '~') {
				token.kind = Token_AndNot;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token.kind = Token_AndNotEq;
					advance_to_next_rune(t);
				}
			} else {
				token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
			}
			break;
		case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

		default:
			if (curr_rune != GB_RUNE_BOM) {
				u8 str[4] = {};
				int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
				tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
			}
			token.kind = Token_Invalid;
			break;
		}
	}

	token.string.len = t->curr - token.string.text;

	return token;
}
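
// Example (illustrative only, not part of the compiler): a minimal driver showing
// how the pieces above fit together. The function name below is hypothetical; it
// initializes a Tokenizer for a file, pulls tokens until EOF, prints each one, and
// tears the tokenizer down again. Comment tokens are returned like any other kind,
// so a caller that wants to ignore them must skip Token_Comment itself.
#if 0
void dump_tokens_for_file(String fullpath) {
	Tokenizer tokenizer = {};
	TokenizerInitError err = init_tokenizer(&tokenizer, fullpath);
	if (err != TokenizerInit_None) {
		gb_printf_err("Failed to tokenize %.*s\n", LIT(fullpath));
		return;
	}
	for (;;) {
		Token token = tokenizer_get_token(&tokenizer);
		if (token.kind == Token_EOF) {
			break;
		}
		if (token.kind == Token_Comment) {
			continue; // skip comments, as a parser typically would
		}
		print_token(token);
	}
	destroy_tokenizer(&tokenizer);
}
#endif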