tokenizer.cpp

#define TOKEN_KINDS \
    TOKEN_KIND(Token_Invalid, "Invalid"), \
    TOKEN_KIND(Token_EOF, "EOF"), \
    TOKEN_KIND(Token_Comment, "Comment"), \
\
    TOKEN_KIND(Token__LiteralBegin, "_LiteralBegin"), \
    TOKEN_KIND(Token_Ident, "identifier"), \
    TOKEN_KIND(Token_Integer, "integer"), \
    TOKEN_KIND(Token_Float, "float"), \
    TOKEN_KIND(Token_Imag, "imaginary"), \
    TOKEN_KIND(Token_Rune, "rune"), \
    TOKEN_KIND(Token_String, "string"), \
    TOKEN_KIND(Token__LiteralEnd, "_LiteralEnd"), \
\
    TOKEN_KIND(Token__OperatorBegin, "_OperatorBegin"), \
    TOKEN_KIND(Token_Eq, "="), \
    TOKEN_KIND(Token_Not, "!"), \
    TOKEN_KIND(Token_Hash, "#"), \
    TOKEN_KIND(Token_At, "@"), \
    TOKEN_KIND(Token_Dollar, "$"), \
    TOKEN_KIND(Token_Pointer, "^"), \
    TOKEN_KIND(Token_Question, "?"), \
    TOKEN_KIND(Token_Add, "+"), \
    TOKEN_KIND(Token_Sub, "-"), \
    TOKEN_KIND(Token_Mul, "*"), \
    TOKEN_KIND(Token_Quo, "/"), \
    TOKEN_KIND(Token_Mod, "%"), \
    TOKEN_KIND(Token_ModMod, "%%"), \
    TOKEN_KIND(Token_And, "&"), \
    TOKEN_KIND(Token_Or, "|"), \
    TOKEN_KIND(Token_Xor, "~"), \
    TOKEN_KIND(Token_AndNot, "&~"), \
    TOKEN_KIND(Token_Shl, "<<"), \
    TOKEN_KIND(Token_Shr, ">>"), \
\
    TOKEN_KIND(Token_CmpAnd, "&&"), \
    TOKEN_KIND(Token_CmpOr, "||"), \
\
    TOKEN_KIND(Token__AssignOpBegin, "_AssignOpBegin"), \
    TOKEN_KIND(Token_AddEq, "+="), \
    TOKEN_KIND(Token_SubEq, "-="), \
    TOKEN_KIND(Token_MulEq, "*="), \
    TOKEN_KIND(Token_QuoEq, "/="), \
    TOKEN_KIND(Token_ModEq, "%="), \
    TOKEN_KIND(Token_ModModEq, "%%="), \
    TOKEN_KIND(Token_AndEq, "&="), \
    TOKEN_KIND(Token_OrEq, "|="), \
    TOKEN_KIND(Token_XorEq, "~="), \
    TOKEN_KIND(Token_AndNotEq, "&~="), \
    TOKEN_KIND(Token_ShlEq, "<<="), \
    TOKEN_KIND(Token_ShrEq, ">>="), \
    TOKEN_KIND(Token_CmpAndEq, "&&="), \
    TOKEN_KIND(Token_CmpOrEq, "||="), \
    TOKEN_KIND(Token__AssignOpEnd, "_AssignOpEnd"), \
    TOKEN_KIND(Token_ArrowRight, "->"), \
    TOKEN_KIND(Token_ArrowLeft, "<-"), \
    TOKEN_KIND(Token_DoubleArrowRight, "=>"), \
    TOKEN_KIND(Token_Inc, "++"), \
    TOKEN_KIND(Token_Dec, "--"), \
    TOKEN_KIND(Token_Undef, "---"), \
\
    TOKEN_KIND(Token__ComparisonBegin, "_ComparisonBegin"), \
    TOKEN_KIND(Token_CmpEq, "=="), \
    TOKEN_KIND(Token_NotEq, "!="), \
    TOKEN_KIND(Token_Lt, "<"), \
    TOKEN_KIND(Token_Gt, ">"), \
    TOKEN_KIND(Token_LtEq, "<="), \
    TOKEN_KIND(Token_GtEq, ">="), \
    TOKEN_KIND(Token__ComparisonEnd, "_ComparisonEnd"), \
\
    TOKEN_KIND(Token_OpenParen, "("), \
    TOKEN_KIND(Token_CloseParen, ")"), \
    TOKEN_KIND(Token_OpenBracket, "["), \
    TOKEN_KIND(Token_CloseBracket, "]"), \
    TOKEN_KIND(Token_OpenBrace, "{"), \
    TOKEN_KIND(Token_CloseBrace, "}"), \
    TOKEN_KIND(Token_Colon, ":"), \
    TOKEN_KIND(Token_Semicolon, ";"), \
    TOKEN_KIND(Token_Period, "."), \
    TOKEN_KIND(Token_Comma, ","), \
    TOKEN_KIND(Token_Ellipsis, "..."), \
    TOKEN_KIND(Token_HalfClosed, ".."), \
    TOKEN_KIND(Token_BackSlash, "\\"), \
    TOKEN_KIND(Token__OperatorEnd, "_OperatorEnd"), \
\
    TOKEN_KIND(Token__KeywordBegin, "_KeywordBegin"), \
    TOKEN_KIND(Token_import, "import"), \
    TOKEN_KIND(Token_import_load, "import_load"), \
    TOKEN_KIND(Token_foreign, "foreign"), \
    TOKEN_KIND(Token_foreign_library, "foreign_library"), \
    TOKEN_KIND(Token_foreign_system_library, "foreign_system_library"), \
    TOKEN_KIND(Token_type, "type"), \
    TOKEN_KIND(Token_when, "when"), \
    TOKEN_KIND(Token_if, "if"), \
    TOKEN_KIND(Token_else, "else"), \
    TOKEN_KIND(Token_for, "for"), \
    TOKEN_KIND(Token_match, "match"), \
    TOKEN_KIND(Token_in, "in"), \
    TOKEN_KIND(Token_do, "do"), \
    TOKEN_KIND(Token_case, "case"), \
    TOKEN_KIND(Token_break, "break"), \
    TOKEN_KIND(Token_continue, "continue"), \
    TOKEN_KIND(Token_fallthrough, "fallthrough"), \
    TOKEN_KIND(Token_defer, "defer"), \
    TOKEN_KIND(Token_return, "return"), \
    TOKEN_KIND(Token_proc, "proc"), \
    TOKEN_KIND(Token_macro, "macro"), \
    TOKEN_KIND(Token_struct, "struct"), \
    TOKEN_KIND(Token_union, "union"), \
    TOKEN_KIND(Token_raw_union, "raw_union"), \
    TOKEN_KIND(Token_enum, "enum"), \
    TOKEN_KIND(Token_bit_field, "bit_field"), \
    TOKEN_KIND(Token_vector, "vector"), \
    TOKEN_KIND(Token_map, "map"), \
    TOKEN_KIND(Token_static, "static"), \
    TOKEN_KIND(Token_dynamic, "dynamic"), \
    TOKEN_KIND(Token_cast, "cast"), \
    TOKEN_KIND(Token_using, "using"), \
    TOKEN_KIND(Token_context, "context"), \
    TOKEN_KIND(Token_push_context, "push_context"), \
    TOKEN_KIND(Token_push_allocator, "push_allocator"), \
    TOKEN_KIND(Token_size_of, "size_of"), \
    TOKEN_KIND(Token_align_of, "align_of"), \
    TOKEN_KIND(Token_offset_of, "offset_of"), \
    TOKEN_KIND(Token_type_of, "type_of"), \
    TOKEN_KIND(Token_asm, "asm"), \
    TOKEN_KIND(Token_yield, "yield"), \
    TOKEN_KIND(Token_await, "await"), \
    TOKEN_KIND(Token_atomic, "atomic"), \
    TOKEN_KIND(Token__KeywordEnd, "_KeywordEnd"), \
    TOKEN_KIND(Token_Count, "")
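
// NOTE: TOKEN_KINDS is an X-macro list. It is expanded twice below -- once to
// generate the TokenKind enum and once to generate the parallel token_strings
// table -- so the enum values and their printable names always stay in sync.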

enum TokenKind {
#define TOKEN_KIND(e, s) e
    TOKEN_KINDS
#undef TOKEN_KIND
};

String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
    TOKEN_KINDS
#undef TOKEN_KIND
};

struct TokenPos {
    String file;
    isize  line;
    isize  column;
};

i32 token_pos_cmp(TokenPos a, TokenPos b) {
    if (a.line == b.line) {
        if (a.column == b.column) {
            isize min_len = gb_min(a.file.len, b.file.len);
            return gb_memcompare(a.file.text, b.file.text, min_len);
        }
        return (a.column < b.column) ? -1 : +1;
    }
    return (a.line < b.line) ? -1 : +1;
}

bool token_pos_eq(TokenPos a, TokenPos b) {
    return token_pos_cmp(a, b) == 0;
}

struct Token {
    TokenKind kind;
    String    string;
    TokenPos  pos;
};

Token empty_token = {Token_Invalid};
Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}};

Token make_token_ident(String s) {
    Token t = {Token_Ident, s};
    return t;
}

struct ErrorCollector {
    TokenPos prev;
    i64      count;
    i64      warning_count;
    gbMutex  mutex;
};

gb_global ErrorCollector global_error_collector;

void init_global_error_collector(void) {
    gb_mutex_init(&global_error_collector.mutex);
}

void warning_va(Token token, char *fmt, va_list va) {
    gb_mutex_lock(&global_error_collector.mutex);
    global_error_collector.warning_count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_eq(global_error_collector.prev, token.pos)) {
        global_error_collector.prev = token.pos;
        gb_printf_err("%.*s(%td:%td) Warning: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
    }
    gb_mutex_unlock(&global_error_collector.mutex);
}

void error_va(Token token, char *fmt, va_list va) {
    gb_mutex_lock(&global_error_collector.mutex);
    global_error_collector.count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_eq(global_error_collector.prev, token.pos)) {
        global_error_collector.prev = token.pos;
        gb_printf_err("%.*s(%td:%td) %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
    } else if (token.pos.line == 0) {
        gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
    }
    gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_error_va(Token token, char *fmt, va_list va) {
    gb_mutex_lock(&global_error_collector.mutex);
    global_error_collector.count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_eq(global_error_collector.prev, token.pos)) {
        global_error_collector.prev = token.pos;
        gb_printf_err("%.*s(%td:%td) Syntax Error: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
    } else if (token.pos.line == 0) {
        gb_printf_err("Error: %s\n", gb_bprintf_va(fmt, va));
    }
    gb_mutex_unlock(&global_error_collector.mutex);
}

void syntax_warning_va(Token token, char *fmt, va_list va) {
    gb_mutex_lock(&global_error_collector.mutex);
    global_error_collector.warning_count++;
    // NOTE(bill): Duplicate error, skip it
    if (!token_pos_eq(global_error_collector.prev, token.pos)) {
        global_error_collector.prev = token.pos;
        gb_printf_err("%.*s(%td:%td) Syntax Warning: %s\n",
                      LIT(token.pos.file), token.pos.line, token.pos.column,
                      gb_bprintf_va(fmt, va));
    } else if (token.pos.line == 0) {
        gb_printf_err("Warning: %s\n", gb_bprintf_va(fmt, va));
    }
    gb_mutex_unlock(&global_error_collector.mutex);
}

void warning(Token token, char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    warning_va(token, fmt, va);
    va_end(va);
}

void error(Token token, char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    error_va(token, fmt, va);
    va_end(va);
}

void syntax_error(Token token, char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    syntax_error_va(token, fmt, va);
    va_end(va);
}

void syntax_warning(Token token, char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    syntax_warning_va(token, fmt, va);
    va_end(va);
}

void compiler_error(char *fmt, ...) {
    va_list va;
    va_start(va, fmt);
    gb_printf_err("Internal Compiler Error: %s\n",
                  gb_bprintf_va(fmt, va));
    va_end(va);
    gb_exit(1);
}

gb_inline bool token_is_literal(TokenKind t) {
    return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline bool token_is_operator(TokenKind t) {
    return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline bool token_is_keyword(TokenKind t) {
    return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline bool token_is_comparison(TokenKind t) {
    return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline bool token_is_shift(TokenKind t) {
    return t == Token_Shl || t == Token_Shr;
}

gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }

enum TokenizerInitError {
    TokenizerInit_None,
    TokenizerInit_Invalid,
    TokenizerInit_NotExists,
    TokenizerInit_Permission,
    TokenizerInit_Empty,
    TokenizerInit_Count,
};

struct TokenizerState {
    Rune  curr_rune;  // current character
    u8 *  curr;       // character pos
    u8 *  read_curr;  // pos from start
    u8 *  line;       // current line pos
    isize line_count;
};

struct Tokenizer {
    String fullpath;
    u8 *start;
    u8 *end;
    Rune  curr_rune;  // current character
    u8 *  curr;       // character pos
    u8 *  read_curr;  // pos from start
    u8 *  line;       // current line pos
    isize line_count;
    isize error_count;
    Array<String> allocated_strings;
};

TokenizerState save_tokenizer_state(Tokenizer *t) {
    TokenizerState state = {};
    state.curr_rune  = t->curr_rune;
    state.curr       = t->curr;
    state.read_curr  = t->read_curr;
    state.line       = t->line;
    state.line_count = t->line_count;
    return state;
}

void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) {
    t->curr_rune  = state->curr_rune;
    t->curr       = state->curr;
    t->read_curr  = state->read_curr;
    t->line       = state->line;
    t->line_count = state->line_count;
}

void tokenizer_err(Tokenizer *t, char *msg, ...) {
    va_list va;
    isize column = t->read_curr - t->line + 1;
    if (column < 1) {
        column = 1;
    }

    gb_printf_err("%.*s(%td:%td) Syntax error: ", LIT(t->fullpath), t->line_count, column);
    va_start(va, msg);
    gb_printf_err_va(msg, va);
    va_end(va);
    gb_printf_err("\n");

    t->error_count++;
}

void advance_to_next_rune(Tokenizer *t) {
    if (t->read_curr < t->end) {
        Rune rune;
        isize width = 1;

        t->curr = t->read_curr;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }
        rune = *t->read_curr;
        if (rune == 0) {
            tokenizer_err(t, "Illegal character NUL");
        } else if (rune >= 0x80) { // not ASCII
            width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
            if (rune == GB_RUNE_INVALID && width == 1)
                tokenizer_err(t, "Illegal UTF-8 encoding");
            else if (rune == GB_RUNE_BOM && t->curr-t->start > 0)
                tokenizer_err(t, "Illegal byte order mark");
        }
        t->read_curr += width;
        t->curr_rune = rune;
    } else {
        t->curr = t->end;
        if (t->curr_rune == '\n') {
            t->line = t->curr;
            t->line_count++;
        }
        t->curr_rune = GB_RUNE_EOF;
    }
}

TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath) {
    TokenizerInitError err = TokenizerInit_None;

    char *c_str = gb_alloc_array(heap_allocator(), char, fullpath.len+1);
    gb_memcopy(c_str, fullpath.text, fullpath.len);
    c_str[fullpath.len] = '\0';

    // TODO(bill): Memory map rather than copy contents
    gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);
    gb_zero_item(t);

    if (fc.data != nullptr) {
        t->start = cast(u8 *)fc.data;
        t->line = t->read_curr = t->curr = t->start;
        t->end = t->start + fc.size;
        t->fullpath = fullpath;
        t->line_count = 1;

        advance_to_next_rune(t);
        if (t->curr_rune == GB_RUNE_BOM) {
            advance_to_next_rune(t); // Ignore BOM at file beginning
        }

        array_init(&t->allocated_strings, heap_allocator());
    } else {
        gbFile f = {};
        gbFileError file_err = gb_file_open(&f, c_str);

        switch (file_err) {
        case gbFileError_Invalid:    err = TokenizerInit_Invalid;    break;
        case gbFileError_NotExists:  err = TokenizerInit_NotExists;  break;
        case gbFileError_Permission: err = TokenizerInit_Permission; break;
        }

        if (err == TokenizerInit_None && gb_file_size(&f) == 0) {
            err = TokenizerInit_Empty;
        }

        gb_file_close(&f);
    }

    gb_free(heap_allocator(), c_str);
    return err;
}

gb_inline void destroy_tokenizer(Tokenizer *t) {
    if (t->start != nullptr) {
        gb_free(heap_allocator(), t->start);
    }
    for_array(i, t->allocated_strings) {
        gb_free(heap_allocator(), t->allocated_strings[i].text);
    }
    array_free(&t->allocated_strings);
}

void tokenizer_skip_whitespace(Tokenizer *t) {
    while (t->curr_rune == ' ' ||
           t->curr_rune == '\t' ||
           t->curr_rune == '\n' ||
           t->curr_rune == '\r') {
        advance_to_next_rune(t);
    }
}

gb_inline i32 digit_value(Rune r) {
    if (gb_char_is_digit(cast(char)r)) {
        return r - '0';
    } else if (gb_is_between(cast(char)r, 'a', 'f')) {
        return r - 'a' + 10;
    } else if (gb_is_between(cast(char)r, 'A', 'F')) {
        return r - 'A' + 10;
    }
    return 16; // NOTE(bill): Larger than highest possible
}

gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
    while (digit_value(t->curr_rune) < base || t->curr_rune == '_') {
        advance_to_next_rune(t);
    }
}
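
// Because digit_value() returns 16 for anything that is not a hex digit, the
// `digit_value(t->curr_rune) < base` test stops scan_mantissa at the first rune
// outside the requested base; '_' is accepted anywhere as a digit separator
// (e.g. 1_000_000 scans as a single integer literal).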

Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) {
    Token token = {};
    token.kind = Token_Integer;
    token.string = make_string(t->curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr-t->line+1;

    if (seen_decimal_point) {
        token.kind = Token_Float;
        scan_mantissa(t, 10);
        goto exponent;
    }

    if (t->curr_rune == '0') {
        u8 *prev = t->curr;
        advance_to_next_rune(t);
        if (t->curr_rune == 'b') { // Binary
            advance_to_next_rune(t);
            scan_mantissa(t, 2);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'o') { // Octal
            advance_to_next_rune(t);
            scan_mantissa(t, 8);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'd') { // Decimal
            advance_to_next_rune(t);
            scan_mantissa(t, 10);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'z') { // Dozenal
            advance_to_next_rune(t);
            scan_mantissa(t, 12);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } else if (t->curr_rune == 'x') { // Hexadecimal
            advance_to_next_rune(t);
            scan_mantissa(t, 16);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } /* else if (t->curr_rune == 'h') { // Hexadecimal Float
            token.kind = Token_Float;
            advance_to_next_rune(t);
            scan_mantissa(t, 16);
            if (t->curr - prev <= 2) {
                token.kind = Token_Invalid;
            }
        } */ else {
            seen_decimal_point = false;
            scan_mantissa(t, 10);

            if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
                seen_decimal_point = true;
                goto fraction;
            }
        }

        goto end;
    }

    scan_mantissa(t, 10);

fraction:
    if (t->curr_rune == '.') {
        // HACK(bill): This may be inefficient
        TokenizerState state = save_tokenizer_state(t);
        advance_to_next_rune(t);
        if (t->curr_rune == '.') {
            // TODO(bill): Clean up this shit
            restore_tokenizer_state(t, &state);
            goto end;
        }
        token.kind = Token_Float;
        scan_mantissa(t, 10);
    }

exponent:
    if (t->curr_rune == 'e' || t->curr_rune == 'E') {
        token.kind = Token_Float;
        advance_to_next_rune(t);
        if (t->curr_rune == '-' || t->curr_rune == '+') {
            advance_to_next_rune(t);
        }
        scan_mantissa(t, 10);
    }

    if (t->curr_rune == 'i') {
        token.kind = Token_Imag;
        advance_to_next_rune(t);
    }

end:
    token.string.len = t->curr - token.string.text;
    return token;
}

// Quote == " for string
bool scan_escape(Tokenizer *t, Rune quote) {
    isize len = 0;
    u32 base = 0, max = 0, x = 0;

    Rune r = t->curr_rune;
    if (r == 'a' ||
        r == 'b' ||
        r == 'f' ||
        r == 'n' ||
        r == 'r' ||
        r == 't' ||
        r == 'v' ||
        r == '\\' ||
        r == quote) {
        advance_to_next_rune(t);
        return true;
    } else if (gb_is_between(r, '0', '7')) {
        len = 3; base = 8; max = 255;
    } else if (r == 'x') {
        advance_to_next_rune(t);
        len = 2; base = 16; max = 255;
    } else if (r == 'u') {
        advance_to_next_rune(t);
        len = 4; base = 16; max = GB_RUNE_MAX;
    } else if (r == 'U') {
        advance_to_next_rune(t);
        len = 8; base = 16; max = GB_RUNE_MAX;
    } else {
        if (t->curr_rune < 0) {
            tokenizer_err(t, "Escape sequence was not terminated");
        } else {
            tokenizer_err(t, "Unknown escape sequence");
        }
        return false;
    }

    while (len-- > 0) {
        u32 d = cast(u32)digit_value(t->curr_rune);
        if (d >= base) {
            if (t->curr_rune < 0) {
                tokenizer_err(t, "Escape sequence was not terminated");
            } else {
                tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
            }
            return false;
        }

        x = x*base + d;
        advance_to_next_rune(t);
    }

    return true;
}
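
// NOTE: in this version scan_escape only validates the escape sequence and
// advances past it; the value accumulated in `x` is discarded and `max` is
// assigned but never checked. The raw token text is later passed through
// unquote_string() in tokenizer_get_token().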

gb_inline TokenKind token_kind_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    return a;
}

gb_inline TokenKind token_kind_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    }
    if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    }
    return a;
}

gb_inline TokenKind token_kind_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return b;
    } else if (t->curr_rune == ch_c) {
        advance_to_next_rune(t);
        return c;
    } else if (t->curr_rune == ch_d) {
        advance_to_next_rune(t);
        return d;
    }
    return a;
}

gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind sing, TokenKind sing_eq, TokenKind dub, TokenKind dub_eq) {
    if (t->curr_rune == '=') {
        advance_to_next_rune(t);
        return sing_eq;
    } else if (t->curr_rune == sing_rune) {
        advance_to_next_rune(t);
        if (t->curr_rune == '=') {
            advance_to_next_rune(t);
            return dub_eq;
        }
        return dub;
    }
    return sing;
}
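
// The helpers above resolve maximal-munch operators once the first rune has
// already been consumed. For example, after reading '<', tokenizer_get_token
// calls token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq)
// to pick "<", "<=", "<<" or "<<=" depending on the rune(s) that follow.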

void tokenizer__fle_update(Tokenizer *t) {
    t->curr_rune = '/';
    t->curr = t->curr-1;
    t->read_curr = t->curr+1;
    advance_to_next_rune(t);
}

// NOTE(bill): needed if comment is straight after a "semicolon"
bool tokenizer_find_line_end(Tokenizer *t) {
    while (t->curr_rune == '/' || t->curr_rune == '*') {
        if (t->curr_rune == '/') {
            tokenizer__fle_update(t);
            return true;
        }

        advance_to_next_rune(t);
        while (t->curr_rune >= 0) {
            Rune r = t->curr_rune;
            if (r == '\n') {
                tokenizer__fle_update(t);
                return true;
            }
            advance_to_next_rune(t);
            if (r == '*' && t->curr_rune == '/') {
                advance_to_next_rune(t);
                break;
            }
        }

        tokenizer_skip_whitespace(t);
        if (t->curr_rune < 0 || t->curr_rune == '\n') {
            tokenizer__fle_update(t);
            return true;
        }
        if (t->curr_rune != '/') {
            tokenizer__fle_update(t);
            return false;
        }
        advance_to_next_rune(t);
    }

    tokenizer__fle_update(t);
    return false;
}

Token tokenizer_get_token(Tokenizer *t) {
    tokenizer_skip_whitespace(t);

    Token token = {};
    token.string = make_string(t->curr, 1);
    token.pos.file = t->fullpath;
    token.pos.line = t->line_count;
    token.pos.column = t->curr - t->line + 1;

    Rune curr_rune = t->curr_rune;
    if (rune_is_letter(curr_rune)) {
        token.kind = Token_Ident;
        while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) {
            advance_to_next_rune(t);
        }

        token.string.len = t->curr - token.string.text;

        // NOTE(bill): All keywords are > 1
        if (token.string.len > 1) {
            for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) {
                if (token.string == token_strings[k]) {
                    token.kind = cast(TokenKind)k;
                    break;
                }
            }
        }
    } else if (gb_is_between(curr_rune, '0', '9')) {
        token = scan_number_to_token(t, false);
    } else {
        advance_to_next_rune(t);
        switch (curr_rune) {
        case GB_RUNE_EOF:
            token.kind = Token_EOF;
            break;

        case '\'': // Rune Literal
        {
            token.kind = Token_Rune;
            Rune quote = curr_rune;
            bool valid = true;
            i32 n = 0, success;
            for (;;) {
                Rune r = t->curr_rune;
                if (r == '\n' || r < 0) {
                    tokenizer_err(t, "Rune literal not terminated");
                    break;
                }
                advance_to_next_rune(t);
                if (r == quote) {
                    break;
                }
                n++;
                if (r == '\\') {
                    if (!scan_escape(t, quote)) {
                        valid = false;
                    }
                }
            }

            // TODO(bill): Better Error Handling
            if (valid && n != 1) {
                tokenizer_err(t, "Invalid rune literal");
            }

            token.string.len = t->curr - token.string.text;
            success = unquote_string(heap_allocator(), &token.string);
            if (success > 0) {
                if (success == 2) {
                    array_add(&t->allocated_strings, token.string);
                }
                return token;
            } else {
                tokenizer_err(t, "Invalid rune literal");
            }
        } break;

        case '`': // Raw String Literal
        case '"': // String Literal
        {
            i32 success;
            Rune quote = curr_rune;
            token.kind = Token_String;
            if (curr_rune == '"') {
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r == '\n' || r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote) {
                        break;
                    }
                    if (r == '\\') {
                        scan_escape(t, quote);
                    }
                }
            } else {
                for (;;) {
                    Rune r = t->curr_rune;
                    if (r < 0) {
                        tokenizer_err(t, "String literal not terminated");
                        break;
                    }
                    advance_to_next_rune(t);
                    if (r == quote) {
                        break;
                    }
                }
            }

            token.string.len = t->curr - token.string.text;
            success = unquote_string(heap_allocator(), &token.string);
            if (success > 0) {
                if (success == 2) {
                    array_add(&t->allocated_strings, token.string);
                }
                return token;
            } else {
                tokenizer_err(t, "Invalid string literal");
            }
        } break;

        case '.':
            token.kind = Token_Period; // Default
            if (t->curr_rune == '.') { // Could be an ellipsis
                advance_to_next_rune(t);
                token.kind = Token_HalfClosed;
                if (t->curr_rune == '.') {
                    advance_to_next_rune(t);
                    token.kind = Token_Ellipsis;
                }
            }
            break;

        case '#': token.kind = Token_Hash;      break;
        case '@': token.kind = Token_At;        break;
        case '$': token.kind = Token_Dollar;    break;
        case '?': token.kind = Token_Question;  break;
        case '^': token.kind = Token_Pointer;   break;
        case ';': token.kind = Token_Semicolon; break;
        case ',': token.kind = Token_Comma;     break;
        case ':': token.kind = Token_Colon;     break;

        case '(':  token.kind = Token_OpenParen;    break;
        case ')':  token.kind = Token_CloseParen;   break;
        case '[':  token.kind = Token_OpenBracket;  break;
        case ']':  token.kind = Token_CloseBracket; break;
        case '{':  token.kind = Token_OpenBrace;    break;
        case '}':  token.kind = Token_CloseBrace;   break;
        case '\\': token.kind = Token_BackSlash;    break;

        case 0x2260: token.kind = Token_NotEq; break; // '≠'
        case 0x2264: token.kind = Token_LtEq;  break; // '≤'
        case 0x2265: token.kind = Token_GtEq;  break; // '≥'

        case '%': token.kind = token_kind_dub_eq(t, '%', Token_Mod, Token_ModEq, Token_ModMod, Token_ModModEq); break;
        case '*': token.kind = token_kind_variant2(t, Token_Mul, Token_MulEq); break;

        case '=':
            token.kind = Token_Eq;
            if (t->curr_rune == '>') {
                advance_to_next_rune(t);
                token.kind = Token_DoubleArrowRight;
            } else if (t->curr_rune == '=') {
                advance_to_next_rune(t);
                token.kind = Token_CmpEq;
            }
            break;

        case '~': token.kind = token_kind_variant2(t, Token_Xor, Token_XorEq); break;
        case '!': token.kind = token_kind_variant2(t, Token_Not, Token_NotEq); break;
        case '+': token.kind = token_kind_variant3(t, Token_Add, Token_AddEq, '+', Token_Inc); break;

        case '-':
            token.kind = Token_Sub;
            if (t->curr_rune == '=') {
                advance_to_next_rune(t);
                token.kind = Token_SubEq;
            } else if (t->curr_rune == '-') {
                advance_to_next_rune(t);
                token.kind = Token_Dec;
                if (t->curr_rune == '-') {
                    advance_to_next_rune(t);
                    token.kind = Token_Undef;
                }
            } else if (t->curr_rune == '>') {
                advance_to_next_rune(t);
                token.kind = Token_ArrowRight;
            }
            break;

        case '/': {
            if (t->curr_rune == '/') {
                while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
                    advance_to_next_rune(t);
                }
                token.kind = Token_Comment;
            } else if (t->curr_rune == '*') {
                isize comment_scope = 1;
                advance_to_next_rune(t);
                while (comment_scope > 0) {
                    if (t->curr_rune == GB_RUNE_EOF) {
                        break;
                    } else if (t->curr_rune == '/') {
                        advance_to_next_rune(t);
                        if (t->curr_rune == '*') {
                            advance_to_next_rune(t);
                            comment_scope++;
                        }
                    } else if (t->curr_rune == '*') {
                        advance_to_next_rune(t);
                        if (t->curr_rune == '/') {
                            advance_to_next_rune(t);
                            comment_scope--;
                        }
                    } else {
                        advance_to_next_rune(t);
                    }
                }
                token.kind = Token_Comment;
            } else {
                token.kind = token_kind_variant2(t, Token_Quo, Token_QuoEq);
            }
        } break;

        case '<':
            if (t->curr_rune == '-') {
                advance_to_next_rune(t);
                token.kind = Token_ArrowLeft;
            } else {
                token.kind = token_kind_dub_eq(t, '<', Token_Lt, Token_LtEq, Token_Shl, Token_ShlEq);
            }
            break;
        case '>': token.kind = token_kind_dub_eq(t, '>', Token_Gt, Token_GtEq, Token_Shr, Token_ShrEq); break;

        case '&':
            token.kind = Token_And;
            if (t->curr_rune == '~') {
                token.kind = Token_AndNot;
                advance_to_next_rune(t);
                if (t->curr_rune == '=') {
                    token.kind = Token_AndNotEq;
                    advance_to_next_rune(t);
                }
            } else {
                token.kind = token_kind_dub_eq(t, '&', Token_And, Token_AndEq, Token_CmpAnd, Token_CmpAndEq);
            }
            break;
        case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break;

        default:
            if (curr_rune != GB_RUNE_BOM) {
                u8 str[4] = {};
                int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
                tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
            }
            token.kind = Token_Invalid;
            break;
        }
    }

    token.string.len = t->curr - token.string.text;

    return token;
}
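
// Illustrative usage sketch (not part of the original file): one possible way
// to drive this tokenizer end to end, printing every token until EOF. The
// function name print_all_tokens is hypothetical; everything it calls is
// defined above in this file.
void print_all_tokens(String fullpath) {
    Tokenizer t = {};
    if (init_tokenizer(&t, fullpath) == TokenizerInit_None) {
        for (;;) {
            Token token = tokenizer_get_token(&t); // comments come back as Token_Comment
            if (token.kind == Token_EOF) {
                break;
            }
            print_token(token);
        }
        destroy_tokenizer(&t);
    }
}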