// tokenizer.cpp
// Master X-macro list of every token kind.
// Each entry is TOKEN_KIND(enum_name, display_string); the list is expanded
// with different definitions of TOKEN_KIND to generate both the TokenKind
// enum and the parallel token_strings[] table below.
// The Token__*Begin / Token__*End entries are sentinel markers (empty display
// string) used for the range checks in token_is_literal/operator/keyword/
// comparison; keep each group's entries between its Begin/End pair.
#define TOKEN_KINDS \
	TOKEN_KIND(Token_Invalid, "Invalid"), \
	TOKEN_KIND(Token_EOF, "EOF"), \
	TOKEN_KIND(Token_Comment, "Comment"), \
\
	TOKEN_KIND(Token__LiteralBegin, ""), \
	TOKEN_KIND(Token_Ident, "identifier"), \
	TOKEN_KIND(Token_Integer, "integer"), \
	TOKEN_KIND(Token_Float, "float"), \
	TOKEN_KIND(Token_Imag, "imaginary"), \
	TOKEN_KIND(Token_Rune, "rune"), \
	TOKEN_KIND(Token_String, "string"), \
	TOKEN_KIND(Token__LiteralEnd, ""), \
\
	TOKEN_KIND(Token__OperatorBegin, ""), \
	TOKEN_KIND(Token_Eq, "="), \
	TOKEN_KIND(Token_Not, "!"), \
	TOKEN_KIND(Token_Hash, "#"), \
	TOKEN_KIND(Token_At, "@"), \
	TOKEN_KIND(Token_Dollar, "$"), \
	TOKEN_KIND(Token_Pointer, "^"), \
	TOKEN_KIND(Token_Question, "?"), \
	TOKEN_KIND(Token_Add, "+"), \
	TOKEN_KIND(Token_Sub, "-"), \
	TOKEN_KIND(Token_Mul, "*"), \
	TOKEN_KIND(Token_Quo, "/"), \
	TOKEN_KIND(Token_Mod, "%"), \
	TOKEN_KIND(Token_ModMod, "%%"), \
	TOKEN_KIND(Token_And, "&"), \
	TOKEN_KIND(Token_Or, "|"), \
	TOKEN_KIND(Token_Xor, "~"), \
	TOKEN_KIND(Token_AndNot, "&~"), \
	TOKEN_KIND(Token_Shl, "<<"), \
	TOKEN_KIND(Token_Shr, ">>"), \
	TOKEN_KIND(Token_CmpAnd, "&&"), \
	TOKEN_KIND(Token_CmpOr, "||"), \
\
	TOKEN_KIND(Token__AssignOpBegin, ""), \
	TOKEN_KIND(Token_AddEq, "+="), \
	TOKEN_KIND(Token_SubEq, "-="), \
	TOKEN_KIND(Token_MulEq, "*="), \
	TOKEN_KIND(Token_QuoEq, "/="), \
	TOKEN_KIND(Token_ModEq, "%="), \
	TOKEN_KIND(Token_ModModEq, "%%="), \
	TOKEN_KIND(Token_AndEq, "&="), \
	TOKEN_KIND(Token_OrEq, "|="), \
	TOKEN_KIND(Token_XorEq, "~="), \
	TOKEN_KIND(Token_AndNotEq, "&~="), \
	TOKEN_KIND(Token_ShlEq, "<<="), \
	TOKEN_KIND(Token_ShrEq, ">>="), \
	TOKEN_KIND(Token_CmpAndEq, "&&="), \
	TOKEN_KIND(Token_CmpOrEq, "||="), \
	TOKEN_KIND(Token__AssignOpEnd, ""), \
	TOKEN_KIND(Token_Increment, "++"), \
	TOKEN_KIND(Token_Decrement, "--"), \
	TOKEN_KIND(Token_ArrowRight,"->"), \
	TOKEN_KIND(Token_Undef, "---"), \
\
	TOKEN_KIND(Token__ComparisonBegin, ""), \
	TOKEN_KIND(Token_CmpEq, "=="), \
	TOKEN_KIND(Token_NotEq, "!="), \
	TOKEN_KIND(Token_Lt, "<"), \
	TOKEN_KIND(Token_Gt, ">"), \
	TOKEN_KIND(Token_LtEq, "<="), \
	TOKEN_KIND(Token_GtEq, ">="), \
	TOKEN_KIND(Token__ComparisonEnd, ""), \
\
	TOKEN_KIND(Token_OpenParen, "("), \
	TOKEN_KIND(Token_CloseParen, ")"), \
	TOKEN_KIND(Token_OpenBracket, "["), \
	TOKEN_KIND(Token_CloseBracket, "]"), \
	TOKEN_KIND(Token_OpenBrace, "{"), \
	TOKEN_KIND(Token_CloseBrace, "}"), \
	TOKEN_KIND(Token_Colon, ":"), \
	TOKEN_KIND(Token_Semicolon, ";"), \
	TOKEN_KIND(Token_Period, "."), \
	TOKEN_KIND(Token_Comma, ","), \
	TOKEN_KIND(Token_Ellipsis, ".."), \
	TOKEN_KIND(Token_RangeFull, "..="), \
	TOKEN_KIND(Token_RangeHalf, "..<"), \
	TOKEN_KIND(Token_BackSlash, "\\"), \
	TOKEN_KIND(Token__OperatorEnd, ""), \
\
	TOKEN_KIND(Token__KeywordBegin, ""), \
	TOKEN_KIND(Token_import, "import"), \
	TOKEN_KIND(Token_foreign, "foreign"), \
	TOKEN_KIND(Token_package, "package"), \
	TOKEN_KIND(Token_typeid, "typeid"), \
	TOKEN_KIND(Token_when, "when"), \
	TOKEN_KIND(Token_where, "where"), \
	TOKEN_KIND(Token_if, "if"), \
	TOKEN_KIND(Token_else, "else"), \
	TOKEN_KIND(Token_for, "for"), \
	TOKEN_KIND(Token_switch, "switch"), \
	TOKEN_KIND(Token_in, "in"), \
	TOKEN_KIND(Token_not_in, "not_in"), \
	TOKEN_KIND(Token_do, "do"), \
	TOKEN_KIND(Token_case, "case"), \
	TOKEN_KIND(Token_break, "break"), \
	TOKEN_KIND(Token_continue, "continue"), \
	TOKEN_KIND(Token_fallthrough, "fallthrough"), \
	TOKEN_KIND(Token_defer, "defer"), \
	TOKEN_KIND(Token_return, "return"), \
	TOKEN_KIND(Token_proc, "proc"), \
	TOKEN_KIND(Token_struct, "struct"), \
	TOKEN_KIND(Token_union, "union"), \
	TOKEN_KIND(Token_enum, "enum"), \
	TOKEN_KIND(Token_bit_set, "bit_set"), \
	TOKEN_KIND(Token_map, "map"), \
	TOKEN_KIND(Token_dynamic, "dynamic"), \
	TOKEN_KIND(Token_auto_cast, "auto_cast"), \
	TOKEN_KIND(Token_cast, "cast"), \
	TOKEN_KIND(Token_transmute, "transmute"), \
	TOKEN_KIND(Token_distinct, "distinct"), \
	TOKEN_KIND(Token_using, "using"), \
	TOKEN_KIND(Token_inline, "inline"), \
	TOKEN_KIND(Token_no_inline, "no_inline"), \
	TOKEN_KIND(Token_context, "context"), \
	TOKEN_KIND(Token_asm, "asm"), \
	TOKEN_KIND(Token__KeywordEnd, ""), \
	TOKEN_KIND(Token_Count, "")
// One enumerator per TOKEN_KIND entry, in list order (Token_Invalid == 0).
enum TokenKind {
#define TOKEN_KIND(e, s) e
	TOKEN_KINDS
#undef TOKEN_KIND
};
// token_strings[kind] is the display text of each TokenKind, parallel to the
// enum above; gb_size_of(s)-1 drops the string literal's trailing NUL.
String const token_strings[] = {
#define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1}
	TOKEN_KINDS
#undef TOKEN_KIND
};
// One slot of the keyword lookup table below.
struct KeywordHashEntry {
	u32       hash; // full 32-bit hash of the keyword text
	TokenKind kind; // Token_Invalid marks an empty slot
	String    text;
};
enum {
	KEYWORD_HASH_TABLE_COUNT = 1<<9,
	KEYWORD_HASH_TABLE_MASK = KEYWORD_HASH_TABLE_COUNT-1,
};
// Direct-mapped (no probing) keyword table; a slot collision is a hard
// assert at startup in add_keyword_hash_entry, so the table size and hash
// function must keep all keywords collision-free.
gb_global KeywordHashEntry keyword_hash_table[KEYWORD_HASH_TABLE_COUNT] = {};
GB_STATIC_ASSERT(Token__KeywordEnd-Token__KeywordBegin <= gb_count_of(keyword_hash_table));
gb_global isize const min_keyword_size = 2;
gb_global isize max_keyword_size = 11; // grown as entries are registered
// keyword_indices[len] is true iff some keyword has byte length `len`;
// lets the scanner reject candidate identifiers by length alone.
gb_global bool keyword_indices[16] = {};
// Hash used for the keyword table (FNV-1a, 32-bit).
gb_inline u32 keyword_hash(u8 const *text, isize len) {
	return fnv32a(text, len);
	// return murmur3_32(text, len, 0x6f64696e);
}
  150. void add_keyword_hash_entry(String const &s, TokenKind kind) {
  151. max_keyword_size = gb_max(max_keyword_size, s.len);
  152. keyword_indices[s.len] = true;
  153. u32 hash = keyword_hash(s.text, s.len);
  154. // NOTE(bill): This is a bit of an empirical hack in order to speed things up
  155. u32 index = hash & KEYWORD_HASH_TABLE_MASK;
  156. KeywordHashEntry *entry = &keyword_hash_table[index];
  157. GB_ASSERT_MSG(entry->kind == Token_Invalid, "Keyword hash table initialtion collision: %.*s %.*s %08x %08x", LIT(s), LIT(token_strings[entry->kind]), hash, entry->hash);
  158. entry->hash = hash;
  159. entry->kind = kind;
  160. entry->text = s;
  161. }
// Populate the keyword hash table: every TokenKind between the keyword
// sentinels, plus legacy spellings that map onto current keywords.
void init_keyword_hash_table(void) {
	for (i32 kind = Token__KeywordBegin+1; kind < Token__KeywordEnd; kind++) {
		add_keyword_hash_entry(token_strings[kind], cast(TokenKind)kind);
	}
	// Old spellings still accepted and mapped to their modern token kind.
	static struct {
		String s;
		TokenKind kind;
	} const legacy_keywords[] = {
		{str_lit("notin"), Token_not_in},
	};
	for (i32 i = 0; i < gb_count_of(legacy_keywords); i++) {
		add_keyword_hash_entry(legacy_keywords[i].s, legacy_keywords[i].kind);
	}
	// keyword_indices[] has 16 slots, so no keyword may be 16+ bytes long.
	GB_ASSERT(max_keyword_size < 16);
}
// Global per-file tables, both indexed by file id; accessors below take
// global_error_collector.string_mutex.
gb_global Array<String> global_file_path_strings; // index is file id
gb_global Array<struct AstFile *> global_files; // index is file id
String get_file_path_string(i32 index);
struct AstFile *get_ast_file_from_id(i32 index);
// Source location of a token.
struct TokenPos {
	i32 file_id;
	i32 offset; // starting at 0
	i32 line; // starting at 1
	i32 column; // starting at 1
};
  187. // temporary
  188. char *token_pos_to_string(TokenPos const &pos) {
  189. gbString s = gb_string_make_reserve(temporary_allocator(), 128);
  190. String file = get_file_path_string(pos.file_id);
  191. s = gb_string_append_fmt(s, "%.*s(%d:%d)", LIT(file), pos.line, pos.column);
  192. return s;
  193. }
  194. i32 token_pos_cmp(TokenPos const &a, TokenPos const &b) {
  195. if (a.offset != b.offset) {
  196. return (a.offset < b.offset) ? -1 : +1;
  197. }
  198. if (a.line != b.line) {
  199. return (a.line < b.line) ? -1 : +1;
  200. }
  201. if (a.column != b.column) {
  202. return (a.column < b.column) ? -1 : +1;
  203. }
  204. return string_compare(get_file_path_string(a.file_id), get_file_path_string(b.file_id));
  205. }
// Relational operators for TokenPos, all defined via token_pos_cmp.
bool operator==(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) == 0; }
bool operator!=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) != 0; }
bool operator< (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) < 0; }
bool operator<=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <= 0; }
bool operator> (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) > 0; }
bool operator>=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >= 0; }
  212. TokenPos token_pos_add_column(TokenPos pos) {
  213. pos.column += 1;
  214. pos.offset += 1;
  215. return pos;
  216. }
// A single lexed token: its kind, the source bytes it spans, and where it is.
struct Token {
	TokenKind kind;
	String string;
	TokenPos pos;
};
Token empty_token = {Token_Invalid};
// Pre-made "_" identifier token (blank identifier).
Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}};
// Build an identifier token from an existing String (position left zeroed).
Token make_token_ident(String s) {
	Token t = {Token_Ident, s};
	return t;
}
// Convenience overload for C string literals.
Token make_token_ident(char const *s) {
	Token t = {Token_Ident, make_string_c(s)};
	return t;
}
// True for the implicit semicolon the tokenizer inserts at a newline
// (a Token_Semicolon whose text is "\n" rather than ";").
bool token_is_newline(Token const &tok) {
	return tok.kind == Token_Semicolon && tok.string == "\n";
}
// Process-wide error/warning accounting and buffering.
struct ErrorCollector {
	TokenPos prev;           // last reported position, used to suppress duplicates
	i64 count;               // total errors reported
	i64 warning_count;       // total warnings reported
	bool in_block;           // inside a begin_error_block/end_error_block pair
	gbMutex mutex;           // guards error reporting and `errors`
	gbMutex string_mutex;    // guards the global file tables
	Array<u8> error_buffer;  // text accumulated while in_block
	Array<String> errors;    // finished, NUL-terminated error messages
};
gb_global ErrorCollector global_error_collector;
// Hard cap: once exceeded, error paths call gb_exit(1).
#define MAX_ERROR_COLLECTOR_COUNT (36)
  247. bool any_errors(void) {
  248. return global_error_collector.error_buffer.count > 0;
  249. }
// One-time setup of the global error collector and the per-file-id tables.
void init_global_error_collector(void) {
	gb_mutex_init(&global_error_collector.mutex);
	gb_mutex_init(&global_error_collector.string_mutex);
	array_init(&global_error_collector.errors, heap_allocator());
	array_init(&global_error_collector.error_buffer, heap_allocator());
	array_init(&global_file_path_strings, heap_allocator(), 4096);
	array_init(&global_files, heap_allocator(), 4096);
}
  258. bool set_file_path_string(i32 index, String const &path) {
  259. bool ok = false;
  260. GB_ASSERT(index >= 0);
  261. gb_mutex_lock(&global_error_collector.string_mutex);
  262. if (index >= global_file_path_strings.count) {
  263. array_resize(&global_file_path_strings, index);
  264. }
  265. String prev = global_file_path_strings[index];
  266. if (prev.len == 0) {
  267. global_file_path_strings[index] = path;
  268. ok = true;
  269. }
  270. gb_mutex_unlock(&global_error_collector.string_mutex);
  271. return ok;
  272. }
  273. bool set_ast_file_from_id(i32 index, AstFile *file) {
  274. bool ok = false;
  275. GB_ASSERT(index >= 0);
  276. gb_mutex_lock(&global_error_collector.string_mutex);
  277. if (index >= global_files.count) {
  278. array_resize(&global_files, index);
  279. }
  280. AstFile *prev = global_files[index];
  281. if (prev == nullptr) {
  282. global_files[index] = file;
  283. ok = true;
  284. }
  285. gb_mutex_unlock(&global_error_collector.string_mutex);
  286. return ok;
  287. }
// Look up the path for file id `index`; returns an empty String for ids
// that have never been registered.
String get_file_path_string(i32 index) {
	GB_ASSERT(index >= 0);
	gb_mutex_lock(&global_error_collector.string_mutex);
	String path = {};
	if (index < global_file_path_strings.count) {
		path = global_file_path_strings[index];
	}
	gb_mutex_unlock(&global_error_collector.string_mutex);
	return path;
}
// Look up the AstFile for file id `index`; returns nullptr for ids that
// have never been registered.
AstFile *get_ast_file_from_id(i32 index) {
	GB_ASSERT(index >= 0);
	gb_mutex_lock(&global_error_collector.string_mutex);
	AstFile *file = nullptr;
	if (index < global_files.count) {
		file = global_files[index];
	}
	gb_mutex_unlock(&global_error_collector.string_mutex);
	return file;
}
// Start an error block: subsequent error_out text is accumulated into
// error_buffer instead of being stored as individual messages.
// NOTE: the collector mutex is taken here and held until the matching
// end_error_block() call releases it.
void begin_error_block(void) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.in_block = true;
}
// Finish an error block: flush the buffered text as one NUL-terminated
// String into `errors`, reset the buffer, and release the mutex taken by
// begin_error_block().
void end_error_block(void) {
	if (global_error_collector.error_buffer.count > 0) {
		isize n = global_error_collector.error_buffer.count;
		u8 *text = gb_alloc_array(heap_allocator(), u8, n+1);
		gb_memmove(text, global_error_collector.error_buffer.data, n);
		text[n] = 0; // NUL-terminate so the text is usable as a C string too
		String s = {text, n};
		array_add(&global_error_collector.errors, s);
		global_error_collector.error_buffer.count = 0;
		// gbFile *f = gb_file_get_standard(gbFileStandard_Error);
		// gb_file_write(f, text, n);
	}
	global_error_collector.in_block = false;
	gb_mutex_unlock(&global_error_collector.mutex);
}
// Signature macro for pluggable error-output procedures (see error_out_va).
#define ERROR_OUT_PROC(name) void name(char const *fmt, va_list va)
typedef ERROR_OUT_PROC(ErrorOutProc);
// Default error output: format the message, record it (either into the
// current error block's buffer or as a standalone entry in `errors`), and
// echo it to stderr.
ERROR_OUT_PROC(default_error_out_va) {
	gbFile *f = gb_file_get_standard(gbFileStandard_Error);
	char buf[4096] = {};
	isize len = gb_snprintf_va(buf, gb_size_of(buf), fmt, va);
	// NOTE(review): n = len-1 implies gb_snprintf_va's return counts the
	// trailing NUL — confirm against gb.h before relying on it.
	isize n = len-1;
	if (global_error_collector.in_block) {
		// Inside begin/end_error_block the mutex is already held; just
		// append the text to the block buffer.
		isize cap = global_error_collector.error_buffer.count + n;
		array_reserve(&global_error_collector.error_buffer, cap);
		u8 *data = global_error_collector.error_buffer.data + global_error_collector.error_buffer.count;
		gb_memmove(data, buf, n);
		global_error_collector.error_buffer.count += n;
	} else {
		// Standalone message: store a NUL-terminated heap copy under the lock.
		gb_mutex_lock(&global_error_collector.mutex);
		{
			u8 *text = gb_alloc_array(heap_allocator(), u8, n+1);
			gb_memmove(text, buf, n);
			text[n] = 0;
			array_add(&global_error_collector.errors, make_string(text, n));
		}
		gb_mutex_unlock(&global_error_collector.mutex);
	}
	gb_file_write(f, buf, n);
}
// Indirection point so the error sink can be swapped out (e.g. by tools).
ErrorOutProc *error_out_va = default_error_out_va;
// NOTE: defined in build_settings.cpp
bool global_warnings_as_errors(void);
bool global_ignore_warnings(void);
bool show_error_line(void);
gbString get_file_line_as_string(TokenPos const &pos, i32 *offset);
// printf-style front end for the current error output procedure.
void error_out(char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_out_va(fmt, va);
	va_end(va);
}
// Print the offending source line under an error message, with a caret at
// `pos` and tildes spanning up to `end` when the range is on one line.
// Returns true if a line was printed (feature may be disabled, or the line
// may be unavailable).
bool show_error_on_line(TokenPos const &pos, TokenPos end) {
	if (!show_error_line()) {
		return false;
	}
	// `offset` comes back as the column of `pos` within the fetched line.
	i32 offset = 0;
	gbString the_line = get_file_line_as_string(pos, &offset);
	defer (gb_string_free(the_line));
	if (the_line != nullptr) {
		String line = make_string(cast(u8 const *)the_line, gb_string_length(the_line));
		// TODO(bill): This assumes ASCII
		enum {
			MAX_LINE_LENGTH = 76,
			MAX_TAB_WIDTH = 8,
			ELLIPSIS_PADDING = 8
		};
		error_out("\n\t");
		if (line.len+MAX_TAB_WIDTH+ELLIPSIS_PADDING > MAX_LINE_LENGTH) {
			// Line too wide: show a window of up to MAX_LINE_LENGTH centred
			// on the error column, with "..." on both sides.
			i32 const half_width = MAX_LINE_LENGTH/2;
			i32 left = cast(i32)(offset);
			i32 right = cast(i32)(line.len - offset);
			left = gb_min(left, half_width);
			right = gb_min(right, half_width);
			line.text += offset-left;
			line.len -= offset+right-left;
			line = string_trim_whitespace(line);
			// Caret offset is now relative to the trimmed window.
			offset = left + ELLIPSIS_PADDING/2;
			error_out("... %.*s ...", LIT(line));
		} else {
			error_out("%.*s", LIT(line));
		}
		error_out("\n\t");
		// Pad up to the error column, then mark it.
		for (i32 i = 0; i < offset; i++) {
			error_out(" ");
		}
		error_out("^");
		if (end.file_id == pos.file_id) {
			if (end.line > pos.line) {
				// Range continues past this line: underline to end of line.
				for (i32 i = offset; i < line.len; i++) {
					error_out("~");
				}
			} else if (end.line == pos.line && end.column > pos.column) {
				// Range ends on this line: underline it and close with '^'.
				i32 length = gb_min(end.offset - pos.offset, cast(i32)(line.len-offset));
				for (i32 i = 1; i < length-1; i++) {
					error_out("~");
				}
				if (length > 1) {
					error_out("^");
				}
			}
		}
		error_out("\n\n");
		return true;
	}
	return false;
}
// Core error reporter: prints "pos message", shows the source line, and
// suppresses back-to-back duplicates at the same position.
// Exits the process once the error count passes MAX_ERROR_COLLECTOR_COUNT.
void error_va(TokenPos const &pos, TokenPos end, char const *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (pos.line == 0) {
		// No position information available.
		error_out("Error: %s\n", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != pos) {
		global_error_collector.prev = pos;
		error_out("%s %s\n",
			token_pos_to_string(pos),
			gb_bprintf_va(fmt, va));
		show_error_on_line(pos, end);
	}
	gb_mutex_unlock(&global_error_collector.mutex);
	// Checked outside the lock; gb_exit does not return.
	if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) {
		gb_exit(1);
	}
}
// Core warning reporter. Promotes to error_va under -warnings-as-errors;
// still counts (but does not print) warnings when they are ignored.
void warning_va(TokenPos const &pos, TokenPos end, char const *fmt, va_list va) {
	if (global_warnings_as_errors()) {
		error_va(pos, end, fmt, va);
		return;
	}
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.warning_count++;
	if (!global_ignore_warnings()) {
		// NOTE(bill): Duplicate error, skip it
		if (pos.line == 0) {
			// No position information available.
			error_out("Warning: %s\n", gb_bprintf_va(fmt, va));
		} else if (global_error_collector.prev != pos) {
			global_error_collector.prev = pos;
			error_out("%s Warning: %s\n",
				token_pos_to_string(pos),
				gb_bprintf_va(fmt, va));
			show_error_on_line(pos, end);
		}
	}
	gb_mutex_unlock(&global_error_collector.mutex);
}
// Emit raw text through the error sink under the collector lock, with no
// position prefix and no error counting.
void error_line_va(char const *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	error_out_va(fmt, va);
	gb_mutex_unlock(&global_error_collector.mutex);
}
// Same as error_va but without a trailing newline and without the source
// line display; used when the caller appends more text afterwards.
void error_no_newline_va(TokenPos const &pos, char const *fmt, va_list va) {
	gb_mutex_lock(&global_error_collector.mutex);
	global_error_collector.count++;
	// NOTE(bill): Duplicate error, skip it
	if (pos.line == 0) {
		error_out("Error: %s", gb_bprintf_va(fmt, va));
	} else if (global_error_collector.prev != pos) {
		global_error_collector.prev = pos;
		error_out("%s %s",
			token_pos_to_string(pos),
			gb_bprintf_va(fmt, va));
	}
	gb_mutex_unlock(&global_error_collector.mutex);
	if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) {
		gb_exit(1);
	}
}
  480. void syntax_error_va(TokenPos const &pos, TokenPos end, char const *fmt, va_list va) {
  481. gb_mutex_lock(&global_error_collector.mutex);
  482. global_error_collector.count++;
  483. // NOTE(bill): Duplicate error, skip it
  484. if (global_error_collector.prev != pos) {
  485. global_error_collector.prev = pos;
  486. error_out("%s Syntax Error: %s\n",
  487. token_pos_to_string(pos),
  488. gb_bprintf_va(fmt, va));
  489. show_error_on_line(pos, end);
  490. } else if (pos.line == 0) {
  491. error_out("Syntax Error: %s\n", gb_bprintf_va(fmt, va));
  492. }
  493. gb_mutex_unlock(&global_error_collector.mutex);
  494. if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) {
  495. gb_exit(1);
  496. }
  497. }
  498. void syntax_warning_va(TokenPos const &pos, TokenPos end, char const *fmt, va_list va) {
  499. if (global_warnings_as_errors()) {
  500. syntax_error_va(pos, end, fmt, va);
  501. return;
  502. }
  503. gb_mutex_lock(&global_error_collector.mutex);
  504. global_error_collector.warning_count++;
  505. if (!global_ignore_warnings()) {
  506. // NOTE(bill): Duplicate error, skip it
  507. if (global_error_collector.prev != pos) {
  508. global_error_collector.prev = pos;
  509. error_out("%s Syntax Warning: %s\n",
  510. token_pos_to_string(pos),
  511. gb_bprintf_va(fmt, va));
  512. show_error_on_line(pos, end);
  513. } else if (pos.line == 0) {
  514. error_out("Warning: %s\n", gb_bprintf_va(fmt, va));
  515. }
  516. }
  517. gb_mutex_unlock(&global_error_collector.mutex);
  518. }
// Variadic front end: warn at a token's position.
void warning(Token const &token, char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	warning_va(token.pos, {}, fmt, va);
	va_end(va);
}
// Variadic front end: error at a token's position.
void error(Token const &token, char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_va(token.pos, {}, fmt, va);
	va_end(va);
}
  531. void error(TokenPos pos, char const *fmt, ...) {
  532. va_list va;
  533. va_start(va, fmt);
  534. Token token = {};
  535. token.pos = pos;
  536. error_va(pos, {}, fmt, va);
  537. va_end(va);
  538. }
// Variadic front end: raw, position-less error text.
void error_line(char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	error_line_va(fmt, va);
	va_end(va);
}
// Variadic front end: syntax error at a token's position.
void syntax_error(Token const &token, char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_error_va(token.pos, {}, fmt, va);
	va_end(va);
}
// Variadic front end: syntax error at an explicit position.
void syntax_error(TokenPos pos, char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_error_va(pos, {}, fmt, va);
	va_end(va);
}
// Variadic front end: syntax warning at a token's position.
void syntax_warning(Token const &token, char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	syntax_warning_va(token.pos, {}, fmt, va);
	va_end(va);
}
// Fatal internal error: print to stderr and terminate; never returns.
void compiler_error(char const *fmt, ...) {
	va_list va;
	va_start(va, fmt);
	gb_printf_err("Internal Compiler Error: %s\n",
		gb_bprintf_va(fmt, va));
	va_end(va);
	gb_exit(1);
}
// Token-class predicates, implemented as range checks against the sentinel
// Token__*Begin/Token__*End markers in the TOKEN_KINDS list.
gb_inline bool token_is_literal(TokenKind t) {
	return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline bool token_is_operator(TokenKind t) {
	return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline bool token_is_keyword(TokenKind t) {
	return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
gb_inline bool token_is_comparison(TokenKind t) {
	return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
}
gb_inline bool token_is_shift(TokenKind t) {
	return t == Token_Shl || t == Token_Shr;
}
// Debug helper: print a token's source text.
gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }
// Result codes for init_tokenizer.
enum TokenizerInitError {
	TokenizerInit_None,
	TokenizerInit_Invalid,
	TokenizerInit_NotExists,
	TokenizerInit_Permission,
	TokenizerInit_Empty,
	TokenizerInit_FileTooLarge, // file size exceeds I32_MAX
	TokenizerInit_Count,
};
// Snapshot of the tokenizer's cursor, used to back out of speculative scans
// (see save_tokenizer_state / restore_tokenizer_state).
struct TokenizerState {
	Rune curr_rune; // current character
	u8 * curr; // character pos
	u8 * read_curr; // pos from start
	u8 * line; // current line pos
	i32 line_count;
	bool insert_semicolon;
};
enum TokenizerFlags {
	TokenizerFlag_None = 0,
	TokenizerFlag_InsertSemicolon = 1<<0, // automatic semicolon insertion at newlines
};
// Scanner over one source file's byte buffer [start, end).
struct Tokenizer {
	i32 curr_file_id;
	String fullpath;
	u8 *start; // owned copy of the file contents
	u8 *end;   // one past the last byte
	Rune curr_rune; // current character
	u8 * curr; // character pos
	u8 * read_curr; // pos from start
	u8 * line; // current line pos
	i32 line_count;
	i32 error_count;
	Array<String> allocated_strings; // strings built during scanning, freed in destroy_tokenizer
	TokenizerFlags flags;
	bool insert_semicolon;
};
  623. TokenizerState save_tokenizer_state(Tokenizer *t) {
  624. TokenizerState state = {};
  625. state.curr_rune = t->curr_rune;
  626. state.curr = t->curr;
  627. state.read_curr = t->read_curr;
  628. state.line = t->line;
  629. state.line_count = t->line_count;
  630. state.insert_semicolon = t->insert_semicolon;
  631. return state;
  632. }
  633. void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) {
  634. t->curr_rune = state->curr_rune;
  635. t->curr = state->curr;
  636. t->read_curr = state->read_curr;
  637. t->line = state->line;
  638. t->line_count = state->line_count;
  639. t->insert_semicolon = state->insert_semicolon;
  640. }
// Report a syntax error at the tokenizer's current read position and bump
// its error count.
void tokenizer_err(Tokenizer *t, char const *msg, ...) {
	va_list va;
	// Column derived from the distance to the start of the current line.
	isize column = t->read_curr - t->line+1;
	if (column < 1) {
		column = 1;
	}
	TokenPos pos = {};
	pos.file_id = t->curr_file_id;
	pos.line = t->line_count;
	pos.column = cast(i32)column;
	pos.offset = cast(i32)(t->read_curr - t->start);
	va_start(va, msg);
	syntax_error_va(pos, {}, msg, va);
	va_end(va);
	t->error_count++;
}
  657. void tokenizer_err(Tokenizer *t, TokenPos const &pos, char const *msg, ...) {
  658. va_list va;
  659. isize column = t->read_curr - t->line+1;
  660. if (column < 1) {
  661. column = 1;
  662. }
  663. va_start(va, msg);
  664. syntax_error_va(pos, {}, msg, va);
  665. va_end(va);
  666. t->error_count++;
  667. }
// Advance the tokenizer by one rune: decode UTF-8 at read_curr into
// curr_rune, move curr/read_curr forward, and maintain line bookkeeping.
// At end of buffer, curr_rune becomes GB_RUNE_EOF.
void advance_to_next_rune(Tokenizer *t) {
	if (t->read_curr < t->end) {
		Rune rune;
		isize width = 1;
		t->curr = t->read_curr;
		// Line accounting happens when we step OFF a '\n', so `line` points
		// at the first byte after the newline.
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		rune = *t->read_curr;
		if (rune == 0) {
			tokenizer_err(t, "Illegal character NUL");
		} else if (rune >= 0x80) { // not ASCII
			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			if (rune == GB_RUNE_INVALID && width == 1) {
				tokenizer_err(t, "Illegal UTF-8 encoding");
			} else if (rune == GB_RUNE_BOM && t->curr-t->start > 0){
				// A BOM is only tolerated as the very first rune of the file.
				tokenizer_err(t, "Illegal byte order mark");
			}
		}
		t->read_curr += width;
		t->curr_rune = rune;
	} else {
		t->curr = t->end;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		t->curr_rune = GB_RUNE_EOF;
	}
}
// Load `fullpath` into memory and prime the tokenizer on its first rune.
// Returns TokenizerInit_None on success; on failure the specific error is
// derived from the file-open result (or size checks).
TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags flags = TokenizerFlag_None) {
	TokenizerInitError err = TokenizerInit_None;
	char *c_str = alloc_cstring(heap_allocator(), fullpath);
	defer (gb_free(heap_allocator(), c_str));
	// TODO(bill): Memory map rather than copy contents
	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);
	t->flags = flags;
	t->fullpath = fullpath;
	t->line_count = 1;
	if (fc.size > I32_MAX) {
		// TokenPos offsets are i32, so larger files cannot be represented.
		err = TokenizerInit_FileTooLarge;
		gb_file_free_contents(&fc);
	} else if (fc.data != nullptr) {
		t->start = cast(u8 *)fc.data;
		t->line = t->read_curr = t->curr = t->start;
		t->end = t->start + fc.size;
		advance_to_next_rune(t);
		if (t->curr_rune == GB_RUNE_BOM) {
			advance_to_next_rune(t); // Ignore BOM at file beginning
		}
		array_init(&t->allocated_strings, heap_allocator());
	} else {
		// Read failed: reopen just to classify why and report it precisely.
		gbFile f = {};
		gbFileError file_err = gb_file_open(&f, c_str);
		defer (gb_file_close(&f));
		switch (file_err) {
		case gbFileError_Invalid: err = TokenizerInit_Invalid; break;
		case gbFileError_NotExists: err = TokenizerInit_NotExists; break;
		case gbFileError_Permission: err = TokenizerInit_Permission; break;
		}
		if (err == TokenizerInit_None && gb_file_size(&f) == 0) {
			err = TokenizerInit_Empty;
		}
	}
	return err;
}
  735. gb_inline void destroy_tokenizer(Tokenizer *t) {
  736. if (t->start != nullptr) {
  737. gb_free(heap_allocator(), t->start);
  738. }
  739. for_array(i, t->allocated_strings) {
  740. gb_free(heap_allocator(), t->allocated_strings[i].text);
  741. }
  742. array_free(&t->allocated_strings);
  743. }
  744. gb_inline i32 digit_value(Rune r) {
  745. switch (r) {
  746. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
  747. return r - '0';
  748. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  749. return r - 'a' + 10;
  750. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  751. return r - 'A' + 10;
  752. }
  753. return 16; // NOTE(bill): Larger than highest possible
  754. }
  755. gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
  756. while (digit_value(t->curr_rune) < base || t->curr_rune == '_') {
  757. advance_to_next_rune(t);
  758. }
  759. }
  760. u8 peek_byte(Tokenizer *t, isize offset=0) {
  761. if (t->read_curr+offset < t->end) {
  762. return t->read_curr[offset];
  763. }
  764. return 0;
  765. }
  766. void scan_number_to_token(Tokenizer *t, Token *token, bool seen_decimal_point) {
  767. token->kind = Token_Integer;
  768. token->string = {t->curr, 1};
  769. token->pos.file_id = t->curr_file_id;
  770. token->pos.line = t->line_count;
  771. token->pos.column = cast(i32)(t->curr-t->line+1);
  772. if (seen_decimal_point) {
  773. token->string.text -= 1;
  774. token->string.len += 1;
  775. token->pos.column -= 1;
  776. token->kind = Token_Float;
  777. scan_mantissa(t, 10);
  778. goto exponent;
  779. }
  780. if (t->curr_rune == '0') {
  781. u8 *prev = t->curr;
  782. advance_to_next_rune(t);
  783. switch (t->curr_rune) {
  784. case 'b': // Binary
  785. advance_to_next_rune(t);
  786. scan_mantissa(t, 2);
  787. if (t->curr - prev <= 2) {
  788. token->kind = Token_Invalid;
  789. }
  790. goto end;
  791. case 'o': // Octal
  792. advance_to_next_rune(t);
  793. scan_mantissa(t, 8);
  794. if (t->curr - prev <= 2) {
  795. token->kind = Token_Invalid;
  796. }
  797. goto end;
  798. case 'd': // Decimal
  799. advance_to_next_rune(t);
  800. scan_mantissa(t, 10);
  801. if (t->curr - prev <= 2) {
  802. token->kind = Token_Invalid;
  803. }
  804. goto end;
  805. case 'z': // Dozenal
  806. advance_to_next_rune(t);
  807. scan_mantissa(t, 12);
  808. if (t->curr - prev <= 2) {
  809. token->kind = Token_Invalid;
  810. }
  811. goto end;
  812. case 'x': // Hexadecimal
  813. advance_to_next_rune(t);
  814. scan_mantissa(t, 16);
  815. if (t->curr - prev <= 2) {
  816. token->kind = Token_Invalid;
  817. }
  818. goto end;
  819. case 'h': // Hexadecimal Float
  820. token->kind = Token_Float;
  821. advance_to_next_rune(t);
  822. scan_mantissa(t, 16);
  823. if (t->curr - prev <= 2) {
  824. token->kind = Token_Invalid;
  825. } else {
  826. u8 *start = prev+2;
  827. isize n = t->curr - start;
  828. isize digit_count = 0;
  829. for (isize i = 0; i < n; i++) {
  830. if (start[i] != '_') {
  831. digit_count += 1;
  832. }
  833. }
  834. switch (digit_count) {
  835. case 4:
  836. case 8:
  837. case 16:
  838. break;
  839. default:
  840. tokenizer_err(t, "Invalid hexadecimal float, expected 4, 8, or 16 digits, got %td", digit_count);
  841. break;
  842. }
  843. }
  844. goto end;
  845. default:
  846. scan_mantissa(t, 10);
  847. goto fraction;
  848. }
  849. }
  850. scan_mantissa(t, 10);
  851. fraction:
  852. if (t->curr_rune == '.') {
  853. if (peek_byte(t) == '.') {
  854. // NOTE(bill): this is kind of ellipsis
  855. goto end;
  856. }
  857. advance_to_next_rune(t);
  858. token->kind = Token_Float;
  859. scan_mantissa(t, 10);
  860. }
  861. exponent:
  862. if (t->curr_rune == 'e' || t->curr_rune == 'E') {
  863. token->kind = Token_Float;
  864. advance_to_next_rune(t);
  865. if (t->curr_rune == '-' || t->curr_rune == '+') {
  866. advance_to_next_rune(t);
  867. }
  868. scan_mantissa(t, 10);
  869. }
  870. switch (t->curr_rune) {
  871. case 'i': case 'j': case 'k':
  872. token->kind = Token_Imag;
  873. advance_to_next_rune(t);
  874. break;
  875. }
  876. end:
  877. token->string.len = t->curr - token->string.text;
  878. return;
  879. }
  880. bool scan_escape(Tokenizer *t) {
  881. isize len = 0;
  882. u32 base = 0, max = 0, x = 0;
  883. Rune r = t->curr_rune;
  884. switch (r) {
  885. case 'a':
  886. case 'b':
  887. case 'e':
  888. case 'f':
  889. case 'n':
  890. case 'r':
  891. case 't':
  892. case 'v':
  893. case '\\':
  894. case '\'':
  895. case '\"':
  896. advance_to_next_rune(t);
  897. return true;
  898. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
  899. len = 3; base = 8; max = 255;
  900. break;
  901. case 'x':
  902. advance_to_next_rune(t);
  903. len = 2; base = 16; max = 255;
  904. break;
  905. case 'u':
  906. advance_to_next_rune(t);
  907. len = 4; base = 16; max = GB_RUNE_MAX;
  908. break;
  909. case 'U':
  910. advance_to_next_rune(t);
  911. len = 8; base = 16; max = GB_RUNE_MAX;
  912. break;
  913. default:
  914. if (t->curr_rune < 0) {
  915. tokenizer_err(t, "Escape sequence was not terminated");
  916. } else {
  917. tokenizer_err(t, "Unknown escape sequence");
  918. }
  919. return false;
  920. }
  921. while (len --> 0) {
  922. u32 d = cast(u32)digit_value(t->curr_rune);
  923. if (d >= base) {
  924. if (t->curr_rune < 0) {
  925. tokenizer_err(t, "Escape sequence was not terminated");
  926. } else {
  927. tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune);
  928. }
  929. return false;
  930. }
  931. x = x*base + d;
  932. advance_to_next_rune(t);
  933. }
  934. return true;
  935. }
// Scan the next token from the stream into `token`.
// Implements automatic semicolon insertion: t->insert_semicolon records
// whether the previous token may legally end a statement, in which case a
// following newline (or EOF) is returned as Token_Semicolon.
// NOTE(review): `repeat` is unused — presumably reserved to bound the
// recursion in the '\\' case below; confirm before removing.
void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
	// Skip whitespace. A newline is only whitespace when no semicolon
	// insertion is pending; otherwise it must be tokenized below.
	for (;;) {
		switch (t->curr_rune) {
		case '\n':
			if (t->insert_semicolon) {
				break;
			}
			/*fallthrough*/
		case ' ':
		case '\t':
		case '\r':
			advance_to_next_rune(t);
			continue;
		}
		break;
	}

	// Default-initialize the token at the current position.
	token->kind = Token_Invalid;
	token->string.text = t->curr;
	token->string.len = 1;
	token->pos.file_id = t->curr_file_id;
	token->pos.line = t->line_count;
	token->pos.offset = cast(i32)(t->curr - t->start);
	token->pos.column = cast(i32)(t->curr - t->line + 1);

	TokenPos current_pos = token->pos;

	// Whether THIS token permits a semicolon to be inserted after it.
	bool insert_semicolon = false;

	Rune curr_rune = t->curr_rune;
	if (rune_is_letter(curr_rune)) {
		// Identifier or keyword.
		token->kind = Token_Ident;
		while (rune_is_letter_or_digit(t->curr_rune)) {
			advance_to_next_rune(t);
		}

		token->string.len = t->curr - token->string.text;

		// NOTE(bill): Heavily optimize to make it faster to find keywords
		// keyword_indices gates on length; the hash table then confirms by
		// full string comparison to rule out collisions.
		if (1 < token->string.len && token->string.len <= max_keyword_size && keyword_indices[token->string.len]) {
			u32 hash = keyword_hash(token->string.text, token->string.len);
			u32 index = hash & KEYWORD_HASH_TABLE_MASK;
			KeywordHashEntry *entry = &keyword_hash_table[index];
			if (entry->kind != Token_Invalid && entry->hash == hash) {
				if (str_eq(entry->text, token->string)) {
					token->kind = entry->kind;
					if (token->kind == Token_not_in && entry->text == "notin") {
						syntax_warning(*token, "'notin' is deprecated in favour of 'not_in'");
					}
				}
			}
		}

		// These kinds may legally end a statement, so a following newline
		// should be turned into a semicolon.
		switch (token->kind) {
		case Token_Ident:
		case Token_context:
		case Token_typeid: // Dunno?
		case Token_break:
		case Token_continue:
		case Token_fallthrough:
		case Token_return:
			insert_semicolon = true;
			break;
		}

		if (t->flags & TokenizerFlag_InsertSemicolon) {
			t->insert_semicolon = insert_semicolon;
		}

		return;
	} else if (gb_is_between(curr_rune, '0', '9')) {
		// Numeric literal; a number may end a statement.
		insert_semicolon = true;
		scan_number_to_token(t, token, false);
	} else {
		advance_to_next_rune(t);
		switch (curr_rune) {
		case GB_RUNE_EOF:
			token->kind = Token_EOF;
			if (t->insert_semicolon) {
				// Emit one final synthetic semicolon before reporting EOF.
				t->insert_semicolon = false; // EOF consumed
				token->string = str_lit("\n");
				token->kind = Token_Semicolon;
				return;
			}
			break;

		case '\n':
			// Only reachable when insert_semicolon was pending: the newline
			// itself becomes the inserted semicolon token.
			t->insert_semicolon = false;
			token->string = str_lit("\n");
			token->kind = Token_Semicolon;
			return;

		case '\\':
			// Line continuation: suppress pending semicolon insertion and
			// tokenize whatever follows; it must be on a new line.
			if (t->flags & TokenizerFlag_InsertSemicolon) {
				t->insert_semicolon = false;
			}
			tokenizer_get_token(t, token);
			if (token->pos.line == current_pos.line) {
				tokenizer_err(t, token_pos_add_column(current_pos), "Expected a newline after \\");
			}
			// NOTE(bill): tokenizer_get_token has been called already, return early
			return;

		case '\'': // Rune Literal
		{
			insert_semicolon = true;
			token->kind = Token_Rune;
			Rune quote = curr_rune;
			bool valid = true;
			i32 n = 0, success;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					tokenizer_err(t, "Rune literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == quote) {
					break;
				}
				n++;
				if (r == '\\') {
					if (!scan_escape(t)) {
						valid = false;
					}
				}
			}
			// TODO(bill): Better Error Handling
			// A rune literal must contain exactly one rune/escape.
			if (valid && n != 1) {
				tokenizer_err(t, "Invalid rune literal");
			}
			token->string.len = t->curr - token->string.text;
			// success == 2 means a new string was allocated; track it so
			// destroy_tokenizer can free it later.
			success = unquote_string(heap_allocator(), &token->string, 0);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token->string);
				}
			} else {
				tokenizer_err(t, "Invalid rune literal");
			}

			if (t->flags & TokenizerFlag_InsertSemicolon) {
				t->insert_semicolon = insert_semicolon;
			}

			return;
		} break;

		case '`': // Raw String Literal
		case '"': // String Literal
		{
			insert_semicolon = true;
			bool has_carriage_return = false;
			i32 success;
			Rune quote = curr_rune;
			token->kind = Token_String;
			if (curr_rune == '"') {
				// Normal string: ends at the closing quote, may not span
				// lines, escapes are validated as they are seen.
				for (;;) {
					Rune r = t->curr_rune;
					if (r == '\n' || r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
					if (r == '\\') {
						scan_escape(t);
					}
				}
			} else {
				// Raw string: may span lines, no escapes; note any '\r' so
				// unquote_string can normalize line endings.
				for (;;) {
					Rune r = t->curr_rune;
					if (r < 0) {
						tokenizer_err(t, "String literal not terminated");
						break;
					}
					advance_to_next_rune(t);
					if (r == quote) {
						break;
					}
					if (r == '\r') {
						has_carriage_return = true;
					}
				}
			}
			token->string.len = t->curr - token->string.text;
			// success == 2 means a new string was allocated; track it for
			// later freeing in destroy_tokenizer.
			success = unquote_string(heap_allocator(), &token->string, 0, has_carriage_return);
			if (success > 0) {
				if (success == 2) {
					array_add(&t->allocated_strings, token->string);
				}
			} else {
				tokenizer_err(t, "Invalid string literal");
			}

			if (t->flags & TokenizerFlag_InsertSemicolon) {
				t->insert_semicolon = insert_semicolon;
			}

			return;
		} break;

		case '.':
			if (t->curr_rune == '.') {
				// "..", "..<", "..="
				advance_to_next_rune(t);
				token->kind = Token_Ellipsis;
				if (t->curr_rune == '<') {
					advance_to_next_rune(t);
					token->kind = Token_RangeHalf;
				} else if (t->curr_rune == '=') {
					advance_to_next_rune(t);
					token->kind = Token_RangeFull;
				}
			} else if ('0' <= t->curr_rune && t->curr_rune <= '9') {
				// ".5" style float literal.
				scan_number_to_token(t, token, true);
			} else {
				token->kind = Token_Period;
			}
			break;

		case '@': token->kind = Token_At;     break;
		case '$': token->kind = Token_Dollar; break;
		case '?':
			insert_semicolon = true;
			token->kind = Token_Question;
			break;
		case '^':
			insert_semicolon = true;
			token->kind = Token_Pointer;
			break;
		case ';': token->kind = Token_Semicolon; break;
		case ',': token->kind = Token_Comma;     break;
		case ':': token->kind = Token_Colon;     break;
		case '(': token->kind = Token_OpenParen; break;
		case ')':
			// Closing delimiters may end a statement.
			insert_semicolon = true;
			token->kind = Token_CloseParen;
			break;
		case '[': token->kind = Token_OpenBracket; break;
		case ']':
			insert_semicolon = true;
			token->kind = Token_CloseBracket;
			break;
		case '{': token->kind = Token_OpenBrace; break;
		case '}':
			insert_semicolon = true;
			token->kind = Token_CloseBrace;
			break;

		case '%':
			// %, %=, %%, %%=
			token->kind = Token_Mod;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_ModEq;
			} else if (t->curr_rune == '%') {
				token->kind = Token_ModMod;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_ModModEq;
					advance_to_next_rune(t);
				}
			}
			break;

		case '*':
			token->kind = Token_Mul;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_MulEq;
			}
			break;
		case '=':
			token->kind = Token_Eq;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_CmpEq;
			}
			break;
		case '~':
			token->kind = Token_Xor;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_XorEq;
			}
			break;
		case '!':
			token->kind = Token_Not;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_NotEq;
			}
			break;

		case '+':
			// +, +=, ++
			token->kind = Token_Add;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_AddEq;
			} else if (t->curr_rune == '+') {
				advance_to_next_rune(t);
				token->kind = Token_Increment;
				insert_semicolon = true;
			}
			break;

		case '-':
			// -, -=, --, --- (undef), ->
			token->kind = Token_Sub;
			if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_SubEq;
			} else if (t->curr_rune == '-') {
				insert_semicolon = true;
				advance_to_next_rune(t);
				token->kind = Token_Decrement;
				if (t->curr_rune == '-') {
					advance_to_next_rune(t);
					token->kind = Token_Undef;
				}
			} else if (t->curr_rune == '>') {
				advance_to_next_rune(t);
				token->kind = Token_ArrowRight;
			}
			break;

		case '#':
			// "#!" line comment (shebang-style), otherwise a '#' directive.
			if (t->curr_rune == '!') {
				// Comments do not change semicolon-insertion state.
				insert_semicolon = t->insert_semicolon;
				token->kind = Token_Comment;
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
					advance_to_next_rune(t);
				}
			} else {
				token->kind = Token_Hash;
			}
			break;

		case '/': {
			// /, //, /* ... */ (nested), /=
			token->kind = Token_Quo;
			if (t->curr_rune == '/') {
				// Line comment; preserve pending semicolon insertion.
				insert_semicolon = t->insert_semicolon;
				token->kind = Token_Comment;
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
					advance_to_next_rune(t);
				}
			} else if (t->curr_rune == '*') {
				// Block comment; supports nesting via comment_scope depth.
				token->kind = Token_Comment;
				isize comment_scope = 1;
				advance_to_next_rune(t);
				while (comment_scope > 0) {
					if (t->curr_rune == GB_RUNE_EOF) {
						break;
					} else if (t->curr_rune == '/') {
						advance_to_next_rune(t);
						if (t->curr_rune == '*') {
							advance_to_next_rune(t);
							comment_scope++;
						}
					} else if (t->curr_rune == '*') {
						advance_to_next_rune(t);
						if (t->curr_rune == '/') {
							advance_to_next_rune(t);
							comment_scope--;
						}
					} else {
						advance_to_next_rune(t);
					}
				}
			} else if (t->curr_rune == '=') {
				advance_to_next_rune(t);
				token->kind = Token_QuoEq;
			}
		} break;

		case '<':
			// <, <=, <<, <<=
			token->kind = Token_Lt;
			if (t->curr_rune == '=') {
				token->kind = Token_LtEq;
				advance_to_next_rune(t);
			} else if (t->curr_rune == '<') {
				token->kind = Token_Shl;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_ShlEq;
					advance_to_next_rune(t);
				}
			}
			break;

		case '>':
			// >, >=, >>, >>=
			token->kind = Token_Gt;
			if (t->curr_rune == '=') {
				token->kind = Token_GtEq;
				advance_to_next_rune(t);
			} else if (t->curr_rune == '>') {
				token->kind = Token_Shr;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_ShrEq;
					advance_to_next_rune(t);
				}
			}
			break;

		case '&':
			// &, &~, &~=, &=, &&, &&=
			token->kind = Token_And;
			if (t->curr_rune == '~') {
				token->kind = Token_AndNot;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_AndNotEq;
					advance_to_next_rune(t);
				}
			} else if (t->curr_rune == '=') {
				token->kind = Token_AndEq;
				advance_to_next_rune(t);
			} else if (t->curr_rune == '&') {
				token->kind = Token_CmpAnd;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_CmpAndEq;
					advance_to_next_rune(t);
				}
			}
			break;

		case '|':
			// |, |=, ||, ||=
			token->kind = Token_Or;
			if (t->curr_rune == '=') {
				token->kind = Token_OrEq;
				advance_to_next_rune(t);
			} else if (t->curr_rune == '|') {
				token->kind = Token_CmpOr;
				advance_to_next_rune(t);
				if (t->curr_rune == '=') {
					token->kind = Token_CmpOrEq;
					advance_to_next_rune(t);
				}
			}
			break;

		default:
			// Any other rune is an error (a stray BOM is silently skipped).
			if (curr_rune != GB_RUNE_BOM) {
				u8 str[4] = {};
				int len = cast(int)gb_utf8_encode_rune(str, curr_rune);
				tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
			}
			insert_semicolon = t->insert_semicolon; // Preserve insert_semicolon info
			token->kind = Token_Invalid;
			break;
		}
	}

	if (t->flags & TokenizerFlag_InsertSemicolon) {
		t->insert_semicolon = insert_semicolon;
	}

	token->string.len = t->curr - token->string.text;

	return;
}