// tokenizer.c
#include "tokenizer.h"
#include "errors.h"

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

// Returns the token at index, bounds-checked against the number of tokens
// produced so far.
token tokens_get(tokens *tokens, size_t index) {
	debug_context context = {0};
	check(tokens->current_size > index, context, "Token index out of bounds");
	return tokens->t[index];
}

// True if ch starts a number literal: either a digit, or a minus sign whose
// lookahead character chch is a digit.
static bool is_num(char ch, char chch) {
	return (ch >= '0' && ch <= '9') || (ch == '-' && chch >= '0' && chch <= '9');
}

// True if ch can begin an operator.
static bool is_op(char ch) {
	return ch == '&' || ch == '|' || ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '=' || ch == '!' || ch == '<' || ch == '>' || ch == '%' ||
	       ch == '^';
}

// True for space and the ASCII control characters tab through carriage return.
static bool is_whitespace(char ch) {
	return ch == ' ' || (ch >= 9 && ch <= 13);
}
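
/*
 * A minimal sketch (not part of the tokenizer) of how the two-character
 * lookahead in is_num separates a negative number literal from a minus
 * operator:
 *
 *     is_num('-', '5'); // true: "-5" starts a number literal
 *     is_num('-', 'x'); // false: this '-' is an operator
 *     is_num('7', 0);   // true: a digit always starts a number
 */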

// The tokenizer is a small state machine; mode tracks what is currently being
// scanned.
typedef enum mode {
	MODE_SELECT,
	MODE_NUMBER,
	// MODE_STRING,
	MODE_OPERATOR,
	MODE_IDENTIFIER,
	MODE_LINE_COMMENT,
	MODE_COMMENT
} mode;

// Scanning state: a two-character lookahead window (next, next_next) plus the
// current source position.
typedef struct tokenizer_state {
	const char *iterator;
	char next;
	char next_next;
	int line, column;
	bool line_end;
} tokenizer_state;

// Primes the lookahead window: next is the first character of the source,
// next_next the second (or 0 at the end of input).
static void tokenizer_state_init(debug_context *context, tokenizer_state *state, const char *source) {
	state->line = state->column = 0;
	state->iterator = source;
	state->next = *state->iterator;
	if (*state->iterator != 0) {
		state->iterator += 1;
	}
	state->next_next = *state->iterator;
	state->line_end = false;
	context->column = 0;
	context->line = 0;
}

// Slides the window one character forward and updates the position. The line
// counter only advances on the character after a newline, so the newline
// itself is still reported on the line it terminates.
static void tokenizer_state_advance(debug_context *context, tokenizer_state *state) {
	state->next = state->next_next;
	if (*state->iterator != 0) {
		state->iterator += 1;
	}
	state->next_next = *state->iterator;

	if (state->line_end) {
		state->line_end = false;
		state->line += 1;
		state->column = 0;
	}
	else {
		state->column += 1;
	}

	if (state->next == '\n') {
		state->line_end = true;
	}

	context->column = state->column;
	context->line = state->line;
}
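
/*
 * A small usage sketch of the lookahead window, assuming the two-line input
 * "a\nb" (illustrative only):
 *
 *     debug_context ctx = {0};
 *     tokenizer_state s;
 *     tokenizer_state_init(&ctx, &s, "a\nb");
 *     // s.next == 'a',  s.next_next == '\n', line 0, column 0
 *     tokenizer_state_advance(&ctx, &s);
 *     // s.next == '\n', s.next_next == 'b',  line 0, column 1, line_end set
 *     tokenizer_state_advance(&ctx, &s);
 *     // s.next == 'b',  s.next_next == 0,    line 1, column 0
 */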

// Scratch buffer that accumulates the characters of the token currently being
// scanned, together with the source position of the token's first character.
typedef struct tokenizer_buffer {
	char *buf;
	size_t current_size;
	size_t max_size;
	int column, line;
} tokenizer_buffer;

static void tokenizer_buffer_init(tokenizer_buffer *buffer) {
	debug_context context = {0};
	buffer->max_size = 1024 * 1024;
	buffer->buf = (char *)malloc(buffer->max_size);
	check(buffer->buf != NULL, context, "Could not allocate token buffer");
	buffer->current_size = 0;
	buffer->column = buffer->line = 0;
}

static void tokenizer_buffer_reset(tokenizer_buffer *buffer, tokenizer_state *state) {
	buffer->current_size = 0;
	buffer->column = state->column;
	buffer->line = state->line;
}

static void tokenizer_buffer_add(tokenizer_buffer *buffer, char ch) {
	debug_context context = {0};
	check(buffer->current_size < buffer->max_size, context, "Token buffer is too small");
	buffer->buf[buffer->current_size] = ch;
	buffer->current_size += 1;
}

static bool tokenizer_buffer_equals(tokenizer_buffer *buffer, const char *str) {
	debug_context context = {0};
	check(buffer->current_size < buffer->max_size, context, "Token buffer is too small");
	buffer->buf[buffer->current_size] = 0;
	return strcmp(buffer->buf, str) == 0;
}

static name_id tokenizer_buffer_to_name(tokenizer_buffer *buffer) {
	debug_context context = {0};
	check(buffer->current_size < buffer->max_size, context, "Token buffer is too small");
	buffer->buf[buffer->current_size] = 0;
	return add_name(buffer->buf);
}

static double tokenizer_buffer_parse_number(tokenizer_buffer *buffer) {
	debug_context context = {0};
	check(buffer->current_size < buffer->max_size, context, "Token buffer is too small");
	buffer->buf[buffer->current_size] = 0;
	return strtod(buffer->buf, NULL);
}
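
/*
 * Buffer lifecycle sketch, assuming a state positioned at the start of the
 * literal "3.14" (illustrative only):
 *
 *     tokenizer_buffer buffer;
 *     tokenizer_buffer_init(&buffer);
 *     tokenizer_buffer_reset(&buffer, &state); // records the token's start position
 *     tokenizer_buffer_add(&buffer, '3');
 *     tokenizer_buffer_add(&buffer, '.');
 *     tokenizer_buffer_add(&buffer, '1');
 *     tokenizer_buffer_add(&buffer, '4');
 *     double value = tokenizer_buffer_parse_number(&buffer); // 3.14
 *     free(buffer.buf);
 */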

token token_create(int kind, tokenizer_state *state) {
	token token;
	token.kind = kind;
	token.column = state->column;
	token.line = state->line;
	return token;
}

static void tokens_init(tokens *tokens) {
	debug_context context = {0};
	tokens->max_size = 1024 * 1024;
	tokens->t = malloc(tokens->max_size * sizeof(token));
	check(tokens->t != NULL, context, "Could not allocate tokens");
	tokens->current_size = 0;
}

static void tokens_add(tokens *tokens, token token) {
	// Check for space before writing so an overflow cannot corrupt memory.
	debug_context context = {0};
	check(tokens->current_size < tokens->max_size, context, "Out of tokens");
	tokens->t[tokens->current_size] = token;
	tokens->current_size += 1;
}

// Emits the token for a completed word: keywords and the boolean literals get
// their own token kinds, everything else becomes an interned identifier. The
// token is stamped with the position recorded when buffering began.
static void tokens_add_identifier(tokenizer_state *state, tokens *tokens, tokenizer_buffer *buffer) {
	token token;
	if (tokenizer_buffer_equals(buffer, "true")) {
		token = token_create(TOKEN_BOOLEAN, state);
		token.boolean = true;
	}
	else if (tokenizer_buffer_equals(buffer, "false")) {
		token = token_create(TOKEN_BOOLEAN, state);
		token.boolean = false;
	}
	else if (tokenizer_buffer_equals(buffer, "if")) {
		token = token_create(TOKEN_IF, state);
	}
	else if (tokenizer_buffer_equals(buffer, "else")) {
		token = token_create(TOKEN_ELSE, state);
	}
	else if (tokenizer_buffer_equals(buffer, "while")) {
		token = token_create(TOKEN_WHILE, state);
	}
	else if (tokenizer_buffer_equals(buffer, "do")) {
		token = token_create(TOKEN_DO, state);
	}
	else if (tokenizer_buffer_equals(buffer, "for")) {
		token = token_create(TOKEN_FOR, state);
	}
	else if (tokenizer_buffer_equals(buffer, "in")) {
		token = token_create(TOKEN_IN, state);
	}
	else if (tokenizer_buffer_equals(buffer, "struct")) {
		token = token_create(TOKEN_STRUCT, state);
	}
	else if (tokenizer_buffer_equals(buffer, "fun")) {
		token = token_create(TOKEN_FUNCTION, state);
	}
	else if (tokenizer_buffer_equals(buffer, "var")) {
		token = token_create(TOKEN_VAR, state);
	}
	else if (tokenizer_buffer_equals(buffer, "const")) {
		token = token_create(TOKEN_CONST, state);
	}
	else if (tokenizer_buffer_equals(buffer, "return")) {
		token = token_create(TOKEN_RETURN, state);
	}
	else if (tokenizer_buffer_equals(buffer, "discard")) {
		token = token_create(TOKEN_DISCARD, state);
	}
	else {
		token = token_create(TOKEN_IDENTIFIER, state);
		token.identifier = tokenizer_buffer_to_name(buffer);
	}
	token.column = buffer->column;
	token.line = buffer->line;
	tokens_add(tokens, token);
}
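
/*
 * Classification sketch: with "while" buffered, the call below emits
 * TOKEN_WHILE; with "whale" buffered it emits TOKEN_IDENTIFIER whose name is
 * interned via add_name (illustrative only):
 *
 *     tokens_add_identifier(&state, &toks, &buffer);
 *     token last = tokens_get(&toks, toks.current_size - 1);
 *     // last.kind == TOKEN_WHILE or TOKEN_IDENTIFIER
 */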

tokens tokenize(const char *filename, const char *source) {
	mode mode = MODE_SELECT;
	bool number_has_dot = false;

	tokens tokens;
	tokens_init(&tokens);

	debug_context context = {0};
	context.filename = filename;

	tokenizer_state state;
	tokenizer_state_init(&context, &state, source);

	tokenizer_buffer buffer;
	tokenizer_buffer_init(&buffer);

	for (;;) {
		if (state.next == 0) {
			// End of input: flush the token that is still being scanned, if any.
			switch (mode) {
			case MODE_IDENTIFIER:
				tokens_add_identifier(&state, &tokens, &buffer);
				break;
			case MODE_NUMBER: {
				token token = token_create(number_has_dot ? TOKEN_FLOAT : TOKEN_INT, &state);
				token.number = tokenizer_buffer_parse_number(&buffer);
				tokens_add(&tokens, token);
				break;
			}
			case MODE_SELECT:
			case MODE_LINE_COMMENT:
				break;
			// case MODE_STRING:
			//	error("Unclosed string", state.column, state.line);
			case MODE_OPERATOR:
				error(context, "File ends with an operator");
			case MODE_COMMENT:
				error(context, "Unclosed comment");
			}

			// Terminate the stream with a TOKEN_NONE sentinel.
			tokens_add(&tokens, token_create(TOKEN_NONE, &state));

			free(buffer.buf);

			return tokens;
		}
		else {
			char ch = (char)state.next;

			switch (mode) {
			case MODE_SELECT: {
				if (ch == '/') {
					// A slash can start a comment, so peek at the next character.
					if (state.next_next >= 0) {
						char chch = state.next_next;
						switch (chch) {
						case '/':
							mode = MODE_LINE_COMMENT;
							break;
						case '*':
							mode = MODE_COMMENT;
							break;
						default:
							tokenizer_buffer_reset(&buffer, &state);
							tokenizer_buffer_add(&buffer, ch);
							mode = MODE_OPERATOR;
						}
					}
				}
				else if (is_num(ch, state.next_next)) {
					mode = MODE_NUMBER;
					number_has_dot = false;
					tokenizer_buffer_reset(&buffer, &state);
					tokenizer_buffer_add(&buffer, ch);
				}
				else if (is_op(ch)) {
					mode = MODE_OPERATOR;
					tokenizer_buffer_reset(&buffer, &state);
					tokenizer_buffer_add(&buffer, ch);
				}
				else if (is_whitespace(ch)) {
					// Skip whitespace.
				}
				else if (ch == '(') {
					tokens_add(&tokens, token_create(TOKEN_LEFT_PAREN, &state));
				}
				else if (ch == ')') {
					tokens_add(&tokens, token_create(TOKEN_RIGHT_PAREN, &state));
				}
				else if (ch == '{') {
					tokens_add(&tokens, token_create(TOKEN_LEFT_CURLY, &state));
				}
				else if (ch == '}') {
					tokens_add(&tokens, token_create(TOKEN_RIGHT_CURLY, &state));
				}
				else if (ch == '#') {
					tokens_add(&tokens, token_create(TOKEN_HASH, &state));
				}
				else if (ch == '[') {
					tokens_add(&tokens, token_create(TOKEN_LEFT_SQUARE, &state));
				}
				else if (ch == ']') {
					tokens_add(&tokens, token_create(TOKEN_RIGHT_SQUARE, &state));
				}
				else if (ch == ';') {
					tokens_add(&tokens, token_create(TOKEN_SEMICOLON, &state));
				}
				else if (ch == '.') {
					tokens_add(&tokens, token_create(TOKEN_DOT, &state));
				}
				else if (ch == ':') {
					tokens_add(&tokens, token_create(TOKEN_COLON, &state));
				}
				else if (ch == ',') {
					tokens_add(&tokens, token_create(TOKEN_COMMA, &state));
				}
				else if (ch == '"' || ch == '\'') {
					// mode = MODE_STRING;
					// tokenizer_buffer_reset(&buffer, &state);
					error(context, "Strings are not supported");
				}
				else {
					mode = MODE_IDENTIFIER;
					tokenizer_buffer_reset(&buffer, &state);
					tokenizer_buffer_add(&buffer, ch);
				}
				tokenizer_state_advance(&context, &state);
				break;
			}
			case MODE_LINE_COMMENT: {
				if (ch == '\n') {
					mode = MODE_SELECT;
				}
				tokenizer_state_advance(&context, &state);
				break;
			}
			case MODE_COMMENT: {
				if (ch == '*') {
					if (state.next_next >= 0) {
						char chch = (char)state.next_next;
						if (chch == '/') {
							mode = MODE_SELECT;
							tokenizer_state_advance(&context, &state);
						}
					}
				}
				tokenizer_state_advance(&context, &state);
				break;
			}
			case MODE_NUMBER: {
				if (is_num(ch, 0) || ch == '.') {
					if (ch == '.') {
						number_has_dot = true;
					}
					tokenizer_buffer_add(&buffer, ch);
					tokenizer_state_advance(&context, &state);
				}
				else {
					// The number is complete; emit it without consuming ch.
					token token = token_create(number_has_dot ? TOKEN_FLOAT : TOKEN_INT, &state);
					token.number = tokenizer_buffer_parse_number(&buffer);
					tokens_add(&tokens, token);
					mode = MODE_SELECT;
				}
				break;
			}
			case MODE_OPERATOR: {
				// Try to extend the buffered single character into a known
				// two-character operator before classifying.
				char long_op[3];
				long_op[0] = 0;
				if (buffer.current_size == 1) {
					long_op[0] = buffer.buf[0];
					long_op[1] = ch;
					long_op[2] = 0;
				}
				if (strcmp(long_op, "==") == 0 || strcmp(long_op, "!=") == 0 || strcmp(long_op, "<=") == 0 || strcmp(long_op, ">=") == 0 ||
				    strcmp(long_op, "||") == 0 || strcmp(long_op, "&&") == 0 || strcmp(long_op, "->") == 0 || strcmp(long_op, "-=") == 0 ||
				    strcmp(long_op, "+=") == 0 || strcmp(long_op, "/=") == 0 || strcmp(long_op, "*=") == 0 || strcmp(long_op, "<<") == 0 ||
				    strcmp(long_op, ">>") == 0) {
					tokenizer_buffer_add(&buffer, ch);
					tokenizer_state_advance(&context, &state);
				}

				if (tokenizer_buffer_equals(&buffer, "==")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_EQUALS;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "!=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_NOT_EQUALS;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, ">")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_GREATER;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, ">=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_GREATER_EQUAL;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "<")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_LESS;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "<=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_LESS_EQUAL;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "-=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_MINUS_ASSIGN;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "+=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_PLUS_ASSIGN;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "/=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_DIVIDE_ASSIGN;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "*=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_MULTIPLY_ASSIGN;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "-")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_MINUS;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "+")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_PLUS;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "/")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_DIVIDE;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "*")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_MULTIPLY;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "!")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_NOT;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "||")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_OR;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "^")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_BITWISE_XOR;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "&")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_BITWISE_AND;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "|")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_BITWISE_OR;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "<<")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_LEFT_SHIFT;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, ">>")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_RIGHT_SHIFT;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "&&")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_AND;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "%")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_MOD;
					tokens_add(&tokens, token);
				}
				else if (tokenizer_buffer_equals(&buffer, "=")) {
					token token = token_create(TOKEN_OPERATOR, &state);
					token.op = OPERATOR_ASSIGN;
					tokens_add(&tokens, token);
				}
				else {
					error(context, "Weird operator");
				}
				mode = MODE_SELECT;
				break;
			}
			/*case MODE_STRING: {
				if (ch == '"' || ch == '\'') {
					token token = token_create(TOKEN_STRING, &state);
					tokenizer_buffer_copy_to_string(&buffer, token.string);
					token.column = buffer.column;
					token.line = buffer.line;
					tokens_add(&tokens, token);
					tokenizer_state_advance(&state);
					mode = MODE_SELECT;
				}
				else {
					tokenizer_buffer_add(&buffer, ch);
					tokenizer_state_advance(&state);
				}
				break;
			}*/
			case MODE_IDENTIFIER: {
				if (is_whitespace(ch) || is_op(ch) || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '"' || ch == '\'' ||
				    ch == ';' || ch == '.' || ch == ',' || ch == ':') {
					// A delimiter ends the word; emit it without consuming ch.
					tokens_add_identifier(&state, &tokens, &buffer);
					mode = MODE_SELECT;
				}
				else {
					tokenizer_buffer_add(&buffer, ch);
					tokenizer_state_advance(&context, &state);
				}
				break;
			}
			}
		}
	}
}
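
/*
 * End-to-end usage sketch (illustrative only; assumes check/error abort on
 * failure as elsewhere in this file):
 *
 *     tokens toks = tokenize("example.kong", "var x = 1 + 2;");
 *     for (size_t i = 0; tokens_get(&toks, i).kind != TOKEN_NONE; ++i) {
 *         token t = tokens_get(&toks, i);
 *         // Yields TOKEN_VAR, TOKEN_IDENTIFIER, TOKEN_OPERATOR (OPERATOR_ASSIGN),
 *         // TOKEN_INT, TOKEN_OPERATOR (OPERATOR_PLUS), TOKEN_INT, TOKEN_SEMICOLON,
 *         // each stamped with its line and column.
 *     }
 *     free(toks.t);
 */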