token.odin 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. package cel
  2. import "core:fmt"
  3. import "core:unicode/utf8"
// Kind enumerates every token kind the tokenizer can produce.
// `using` exports the values (Illegal, EOF, ...) into the package scope.
// The _*_start/_*_end values are range markers, not real tokens: they
// delimit the half-open ranges tested by is_literal/is_keyword/is_operator.
using Kind :: enum {
	Illegal,
	EOF,
	Comment,

	_literal_start,
	Ident,
	Integer,
	Float,
	Char,
	String,
	_literal_end,

	_keyword_start,
	True,  // true
	False, // false
	Nil,   // nil
	_keyword_end,

	_operator_start,
	Question, // ?
	And,      // and
	Or,       // or
	Add,      // +
	Sub,      // -
	Mul,      // *
	Quo,      // /
	Rem,      // %
	Not,      // !
	Eq,       // ==
	NotEq,    // !=
	Lt,       // <
	Gt,       // >
	LtEq,     // <=
	GtEq,     // >=
	At,       // @
	_operator_end,

	_punc_start,
	Assign,        // =
	Open_Paren,    // (
	Close_Paren,   // )
	Open_Bracket,  // [
	Close_Bracket, // ]
	Open_Brace,    // {
	Close_Brace,   // }
	Colon,         // :
	Semicolon,     // ;
	Comma,         // ,
	Period,        // .
	_punc_end,
}
// Pos is a source location: file name plus 1-based line and column
// (column is computed in bytes from the start of the line; see get_pos).
Pos :: struct {
	file:   string,
	line:   int,
	column: int,
}
// Token is a single lexeme: its kind, the position where it starts,
// and its literal text (a slice into the tokenizer's source buffer).
Token :: struct {
	kind: Kind,
	using pos: Pos,
	lit:  string,
}
// Tokenizer holds the lexing state for one source buffer.
Tokenizer :: struct {
	src:         []byte, // entire input being lexed
	file:        string, // May not be used
	curr_rune:   rune,   // current rune (utf8.RUNE_EOF once input is exhausted)
	offset:      int,    // byte offset of curr_rune within src
	read_offset: int,    // byte offset just past curr_rune
	line_offset: int,    // byte offset of the start of the current line
	line_count:  int,    // 1-based current line number
	insert_semi: bool,   // insert an implicit semicolon before the next newline
	error_count: int,    // number of errors reported so far
}
// keywords maps reserved spellings to their token kinds;
// consulted by token_lookup for identifiers longer than one character.
keywords := map[string]Kind{
	"true"  = True,
	"false" = False,
	"nil"   = Nil,
	"and"   = And,
	"or"    = Or,
};
// kind_to_string gives the display text for each Kind, indexed by enum value.
// NOTE: the entry order MUST mirror the Kind declaration exactly; the empty
// strings stand in for the _start/_end range-marker values.
kind_to_string := [len(Kind)]string{
	"illegal",
	"EOF",
	"comment",
	"", // _literal_start
	"identifier",
	"integer",
	"float",
	"character",
	"string",
	"", // _literal_end
	"", // _keyword_start
	"true", "false", "nil",
	"", // _keyword_end
	"", // _operator_start
	"?", "and", "or",
	"+", "-", "*", "/", "%",
	"!",
	"==", "!=", "<", ">", "<=", ">=",
	"@",
	"", // _operator_end
	"", // _punc_start
	"=",
	"(", ")",
	"[", "]",
	"{", "}",
	":", ";", ",", ".",
	"", // _punc_end
};
  109. precedence :: proc(op: Kind) -> int {
  110. switch op {
  111. case Question:
  112. return 1;
  113. case Or:
  114. return 2;
  115. case And:
  116. return 3;
  117. case Eq, NotEq, Lt, Gt, LtEq, GtEq:
  118. return 4;
  119. case Add, Sub:
  120. return 5;
  121. case Mul, Quo, Rem:
  122. return 6;
  123. }
  124. return 0;
  125. }
  126. token_lookup :: proc(ident: string) -> Kind {
  127. if tok, is_keyword := keywords[ident]; is_keyword {
  128. return tok;
  129. }
  130. return Ident;
  131. }
  132. is_literal :: proc(tok: Kind) -> bool do return _literal_start < tok && tok < _literal_end;
  133. is_operator :: proc(tok: Kind) -> bool do return _operator_start < tok && tok < _operator_end;
  134. is_keyword :: proc(tok: Kind) -> bool do return _keyword_start < tok && tok < _keyword_end;
  135. tokenizer_init :: proc(t: ^Tokenizer, src: []byte, file := "") {
  136. t.src = src;
  137. t.file = file;
  138. t.curr_rune = ' ';
  139. t.offset = 0;
  140. t.read_offset = 0;
  141. t.line_offset = 0;
  142. t.line_count = 1;
  143. advance_to_next_rune(t);
  144. if t.curr_rune == utf8.RUNE_BOM {
  145. advance_to_next_rune(t);
  146. }
  147. }
  148. token_error :: proc(t: ^Tokenizer, msg: string, args: ..any) {
  149. fmt.printf_err("%s(%d:%d) Error: ", t.file, t.line_count, t.read_offset-t.line_offset+1);
  150. fmt.printf_err(msg, ..args);
  151. fmt.println_err();
  152. t.error_count += 1;
  153. }
// advance_to_next_rune reads the next UTF-8 rune from t.src into t.curr_rune,
// advancing t.offset/t.read_offset and maintaining line bookkeeping.
// Invalid input (NUL, bad UTF-8, non-leading BOM) is reported via token_error
// but still consumed. Once the input is exhausted, curr_rune stays RUNE_EOF.
advance_to_next_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset;
		// The rune we are leaving was a newline, so the new offset
		// begins a fresh line — record it before decoding.
		if t.curr_rune == '\n' {
			t.line_offset = t.offset;
			t.line_count += 1;
		}
		r, w := rune(t.src[t.read_offset]), 1;
		switch {
		case r == 0:
			token_error(t, "Illegal character NUL");
		case r >= utf8.RUNE_SELF:
			// Not plain ASCII: decode the full multi-byte sequence.
			r, w = utf8.decode_rune(t.src[t.read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				token_error(t, "Illegal utf-8 encoding");
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				// A BOM is only tolerated at the very start of the file.
				token_error(t, "Illegal byte order mark");
			}
		}
		t.read_offset += w;
		t.curr_rune = r;
	} else {
		// End of input: still account for a trailing newline so the
		// final line count is correct, then pin curr_rune to EOF.
		t.offset = len(t.src);
		if t.curr_rune == '\n' {
			t.line_offset = t.offset;
			t.line_count += 1;
		}
		t.curr_rune = utf8.RUNE_EOF;
	}
}
  184. get_pos :: proc(t: ^Tokenizer) -> Pos {
  185. return Pos {
  186. file = t.file,
  187. line = t.line_count,
  188. column = t.offset - t.line_offset + 1,
  189. };
  190. }
  191. is_letter :: proc(r: rune) -> bool {
  192. switch r {
  193. case 'a'..'z', 'A'..'Z', '_':
  194. return true;
  195. }
  196. return false;
  197. }
  198. is_digit :: proc(r: rune) -> bool {
  199. switch r {
  200. case '0'..'9':
  201. return true;
  202. }
  203. return false;
  204. }
  205. skip_whitespace :: proc(t: ^Tokenizer) {
  206. loop: for {
  207. switch t.curr_rune {
  208. case '\n':
  209. if t.insert_semi {
  210. break loop;
  211. }
  212. fallthrough;
  213. case ' ', '\t', '\r', '\v', '\f':
  214. advance_to_next_rune(t);
  215. case:
  216. break loop;
  217. }
  218. }
  219. }
  220. scan_identifier :: proc(t: ^Tokenizer) -> string {
  221. offset := t.offset;
  222. for is_letter(t.curr_rune) || is_digit(t.curr_rune) {
  223. advance_to_next_rune(t);
  224. }
  225. return string(t.src[offset : t.offset]);
  226. }
  227. digit_value :: proc(r: rune) -> int {
  228. switch r {
  229. case '0'..'9': return int(r - '0');
  230. case 'a'..'f': return int(r - 'a' + 10);
  231. case 'A'..'F': return int(r - 'A' + 10);
  232. }
  233. return 16;
  234. }
// scan_number lexes an integer or float literal starting at t.offset.
// seen_decimal_point is true when the caller already consumed a leading '.'
// (the literal began ".123"); the returned literal then includes that dot.
// Returns the token kind (Integer or Float) and the literal text.
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Kind, string) {
	// scan_mantissa consumes runes that are valid digits in `base`;
	// '_' is accepted anywhere in the run as a digit separator.
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_value(t.curr_rune) < base || t.curr_rune == '_' {
			advance_to_next_rune(t);
		}
	}
	// scan_exponent consumes an optional e/E exponent (with optional sign)
	// and returns the final kind plus the literal slice starting at `offset`.
	scan_exponent :: proc(t: ^Tokenizer, tok: Kind, offset: int) -> (Kind, string) {
		if t.curr_rune == 'e' || t.curr_rune == 'E' {
			tok = Float; // an exponent always makes the literal a float
			advance_to_next_rune(t);
			if t.curr_rune == '-' || t.curr_rune == '+' {
				advance_to_next_rune(t);
			}
			if digit_value(t.curr_rune) < 10 {
				scan_mantissa(t, 10);
			} else {
				// 'e' with no digit after it (or after the sign)
				token_error(t, "Illegal floating point exponent");
			}
		}
		return tok, string(t.src[offset : t.offset]);
	}
	// scan_fraction consumes an optional ".digits" part, then any exponent.
	scan_fraction :: proc(t: ^Tokenizer, tok: Kind, offset: int) -> (Kind, string) {
		if t.curr_rune == '.' {
			tok = Float;
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
		}
		return scan_exponent(t, tok, offset);
	}

	offset := t.offset;
	tok := Integer;

	if seen_decimal_point {
		offset -= 1; // include the '.' the caller already consumed
		tok = Float;
		scan_mantissa(t, 10);
		return scan_exponent(t, tok, offset);
	}

	if t.curr_rune == '0' {
		offset = t.offset; // (no-op: offset already equals t.offset here)
		advance_to_next_rune(t);
		switch t.curr_rune {
		case 'b', 'B':
			// Binary literal: "0b..."
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			// Only the "0b" prefix was consumed — no digits followed.
			// NOTE(review): "0b_" would slip past this length check
			// because '_' counts toward it — confirm if intended.
			if t.offset - offset <= 2 {
				token_error(t, "Illegal binary number");
			}
		case 'o', 'O':
			// Octal literal: "0o..."
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if t.offset - offset <= 2 {
				token_error(t, "Illegal octal number");
			}
		case 'x', 'X':
			// Hexadecimal literal: "0x..."
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if t.offset - offset <= 2 {
				token_error(t, "Illegal hexadecimal number");
			}
		case:
			// Plain number with a leading zero, e.g. "0", "0123", "0.5"
			scan_mantissa(t, 10);
			switch t.curr_rune {
			case '.', 'e', 'E':
				return scan_fraction(t, tok, offset);
			}
		}
		return tok, string(t.src[offset:t.offset]);
	}

	// Ordinary decimal literal: digits, optional fraction, optional exponent.
	scan_mantissa(t, 10);
	return scan_fraction(t, tok, offset);
}
  306. scan :: proc(t: ^Tokenizer) -> Token {
  307. skip_whitespace(t);
  308. offset := t.offset;
  309. tok: Kind;
  310. pos := get_pos(t);
  311. lit: string;
  312. insert_semi := false;
  313. switch r := t.curr_rune; {
  314. case is_letter(r):
  315. insert_semi = true;
  316. lit = scan_identifier(t);
  317. tok = Ident;
  318. if len(lit) > 1 {
  319. tok = token_lookup(lit);
  320. }
  321. case '0' <= r && r <= '9':
  322. insert_semi = true;
  323. tok, lit = scan_number(t, false);
  324. case:
  325. advance_to_next_rune(t);
  326. switch r {
  327. case -1:
  328. if t.insert_semi {
  329. t.insert_semi = false;
  330. return Token{Semicolon, pos, "\n"};
  331. }
  332. return Token{EOF, pos, "\n"};
  333. case '\n':
  334. t.insert_semi = false;
  335. return Token{Semicolon, pos, "\n"};
  336. case '"':
  337. insert_semi = true;
  338. quote := r;
  339. tok = String;
  340. for {
  341. this_r := t.curr_rune;
  342. if this_r == '\n' || r < 0 {
  343. token_error(t, "String literal not terminated");
  344. break;
  345. }
  346. advance_to_next_rune(t);
  347. if this_r == quote {
  348. break;
  349. }
  350. // TODO(bill); Handle properly
  351. if this_r == '\\' && t.curr_rune == quote {
  352. advance_to_next_rune(t);
  353. }
  354. }
  355. lit = string(t.src[offset+1:t.offset-1]);
  356. case '#':
  357. for t.curr_rune != '\n' && t.curr_rune >= 0 {
  358. advance_to_next_rune(t);
  359. }
  360. if t.insert_semi {
  361. t.insert_semi = false;
  362. return Token{Semicolon, pos, "\n"};
  363. }
  364. // Recursive!
  365. return scan(t);
  366. case '?': tok = Question;
  367. case ':': tok = Colon;
  368. case '@': tok = At;
  369. case ';':
  370. tok = Semicolon;
  371. lit = ";";
  372. case ',': tok = Comma;
  373. case '(':
  374. tok = Open_Paren;
  375. case ')':
  376. insert_semi = true;
  377. tok = Close_Paren;
  378. case '[':
  379. tok = Open_Bracket;
  380. case ']':
  381. insert_semi = true;
  382. tok = Close_Bracket;
  383. case '{':
  384. tok = Open_Brace;
  385. case '}':
  386. insert_semi = true;
  387. tok = Close_Brace;
  388. case '+': tok = Add;
  389. case '-': tok = Sub;
  390. case '*': tok = Mul;
  391. case '/': tok = Quo;
  392. case '%': tok = Rem;
  393. case '!':
  394. tok = Not;
  395. if t.curr_rune == '=' {
  396. advance_to_next_rune(t);
  397. tok = NotEq;
  398. }
  399. case '=':
  400. tok = Assign;
  401. if t.curr_rune == '=' {
  402. advance_to_next_rune(t);
  403. tok = Eq;
  404. }
  405. case '<':
  406. tok = Lt;
  407. if t.curr_rune == '=' {
  408. advance_to_next_rune(t);
  409. tok = LtEq;
  410. }
  411. case '>':
  412. tok = Gt;
  413. if t.curr_rune == '=' {
  414. advance_to_next_rune(t);
  415. tok = GtEq;
  416. }
  417. case '.':
  418. if '0' <= t.curr_rune && t.curr_rune <= '9' {
  419. insert_semi = true;
  420. tok, lit = scan_number(t, true);
  421. } else {
  422. tok = Period;
  423. }
  424. case:
  425. if r != utf8.RUNE_BOM {
  426. token_error(t, "Illegal character '%r'", r);
  427. }
  428. insert_semi = t.insert_semi;
  429. tok = Illegal;
  430. }
  431. }
  432. t.insert_semi = insert_semi;
  433. if lit == "" {
  434. lit = string(t.src[offset:t.offset]);
  435. }
  436. return Token{tok, pos, lit};
  437. }