// token.odin — lexer for the cel configuration language.

package cel

import "core:fmt"
import "core:unicode/utf8"
// Kind classifies each lexical token produced by the tokenizer.
// NOTE: declaration order is load-bearing — kind_to_string is indexed by
// ordinal position, and the _*_start/_*_end sentinels bound the range
// checks in is_literal/is_keyword/is_operator.
using Kind :: enum {
	Illegal,
	Comment,
	EOF,
	_literal_start,
	Ident,
	Integer,
	Float,
	Char,
	String,
	_literal_end,
	_keyword_start,
	True,  // true
	False, // false
	Nil,   // nil
	_keyword_end,
	_operator_start,
	Question, // ?
	And,      // and
	Or,       // or
	Add,      // +
	Sub,      // -
	Mul,      // *
	Quo,      // /
	Rem,      // %
	Not,      // !
	Eq,       // ==
	NotEq,    // !=
	Lt,       // <
	Gt,       // >
	LtEq,     // <=
	GtEq,     // >=
	At,       // @
	_operator_end,
	_punc_start,
	Assign,        // =
	Open_Paren,    // (
	Close_Paren,   // )
	Open_Bracket,  // [
	Close_Bracket, // ]
	Open_Brace,    // {
	Close_Brace,   // }
	Colon,     // :
	Semicolon, // ;
	Comma,     // ,
	Period,    // .
	_punc_end,
}
// Pos is a human-readable source location; line and column are 1-based.
Pos :: struct {
	file: string,
	line: int,
	column: int,
}
// Token is a single lexeme: its kind, the position of its first rune,
// and its literal text as it appeared in the source.
Token :: struct {
	kind: Kind,
	using pos: Pos, // position of the token's first rune
	lit: string,    // literal text (strings have their quotes stripped)
}
// Tokenizer holds all scanning state for one source buffer.
Tokenizer :: struct {
	src: []byte,       // entire input being scanned
	file: string, // May not be used
	curr_rune: rune,   // current decoded rune; utf8.RUNE_EOF at end of input
	offset: int,       // byte offset of curr_rune within src
	read_offset: int,  // byte offset of the rune after curr_rune
	line_offset: int,  // byte offset of the start of the current line
	line_count: int,   // 1-based line number of curr_rune
	insert_semi: bool, // whether a newline should yield an implicit semicolon
	error_count: int,  // number of errors reported via token_error
}
// keywords maps identifier spellings to their keyword/operator token
// kinds; consulted by token_lookup after an identifier is scanned.
keywords := map[string]Kind{
	"true" = True,
	"false" = False,
	"nil" = Nil,
	"and" = And,
	"or" = Or,
};
// kind_to_string gives the display spelling for each Kind, indexed by the
// enum's ordinal value; sentinel kinds map to "".  Must stay in exact
// sync with the declaration order of Kind (46 entries).
kind_to_string := [len(Kind)]string{
	"illegal",
	"EOF",
	"comment",
	"", // _literal_start
	"identifier",
	"integer",
	"float",
	"character",
	"string",
	"", // _literal_end
	"", // _keyword_start
	"true", "false", "nil",
	"", // _keyword_end
	"", // _operator_start
	"?", "and", "or",
	"+", "-", "*", "/", "%",
	"!",
	"==", "!=", "<", ">", "<=", ">=",
	"@",
	"", // _operator_end
	"", // _punc_start
	"=",
	"(", ")",
	"[", "]",
	"{", "}",
	":", ";", ",", ".",
	"", // _punc_end
};
  109. precedence :: proc(op: Kind) -> int {
  110. #partial switch op {
  111. case Question:
  112. return 1;
  113. case Or:
  114. return 2;
  115. case And:
  116. return 3;
  117. case Eq, NotEq, Lt, Gt, LtEq, GtEq:
  118. return 4;
  119. case Add, Sub:
  120. return 5;
  121. case Mul, Quo, Rem:
  122. return 6;
  123. }
  124. return 0;
  125. }
  126. token_lookup :: proc(ident: string) -> Kind {
  127. if tok, is_keyword := keywords[ident]; is_keyword {
  128. return tok;
  129. }
  130. return Ident;
  131. }
  132. is_literal :: proc(tok: Kind) -> bool do return _literal_start < tok && tok < _literal_end;
  133. is_operator :: proc(tok: Kind) -> bool do return _operator_start < tok && tok < _operator_end;
  134. is_keyword :: proc(tok: Kind) -> bool do return _keyword_start < tok && tok < _keyword_end;
  135. tokenizer_init :: proc(t: ^Tokenizer, src: []byte, file := "") {
  136. t.src = src;
  137. t.file = file;
  138. t.curr_rune = ' ';
  139. t.offset = 0;
  140. t.read_offset = 0;
  141. t.line_offset = 0;
  142. t.line_count = 1;
  143. advance_to_next_rune(t);
  144. if t.curr_rune == utf8.RUNE_BOM {
  145. advance_to_next_rune(t);
  146. }
  147. }
// token_error prints a formatted "file(line:col) Error: ..." message to
// stderr and increments t.error_count; scanning continues afterwards.
// NOTE(review): the column here derives from t.read_offset while get_pos
// uses t.offset — the two can differ by one rune width; confirm intended.
token_error :: proc(t: ^Tokenizer, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) Error: ", t.file, t.line_count, t.read_offset-t.line_offset+1);
	fmt.eprintf(msg, ..args);
	fmt.eprintln();
	t.error_count += 1;
}
// advance_to_next_rune decodes the next UTF-8 rune of t.src into
// t.curr_rune, advancing t.offset/t.read_offset and maintaining the line
// bookkeeping.  At end of input, curr_rune becomes utf8.RUNE_EOF and the
// proc is a no-op thereafter.  NUL bytes, bad UTF-8, and a BOM anywhere
// but the start of the file are reported via token_error.
advance_to_next_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset;
		// The previous rune was a newline: this offset starts a new line.
		if t.curr_rune == '\n' {
			t.line_offset = t.offset;
			t.line_count += 1;
		}
		r, w := rune(t.src[t.read_offset]), 1;
		switch {
		case r == 0:
			token_error(t, "Illegal character NUL");
		case r >= utf8.RUNE_SELF:
			// Multi-byte sequence: decode it properly.
			r, w = utf8.decode_rune(t.src[t.read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				token_error(t, "Illegal utf-8 encoding");
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				// A BOM is only tolerated as the very first rune of the file.
				token_error(t, "Illegal byte order mark");
			}
		}
		t.read_offset += w;
		t.curr_rune = r;
	} else {
		t.offset = len(t.src);
		// Account for a trailing newline that was the last rune consumed.
		if t.curr_rune == '\n' {
			t.line_offset = t.offset;
			t.line_count += 1;
		}
		t.curr_rune = utf8.RUNE_EOF;
	}
}
  184. get_pos :: proc(t: ^Tokenizer) -> Pos {
  185. return Pos {
  186. file = t.file,
  187. line = t.line_count,
  188. column = t.offset - t.line_offset + 1,
  189. };
  190. }
  191. is_letter :: proc(r: rune) -> bool {
  192. switch r {
  193. case 'a'..'z', 'A'..'Z', '_':
  194. return true;
  195. }
  196. return false;
  197. }
  198. is_digit :: proc(r: rune) -> bool {
  199. switch r {
  200. case '0'..'9':
  201. return true;
  202. }
  203. return false;
  204. }
// skip_whitespace consumes blanks up to the next significant rune.  A
// newline terminates the skip when insert_semi is set, so that scan can
// turn it into an implicit Semicolon token.
skip_whitespace :: proc(t: ^Tokenizer) {
	loop: for {
		switch t.curr_rune {
		case '\n':
			if t.insert_semi {
				break loop;
			}
			// Newlines without a pending semicolon are plain whitespace.
			fallthrough;
		case ' ', '\t', '\r', '\v', '\f':
			advance_to_next_rune(t);
		case:
			break loop;
		}
	}
}
  220. scan_identifier :: proc(t: ^Tokenizer) -> string {
  221. offset := t.offset;
  222. for is_letter(t.curr_rune) || is_digit(t.curr_rune) {
  223. advance_to_next_rune(t);
  224. }
  225. return string(t.src[offset : t.offset]);
  226. }
  227. digit_value :: proc(r: rune) -> int {
  228. switch r {
  229. case '0'..'9': return int(r - '0');
  230. case 'a'..'f': return int(r - 'a' + 10);
  231. case 'A'..'F': return int(r - 'A' + 10);
  232. }
  233. return 16;
  234. }
// scan_number scans an integer or float literal starting at the current
// rune and returns its kind and literal text.  seen_decimal_point is
// true when the caller already consumed a leading '.', in which case a
// Float is scanned and the '.' is folded back into the text (offset -= 1).
// Supports 0b/0o/0x prefixes, '_' digit separators, and e/E exponents.
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Kind, string) {
	// Consume digits valid in `base`, plus '_' separators.
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_value(t.curr_rune) < base || t.curr_rune == '_' {
			advance_to_next_rune(t);
		}
	}
	// Consume an optional e/E[+-]digits exponent; promotes kind to Float.
	scan_exponent :: proc(t: ^Tokenizer, tok: Kind, offset: int) -> (kind: Kind, text: string) {
		kind = tok;
		if t.curr_rune == 'e' || t.curr_rune == 'E' {
			kind = Float;
			advance_to_next_rune(t);
			if t.curr_rune == '-' || t.curr_rune == '+' {
				advance_to_next_rune(t);
			}
			if digit_value(t.curr_rune) < 10 {
				scan_mantissa(t, 10);
			} else {
				token_error(t, "Illegal floating point exponent");
			}
		}
		text = string(t.src[offset : t.offset]);
		return;
	}
	// Consume an optional .digits fraction, then hand off to scan_exponent.
	scan_fraction :: proc(t: ^Tokenizer, tok: Kind, offset: int) -> (kind: Kind, text: string) {
		kind = tok;
		if t.curr_rune == '.' {
			kind = Float;
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
		}
		return scan_exponent(t, kind, offset);
	}
	offset := t.offset;
	tok := Integer;
	if seen_decimal_point {
		// Caller already consumed the '.'; include it in the literal text.
		offset -= 1;
		tok = Float;
		scan_mantissa(t, 10);
		return scan_exponent(t, tok, offset);
	}
	if t.curr_rune == '0' {
		offset = t.offset;
		advance_to_next_rune(t);
		switch t.curr_rune {
		case 'b', 'B':
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			// A bare "0b" prefix is only 2 bytes long: no digits followed.
			if t.offset - offset <= 2 {
				token_error(t, "Illegal binary number");
			}
		case 'o', 'O':
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if t.offset - offset <= 2 {
				token_error(t, "Illegal octal number");
			}
		case 'x', 'X':
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if t.offset - offset <= 2 {
				token_error(t, "Illegal hexadecimal number");
			}
		case:
			// Decimal number with a leading zero; may still be a float.
			scan_mantissa(t, 10);
			switch t.curr_rune {
			case '.', 'e', 'E':
				return scan_fraction(t, tok, offset);
			}
		}
		return tok, string(t.src[offset:t.offset]);
	}
	scan_mantissa(t, 10);
	return scan_fraction(t, tok, offset);
}
  309. scan :: proc(t: ^Tokenizer) -> Token {
  310. skip_whitespace(t);
  311. offset := t.offset;
  312. tok: Kind;
  313. pos := get_pos(t);
  314. lit: string;
  315. insert_semi := false;
  316. switch r := t.curr_rune; {
  317. case is_letter(r):
  318. insert_semi = true;
  319. lit = scan_identifier(t);
  320. tok = Ident;
  321. if len(lit) > 1 {
  322. tok = token_lookup(lit);
  323. }
  324. case '0' <= r && r <= '9':
  325. insert_semi = true;
  326. tok, lit = scan_number(t, false);
  327. case:
  328. advance_to_next_rune(t);
  329. switch r {
  330. case -1:
  331. if t.insert_semi {
  332. t.insert_semi = false;
  333. return Token{Semicolon, pos, "\n"};
  334. }
  335. return Token{EOF, pos, "\n"};
  336. case '\n':
  337. t.insert_semi = false;
  338. return Token{Semicolon, pos, "\n"};
  339. case '"':
  340. insert_semi = true;
  341. quote := r;
  342. tok = String;
  343. for {
  344. this_r := t.curr_rune;
  345. if this_r == '\n' || r < 0 {
  346. token_error(t, "String literal not terminated");
  347. break;
  348. }
  349. advance_to_next_rune(t);
  350. if this_r == quote {
  351. break;
  352. }
  353. // TODO(bill); Handle properly
  354. if this_r == '\\' && t.curr_rune == quote {
  355. advance_to_next_rune(t);
  356. }
  357. }
  358. lit = string(t.src[offset+1:t.offset-1]);
  359. case '#':
  360. for t.curr_rune != '\n' && t.curr_rune >= 0 {
  361. advance_to_next_rune(t);
  362. }
  363. if t.insert_semi {
  364. t.insert_semi = false;
  365. return Token{Semicolon, pos, "\n"};
  366. }
  367. // Recursive!
  368. return scan(t);
  369. case '?': tok = Question;
  370. case ':': tok = Colon;
  371. case '@': tok = At;
  372. case ';':
  373. tok = Semicolon;
  374. lit = ";";
  375. case ',': tok = Comma;
  376. case '(':
  377. tok = Open_Paren;
  378. case ')':
  379. insert_semi = true;
  380. tok = Close_Paren;
  381. case '[':
  382. tok = Open_Bracket;
  383. case ']':
  384. insert_semi = true;
  385. tok = Close_Bracket;
  386. case '{':
  387. tok = Open_Brace;
  388. case '}':
  389. insert_semi = true;
  390. tok = Close_Brace;
  391. case '+': tok = Add;
  392. case '-': tok = Sub;
  393. case '*': tok = Mul;
  394. case '/': tok = Quo;
  395. case '%': tok = Rem;
  396. case '!':
  397. tok = Not;
  398. if t.curr_rune == '=' {
  399. advance_to_next_rune(t);
  400. tok = NotEq;
  401. }
  402. case '=':
  403. tok = Assign;
  404. if t.curr_rune == '=' {
  405. advance_to_next_rune(t);
  406. tok = Eq;
  407. }
  408. case '<':
  409. tok = Lt;
  410. if t.curr_rune == '=' {
  411. advance_to_next_rune(t);
  412. tok = LtEq;
  413. }
  414. case '>':
  415. tok = Gt;
  416. if t.curr_rune == '=' {
  417. advance_to_next_rune(t);
  418. tok = GtEq;
  419. }
  420. case '.':
  421. if '0' <= t.curr_rune && t.curr_rune <= '9' {
  422. insert_semi = true;
  423. tok, lit = scan_number(t, true);
  424. } else {
  425. tok = Period;
  426. }
  427. case:
  428. if r != utf8.RUNE_BOM {
  429. token_error(t, "Illegal character '%r'", r);
  430. }
  431. insert_semi = t.insert_semi;
  432. tok = Illegal;
  433. }
  434. }
  435. t.insert_semi = insert_semi;
  436. if lit == "" {
  437. lit = string(t.src[offset:t.offset]);
  438. }
  439. return Token{tok, pos, lit};
  440. }