tokenizer.odin

package odin_tokenizer

import "core:fmt"
import "core:odin/token"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: token.Pos, fmt: string, args: ..any);

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  []byte,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}
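
// init resets the tokenizer to the start of src and primes t.ch with the
// first rune, skipping a leading UTF-8 byte order mark if one is present.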
init :: proc(t: ^Tokenizer, src: []byte, path: string, err: Error_Handler = default_error_handler) {
	t.src = src;
	t.err = err;
	t.ch = ' ';
	t.offset = 0;
	t.read_offset = 0;
	t.line_offset = 0;
	t.line_count = len(src) > 0 ? 1 : 0;
	t.error_count = 0;
	t.path = path;

	advance_rune(t);
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t);
	}
}
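
// offset_to_pos converts a byte offset into a file/line/column position.
// It uses the tokenizer's current line state, so the offset is expected to
// lie on the line currently being scanned.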
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> token.Pos {
	line := t.line_count;
	column := offset - t.line_offset + 1;

	return token.Pos {
		file = t.path,
		offset = offset,
		line = line,
		column = column,
	};
}

default_error_handler :: proc(pos: token.Pos, msg: string, args: ..any) {
	fmt.printf_err("%s(%d:%d) ", pos.file, pos.line, pos.column);
	fmt.printf_err(msg, ..args);
	fmt.printf_err("\n");
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset);
	if t.err != nil {
		t.err(pos, msg, ..args);
	}
	t.error_count += 1;
}
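
// advance_rune decodes the next UTF-8 rune from src into t.ch, keeping
// offset/read_offset and the line bookkeeping up to date. At end of file,
// t.ch is set to -1.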
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset;
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		r, w := rune(src[read_offset]), 1;
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL");
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(src[read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding");
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark");
			}
		}
		read_offset += w;
		ch = r;
	} else {
		offset = len(src);
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		ch = -1;
	}
}
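
// peek_byte returns the byte `offset` bytes past the current read position
// without advancing the tokenizer, or 0 when past the end of src.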
peek_byte :: proc(using t: ^Tokenizer, offset := 0) -> byte {
	if read_offset+offset < len(src) {
		return src[read_offset+offset];
	}
	return 0;
}

skip_whitespace :: proc(t: ^Tokenizer) {
	for t.ch == ' ' ||
	    t.ch == '\t' ||
	    t.ch == '\n' ||
	    t.ch == '\r' {
		advance_rune(t);
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true;
		case 'A'..'Z', 'a'..'z':
			return true;
		}
	}
	// TODO(bill): Add unicode lookup tables
	return false;
}

is_digit :: proc(r: rune) -> bool {
	// TODO(bill): Add unicode lookup tables
	return '0' <= r && r <= '9';
}
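
// scan_comment is called with the first delimiter character already consumed.
// It handles line comments (`//` and `#!`) and block comments (`/* */`) and
// returns the comment text including its delimiters, with any trailing
// carriage returns stripped from line comments.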
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	next := -1;
	general: {
		if t.ch == '/' || t.ch == '!' { // // #! comments
			advance_rune(t);
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t);
			}

			next = t.offset;
			if t.ch == '\n' {
				next += 1;
			}
			break general;
		}

		/* style comment */
		advance_rune(t);
		for t.ch >= 0 {
			ch := t.ch;
			advance_rune(t);
			if ch == '*' && t.ch == '/' {
				advance_rune(t);
				next = t.offset;
				break general;
			}
		}

		error(t, offset, "comment not terminated");
	}

	lit := t.src[offset : t.offset];

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1];
	}

	return string(lit);
}
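
// scan_identifier consumes letters, digits, and underscores starting at the
// current rune and returns the identifier text.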
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset;

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t);
	}

	return string(t.src[offset : t.offset]);
}
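
// scan_string is called with the opening '"' already consumed. It scans up to
// the closing quote, validating escape sequences, and returns the literal
// including both quotes. A newline or end of file terminates it with an error.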
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '"' {
			break;
		}
		if ch == '\\' {
			scan_escape(t);
		}
	}

	return string(t.src[offset : t.offset]);
}
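
// scan_raw_string scans a `-delimited literal with the opening backtick
// already consumed. No escape processing is performed; in this version a
// newline still terminates the literal with an error.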
scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "raw string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '`' {
			break;
		}
	}

	return string(t.src[offset : t.offset]);
}
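
// digit_val maps a hexadecimal digit to its numeric value. Any other rune
// maps to 16 so that callers can simply test digit_val(r) < base for any
// base up to 16.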
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..'9':
		return int(r-'0');
	case 'A'..'F':
		return int(r-'A' + 10);
	case 'a'..'f':
		return int(r-'a' + 10);
	}
	return 16;
}
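
// scan_escape validates a single escape sequence after a '\'. n is the number
// of digits expected, base their radix, and max the largest legal value:
// octal escapes take 3 digits up to 255, \x takes 2 hex digits, \u 4, and
// \U 8, with the decoded value checked against the valid Unicode range.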
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset;

	n: int;
	base, max: u32;
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', '\\', '\'', '\"':
		advance_rune(t);
		return true;

	case '0'..'7':
		n, base, max = 3, 8, 255;
	case 'x':
		advance_rune(t);
		n, base, max = 2, 16, 255;
	case 'u':
		advance_rune(t);
		n, base, max = 4, 16, utf8.MAX_RUNE;
	case 'U':
		advance_rune(t);
		n, base, max = 8, 16, utf8.MAX_RUNE;
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated");
		} else {
			error(t, offset, "unknown escape sequence");
		}
		return false;
	}

	x: u32;
	for n > 0 {
		d := u32(digit_val(t.ch));
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated");
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch);
			}
			return false;
		}

		x = x*base + d;
		advance_rune(t);
		n -= 1;
	}

	// Surrogates (U+D800..U+DFFF) are not valid code points
	if x > max || (0xd800 <= x && x < 0xe000) {
		error(t, offset, "escape sequence is an invalid Unicode code point");
		return false;
	}

	return true;
}
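
// scan_rune is called with the opening '\'' already consumed. It scans to the
// closing quote and reports an error if the literal does not contain exactly
// one rune.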
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	valid := true;
	n := 0;
	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated");
				valid = false;
			}
			break;
		}
		advance_rune(t);
		if ch == '\'' {
			break;
		}
		n += 1;
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false;
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal");
	}

	return string(t.src[offset : t.offset]);
}
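
// scan_number scans integer, floating-point, and imaginary literals.
// seen_decimal_point is true when the caller has already consumed a leading
// '.', as in `.5`. Prefixed forms (0b, 0o, 0d, 0z, 0x, 0h) are handled in the
// '0' branch below.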
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (token.Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t);
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^token.Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = token.Float;
			advance_rune(t);
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t);
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10);
			} else {
				error(t, t.offset, "illegal floating-point exponent");
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		if t.ch == 'i' {
			kind^ = token.Imag;
			advance_rune(t);
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^token.Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true;
		}
		if t.ch == '.' {
			kind^ = token.Float;
			advance_rune(t);
			scan_mantissa(t, 10);
		}
		return false;
	}

	offset := t.offset;
	kind := token.Integer;

	if seen_decimal_point {
		offset -= 1;
		kind = token.Float;
		scan_mantissa(t, 10);
		scan_exponent(t, &kind);
	} else {
		if t.ch == '0' {
			int_base :: inline proc(t: ^Tokenizer, kind: ^token.Kind, base: int, msg: string) {
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, base);
				if t.offset - prev <= 1 {
					kind^ = token.Invalid;
					error(t, t.offset, msg);
				}
			}

			advance_rune(t);
			switch t.ch {
			case 'b': int_base(t, &kind,  2, "illegal binary integer");
			case 'o': int_base(t, &kind,  8, "illegal octal integer");
			case 'd': int_base(t, &kind, 10, "illegal decimal integer");
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer");
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer");
			case 'h':
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, 16);
				if t.offset - prev <= 1 {
					kind = token.Invalid;
					error(t, t.offset, "illegal hexadecimal floating-point number");
				} else {
					sub := t.src[prev+1 : t.offset];
					digit_count := 0;
					for d in sub {
						if d != '_' {
							digit_count += 1;
						}
					}

					switch digit_count {
					case 8, 16: break;
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 8 or 16 digits, got %d", digit_count);
					}
				}
			case:
				seen_decimal_point = false;
				scan_mantissa(t, 10);

				if t.ch == '.' {
					seen_decimal_point = true;
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset]);
					}
				}

				scan_exponent(t, &kind);
				return kind, string(t.src[offset : t.offset]);
			}
		}
	}

	scan_mantissa(t, 10);

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset]);
	}

	scan_exponent(t, &kind);

	return kind, string(t.src[offset : t.offset]);
}
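
// scan skips whitespace and returns the next token. The switch2/switch3/
// switch4 helpers disambiguate operators that share a prefix: for example,
// switch4 turns '<' into Lt, Lt_Eq, Shl, or Shl_Eq depending on what follows.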
scan :: proc(t: ^Tokenizer) -> token.Token {
	switch2 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind) -> token.Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		return tok0;
	}
	switch3 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind, ch2: rune, tok2: token.Kind) -> token.Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			return tok2;
		}
		return tok0;
	}
	switch4 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind, ch2: rune, tok2, tok3: token.Kind) -> token.Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			if t.ch == '=' {
				advance_rune(t);
				return tok3;
			}
			return tok2;
		}
		return tok0;
	}

	skip_whitespace(t);

	offset := t.offset;

	kind: token.Kind;
	lit: string;
	pos := offset_to_pos(t, offset);

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t);
		kind = token.Ident;
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in token.B_Keyword_Begin .. token.B_Keyword_End {
				if lit == token.tokens[i] {
					kind = token.Kind(i);
					break check_keyword;
				}
			}
			for keyword, i in token.custom_keyword_tokens {
				if lit == keyword {
					kind = token.Kind(i+1)+token.B_Custom_Keyword_Begin;
					break check_keyword;
				}
			}
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false);
	case:
		advance_rune(t);
		switch ch {
		case -1:
			kind = token.EOF;
		case '"':
			kind = token.String;
			lit = scan_string(t);
		case '\'':
			kind = token.Rune;
			lit = scan_rune(t);
		case '`':
			kind = token.String;
			lit = scan_raw_string(t);
		case '=':
			if t.ch == '>' {
				advance_rune(t);
				kind = token.Double_Arrow_Right;
			} else {
				kind = switch2(t, token.Eq, token.Cmp_Eq);
			}
		case '!': kind = switch2(t, token.Not, token.Not_Eq);
		case '#':
			kind = token.Hash;
			if t.ch == '!' {
				kind = token.Comment;
				lit = scan_comment(t);
			}
		case '?': kind = token.Question;
		case '@': kind = token.At;
		case '$': kind = token.Dollar;
		case '^': kind = token.Pointer;
		case '+': kind = switch2(t, token.Add, token.Add_Eq);
		case '-':
			if t.ch == '>' {
				advance_rune(t);
				kind = token.Arrow_Right;
			} else if t.ch == '-' && peek_byte(t) == '-' {
				advance_rune(t);
				advance_rune(t);
				kind = token.Undef;
			} else {
				kind = switch2(t, token.Sub, token.Sub_Eq);
			}
		case '*': kind = switch2(t, token.Mul, token.Mul_Eq);
		case '/':
			if t.ch == '/' || t.ch == '*' {
				kind = token.Comment;
				lit = scan_comment(t);
			} else {
				kind = switch2(t, token.Quo, token.Quo_Eq);
			}
		case '%': kind = switch4(t, token.Mod, token.Mod_Eq, '%', token.Mod_Mod, token.Mod_Mod_Eq);
		case '&':
			if t.ch == '~' {
				advance_rune(t);
				kind = switch2(t, token.And_Not, token.And_Not_Eq);
			} else {
				kind = switch3(t, token.And, token.And_Eq, '&', token.Cmp_And);
			}
		case '|': kind = switch3(t, token.Or, token.Or_Eq, '|', token.Cmp_Or);
		case '~': kind = token.Xor;
		case '<':
			if t.ch == '-' {
				advance_rune(t);
				kind = token.Arrow_Left;
			} else {
				kind = switch4(t, token.Lt, token.Lt_Eq, '<', token.Shl, token.Shl_Eq);
			}
		case '>': kind = switch4(t, token.Gt, token.Gt_Eq, '>', token.Shr, token.Shr_Eq);

		case '≠': kind = token.Not_Eq;
		case '≤': kind = token.Lt_Eq;
		case '≥': kind = token.Gt_Eq;
		case '∈': kind = token.In;
		case '∉': kind = token.Notin;

		case '.':
			if '0' <= t.ch && t.ch <= '9' {
				kind, lit = scan_number(t, true);
			} else {
				kind = token.Period;
				if t.ch == '.' {
					advance_rune(t);
					kind = token.Ellipsis;
					if t.ch == '<' {
						advance_rune(t);
						kind = token.Range_Half;
					}
				}
			}
		case ':': kind = token.Colon;
		case ',': kind = token.Comma;
		case ';': kind = token.Semicolon;
		case '(': kind = token.Open_Paren;
		case ')': kind = token.Close_Paren;
		case '[': kind = token.Open_Bracket;
		case ']': kind = token.Close_Bracket;
		case '{': kind = token.Open_Brace;
		case '}': kind = token.Close_Brace;
		case '\\': kind = token.Back_Slash;

		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch);
			}
			kind = token.Invalid;
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset]);
	}

	return token.Token{kind, lit, pos};
}
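
// Example usage (a minimal sketch, not part of the original file): tokenize a
// source string and print every token up to EOF. The `kind`, `text`, and `pos`
// field names on token.Token are assumptions based on how the token is
// constructed in `scan` above; adjust them to match your core:odin/token.
example_tokenize :: proc() {
	src := "main :: proc() { x := 1 + 2; }";

	t: Tokenizer;
	// string and []byte share the same data+length layout, so transmute works
	init(&t, transmute([]byte)src, "example.odin");

	for {
		tok := scan(&t);
		if tok.kind == token.EOF {
			break;
		}
		fmt.printf("%d:%d %v %q\n", tok.pos.line, tok.pos.column, tok.kind, tok.text);
	}
}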