tokenizer.odin

package odin_tokenizer

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any);

Flag :: enum {
	Insert_Semicolon,
}
Flags :: distinct bit_set[Flag; u32];

Tokenizer :: struct {
	// Immutable data
	path:  string,
	src:   string,
	err:   Error_Handler,
	flags: Flags,

	// Tokenizing state
	ch:               rune,
	offset:           int,
	read_offset:      int,
	line_offset:      int,
	line_count:       int,
	insert_semicolon: bool,

	// Mutable data
	error_count: int,
}
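
// init prepares t to tokenize src. The error handler is optional: pass nil
// to silence reporting (errors are still counted in error_count). A byte
// order mark at the very start of the file is skipped.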
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src;
	t.err = err;
	t.ch = ' ';
	t.offset = 0;
	t.read_offset = 0;
	t.line_offset = 0;
	t.line_count = len(src) > 0 ? 1 : 0;
	t.error_count = 0;
	t.path = path;

	advance_rune(t);
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t);
	}
}
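
// offset_to_pos converts a byte offset into a 1-based line/column Pos.
// Note that the column is a byte count from the start of the current line,
// not a rune count, so a multi-byte UTF-8 character advances it by more
// than one.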
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count;
	column := offset - t.line_offset + 1;

	return Pos {
		file = t.path,
		offset = offset,
		line = line,
		column = column,
	};
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column);
	fmt.eprintf(msg, ..args);
	fmt.eprintf("\n");
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset);
	if t.err != nil {
		t.err(pos, msg, ..args);
	}
	t.error_count += 1;
}
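
// advance_rune decodes the next UTF-8 rune into t.ch and updates the
// tokenizer's offsets and line count. At end of file t.ch is set to -1.
// NUL bytes, invalid UTF-8, and byte order marks after the first byte are
// reported through the error handler, but scanning continues.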
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset;
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		r, w := rune(src[read_offset]), 1;
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL");
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(src[read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding");
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark");
			}
		}
		read_offset += w;
		ch = r;
	} else {
		offset = len(src);
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		ch = -1;
	}
}

peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset];
	}
	return 0;
}

skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r':
			advance_rune(t);
		case '\n':
			if t.insert_semicolon {
				return;
			}
			advance_rune(t);
		case:
			return;
		}
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true;
		case 'A'..'Z', 'a'..'z':
			return true;
		}
	}
	return unicode.is_letter(r);
}

is_digit :: proc(r: rune) -> bool {
	if '0' <= r && r <= '9' {
		return true;
	}
	return unicode.is_digit(r);
}
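
// scan_comment is entered with t.ch on the character after the opening '/'
// (or on the '!' of a '#!' line). It consumes either a line comment or a
// block comment, which may nest, and returns the full comment text with any
// trailing carriage return stripped from line comments.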
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	next := -1;
	general: {
		if t.ch == '/' || t.ch == '!' { // '//' and '#!' line comments
			advance_rune(t);
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t);
			}

			next = t.offset;
			if t.ch == '\n' {
				next += 1;
			}
			break general;
		}

		// block comments, which may nest
		advance_rune(t);
		nest := 1;
		for t.ch >= 0 && nest > 0 {
			ch := t.ch;
			advance_rune(t);
			if ch == '/' && t.ch == '*' {
				nest += 1;
			}
			if ch == '*' && t.ch == '/' {
				nest -= 1;
				advance_rune(t);
				next = t.offset;
				if nest == 0 {
					break general;
				}
			}
		}

		error(t, offset, "comment not terminated");
	}

	lit := t.src[offset : t.offset];

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1];
	}
	return string(lit);
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset;

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t);
	}

	return string(t.src[offset : t.offset]);
}
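
// scan_string is entered with t.ch on the first character after the opening
// '"'. It consumes the body, validating escape sequences along the way, and
// returns the literal including both quotes. An unescaped newline or end of
// file terminates the literal with an error.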
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '"' {
			break;
		}
		if ch == '\\' {
			scan_escape(t);
		}
	}

	return string(t.src[offset : t.offset]);
}

scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == utf8.RUNE_EOF {
			error(t, offset, "raw string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '`' {
			break;
		}
	}

	return string(t.src[offset : t.offset]);
}
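
// digit_val maps a digit rune to its numeric value. For any rune that is
// not a hexadecimal digit it returns 16, which is >= every base this
// tokenizer uses, so callers can treat the result as an out-of-range
// sentinel.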
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..'9':
		return int(r-'0');
	case 'A'..'F':
		return int(r-'A' + 10);
	case 'a'..'f':
		return int(r-'a' + 10);
	}
	return 16;
}
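
// scan_escape validates one escape sequence after a backslash: a named
// escape (\n, \t, ...), exactly three octal digits, or a fixed-width
// hexadecimal escape (\xNN, \uNNNN, \UNNNNNNNN). It reports an error and
// returns false if the sequence is malformed or encodes an invalid code
// point (out of range, or a surrogate half).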
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset;

	n: int;
	base, max: u32;
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"':
		advance_rune(t);
		return true;
	case '0'..'7':
		n, base, max = 3, 8, 255;
	case 'x':
		advance_rune(t);
		n, base, max = 2, 16, 255;
	case 'u':
		advance_rune(t);
		n, base, max = 4, 16, utf8.MAX_RUNE;
	case 'U':
		advance_rune(t);
		n, base, max = 8, 16, utf8.MAX_RUNE;
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated");
		} else {
			error(t, offset, "unknown escape sequence");
		}
		return false;
	}

	x: u32;
	for n > 0 {
		d := u32(digit_val(t.ch));
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated");
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch);
			}
			return false;
		}

		x = x*base + d;
		advance_rune(t);
		n -= 1;
	}

	// Surrogate halves U+D800..U+DFFF are not valid code points on their own
	if x > max || 0xD800 <= x && x < 0xE000 {
		error(t, offset, "escape sequence is an invalid Unicode code point");
		return false;
	}
	return true;
}

scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	valid := true;
	n := 0;
	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated");
				valid = false;
			}
			break;
		}
		advance_rune(t);
		if ch == '\'' {
			break;
		}
		n += 1;
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false;
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal");
	}

	return string(t.src[offset : t.offset]);
}
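
// scan_number scans integer, floating-point, and imaginary number literals,
// including the base prefixes 0b, 0o, 0d, 0z, 0x, and 0h (hexadecimal
// float), exponents, and the 'i'/'j'/'k' imaginary suffixes. Underscores
// are allowed as digit separators. seen_decimal_point is true when the
// caller has already consumed a leading '.'. A '.' immediately followed by
// another '.' is left unconsumed so that constructs such as '0..10'
// tokenize as a range rather than two floats.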
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t);
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float;
			advance_rune(t);
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t);
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10);
			} else {
				error(t, t.offset, "illegal floating-point exponent");
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag;
			advance_rune(t);
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true;
		}
		if t.ch == '.' {
			kind^ = .Float;
			advance_rune(t);
			scan_mantissa(t, 10);
		}
		return false;
	}

	offset := t.offset;
	kind := Token_Kind.Integer;
	seen_point := seen_decimal_point;

	if seen_point {
		offset -= 1;
		kind = .Float;
		scan_mantissa(t, 10);
		scan_exponent(t, &kind);
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, base);
				if t.offset - prev <= 1 {
					kind^ = .Invalid;
					error(t, t.offset, msg);
				}
			}

			advance_rune(t);
			switch t.ch {
			case 'b': int_base(t, &kind, 2,  "illegal binary integer");
			case 'o': int_base(t, &kind, 8,  "illegal octal integer");
			case 'd': int_base(t, &kind, 10, "illegal decimal integer");
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer");
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer");
			case 'h':
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, 16);
				if t.offset - prev <= 1 {
					kind = .Invalid;
					error(t, t.offset, "illegal hexadecimal floating-point number");
				} else {
					sub := t.src[prev+1 : t.offset];
					digit_count := 0;
					for d in sub {
						if d != '_' {
							digit_count += 1;
						}
					}

					switch digit_count {
					case 4, 8, 16: break;
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 4, 8, or 16 digits, got %d", digit_count);
					}
				}
			case:
				seen_point = false;
				scan_mantissa(t, 10);
				if t.ch == '.' {
					seen_point = true;
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset]);
					}
				}
				scan_exponent(t, &kind);
				return kind, string(t.src[offset : t.offset]);
			}
		}
	}

	scan_mantissa(t, 10);

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset]);
	}

	scan_exponent(t, &kind);

	return kind, string(t.src[offset : t.offset]);
}
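
// scan skips whitespace and returns the next token in the source. When the
// Insert_Semicolon flag is set, a newline (or end of file) after a token
// that can legally end a statement is returned as an implicit .Semicolon
// token with the literal "\n".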
scan :: proc(t: ^Tokenizer) -> Token {
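	// The switchN helpers pick the longest operator that matches the input
	// (maximal munch): given the base token for the character just consumed,
	// they consume a trailing '=' and/or a doubled character, e.g. '<' vs
	// '<=' vs '<<' vs '<<='.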
	switch2 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		return tok0;
	}
	switch3 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			return tok2;
		}
		return tok0;
	}
	switch4 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2, tok3: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			if t.ch == '=' {
				advance_rune(t);
				return tok3;
			}
			return tok2;
		}
		return tok0;
	}

	skip_whitespace(t);

	offset := t.offset;

	kind: Token_Kind;
	lit: string;
	pos := offset_to_pos(t, offset);

	insert_semicolon := false;

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t);
		kind = .Ident;
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin .. Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i);
					break check_keyword;
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin;
					break check_keyword;
				}
			}
			#partial switch kind {
			case .Ident, .Context, .Typeid, .Break, .Continue, .Fallthrough, .Return:
				insert_semicolon = true;
			}
		}
	case '0' <= ch && ch <= '9':
		insert_semicolon = true;
		kind, lit = scan_number(t, false);
	case:
		advance_rune(t);
		switch ch {
		case -1:
			kind = .EOF;
			if t.insert_semicolon {
				t.insert_semicolon = false;
				kind = .Semicolon;
				lit = "\n";
			}
		case '\n':
			t.insert_semicolon = false;
			kind = .Semicolon;
			lit = "\n";
		case '"':
			insert_semicolon = true;
			kind = .String;
			lit = scan_string(t);
		case '\'':
			insert_semicolon = true;
			kind = .Rune;
			lit = scan_rune(t);
		case '`':
			insert_semicolon = true;
			kind = .String;
			lit = scan_raw_string(t);
		case '=': kind = switch2(t, .Eq, .Cmp_Eq);
		case '!': kind = switch2(t, .Not, .Not_Eq);
		case '#':
			kind = .Hash;
			if t.ch == '!' {
				insert_semicolon = t.insert_semicolon;
				kind = .Comment;
				lit = scan_comment(t);
			}
		case '?':
			insert_semicolon = true;
			kind = .Question;
		case '@': kind = .At;
		case '$': kind = .Dollar;
		case '^': kind = .Pointer;
		case '+': kind = switch2(t, .Add, .Add_Eq);
		case '-':
			if t.ch == '>' {
				advance_rune(t);
				kind = .Arrow_Right;
			} else if t.ch == '-' && peek_byte(t) == '-' {
				advance_rune(t);
				advance_rune(t);
				kind = .Undef;
			} else {
				kind = switch2(t, .Sub, .Sub_Eq);
			}
		case '*': kind = switch2(t, .Mul, .Mul_Eq);
		case '/':
			if t.ch == '/' || t.ch == '*' {
				insert_semicolon = t.insert_semicolon;
				kind = .Comment;
				lit = scan_comment(t);
			} else {
				kind = switch2(t, .Quo, .Quo_Eq);
			}
		case '%': kind = switch4(t, .Mod, .Mod_Eq, '%', .Mod_Mod, .Mod_Mod_Eq);
		case '&':
			if t.ch == '~' {
				advance_rune(t);
				kind = switch2(t, .And_Not, .And_Not_Eq);
			} else {
				kind = switch3(t, .And, .And_Eq, '&', .Cmp_And);
			}
		case '|': kind = switch3(t, .Or, .Or_Eq, '|', .Cmp_Or);
		case '~': kind = switch2(t, .Xor, .Xor_Eq);
		case '<': kind = switch4(t, .Lt, .Lt_Eq, '<', .Shl, .Shl_Eq);
		case '>': kind = switch4(t, .Gt, .Gt_Eq, '>', .Shr, .Shr_Eq);
		case '.':
			if '0' <= t.ch && t.ch <= '9' {
				kind, lit = scan_number(t, true);
			} else {
				kind = .Period;
				if t.ch == '.' {
					advance_rune(t);
					kind = .Ellipsis;
					if t.ch == '<' {
						advance_rune(t);
						kind = .Range_Half;
					} else if t.ch == '=' {
						advance_rune(t);
						kind = .Range_Full;
					}
				}
			}
		case ':': kind = .Colon;
		case ',': kind = .Comma;
		case ';': kind = .Semicolon;
		case '(': kind = .Open_Paren;
		case ')':
			insert_semicolon = true;
			kind = .Close_Paren;
		case '[': kind = .Open_Bracket;
		case ']':
			insert_semicolon = true;
			kind = .Close_Bracket;
		case '{': kind = .Open_Brace;
		case '}':
			insert_semicolon = true;
			kind = .Close_Brace;
		case '\\': kind = .Back_Slash;
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch);
			}
			insert_semicolon = t.insert_semicolon; // preserve insert_semicolon info
			kind = .Invalid;
		}
	}

	if .Insert_Semicolon in t.flags {
		t.insert_semicolon = insert_semicolon;
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset]);
	}

	return Token{kind, lit, pos};
}
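
// Example usage -- an illustrative sketch, not part of this file. It assumes
// Token's fields are named kind, lit, and pos (matching the positional
// initializer in scan above):
//
//	t: Tokenizer;
//	init(&t, "main :: proc() {}", "example.odin");
//	for {
//		tok := scan(&t);
//		if tok.kind == .EOF {
//			break;
//		}
//		fmt.println(tok.kind, tok.lit);
//	}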