// tokenizer.odin

package odin_tokenizer

import "core:fmt"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any);

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  []byte,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}
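
// init resets t to tokenize src. The first rune is pre-loaded into t.ch,
// and a leading byte order mark is skipped if present. err defaults to
// default_error_handler when no handler is supplied.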
init :: proc(t: ^Tokenizer, src: []byte, path: string, err: Error_Handler = default_error_handler) {
	t.src = src;
	t.err = err;
	t.ch = ' ';
	t.offset = 0;
	t.read_offset = 0;
	t.line_offset = 0;
	t.line_count = len(src) > 0 ? 1 : 0;
	t.error_count = 0;
	t.path = path;

	advance_rune(t);
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t); // ignore a byte order mark at file beginning
	}
}
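
// offset_to_pos converts a byte offset into a file/line/column Pos.
// Columns are 1-based byte counts from the start of the current line;
// since it reads the tokenizer's current line state, it is only accurate
// for offsets on the line being scanned.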
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count;
	column := offset - t.line_offset + 1;

	return Pos {
		file = t.path,
		offset = offset,
		line = line,
		column = column,
	};
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column);
	fmt.eprintf(msg, ..args);
	fmt.eprintf("\n");
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset);
	if t.err != nil {
		t.err(pos, msg, ..args);
	}
	t.error_count += 1;
}
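
// advance_rune decodes the next UTF-8 rune from t.src into t.ch and
// maintains the line bookkeeping as it goes. At end of input, t.ch is
// set to -1 to signal EOF to the scanning procedures.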
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset;
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		r, w := rune(src[read_offset]), 1;
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL");
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(src[read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding");
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark");
			}
		}
		read_offset += w;
		ch = r;
	} else {
		offset = len(src);
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		ch = -1;
	}
}
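
// peek_byte returns the raw byte `offset` bytes past the byte that
// follows the current rune, without consuming any input; it returns 0
// once past the end of src.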
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset];
	}
	return 0;
}

skip_whitespace :: proc(t: ^Tokenizer) {
	for t.ch == ' ' ||
	    t.ch == '\t' ||
	    t.ch == '\n' ||
	    t.ch == '\r' {
		advance_rune(t);
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true;
		case 'A'..'Z', 'a'..'z':
			return true;
		}
	}
	// TODO(bill): Add unicode lookup tables
	return false;
}

is_digit :: proc(r: rune) -> bool {
	// TODO(bill): Add unicode lookup tables
	return '0' <= r && r <= '9';
}
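
// scan_comment consumes a //-style or #!-style line comment, or a /* */
// block comment, and returns its text; the first character ('/' or '#')
// has already been consumed by the caller. Note that block comments do
// not nest here.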
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	next := -1;
	general: {
		if t.ch == '/' || t.ch == '!' { // //-style or #!-style line comment
			advance_rune(t);
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t);
			}

			next = t.offset;
			if t.ch == '\n' {
				next += 1;
			}
			break general;
		}

		/* style comment */
		advance_rune(t);
		for t.ch >= 0 {
			ch := t.ch;
			advance_rune(t);
			if ch == '*' && t.ch == '/' {
				advance_rune(t);
				next = t.offset;
				break general;
			}
		}

		error(t, offset, "comment not terminated");
	}

	lit := t.src[offset : t.offset];

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1];
	}

	return string(lit);
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset;

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t);
	}

	return string(t.src[offset : t.offset]);
}

scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '"' {
			break;
		}
		if ch == '\\' {
			scan_escape(t);
		}
	}

	return string(t.src[offset : t.offset]);
}

scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "raw string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '`' {
			break;
		}
	}

	return string(t.src[offset : t.offset]);
}

digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..'9':
		return int(r-'0');
	case 'A'..'F':
		return int(r-'A' + 10);
	case 'a'..'f':
		return int(r-'a' + 10);
	}
	return 16;
}
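
// scan_escape validates a single escape sequence after a '\'. digit_val
// returns 16 for non-digit runes, which is >= every base used below, so
// it doubles as the "not a digit" sentinel.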
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset;

	n: int;
	base, max: u32;
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\', '\'', '\"':
		advance_rune(t);
		return true;

	case '0'..'7':
		n, base, max = 3, 8, 255;
	case 'x':
		advance_rune(t);
		n, base, max = 2, 16, 255;
	case 'u':
		advance_rune(t);
		n, base, max = 4, 16, utf8.MAX_RUNE;
	case 'U':
		advance_rune(t);
		n, base, max = 8, 16, utf8.MAX_RUNE;
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated");
		} else {
			error(t, offset, "unknown escape sequence");
		}
		return false;
	}

	x: u32;
	for n > 0 {
		d := u32(digit_val(t.ch));
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated");
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch);
			}
			return false;
		}

		x = x*base + d;
		advance_rune(t);
		n -= 1;
	}

	if x > max || 0xd800 <= x && x < 0xe000 { // surrogate halves are not valid code points
		error(t, offset, "escape sequence is an invalid Unicode code point");
		return false;
	}

	return true;
}
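
// scan_rune consumes the body of a rune literal (the opening ' has
// already been consumed) and reports literals that do not contain
// exactly one character.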
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	valid := true;
	n := 0;
	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated");
				valid = false;
			}
			break;
		}
		advance_rune(t);
		if ch == '\'' {
			break;
		}
		n += 1;
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false;
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal");
	}

	return string(t.src[offset : t.offset]);
}
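
// scan_number scans integer, float, and imaginary literals, allowing '_'
// digit separators. Base prefixes: 0b binary, 0o octal, 0d decimal,
// 0z dozenal (base 12), 0x hexadecimal, and 0h for hexadecimal floats,
// whose 8 or 16 hex digits give the raw bit pattern of an f32 or f64.
// seen_decimal_point is true when the caller has already consumed a
// leading '.'.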
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t);
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float;
			advance_rune(t);
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t);
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10);
			} else {
				error(t, t.offset, "illegal floating-point exponent");
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag;
			advance_rune(t);
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true; // '..' is a range/ellipsis operator, not part of the number
		}
		if t.ch == '.' {
			kind^ = .Float;
			advance_rune(t);
			scan_mantissa(t, 10);
		}
		return false;
	}

	offset := t.offset;
	kind := Token_Kind.Integer;
	seen_point := seen_decimal_point;

	if seen_point {
		offset -= 1;
		kind = .Float;
		scan_mantissa(t, 10);
		scan_exponent(t, &kind);
	} else {
		if t.ch == '0' {
			int_base :: inline proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, base);
				if t.offset - prev <= 1 {
					kind^ = .Invalid;
					error(t, t.offset, msg);
				}
			}

			advance_rune(t);
			switch t.ch {
			case 'b': int_base(t, &kind,  2, "illegal binary integer");
			case 'o': int_base(t, &kind,  8, "illegal octal integer");
			case 'd': int_base(t, &kind, 10, "illegal decimal integer");
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer");
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer");
			case 'h':
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, 16);
				if t.offset - prev <= 1 {
					kind = .Invalid;
					error(t, t.offset, "illegal hexadecimal floating-point number");
				} else {
					sub := t.src[prev+1 : t.offset];
					digit_count := 0;
					for d in sub {
						if d != '_' {
							digit_count += 1;
						}
					}

					switch digit_count {
					case 8, 16: break;
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 8 or 16 digits, got %d", digit_count);
					}
				}
			case:
				seen_point = false;
				scan_mantissa(t, 10);
				if t.ch == '.' {
					seen_point = true;
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset]);
					}
				}
				scan_exponent(t, &kind);
				return kind, string(t.src[offset : t.offset]);
			}
		}
	}

	scan_mantissa(t, 10);

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset]);
	}

	scan_exponent(t, &kind);

	return kind, string(t.src[offset : t.offset]);
}
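
// scan skips whitespace and returns the next token. The switch2/switch3/
// switch4 helpers implement maximal munch for operator families: given an
// already consumed first character, they choose between, say, '<', '<=',
// '<<', and '<<=' by peeking at the following characters.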
scan :: proc(t: ^Tokenizer) -> Token {
	switch2 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		return tok0;
	}
	switch3 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			return tok2;
		}
		return tok0;
	}
	switch4 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2, tok3: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			if t.ch == '=' {
				advance_rune(t);
				return tok3;
			}
			return tok2;
		}
		return tok0;
	}

	skip_whitespace(t);

	offset := t.offset;

	kind: Token_Kind;
	lit: string;
	pos := offset_to_pos(t, offset);

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t);
		kind = .Ident;
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin .. Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i);
					break check_keyword;
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin;
					break check_keyword;
				}
			}
			if kind == .Ident && lit == "notin" {
				kind = .Not_In;
			}
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false);
	case:
		advance_rune(t);
		switch ch {
		case -1:
			kind = .EOF;
		case '"':
			kind = .String;
			lit = scan_string(t);
		case '\'':
			kind = .Rune;
			lit = scan_rune(t);
		case '`':
			kind = .String;
			lit = scan_raw_string(t);
		case '=':
			if t.ch == '>' {
				advance_rune(t);
				kind = .Double_Arrow_Right;
			} else {
				kind = switch2(t, .Eq, .Cmp_Eq);
			}
		case '!': kind = switch2(t, .Not, .Not_Eq);
		case '#':
			kind = .Hash;
			if t.ch == '!' {
				kind = .Comment;
				lit = scan_comment(t);
			}
		case '?': kind = .Question;
		case '@': kind = .At;
		case '$': kind = .Dollar;
		case '^': kind = .Pointer;
		case '+': kind = switch2(t, .Add, .Add_Eq);
		case '-':
			if t.ch == '>' {
				advance_rune(t);
				kind = .Arrow_Right;
			} else if t.ch == '-' && peek_byte(t) == '-' {
				advance_rune(t);
				advance_rune(t);
				kind = .Undef;
			} else {
				kind = switch2(t, .Sub, .Sub_Eq);
			}
		case '*': kind = switch2(t, .Mul, .Mul_Eq);
		case '/':
			if t.ch == '/' || t.ch == '*' {
				kind = .Comment;
				lit = scan_comment(t);
			} else {
				kind = switch2(t, .Quo, .Quo_Eq);
			}
		case '%': kind = switch4(t, .Mod, .Mod_Eq, '%', .Mod_Mod, .Mod_Mod_Eq);
		case '&':
			if t.ch == '~' {
				advance_rune(t);
				kind = switch2(t, .And_Not, .And_Not_Eq);
			} else {
				kind = switch3(t, .And, .And_Eq, '&', .Cmp_And);
			}
		case '|': kind = switch3(t, .Or, .Or_Eq, '|', .Cmp_Or);
		case '~': kind = .Xor;
		case '<':
			if t.ch == '-' {
				advance_rune(t);
				kind = .Arrow_Left;
			} else {
				kind = switch4(t, .Lt, .Lt_Eq, '<', .Shl, .Shl_Eq);
			}
		case '>': kind = switch4(t, .Gt, .Gt_Eq, '>', .Shr, .Shr_Eq);
		case '.':
			if '0' <= t.ch && t.ch <= '9' {
				kind, lit = scan_number(t, true);
			} else {
				kind = .Period;
				if t.ch == '.' {
					advance_rune(t);
					kind = .Ellipsis;
					if t.ch == '<' {
						advance_rune(t);
						kind = .Range_Half;
					}
				}
			}
		case ':': kind = .Colon;
		case ',': kind = .Comma;
		case ';': kind = .Semicolon;
		case '(': kind = .Open_Paren;
		case ')': kind = .Close_Paren;
		case '[': kind = .Open_Bracket;
		case ']': kind = .Close_Bracket;
		case '{': kind = .Open_Brace;
		case '}': kind = .Close_Brace;
		case '\\': kind = .Back_Slash;
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch);
			}
			kind = .Invalid;
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset]);
	}

	return Token{kind, lit, pos};
}
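
/*
	A minimal usage sketch (hypothetical file name and source text; the
	Token fields kind/text and the .EOF kind are assumed from this
	package's token definitions, e.g. token.odin):

		t: Tokenizer;
		src := "main :: proc() {}";
		init(&t, transmute([]byte)src, "example.odin");
		for {
			tok := scan(&t);
			if tok.kind == .EOF do break;
			fmt.printf("%v %q\n", tok.kind, tok.text);
		}
*/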