tokenizer.odin

package odin_tokenizer

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
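
// The Insert_Semicolon flag enables automatic semicolon insertion:
// scan then turns a newline into a .Semicolon token whenever the
// previous token could end a statement (see the #partial switch at
// the bottom of scan).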
Flag :: enum {
	Insert_Semicolon,
}

Flags :: distinct bit_set[Flag; u32]

Tokenizer :: struct {
	// Immutable data
	path:  string,
	src:   string,
	err:   Error_Handler,
	flags: Flags,

	// Tokenizing state
	ch:               rune,
	offset:           int,
	read_offset:      int,
	line_offset:      int,
	line_count:       int,
	insert_semicolon: bool,

	// Mutable data
	error_count: int,
}
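
// init resets t to tokenize src, installs an error handler, and skips a
// leading byte order mark if one is present.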
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.insert_semicolon = false
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}
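
// offset_to_pos converts a byte offset into a Pos. It derives line and
// column from the tokenizer's current line state, so it is intended for
// offsets on the line currently being scanned.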
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}
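
// advance_rune decodes the next UTF-8 rune into t.ch, advancing offsets
// and line tracking; at end of input t.ch becomes -1 (EOF).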
advance_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				error(t, t.offset, "illegal byte order mark")
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		t.ch = -1
	}
}
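
// peek_byte returns the byte `offset` bytes past the current read
// position without advancing, or 0 past the end of the source.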
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}
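
// skip_whitespace skips spaces, tabs, and carriage returns. Newlines are
// only skipped when no semicolon may need to be inserted; otherwise scan
// must see the '\n' to emit the implicit semicolon.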
skip_whitespace :: proc(t: ^Tokenizer) {
	if t.insert_semicolon {
		for {
			switch t.ch {
			case ' ', '\t', '\r':
				advance_rune(t)
			case:
				return
			}
		}
	} else {
		for {
			switch t.ch {
			case ' ', '\t', '\r', '\n':
				advance_rune(t)
			case:
				return
			}
		}
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_digit :: proc(r: rune) -> bool {
	if '0' <= r && r <= '9' {
		return true
	}
	return unicode.is_digit(r)
}
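
// scan_comment scans a line comment (`//`, or the `#!` shebang form) or
// a block comment (`/* */`, which may nest) and returns its text with
// any trailing carriage return stripped.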
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	next := -1

	general: {
		if t.ch == '/' || t.ch == '!' { // // #! comments
			advance_rune(t)
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}

			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}

		/* style comment */
		advance_rune(t)
		nest := 1
		for t.ch >= 0 && nest > 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '/' && t.ch == '*' {
				nest += 1
			}
			if ch == '*' && t.ch == '/' {
				nest -= 1
				advance_rune(t)
				next = t.offset
				if nest == 0 {
					break general
				}
			}
		}

		error(t, offset, "comment not terminated")
	}

	lit := t.src[offset : t.offset]

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}
	return string(lit)
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t)
	}
	return string(t.src[offset : t.offset])
}
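
// scan_string scans a double-quoted string literal, validating escape
// sequences along the way; the returned text includes both quotes.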
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '"' {
			break
		}
		if ch == '\\' {
			scan_escape(t)
		}
	}

	return string(t.src[offset : t.offset])
}
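
// scan_raw_string scans a back-quoted raw string literal; no escape
// processing is performed.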
scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == utf8.RUNE_EOF {
			error(t, offset, "raw string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '`' {
			break
		}
	}

	return string(t.src[offset : t.offset])
}
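
// digit_val returns the numeric value of a (hexadecimal) digit rune, or
// 16, which is out of range for every supported base, for anything else.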
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..='9':
		return int(r-'0')
	case 'A'..='F':
		return int(r-'A' + 10)
	case 'a'..='f':
		return int(r-'a' + 10)
	}
	return 16
}
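
// scan_escape validates the escape sequence following a backslash:
// single-character escapes, octal (up to 3 digits, max 255), \x (2 hex
// digits), \u (4 hex digits), and \U (8 hex digits), rejecting values
// above utf8.MAX_RUNE and the surrogate range 0xD800..<0xE000.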
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	n: int
	base, max: u32
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"':
		advance_rune(t)
		return true
	case '0'..='7':
		n, base, max = 3, 8, 255
	case 'x':
		advance_rune(t)
		n, base, max = 2, 16, 255
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated")
		} else {
			error(t, offset, "unknown escape sequence")
		}
		return false
	}

	x: u32
	for n > 0 {
		d := u32(digit_val(t.ch))
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated")
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch)
			}
			return false
		}

		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	if x > max || 0xd800 <= x && x < 0xe000 {
		error(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}
	return true
}
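
// scan_rune scans a single-quoted rune literal, quotes included, and
// reports an error unless it contains exactly one rune.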
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	valid := true
	n := 0

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal")
	}
	return string(t.src[offset : t.offset])
}
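
// scan_number scans an integer, float, or imaginary literal. A leading 0
// may introduce a base prefix: 0b (binary), 0o (octal), 0d (decimal),
// 0z (dozenal), 0x (hexadecimal), or 0h (a hexadecimal floating-point
// bit pattern, which must contain 4, 8, or 16 digits).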
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t)
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error(t, t.offset, "illegal floating-point exponent")
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag
			advance_rune(t)
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true
		}
		if t.ch == '.' {
			kind^ = .Float
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	offset := t.offset
	kind := Token_Kind.Integer
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1
		kind = .Float
		scan_mantissa(t, 10)
		scan_exponent(t, &kind)
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					kind^ = .Invalid
					error(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b': int_base(t, &kind, 2, "illegal binary integer")
			case 'o': int_base(t, &kind, 8, "illegal octal integer")
			case 'd': int_base(t, &kind, 10, "illegal decimal integer")
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer")
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer")
			case 'h':
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, 16)
				if t.offset - prev <= 1 {
					kind = .Invalid
					error(t, t.offset, "illegal hexadecimal floating-point number")
				} else {
					sub := t.src[prev+1 : t.offset]
					digit_count := 0
					for d in sub {
						if d != '_' {
							digit_count += 1
						}
					}

					switch digit_count {
					case 4, 8, 16: break
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 4, 8, or 16 digits, got %d", digit_count)
					}
				}
			case:
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset])
					}
				}
				scan_exponent(t, &kind)
				return kind, string(t.src[offset : t.offset])
			}
		}
	}

	scan_mantissa(t, 10)

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset])
	}
	scan_exponent(t, &kind)
	return kind, string(t.src[offset : t.offset])
}
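
// scan skips whitespace and returns the next Token, handling keywords,
// literals, operators, and automatic semicolon insertion when the
// Insert_Semicolon flag is set.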
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin ..= Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i)
					break check_keyword
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin
					break check_keyword
				}
			}
			break check_keyword
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
			if t.insert_semicolon {
				t.insert_semicolon = false
				kind = .Semicolon
				lit = "\n"
				return Token{kind, lit, pos}
			}
		case '\n':
			t.insert_semicolon = false
			kind = .Semicolon
			lit = "\n"
		case '\\':
			if .Insert_Semicolon in t.flags {
				t.insert_semicolon = false
			}
			token := scan(t)
			if token.pos.line == pos.line {
				error(t, token.pos.offset, "expected a newline after \\")
			}
			return token
		case '\'':
			kind = .Rune
			lit = scan_rune(t)
		case '"':
			kind = .String
			lit = scan_string(t)
		case '`':
			kind = .String
			lit = scan_raw_string(t)
		case '.':
			kind = .Period
			switch t.ch {
			case '0'..='9':
				kind, lit = scan_number(t, true)
			case '.':
				advance_rune(t)
				kind = .Ellipsis
				switch t.ch {
				case '<':
					advance_rune(t)
					kind = .Range_Half
				case '=':
					advance_rune(t)
					kind = .Range_Full
				}
			}
		case '@': kind = .At
		case '$': kind = .Dollar
		case '?': kind = .Question
		case '^': kind = .Pointer
		case ';': kind = .Semicolon
		case ',': kind = .Comma
		case ':': kind = .Colon
		case '(': kind = .Open_Paren
		case ')': kind = .Close_Paren
		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket
		case '{': kind = .Open_Brace
		case '}': kind = .Close_Brace
		case '%':
			kind = .Mod
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Mod_Eq
			case '%':
				advance_rune(t)
				kind = .Mod_Mod
				if t.ch == '=' {
					advance_rune(t)
					kind = .Mod_Mod_Eq
				}
			}
		case '*':
			kind = .Mul
			if t.ch == '=' {
				advance_rune(t)
				kind = .Mul_Eq
			}
		case '=':
			kind = .Eq
			if t.ch == '=' {
				advance_rune(t)
				kind = .Cmp_Eq
			}
		case '~':
			kind = .Xor
			if t.ch == '=' {
				advance_rune(t)
				kind = .Xor_Eq
			}
		case '!':
			kind = .Not
			if t.ch == '=' {
				advance_rune(t)
				kind = .Not_Eq
			}
		case '+':
			kind = .Add
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Add_Eq
			case '+':
				advance_rune(t)
				kind = .Increment
			}
		case '-':
			kind = .Sub
			switch t.ch {
			case '-':
				advance_rune(t)
				kind = .Decrement
				if t.ch == '-' {
					advance_rune(t)
					kind = .Undef
				}
			case '>':
				advance_rune(t)
				kind = .Arrow_Right
			case '=':
				advance_rune(t)
				kind = .Sub_Eq
			}
		case '#':
			kind = .Hash
			if t.ch == '!' {
				kind = .Comment
				lit = scan_comment(t)
			}
		case '/':
			kind = .Quo
			switch t.ch {
			case '/', '*':
				kind = .Comment
				lit = scan_comment(t)
			case '=':
				advance_rune(t)
				kind = .Quo_Eq
			}
		case '<':
			kind = .Lt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Lt_Eq
			case '<':
				advance_rune(t)
				kind = .Shl
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shl_Eq
				}
			}
		case '>':
			kind = .Gt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Gt_Eq
			case '>':
				advance_rune(t)
				kind = .Shr
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shr_Eq
				}
			}
		case '&':
			kind = .And
			switch t.ch {
			case '~':
				advance_rune(t)
				kind = .And_Not
				if t.ch == '=' {
					advance_rune(t)
					kind = .And_Not_Eq
				}
			case '=':
				advance_rune(t)
				kind = .And_Eq
			case '&':
				advance_rune(t)
				kind = .Cmp_And
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_And_Eq
				}
			}
		case '|':
			kind = .Or
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Or_Eq
			case '|':
				advance_rune(t)
				kind = .Cmp_Or
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_Or_Eq
				}
			}
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
			kind = .Invalid
		}
	}

	if .Insert_Semicolon in t.flags {
		#partial switch kind {
		case .Invalid, .Comment:
			// Preserve insert_semicolon info
		case .Ident, .Context, .Typeid, .Break, .Continue, .Fallthrough, .Return,
		     .Integer, .Float, .Imag, .Rune, .String, .Undef,
		     .Question, .Pointer, .Close_Paren, .Close_Bracket, .Close_Brace,
		     .Increment, .Decrement, .Or_Return, .Or_Break, .Or_Continue:
			/*fallthrough*/
			t.insert_semicolon = true
		case:
			t.insert_semicolon = false
			break
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}
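
/*
	Minimal usage sketch (not part of this file). It assumes the Token,
	Token_Kind, and Pos definitions from the rest of this package; in
	particular, the field name `kind` is inferred from the positional
	construction `Token{kind, lit, pos}` above:

		t: Tokenizer
		init(&t, "x := 1 + 2\n", "example.odin")
		for {
			tok := scan(&t)
			if tok.kind == .EOF {
				break
			}
			fmt.println(tok)
		}
*/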