package odin_tokenizer

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Flag :: enum {
	Insert_Semicolon,
}

Flags :: distinct bit_set[Flag; u32]

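// Tokenizer holds all scanning state for a single source file; use one
// per file and drive it with init followed by repeated calls to scan.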
Tokenizer :: struct {
	// Immutable data
	path:  string,
	src:   string,
	err:   Error_Handler,
	flags: Flags,

	// Tokenizing state
	ch:               rune,
	offset:           int,
	read_offset:      int,
	line_offset:      int,
	line_count:       int,
	insert_semicolon: bool,

	// Mutable data
	error_count: int,
}

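// init resets the tokenizer to scan src, positions it on the first rune,
// and skips a leading UTF-8 byte order mark if one is present.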
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}

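// offset_to_pos converts a byte offset into a file/line/column position.
// It uses the tokenizer's current line bookkeeping, so it is only accurate
// for offsets on the line currently being scanned.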
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file = t.path,
		offset = offset,
		line = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

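// advance_rune reads the next Unicode character into t.ch, updating the
// offsets and line bookkeeping. At end of input, t.ch is set to -1 (EOF).
// NUL bytes, invalid UTF-8, and interior byte order marks are reported
// through the error handler.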
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset
		if ch == '\n' {
			line_offset = offset
			line_count += 1
		}
		r, w := rune(src[read_offset]), 1
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(src[read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark")
			}
		}
		read_offset += w
		ch = r
	} else {
		offset = len(src)
		if ch == '\n' {
			line_offset = offset
			line_count += 1
		}
		ch = -1
	}
}

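// peek_byte returns the byte `offset` bytes past the current read position
// without advancing, or 0 at end of input.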
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}

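// skip_whitespace advances past spaces, tabs, and carriage returns.
// Newlines are skipped too, unless a semicolon should be inserted, in
// which case the '\n' is left for scan to turn into a .Semicolon token.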
skip_whitespace :: proc(t: ^Tokenizer) {
	if t.insert_semicolon {
		for {
			switch t.ch {
			case ' ', '\t', '\r':
				advance_rune(t)
			case:
				return
			}
		}
	} else {
		for {
			switch t.ch {
			case ' ', '\t', '\r', '\n':
				advance_rune(t)
			case:
				return
			}
		}
	}
}

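// is_letter and is_digit classify identifier characters: an ASCII fast
// path first, then the full Unicode tables.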
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_digit :: proc(r: rune) -> bool {
	if '0' <= r && r <= '9' {
		return true
	}
	return unicode.is_digit(r)
}

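// scan_comment scans a line comment (// or #!) or a nestable block
// comment (/* ... */); the opening '/' or '#' has already been consumed.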
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	next := -1
	general: {
		if t.ch == '/' || t.ch == '!' { // // #! comments
			advance_rune(t)
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}

			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}

		/* style comment */
		advance_rune(t)
		nest := 1
		for t.ch >= 0 && nest > 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '/' && t.ch == '*' {
				nest += 1
			}
			if ch == '*' && t.ch == '/' {
				nest -= 1
				advance_rune(t)
				next = t.offset
				if nest == 0 {
					break general
				}
			}
		}

		error(t, offset, "comment not terminated")
	}

	lit := t.src[offset : t.offset]

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}

	return string(lit)
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t)
	}

	return string(t.src[offset : t.offset])
}

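// scan_string scans a double-quoted string literal, validating escape
// sequences as it goes; the opening '"' has already been consumed.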
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '"' {
			break
		}
		if ch == '\\' {
			scan_escape(t)
		}
	}

	return string(t.src[offset : t.offset])
}

scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == utf8.RUNE_EOF {
			error(t, offset, "raw string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '`' {
			break
		}
	}

	return string(t.src[offset : t.offset])
}

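// digit_val returns the numeric value of a hexadecimal digit. Any other
// character yields 16, which is >= every supported base and therefore
// always fails the `digit_val(ch) < base` checks.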
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..='9':
		return int(r-'0')
	case 'A'..='F':
		return int(r-'A' + 10)
	case 'a'..='f':
		return int(r-'a' + 10)
	}
	return 16
}

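// scan_escape validates one escape sequence after a '\': a single-character
// escape, or an octal (\ooo), hex (\xhh), or Unicode (\uhhhh, \Uhhhhhhhh)
// escape. It reports an error and returns false for malformed sequences
// and for invalid code points.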
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	n: int
	base, max: u32
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"':
		advance_rune(t)
		return true
	case '0'..='7':
		n, base, max = 3, 8, 255
	case 'x':
		advance_rune(t)
		n, base, max = 2, 16, 255
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated")
		} else {
			error(t, offset, "unknown escape sequence")
		}
		return false
	}

	x: u32
	for n > 0 {
		d := u32(digit_val(t.ch))
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated")
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch)
			}
			return false
		}
		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	// Reject out-of-range code points and the surrogate halves
	// U+D800..U+DFFF (U+E000 itself is a valid code point).
	if x > max || 0xd800 <= x && x < 0xe000 {
		error(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}
	return true
}

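// scan_rune scans a rune literal; the opening '\'' has already been
// consumed. Exactly one character (or escape sequence) is required.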
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	valid := true
	n := 0

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal")
	}

	return string(t.src[offset : t.offset])
}

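// scan_number scans integer, float, and imaginary literals, including the
// prefixed forms 0b (binary), 0o (octal), 0d (decimal), 0z (dozenal),
// 0x (hexadecimal), and 0h (hexadecimal float bit pattern, 4/8/16 digits),
// with optional '_' digit separators. seen_decimal_point is true when the
// caller has already consumed a leading '.' before the first digit.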
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t)
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error(t, t.offset, "illegal floating-point exponent")
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag
			advance_rune(t)
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true
		}
		if t.ch == '.' {
			kind^ = .Float
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	offset := t.offset
	kind := Token_Kind.Integer
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1
		kind = .Float
		scan_mantissa(t, 10)
		scan_exponent(t, &kind)
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					kind^ = .Invalid
					error(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b': int_base(t, &kind,  2, "illegal binary integer")
			case 'o': int_base(t, &kind,  8, "illegal octal integer")
			case 'd': int_base(t, &kind, 10, "illegal decimal integer")
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer")
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer")
			case 'h':
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, 16)
				if t.offset - prev <= 1 {
					kind = .Invalid
					error(t, t.offset, "illegal hexadecimal floating-point number")
				} else {
					sub := t.src[prev+1 : t.offset]
					digit_count := 0
					for d in sub {
						if d != '_' {
							digit_count += 1
						}
					}

					switch digit_count {
					case 4, 8, 16: break
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 4, 8, or 16 digits, got %d", digit_count)
					}
				}
			case:
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset])
					}
				}
				scan_exponent(t, &kind)
				return kind, string(t.src[offset : t.offset])
			}
		}
	}

	scan_mantissa(t, 10)

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset])
	}

	scan_exponent(t, &kind)

	return kind, string(t.src[offset : t.offset])
}

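// scan returns the next token in the source. When the Insert_Semicolon
// flag is set, a .Semicolon token is synthesized at newlines and at EOF
// after tokens that may legally end a statement.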
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin ..= Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i)
					break check_keyword
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin
					break check_keyword
				}
			}
			break check_keyword
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
			if t.insert_semicolon {
				t.insert_semicolon = false
				kind = .Semicolon
				lit = "\n"
				return Token{kind, lit, pos}
			}
		case '\n':
			t.insert_semicolon = false
			kind = .Semicolon
			lit = "\n"
		case '\\':
			if .Insert_Semicolon in t.flags {
				t.insert_semicolon = false
			}
			token := scan(t)
			if token.pos.line == pos.line {
				error(t, token.pos.offset, "expected a newline after \\")
			}
			return token
		case '\'':
			kind = .Rune
			lit = scan_rune(t)
		case '"':
			kind = .String
			lit = scan_string(t)
		case '`':
			kind = .String
			lit = scan_raw_string(t)
		case '.':
			kind = .Period
			switch t.ch {
			case '0'..='9':
				kind, lit = scan_number(t, true)
			case '.':
				advance_rune(t)
				kind = .Ellipsis
				switch t.ch {
				case '<':
					advance_rune(t)
					kind = .Range_Half
				case '=':
					advance_rune(t)
					kind = .Range_Full
				}
			}
		case '@': kind = .At
		case '$': kind = .Dollar
		case '?': kind = .Question
		case '^': kind = .Pointer
		case ';': kind = .Semicolon
		case ',': kind = .Comma
		case ':': kind = .Colon
		case '(': kind = .Open_Paren
		case ')': kind = .Close_Paren
		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket
		case '{': kind = .Open_Brace
		case '}': kind = .Close_Brace
		case '%':
			kind = .Mod
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Mod_Eq
			case '%':
				advance_rune(t)
				kind = .Mod_Mod
				if t.ch == '=' {
					advance_rune(t)
					kind = .Mod_Mod_Eq
				}
			}
		case '*':
			kind = .Mul
			if t.ch == '=' {
				advance_rune(t)
				kind = .Mul_Eq
			}
		case '=':
			kind = .Eq
			if t.ch == '=' {
				advance_rune(t)
				kind = .Cmp_Eq
			}
		case '~':
			kind = .Xor
			if t.ch == '=' {
				advance_rune(t)
				kind = .Xor_Eq
			}
		case '!':
			kind = .Not
			if t.ch == '=' {
				advance_rune(t)
				kind = .Not_Eq
			}
		case '+':
			kind = .Add
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Add_Eq
			case '+':
				advance_rune(t)
				kind = .Increment
			}
		case '-':
			kind = .Sub
			switch t.ch {
			case '-':
				advance_rune(t)
				kind = .Decrement
				if t.ch == '-' {
					advance_rune(t)
					kind = .Undef
				}
			case '>':
				advance_rune(t)
				kind = .Arrow_Right
			case '=':
				advance_rune(t)
				kind = .Sub_Eq
			}
		case '#':
			kind = .Hash
			if t.ch == '!' {
				kind = .Comment
				lit = scan_comment(t)
			}
		case '/':
			kind = .Quo
			switch t.ch {
			case '/', '*':
				kind = .Comment
				lit = scan_comment(t)
			case '=':
				advance_rune(t)
				kind = .Quo_Eq
			}
		case '<':
			kind = .Lt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Lt_Eq
			case '<':
				advance_rune(t)
				kind = .Shl
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shl_Eq
				}
			}
		case '>':
			kind = .Gt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Gt_Eq
			case '>':
				advance_rune(t)
				kind = .Shr
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shr_Eq
				}
			}
		case '&':
			kind = .And
			switch t.ch {
			case '~':
				advance_rune(t)
				kind = .And_Not
				if t.ch == '=' {
					advance_rune(t)
					kind = .And_Not_Eq
				}
			case '=':
				advance_rune(t)
				kind = .And_Eq
			case '&':
				advance_rune(t)
				kind = .Cmp_And
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_And_Eq
				}
			}
		case '|':
			kind = .Or
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Or_Eq
			case '|':
				advance_rune(t)
				kind = .Cmp_Or
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_Or_Eq
				}
			}
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
			kind = .Invalid
		}
	}

	if .Insert_Semicolon in t.flags {
		#partial switch kind {
		case .Invalid, .Comment:
			// Preserve insert_semicolon info
		case .Ident, .Context, .Typeid, .Break, .Continue, .Fallthrough, .Return,
		     .Integer, .Float, .Imag, .Rune, .String, .Undef,
		     .Question, .Pointer, .Close_Paren, .Close_Bracket, .Close_Brace,
		     .Increment, .Decrement, .Or_Return:
			/*fallthrough*/
			t.insert_semicolon = true
		case:
			t.insert_semicolon = false
			break
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}

	return Token{kind, lit, pos}
}

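/*
	Example usage: a minimal sketch, not part of the package. It assumes
	this package is importable as core:odin/tokenizer and that Token
	(defined alongside this file) exposes its kind via a `kind` field:

		package example

		import "core:fmt"
		import "core:odin/tokenizer"

		main :: proc() {
			t: tokenizer.Tokenizer
			tokenizer.init(&t, "x := 123 + 0x2a", "example.odin")
			for tok := tokenizer.scan(&t); tok.kind != .EOF; tok = tokenizer.scan(&t) {
				fmt.println(tok)
			}
		}
*/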