// tokenizer.odin

package odin_tokenizer

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Flag :: enum {
	Insert_Semicolon,
}

Flags :: distinct bit_set[Flag; u32]

Tokenizer :: struct {
	// Immutable data
	path:  string,
	src:   string,
	err:   Error_Handler,
	flags: Flags,

	// Tokenizing state
	ch:               rune,
	offset:           int,
	read_offset:      int,
	line_offset:      int,
	line_count:       int,
	insert_semicolon: bool,

	// Mutable data
	error_count: int,
}

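// init prepares the tokenizer to scan src, reporting errors through err, and
// primes the first rune (skipping a leading byte order mark). A minimal usage
// sketch, assuming the Token, Token_Kind, and Pos declarations that accompany
// this file in the same package, and that Token's first field is named kind:
//
//	t: Tokenizer
//	init(&t, src, "example.odin")
//	for tok := scan(&t); tok.kind != .EOF; tok = scan(&t) {
//		fmt.println(tok)
//	}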
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.insert_semicolon = false
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}

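// offset_to_pos converts a byte offset into a file/line/column Pos, using the
// line bookkeeping maintained by advance_rune. Columns are 1-based.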
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

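// advance_rune decodes the next Unicode code point into t.ch and advances the
// read position, updating the line bookkeeping and reporting NUL bytes,
// invalid UTF-8, and stray byte order marks. At end of source, t.ch becomes -1.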
advance_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				error(t, t.offset, "illegal byte order mark")
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		t.ch = -1
	}
}

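// peek_byte returns the byte `offset` bytes past the current read position
// without advancing the tokenizer, or 0 once past the end of the source.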
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}

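// skip_whitespace skips spaces, tabs, and carriage returns. Newlines are only
// skipped while no semicolon insertion is pending, since a pending insertion
// turns the next '\n' into a semicolon token.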
skip_whitespace :: proc(t: ^Tokenizer) {
	if t.insert_semicolon {
		for {
			switch t.ch {
			case ' ', '\t', '\r':
				advance_rune(t)
			case:
				return
			}
		}
	} else {
		for {
			switch t.ch {
			case ' ', '\t', '\r', '\n':
				advance_rune(t)
			case:
				return
			}
		}
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_digit :: proc(r: rune) -> bool {
	if '0' <= r && r <= '9' {
		return true
	}
	return unicode.is_digit(r)
}

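// scan_comment scans a line comment (`//` or the `#!` shebang form) or a
// general `/* */` comment, which may nest. The first marker character has
// already been consumed by the caller, hence the offset-1 below.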
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	next := -1
	general: {
		if t.ch == '/' || t.ch == '!' { // // #! comments
			advance_rune(t)
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}

			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}

		/* style comment */
		advance_rune(t)
		nest := 1
		for t.ch >= 0 && nest > 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '/' && t.ch == '*' {
				nest += 1
			}

			if ch == '*' && t.ch == '/' {
				nest -= 1
				advance_rune(t)
				next = t.offset
				if nest == 0 {
					break general
				}
			}
		}

		error(t, offset, "comment not terminated")
	}

	lit := t.src[offset : t.offset]

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}

	return string(lit)
}

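// scan_file_tag scans a `#+` file tag, consuming characters up to the end of
// the line or the start of a trailing comment.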
scan_file_tag :: proc(t: ^Tokenizer) -> string {
	offset := t.offset - 1
	// Also stop at end of source (t.ch < 0): otherwise an unterminated
	// file tag on the last line would loop forever.
	for t.ch != '\n' && t.ch >= 0 {
		if t.ch == '/' {
			next := peek_byte(t, 0)
			if next == '/' || next == '*' {
				break
			}
		}
		advance_rune(t)
	}
	return string(t.src[offset : t.offset])
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t)
	}

	return string(t.src[offset : t.offset])
}

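// scan_string scans a double-quoted string literal whose opening '"' has
// already been consumed; escape sequences are validated via scan_escape.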
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '"' {
			break
		}
		if ch == '\\' {
			scan_escape(t)
		}
	}

	return string(t.src[offset : t.offset])
}

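// scan_raw_string scans a back-quoted raw string literal, in which no escape
// processing is performed. The opening '`' has already been consumed.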
scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == utf8.RUNE_EOF {
			error(t, offset, "raw string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '`' {
			break
		}
	}

	return string(t.src[offset : t.offset])
}

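// digit_val returns the numeric value of a (hexadecimal) digit rune, or 16
// for any other rune so that a `digit_val(r) < base` check fails in every base.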
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..='9':
		return int(r-'0')
	case 'A'..='F':
		return int(r-'A' + 10)
	case 'a'..='f':
		return int(r-'a' + 10)
	}
	return 16
}

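// scan_escape validates a single escape sequence following a backslash:
// one-character escapes, octal \ooo (max 255), hex \xhh (max 255), and
// Unicode \uhhhh / \Uhhhhhhhh (max utf8.MAX_RUNE, excluding surrogates).
// It reports whether the sequence was valid.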
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	n: int
	base, max: u32
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"':
		advance_rune(t)
		return true
	case '0'..='7':
		n, base, max = 3, 8, 255
	case 'x':
		advance_rune(t)
		n, base, max = 2, 16, 255
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated")
		} else {
			error(t, offset, "unknown escape sequence")
		}
		return false
	}

	x: u32
	for n > 0 {
		d := u32(digit_val(t.ch))
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated")
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch)
			}
			return false
		}

		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	if x > max || 0xd800 <= x && x <= 0xdfff {
		error(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}
	return true
}

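// scan_rune scans a single-quoted rune literal whose opening '\'' has already
// been consumed; exactly one rune or escape sequence must appear between the
// quotes.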
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	valid := true
	n := 0
	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal")
	}

	return string(t.src[offset : t.offset])
}

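// scan_number scans an integer, float, or imaginary literal, handling the
// 0b/0o/0d/0z/0x integer prefixes, the 0h hexadecimal float form (which
// expects 4, 8, or 16 digits), optional exponents, and the i/j/k imaginary
// suffixes. seen_decimal_point is true when the caller already consumed a
// leading '.' followed by a digit.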
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t)
		}
	}
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error(t, t.offset, "illegal floating-point exponent")
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag
			advance_rune(t)
		}
	}
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true
		}
		if t.ch == '.' {
			kind^ = .Float
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	offset := t.offset
	kind := Token_Kind.Integer
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1
		kind = .Float
		scan_mantissa(t, 10)
		scan_exponent(t, &kind)
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					kind^ = .Invalid
					error(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b': int_base(t, &kind,  2, "illegal binary integer")
			case 'o': int_base(t, &kind,  8, "illegal octal integer")
			case 'd': int_base(t, &kind, 10, "illegal decimal integer")
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer")
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer")
			case 'h':
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, 16)
				if t.offset - prev <= 1 {
					kind = .Invalid
					error(t, t.offset, "illegal hexadecimal floating-point number")
				} else {
					sub := t.src[prev+1 : t.offset]
					digit_count := 0
					for d in sub {
						if d != '_' {
							digit_count += 1
						}
					}

					switch digit_count {
					case 4, 8, 16: break
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 4, 8, or 16 digits, got %d", digit_count)
					}
				}
			case:
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset])
					}
				}
				scan_exponent(t, &kind)
				return kind, string(t.src[offset : t.offset])
			}
		}
	}

	scan_mantissa(t, 10)

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset])
	}

	scan_exponent(t, &kind)

	return kind, string(t.src[offset : t.offset])
}

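// scan skips leading whitespace and returns the next Token. When the
// Insert_Semicolon flag is set, a semicolon token is synthesized at a newline
// or at EOF whenever the previous token could end a statement.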
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin ..= Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i)
					break check_keyword
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin
					break check_keyword
				}
			}
			break check_keyword
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
			if t.insert_semicolon {
				t.insert_semicolon = false
				kind = .Semicolon
				lit = "\n"
				return Token{kind, lit, pos}
			}
		case '\n':
			t.insert_semicolon = false
			kind = .Semicolon
			lit = "\n"
		case '\\':
			if .Insert_Semicolon in t.flags {
				t.insert_semicolon = false
			}
			token := scan(t)
			if token.pos.line == pos.line {
				error(t, token.pos.offset, "expected a newline after \\")
			}
			return token
		case '\'':
			kind = .Rune
			lit = scan_rune(t)
		case '"':
			kind = .String
			lit = scan_string(t)
		case '`':
			kind = .String
			lit = scan_raw_string(t)
		case '.':
			kind = .Period
			switch t.ch {
			case '0'..='9':
				kind, lit = scan_number(t, true)
			case '.':
				advance_rune(t)
				kind = .Ellipsis
				switch t.ch {
				case '<':
					advance_rune(t)
					kind = .Range_Half
				case '=':
					advance_rune(t)
					kind = .Range_Full
				}
			}
		case '@': kind = .At
		case '$': kind = .Dollar
		case '?': kind = .Question
		case '^': kind = .Pointer
		case ';': kind = .Semicolon
		case ',': kind = .Comma
		case ':': kind = .Colon
		case '(': kind = .Open_Paren
		case ')': kind = .Close_Paren
		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket
		case '{': kind = .Open_Brace
		case '}': kind = .Close_Brace
		case '%':
			kind = .Mod
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Mod_Eq
			case '%':
				advance_rune(t)
				kind = .Mod_Mod
				if t.ch == '=' {
					advance_rune(t)
					kind = .Mod_Mod_Eq
				}
			}
		case '*':
			kind = .Mul
			if t.ch == '=' {
				advance_rune(t)
				kind = .Mul_Eq
			}
		case '=':
			kind = .Eq
			if t.ch == '=' {
				advance_rune(t)
				kind = .Cmp_Eq
			}
		case '~':
			kind = .Xor
			if t.ch == '=' {
				advance_rune(t)
				kind = .Xor_Eq
			}
		case '!':
			kind = .Not
			if t.ch == '=' {
				advance_rune(t)
				kind = .Not_Eq
			}
		case '+':
			kind = .Add
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Add_Eq
			case '+':
				advance_rune(t)
				kind = .Increment
			}
		case '-':
			kind = .Sub
			switch t.ch {
			case '-':
				advance_rune(t)
				kind = .Decrement
				if t.ch == '-' {
					advance_rune(t)
					kind = .Undef
				}
			case '>':
				advance_rune(t)
				kind = .Arrow_Right
			case '=':
				advance_rune(t)
				kind = .Sub_Eq
			}
		case '#':
			kind = .Hash
			if t.ch == '!' {
				kind = .Comment
				lit = scan_comment(t)
			} else if t.ch == '+' {
				kind = .File_Tag
				lit = scan_file_tag(t)
			}
		case '/':
			kind = .Quo
			switch t.ch {
			case '/', '*':
				kind = .Comment
				lit = scan_comment(t)
			case '=':
				advance_rune(t)
				kind = .Quo_Eq
			}
		case '<':
			kind = .Lt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Lt_Eq
			case '<':
				advance_rune(t)
				kind = .Shl
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shl_Eq
				}
			}
		case '>':
			kind = .Gt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Gt_Eq
			case '>':
				advance_rune(t)
				kind = .Shr
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shr_Eq
				}
			}
		case '&':
			kind = .And
			switch t.ch {
			case '~':
				advance_rune(t)
				kind = .And_Not
				if t.ch == '=' {
					advance_rune(t)
					kind = .And_Not_Eq
				}
			case '=':
				advance_rune(t)
				kind = .And_Eq
			case '&':
				advance_rune(t)
				kind = .Cmp_And
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_And_Eq
				}
			}
		case '|':
			kind = .Or
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Or_Eq
			case '|':
				advance_rune(t)
				kind = .Cmp_Or
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_Or_Eq
				}
			}
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
			kind = .Invalid
		}
	}

	if .Insert_Semicolon in t.flags {
		#partial switch kind {
		case .Invalid, .Comment:
			// Preserve insert_semicolon info
		case .Ident, .Context, .Typeid, .Break, .Continue, .Fallthrough, .Return,
		     .Integer, .Float, .Imag, .Rune, .String, .Undef,
		     .Question, .Pointer, .Close_Paren, .Close_Bracket, .Close_Brace,
		     .Increment, .Decrement, .Or_Return, .Or_Break, .Or_Continue:
			/*fallthrough*/
			t.insert_semicolon = true
		case:
			t.insert_semicolon = false
			break
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}

	return Token{kind, lit, pos}
}