scanner.odin 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
// package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes a string providing the source, which then can be tokenized through
// repeated calls to the scan procedure.
// For compatibility with existing tooling and languages, the NUL character is not allowed.
// If a UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
//
// By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
// A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
  9. package text_scanner
  10. import "base:runtime"
  11. import "core:fmt"
  12. import "core:strings"
  13. import "core:unicode"
  14. import "core:unicode/utf8"
// Position represents a source position.
// A position is valid if line > 0 (see position_is_valid).
Position :: struct {
	filename: string, // filename, if present; printed as "<input>" when empty
	offset:   int,    // byte offset, starting @ 0
	line:     int,    // line number, starting @ 1
	column:   int,    // column number, starting @ 1 (character count per line)
}
  23. // position_is_valid reports where the position is valid
  24. @(require_results)
  25. position_is_valid :: proc(pos: Position) -> bool {
  26. return pos.line > 0
  27. }
  28. @(require_results)
  29. position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
  30. s := pos.filename
  31. if s == "" {
  32. s = "<input>"
  33. }
  34. context.allocator = allocator
  35. if position_is_valid(pos) {
  36. return fmt.aprintf("%s(%d:%d)", s, pos.line, pos.column)
  37. } else {
  38. return strings.clone(s)
  39. }
  40. }
// Token kinds returned by scan and peek_token.
// They are negative so that they cannot collide with Unicode characters, which
// scan returns as themselves when no token is recognized.
EOF        :: -1 // end of source
Ident      :: -2 // identifier
Int        :: -3 // integer literal
Float      :: -4 // floating-point literal
Char       :: -5 // character literal ('x')
String     :: -6 // double-quoted string literal
Raw_String :: -7 // back-quoted raw string literal
Comment    :: -8 // line or block comment (only returned when comments are not skipped)
// Scan_Flag selects which kinds of tokens scan recognizes.
// Characters belonging to a token kind whose flag is not set are returned by scan one at a time.
Scan_Flag :: enum u32 {
	Scan_Idents,         // recognize identifiers (token Ident)
	Scan_Ints,           // recognize integer literals
	Scan_C_Int_Prefixes, // use C-style integer prefixes (0b, 0x, leading 0) instead of the Odin ones
	Scan_Floats,         // Includes integers and hexadecimal floats
	Scan_Chars,          // recognize character literals (token Char)
	Scan_Strings,        // recognize double-quoted strings (token String)
	Scan_Raw_Strings,    // recognize back-quoted raw strings (token Raw_String)
	Scan_Comments,       // recognize // and /* */ comments (token Comment)
	Skip_Comments,       // if set with .Scan_Comments, comments become white space
}
// Scan_Flags is a set of Scan_Flag values; it is the type of the Scanner's flags field.
Scan_Flags :: distinct bit_set[Scan_Flag; u32]
// Odin_Like_Tokens is the default flag set installed by init: every token kind is
// recognized, integer prefixes are Odin-style, and comments are skipped.
Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
// C_Like_Tokens is Odin_Like_Tokens plus .Scan_C_Int_Prefixes.
C_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
// Whitespace is the set of characters skipped by scan before a token.
// Only allows for ASCII whitespace (characters below utf8.RUNE_SELF).
Whitespace :: distinct bit_set['\x00'..<utf8.RUNE_SELF; u128]
// Odin_Whitespace is the default value for the Scanner's whitespace field
Odin_Whitespace :: Whitespace{'\t', '\n', '\r', ' '}
// C_Whitespace additionally treats vertical tab and form feed as white space
C_Whitespace :: Whitespace{'\t', '\n', '\r', '\v', '\f', ' '}
// Scanner allows for the reading of Unicode characters and tokens from a string
Scanner :: struct {
	src:     string, // the source text being scanned
	src_pos: int,    // byte offset of the next unread byte in src
	src_end: int,    // NOTE(review): not referenced by any procedure visible in this file

	tok_pos: int, // byte offset of the start of the current token; -1 when there is no token text
	tok_end: int, // byte offset just past the end of the current token

	ch:            rune, // the look-ahead character: read from src but not yet consumed; -2 until the first read
	line:          int,  // current line number, starting @ 1; incremented after each '\n'
	column:        int,  // number of characters read on the current line; 0 directly after a '\n'
	prev_line_len: int,  // value of column (including the '\n') for the line that was just finished
	prev_char_len: int,  // width in bytes of the last character read; 0 at the end of the source

	// error is called for each error encountered
	// If no error procedure is set, the error is reported to os.stderr
	error: proc(s: ^Scanner, msg: string),

	// error_count is incremented by one for each error encountered
	error_count: int,

	// flags controls which tokens are recognized
	// e.g. to recognize integers, set the .Scan_Ints flag
	// This field may be changed by the user at any time during scanning
	flags: Scan_Flags,

	// The whitespace field controls which characters are recognized as white space
	// This field may be changed by the user at any time during scanning
	whitespace: Whitespace,

	// is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
	// The valid characters must not conflict with the set of white space characters
	// If is_ident_rune is not set, regular Odin-like identifiers are accepted
	// This field may be changed by the user at any time during scanning
	is_ident_rune: proc(ch: rune, i: int) -> bool,

	// Start position of most recently scanned token (set by scan(s))
	// Calling init or next invalidates the position
	pos: Position,
}
  101. // init initializes a scanner with a new source and returns itself.
  102. // error_count is set to 0, flags is set to Odin_Like_Tokens, whitespace is set to Odin_Whitespace
  103. init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
  104. s^ = {}
  105. s.error_count = 0
  106. s.src = src
  107. s.pos.filename = filename
  108. s.tok_pos = -1
  109. s.ch = -2 // no char read yet, not an EOF
  110. s.line = 1
  111. s.flags = Odin_Like_Tokens
  112. s.whitespace = Odin_Whitespace
  113. return s
  114. }
  115. @(private, require_results)
  116. advance :: proc(s: ^Scanner) -> rune {
  117. if s.src_pos >= len(s.src) {
  118. s.prev_char_len = 0
  119. return EOF
  120. }
  121. ch, width := rune(s.src[s.src_pos]), 1
  122. if ch >= utf8.RUNE_SELF {
  123. ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:])
  124. if ch == utf8.RUNE_ERROR && width == 1 {
  125. s.src_pos += width
  126. s.prev_char_len = width
  127. s.column += 1
  128. error(s, "invalid UTF-8 encoding")
  129. return ch
  130. }
  131. }
  132. s.src_pos += width
  133. s.prev_char_len = width
  134. s.column += 1
  135. switch ch {
  136. case 0:
  137. error(s, "invalid character NUL")
  138. case '\n':
  139. s.line += 1
  140. s.prev_line_len = s.column
  141. s.column = 0
  142. }
  143. return ch
  144. }
  145. // next reads and returns the next Unicode character. It returns EOF at the end of the source.
  146. // next does not update the Scanner's pos field. Use 'position(s)' to get the current position
  147. next :: proc(s: ^Scanner) -> rune {
  148. s.tok_pos = -1
  149. s.pos.line = 0
  150. ch := peek(s)
  151. if ch != EOF {
  152. s.ch = advance(s)
  153. }
  154. return ch
  155. }
  156. // peek returns the next Unicode character in the source without advancing the scanner
  157. // It returns EOF if the scanner's position is at least the last character of the source
  158. // if n > 0, it call next n times and return the nth Unicode character and then restore the Scanner's state
  159. @(require_results)
  160. peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
  161. if s.ch == -2 {
  162. s.ch = advance(s)
  163. if s.ch == '\ufeff' { // Ignore BOM
  164. s.ch = advance(s)
  165. }
  166. }
  167. ch = s.ch
  168. if n > 0 {
  169. prev_s := s^
  170. for _ in 0..<n {
  171. next(s)
  172. }
  173. ch = s.ch
  174. s^ = prev_s
  175. }
  176. return ch
  177. }
  178. // peek returns the next token in the source
  179. // It returns EOF if the scanner's position is at least the last character of the source
  180. // if n > 0, it call next n times and return the nth token and then restore the Scanner's state
  181. @(require_results)
  182. peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
  183. assert(n >= 0)
  184. prev_s := s^
  185. for _ in 0..<n {
  186. tok = scan(s)
  187. }
  188. tok = scan(s)
  189. s^ = prev_s
  190. return
  191. }
  192. error :: proc(s: ^Scanner, msg: string) {
  193. s.error_count += 1
  194. if s.error != nil {
  195. s.error(s, msg)
  196. return
  197. }
  198. p := s.pos
  199. if !position_is_valid(p) {
  200. p = position(s)
  201. }
  202. s := p.filename
  203. if s == "" {
  204. s = "<input>"
  205. }
  206. if position_is_valid(p) {
  207. fmt.eprintf("%s(%d:%d): %s\n", s, p.line, p.column, msg)
  208. } else {
  209. fmt.eprintf("%s: %s\n", s, msg)
  210. }
  211. }
  212. errorf :: proc(s: ^Scanner, format: string, args: ..any) {
  213. error(s, fmt.tprintf(format, ..args))
  214. }
  215. @(private, require_results)
  216. is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
  217. if s.is_ident_rune != nil {
  218. return s.is_ident_rune(ch, i)
  219. }
  220. return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0
  221. }
  222. @(private, require_results)
  223. scan_identifier :: proc(s: ^Scanner) -> rune {
  224. ch := advance(s)
  225. for i := 1; is_ident_rune(s, ch, i); i += 1 {
  226. ch = advance(s)
  227. }
  228. return ch
  229. }
  230. @(private, require_results) lower :: proc(ch: rune) -> rune { return ('a' - 'A') | ch }
  231. @(private, require_results) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' }
  232. @(private, require_results) is_hex :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
  233. @(private, require_results)
  234. scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
  235. lit_name :: proc(prefix: rune) -> string {
  236. switch prefix {
  237. case 'b': return "binary literal"
  238. case 'o': return "octal literal"
  239. case 'z': return "dozenal literal"
  240. case 'x': return "hexadecimal literal"
  241. }
  242. return "decimal literal"
  243. }
  244. digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
  245. ch = ch0
  246. if base <= 10 {
  247. max := rune('0' + base)
  248. for is_decimal(ch) || ch == '_' {
  249. ds := 1
  250. if ch == '_' {
  251. ds = 2
  252. } else if ch >= max && invalid^ == 0 {
  253. invalid^ = ch
  254. }
  255. digsep |= ds
  256. ch = advance(s)
  257. }
  258. } else {
  259. for is_hex(ch) || ch == '_' {
  260. ds := 1
  261. if ch == '_' {
  262. ds = 2
  263. }
  264. digsep |= ds
  265. ch = advance(s)
  266. }
  267. }
  268. return
  269. }
  270. ch, seen_dot := ch, seen_dot
  271. base := 10
  272. prefix := rune(0)
  273. digsep := 0
  274. invalid := rune(0)
  275. tok: rune
  276. ds: int
  277. if !seen_dot {
  278. tok = Int
  279. if ch == '0' {
  280. ch = advance(s)
  281. p := lower(ch)
  282. if .Scan_C_Int_Prefixes in s.flags {
  283. switch p {
  284. case 'b':
  285. ch = advance(s)
  286. base, prefix = 2, 'b'
  287. case 'x':
  288. ch = advance(s)
  289. base, prefix = 16, 'x'
  290. case:
  291. base, prefix = 8, 'o'
  292. digsep = 1 // Leading zero
  293. }
  294. } else {
  295. switch p {
  296. case 'b':
  297. ch = advance(s)
  298. base, prefix = 2, 'b'
  299. case 'o':
  300. ch = advance(s)
  301. base, prefix = 8, 'o'
  302. case 'd':
  303. ch = advance(s)
  304. base, prefix = 10, 'd'
  305. case 'z':
  306. ch = advance(s)
  307. base, prefix = 12, 'z'
  308. case 'h':
  309. tok = Float
  310. fallthrough
  311. case 'x':
  312. ch = advance(s)
  313. base, prefix = 16, 'x'
  314. case:
  315. digsep = 1 // Leading zero
  316. }
  317. }
  318. }
  319. ch, ds = digits(s, ch, base, &invalid)
  320. digsep |= ds
  321. if ch == '.' && .Scan_Floats in s.flags {
  322. ch = advance(s)
  323. seen_dot = true
  324. }
  325. }
  326. if seen_dot {
  327. tok = Float
  328. if prefix != 0 && prefix != 'x' {
  329. errorf(s, "invalid radix point in %s", lit_name(prefix))
  330. }
  331. ch, ds = digits(s, ch, base, &invalid)
  332. digsep |= ds
  333. }
  334. if digsep&1 == 0 {
  335. errorf(s, "%s has no digits", lit_name(prefix))
  336. }
  337. if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
  338. switch {
  339. case e == 'e' && prefix != 0:
  340. errorf(s, "%q exponent requires decimal mantissa", ch)
  341. case e == 'p' && prefix != 'x':
  342. errorf(s, "%q exponent requires hexadecimal mantissa", ch)
  343. }
  344. ch = advance(s)
  345. tok = Float
  346. if ch == '+' || ch == '-' {
  347. ch = advance(s)
  348. }
  349. ch, ds = digits(s, ch, 10, nil)
  350. digsep |= ds
  351. if ds&1 == 0 {
  352. error(s, "exponent has no digits")
  353. }
  354. } else if prefix == 'x' && tok == Float {
  355. error(s, "hexadecimal mantissa requires a 'p' exponent")
  356. }
  357. if tok == Int && invalid != 0 {
  358. errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix))
  359. }
  360. if digsep&2 != 0 {
  361. s.tok_end = s.src_pos - s.prev_char_len
  362. }
  363. return tok, ch
  364. }
  365. @(private, require_results)
  366. scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
  367. digit_val :: proc(ch: rune) -> int {
  368. switch v := lower(ch); v {
  369. case '0'..='9': return int(v - '0')
  370. case 'a'..='z': return int(v - 'a')
  371. }
  372. return 16
  373. }
  374. scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
  375. ch, n := ch, n
  376. for n > 0 && digit_val(ch) < base {
  377. ch = advance(s)
  378. n -= 1
  379. }
  380. if n > 0 {
  381. error(s, "invalid char escape")
  382. }
  383. return ch
  384. }
  385. ch := advance(s)
  386. for ch != quote {
  387. if ch == '\n' || ch < 0 {
  388. error(s, "literal no terminated")
  389. return
  390. }
  391. if ch == '\\' {
  392. ch = advance(s)
  393. switch ch {
  394. case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
  395. ch = advance(s)
  396. case '0'..='7': ch = scan_digits(s, advance(s), 8, 3)
  397. case 'x': ch = scan_digits(s, advance(s), 16, 2)
  398. case 'u': ch = scan_digits(s, advance(s), 16, 4)
  399. case 'U': ch = scan_digits(s, advance(s), 16, 8)
  400. case:
  401. error(s, "invalid char escape")
  402. }
  403. } else {
  404. ch = advance(s)
  405. }
  406. n += 1
  407. }
  408. return
  409. }
  410. @(private)
  411. scan_raw_string :: proc(s: ^Scanner) {
  412. ch := advance(s)
  413. for ch != '`' {
  414. if ch < 0 {
  415. error(s, "literal not terminated")
  416. return
  417. }
  418. ch = advance(s)
  419. }
  420. }
  421. @(private)
  422. scan_char :: proc(s: ^Scanner) {
  423. if scan_string(s, '\'') != 1 {
  424. error(s, "invalid char literal")
  425. }
  426. }
  427. @(private, require_results)
  428. scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
  429. ch := ch
  430. if ch == '/' { // line comment
  431. ch = advance(s)
  432. for ch != '\n' && ch >= 0 {
  433. ch = advance(s)
  434. }
  435. return ch
  436. }
  437. // block /**/ comment
  438. ch = advance(s)
  439. for {
  440. if ch < 0 {
  441. error(s, "comment not terminated")
  442. break
  443. }
  444. ch0 := ch
  445. ch = advance(s)
  446. if ch0 == '*' && ch == '/' {
  447. return advance(s)
  448. }
  449. }
  450. return ch
  451. }
  452. // scan reads the next token or Unicode character from source and returns it
  453. // It only recognizes tokens for which the respective flag that is set
  454. // It returns EOF at the end of the source
  455. // It reports Scanner errors by calling s.error, if not nil; otherwise it will print the error message to os.stderr
  456. scan :: proc(s: ^Scanner) -> (tok: rune) {
  457. ch := peek(s)
  458. if ch == EOF {
  459. return ch
  460. }
  461. // reset position
  462. s.tok_pos = -1
  463. s.pos.line = 0
  464. redo: for {
  465. for ch < utf8.RUNE_SELF && (ch in s.whitespace) {
  466. ch = advance(s)
  467. }
  468. s.tok_pos = s.src_pos - s.prev_char_len
  469. s.pos.offset = s.tok_pos
  470. if s.column > 0 {
  471. s.pos.line = s.line
  472. s.pos.column = s.column
  473. } else {
  474. // previous character was newline
  475. s.pos.line = s.line - 1
  476. s.pos.column = s.prev_line_len
  477. }
  478. tok = ch
  479. if is_ident_rune(s, ch, 0) {
  480. if .Scan_Idents in s.flags {
  481. tok = Ident
  482. ch = scan_identifier(s)
  483. } else {
  484. ch = advance(s)
  485. }
  486. } else if is_decimal(ch) {
  487. if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
  488. tok, ch = scan_number(s, ch, false)
  489. } else {
  490. ch = advance(s)
  491. }
  492. } else {
  493. switch ch {
  494. case EOF:
  495. break
  496. case '"':
  497. if .Scan_Strings in s.flags {
  498. _ = scan_string(s, '"')
  499. tok = String
  500. }
  501. ch = advance(s)
  502. case '\'':
  503. if .Scan_Chars in s.flags {
  504. _ = scan_string(s, '\'')
  505. tok = Char
  506. }
  507. ch = advance(s)
  508. case '`':
  509. if .Scan_Raw_Strings in s.flags {
  510. scan_raw_string(s)
  511. tok = Raw_String
  512. }
  513. ch = advance(s)
  514. case '.':
  515. ch = advance(s)
  516. if is_decimal(ch) && .Scan_Floats in s.flags {
  517. tok, ch = scan_number(s, ch, true)
  518. }
  519. case '/':
  520. ch = advance(s)
  521. if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
  522. if .Skip_Comments in s.flags {
  523. s.tok_pos = -1
  524. ch = scan_comment(s, ch)
  525. continue redo
  526. }
  527. ch = scan_comment(s, ch)
  528. tok = Comment
  529. }
  530. case:
  531. ch = advance(s)
  532. }
  533. }
  534. break redo
  535. }
  536. s.tok_end = s.src_pos - s.prev_char_len
  537. s.ch = ch
  538. return tok
  539. }
  540. // position returns the position of the character immediately after the character or token returns by the previous call to next or scan
  541. // Use the Scanner's position field for the most recently scanned token position
  542. @(require_results)
  543. position :: proc(s: ^Scanner) -> Position {
  544. pos: Position
  545. pos.filename = s.pos.filename
  546. pos.offset = s.src_pos - s.prev_char_len
  547. switch {
  548. case s.column > 0:
  549. pos.line = s.line
  550. pos.column = s.column
  551. case s.prev_line_len > 0:
  552. pos.line = s.line-1
  553. pos.column = s.prev_line_len
  554. case:
  555. pos.line = 1
  556. pos.column = 1
  557. }
  558. return pos
  559. }
  560. // token_text returns the string of the most recently scanned token
  561. @(require_results)
  562. token_text :: proc(s: ^Scanner) -> string {
  563. if s.tok_pos < 0 {
  564. return ""
  565. }
  566. return string(s.src[s.tok_pos:s.tok_end])
  567. }
  568. // token_string returns a printable string for a token or Unicode character
  569. // By default, it uses the context.temp_allocator to produce the string
  570. @(require_results)
  571. token_string :: proc(tok: rune, allocator: runtime.Allocator) -> string {
  572. context.allocator = allocator
  573. switch tok {
  574. case EOF: return strings.clone("EOF")
  575. case Ident: return strings.clone("Ident")
  576. case Int: return strings.clone("Int")
  577. case Float: return strings.clone("Float")
  578. case Char: return strings.clone("Char")
  579. case String: return strings.clone("String")
  580. case Raw_String: return strings.clone("Raw_String")
  581. case Comment: return strings.clone("Comment")
  582. }
  583. return fmt.aprintf("%q", tok)
  584. }