scanner.odin 15 KB


  1. // package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
  2. // It takes a string providing the source, which then can be tokenized through
  3. // repeated calls to the scan procedure.
  4. // For compatibility with existing tooling and languages, the NUL character is not allowed.
// If a UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
  6. //
  7. // By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
  8. // A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
  9. package text_scanner
  10. import "core:fmt"
  11. import "core:strings"
  12. import "core:unicode"
  13. import "core:unicode/utf8"
  14. // Position represents a source position
  15. // A position is valid if line > 0
  16. Position :: struct {
  17. filename: string, // filename, if present
  18. offset: int, // byte offset, starting @ 0
  19. line: int, // line number, starting @ 1
  20. column: int, // column number, starting @ 1 (character count per line)
  21. }
  22. // position_is_valid reports where the position is valid
  23. position_is_valid :: proc(pos: Position) -> bool {
  24. return pos.line > 0
  25. }
  26. position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
  27. s := pos.filename
  28. if s == "" {
  29. s = "<input>"
  30. }
  31. context.allocator = allocator
  32. if position_is_valid(pos) {
  33. return fmt.aprintf("%s(%d:%d)", s, pos.line, pos.column)
  34. } else {
  35. return strings.clone(s)
  36. }
  37. }
  38. EOF :: -1
  39. Ident :: -2
  40. Int :: -3
  41. Float :: -4
  42. Char :: -5
  43. String :: -6
  44. Raw_String :: -7
  45. Comment :: -8
  46. Scan_Flag :: enum u32 {
  47. Scan_Idents,
  48. Scan_Ints,
  49. Scan_C_Int_Prefixes,
  50. Scan_Floats, // Includes integers and hexadecimal floats
  51. Scan_Chars,
  52. Scan_Strings,
  53. Scan_Raw_Strings,
  54. Scan_Comments,
  55. Skip_Comments, // if set with .Scan_Comments, comments become white space
  56. }
  57. Scan_Flags :: distinct bit_set[Scan_Flag; u32]
  58. Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
  59. C_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments}
  60. // Only allows for ASCII whitespace
  61. Whitespace :: distinct bit_set['\x00'..<utf8.RUNE_SELF; u128]
  62. // Odin_Whitespace is the default value for the Scanner's whitespace field
  63. Odin_Whitespace :: Whitespace{'\t', '\n', '\r', ' '}
  64. C_Whitespace :: Whitespace{'\t', '\n', '\r', '\v', '\f', ' '}
  65. // Scanner allows for the reading of Unicode characters and tokens from a string
  66. Scanner :: struct {
  67. src: string,
  68. src_pos: int,
  69. src_end: int,
  70. tok_pos: int,
  71. tok_end: int,
  72. ch: rune,
  73. line: int,
  74. column: int,
  75. prev_line_len: int,
  76. prev_char_len: int,
  77. // error is called for each error encountered
  78. // If no error procedure is set, the error is reported to os.stderr
  79. error: proc(s: ^Scanner, msg: string),
  80. // error_count is incremented by one for each error encountered
  81. error_count: int,
  82. // flags controls which tokens are recognized
  83. // e.g. to recognize integers, set the .Scan_Ints flag
  84. // This field may be changed by the user at any time during scanning
  85. flags: Scan_Flags,
  86. // The whitespace field controls which characters are recognized as white space
  87. // This field may be changed by the user at any time during scanning
  88. whitespace: Whitespace,
  89. // is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
  90. // The valid characters must not conflict with the set of white space characters
  91. // If is_ident_rune is not set, regular Odin-like identifiers are accepted
  92. // This field may be changed by the user at any time during scanning
  93. is_ident_rune: proc(ch: rune, i: int) -> bool,
  94. // Start position of most recently scanned token (set by scan(s))
  95. // Call init or next invalidates the position
  96. pos: Position,
  97. }
  98. // init initializes a scanner with a new source and returns itself.
  99. // error_count is set to 0, flags is set to Odin_Like_Tokens, whitespace is set to Odin_Whitespace
  100. init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
  101. s^ = {}
  102. s.error_count = 0
  103. s.src = src
  104. s.pos.filename = filename
  105. s.tok_pos = -1
  106. s.ch = -2 // no char read yet, not an EOF
  107. s.line = 1
  108. s.flags = Odin_Like_Tokens
  109. s.whitespace = Odin_Whitespace
  110. return s
  111. }
  112. @(private)
  113. advance :: proc(s: ^Scanner) -> rune {
  114. if s.src_pos >= len(s.src) {
  115. s.prev_char_len = 0
  116. return EOF
  117. }
  118. ch, width := rune(s.src[s.src_pos]), 1
  119. if ch >= utf8.RUNE_SELF {
  120. ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:])
  121. if ch == utf8.RUNE_ERROR && width == 1 {
  122. s.src_pos += width
  123. s.prev_char_len = width
  124. s.column += 1
  125. error(s, "invalid UTF-8 encoding")
  126. return ch
  127. }
  128. }
  129. s.src_pos += width
  130. s.prev_char_len = width
  131. s.column += 1
  132. switch ch {
  133. case 0:
  134. error(s, "invalid character NUL")
  135. case '\n':
  136. s.line += 1
  137. s.prev_line_len = s.column
  138. s.column = 0
  139. }
  140. return ch
  141. }
  142. // next reads and returns the next Unicode character. It returns EOF at the end of the source.
  143. // next does not update the Scanner's pos field. Use 'position(s)' to get the current position
  144. next :: proc(s: ^Scanner) -> rune {
  145. s.tok_pos = -1
  146. s.pos.line = 0
  147. ch := peek(s)
  148. if ch != EOF {
  149. s.ch = advance(s)
  150. }
  151. return ch
  152. }
  153. // peek returns the next Unicode character in the source without advancing the scanner
  154. // It returns EOF if the scanner's position is at least the last character of the source
  155. // if n > 0, it call next n times and return the nth Unicode character and then restore the Scanner's state
  156. peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
  157. if s.ch == -2 {
  158. s.ch = advance(s)
  159. if s.ch == '\ufeff' { // Ignore BOM
  160. s.ch = advance(s)
  161. }
  162. }
  163. ch = s.ch
  164. if n > 0 {
  165. prev_s := s^
  166. for in 0..<n {
  167. next(s)
  168. }
  169. ch = s.ch
  170. s^ = prev_s
  171. }
  172. return ch
  173. }
  174. // peek returns the next token in the source
  175. // It returns EOF if the scanner's position is at least the last character of the source
  176. // if n > 0, it call next n times and return the nth token and then restore the Scanner's state
  177. peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
  178. assert(n >= 0)
  179. prev_s := s^
  180. for in 0..<n {
  181. tok = scan(s)
  182. }
  183. tok = scan(s)
  184. s^ = prev_s
  185. return
  186. }
  187. error :: proc(s: ^Scanner, msg: string) {
  188. s.error_count += 1
  189. if s.error != nil {
  190. s.error(s, msg)
  191. return
  192. }
  193. p := s.pos
  194. if !position_is_valid(p) {
  195. p = position(s)
  196. }
  197. s := p.filename
  198. if s == "" {
  199. s = "<input>"
  200. }
  201. if position_is_valid(p) {
  202. fmt.eprintf("%s(%d:%d): %s\n", s, p.line, p.column, msg)
  203. } else {
  204. fmt.eprintf("%s: %s\n", s, msg)
  205. }
  206. }
  207. errorf :: proc(s: ^Scanner, format: string, args: ..any) {
  208. error(s, fmt.tprintf(format, ..args))
  209. }
  210. @(private)
  211. is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
  212. if s.is_ident_rune != nil {
  213. return s.is_ident_rune(ch, i)
  214. }
  215. return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0
  216. }
  217. @(private)
  218. scan_identifier :: proc(s: ^Scanner) -> rune {
  219. ch := advance(s)
  220. for i := 1; is_ident_rune(s, ch, i); i += 1 {
  221. ch = advance(s)
  222. }
  223. return ch
  224. }
  225. @(private) lower :: proc(ch: rune) -> rune { return ('a' - 'A') | ch }
  226. @(private) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' }
  227. @(private) is_hex :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
  228. @(private)
  229. scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
  230. lit_name :: proc(prefix: rune) -> string {
  231. switch prefix {
  232. case 'b': return "binary literal"
  233. case 'o': return "octal literal"
  234. case 'z': return "dozenal literal"
  235. case 'x': return "hexadecimal literal"
  236. }
  237. return "decimal literal"
  238. }
  239. digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
  240. ch = ch0
  241. if base <= 10 {
  242. max := rune('0' + base)
  243. for is_decimal(ch) || ch == '_' {
  244. ds := 1
  245. if ch == '_' {
  246. ds = 2
  247. } else if ch >= max && invalid^ == 0 {
  248. invalid^ = ch
  249. }
  250. digsep |= ds
  251. ch = advance(s)
  252. }
  253. } else {
  254. for is_hex(ch) || ch == '_' {
  255. ds := 1
  256. if ch == '_' {
  257. ds = 2
  258. }
  259. digsep |= ds
  260. ch = advance(s)
  261. }
  262. }
  263. return
  264. }
  265. ch, seen_dot := ch, seen_dot
  266. base := 10
  267. prefix := rune(0)
  268. digsep := 0
  269. invalid := rune(0)
  270. tok: rune
  271. ds: int
  272. if !seen_dot {
  273. tok = Int
  274. if ch == '0' {
  275. ch = advance(s)
  276. p := lower(ch)
  277. if .Scan_C_Int_Prefixes in s.flags {
  278. switch p {
  279. case 'b':
  280. ch = advance(s)
  281. base, prefix = 2, 'b'
  282. case 'x':
  283. ch = advance(s)
  284. base, prefix = 16, 'x'
  285. case:
  286. base, prefix = 8, 'o'
  287. digsep = 1 // Leading zero
  288. }
  289. } else {
  290. switch p {
  291. case 'b':
  292. ch = advance(s)
  293. base, prefix = 2, 'b'
  294. case 'o':
  295. ch = advance(s)
  296. base, prefix = 8, 'o'
  297. case 'd':
  298. ch = advance(s)
  299. base, prefix = 10, 'd'
  300. case 'z':
  301. ch = advance(s)
  302. base, prefix = 12, 'z'
  303. case 'h':
  304. tok = Float
  305. fallthrough
  306. case 'x':
  307. ch = advance(s)
  308. base, prefix = 16, 'x'
  309. case:
  310. digsep = 1 // Leading zero
  311. }
  312. }
  313. }
  314. ch, ds = digits(s, ch, base, &invalid)
  315. digsep |= ds
  316. if ch == '.' && .Scan_Floats in s.flags {
  317. ch = advance(s)
  318. seen_dot = true
  319. }
  320. }
  321. if seen_dot {
  322. tok = Float
  323. if prefix != 0 && prefix != 'x' {
  324. errorf(s, "invalid radix point in %s", lit_name(prefix))
  325. }
  326. ch, ds = digits(s, ch, base, &invalid)
  327. digsep |= ds
  328. }
  329. if digsep&1 == 0 {
  330. errorf(s, "%s has no digits", lit_name(prefix))
  331. }
  332. if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
  333. switch {
  334. case e == 'e' && prefix != 0:
  335. errorf(s, "%q exponent requires decimal mantissa", ch)
  336. case e == 'p' && prefix != 'x':
  337. errorf(s, "%q exponent requires hexadecimal mantissa", ch)
  338. }
  339. ch = advance(s)
  340. tok = Float
  341. if ch == '+' || ch == '-' {
  342. ch = advance(s)
  343. }
  344. ch, ds = digits(s, ch, 10, nil)
  345. digsep |= ds
  346. if ds&1 == 0 {
  347. error(s, "exponent has no digits")
  348. }
  349. } else if prefix == 'x' && tok == Float {
  350. error(s, "hexadecimal mantissa requires a 'p' exponent")
  351. }
  352. if tok == Int && invalid != 0 {
  353. errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix))
  354. }
  355. if digsep&2 != 0 {
  356. s.tok_end = s.src_pos - s.prev_char_len
  357. }
  358. return tok, ch
  359. }
  360. @(private)
  361. scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
  362. digit_val :: proc(ch: rune) -> int {
  363. switch v := lower(ch); v {
  364. case '0'..='9': return int(v - '0')
  365. case 'a'..='z': return int(v - 'a')
  366. }
  367. return 16
  368. }
  369. scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
  370. ch, n := ch, n
  371. for n > 0 && digit_val(ch) < base {
  372. ch = advance(s)
  373. n -= 1
  374. }
  375. if n > 0 {
  376. error(s, "invalid char escape")
  377. }
  378. return ch
  379. }
  380. ch := advance(s)
  381. for ch != quote {
  382. if ch == '\n' || ch < 0 {
  383. error(s, "literal no terminated")
  384. return
  385. }
  386. if ch == '\\' {
  387. ch = advance(s)
  388. switch ch {
  389. case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
  390. ch = advance(s)
  391. case '0'..='7': ch = scan_digits(s, advance(s), 8, 3)
  392. case 'x': ch = scan_digits(s, advance(s), 16, 2)
  393. case 'u': ch = scan_digits(s, advance(s), 16, 4)
  394. case 'U': ch = scan_digits(s, advance(s), 16, 8)
  395. case:
  396. error(s, "invalid char escape")
  397. }
  398. } else {
  399. ch = advance(s)
  400. }
  401. n += 1
  402. }
  403. return
  404. }
  405. @(private)
  406. scan_raw_string :: proc(s: ^Scanner) {
  407. ch := advance(s)
  408. for ch != '`' {
  409. if ch < 0 {
  410. error(s, "literal not terminated")
  411. return
  412. }
  413. ch = advance(s)
  414. }
  415. }
  416. @(private)
  417. scan_char :: proc(s: ^Scanner) {
  418. if scan_string(s, '\'') != 1 {
  419. error(s, "invalid char literal")
  420. }
  421. }
  422. @(private)
  423. scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
  424. ch := ch
  425. if ch == '/' { // line comment
  426. ch = advance(s)
  427. for ch != '\n' && ch >= 0 {
  428. ch = advance(s)
  429. }
  430. return ch
  431. }
  432. // block /**/ comment
  433. ch = advance(s)
  434. for {
  435. if ch < 0 {
  436. error(s, "comment not terminated")
  437. break
  438. }
  439. ch0 := ch
  440. ch = advance(s)
  441. if ch0 == '*' && ch == '/' {
  442. return advance(s)
  443. }
  444. }
  445. return ch
  446. }
  447. // scan reads the next token or Unicode character from source and returns it
  448. // It only recognizes tokens for which the respective flag that is set
  449. // It returns EOF at the end of the source
  450. // It reports Scanner errors by calling s.error, if not nil; otherwise it will print the error message to os.stderr
  451. scan :: proc(s: ^Scanner) -> (tok: rune) {
  452. ch := peek(s)
  453. if ch == EOF {
  454. return ch
  455. }
  456. // reset position
  457. s.tok_pos = -1
  458. s.pos.line = 0
  459. redo: for {
  460. for ch < utf8.RUNE_SELF && (ch in s.whitespace) {
  461. ch = advance(s)
  462. }
  463. s.tok_pos = s.src_pos - s.prev_char_len
  464. s.pos.offset = s.tok_pos
  465. if s.column > 0 {
  466. s.pos.line = s.line
  467. s.pos.column = s.column
  468. } else {
  469. // previous character was newline
  470. s.pos.line = s.line - 1
  471. s.pos.column = s.prev_line_len
  472. }
  473. tok = ch
  474. if is_ident_rune(s, ch, 0) {
  475. if .Scan_Idents in s.flags {
  476. tok = Ident
  477. ch = scan_identifier(s)
  478. } else {
  479. ch = advance(s)
  480. }
  481. } else if is_decimal(ch) {
  482. if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
  483. tok, ch = scan_number(s, ch, false)
  484. } else {
  485. ch = advance(s)
  486. }
  487. } else {
  488. switch ch {
  489. case EOF:
  490. break
  491. case '"':
  492. if .Scan_Strings in s.flags {
  493. scan_string(s, '"')
  494. tok = String
  495. }
  496. ch = advance(s)
  497. case '\'':
  498. if .Scan_Chars in s.flags {
  499. scan_string(s, '\'')
  500. tok = Char
  501. }
  502. ch = advance(s)
  503. case '`':
  504. if .Scan_Raw_Strings in s.flags {
  505. scan_raw_string(s)
  506. tok = Raw_String
  507. }
  508. ch = advance(s)
  509. case '.':
  510. ch = advance(s)
  511. if is_decimal(ch) && .Scan_Floats in s.flags {
  512. tok, ch = scan_number(s, ch, true)
  513. }
  514. case '/':
  515. ch = advance(s)
  516. if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
  517. if .Skip_Comments in s.flags {
  518. s.tok_pos = -1
  519. ch = scan_comment(s, ch)
  520. continue redo
  521. }
  522. ch = scan_comment(s, ch)
  523. tok = Comment
  524. }
  525. case:
  526. ch = advance(s)
  527. }
  528. }
  529. break redo
  530. }
  531. s.tok_end = s.src_pos - s.prev_char_len
  532. s.ch = ch
  533. return tok
  534. }
  535. // position returns the position of the character immediately after the character or token returns by the previous call to next or scan
  536. // Use the Scanner's position field for the most recently scanned token position
  537. position :: proc(s: ^Scanner) -> Position {
  538. pos: Position
  539. pos.filename = s.pos.filename
  540. pos.offset = s.src_pos - s.prev_char_len
  541. switch {
  542. case s.column > 0:
  543. pos.line = s.line
  544. pos.column = s.column
  545. case s.prev_line_len > 0:
  546. pos.line = s.line-1
  547. pos.column = s.prev_line_len
  548. case:
  549. pos.line = 1
  550. pos.column = 1
  551. }
  552. return pos
  553. }
  554. // token_text returns the string of the most recently scanned token
  555. token_text :: proc(s: ^Scanner) -> string {
  556. if s.tok_pos < 0 {
  557. return ""
  558. }
  559. return string(s.src[s.tok_pos:s.tok_end])
  560. }
  561. // token_string returns a printable string for a token or Unicode character
  562. // By default, it uses the context.temp_allocator to produce the string
  563. token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
  564. context.allocator = allocator
  565. switch tok {
  566. case EOF: return strings.clone("EOF")
  567. case Ident: return strings.clone("Ident")
  568. case Int: return strings.clone("Int")
  569. case Float: return strings.clone("Float")
  570. case Char: return strings.clone("Char")
  571. case String: return strings.clone("String")
  572. case Raw_String: return strings.clone("Raw_String")
  573. case Comment: return strings.clone("Comment")
  574. }
  575. return fmt.aprintf("%q", tok)
  576. }