scanner.odin 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655
// package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes a string providing the source, which can then be tokenized through
// repeated calls to the scan procedure.
// For compatibility with existing tooling and languages, the NUL character is not allowed.
// If a UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
//
// By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
// A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
  9. package text_scanner
  10. import "core:fmt"
  11. import "core:strings"
  12. import "core:unicode"
  13. import "core:unicode/utf8"
  14. // Position represents a source position
  15. // A position is valid if line > 0
  16. Position :: struct {
  17. filename: string, // filename, if present
  18. offset: int, // byte offset, starting @ 0
  19. line: int, // line number, starting @ 1
  20. column: int, // column number, starting @ 1 (character count per line)
  21. }
  22. // position_is_valid reports where the position is valid
  23. position_is_valid :: proc(pos: Position) -> bool {
  24. return pos.line > 0;
  25. }
  26. position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
  27. s := pos.filename;
  28. if s == "" {
  29. s = "<input>";
  30. }
  31. context.allocator = allocator;
  32. if position_is_valid(pos) {
  33. return fmt.aprintf("%s(%d:%d)", s, pos.line, pos.column);
  34. } else {
  35. return strings.clone(s);
  36. }
  37. }
  38. EOF :: -1;
  39. Ident :: -2;
  40. Int :: -3;
  41. Float :: -4;
  42. Char :: -5;
  43. String :: -6;
  44. Raw_String :: -7;
  45. Comment :: -8;
  46. Scan_Flag :: enum u32 {
  47. Scan_Idents,
  48. Scan_Ints,
  49. Scan_C_Int_Prefixes,
  50. Scan_Floats, // Includes integers and hexadecimal floats
  51. Scan_Chars,
  52. Scan_Strings,
  53. Scan_Raw_Strings,
  54. Scan_Comments,
  55. Skip_Comments, // if set with .Scan_Comments, comments become white space
  56. }
  57. Scan_Flags :: distinct bit_set[Scan_Flag; u32];
  58. Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
  59. C_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
  60. // Only allows for ASCII whitespace
  61. Whitespace :: distinct bit_set['\x00'..<utf8.RUNE_SELF; u128];
  62. // Odin_Whitespace is the default value for the Scanner's whitespace field
  63. Odin_Whitespace :: Whitespace{'\t', '\n', '\r', ' '};
  64. C_Whitespace :: Whitespace{'\t', '\n', '\r', '\v', '\f', ' '};
  65. // Scanner allows for the reading of Unicode characters and tokens from a string
  66. Scanner :: struct {
  67. src: string,
  68. src_pos: int,
  69. src_end: int,
  70. tok_pos: int,
  71. tok_end: int,
  72. ch: rune,
  73. line: int,
  74. column: int,
  75. prev_line_len: int,
  76. prev_char_len: int,
  77. // error is called for each error encountered
  78. // If no error procedure is set, the error is reported to os.stderr
  79. error: proc(s: ^Scanner, msg: string),
  80. // error_count is incremented by one for each error encountered
  81. error_count: int,
  82. // flags controls which tokens are recognized
  83. // e.g. to recognize integers, set the .Scan_Ints flag
  84. // This field may be changed by the user at any time during scanning
  85. flags: Scan_Flags,
  86. // The whitespace field controls which characters are recognized as white space
  87. // This field may be changed by the user at any time during scanning
  88. whitespace: Whitespace,
  89. // is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
  90. // The valid characters must not conflict with the set of white space characters
  91. // If is_ident_rune is not set, regular Odin-like identifiers are accepted
  92. // This field may be changed by the user at any time during scanning
  93. is_ident_rune: proc(ch: rune, i: int) -> bool,
  94. // Start position of most recently scanned token (set by scan(s))
  95. // Call init or next invalidates the position
  96. pos: Position,
  97. }
  98. // init initializes a scanner with a new source and returns itself.
  99. // error_count is set to 0, flags is set to Odin_Like_Tokens, whitespace is set to Odin_Whitespace
  100. init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
  101. s^ = {};
  102. s.error_count = 0;
  103. s.src = src;
  104. s.pos.filename = filename;
  105. s.tok_pos = -1;
  106. s.ch = -2; // no char read yet, not an EOF
  107. s.line = 1;
  108. s.flags = Odin_Like_Tokens;
  109. s.whitespace = Odin_Whitespace;
  110. return s;
  111. }
  112. @(private)
  113. advance :: proc(s: ^Scanner) -> rune {
  114. if s.src_pos >= len(s.src) {
  115. s.prev_char_len = 0;
  116. return EOF;
  117. }
  118. ch, width := rune(s.src[s.src_pos]), 1;
  119. if ch >= utf8.RUNE_SELF {
  120. ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:]);
  121. if ch == utf8.RUNE_ERROR && width == 1 {
  122. s.src_pos += width;
  123. s.prev_char_len = width;
  124. s.column += 1;
  125. error(s, "invalid UTF-8 encoding");
  126. return ch;
  127. }
  128. }
  129. s.src_pos += width;
  130. s.prev_char_len = width;
  131. s.column += 1;
  132. switch ch {
  133. case 0:
  134. error(s, "invalid character NUL");
  135. case '\n':
  136. s.line += 1;
  137. s.prev_line_len = s.column;
  138. s.column = 0;
  139. }
  140. return ch;
  141. }
  142. // next reads and returns the next Unicode character. It returns EOF at the end of the source.
  143. // next does not update the Scanner's pos field. Use 'position(s)' to get the current position
  144. next :: proc(s: ^Scanner) -> rune {
  145. s.tok_pos = -1;
  146. s.pos.line = 0;
  147. ch := peek(s);
  148. if ch != EOF {
  149. s.ch = advance(s);
  150. }
  151. return ch;
  152. }
  153. // peek returns the next Unicode character in the source without advancing the scanner
  154. // It returns EOF if the scanner's position is at least the last character of the source
  155. // if n > 0, it call next n times and return the nth Unicode character and then restore the Scanner's state
  156. peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
  157. if s.ch == -2 {
  158. s.ch = advance(s);
  159. if s.ch == '\ufeff' { // Ignore BOM
  160. s.ch = advance(s);
  161. }
  162. }
  163. ch = s.ch;
  164. if n > 0 {
  165. prev_s := s^;
  166. for in 0..<n {
  167. next(s);
  168. }
  169. ch = s.ch;
  170. s^ = prev_s;
  171. }
  172. return ch;
  173. }
  174. // peek returns the next token in the source
  175. // It returns EOF if the scanner's position is at least the last character of the source
  176. // if n > 0, it call next n times and return the nth token and then restore the Scanner's state
  177. peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
  178. assert(n >= 0);
  179. prev_s := s^;
  180. for in 0..<n {
  181. tok = scan(s);
  182. }
  183. tok = scan(s);
  184. s^ = prev_s;
  185. return;
  186. }
  187. error :: proc(s: ^Scanner, msg: string) {
  188. s.error_count += 1;
  189. if s.error != nil {
  190. s.error(s, msg);
  191. return;
  192. }
  193. p := s.pos;
  194. if !position_is_valid(p) {
  195. p = position(s);
  196. }
  197. s := p.filename;
  198. if s == "" {
  199. s = "<input>";
  200. }
  201. if position_is_valid(p) {
  202. fmt.eprintf("%s(%d:%d): %s\n", s, p.line, p.column, msg);
  203. } else {
  204. fmt.eprintf("%s: %s\n", s, msg);
  205. }
  206. }
  207. errorf :: proc(s: ^Scanner, format: string, args: ..any) {
  208. error(s, fmt.tprintf(format, ..args));
  209. }
  210. @(private)
  211. is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
  212. if s.is_ident_rune != nil {
  213. return s.is_ident_rune(ch, i);
  214. }
  215. return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0;
  216. }
  217. @(private)
  218. scan_identifier :: proc(s: ^Scanner) -> rune {
  219. ch := advance(s);
  220. for i := 1; is_ident_rune(s, ch, i); i += 1 {
  221. ch = advance(s);
  222. }
  223. return ch;
  224. }
  225. @(private) lower :: proc(ch: rune) -> rune { return ('a' - 'A') | ch; }
  226. @(private) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9'; }
  227. @(private) is_hex :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f'; }
  228. @(private)
  229. scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
  230. lit_name :: proc(prefix: rune) -> string {
  231. switch prefix {
  232. case 'b': return "binary literal";
  233. case 'o': return "octal literal";
  234. case 'z': return "dozenal literal";
  235. case 'x': return "hexadecimal literal";
  236. }
  237. return "decimal literal";
  238. }
  239. digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
  240. ch = ch0;
  241. if base <= 10 {
  242. max := rune('0' + base);
  243. for is_decimal(ch) || ch == '_' {
  244. ds := 1;
  245. if ch == '_' {
  246. ds = 2;
  247. } else if ch >= max && invalid^ == 0 {
  248. invalid^ = ch;
  249. }
  250. digsep |= ds;
  251. ch = advance(s);
  252. }
  253. } else {
  254. for is_hex(ch) || ch == '_' {
  255. ds := 1;
  256. if ch == '_' {
  257. ds = 2;
  258. }
  259. digsep |= ds;
  260. ch = advance(s);
  261. }
  262. }
  263. return;
  264. }
  265. ch, seen_dot := ch, seen_dot;
  266. base := 10;
  267. prefix := rune(0);
  268. digsep := 0;
  269. invalid := rune(0);
  270. tok: rune;
  271. ds: int;
  272. if !seen_dot {
  273. tok = Int;
  274. if ch == '0' {
  275. ch = advance(s);
  276. p := lower(ch);
  277. if .Scan_C_Int_Prefixes in s.flags {
  278. switch p {
  279. case 'b':
  280. ch = advance(s);
  281. base, prefix = 2, 'b';
  282. case 'x':
  283. ch = advance(s);
  284. base, prefix = 16, 'x';
  285. case:
  286. base, prefix = 8, 'o';
  287. digsep = 1; // Leading zero
  288. }
  289. } else {
  290. switch p {
  291. case 'b':
  292. ch = advance(s);
  293. base, prefix = 2, 'b';
  294. case 'o':
  295. ch = advance(s);
  296. base, prefix = 8, 'o';
  297. case 'd':
  298. ch = advance(s);
  299. base, prefix = 10, 'd';
  300. case 'z':
  301. ch = advance(s);
  302. base, prefix = 12, 'z';
  303. case 'h':
  304. tok = Float;
  305. fallthrough;
  306. case 'x':
  307. ch = advance(s);
  308. base, prefix = 16, 'x';
  309. case:
  310. digsep = 1; // Leading zero
  311. }
  312. }
  313. }
  314. ch, ds = digits(s, ch, base, &invalid);
  315. digsep |= ds;
  316. if ch == '.' && .Scan_Floats in s.flags {
  317. ch = advance(s);
  318. seen_dot = true;
  319. }
  320. }
  321. if seen_dot {
  322. tok = Float;
  323. if prefix != 0 && prefix != 'x' {
  324. errorf(s, "invalid radix point in %s", lit_name(prefix));
  325. }
  326. ch, ds = digits(s, ch, base, &invalid);
  327. digsep |= ds;
  328. }
  329. if digsep&1 == 0 {
  330. errorf(s, "%s has no digits", lit_name(prefix));
  331. }
  332. if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
  333. switch {
  334. case e == 'e' && prefix != 0:
  335. errorf(s, "%q exponent requires decimal mantissa", ch);
  336. case e == 'p' && prefix != 'x':
  337. errorf(s, "%q exponent requires hexadecimal mantissa", ch);
  338. }
  339. ch = advance(s);
  340. tok = Float;
  341. if ch == '+' || ch == '-' {
  342. ch = advance(s);
  343. }
  344. ch, ds = digits(s, ch, 10, nil);
  345. digsep |= ds;
  346. if ds&1 == 0 {
  347. error(s, "exponent has no digits");
  348. }
  349. } else if prefix == 'x' && tok == Float {
  350. error(s, "hexadecimal mantissa requires a 'p' exponent");
  351. }
  352. if tok == Int && invalid != 0 {
  353. errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix));
  354. }
  355. if digsep&2 != 0 {
  356. s.tok_end = s.src_pos - s.prev_char_len;
  357. }
  358. return tok, ch;
  359. }
  360. @(private)
  361. scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
  362. digit_val :: proc(ch: rune) -> int {
  363. switch v := lower(ch); v {
  364. case '0'..'9': return int(v - '0');
  365. case 'a'..'z': return int(v - 'a');
  366. }
  367. return 16;
  368. }
  369. scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
  370. ch, n := ch, n;
  371. for n > 0 && digit_val(ch) < base {
  372. ch = advance(s);
  373. n -= 1;
  374. }
  375. if n > 0 {
  376. error(s, "invalid char escape");
  377. }
  378. return ch;
  379. }
  380. ch := advance(s);
  381. for ch != quote {
  382. if ch == '\n' || ch < 0 {
  383. error(s, "literal no terminated");
  384. return;
  385. }
  386. if ch == '\\' {
  387. ch = advance(s);
  388. switch ch {
  389. case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
  390. ch = advance(s);
  391. case '0'..'7': ch = scan_digits(s, advance(s), 8, 3);
  392. case 'x': ch = scan_digits(s, advance(s), 16, 2);
  393. case 'u': ch = scan_digits(s, advance(s), 16, 4);
  394. case 'U': ch = scan_digits(s, advance(s), 16, 8);
  395. case:
  396. error(s, "invalid char escape");
  397. }
  398. } else {
  399. ch = advance(s);
  400. }
  401. n += 1;
  402. }
  403. return;
  404. }
  405. @(private)
  406. scan_raw_string :: proc(s: ^Scanner) {
  407. ch := advance(s);
  408. for ch != '`' {
  409. if ch < 0 {
  410. error(s, "literal not terminated");
  411. return;
  412. }
  413. ch = advance(s);
  414. }
  415. }
  416. @(private)
  417. scan_char :: proc(s: ^Scanner) {
  418. if scan_string(s, '\'') != 1 {
  419. error(s, "invalid char literal");
  420. }
  421. }
  422. @(private)
  423. scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
  424. ch := ch;
  425. if ch == '/' { // line comment
  426. ch = advance(s);
  427. for ch != '\n' && ch >= 0 {
  428. ch = advance(s);
  429. }
  430. return ch;
  431. }
  432. // block /**/ comment
  433. ch = advance(s);
  434. for {
  435. if ch < 0 {
  436. error(s, "comment not terminated");
  437. break;
  438. }
  439. ch0 := ch;
  440. ch = advance(s);
  441. if ch0 == '*' && ch == '/' {
  442. return advance(s);
  443. }
  444. }
  445. return ch;
  446. }
  447. // scan reads the next token or Unicode character from source and returns it
  448. // It only recognizes tokens for which the respective flag that is set
  449. // It returns EOF at the end of the source
  450. // It reports Scanner errors by calling s.error, if not nil; otherwise it will print the error message to os.stderr
  451. scan :: proc(s: ^Scanner) -> (tok: rune) {
  452. ch := peek(s);
  453. if ch == EOF {
  454. return ch;
  455. }
  456. // reset position
  457. s.tok_pos = -1;
  458. s.pos.line = 0;
  459. redo: for {
  460. for (ch < utf8.RUNE_SELF && ch in s.whitespace) {
  461. ch = advance(s);
  462. }
  463. s.tok_pos = s.src_pos - s.prev_char_len;
  464. s.pos.offset = s.tok_pos;
  465. if s.column > 0 {
  466. s.pos.line = s.line;
  467. s.pos.column = s.column;
  468. } else {
  469. // previous character was newline
  470. s.pos.line = s.line - 1;
  471. s.pos.column = s.prev_line_len;
  472. }
  473. tok = ch;
  474. if is_ident_rune(s, ch, 0) {
  475. if .Scan_Idents in s.flags {
  476. tok = Ident;
  477. ch = scan_identifier(s);
  478. } else {
  479. ch = advance(s);
  480. }
  481. } else if is_decimal(ch) {
  482. if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
  483. tok, ch = scan_number(s, ch, false);
  484. } else {
  485. ch = advance(s);
  486. }
  487. } else {
  488. switch ch {
  489. case EOF:
  490. break;
  491. case '"':
  492. if .Scan_Strings in s.flags {
  493. scan_string(s, '"');
  494. tok = String;
  495. }
  496. ch = advance(s);
  497. case '\'':
  498. if .Scan_Chars in s.flags {
  499. scan_string(s, '\'');
  500. tok = Char;
  501. }
  502. ch = advance(s);
  503. case '`':
  504. if .Scan_Raw_Strings in s.flags {
  505. scan_raw_string(s);
  506. tok = Raw_String;
  507. }
  508. ch = advance(s);
  509. case '.':
  510. ch = advance(s);
  511. if is_decimal(ch) && .Scan_Floats in s.flags {
  512. tok, ch = scan_number(s, ch, true);
  513. }
  514. case '/':
  515. ch = advance(s);
  516. if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
  517. if .Skip_Comments in s.flags {
  518. s.tok_pos = -1;
  519. ch = scan_comment(s, ch);
  520. continue redo;
  521. }
  522. ch = scan_comment(s, ch);
  523. tok = Comment;
  524. }
  525. case:
  526. ch = advance(s);
  527. }
  528. }
  529. break redo;
  530. }
  531. s.tok_end = s.src_pos - s.prev_char_len;
  532. s.ch = ch;
  533. return tok;
  534. }
  535. // position returns the position of the character immediately after the character or token returns by the previous call to next or scan
  536. // Use the Scanner's position field for the most recently scanned token position
  537. position :: proc(s: ^Scanner) -> Position {
  538. pos: Position;
  539. pos.filename = s.pos.filename;
  540. pos.offset = s.src_pos - s.prev_char_len;
  541. switch {
  542. case s.column > 0:
  543. pos.line = s.line;
  544. pos.column = s.column;
  545. case s.prev_line_len > 0:
  546. pos.line = s.line-1;
  547. pos.column = s.prev_line_len;
  548. case:
  549. pos.line = 1;
  550. pos.column = 1;
  551. }
  552. return pos;
  553. }
  554. // token_text returns the string of the most recently scanned token
  555. token_text :: proc(s: ^Scanner) -> string {
  556. if s.tok_pos < 0 {
  557. return "";
  558. }
  559. return string(s.src[s.tok_pos:s.tok_end]);
  560. }
  561. // token_string returns a printable string for a token or Unicode character
  562. // By default, it uses the context.temp_allocator to produce the string
  563. token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
  564. context.allocator = allocator;
  565. switch tok {
  566. case EOF: return strings.clone("EOF");
  567. case Ident: return strings.clone("Ident");
  568. case Int: return strings.clone("Int");
  569. case Float: return strings.clone("Float");
  570. case Char: return strings.clone("Char");
  571. case String: return strings.clone("String");
  572. case Raw_String: return strings.clone("Raw_String");
  573. case Comment: return strings.clone("Comment");
  574. }
  575. return fmt.aprintf("%q", tok);
  576. }