package regex_tokenizer

/*
	(c) Copyright 2024 Feoramund <[email protected]>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:unicode/utf8"
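
// Token_Kind enumerates every kind of token the tokenizer can produce:
// literal runes, the wildcard, alternation and concatenation operators,
// repetition and optional operators (greedy and non-greedy), character
// classes, group parentheses, anchors, and word boundaries.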
Token_Kind :: enum {
	Invalid,
	EOF,
	Rune,
	Wildcard,
	Alternate,
	Concatenate,
	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,
	Repeat_N,
	Optional,
	Optional_Non_Greedy,
	Rune_Class,
	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,
	Anchor_Start,
	Anchor_End,
	Word_Boundary,
	Non_Word_Boundary,
}
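
// A Token pairs its kind with the matched text (when relevant) and the
// byte offset of the match within the pattern source.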
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  int,
}
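
// Tokenizer holds all scanning state: the pattern source, the current
// decoded rune and its byte offsets, the kind of the last token returned
// (used to decide where implicit Concatenate tokens go), a held-back
// token, any latched error, and the current parenthesis depth.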
Tokenizer :: struct {
	flags:           common.Flags,
	src:             string,
	ch:              rune,
	offset:          int,
	read_offset:     int,
	last_token_kind: Token_Kind,
	held_token:      Token,
	error_state:     Error,
	paren_depth:     int,
}
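
// Error covers the ways the tokenizer can reject its input: NUL bytes,
// invalid UTF-8, and a byte order mark anywhere past the first rune.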
Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
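
// init prepares the tokenizer to scan `str` under `flags`, pre-loading
// the first rune and latching any immediate decoding error.
//
// A minimal usage sketch (illustrative only, not part of this file):
//
//	t: Tokenizer
//	init(&t, `a+b*`, {})
//	for tok := scan(&t); tok.kind != .EOF; tok = scan(&t) {
//		// ... handle tok ...
//	}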
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.src = str
	t.flags = flags
	t.error_state = advance_rune(t)
}
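
// peek_byte returns the byte `offset` bytes past the read position
// without consuming it, or 0 if that would run past the end of the
// source.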
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}
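
// advance_rune decodes the next rune from the source into `t.ch` and
// advances the offsets. A previously latched error is returned again
// without advancing. At the end of the input, `t.ch` is set to -1.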
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.error_state != nil {
		return t.error_state
	}

	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		t.ch = -1
	}

	t.error_state = err
	return
}
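
// scan_class scans the body of a `[...]` character class, entered just
// after the opening `[`. Escaped characters are skipped over, so an
// escaped `]` does not terminate the class. Returns the text between
// the brackets, or false if the input ends or errors before the
// closing `]`.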
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)

		if t.ch == -1 || t.error_state != nil {
			return "", false
		}
		if t.ch == '\\' {
			advance_rune(t)
			continue
		}
		if t.ch == ']' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
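
// scan_repeat scans the body of a `{...}` repetition specifier, entered
// just after the opening `{`. Returns the text between the braces, or
// false if the input ends before the closing `}`.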
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)

		if t.ch == -1 {
			return "", false
		}
		if t.ch == '}' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
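
// scan_non_greedy consumes a trailing `?` after a repetition operator,
// reporting whether the operator is non-greedy.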
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' {
		advance_rune(t)
		return true
	}
	return false
}
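
// scan_comment consumes the rest of a `#` comment, up to and including
// the next line break (LF, CR, or CRLF), or to the end of the input.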
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)

		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
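
// scan_non_capture_group consumes the `?:` that marks a non-capturing
// group, reporting whether the group just opened is non-capturing.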
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
		advance_rune(t)
		advance_rune(t)
		return true
	}
	return false
}
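
// scan returns the next token from the pattern.
//
// Where the grammar calls for it, an implicit Concatenate token is
// returned first and the real token is held back for the next call. A
// latched error is reported once as an Invalid token, and the end of
// the input as EOF.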
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset

	defer {
		t.last_token_kind = token.kind
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}

		return popped
	}

	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }

		case '\\':
			advance_rune(t)
			if t.ch == -1 {
				return { .EOF, "", pos }
			}
			pos = t.offset

			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS  :: "0-9A-Z_a-z"

			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary
			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"
			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
			case:
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}

		case '.':
			kind = .Wildcard

		case '|': kind = .Alternate
		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional

		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				kind = .EOF
			}
		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				kind = .EOF
			}

		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1

		case '^': kind = .Anchor_Start
		case '$':
			kind = .Anchor_End

		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}

			if t.ch == '#' && t.paren_depth == 0 {
				scan_comment(t)
				continue ch_loop
			}

			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}

		break ch_loop
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	advance_rune(t)

	// The following set of rules dictates where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional, .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One, .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was
			// one of these.
			break
		case:
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}

	return { kind, lit, pos }
}