tokenizer.odin

package regex_tokenizer

/*
	(c) Copyright 2024 Feoramund <[email protected]>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:unicode/utf8"
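
// This package tokenizes regular-expression pattern strings into the Token
// stream consumed by the rest of core:text/regex.
//
// A minimal usage sketch (assuming the package is imported under its default
// name, `regex_tokenizer`, from "core:text/regex/tokenizer"):
//
//	t: Tokenizer
//	init(&t, `a(b|c)*`, {})
//	for tok := scan(&t); tok.kind != .EOF; tok = scan(&t) {
//		// tok.kind, tok.text, and tok.pos describe each lexical element.
//	}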

Token_Kind :: enum {
	Invalid,
	EOF,
	Rune,
	Wildcard,
	Alternate,
	Concatenate,
	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,
	Repeat_N,
	Optional,
	Optional_Non_Greedy,
	Rune_Class,
	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,
	Anchor_Start,
	Anchor_End,
	Word_Boundary,
	Non_Word_Boundary,
}
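
// Token is a single lexical element scanned from the pattern: its kind, the
// source text it covers (where meaningful), and its byte offset into `src`.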
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  int,
}
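
// Tokenizer holds the scanning state. `held_token` buffers a token whenever a
// Concatenate token is synthesized in its place, and `paren_depth` tracks
// group nesting so that `#` comments are only recognized at the top level.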
Tokenizer :: struct {
	flags:           common.Flags,
	src:             string,
	ch:              rune,
	offset:          int,
	read_offset:     int,
	last_token_kind: Token_Kind,
	held_token:      Token,
	error_state:     Error,
	paren_depth:     int,
}

Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
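
// init prepares the Tokenizer to scan `str` and advances to the first rune;
// any decoding error from that first advance is stored in `error_state`.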
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.src = str
	t.flags = flags
	t.error_state = advance_rune(t)
}
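
// peek_byte returns the byte at `offset` bytes past the read cursor without
// advancing, or 0 if that position lies beyond the end of the source.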
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}
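
// advance_rune decodes the next UTF-8 rune into `t.ch`, reporting NUL bytes,
// invalid encodings, and byte order marks appearing past the start of the
// source as errors. At the end of input, `t.ch` is set to -1.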
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.error_state != nil {
		return t.error_state
	}

	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		t.ch = -1
	}

	t.error_state = err
	return
}
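
// scan_class consumes the body of a character class up to the closing `]`,
// skipping over `\` escapes. It returns the text between the brackets, or
// false if the class is unterminated or a decoding error occurred.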
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)

		if t.ch == -1 || t.error_state != nil {
			return "", false
		}
		if t.ch == '\\' {
			advance_rune(t)
			continue
		}
		if t.ch == ']' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
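
// scan_repeat consumes the body of a `{...}` repetition up to the closing
// `}`, returning the text between the braces, or false if unterminated.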
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)

		if t.ch == -1 {
			return "", false
		}
		if t.ch == '}' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
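
// scan_non_greedy consumes a trailing `?`, if present, which marks the
// preceding repetition operator as non-greedy.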
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' {
		advance_rune(t)
		return true
	}
	return false
}
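
// scan_comment skips the rest of a `#` comment, handling UNIX (\n),
// Mac (\r), and Windows (\r\n) line endings.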
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)

		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
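
// scan_non_capture_group consumes the `?:` following an opening parenthesis,
// if present, which marks the group as non-capturing.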
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
		advance_rune(t)
		advance_rune(t)
		return true
	}
	return false
}
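
// scan returns the next Token in the pattern. Errors and invalid state are
// reported as `.Invalid` tokens, and end of input as `.EOF`. Concatenate
// tokens are synthesized between adjacent operands per the rules at the end
// of this procedure; when one is emitted, the token it precedes is saved in
// `held_token` and returned by the next call.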
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset

	defer {
		t.last_token_kind = token.kind
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}
		return popped
	}

	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }

		case '\\':
			advance_rune(t)
			if t.ch == -1 {
				return { .EOF, "", pos }
			}
			pos = t.offset

			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS  :: "0-9A-Z_a-z"

			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary
			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"
			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
			case:
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}

		case '.':
			kind = .Wildcard

		case '|': kind = .Alternate
		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional

		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				kind = .EOF
			}
		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				kind = .EOF
			}

		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1

		case '^': kind = .Anchor_Start
		case '$':
			kind = .Anchor_End

		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}

			if t.ch == '#' && t.paren_depth == 0 {
				scan_comment(t)
				continue ch_loop
			}

			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}

		break ch_loop
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	advance_rune(t)

	// The following set of rules dictates where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional, .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One, .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was one
			// of these.
			break
		case:
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}

	return { kind, lit, pos }
}