tokenizer.odin

// Tokenizes regular expressions.
package regex_tokenizer

/*
	(c) Copyright 2024 Feoramund <[email protected]>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:unicode/utf8"
Token_Kind :: enum {
	Invalid,
	EOF,
	Rune,
	Wildcard,
	Alternate,
	Concatenate,
	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,
	Repeat_N,
	Optional,
	Optional_Non_Greedy,
	Rune_Class,
	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,
	Anchor_Start,
	Anchor_End,
	Word_Boundary,
	Non_Word_Boundary,
}
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  int,
}
Tokenizer :: struct {
	flags:           common.Flags,
	src:             string,
	ch:              rune,
	offset:          int,
	read_offset:     int,
	last_token_kind: Token_Kind,
	held_token:      Token,
	error_state:     Error,
	paren_depth:     int,
}
Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.src = str
	t.flags = flags
	t.error_state = advance_rune(t)
}
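
/*
	Example usage (a minimal sketch; the driver loop below is illustrative
	and not part of this file):

		t: Tokenizer
		init(&t, "a+b", {})
		for {
			token := scan(&t)
			if token.kind == .EOF {
				break
			}
			// For "a+b", successive calls yield:
			//   Rune("a"), Repeat_One, Concatenate, Rune("b")
		}
*/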
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.error_state != nil {
		return t.error_state
	}

	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		t.ch = -1
	}

	t.error_state = err
	return
}
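
// Scans a `[...]` character class, starting just past the opening `[`.
// Escaped runes are skipped over, so a class may contain an escaped `]`.
// For example, scanning "[a-z]" returns "a-z", and "[a\]b]" returns `a\]b`.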
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset
	for {
		advance_rune(t)
		if t.ch == -1 || t.error_state != nil {
			return "", false
		}
		if t.ch == '\\' {
			advance_rune(t)
			continue
		}
		if t.ch == ']' {
			return t.src[start:t.offset], true
		}
	}
	unreachable()
}
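
// Scans a `{...}` repetition interval, starting just past the opening `{`.
// For example, scanning "{1,3}" returns "1,3".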
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset
	for {
		advance_rune(t)
		if t.ch == -1 {
			return "", false
		}
		if t.ch == '}' {
			return t.src[start:t.offset], true
		}
	}
	unreachable()
}
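
// Consumes a trailing `?` if present, marking the preceding repetition
// operator as non-greedy. For example, the `*?` in "a*?" tokenizes as
// Repeat_Zero_Non_Greedy instead of Repeat_Zero.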
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' {
		advance_rune(t)
		return true
	}
	return false
}
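
// Skips to the end of a `#` comment, consuming the newline that terminates
// it, whether UNIX (\n), Mac (\r), or Windows (\r\n). For example,
// "a#note\nb" tokenizes the same as "ab".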
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)
		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
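
// Consumes the `?:` that begins a non-capturing group, if present.
// For example, the `(` in "(?:ab)" tokenizes as Open_Paren_Non_Capture.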
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
		advance_rune(t)
		advance_rune(t)
		return true
	}
	return false
}
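
// Returns the next token from the pattern. Concatenate tokens are
// synthesized between adjacent operands: when one is due, the real token is
// parked in `held_token` and a Concatenate is returned first, so a pattern
// such as "ab" yields Rune("a"), Concatenate, Rune("b"), EOF.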
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset

	defer {
		t.last_token_kind = token.kind
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}
		return popped
	}

	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }
		case '\\':
			advance_rune(t)
			if t.ch == -1 {
				return { .EOF, "", pos }
			}
			pos = t.offset

			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS  :: "0-9A-Z_a-z"

			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary
			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"
			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
			case:
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}
		case '.': kind = .Wildcard
		case '|': kind = .Alternate
		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy  if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy    if scan_non_greedy(t) else .Optional
		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				kind = .EOF
			}
		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				kind = .EOF
			}
		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1
		case '^': kind = .Anchor_Start
		case '$': kind = .Anchor_End
		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}
			if t.ch == '#' && t.paren_depth == 0 {
				scan_comment(t)
				continue ch_loop
			}
			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}

		break ch_loop
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	advance_rune(t)

	// The following set of rules dictate where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional, .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One, .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was one
			// of these.
			break
		case:
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}

	return { kind, lit, pos }
}
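
/*
	As an illustration of the concatenation rules above, scanning the
	pattern "(a|b)*c" produces the token stream below; only the juncture
	between `*` and `c` receives an implicit Concatenate:

		Open_Paren, Rune("a"), Alternate, Rune("b"), Close_Paren,
		Repeat_Zero, Concatenate, Rune("c"), EOF
*/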