// parser.odin
  1. package regex_parser
  2. /*
  3. (c) Copyright 2024 Feoramund <[email protected]>.
  4. Made available under Odin's BSD-3 license.
  5. List of contributors:
  6. Feoramund: Initial implementation.
  7. */
  8. import "base:intrinsics"
  9. import "core:strconv"
  10. import "core:strings"
  11. import "core:text/regex/common"
  12. import "core:text/regex/tokenizer"
  13. import "core:unicode"
  14. import "core:unicode/utf8"
// Shorthand aliases for the tokenizer package's types.
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer

// An inclusive range of runes, such as the `a-z` inside `[a-z]`.
Rune_Class_Range :: struct {
	lower, upper: rune,
}
// The contents of a rune class: individual runes plus inclusive ranges.
Rune_Class_Data :: struct {
	runes: [dynamic]rune,
	ranges: [dynamic]Rune_Class_Range,
}
// A single literal rune.
Node_Rune :: struct {
	data: rune,
}
// A rune class such as `[a-z0-9]`; `negating` is set by a leading `^`.
Node_Rune_Class :: struct {
	negating: bool,
	using data: Rune_Class_Data,
}
// The wildcard metacharacter.
Node_Wildcard :: struct {}
// An alternation; either side may be nil for empty branches such as `a|`.
Node_Alternation :: struct {
	left, right: Node,
}
// A sequence of sub-expressions matched one after another.
Node_Concatenation :: struct {
	nodes: [dynamic]Node,
}
// Zero-or-more repetition, greedy.
Node_Repeat_Zero :: struct {
	inner: Node,
}
// Zero-or-more repetition, non-greedy.
Node_Repeat_Zero_Non_Greedy :: struct {
	inner: Node,
}
// One-or-more repetition, greedy.
Node_Repeat_One :: struct {
	inner: Node,
}
// One-or-more repetition, non-greedy.
Node_Repeat_One_Non_Greedy :: struct {
	inner: Node,
}
// Counted repetition `{N}`, `{N,}`, `{,M}`, or `{N,M}`.
// A bound of -1 means the bound was not specified in the pattern.
Node_Repeat_N :: struct {
	inner: Node,
	lower, upper: int,
}
// Zero-or-one match, greedy.
Node_Optional :: struct {
	inner: Node,
}
// Zero-or-one match, non-greedy.
Node_Optional_Non_Greedy :: struct {
	inner: Node,
}
// A parenthesized group. `capture` is true for `(...)`; a non-capturing
// group leaves it false and `capture_id` at zero.
Node_Group :: struct {
	inner: Node,
	capture_id: int,
	capture: bool,
}
// A positional anchor: start-of-text when `start` is true, otherwise end.
Node_Anchor :: struct {
	start: bool,
}
// A word-boundary assertion; `non_word` inverts it.
Node_Word_Boundary :: struct {
	non_word: bool,
}
Node_Match_All_And_Escape :: struct {}
// Any node of the regex parse tree. Every variant is a pointer, so a Node
// is nil-able and cheap to copy.
Node :: union {
	^Node_Rune,
	^Node_Rune_Class,
	^Node_Wildcard,
	^Node_Concatenation,
	^Node_Alternation,
	^Node_Repeat_Zero,
	^Node_Repeat_Zero_Non_Greedy,
	^Node_Repeat_One,
	^Node_Repeat_One_Non_Greedy,
	^Node_Repeat_N,
	^Node_Optional,
	^Node_Optional_Non_Greedy,
	^Node_Group,
	^Node_Anchor,
	^Node_Word_Boundary,
	// Optimized nodes (not created by the Parser):
	^Node_Match_All_And_Escape,
}
  92. left_binding_power :: proc(kind: Token_Kind) -> int {
  93. #partial switch kind {
  94. case .Alternate: return 1
  95. case .Concatenate: return 2
  96. case .Repeat_Zero, .Repeat_One,
  97. .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
  98. .Repeat_N: return 3
  99. case .Optional,
  100. .Optional_Non_Greedy: return 4
  101. case .Open_Paren,
  102. .Open_Paren_Non_Capture: return 9
  103. }
  104. return 0
  105. }
// A specific token kind was required at `pos` but something else was found.
Expected_Token :: struct {
	pos: int,
	kind: Token_Kind,
}
// A `{...}` repetition was malformed: an unparsable number, a zero count,
// or a lower bound greater than the upper bound.
Invalid_Repetition :: struct {
	pos: int,
}
// A token appeared in a position where it is not valid.
Invalid_Token :: struct {
	pos: int,
	kind: Token_Kind,
}
// The tokenizer produced an `.Invalid` token while scanning the pattern.
Invalid_Unicode :: struct {
	pos: int,
}
// The pattern opened more capture groups than `common.MAX_CAPTURE_GROUPS`.
Too_Many_Capture_Groups :: struct {
	pos: int,
}
// The pattern ended where more input was required.
Unexpected_EOF :: struct {
	pos: int,
}
// Any error the parser can produce; nil on success.
Error :: union {
	Expected_Token,
	Invalid_Repetition,
	Invalid_Token,
	Invalid_Unicode,
	Too_Many_Capture_Groups,
	Unexpected_EOF,
}
// Parser state: the compilation flags, the underlying tokenizer, a
// one-token lookahead, and a running count of opened capture groups.
Parser :: struct {
	flags: common.Flags,
	t: Tokenizer,
	cur_token: Token,
	groups: int,
}
  140. @require_results
  141. advance :: proc(p: ^Parser) -> Error {
  142. p.cur_token = tokenizer.scan(&p.t)
  143. if p.cur_token.kind == .Invalid {
  144. return Invalid_Unicode { pos = 0 }
  145. }
  146. return nil
  147. }
  148. expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
  149. if p.cur_token.kind == kind {
  150. advance(p) or_return
  151. return
  152. }
  153. return Expected_Token{
  154. pos = p.t.offset,
  155. kind = kind,
  156. }
  157. }
// Parses tokens that can begin an expression — the "null denotation" (nud)
// of the Pratt parser: literal runes, rune classes, wildcards, groups,
// anchors, word boundaries, and a `|` with an empty left-hand side.
//
// Returns the parsed sub-tree, or an error for tokens that cannot start an
// expression.
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Rune:
		// Pull the first (and only expected) rune out of the token text.
		r: rune
		for ru in token.text {
			r = ru
			break
		}
		assert(r != 0, "Parsed an empty Rune token.")
		if .Case_Insensitive in p.flags {
			lower := unicode.to_lower(r)
			upper := unicode.to_upper(r)
			if lower != upper {
				// A cased rune under case-insensitivity becomes a
				// two-rune class holding both case variants.
				node := new(Node_Rune_Class)
				append(&node.runes, lower)
				append(&node.runes, upper)
				return node, nil
			}
		}
		node := new(Node_Rune)
		node ^= { r }
		return node, nil
	case .Rune_Class:
		// An empty class parses to a nil node.
		if len(token.text) == 0 {
			return nil, nil
		}
		node := new(Node_Rune_Class)
		#no_bounds_check for i := 0; i < len(token.text); /**/ {
			r, size := utf8.decode_rune(token.text[i:])
			// A leading `^` marks the class as negating.
			if i == 0 && r == '^' {
				node.negating = true
				i += size
				continue
			}
			i += size
			assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
			if r == '\\' {
				// Escape sequence inside the class: expand shorthand
				// classes into their constituent runes and ranges.
				next_r, next_size := utf8.decode_rune(token.text[i:])
				i += next_size
				assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
				// @MetaCharacter
				// NOTE: These must be kept in sync with the tokenizer.
				switch next_r {
				case 'f': append(&node.runes, '\f')
				case 'n': append(&node.runes, '\n')
				case 'r': append(&node.runes, '\r')
				case 't': append(&node.runes, '\t')
				case 'd':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
				case 's':
					append(&node.runes, '\t')
					append(&node.runes, '\n')
					append(&node.runes, '\f')
					append(&node.runes, '\r')
					append(&node.runes, ' ')
				case 'w':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
					append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
					append(&node.runes, '_')
					append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
				case 'D':
					// Negated shorthands are the ranges on either side
					// of the positive set.
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
				case 'S':
					append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
					// \t and \n are adjacent.
					append(&node.runes, '\x0b') // Vertical Tab
					append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
					append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
				case 'W':
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
					append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
				case:
					// Any other escaped rune is taken literally.
					append(&node.runes, next_r)
				}
				continue
			}
			if r == '-' && len(node.runes) > 0 {
				// A `-` after at least one pushed rune starts a range;
				// the previously-pushed rune becomes the lower bound.
				next_r, next_size := utf8.decode_rune(token.text[i:])
				if next_size > 0 {
					last := pop(&node.runes)
					i += next_size
					append(&node.ranges, Rune_Class_Range{ last, next_r })
					continue
				}
			}
			append(&node.runes, r)
		}
		if .Case_Insensitive in p.flags {
			// These two loops cannot be in the form of `for x in y` because
			// they append to the data that they iterate over.
			length := len(node.runes)
			#no_bounds_check for i := 0; i < length; i += 1 {
				r := node.runes[i]
				lower := unicode.to_lower(r)
				upper := unicode.to_upper(r)
				if lower != upper {
					// Push whichever case variant is missing.
					if lower != r {
						append(&node.runes, lower)
					} else {
						append(&node.runes, upper)
					}
				}
			}
			length = len(node.ranges)
			#no_bounds_check for i := 0; i < length; i += 1 {
				range := &node.ranges[i]
				min_lower := unicode.to_lower(range.lower)
				max_lower := unicode.to_lower(range.upper)
				min_upper := unicode.to_upper(range.lower)
				max_upper := unicode.to_upper(range.upper)
				// Only when both bounds are cased: the original range
				// becomes lowercase and an uppercase copy is appended.
				if min_lower != min_upper && max_lower != max_upper {
					range.lower = min_lower
					range.upper = max_lower
					append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
				}
			}
		}
		result = node
	case .Wildcard:
		node := new(Node_Wildcard)
		result = node
	case .Open_Paren:
		// Because of the recursive nature of the token parser, we take the
		// group number first instead of afterwards, in order to construct
		// group matches from the outside in.
		p.groups += 1
		if p.groups == common.MAX_CAPTURE_GROUPS {
			return nil, Too_Many_Capture_Groups{ pos = token.pos }
		}
		this_group := p.groups
		node := new(Node_Group)
		node.capture = true
		node.capture_id = this_group
		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Open_Paren_Non_Capture:
		// Grouping without a capture id.
		node := new(Node_Group)
		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Close_Paren:
		// An unmatched `)` is treated as a literal rune.
		node := new(Node_Rune)
		node ^= { ')' }
		return node, nil
	case .Anchor_Start:
		node := new(Node_Anchor)
		node.start = true
		result = node
	case .Anchor_End:
		// `start` is left false: an end-of-text anchor.
		node := new(Node_Anchor)
		result = node
	case .Word_Boundary:
		node := new(Node_Word_Boundary)
		result = node
	case .Non_Word_Boundary:
		node := new(Node_Word_Boundary)
		node.non_word = true
		result = node
	case .Alternate:
		// A unary alternation with a left-side empty path, i.e. `|a`.
		right, right_err := parse_expression(p, left_binding_power(.Alternate))
		#partial switch specific in right_err {
		case Unexpected_EOF:
			// This token is a NOP, i.e. `|`.
			break
		case nil:
			break
		case:
			return nil, right_err
		}
		node := new(Node_Alternation)
		node.right = right
		result = node
	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }
	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}
	return
}
// Parses tokens that act on an already-parsed `left` expression — the
// "left denotation" (led) of the Pratt parser: alternation, concatenation,
// and the repetition/optional postfix operators.
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Alternate:
		if p.cur_token.kind == .Close_Paren {
			// `(a|)`
			// parse_expression will fail, so intervene here.
			node := new(Node_Alternation)
			node.left = left
			return node, nil
		}
		right, right_err := parse_expression(p, left_binding_power(.Alternate))
		#partial switch specific in right_err {
		case nil:
			break
		case Unexpected_EOF:
			// EOF is okay in an alternation; it's an edge case in the way of
			// expressing an optional such as `a|`.
			break
		case:
			return nil, right_err
		}
		node := new(Node_Alternation)
		node.left = left
		node.right = right
		result = node
	case .Concatenate:
		right := parse_expression(p, left_binding_power(.Concatenate)) or_return
		// There should be no need to check if right is Node_Concatenation, due
		// to how the parsing direction works.
		#partial switch specific in left {
		case ^Node_Concatenation:
			// Fold into an existing concatenation instead of nesting.
			append(&specific.nodes, right)
			result = specific
		case:
			node := new(Node_Concatenation)
			append(&node.nodes, left)
			append(&node.nodes, right)
			result = node
		}
	case .Repeat_Zero:
		node := new(Node_Repeat_Zero)
		node.inner = left
		result = node
	case .Repeat_Zero_Non_Greedy:
		node := new(Node_Repeat_Zero_Non_Greedy)
		node.inner = left
		result = node
	case .Repeat_One:
		node := new(Node_Repeat_One)
		node.inner = left
		result = node
	case .Repeat_One_Non_Greedy:
		node := new(Node_Repeat_One_Non_Greedy)
		node.inner = left
		result = node
	case .Repeat_N:
		// `token.text` holds the text between the repetition braces.
		// A bound of -1 records that the bound was not specified.
		node := new(Node_Repeat_N)
		node.inner = left
		comma := strings.index_byte(token.text, ',')
		switch comma {
		case -1: // {N}
			exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			// Repeating exactly zero times is rejected as meaningless.
			if exact == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)exact
			node.upper = cast(int)exact
		case 0: // {,M}
			upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = -1
			node.upper = cast(int)upper
		case len(token.text) - 1: // {N,}
			lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)lower
			node.upper = -1
		case: // {N,M}
			lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !lower_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
			if !upper_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			// Bounds must be correctly ordered and non-zero.
			if lower > upper {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)lower
			node.upper = cast(int)upper
		}
		result = node
	case .Optional:
		node := new(Node_Optional)
		node.inner = left
		result = node
	case .Optional_Non_Greedy:
		node := new(Node_Optional_Non_Greedy)
		node.inner = left
		result = node
	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }
	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}
	return
}
  464. parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
  465. token := p.cur_token
  466. advance(p) or_return
  467. left := null_denotation(p, token) or_return
  468. token = p.cur_token
  469. for rbp < left_binding_power(token.kind) {
  470. advance(p) or_return
  471. left = left_denotation(p, token, left) or_return
  472. token = p.cur_token
  473. }
  474. return left, nil
  475. }
  476. parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
  477. if len(str) == 0 {
  478. node := new(Node_Group)
  479. return node, nil
  480. }
  481. p: Parser
  482. p.flags = flags
  483. tokenizer.init(&p.t, str, flags)
  484. p.cur_token = tokenizer.scan(&p.t)
  485. if p.cur_token.kind == .Invalid {
  486. return nil, Invalid_Unicode { pos = 0 }
  487. }
  488. node := parse_expression(&p, 0) or_return
  489. result = node
  490. return
  491. }