strlib.odin 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959
  1. package text_match
  2. import "base:runtime"
  3. import "core:unicode"
  4. import "core:unicode/utf8"
  5. import "core:strings"
  6. MAX_CAPTURES :: 32
  7. Capture :: struct {
  8. init: int,
  9. len: int,
  10. }
  11. Match :: struct {
  12. byte_start, byte_end: int,
  13. }
  14. Error :: enum {
  15. OK,
  16. OOB,
  17. Invalid_Capture_Index,
  18. Invalid_Pattern_Capture,
  19. Unfinished_Capture,
  20. Malformed_Pattern,
  21. Rune_Error,
  22. Match_Invalid,
  23. }
  24. L_ESC :: '%'
  25. CAP_POSITION :: -2
  26. CAP_UNFINISHED :: -1
  27. INVALID :: -1
  28. Match_State :: struct {
  29. src: string,
  30. pattern: string,
  31. level: int,
  32. capture: [MAX_CAPTURES]Capture,
  33. }
  34. match_class :: proc(c: rune, cl: rune) -> (res: bool) {
  35. switch unicode.to_lower(cl) {
  36. case 'a': res = is_alpha(c)
  37. case 'c': res = is_cntrl(c)
  38. case 'd': res = is_digit(c)
  39. case 'g': res = is_graph(c)
  40. case 'l': res = is_lower(c)
  41. case 'p': res = is_punct(c)
  42. case 's': res = is_space(c)
  43. case 'u': res = is_upper(c)
  44. case 'w': res = is_alnum(c)
  45. case 'x': res = is_xdigit(c)
  46. case: return cl == c
  47. }
  48. return is_lower(cl) ? res : !res
  49. }
  50. is_alpha :: unicode.is_alpha
  51. is_digit :: unicode.is_digit
  52. is_lower :: unicode.is_lower
  53. is_upper :: unicode.is_upper
  54. is_punct :: unicode.is_punct
  55. is_space :: unicode.is_space
  56. is_cntrl :: unicode.is_control
  57. is_alnum :: proc(c: rune) -> bool {
  58. return unicode.is_alpha(c) || unicode.is_digit(c)
  59. }
  60. is_graph :: proc(c: rune) -> bool {
  61. return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c)
  62. }
  63. is_xdigit :: proc(c: rune) -> bool {
  64. return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c)
  65. }
  66. // find the first utf8 charater and its size, return an error if the character is an error
  67. utf8_peek :: proc(bytes: string) -> (c: rune, size: int, err: Error) {
  68. c, size = utf8.decode_rune_in_string(bytes)
  69. if c == utf8.RUNE_ERROR {
  70. err = .Rune_Error
  71. }
  72. return
  73. }
  74. // find the first utf8 charater and its size and advance the index
  75. // return an error if the character is an error
  76. utf8_advance :: proc(bytes: string, index: ^int) -> (c: rune, err: Error) {
  77. size: int
  78. c, size = utf8.decode_rune_in_string(bytes[index^:])
  79. if c == utf8.RUNE_ERROR {
  80. err = .Rune_Error
  81. }
  82. index^ += size
  83. return
  84. }
  85. // continuation byte?
  86. is_cont :: proc(b: byte) -> bool {
  87. return b & 0xc0 == 0x80
  88. }
  89. utf8_prev :: proc(bytes: string, a, b: int) -> int {
  90. b := b
  91. for a < b && is_cont(bytes[b - 1]) {
  92. b -= 1
  93. }
  94. return a < b ? b - 1 : a
  95. }
  96. utf8_next :: proc(bytes: string, a: int) -> int {
  97. a := a
  98. b := len(bytes)
  99. for a < b - 1 && is_cont(bytes[a + 1]) {
  100. a += 1
  101. }
  102. return a < b ? a + 1 : b
  103. }
  104. check_capture :: proc(ms: ^Match_State, l: rune) -> (int, Error) {
  105. l := int(l - '1')
  106. if l < 0 || l >= ms.level || ms.capture[l].len == CAP_UNFINISHED {
  107. return 0, .Invalid_Capture_Index
  108. }
  109. return l, .OK
  110. }
  111. capture_to_close :: proc(ms: ^Match_State) -> (int, Error) {
  112. level := ms.level - 1
  113. for level >= 0 {
  114. if ms.capture[level].len == CAP_UNFINISHED {
  115. return level, .OK
  116. }
  117. level -= 1
  118. }
  119. return 0, .Invalid_Pattern_Capture
  120. }
  121. class_end :: proc(ms: ^Match_State, p: int) -> (step: int, err: Error) {
  122. step = p
  123. ch := utf8_advance(ms.pattern, &step) or_return
  124. switch ch {
  125. case L_ESC:
  126. if step == len(ms.pattern) {
  127. err = .Malformed_Pattern
  128. return
  129. }
  130. utf8_advance(ms.pattern, &step) or_return
  131. case '[':
  132. // fine with step by 1
  133. if step + 1 < len(ms.pattern) && ms.pattern[step] == '^' {
  134. step += 1
  135. }
  136. // run till end is reached
  137. for {
  138. if step == len(ms.pattern) {
  139. err = .Malformed_Pattern
  140. return
  141. }
  142. if ms.pattern[step] == ']' {
  143. break
  144. }
  145. // dont care about utf8 here
  146. step += 1
  147. if step < len(ms.pattern) && ms.pattern[step] == L_ESC {
  148. // skip escapes like '%'
  149. step += 1
  150. }
  151. }
  152. // advance last time
  153. step += 1
  154. }
  155. return
  156. }
  157. match_bracket_class :: proc(ms: ^Match_State, c: rune, p, ec: int) -> (sig: bool, err: Error) {
  158. sig = true
  159. p := p
  160. if ms.pattern[p + 1] == '^' {
  161. p += 1
  162. sig = false
  163. }
  164. // while inside of class range
  165. for p < ec {
  166. char := utf8_advance(ms.pattern, &p) or_return
  167. // e.g. %a
  168. if char == L_ESC {
  169. next := utf8_advance(ms.pattern, &p) or_return
  170. if match_class(c, next) {
  171. return
  172. }
  173. } else {
  174. next, next_size := utf8_peek(ms.pattern[p:]) or_return
  175. // TODO test case for [a-???] where ??? is missing
  176. if next == '-' && p + next_size < len(ms.pattern) {
  177. // advance 2 codepoints
  178. p += next_size
  179. last := utf8_advance(ms.pattern, &p) or_return
  180. if char <= c && c <= last {
  181. return
  182. }
  183. } else if char == c {
  184. return
  185. }
  186. }
  187. }
  188. sig = !sig
  189. return
  190. }
  191. single_match :: proc(ms: ^Match_State, s, p, ep: int) -> (matched: bool, schar_size: int, err: Error) {
  192. if s >= len(ms.src) {
  193. return
  194. }
  195. pchar, psize := utf8_peek(ms.pattern[p:]) or_return
  196. schar, ssize := utf8_peek(ms.src[s:]) or_return
  197. schar_size = ssize
  198. switch pchar {
  199. case '.': matched = true
  200. case L_ESC:
  201. pchar_next, _ := utf8_peek(ms.pattern[p + psize:]) or_return
  202. matched = match_class(schar, pchar_next)
  203. case '[': matched = match_bracket_class(ms, schar, p, ep - 1) or_return
  204. case: matched = schar == pchar
  205. }
  206. return
  207. }
  208. match_balance :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  209. if p >= len(ms.pattern) - 1 {
  210. return INVALID, .Invalid_Pattern_Capture
  211. }
  212. schar, ssize := utf8_peek(ms.src[s:]) or_return
  213. pchar, psize := utf8_peek(ms.pattern[p:]) or_return
  214. // skip until the src and pattern match
  215. if schar != pchar {
  216. return INVALID, .OK
  217. }
  218. cont := 1
  219. s := s
  220. s += ssize
  221. begin := pchar
  222. end, _ := utf8_peek(ms.pattern[p + psize:]) or_return
  223. for s < len(ms.src) {
  224. ch := utf8_advance(ms.src, &s) or_return
  225. switch ch{
  226. case end:
  227. cont -= 1
  228. if cont == 0 {
  229. return s, .OK
  230. }
  231. case begin:
  232. cont += 1
  233. }
  234. }
  235. return INVALID, .OK
  236. }
  237. max_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) {
  238. m := s
  239. // count up matches
  240. for {
  241. matched, size := single_match(ms, m, p, ep) or_return
  242. if !matched {
  243. break
  244. }
  245. m += size
  246. }
  247. for s <= m {
  248. result := match(ms, m, ep + 1) or_return
  249. if result != INVALID {
  250. return result, .OK
  251. }
  252. if s == m {
  253. break
  254. }
  255. m = utf8_prev(ms.src, s, m)
  256. }
  257. return INVALID, .OK
  258. }
  259. min_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) {
  260. s := s
  261. for {
  262. result := match(ms, s, ep + 1) or_return
  263. if result != INVALID {
  264. return result, .OK
  265. } else {
  266. // TODO receive next step maybe?
  267. matched, rune_size := single_match(ms, s, p, ep) or_return
  268. if matched {
  269. s += rune_size
  270. } else {
  271. return INVALID, .OK
  272. }
  273. }
  274. }
  275. }
  276. start_capture :: proc(ms: ^Match_State, s, p, what: int) -> (res: int, err: Error) {
  277. level := ms.level
  278. ms.capture[level].init = s
  279. ms.capture[level].len = what
  280. ms.level += 1
  281. res = match(ms, s, p) or_return
  282. if res == INVALID {
  283. ms.level -= 1
  284. }
  285. return
  286. }
  287. end_capture :: proc(ms: ^Match_State, s, p: int) -> (res: int, err: Error) {
  288. l := capture_to_close(ms) or_return
  289. // TODO double check, could do string as int index
  290. ms.capture[l].len = s - ms.capture[l].init
  291. res = match(ms, s, p) or_return
  292. if res == INVALID {
  293. ms.capture[l].len = CAP_UNFINISHED
  294. }
  295. return
  296. }
  297. match_capture :: proc(ms: ^Match_State, s: int, char: rune) -> (res: int, err: Error) {
  298. index := check_capture(ms, char) or_return
  299. length := ms.capture[index].len
  300. if len(ms.src) - s >= length {
  301. return s + length, .OK
  302. }
  303. return INVALID, .OK
  304. }
  305. match :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  306. s := s
  307. p := p
  308. if p == len(ms.pattern) {
  309. return s, .OK
  310. }
  311. // NOTE we can walk by ascii steps if we know the characters are ascii
  312. char, _ := utf8_peek(ms.pattern[p:]) or_return
  313. switch char {
  314. case '(':
  315. if p + 1 < len(ms.pattern) && ms.pattern[p + 1] == ')' {
  316. s = start_capture(ms, s, p + 2, CAP_POSITION) or_return
  317. } else {
  318. s = start_capture(ms, s, p + 1, CAP_UNFINISHED) or_return
  319. }
  320. case ')':
  321. s = end_capture(ms, s, p + 1) or_return
  322. case '$':
  323. if p + 1 != len(ms.pattern) {
  324. return match_default(ms, s, p)
  325. }
  326. if len(ms.src) != s {
  327. s = INVALID
  328. }
  329. case L_ESC:
  330. // stop short patterns like "%" only
  331. if p + 1 >= len(ms.pattern) {
  332. err = .OOB
  333. return
  334. }
  335. switch ms.pattern[p + 1] {
  336. // balanced string
  337. case 'b':
  338. s = match_balance(ms, s, p + 2) or_return
  339. if s != INVALID {
  340. // eg after %b()
  341. return match(ms, s, p + 4)
  342. }
  343. // frontier
  344. case 'f':
  345. p += 2
  346. if ms.pattern[p] != '[' {
  347. return INVALID, .Invalid_Pattern_Capture
  348. }
  349. ep := class_end(ms, p) or_return
  350. previous, current: rune
  351. // get previous
  352. if s != 0 {
  353. temp := utf8_prev(ms.src, 0, s)
  354. previous, _ = utf8_peek(ms.src[temp:]) or_return
  355. }
  356. // get current
  357. if s != len(ms.src) {
  358. current, _ = utf8_peek(ms.src[s:]) or_return
  359. }
  360. m1 := match_bracket_class(ms, previous, p, ep - 1) or_return
  361. m2 := match_bracket_class(ms, current, p, ep - 1) or_return
  362. if !m1 && m2 {
  363. return match(ms, s, ep)
  364. }
  365. s = INVALID
  366. // capture group
  367. case '0'..<'9':
  368. s = match_capture(ms, s, rune(ms.pattern[p + 1])) or_return
  369. if s != INVALID {
  370. return match(ms, s, p + 2)
  371. }
  372. case: return match_default(ms, s, p)
  373. }
  374. case:
  375. return match_default(ms, s, p)
  376. }
  377. return s, .OK
  378. }
  379. match_default :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  380. s := s
  381. ep := class_end(ms, p) or_return
  382. single_matched, ssize := single_match(ms, s, p, ep) or_return
  383. if !single_matched {
  384. epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0
  385. switch epc {
  386. case '*', '?', '-': return match(ms, s, ep + 1)
  387. case: s = INVALID
  388. }
  389. } else {
  390. epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0
  391. switch epc {
  392. case '?':
  393. result := match(ms, s + ssize, ep + 1) or_return
  394. if result != INVALID {
  395. s = result
  396. } else {
  397. return match(ms, s, ep + 1)
  398. }
  399. case '+': s = max_expand(ms, s + ssize, p, ep) or_return
  400. case '*': s = max_expand(ms, s, p, ep) or_return
  401. case '-': s = min_expand(ms, s, p, ep) or_return
  402. case: return match(ms, s + ssize, ep)
  403. }
  404. }
  405. return s, .OK
  406. }
  407. push_onecapture :: proc(ms: ^Match_State, i: int, s: int, e: int, matches: []Match) -> (err: Error) {
  408. if i >= ms.level {
  409. if i == 0 {
  410. matches[0] = { 0, e - s }
  411. } else {
  412. err = .Invalid_Capture_Index
  413. }
  414. } else {
  415. init := ms.capture[i].init
  416. length := ms.capture[i].len
  417. switch length {
  418. case CAP_UNFINISHED: err = .Unfinished_Capture
  419. case CAP_POSITION: matches[i] = { init, init + 1 }
  420. case: matches[i] = { init, init + length }
  421. }
  422. }
  423. return
  424. }
  425. push_captures :: proc(
  426. ms: ^Match_State,
  427. s: int,
  428. e: int,
  429. matches: []Match,
  430. ) -> (nlevels: int, err: Error) {
  431. nlevels = 1 if ms.level == 0 && s != -1 else ms.level
  432. for i in 0..<nlevels {
  433. push_onecapture(ms, i, s, e, matches) or_return
  434. }
  435. return
  436. }
  437. // SPECIALS := "^$*+?.([%-"
  438. // all special characters inside a small ascii array
  439. SPECIALS_TABLE := [256]bool {
  440. '^' = true,
  441. '$' = true,
  442. '*' = true,
  443. '+' = true,
  444. '?' = true,
  445. '.' = true,
  446. '(' = true,
  447. '[' = true,
  448. '%' = true,
  449. '-' = true,
  450. }
  451. // helper call to quick search for special characters
  452. index_special :: proc(text: string) -> int {
  453. for i in 0..<len(text) {
  454. if SPECIALS_TABLE[text[i]] {
  455. return i
  456. }
  457. }
  458. return -1
  459. }
  460. lmem_find :: proc(s1, s2: string) -> int {
  461. l1 := len(s1)
  462. l2 := len(s2)
  463. if l2 == 0 {
  464. return 0
  465. } else if l2 > l1 {
  466. return -1
  467. } else {
  468. init := strings.index_byte(s1, s2[0])
  469. end := init + l2
  470. for end <= l1 && init != -1 {
  471. init += 1
  472. if s1[init - 1:end] == s2 {
  473. return init - 1
  474. } else {
  475. next := strings.index_byte(s1[init:], s2[0])
  476. if next == -1 {
  477. return -1
  478. } else {
  479. init = init + next
  480. end = init + l2
  481. }
  482. }
  483. }
  484. }
  485. return -1
  486. }
  487. // find a pattern with in a haystack with an offset
  488. // allow_memfind will speed up simple searches
  489. find_aux :: proc(
  490. haystack: string,
  491. pattern: string,
  492. offset: int,
  493. allow_memfind: bool,
  494. matches: ^[MAX_CAPTURES]Match,
  495. ) -> (captures: int, err: Error) {
  496. s := offset
  497. p := 0
  498. specials_idx := index_special(pattern)
  499. if allow_memfind && specials_idx == -1 {
  500. if index := lmem_find(haystack[s:], pattern); index != -1 {
  501. matches[0] = { index + s, index + s + len(pattern) }
  502. captures = 1
  503. return
  504. } else {
  505. return
  506. }
  507. }
  508. pattern := pattern
  509. anchor: bool
  510. if len(pattern) > 0 && pattern[0] == '^' {
  511. anchor = true
  512. pattern = pattern[1:]
  513. }
  514. ms := Match_State {
  515. src = haystack,
  516. pattern = pattern,
  517. }
  518. for {
  519. res := match(&ms, s, p) or_return
  520. if res != INVALID {
  521. // disallow non advancing match
  522. if s == res {
  523. err = .Match_Invalid
  524. }
  525. // NOTE(Skytrias): first result is reserved for a full match
  526. matches[0] = { s, res }
  527. // rest are the actual captures
  528. captures = push_captures(&ms, -1, -1, matches[1:]) or_return
  529. captures += 1
  530. return
  531. }
  532. s += 1
  533. if !(s < len(ms.src) && !anchor) {
  534. break
  535. }
  536. }
  537. return
  538. }
  539. // iterative matching which returns the 0th/1st match
  540. // rest has to be used from captures
  541. gmatch :: proc(
  542. haystack: ^string,
  543. pattern: string,
  544. captures: ^[MAX_CAPTURES]Match,
  545. ) -> (res: string, ok: bool) {
  546. if len(haystack) > 0 {
  547. length, err := find_aux(haystack^, pattern, 0, false, captures)
  548. if length != 0 && err == .OK {
  549. ok = true
  550. first := length > 1 ? 1 : 0
  551. cap := captures[first]
  552. res = haystack[cap.byte_start:cap.byte_end]
  553. haystack^ = haystack[cap.byte_end:]
  554. }
  555. }
  556. return
  557. }
  558. // gsub with builder, replace patterns found with the replace content
  559. gsub_builder :: proc(
  560. builder: ^strings.Builder,
  561. haystack: string,
  562. pattern: string,
  563. replace: string,
  564. ) -> string {
  565. // find matches
  566. captures: [MAX_CAPTURES]Match
  567. haystack := haystack
  568. for {
  569. length, err := find_aux(haystack, pattern, 0, false, &captures)
  570. // done
  571. if length == 0 {
  572. break
  573. }
  574. if err != .OK {
  575. return {}
  576. }
  577. cap := captures[0]
  578. // write front till capture
  579. strings.write_string(builder, haystack[:cap.byte_start])
  580. // write replacements
  581. strings.write_string(builder, replace)
  582. // advance string till end
  583. haystack = haystack[cap.byte_end:]
  584. }
  585. strings.write_string(builder, haystack[:])
  586. return strings.to_string(builder^)
  587. }
  588. // uses temp builder to build initial string - then allocates the result
  589. gsub_allocator :: proc(
  590. haystack: string,
  591. pattern: string,
  592. replace: string,
  593. allocator := context.allocator,
  594. ) -> string {
  595. builder := strings.builder_make(0, 256, context.temp_allocator)
  596. return gsub_builder(&builder, haystack, pattern, replace)
  597. }
  598. Gsub_Proc :: proc(
  599. // optional passed data
  600. data: rawptr,
  601. // word match found
  602. word: string,
  603. // current haystack for found captures
  604. haystack: string,
  605. // found captures - empty for no captures
  606. captures: []Match,
  607. )
  608. // call a procedure on every match in the haystack
  609. gsub_with :: proc(
  610. haystack: string,
  611. pattern: string,
  612. data: rawptr,
  613. call: Gsub_Proc,
  614. ) {
  615. // find matches
  616. captures: [MAX_CAPTURES]Match
  617. haystack := haystack
  618. for {
  619. length := find_aux(haystack, pattern, 0, false, &captures) or_break
  620. // done
  621. if length == 0 {
  622. break
  623. }
  624. cap := captures[0]
  625. word := haystack[cap.byte_start:cap.byte_end]
  626. call(data, word, haystack, captures[1:length])
  627. // advance string till end
  628. haystack = haystack[cap.byte_end:]
  629. }
  630. }
  631. gsub :: proc { gsub_builder, gsub_allocator }
  632. // iterative find with zeroth capture only
  633. gfind :: proc(
  634. haystack: ^string,
  635. pattern: string,
  636. captures: ^[MAX_CAPTURES]Match,
  637. ) -> (res: string, ok: bool) {
  638. if len(haystack) > 0 {
  639. length, err := find_aux(haystack^, pattern, 0, true, captures)
  640. if length != 0 && err == .OK {
  641. ok = true
  642. cap := captures[0]
  643. res = haystack[cap.byte_start:cap.byte_end]
  644. haystack^ = haystack[cap.byte_end:]
  645. }
  646. }
  647. return
  648. }
  649. // rebuilds a pattern into a case insensitive pattern
  650. pattern_case_insensitive_builder :: proc(
  651. builder: ^strings.Builder,
  652. pattern: string,
  653. ) -> (res: string) {
  654. p := pattern
  655. last_percent: bool
  656. for len(p) > 0 {
  657. char, size := utf8.decode_rune_in_string(p)
  658. if unicode.is_alpha(char) && !last_percent {
  659. // write character class in manually
  660. strings.write_byte(builder, '[')
  661. strings.write_rune(builder, unicode.to_lower(char))
  662. strings.write_rune(builder, unicode.to_upper(char))
  663. strings.write_byte(builder, ']')
  664. } else {
  665. strings.write_rune(builder, char)
  666. }
  667. last_percent = char == L_ESC
  668. p = p[size:]
  669. }
  670. return strings.to_string(builder^)
  671. }
  672. pattern_case_insensitive_allocator :: proc(
  673. pattern: string,
  674. cap: int = 256,
  675. allocator := context.allocator,
  676. ) -> (res: string) {
  677. builder := strings.builder_make(0, cap, context.temp_allocator)
  678. return pattern_case_insensitive_builder(&builder, pattern)
  679. }
  680. pattern_case_insensitive :: proc { pattern_case_insensitive_builder, pattern_case_insensitive_allocator }
  681. // Matcher helper struct that stores optional data you might want to use or not
  682. // as lua is far more dynamic this helps dealing with too much data
  683. // this also allows use of find/match/gmatch at through one struct
  684. Matcher :: struct {
  685. haystack: string,
  686. pattern: string,
  687. captures: [MAX_CAPTURES]Match,
  688. captures_length: int,
  689. offset: int,
  690. err: Error,
  691. // changing content for iterators
  692. iter: string,
  693. iter_index: int,
  694. }
  695. // init using haystack & pattern and an optional byte offset
  696. matcher_init :: proc(haystack, pattern: string, offset: int = 0) -> (res: Matcher) {
  697. res.haystack = haystack
  698. res.pattern = pattern
  699. res.offset = offset
  700. res.iter = haystack
  701. return
  702. }
  703. // find the first match and return the byte start / end position in the string, true on success
  704. matcher_find :: proc(matcher: ^Matcher) -> (start, end: int, ok: bool) #no_bounds_check {
  705. matcher.captures_length, matcher.err = find_aux(
  706. matcher.haystack,
  707. matcher.pattern,
  708. matcher.offset,
  709. true,
  710. &matcher.captures,
  711. )
  712. ok = matcher.captures_length > 0 && matcher.err == .OK
  713. match := matcher.captures[0]
  714. start = match.byte_start
  715. end = match.byte_end
  716. return
  717. }
  718. // find the first match and return the matched word, true on success
  719. matcher_match :: proc(matcher: ^Matcher) -> (word: string, ok: bool) #no_bounds_check {
  720. matcher.captures_length, matcher.err = find_aux(
  721. matcher.haystack,
  722. matcher.pattern,
  723. matcher.offset,
  724. false,
  725. &matcher.captures,
  726. )
  727. ok = matcher.captures_length > 0 && matcher.err == .OK
  728. match := matcher.captures[0]
  729. word = matcher.haystack[match.byte_start:match.byte_end]
  730. return
  731. }
  732. // get the capture at the "correct" spot, as spot 0 is reserved for the first match
  733. matcher_capture :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> string #no_bounds_check {
  734. runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1)
  735. cap := matcher.captures[index + 1]
  736. return matcher.haystack[cap.byte_start:cap.byte_end]
  737. }
  738. // get the raw match out of the captures, skipping spot 0
  739. matcher_capture_raw :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> Match #no_bounds_check {
  740. runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1)
  741. return matcher.captures[index + 1]
  742. }
  743. // alias
  744. matcher_gmatch :: matcher_match_iter
  745. // iteratively match the haystack till it cant find any matches
  746. matcher_match_iter :: proc(matcher: ^Matcher) -> (res: string, index: int, ok: bool) {
  747. if len(matcher.iter) > 0 {
  748. matcher.captures_length, matcher.err = find_aux(
  749. matcher.iter,
  750. matcher.pattern,
  751. matcher.offset,
  752. false,
  753. &matcher.captures,
  754. )
  755. if matcher.captures_length != 0 && matcher.err == .OK {
  756. ok = true
  757. first := matcher.captures_length > 1 ? 1 : 0
  758. match := matcher.captures[first]
  759. // output
  760. res = matcher.iter[match.byte_start:match.byte_end]
  761. index = matcher.iter_index
  762. // advance
  763. matcher.iter_index += 1
  764. matcher.iter = matcher.iter[match.byte_end:]
  765. }
  766. }
  767. return
  768. }
  769. // get a slice of all valid captures above the first match
  770. matcher_captures_slice :: proc(matcher: ^Matcher) -> []Match {
  771. return matcher.captures[1:matcher.captures_length]
  772. }