strlib.odin 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964
  1. // A Lua-like string match algorithm.
  2. package text_match
  3. import "base:runtime"
  4. import "core:unicode"
  5. import "core:unicode/utf8"
  6. import "core:strings"
  7. MAX_CAPTURES :: 32
  8. Capture :: struct {
  9. init: int,
  10. len: int,
  11. }
  12. Match :: struct {
  13. byte_start: int,
  14. byte_end: int,
  15. }
  16. Error :: enum {
  17. OK,
  18. OOB,
  19. Invalid_Capture_Index,
  20. Invalid_Pattern_Capture,
  21. Unfinished_Capture,
  22. Malformed_Pattern,
  23. Rune_Error,
  24. Match_Invalid,
  25. }
  26. L_ESC :: '%'
  27. CAP_POSITION :: -2
  28. CAP_UNFINISHED :: -1
  29. INVALID :: -1
  30. Match_State :: struct {
  31. src: string,
  32. pattern: string,
  33. level: int,
  34. capture: [MAX_CAPTURES]Capture,
  35. }
  36. @(require_results)
  37. match_class :: proc(c: rune, cl: rune) -> (res: bool) {
  38. switch unicode.to_lower(cl) {
  39. case 'a': res = is_alpha(c)
  40. case 'c': res = is_cntrl(c)
  41. case 'd': res = is_digit(c)
  42. case 'g': res = is_graph(c)
  43. case 'l': res = is_lower(c)
  44. case 'p': res = is_punct(c)
  45. case 's': res = is_space(c)
  46. case 'u': res = is_upper(c)
  47. case 'w': res = is_alnum(c)
  48. case 'x': res = is_xdigit(c)
  49. case: return cl == c
  50. }
  51. return is_lower(cl) ? res : !res
  52. }
  53. is_alpha :: unicode.is_alpha
  54. is_digit :: unicode.is_digit
  55. is_lower :: unicode.is_lower
  56. is_upper :: unicode.is_upper
  57. is_punct :: unicode.is_punct
  58. is_space :: unicode.is_space
  59. is_cntrl :: unicode.is_control
  60. @(require_results)
  61. is_alnum :: proc(c: rune) -> bool {
  62. return unicode.is_alpha(c) || unicode.is_digit(c)
  63. }
  64. @(require_results)
  65. is_graph :: proc(c: rune) -> bool {
  66. return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c)
  67. }
  68. @(require_results)
  69. is_xdigit :: proc(c: rune) -> bool {
  70. return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c)
  71. }
  72. // find the first utf8 charater and its size, return an error if the character is an error
  73. @(require_results)
  74. utf8_peek :: proc(bytes: string) -> (c: rune, size: int, err: Error) {
  75. c, size = utf8.decode_rune_in_string(bytes)
  76. if c == utf8.RUNE_ERROR {
  77. err = .Rune_Error
  78. }
  79. return
  80. }
  81. // find the first utf8 charater and its size and advance the index
  82. // return an error if the character is an error
  83. @(require_results)
  84. utf8_advance :: proc(bytes: string, index: ^int) -> (c: rune, err: Error) {
  85. size: int
  86. c, size = utf8.decode_rune_in_string(bytes[index^:])
  87. if c == utf8.RUNE_ERROR {
  88. err = .Rune_Error
  89. }
  90. index^ += size
  91. return
  92. }
  93. // continuation byte?
  94. @(require_results)
  95. is_cont :: proc(b: byte) -> bool {
  96. return b & 0xc0 == 0x80
  97. }
  98. @(require_results)
  99. utf8_prev :: proc(bytes: string, a, b: int) -> int {
  100. b := b
  101. for a < b && is_cont(bytes[b - 1]) {
  102. b -= 1
  103. }
  104. return a < b ? b - 1 : a
  105. }
  106. @(require_results)
  107. utf8_next :: proc(bytes: string, a: int) -> int {
  108. a := a
  109. b := len(bytes)
  110. for a < b - 1 && is_cont(bytes[a + 1]) {
  111. a += 1
  112. }
  113. return a < b ? a + 1 : b
  114. }
  115. @(require_results)
  116. check_capture :: proc(ms: ^Match_State, l: rune) -> (int, Error) {
  117. l := int(l - '1')
  118. if l < 0 || l >= ms.level || ms.capture[l].len == CAP_UNFINISHED {
  119. return 0, .Invalid_Capture_Index
  120. }
  121. return l, .OK
  122. }
  123. @(require_results)
  124. capture_to_close :: proc(ms: ^Match_State) -> (int, Error) {
  125. level := ms.level - 1
  126. for level >= 0 {
  127. if ms.capture[level].len == CAP_UNFINISHED {
  128. return level, .OK
  129. }
  130. level -= 1
  131. }
  132. return 0, .Invalid_Pattern_Capture
  133. }
  134. @(require_results)
  135. class_end :: proc(ms: ^Match_State, p: int) -> (step: int, err: Error) {
  136. step = p
  137. ch := utf8_advance(ms.pattern, &step) or_return
  138. switch ch {
  139. case L_ESC:
  140. if step == len(ms.pattern) {
  141. err = .Malformed_Pattern
  142. return
  143. }
  144. _ = utf8_advance(ms.pattern, &step) or_return
  145. case '[':
  146. // fine with step by 1
  147. if step + 1 < len(ms.pattern) && ms.pattern[step] == '^' {
  148. step += 1
  149. }
  150. // run till end is reached
  151. for {
  152. if step == len(ms.pattern) {
  153. err = .Malformed_Pattern
  154. return
  155. }
  156. if ms.pattern[step] == ']' {
  157. break
  158. }
  159. // dont care about utf8 here
  160. step += 1
  161. if step < len(ms.pattern) && ms.pattern[step] == L_ESC {
  162. // skip escapes like '%'
  163. step += 1
  164. }
  165. }
  166. // advance last time
  167. step += 1
  168. }
  169. return
  170. }
  171. @(require_results)
  172. match_bracket_class :: proc(ms: ^Match_State, c: rune, p, ec: int) -> (sig: bool, err: Error) {
  173. sig = true
  174. p := p
  175. if ms.pattern[p + 1] == '^' {
  176. p += 1
  177. sig = false
  178. }
  179. // while inside of class range
  180. for p < ec {
  181. char := utf8_advance(ms.pattern, &p) or_return
  182. // e.g. %a
  183. if char == L_ESC {
  184. next := utf8_advance(ms.pattern, &p) or_return
  185. if match_class(c, next) {
  186. return
  187. }
  188. } else {
  189. next, next_size := utf8_peek(ms.pattern[p:]) or_return
  190. // TODO test case for [a-???] where ??? is missing
  191. if next == '-' && p + next_size < len(ms.pattern) {
  192. // advance 2 codepoints
  193. p += next_size
  194. last := utf8_advance(ms.pattern, &p) or_return
  195. if char <= c && c <= last {
  196. return
  197. }
  198. } else if char == c {
  199. return
  200. }
  201. }
  202. }
  203. sig = !sig
  204. return
  205. }
  206. @(require_results)
  207. single_match :: proc(ms: ^Match_State, s, p, ep: int) -> (matched: bool, schar_size: int, err: Error) {
  208. if s >= len(ms.src) {
  209. return
  210. }
  211. pchar, psize := utf8_peek(ms.pattern[p:]) or_return
  212. schar, ssize := utf8_peek(ms.src[s:]) or_return
  213. schar_size = ssize
  214. switch pchar {
  215. case '.': matched = true
  216. case L_ESC:
  217. pchar_next, _ := utf8_peek(ms.pattern[p + psize:]) or_return
  218. matched = match_class(schar, pchar_next)
  219. case '[':
  220. matched = match_bracket_class(ms, schar, p, ep - 1) or_return
  221. case:
  222. matched = schar == pchar
  223. }
  224. return
  225. }
  226. @(require_results)
  227. match_balance :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  228. if p >= len(ms.pattern) - 1 {
  229. return INVALID, .Invalid_Pattern_Capture
  230. }
  231. schar, ssize := utf8_peek(ms.src[s:]) or_return
  232. pchar, psize := utf8_peek(ms.pattern[p:]) or_return
  233. // skip until the src and pattern match
  234. if schar != pchar {
  235. return INVALID, .OK
  236. }
  237. cont := 1
  238. s := s
  239. s += ssize
  240. begin := pchar
  241. end, _ := utf8_peek(ms.pattern[p + psize:]) or_return
  242. for s < len(ms.src) {
  243. ch := utf8_advance(ms.src, &s) or_return
  244. switch ch{
  245. case end:
  246. cont -= 1
  247. if cont == 0 {
  248. return s, .OK
  249. }
  250. case begin:
  251. cont += 1
  252. }
  253. }
  254. return INVALID, .OK
  255. }
  256. @(require_results)
  257. max_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) {
  258. m := s
  259. // count up matches
  260. for {
  261. matched, size := single_match(ms, m, p, ep) or_return
  262. if !matched {
  263. break
  264. }
  265. m += size
  266. }
  267. for s <= m {
  268. result := match(ms, m, ep + 1) or_return
  269. if result != INVALID {
  270. return result, .OK
  271. }
  272. if s == m {
  273. break
  274. }
  275. m = utf8_prev(ms.src, s, m)
  276. }
  277. return INVALID, .OK
  278. }
  279. @(require_results)
  280. min_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) {
  281. s := s
  282. for {
  283. result := match(ms, s, ep + 1) or_return
  284. if result != INVALID {
  285. return result, .OK
  286. }
  287. // TODO receive next step maybe?
  288. matched, rune_size := single_match(ms, s, p, ep) or_return
  289. if matched {
  290. s += rune_size
  291. } else {
  292. return INVALID, .OK
  293. }
  294. }
  295. }
  296. @(require_results)
  297. start_capture :: proc(ms: ^Match_State, s, p, what: int) -> (res: int, err: Error) {
  298. level := ms.level
  299. ms.capture[level].init = s
  300. ms.capture[level].len = what
  301. ms.level += 1
  302. res = match(ms, s, p) or_return
  303. if res == INVALID {
  304. ms.level -= 1
  305. }
  306. return
  307. }
  308. @(require_results)
  309. end_capture :: proc(ms: ^Match_State, s, p: int) -> (res: int, err: Error) {
  310. l := capture_to_close(ms) or_return
  311. // TODO double check, could do string as int index
  312. ms.capture[l].len = s - ms.capture[l].init
  313. res = match(ms, s, p) or_return
  314. if res == INVALID {
  315. ms.capture[l].len = CAP_UNFINISHED
  316. }
  317. return
  318. }
  319. @(require_results)
  320. match_capture :: proc(ms: ^Match_State, s: int, char: rune) -> (res: int, err: Error) {
  321. index := check_capture(ms, char) or_return
  322. length := ms.capture[index].len
  323. if len(ms.src) - s >= length {
  324. return s + length, .OK
  325. }
  326. return INVALID, .OK
  327. }
  328. @(require_results)
  329. match :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  330. s := s
  331. p := p
  332. if p == len(ms.pattern) {
  333. return s, .OK
  334. }
  335. // NOTE we can walk by ascii steps if we know the characters are ascii
  336. char, _ := utf8_peek(ms.pattern[p:]) or_return
  337. switch char {
  338. case '(':
  339. if p + 1 < len(ms.pattern) && ms.pattern[p + 1] == ')' {
  340. s = start_capture(ms, s, p + 2, CAP_POSITION) or_return
  341. } else {
  342. s = start_capture(ms, s, p + 1, CAP_UNFINISHED) or_return
  343. }
  344. case ')':
  345. s = end_capture(ms, s, p + 1) or_return
  346. case '$':
  347. if p + 1 != len(ms.pattern) {
  348. return match_default(ms, s, p)
  349. }
  350. if len(ms.src) != s {
  351. s = INVALID
  352. }
  353. case L_ESC:
  354. // stop short patterns like "%" only
  355. if p + 1 >= len(ms.pattern) {
  356. err = .OOB
  357. return
  358. }
  359. switch ms.pattern[p + 1] {
  360. // balanced string
  361. case 'b':
  362. s = match_balance(ms, s, p + 2) or_return
  363. if s != INVALID {
  364. // eg after %b()
  365. return match(ms, s, p + 4)
  366. }
  367. // frontier
  368. case 'f':
  369. p += 2
  370. if ms.pattern[p] != '[' {
  371. return INVALID, .Invalid_Pattern_Capture
  372. }
  373. ep := class_end(ms, p) or_return
  374. previous, current: rune
  375. // get previous
  376. if s != 0 {
  377. temp := utf8_prev(ms.src, 0, s)
  378. previous, _ = utf8_peek(ms.src[temp:]) or_return
  379. }
  380. // get current
  381. if s != len(ms.src) {
  382. current, _ = utf8_peek(ms.src[s:]) or_return
  383. }
  384. m1 := match_bracket_class(ms, previous, p, ep - 1) or_return
  385. m2 := match_bracket_class(ms, current, p, ep - 1) or_return
  386. if !m1 && m2 {
  387. return match(ms, s, ep)
  388. }
  389. s = INVALID
  390. // capture group
  391. case '0'..<'9':
  392. s = match_capture(ms, s, rune(ms.pattern[p + 1])) or_return
  393. if s != INVALID {
  394. return match(ms, s, p + 2)
  395. }
  396. case: return match_default(ms, s, p)
  397. }
  398. case:
  399. return match_default(ms, s, p)
  400. }
  401. return s, .OK
  402. }
  403. @(require_results)
  404. match_default :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) {
  405. s := s
  406. ep := class_end(ms, p) or_return
  407. single_matched, ssize := single_match(ms, s, p, ep) or_return
  408. if !single_matched {
  409. epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0
  410. switch epc {
  411. case '*', '?', '-':
  412. return match(ms, s, ep + 1)
  413. case:
  414. s = INVALID
  415. }
  416. } else {
  417. epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0
  418. switch epc {
  419. case '?':
  420. result := match(ms, s + ssize, ep + 1) or_return
  421. if result == INVALID {
  422. return match(ms, s, ep + 1)
  423. }
  424. s = result
  425. case '+': s = max_expand(ms, s + ssize, p, ep) or_return
  426. case '*': s = max_expand(ms, s, p, ep) or_return
  427. case '-': s = min_expand(ms, s, p, ep) or_return
  428. case:
  429. return match(ms, s + ssize, ep)
  430. }
  431. }
  432. return s, .OK
  433. }
  434. @(require_results)
  435. push_onecapture :: proc(ms: ^Match_State, i: int, s: int, e: int, matches: []Match) -> (err: Error) {
  436. if i >= ms.level {
  437. if i == 0 {
  438. matches[0] = { 0, e - s }
  439. } else {
  440. err = .Invalid_Capture_Index
  441. }
  442. } else {
  443. init := ms.capture[i].init
  444. length := ms.capture[i].len
  445. switch length {
  446. case CAP_UNFINISHED:
  447. err = .Unfinished_Capture
  448. case CAP_POSITION:
  449. matches[i] = { init, init + 1 }
  450. case:
  451. matches[i] = { init, init + length }
  452. }
  453. }
  454. return
  455. }
  456. @(require_results)
  457. push_captures :: proc(ms: ^Match_State, s, e: int, matches: []Match) -> (nlevels: int, err: Error) {
  458. nlevels = 1 if ms.level == 0 && s >= 0 else ms.level
  459. for i in 0..<nlevels {
  460. push_onecapture(ms, i, s, e, matches) or_return
  461. }
  462. return
  463. }
  464. // SPECIALS := "^$*+?.([%-"
  465. // all special characters inside a small ascii array
  466. @(rodata)
  467. SPECIALS_TABLE := [256]bool {
  468. '^' = true,
  469. '$' = true,
  470. '*' = true,
  471. '+' = true,
  472. '?' = true,
  473. '.' = true,
  474. '(' = true,
  475. '[' = true,
  476. '%' = true,
  477. '-' = true,
  478. }
  479. // helper call to quick search for special characters
  480. @(require_results)
  481. index_special :: proc(text: string) -> int {
  482. for i in 0..<len(text) {
  483. if SPECIALS_TABLE[text[i]] {
  484. return i
  485. }
  486. }
  487. return -1
  488. }
  489. @(require_results)
  490. lmem_find :: proc(s1, s2: string) -> int {
  491. l1 := len(s1)
  492. l2 := len(s2)
  493. if l2 == 0 {
  494. return 0
  495. }
  496. if l2 > l1 {
  497. return -1
  498. }
  499. init := strings.index_byte(s1, s2[0])
  500. end := init + l2
  501. for end <= l1 && init >= 0 {
  502. init += 1
  503. if s1[init - 1:end] == s2 {
  504. return init - 1
  505. }
  506. next := strings.index_byte(s1[init:], s2[0])
  507. if next == -1 {
  508. return -1
  509. }
  510. init = init + next
  511. end = init + l2
  512. }
  513. return -1
  514. }
  515. // find a pattern with in a haystack with an offset
  516. // allow_memfind will speed up simple searches
  517. find_aux :: proc(haystack, pattern: string, offset: int, allow_memfind: bool, matches: ^[MAX_CAPTURES]Match) -> (captures: int, err: Error) {
  518. s := offset
  519. p := 0
  520. specials_idx := index_special(pattern)
  521. if allow_memfind && specials_idx == -1 {
  522. if index := lmem_find(haystack[s:], pattern); index >= 0 {
  523. matches[0] = { index + s, index + s + len(pattern) }
  524. captures = 1
  525. }
  526. return
  527. }
  528. pattern := pattern
  529. anchor: bool
  530. if len(pattern) > 0 && pattern[0] == '^' {
  531. anchor = true
  532. pattern = pattern[1:]
  533. }
  534. ms := Match_State {
  535. src = haystack,
  536. pattern = pattern,
  537. }
  538. for {
  539. res := match(&ms, s, p) or_return
  540. if res != INVALID {
  541. // disallow non advancing match
  542. if s == res {
  543. err = .Match_Invalid
  544. }
  545. // NOTE(Skytrias): first result is reserved for a full match
  546. matches[0] = { s, res }
  547. // rest are the actual captures
  548. captures = push_captures(&ms, -1, -1, matches[1:]) or_return
  549. captures += 1
  550. return
  551. }
  552. s += 1
  553. if !(s < len(ms.src) && !anchor) {
  554. break
  555. }
  556. }
  557. return
  558. }
  559. // iterative matching which returns the 0th/1st match
  560. // rest has to be used from captures
  561. // assumes captures is zeroed on first iteration
  562. // resets captures to zero on last iteration
  563. @(require_results)
  564. gmatch :: proc(haystack: ^string, pattern: string, captures: ^[MAX_CAPTURES]Match) -> (res: string, ok: bool) {
  565. haystack^ = haystack[captures[0].byte_end:]
  566. if len(haystack) > 0 {
  567. length, err := find_aux(haystack^, pattern, 0, false, captures)
  568. if length != 0 && err == .OK {
  569. ok = true
  570. first := length > 1 ? 1 : 0
  571. cap := captures[first]
  572. res = haystack[cap.byte_start:cap.byte_end]
  573. }
  574. }
  575. if !ok {
  576. captures^ = {}
  577. }
  578. return
  579. }
  580. // gsub with builder, replace patterns found with the replace content
  581. @(require_results)
  582. gsub_builder :: proc(builder: ^strings.Builder, haystack, pattern, replace: string) -> string {
  583. // find matches
  584. captures: [MAX_CAPTURES]Match
  585. haystack := haystack
  586. for {
  587. length, err := find_aux(haystack, pattern, 0, false, &captures)
  588. if length == 0 { // done
  589. break
  590. }
  591. if err != .OK {
  592. return {}
  593. }
  594. cap := captures[0]
  595. // write front till capture
  596. strings.write_string(builder, haystack[:cap.byte_start])
  597. // write replacements
  598. strings.write_string(builder, replace)
  599. // advance string till end
  600. haystack = haystack[cap.byte_end:]
  601. }
  602. strings.write_string(builder, haystack[:])
  603. return strings.to_string(builder^)
  604. }
  605. // uses temp builder to build initial string - then allocates the result
  606. @(require_results)
  607. gsub_allocator :: proc(haystack, pattern, replace: string, allocator := context.allocator) -> string {
  608. builder := strings.builder_make(0, 256, context.temp_allocator)
  609. return gsub_builder(&builder, haystack, pattern, replace)
  610. }
  611. Gsub_Proc :: proc(
  612. // optional passed data
  613. data: rawptr,
  614. // word match found
  615. word: string,
  616. // current haystack for found captures
  617. haystack: string,
  618. // found captures - empty for no captures
  619. captures: []Match,
  620. )
  621. // call a procedure on every match in the haystack
  622. gsub_with :: proc(haystack, pattern: string, data: rawptr, call: Gsub_Proc) {
  623. // find matches
  624. captures: [MAX_CAPTURES]Match
  625. haystack := haystack
  626. for {
  627. length := find_aux(haystack, pattern, 0, false, &captures) or_break
  628. if length == 0 { // done
  629. break
  630. }
  631. cap := captures[0]
  632. word := haystack[cap.byte_start:cap.byte_end]
  633. call(data, word, haystack, captures[1:length])
  634. // advance string till end
  635. haystack = haystack[cap.byte_end:]
  636. }
  637. }
  638. gsub :: proc { gsub_builder, gsub_allocator }
  639. // iterative find with zeroth capture only
  640. // assumes captures is zeroed on first iteration
  641. // resets captures to zero on last iteration
  642. @(require_results)
  643. gfind :: proc(haystack: ^string, pattern: string, captures: ^[MAX_CAPTURES]Match) -> (res: string, ok: bool) {
  644. haystack^ = haystack[captures[0].byte_end:]
  645. if len(haystack) > 0 {
  646. length, err := find_aux(haystack^, pattern, 0, true, captures)
  647. if length != 0 && err == .OK {
  648. ok = true
  649. cap := captures[0]
  650. res = haystack[cap.byte_start:cap.byte_end]
  651. }
  652. }
  653. if !ok {
  654. captures^ = {}
  655. }
  656. return
  657. }
  658. // rebuilds a pattern into a case insensitive pattern
  659. @(require_results)
  660. pattern_case_insensitive_builder :: proc(builder: ^strings.Builder, pattern: string) -> string {
  661. p := pattern
  662. last_percent: bool
  663. for len(p) > 0 {
  664. char, size := utf8.decode_rune_in_string(p)
  665. if unicode.is_alpha(char) && !last_percent {
  666. // write character class in manually
  667. strings.write_byte(builder, '[')
  668. strings.write_rune(builder, unicode.to_lower(char))
  669. strings.write_rune(builder, unicode.to_upper(char))
  670. strings.write_byte(builder, ']')
  671. } else {
  672. strings.write_rune(builder, char)
  673. }
  674. last_percent = char == L_ESC
  675. p = p[size:]
  676. }
  677. return strings.to_string(builder^)
  678. }
  679. @(require_results)
  680. pattern_case_insensitive_allocator :: proc(pattern: string, cap: int = 256, allocator := context.allocator) -> string {
  681. builder := strings.builder_make(0, cap, context.temp_allocator)
  682. return pattern_case_insensitive_builder(&builder, pattern)
  683. }
  684. pattern_case_insensitive :: proc { pattern_case_insensitive_builder, pattern_case_insensitive_allocator }
  685. // Matcher helper struct that stores optional data you might want to use or not
  686. // as lua is far more dynamic this helps dealing with too much data
  687. // this also allows use of find/match/gmatch at through one struct
  688. Matcher :: struct {
  689. haystack: string,
  690. pattern: string,
  691. captures: [MAX_CAPTURES]Match,
  692. captures_length: int,
  693. offset: int,
  694. err: Error,
  695. // changing content for iterators
  696. iter: string,
  697. iter_index: int,
  698. }
  699. // init using haystack & pattern and an optional byte offset
  700. @(require_results)
  701. matcher_init :: proc(haystack, pattern: string, offset: int = 0) -> (res: Matcher) {
  702. res.haystack = haystack
  703. res.pattern = pattern
  704. res.offset = offset
  705. res.iter = haystack
  706. return
  707. }
  708. // find the first match and return the byte start / end position in the string, true on success
  709. @(require_results)
  710. matcher_find :: proc(matcher: ^Matcher) -> (start, end: int, ok: bool) #no_bounds_check {
  711. matcher.captures_length, matcher.err = find_aux(
  712. matcher.haystack,
  713. matcher.pattern,
  714. matcher.offset,
  715. allow_memfind=true,
  716. matches=&matcher.captures,
  717. )
  718. ok = matcher.captures_length > 0 && matcher.err == .OK
  719. match := matcher.captures[0]
  720. start = match.byte_start
  721. end = match.byte_end
  722. return
  723. }
  724. // find the first match and return the matched word, true on success
  725. @(require_results)
  726. matcher_match :: proc(matcher: ^Matcher) -> (word: string, ok: bool) #no_bounds_check {
  727. matcher.captures_length, matcher.err = find_aux(
  728. matcher.haystack,
  729. matcher.pattern,
  730. matcher.offset,
  731. allow_memfind=false,
  732. matches=&matcher.captures,
  733. )
  734. ok = matcher.captures_length > 0 && matcher.err == .OK
  735. match := matcher.captures[0]
  736. word = matcher.haystack[match.byte_start:match.byte_end]
  737. return
  738. }
  739. // get the capture at the "correct" spot, as spot 0 is reserved for the first match
  740. @(require_results)
  741. matcher_capture :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> string #no_bounds_check {
  742. runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1)
  743. cap := matcher.captures[index + 1]
  744. return matcher.haystack[cap.byte_start:cap.byte_end]
  745. }
  746. // get the raw match out of the captures, skipping spot 0
  747. @(require_results)
  748. matcher_capture_raw :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> Match #no_bounds_check {
  749. runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1)
  750. return matcher.captures[index + 1]
  751. }
  752. // alias
  753. matcher_gmatch :: matcher_match_iter
  754. // iteratively match the haystack till it cant find any matches
  755. @(require_results)
  756. matcher_match_iter :: proc(matcher: ^Matcher) -> (res: string, index: int, ok: bool) {
  757. if len(matcher.iter) > 0 {
  758. matcher.captures_length, matcher.err = find_aux(
  759. matcher.iter,
  760. matcher.pattern,
  761. matcher.offset,
  762. false,
  763. &matcher.captures,
  764. )
  765. if matcher.captures_length != 0 && matcher.err == .OK {
  766. ok = true
  767. first := matcher.captures_length > 1 ? 1 : 0
  768. match := matcher.captures[first]
  769. // output
  770. res = matcher.iter[match.byte_start:match.byte_end]
  771. index = matcher.iter_index
  772. // advance
  773. matcher.iter_index += 1
  774. matcher.iter = matcher.iter[match.byte_end:]
  775. }
  776. }
  777. return
  778. }
  779. // get a slice of all valid captures above the first match
  780. @(require_results)
  781. matcher_captures_slice :: proc(matcher: ^Matcher) -> []Match {
  782. return matcher.captures[1:matcher.captures_length]
  783. }