regex.odin 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. package regex
  2. /*
  3. (c) Copyright 2024 Feoramund <[email protected]>.
  4. Made available under Odin's BSD-3 license.
  5. List of contributors:
  6. Feoramund: Initial implementation.
  7. */
  8. import "base:runtime"
  9. import "core:text/regex/common"
  10. import "core:text/regex/compiler"
  11. import "core:text/regex/optimizer"
  12. import "core:text/regex/parser"
  13. import "core:text/regex/virtual_machine"
  14. Flag :: common.Flag
  15. Flags :: common.Flags
  16. Parser_Error :: parser.Error
  17. Compiler_Error :: compiler.Error
  18. Creation_Error :: enum {
  19. None,
  20. // A `\` was supplied as the delimiter to `create_by_user`.
  21. Bad_Delimiter,
  22. // A pair of delimiters for `create_by_user` was not found.
  23. Expected_Delimiter,
  24. // An unknown letter was supplied to `create_by_user` after the last delimiter.
  25. Unknown_Flag,
  26. }
  27. Error :: union #shared_nil {
  28. // An error that can occur in the pattern parsing phase.
  29. //
  30. // Most of these are regular expression syntax errors and are either
  31. // context-dependent as to what they mean or have self-explanatory names.
  32. Parser_Error,
  33. // An error that can occur in the pattern compiling phase.
  34. //
  35. // Of the two that can be returned, they have to do with exceeding the
  36. // limitations of the Virtual Machine.
  37. Compiler_Error,
  38. // An error that occurs only for `create_by_user`.
  39. Creation_Error,
  40. }
  41. /*
  42. This struct corresponds to a set of string captures from a RegEx match.
  43. `pos` will contain the start and end positions for each string in `groups`,
  44. such that `str[pos[0][0]:pos[0][1]] == groups[0]`.
  45. */
  46. Capture :: struct {
  47. pos: [][2]int,
  48. groups: []string,
  49. }
  50. /*
  51. A compiled Regular Expression value, to be used with the `match_*` procedures.
  52. */
  53. Regular_Expression :: struct {
  54. flags: Flags `fmt:"-"`,
  55. class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`,
  56. program: []virtual_machine.Opcode `fmt:"-"`,
  57. }
  58. /*
  59. An iterator to repeatedly match a pattern against a string, to be used with `*_iterator` procedures.
  60. */
  61. Match_Iterator :: struct {
  62. regex: Regular_Expression,
  63. capture: Capture,
  64. vm: virtual_machine.Machine,
  65. idx: int,
  66. temp: runtime.Allocator,
  67. threads: int,
  68. done: bool,
  69. }
  70. /*
  71. Create a regular expression from a string pattern and a set of flags.
  72. *Allocates Using Provided Allocators*
  73. Inputs:
  74. - pattern: The pattern to compile.
  75. - flags: A `bit_set` of RegEx flags.
  76. - permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
  77. - temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
  78. Returns:
  79. - result: The regular expression.
  80. - err: An error, if one occurred.
  81. */
  82. @require_results
  83. create :: proc(
  84. pattern: string,
  85. flags: Flags = {},
  86. permanent_allocator := context.allocator,
  87. temporary_allocator := context.temp_allocator,
  88. ) -> (result: Regular_Expression, err: Error) {
  89. // For the sake of speed and simplicity, we first run all the intermediate
  90. // processes such as parsing and compilation through the temporary
  91. // allocator.
  92. program: [dynamic]virtual_machine.Opcode = ---
  93. class_data: [dynamic]parser.Rune_Class_Data = ---
  94. {
  95. context.allocator = temporary_allocator
  96. ast := parser.parse(pattern, flags) or_return
  97. if .No_Optimization not_in flags {
  98. ast, _ = optimizer.optimize(ast, flags)
  99. }
  100. program, class_data = compiler.compile(ast, flags) or_return
  101. }
  102. // When that's successful, re-allocate all at once with the permanent
  103. // allocator so everything can be tightly packed.
  104. context.allocator = permanent_allocator
  105. result.flags = flags
  106. if len(class_data) > 0 {
  107. result.class_data = make([]virtual_machine.Rune_Class_Data, len(class_data))
  108. }
  109. for data, i in class_data {
  110. if len(data.runes) > 0 {
  111. result.class_data[i].runes = make([]rune, len(data.runes))
  112. copy(result.class_data[i].runes, data.runes[:])
  113. }
  114. if len(data.ranges) > 0 {
  115. result.class_data[i].ranges = make([]virtual_machine.Rune_Class_Range, len(data.ranges))
  116. copy(result.class_data[i].ranges, data.ranges[:])
  117. }
  118. }
  119. result.program = make([]virtual_machine.Opcode, len(program))
  120. copy(result.program, program[:])
  121. return
  122. }
  123. /*
  124. Create a regular expression from a delimited string pattern, such as one
  125. provided by users of a program or those found in a configuration file.
  126. They are in the form of:
  127. [DELIMITER] [regular expression] [DELIMITER] [flags]
  128. For example, the following strings are valid:
  129. /hellope/i
  130. #hellope#i
  131. •hellope•i
  132. つhellopeつi
  133. The delimiter is determined by the very first rune in the string.
  134. The only restriction is that the delimiter cannot be `\`, as that rune is used
  135. to escape the delimiter if found in the middle of the string.
  136. All runes after the closing delimiter will be parsed as flags:
  137. - 'm': Multiline
  138. - 'i': Case_Insensitive
  139. - 'x': Ignore_Whitespace
  140. - 'u': Unicode
  141. - 'n': No_Capture
  142. - '-': No_Optimization
  143. *Allocates Using Provided Allocators*
  144. Inputs:
  145. - pattern: The delimited pattern with optional flags to compile.
  146. - str: The string to match against.
  147. - permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
  148. - temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
  149. Returns:
  150. - result: The regular expression.
  151. - err: An error, if one occurred.
  152. */
  153. @require_results
  154. create_by_user :: proc(
  155. pattern: string,
  156. permanent_allocator := context.allocator,
  157. temporary_allocator := context.temp_allocator,
  158. ) -> (result: Regular_Expression, err: Error) {
  159. if len(pattern) == 0 {
  160. err = .Expected_Delimiter
  161. return
  162. }
  163. delimiter: rune
  164. start := -1
  165. end := -1
  166. flags: Flags
  167. escaping: bool
  168. parse_loop: for r, i in pattern {
  169. if delimiter == 0 {
  170. if r == '\\' {
  171. err = .Bad_Delimiter
  172. return
  173. }
  174. delimiter = r
  175. continue parse_loop
  176. }
  177. if start == -1 {
  178. start = i
  179. }
  180. if escaping {
  181. escaping = false
  182. continue parse_loop
  183. }
  184. switch r {
  185. case '\\':
  186. escaping = true
  187. case delimiter:
  188. end = i
  189. break parse_loop
  190. }
  191. }
  192. if end == -1 {
  193. err = .Expected_Delimiter
  194. return
  195. }
  196. // `start` is also the size of the delimiter, which is why it's being added
  197. // to `end` here.
  198. for r in pattern[start + end:] {
  199. switch r {
  200. case 'm': flags += { .Multiline }
  201. case 'i': flags += { .Case_Insensitive }
  202. case 'x': flags += { .Ignore_Whitespace }
  203. case 'u': flags += { .Unicode }
  204. case 'n': flags += { .No_Capture }
  205. case '-': flags += { .No_Optimization }
  206. case:
  207. err = .Unknown_Flag
  208. return
  209. }
  210. }
  211. return create(pattern[start:end], flags, permanent_allocator, temporary_allocator)
  212. }
  213. /*
  214. Create a `Match_Iterator` using a string to search, a regular expression to match against it, and a set of flags.
  215. *Allocates Using Provided Allocators*
  216. Inputs:
  217. - str: The string to iterate over.
  218. - pattern: The pattern to match.
  219. - flags: A `bit_set` of RegEx flags.
  220. - permanent_allocator: The allocator to use for the compiled regular expression. (default: context.allocator)
  221. - temporary_allocator: The allocator to use for the intermediate compilation and iteration stages. (default: context.temp_allocator)
  222. Returns:
  223. - result: The `Match_Iterator`.
  224. - err: An error, if one occurred.
  225. */
  226. create_iterator :: proc(
  227. str: string,
  228. pattern: string,
  229. flags: Flags = {},
  230. permanent_allocator := context.allocator,
  231. temporary_allocator := context.temp_allocator,
  232. ) -> (result: Match_Iterator, err: Error) {
  233. result.regex = create(pattern, flags, permanent_allocator, temporary_allocator) or_return
  234. result.capture = preallocate_capture()
  235. result.temp = temporary_allocator
  236. result.vm = virtual_machine.create(result.regex.program, str)
  237. result.vm.class_data = result.regex.class_data
  238. result.threads = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
  239. return
  240. }
  241. /*
  242. Match a regular expression against a string and allocate the results into the
  243. returned `capture` structure.
  244. The resulting capture strings will be slices to the string `str`, not wholly
  245. copied strings, so they won't need to be individually deleted.
  246. *Allocates Using Provided Allocators*
  247. Inputs:
  248. - regex: The regular expression.
  249. - str: The string to match against.
  250. - permanent_allocator: The allocator to use for the capture results. (default: context.allocator)
  251. - temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
  252. Returns:
  253. - capture: The capture groups found in the string.
  254. - success: True if the regex matched the string.
  255. */
  256. @require_results
  257. match_and_allocate_capture :: proc(
  258. regex: Regular_Expression,
  259. str: string,
  260. permanent_allocator := context.allocator,
  261. temporary_allocator := context.temp_allocator,
  262. ) -> (capture: Capture, success: bool) {
  263. saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
  264. {
  265. context.allocator = temporary_allocator
  266. vm := virtual_machine.create(regex.program, str)
  267. vm.class_data = regex.class_data
  268. if .Unicode in regex.flags {
  269. saved, success = virtual_machine.run(&vm, true)
  270. } else {
  271. saved, success = virtual_machine.run(&vm, false)
  272. }
  273. }
  274. if saved != nil {
  275. context.allocator = permanent_allocator
  276. num_groups := 0
  277. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  278. a, b := saved[i], saved[i + 1]
  279. if a == -1 || b == -1 {
  280. continue
  281. }
  282. num_groups += 1
  283. }
  284. if num_groups > 0 {
  285. capture.groups = make([]string, num_groups)
  286. capture.pos = make([][2]int, num_groups)
  287. n := 0
  288. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  289. a, b := saved[i], saved[i + 1]
  290. if a == -1 || b == -1 {
  291. continue
  292. }
  293. capture.groups[n] = str[a:b]
  294. capture.pos[n] = {a, b}
  295. n += 1
  296. }
  297. }
  298. }
  299. return
  300. }
  301. /*
  302. Match a regular expression against a string and save the capture results into
  303. the provided `capture` structure.
  304. The resulting capture strings will be slices to the string `str`, not wholly
  305. copied strings, so they won't need to be individually deleted.
  306. *Allocates Using Provided Allocator*
  307. Inputs:
  308. - regex: The regular expression.
  309. - str: The string to match against.
  310. - capture: A pointer to a Capture structure with `groups` and `pos` already allocated.
  311. - temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
  312. Returns:
  313. - num_groups: The number of capture groups set into `capture`.
  314. - success: True if the regex matched the string.
  315. */
  316. @require_results
  317. match_with_preallocated_capture :: proc(
  318. regex: Regular_Expression,
  319. str: string,
  320. capture: ^Capture,
  321. temporary_allocator := context.temp_allocator,
  322. ) -> (num_groups: int, success: bool) {
  323. assert(capture != nil, "Pre-allocated RegEx capture must not be nil.")
  324. assert(len(capture.groups) >= common.MAX_CAPTURE_GROUPS,
  325. "Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
  326. assert(len(capture.pos) >= common.MAX_CAPTURE_GROUPS,
  327. "Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
  328. saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
  329. {
  330. context.allocator = temporary_allocator
  331. vm := virtual_machine.create(regex.program, str)
  332. vm.class_data = regex.class_data
  333. if .Unicode in regex.flags {
  334. saved, success = virtual_machine.run(&vm, true)
  335. } else {
  336. saved, success = virtual_machine.run(&vm, false)
  337. }
  338. }
  339. if saved != nil {
  340. n := 0
  341. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  342. a, b := saved[i], saved[i + 1]
  343. if a == -1 || b == -1 {
  344. continue
  345. }
  346. capture.groups[n] = str[a:b]
  347. capture.pos[n] = {a, b}
  348. n += 1
  349. }
  350. num_groups = n
  351. }
  352. return
  353. }
  354. /*
  355. Iterate over a `Match_Iterator` and return successive captures.
  356. Inputs:
  357. - it: Pointer to the `Match_Iterator` to iterate over.
  358. Returns:
  359. - result: `Capture` for this iteration.
  360. - ok: A bool indicating if there was a match, stopping the iteration on `false`.
  361. */
  362. match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok: bool) {
  363. assert(len(it.capture.groups) >= common.MAX_CAPTURE_GROUPS,
  364. "Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
  365. assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
  366. "Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
  367. // Guard against situations in which the iterator should finish.
  368. if it.done {
  369. return
  370. }
  371. runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
  372. if it.idx > 0 {
  373. // Reset the state needed to `virtual_machine.run` again.
  374. it.vm.top_thread = 0
  375. it.vm.current_rune = rune(0)
  376. it.vm.current_rune_size = 0
  377. for i in 0..<it.threads {
  378. it.vm.threads[i] = {}
  379. it.vm.next_threads[i] = {}
  380. }
  381. }
  382. // Take note of where the string pointer is before we start.
  383. sp_before := it.vm.string_pointer
  384. saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
  385. {
  386. context.allocator = it.temp
  387. if .Unicode in it.regex.flags {
  388. saved, ok = virtual_machine.run(&it.vm, true)
  389. } else {
  390. saved, ok = virtual_machine.run(&it.vm, false)
  391. }
  392. }
  393. if !ok {
  394. // Match failed, bail out.
  395. return
  396. }
  397. if it.vm.string_pointer == sp_before {
  398. // The string pointer did not move, but there was a match.
  399. //
  400. // At this point, the pattern supplied to the iterator will infinitely
  401. // loop if we do not intervene.
  402. it.done = true
  403. }
  404. if it.vm.string_pointer == len(it.vm.memory) {
  405. // The VM hit the end of the string.
  406. //
  407. // We do not check at the start, because a match of pattern `$`
  408. // against string "" is valid and must return a match.
  409. //
  410. // This check prevents a double-match of `$` against a non-empty string.
  411. it.done = true
  412. }
  413. str := string(it.vm.memory)
  414. num_groups: int
  415. if saved != nil {
  416. n := 0
  417. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  418. a, b := saved[i], saved[i + 1]
  419. if a == -1 || b == -1 {
  420. continue
  421. }
  422. it.capture.groups[n] = str[a:b]
  423. it.capture.pos[n] = {a, b}
  424. n += 1
  425. }
  426. num_groups = n
  427. }
  428. defer it.idx += 1
  429. if num_groups > 0 {
  430. result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
  431. }
  432. return result, it.idx, ok
  433. }
  434. match :: proc {
  435. match_and_allocate_capture,
  436. match_with_preallocated_capture,
  437. match_iterator,
  438. }
  439. /*
  440. Reset an iterator, allowing it to be run again as if new.
  441. Inputs:
  442. - it: The iterator to reset.
  443. */
  444. reset :: proc(it: ^Match_Iterator) {
  445. it.done = false
  446. it.idx = 0
  447. it.vm.string_pointer = 0
  448. it.vm.top_thread = 0
  449. it.vm.current_rune = rune(0)
  450. it.vm.current_rune_size = 0
  451. it.vm.last_rune = rune(0)
  452. for i in 0..<it.threads {
  453. it.vm.threads[i] = {}
  454. it.vm.next_threads[i] = {}
  455. }
  456. }
  457. /*
  458. Allocate a `Capture` in advance for use with `match`. This can save some time
  459. if you plan on performing several matches at once and only need the results
  460. between matches.
  461. Inputs:
  462. - allocator: (default: context.allocator)
  463. Returns:
  464. - result: The `Capture` with the maximum number of groups allocated.
  465. */
  466. @require_results
  467. preallocate_capture :: proc(allocator := context.allocator) -> (result: Capture) {
  468. context.allocator = allocator
  469. result.pos = make([][2]int, common.MAX_CAPTURE_GROUPS)
  470. result.groups = make([]string, common.MAX_CAPTURE_GROUPS)
  471. return
  472. }
  473. /*
  474. Free all data allocated by the `create*` procedures.
  475. *Frees Using Provided Allocator*
  476. Inputs:
  477. - regex: A regular expression.
  478. - allocator: (default: context.allocator)
  479. */
  480. destroy_regex :: proc(regex: Regular_Expression, allocator := context.allocator) {
  481. context.allocator = allocator
  482. delete(regex.program)
  483. for data in regex.class_data {
  484. delete(data.runes)
  485. delete(data.ranges)
  486. }
  487. delete(regex.class_data)
  488. }
  489. /*
  490. Free all data allocated by the `match_and_allocate_capture` procedure.
  491. *Frees Using Provided Allocator*
  492. Inputs:
  493. - capture: A `Capture`.
  494. - allocator: (default: context.allocator)
  495. */
  496. destroy_capture :: proc(capture: Capture, allocator := context.allocator) {
  497. context.allocator = allocator
  498. delete(capture.groups)
  499. delete(capture.pos)
  500. }
  501. /*
  502. Free all data allocated by the `create_iterator` procedure.
  503. *Frees Using Provided Allocator*
  504. Inputs:
  505. - it: A `Match_Iterator`
  506. - allocator: (default: context.allocator)
  507. */
  508. destroy_iterator :: proc(it: Match_Iterator, allocator := context.allocator) {
  509. context.allocator = allocator
  510. destroy(it.regex)
  511. destroy(it.capture)
  512. virtual_machine.destroy(it.vm)
  513. }
  514. destroy :: proc {
  515. destroy_regex,
  516. destroy_capture,
  517. destroy_iterator,
  518. }