regex.odin 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. package regex
  2. /*
  3. (c) Copyright 2024 Feoramund <[email protected]>.
  4. Made available under Odin's BSD-3 license.
  5. List of contributors:
  6. Feoramund: Initial implementation.
  7. */
  8. import "core:text/regex/common"
  9. import "core:text/regex/compiler"
  10. import "core:text/regex/optimizer"
  11. import "core:text/regex/parser"
  12. import "core:text/regex/virtual_machine"
  13. Flag :: common.Flag
  14. Flags :: common.Flags
  15. Parser_Error :: parser.Error
  16. Compiler_Error :: compiler.Error
  17. Creation_Error :: enum {
  18. None,
  19. // A `\` was supplied as the delimiter to `create_by_user`.
  20. Bad_Delimiter,
  21. // A pair of delimiters for `create_by_user` was not found.
  22. Expected_Delimiter,
  23. // An unknown letter was supplied to `create_by_user` after the last delimiter.
  24. Unknown_Flag,
  25. }
  26. Error :: union #shared_nil {
  27. // An error that can occur in the pattern parsing phase.
  28. //
  29. // Most of these are regular expression syntax errors and are either
  30. // context-dependent as to what they mean or have self-explanatory names.
  31. Parser_Error,
  32. // An error that can occur in the pattern compiling phase.
  33. //
  34. // Of the two that can be returned, they have to do with exceeding the
  35. // limitations of the Virtual Machine.
  36. Compiler_Error,
  37. // An error that occurs only for `create_by_user`.
  38. Creation_Error,
  39. }
  40. /*
  41. This struct corresponds to a set of string captures from a RegEx match.
  42. `pos` will contain the start and end positions for each string in `groups`,
  43. such that `str[pos[0][0]:pos[0][1]] == groups[0]`.
  44. */
  45. Capture :: struct {
  46. pos: [][2]int,
  47. groups: []string,
  48. }
  49. /*
  50. A compiled Regular Expression value, to be used with the `match_*` procedures.
  51. */
  52. Regular_Expression :: struct {
  53. flags: Flags `fmt:"-"`,
  54. class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`,
  55. program: []virtual_machine.Opcode `fmt:"-"`,
  56. }
  57. /*
  58. Create a regular expression from a string pattern and a set of flags.
  59. *Allocates Using Provided Allocators*
  60. Inputs:
  61. - pattern: The pattern to compile.
  62. - flags: A `bit_set` of RegEx flags.
  63. - permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
  64. - temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
  65. Returns:
  66. - result: The regular expression.
  67. - err: An error, if one occurred.
  68. */
  69. @require_results
  70. create :: proc(
  71. pattern: string,
  72. flags: Flags = {},
  73. permanent_allocator := context.allocator,
  74. temporary_allocator := context.temp_allocator,
  75. ) -> (result: Regular_Expression, err: Error) {
  76. // For the sake of speed and simplicity, we first run all the intermediate
  77. // processes such as parsing and compilation through the temporary
  78. // allocator.
  79. program: [dynamic]virtual_machine.Opcode = ---
  80. class_data: [dynamic]parser.Rune_Class_Data = ---
  81. {
  82. context.allocator = temporary_allocator
  83. ast := parser.parse(pattern, flags) or_return
  84. if .No_Optimization not_in flags {
  85. ast, _ = optimizer.optimize(ast, flags)
  86. }
  87. program, class_data = compiler.compile(ast, flags) or_return
  88. }
  89. // When that's successful, re-allocate all at once with the permanent
  90. // allocator so everything can be tightly packed.
  91. context.allocator = permanent_allocator
  92. result.flags = flags
  93. if len(class_data) > 0 {
  94. result.class_data = make([]virtual_machine.Rune_Class_Data, len(class_data))
  95. }
  96. for data, i in class_data {
  97. if len(data.runes) > 0 {
  98. result.class_data[i].runes = make([]rune, len(data.runes))
  99. copy(result.class_data[i].runes, data.runes[:])
  100. }
  101. if len(data.ranges) > 0 {
  102. result.class_data[i].ranges = make([]virtual_machine.Rune_Class_Range, len(data.ranges))
  103. copy(result.class_data[i].ranges, data.ranges[:])
  104. }
  105. }
  106. result.program = make([]virtual_machine.Opcode, len(program))
  107. copy(result.program, program[:])
  108. return
  109. }
  110. /*
  111. Create a regular expression from a delimited string pattern, such as one
  112. provided by users of a program or those found in a configuration file.
  113. They are in the form of:
  114. [DELIMITER] [regular expression] [DELIMITER] [flags]
  115. For example, the following strings are valid:
  116. /hellope/i
  117. #hellope#i
  118. •hellope•i
  119. つhellopeつi
  120. The delimiter is determined by the very first rune in the string.
  121. The only restriction is that the delimiter cannot be `\`, as that rune is used
  122. to escape the delimiter if found in the middle of the string.
  123. All runes after the closing delimiter will be parsed as flags:
  124. - 'g': Global
  125. - 'm': Multiline
  126. - 'i': Case_Insensitive
  127. - 'x': Ignore_Whitespace
  128. - 'u': Unicode
  129. - 'n': No_Capture
  130. - '-': No_Optimization
  131. *Allocates Using Provided Allocators*
  132. Inputs:
  133. - pattern: The delimited pattern with optional flags to compile.
  134. - str: The string to match against.
  135. - permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
  136. - temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
  137. Returns:
  138. - result: The regular expression.
  139. - err: An error, if one occurred.
  140. */
  141. @require_results
  142. create_by_user :: proc(
  143. pattern: string,
  144. permanent_allocator := context.allocator,
  145. temporary_allocator := context.temp_allocator,
  146. ) -> (result: Regular_Expression, err: Error) {
  147. if len(pattern) == 0 {
  148. err = .Expected_Delimiter
  149. return
  150. }
  151. delimiter: rune
  152. start := -1
  153. end := -1
  154. flags: Flags
  155. escaping: bool
  156. parse_loop: for r, i in pattern {
  157. if delimiter == 0 {
  158. if r == '\\' {
  159. err = .Bad_Delimiter
  160. return
  161. }
  162. delimiter = r
  163. continue parse_loop
  164. }
  165. if start == -1 {
  166. start = i
  167. }
  168. if escaping {
  169. escaping = false
  170. continue parse_loop
  171. }
  172. switch r {
  173. case '\\':
  174. escaping = true
  175. case delimiter:
  176. end = i
  177. break parse_loop
  178. }
  179. }
  180. if end == -1 {
  181. err = .Expected_Delimiter
  182. return
  183. }
  184. // `start` is also the size of the delimiter, which is why it's being added
  185. // to `end` here.
  186. for r in pattern[start + end:] {
  187. switch r {
  188. case 'g': flags += { .Global }
  189. case 'm': flags += { .Multiline }
  190. case 'i': flags += { .Case_Insensitive }
  191. case 'x': flags += { .Ignore_Whitespace }
  192. case 'u': flags += { .Unicode }
  193. case 'n': flags += { .No_Capture }
  194. case '-': flags += { .No_Optimization }
  195. case:
  196. err = .Unknown_Flag
  197. return
  198. }
  199. }
  200. return create(pattern[start:end], flags, permanent_allocator, temporary_allocator)
  201. }
  202. /*
  203. Match a regular expression against a string and allocate the results into the
  204. returned `capture` structure.
  205. The resulting capture strings will be slices to the string `str`, not wholly
  206. copied strings, so they won't need to be individually deleted.
  207. *Allocates Using Provided Allocators*
  208. Inputs:
  209. - regex: The regular expression.
  210. - str: The string to match against.
  211. - permanent_allocator: The allocator to use for the capture results. (default: context.allocator)
  212. - temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
  213. Returns:
  214. - capture: The capture groups found in the string.
  215. - success: True if the regex matched the string.
  216. */
  217. @require_results
  218. match_and_allocate_capture :: proc(
  219. regex: Regular_Expression,
  220. str: string,
  221. permanent_allocator := context.allocator,
  222. temporary_allocator := context.temp_allocator,
  223. ) -> (capture: Capture, success: bool) {
  224. saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
  225. {
  226. context.allocator = temporary_allocator
  227. vm := virtual_machine.create(regex.program, str)
  228. vm.class_data = regex.class_data
  229. if .Unicode in regex.flags {
  230. saved, success = virtual_machine.run(&vm, true)
  231. } else {
  232. saved, success = virtual_machine.run(&vm, false)
  233. }
  234. }
  235. if saved != nil {
  236. context.allocator = permanent_allocator
  237. num_groups := 0
  238. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  239. a, b := saved[i], saved[i + 1]
  240. if a == -1 || b == -1 {
  241. continue
  242. }
  243. num_groups += 1
  244. }
  245. if num_groups > 0 {
  246. capture.groups = make([]string, num_groups)
  247. capture.pos = make([][2]int, num_groups)
  248. n := 0
  249. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  250. a, b := saved[i], saved[i + 1]
  251. if a == -1 || b == -1 {
  252. continue
  253. }
  254. capture.groups[n] = str[a:b]
  255. capture.pos[n] = {a, b}
  256. n += 1
  257. }
  258. }
  259. }
  260. return
  261. }
  262. /*
  263. Match a regular expression against a string and save the capture results into
  264. the provided `capture` structure.
  265. The resulting capture strings will be slices to the string `str`, not wholly
  266. copied strings, so they won't need to be individually deleted.
  267. *Allocates Using Provided Allocator*
  268. Inputs:
  269. - regex: The regular expression.
  270. - str: The string to match against.
  271. - capture: A pointer to a Capture structure with `groups` and `pos` already allocated.
  272. - temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
  273. Returns:
  274. - num_groups: The number of capture groups set into `capture`.
  275. - success: True if the regex matched the string.
  276. */
  277. @require_results
  278. match_with_preallocated_capture :: proc(
  279. regex: Regular_Expression,
  280. str: string,
  281. capture: ^Capture,
  282. temporary_allocator := context.temp_allocator,
  283. ) -> (num_groups: int, success: bool) {
  284. assert(capture != nil, "Pre-allocated RegEx capture must not be nil.")
  285. assert(len(capture.groups) >= common.MAX_CAPTURE_GROUPS,
  286. "Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
  287. assert(len(capture.pos) >= common.MAX_CAPTURE_GROUPS,
  288. "Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
  289. saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
  290. {
  291. context.allocator = temporary_allocator
  292. vm := virtual_machine.create(regex.program, str)
  293. vm.class_data = regex.class_data
  294. if .Unicode in regex.flags {
  295. saved, success = virtual_machine.run(&vm, true)
  296. } else {
  297. saved, success = virtual_machine.run(&vm, false)
  298. }
  299. }
  300. if saved != nil {
  301. n := 0
  302. #no_bounds_check for i := 0; i < len(saved); i += 2 {
  303. a, b := saved[i], saved[i + 1]
  304. if a == -1 || b == -1 {
  305. continue
  306. }
  307. capture.groups[n] = str[a:b]
  308. capture.pos[n] = {a, b}
  309. n += 1
  310. }
  311. }
  312. return
  313. }
  314. match :: proc {
  315. match_and_allocate_capture,
  316. match_with_preallocated_capture,
  317. }
  318. /*
  319. Allocate a `Capture` in advance for use with `match`. This can save some time
  320. if you plan on performing several matches at once and only need the results
  321. between matches.
  322. Inputs:
  323. - allocator: (default: context.allocator)
  324. Returns:
  325. - result: The `Capture` with the maximum number of groups allocated.
  326. */
  327. @require_results
  328. preallocate_capture :: proc(allocator := context.allocator) -> (result: Capture) {
  329. context.allocator = allocator
  330. result.pos = make([][2]int, common.MAX_CAPTURE_GROUPS)
  331. result.groups = make([]string, common.MAX_CAPTURE_GROUPS)
  332. return
  333. }
  334. /*
  335. Free all data allocated by the `create*` procedures.
  336. *Frees Using Provided Allocator*
  337. Inputs:
  338. - regex: A regular expression.
  339. - allocator: (default: context.allocator)
  340. */
  341. destroy_regex :: proc(regex: Regular_Expression, allocator := context.allocator) {
  342. context.allocator = allocator
  343. delete(regex.program)
  344. for data in regex.class_data {
  345. delete(data.runes)
  346. delete(data.ranges)
  347. }
  348. delete(regex.class_data)
  349. }
  350. /*
  351. Free all data allocated by the `match_and_allocate_capture` procedure.
  352. *Frees Using Provided Allocator*
  353. Inputs:
  354. - capture: A Capture.
  355. - allocator: (default: context.allocator)
  356. */
  357. destroy_capture :: proc(capture: Capture, allocator := context.allocator) {
  358. context.allocator = allocator
  359. delete(capture.groups)
  360. delete(capture.pos)
  361. }
  362. destroy :: proc {
  363. destroy_regex,
  364. destroy_capture,
  365. }