123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530 |
- package regex_optimizer
- /*
- (c) Copyright 2024 Feoramund <[email protected]>.
- Made available under Odin's BSD-3 license.
- List of contributors:
- Feoramund: Initial implementation.
- */
- import "base:intrinsics"
- @require import "core:io"
- import "core:slice"
- import "core:text/regex/common"
- import "core:text/regex/parser"
- Rune_Class_Range :: parser.Rune_Class_Range
- Node :: parser.Node
- Node_Rune :: parser.Node_Rune
- Node_Rune_Class :: parser.Node_Rune_Class
- Node_Wildcard :: parser.Node_Wildcard
- Node_Concatenation :: parser.Node_Concatenation
- Node_Alternation :: parser.Node_Alternation
- Node_Repeat_Zero :: parser.Node_Repeat_Zero
- Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
- Node_Repeat_One :: parser.Node_Repeat_One
- Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
- Node_Repeat_N :: parser.Node_Repeat_N
- Node_Optional :: parser.Node_Optional
- Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
- Node_Group :: parser.Node_Group
- Node_Anchor :: parser.Node_Anchor
- Node_Word_Boundary :: parser.Node_Word_Boundary
- Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
- class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
- return i.lower < j.lower
- }
- optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
- if tree == nil {
- return nil, 0
- }
- result = tree
- switch specific in tree {
- // No direct optimization possible on these nodes:
- case ^Node_Rune: break
- case ^Node_Wildcard: break
- case ^Node_Anchor: break
- case ^Node_Word_Boundary: break
- case ^Node_Match_All_And_Escape: break
- case ^Node_Concatenation:
- // * Composition: Consume All to Anchored End
- //
- // DO: `.*$` => <special opcode>
- // DO: `.+$` => `.` <special opcode>
- if .Multiline not_in flags && len(specific.nodes) >= 2 {
- i := len(specific.nodes) - 2
- wrza: {
- subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
- _ = subnode.inner.(^Node_Wildcard) or_break wrza
- next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
- if next_node.start == false {
- specific.nodes[i] = new(Node_Match_All_And_Escape)
- ordered_remove(&specific.nodes, i + 1)
- changes += 1
- break
- }
- }
- wroa: {
- subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
- subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
- next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
- if next_node.start == false {
- specific.nodes[i] = subsubnode
- specific.nodes[i+1] = new(Node_Match_All_And_Escape)
- changes += 1
- break
- }
- }
- }
- // Only recursive optimizations:
- #no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
- subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
- changes += subnode_changes
- if subnode == nil {
- ordered_remove(&specific.nodes, i)
- i -= 1
- changes += 1
- } else {
- specific.nodes[i] = subnode
- }
- }
- if len(specific.nodes) == 1 {
- result = specific.nodes[0]
- changes += 1
- } else if len(specific.nodes) == 0 {
- return nil, changes + 1
- }
- case ^Node_Repeat_Zero:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Repeat_Zero_Non_Greedy:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Repeat_One:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Repeat_One_Non_Greedy:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Repeat_N:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Optional:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Optional_Non_Greedy:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- case ^Node_Group:
- specific.inner, changes = optimize_subtree(specific.inner, flags)
- if specific.inner == nil {
- return nil, changes + 1
- }
- if !specific.capture {
- result = specific.inner
- changes += 1
- }
- // Full optimization:
- case ^Node_Rune_Class:
- // * Class Simplification
- //
- // DO: `[aab]` => `[ab]`
- // DO: `[aa]` => `[a]`
- runes_seen: map[rune]bool
- for r in specific.runes {
- runes_seen[r] = true
- }
- if len(runes_seen) != len(specific.runes) {
- clear(&specific.runes)
- for key in runes_seen {
- append(&specific.runes, key)
- }
- changes += 1
- }
- // * Class Reduction
- //
- // DO: `[a]` => `a`
- if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
- only_rune := specific.runes[0]
- node := new(Node_Rune)
- node.data = only_rune
- return node, changes + 1
- }
- // * Range Construction
- //
- // DO: `[abc]` => `[a-c]`
- slice.sort(specific.runes[:])
- if len(specific.runes) > 1 {
- new_range: Rune_Class_Range
- new_range.lower = specific.runes[0]
- new_range.upper = specific.runes[0]
- #no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
- r := specific.runes[i]
- if new_range.lower == -1 {
- new_range = { r, r }
- continue
- }
- if r == new_range.lower - 1 {
- new_range.lower -= 1
- ordered_remove(&specific.runes, i)
- i -= 1
- changes += 1
- } else if r == new_range.upper + 1 {
- new_range.upper += 1
- ordered_remove(&specific.runes, i)
- i -= 1
- changes += 1
- } else if new_range.lower != new_range.upper {
- append(&specific.ranges, new_range)
- new_range = { -1, -1 }
- changes += 1
- }
- }
- if new_range.lower != new_range.upper {
- append(&specific.ranges, new_range)
- changes += 1
- }
- }
- // * Rune Merging into Range
- //
- // DO: `[aa-c]` => `[a-c]`
- for range in specific.ranges {
- #no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
- r := specific.runes[i]
- if range.lower <= r && r <= range.upper {
- ordered_remove(&specific.runes, i)
- i -= 1
- changes += 1
- }
- }
- }
- // * Range Merging
- //
- // DO: `[a-cc-e]` => `[a-e]`
- // DO: `[a-cd-e]` => `[a-e]`
- // DO: `[a-cb-e]` => `[a-e]`
- slice.sort_by(specific.ranges[:], class_range_sorter)
- #no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
- for j := i + 1; j < len(specific.ranges); j += 1 {
- left_range := &specific.ranges[i]
- right_range := specific.ranges[j]
- if left_range.upper == right_range.lower ||
- left_range.upper == right_range.lower - 1 ||
- left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
- left_range.upper = max(left_range.upper, right_range.upper)
- ordered_remove(&specific.ranges, j)
- j -= 1
- changes += 1
- } else {
- break
- }
- }
- }
- if len(specific.ranges) == 0 {
- specific.ranges = {}
- }
- if len(specific.runes) == 0 {
- specific.runes = {}
- }
- // * NOP
- //
- // DO: `[]` => <nil>
- if len(specific.ranges) + len(specific.runes) == 0 {
- return nil, 1
- }
- slice.sort(specific.runes[:])
- slice.sort_by(specific.ranges[:], class_range_sorter)
- case ^Node_Alternation:
- // Perform recursive optimization first.
- left_changes, right_changes: int
- specific.left, left_changes = optimize_subtree(specific.left, flags)
- specific.right, right_changes = optimize_subtree(specific.right, flags)
- changes += left_changes + right_changes
- // * Alternation to Optional
- //
- // DO: `a|` => `a?`
- if specific.left != nil && specific.right == nil {
- node := new(Node_Optional)
- node.inner = specific.left
- return node, 1
- }
- // * Alternation to Optional Non-Greedy
- //
- // DO: `|a` => `a??`
- if specific.right != nil && specific.left == nil {
- node := new(Node_Optional_Non_Greedy)
- node.inner = specific.right
- return node, 1
- }
- // * NOP
- //
- // DO: `|` => <nil>
- if specific.left == nil && specific.right == nil {
- return nil, 1
- }
- left_rune, left_is_rune := specific.left.(^Node_Rune)
- right_rune, right_is_rune := specific.right.(^Node_Rune)
- if left_is_rune && right_is_rune {
- if left_rune.data == right_rune.data {
- // * Alternation Reduction
- //
- // DO: `a|a` => `a`
- return left_rune, 1
- } else {
- // * Alternation to Class
- //
- // DO: `a|b` => `[ab]`
- node := new(Node_Rune_Class)
- append(&node.runes, left_rune.data)
- append(&node.runes, right_rune.data)
- return node, 1
- }
- }
- left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
- right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)
- // * Class Union
- //
- // DO: `[a0]|[b1]` => `[a0b1]`
- left_class, left_is_class := specific.left.(^Node_Rune_Class)
- right_class, right_is_class := specific.right.(^Node_Rune_Class)
- if left_is_class && right_is_class {
- for r in right_class.runes {
- append(&left_class.runes, r)
- }
- for range in right_class.ranges {
- append(&left_class.ranges, range)
- }
- return left_class, 1
- }
- // * Class Union
- //
- // DO: `[a-b]|c` => `[a-bc]`
- if left_is_class && right_is_rune {
- append(&left_class.runes, right_rune.data)
- return left_class, 1
- }
- // * Class Union
- //
- // DO: `a|[b-c]` => `[b-ca]`
- if left_is_rune && right_is_class {
- append(&right_class.runes, left_rune.data)
- return right_class, 1
- }
- // * Wildcard Reduction
- //
- // DO: `a|.` => `.`
- if left_is_rune && right_is_wildcard {
- return right_wildcard, 1
- }
- // * Wildcard Reduction
- //
- // DO: `.|a` => `.`
- if left_is_wildcard && right_is_rune {
- return left_wildcard, 1
- }
- // * Wildcard Reduction
- //
- // DO: `[ab]|.` => `.`
- if left_is_class && right_is_wildcard {
- return right_wildcard, 1
- }
- // * Wildcard Reduction
- //
- // DO: `.|[ab]` => `.`
- if left_is_wildcard && right_is_class {
- return left_wildcard, 1
- }
- left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
- right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)
- // * Common Suffix Elimination
- //
- // DO: `blueberry|strawberry` => `(?:blue|straw)berry`
- if left_is_concatenation && right_is_concatenation {
- // Remember that a concatenation could contain any node, not just runes.
- left_len := len(left_concatenation.nodes)
- right_len := len(right_concatenation.nodes)
- least_len := min(left_len, right_len)
- same_len := 0
- for i := 1; i <= least_len; i += 1 {
- left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
- right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)
- if !left_is_subrune || !right_is_subrune {
- // One of the nodes isn't a rune; there's nothing more we can do.
- break
- }
- if left_subrune.data == right_subrune.data {
- same_len += 1
- } else {
- // No more similarities.
- break
- }
- }
- if same_len > 0 {
- // Dissolve this alternation into a concatenation.
- cat_node := new(Node_Concatenation)
- group_node := new(Node_Group)
- append(&cat_node.nodes, group_node)
- // Turn the concatenation into the common suffix.
- for i := left_len - same_len; i < left_len; i += 1 {
- append(&cat_node.nodes, left_concatenation.nodes[i])
- }
- // Construct the group of alternating prefixes.
- for i := same_len; i > 0; i -= 1 {
- pop(&left_concatenation.nodes)
- pop(&right_concatenation.nodes)
- }
- // (Re-using this alternation node.)
- alter_node := specific
- alter_node.left = left_concatenation
- alter_node.right = right_concatenation
- group_node.inner = alter_node
- return cat_node, 1
- }
- }
- // * Common Prefix Elimination
- //
- // DO: `abi|abe` => `ab(?:i|e)`
- if left_is_concatenation && right_is_concatenation {
- // Try to identify a common prefix.
- // Remember that a concatenation could contain any node, not just runes.
- least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
- same_len := 0
- for i := 0; i < least_len; i += 1 {
- left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
- right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)
- if !left_is_subrune || !right_is_subrune {
- // One of the nodes isn't a rune; there's nothing more we can do.
- break
- }
- if left_subrune.data == right_subrune.data {
- same_len = i + 1
- } else {
- // No more similarities.
- break
- }
- }
- if same_len > 0 {
- cat_node := new(Node_Concatenation)
- for i := 0; i < same_len; i += 1 {
- append(&cat_node.nodes, left_concatenation.nodes[i])
- }
- for i := same_len; i > 0; i -= 1 {
- ordered_remove(&left_concatenation.nodes, 0)
- ordered_remove(&right_concatenation.nodes, 0)
- }
- group_node := new(Node_Group)
- // (Re-using this alternation node.)
- alter_node := specific
- alter_node.left = left_concatenation
- alter_node.right = right_concatenation
- group_node.inner = alter_node
- append(&cat_node.nodes, group_node)
- return cat_node, 1
- }
- }
- }
- return
- }
- optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
- result = tree
- new_changes := 0
- when common.ODIN_DEBUG_REGEX {
- io.write_string(common.debug_stream, "AST before Optimizer: ")
- parser.write_node(common.debug_stream, tree)
- io.write_byte(common.debug_stream, '\n')
- }
- // Keep optimizing until no more changes are seen.
- for {
- result, new_changes = optimize_subtree(result, flags)
- changes += new_changes
- if new_changes == 0 {
- break
- }
- }
- when common.ODIN_DEBUG_REGEX {
- io.write_string(common.debug_stream, "AST after Optimizer: ")
- parser.write_node(common.debug_stream, result)
- io.write_byte(common.debug_stream, '\n')
- }
- return
- }
|