瀏覽代碼

Merge pull request #5209 from Feoramund/regex-fixes

Fix RegEx iterator, remove `.Global`, make patterns unanchored by default (breaking change)
Jeroen van Rijn 4 月之前
父節點
當前提交
142dd58b27

+ 0 - 3
core/text/regex/common/common.odin

@@ -15,8 +15,6 @@ MAX_PROGRAM_SIZE   :: int(max(i16))
 MAX_CLASSES        :: int(max(u8))
 
 Flag :: enum u8 {
-	// Global: try to match the pattern anywhere in the string.
-	Global,
 	// Multiline: treat `^` and `$` as if they also match newlines.
 	Multiline,
 	// Case Insensitive: treat `a-z` as if it was also `A-Z`.
@@ -36,7 +34,6 @@ Flags :: bit_set[Flag; u8]
 
 @(rodata)
 Flag_To_Letter := #sparse[Flag]u8 {
-	.Global            = 'g',
 	.Multiline         = 'm',
 	.Case_Insensitive  = 'i',
 	.Ignore_Whitespace = 'x',

+ 9 - 5
core/text/regex/compiler/compiler.odin

@@ -401,7 +401,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 
 	pc_open := 0
 
-	add_global: if .Global in flags {
+	optimize_opening: {
 		// Check if the opening to the pattern is predictable.
 		// If so, use one of the optimized Wait opcodes.
 		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
@@ -412,7 +412,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Rune:
 				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
@@ -420,24 +420,28 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 				pc_open += size_of(Opcode)
 				inject_raw(&code, pc_open, operand)
 				pc_open += size_of(rune)
-				break add_global
+				break optimize_opening
 
 			case .Rune_Class:
 				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Rune_Class_Negated:
 				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Save:
 				continue
+
+			case .Assert_Start:
+				break optimize_opening
+
 			case:
 				break seek_loop
 			}

+ 62 - 9
core/text/regex/regex.odin

@@ -77,6 +77,8 @@ Match_Iterator :: struct {
 	vm:       virtual_machine.Machine,
 	idx:      int,
 	temp:     runtime.Allocator,
+	threads:  int,
+	done:     bool,
 }
 
 /*
@@ -101,7 +103,6 @@ create :: proc(
 	permanent_allocator := context.allocator,
 	temporary_allocator := context.temp_allocator,
 ) -> (result: Regular_Expression, err: Error) {
-
 	// For the sake of speed and simplicity, we first run all the intermediate
 	// processes such as parsing and compilation through the temporary
 	// allocator.
@@ -166,7 +167,6 @@ to escape the delimiter if found in the middle of the string.
 
 All runes after the closing delimiter will be parsed as flags:
 
-- 'g': Global
 - 'm': Multiline
 - 'i': Case_Insensitive
 - 'x': Ignore_Whitespace
@@ -243,7 +243,6 @@ create_by_user :: proc(
 	// to `end` here.
 	for r in pattern[start + end:] {
 		switch r {
-		case 'g': flags += { .Global }
 		case 'm': flags += { .Multiline }
 		case 'i': flags += { .Case_Insensitive }
 		case 'x': flags += { .Ignore_Whitespace }
@@ -282,8 +281,6 @@ create_iterator :: proc(
 	permanent_allocator := context.allocator,
 	temporary_allocator := context.temp_allocator,
 ) -> (result: Match_Iterator, err: Error) {
-	flags := flags
-	flags += {.Global} // We're iterating over a string, so the next match could start anywhere
 
 	if .Multiline in flags {
 		return {}, .Unsupported_Flag
@@ -294,6 +291,7 @@ create_iterator :: proc(
 	result.temp          = temporary_allocator
 	result.vm            = virtual_machine.create(result.regex.program, str)
 	result.vm.class_data = result.regex.class_data
+	result.threads       = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
 
 	return
 }
@@ -457,8 +455,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 	assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
 		"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
 
+	// Guard against situations in which the iterator should finish.
+	if it.done {
+		return
+	}
+
 	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
 
+	if it.idx > 0 {
+		// Reset the state needed to `virtual_machine.run` again.
+		it.vm.top_thread        = 0
+		it.vm.current_rune      = rune(0)
+		it.vm.current_rune_size = 0
+		for i in 0..<it.threads {
+			it.vm.threads[i]      = {}
+			it.vm.next_threads[i] = {}
+		}
+	}
+
+	// Take note of where the string pointer is before we start.
+	sp_before := it.vm.string_pointer
+
 	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
 	{
 		context.allocator = it.temp
@@ -469,6 +486,28 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 		}
 	}
 
+	if !ok {
+		// Match failed, bail out.
+		return
+	}
+
+	if it.vm.string_pointer == sp_before {
+		// The string pointer did not move, but there was a match.
+		//
+		// At this point, the pattern supplied to the iterator will infinitely
+		// loop if we do not intervene.
+		it.done = true
+	}
+	if it.vm.string_pointer == len(it.vm.memory) {
+		// The VM hit the end of the string.
+		//
+		// We do not check at the start, because a match of pattern `$`
+		// against string "" is valid and must return a match.
+		//
+		// This check prevents a double-match of `$` against a non-empty string.
+		it.done = true
+	}
+
 	str := string(it.vm.memory)
 	num_groups: int
 
@@ -488,9 +527,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 		num_groups = n
 	}
 
-	defer if ok {
-		it.idx += 1
-	}
+	defer it.idx += 1
 
 	if num_groups > 0 {
 		result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
@@ -504,8 +541,24 @@ match :: proc {
 	match_iterator,
 }
 
+/*
+Reset an iterator, allowing it to be run again as if new.
+
+Inputs:
+- it: The iterator to reset.
+*/
 reset :: proc(it: ^Match_Iterator) {
-	it.idx    = 0
+	it.done                 = false
+	it.idx                  = 0
+	it.vm.string_pointer    = 0
+
+	it.vm.top_thread        = 0
+	it.vm.current_rune      = rune(0)
+	it.vm.current_rune_size = 0
+	for i in 0..<it.threads {
+		it.vm.threads[i]      = {}
+		it.vm.next_threads[i] = {}
+	}
 }
 
 /*

+ 3 - 3
core/text/regex/virtual_machine/virtual_machine.odin

@@ -329,10 +329,10 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 
 run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
 	when UNICODE_MODE {
-		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
+		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:])
 	} else {
 		if len(vm.memory) > 0 {
-			vm.next_rune = cast(rune)vm.memory[0]
+			vm.next_rune = cast(rune)vm.memory[vm.string_pointer]
 			vm.next_rune_size = 1
 		}
 	}
@@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) {
 	delete(vm.busy_map)
 	free(vm.threads)
 	free(vm.next_threads)
-}
+}

+ 6 - 4
tests/benchmark/text/regex/benchmark_regex.odin

@@ -103,9 +103,11 @@ expensive_for_backtrackers :: proc(t: ^testing.T) {
 
 @test
 global_capture_end_word :: proc(t: ^testing.T) {
+	// NOTE: The previous behavior of `.Global`, which was to automatically
+	// insert `.*?` at the start of the pattern, is now default.
 	EXPR :: `Hellope World!`
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR, { /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -145,7 +147,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
 	EXPR :: `こにちは`
 	needle := string(EXPR)
 
-	rex, err := regex.create(EXPR, { .Global, .Unicode })
+	rex, err := regex.create(EXPR, { /*.Global,*/ .Unicode })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -185,7 +187,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
 alternations :: proc(t: ^testing.T) {
 	EXPR :: `a(?:bb|cc|dd|ee|ff)`
 
-	rex, err := regex.create(EXPR, { .No_Capture, .Global })
+	rex, err := regex.create(EXPR, { .No_Capture, /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -219,7 +221,7 @@ classes :: proc(t: ^testing.T) {
 	EXPR :: `[\w\d]+`
 	NEEDLE :: "0123456789abcdef"
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR, { /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}

+ 174 - 19
tests/core/text/regex/test_core_text_regex.odin

@@ -51,13 +51,13 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex
 }
 
 check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags := regex.Flags{}, loc := #caller_location) {
-	check_expression_with_flags(t, pattern, { .Global } + extra_flags,
+	check_expression_with_flags(t, pattern, extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags,
+	check_expression_with_flags(t, pattern, { .No_Optimization } + extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags,
+	check_expression_with_flags(t, pattern, { .Unicode } + extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags,
+	check_expression_with_flags(t, pattern, { .Unicode, .No_Optimization } + extra_flags,
 		haystack, ..needles, loc = loc)
 }
 
@@ -72,17 +72,18 @@ expect_error :: proc(t: ^testing.T, pattern: string, expected_error: typeid, fla
 	testing.expect_value(t, variant_ti, expected_ti, loc = loc)
 }
 
-check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) {
-	testing.expect_value(t, len(got.pos),    len(got.groups),      loc = loc)
-	testing.expect_value(t, len(got.pos),    len(expected.pos),    loc = loc)
-	testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc)
+check_capture :: proc(t: ^testing.T, got, expected: regex.Capture, loc := #caller_location) -> (ok: bool) {
+	testing.expect_value(t, len(got.pos),    len(got.groups),      loc = loc) or_return
+	testing.expect_value(t, len(got.pos),    len(expected.pos),    loc = loc) or_return
+	testing.expect_value(t, len(got.groups), len(expected.groups), loc = loc) or_return
 
 	if len(got.pos) == len(expected.pos) {
 		for i in 0..<len(got.pos) {
-			testing.expect_value(t, got.pos[i],    expected.pos[i],    loc = loc)
-			testing.expect_value(t, got.groups[i], expected.groups[i], loc = loc)
+			testing.expect_value(t, got.pos[i],    expected.pos[i],    loc = loc) or_return
+			testing.expect_value(t, got.groups[i], expected.groups[i], loc = loc) or_return
 		}
 	}
+	return true
 }
 
 @test
@@ -212,6 +213,12 @@ test_alternation :: proc(t: ^testing.T) {
 	check_expression(t, EXPR, "cc", "cc")
 }
 
+@test
+test_alternation_order :: proc(t: ^testing.T) {
+	check_expression(t, "a|ab", "ab", "a")
+	check_expression(t, "ab|a", "ab", "ab")
+}
+
 @test
 test_optional :: proc(t: ^testing.T) {
 	EXPR :: "a?a?a?aaa"
@@ -516,7 +523,7 @@ test_pos_index_explicitly :: proc(t: ^testing.T) {
 	STR :: "This is an island."
 	EXPR :: `\bis\b`
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR)
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -642,9 +649,9 @@ test_unicode_explicitly :: proc(t: ^testing.T) {
 	}
 	{
 		EXPR :: "こにちは!"
-		check_expression_with_flags(t, EXPR, { .Global, .Unicode },
+		check_expression_with_flags(t, EXPR, { .Unicode },
 			"Hello こにちは!", "こにちは!")
-		check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization },
+		check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization },
 			"Hello こにちは!", "こにちは!")
 	}
 }
@@ -901,12 +908,12 @@ test_everything_at_once :: proc(t: ^testing.T) {
 @test
 test_creation_from_user_string :: proc(t: ^testing.T) {
 	{
-		USER_EXPR :: `/^hellope$/gmixun-`
+		USER_EXPR :: `/^hellope$/mixun-`
 		STR :: "hellope"
 		rex, err := regex.create_by_user(USER_EXPR)
 		defer regex.destroy(rex)
 		testing.expect_value(t, err, nil)
-		testing.expect_value(t, rex.flags, regex.Flags{ .Global, .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
+		testing.expect_value(t, rex.flags, regex.Flags{ .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
 
 		_, ok := regex.match(rex, STR)
 		testing.expectf(t, ok, "expected user-provided RegEx %v to match %q", rex, STR)
@@ -1102,24 +1109,121 @@ Iterator_Test :: struct {
 
 iterator_vectors := []Iterator_Test{
 	{
-		`xxab32ab52xx`, `(ab\d{1})`, {}, // {.Global} implicitly added by the iterator
+		`xxab32ab52xx`, `(ab\d{1})`, {},
 		{
 			{pos = {{2, 5}, {2, 5}}, groups = {"ab3", "ab3"}},
 			{pos = {{6, 9}, {6, 9}}, groups = {"ab5", "ab5"}},
 		},
 	},
 	{
-		`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {.Global},
+		`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {},
 		{
 			{pos = {{2,  8},  {3,  4},  {6,  8}}, groups = {"foobar", "o", "ar"}},
 			{pos = {{9, 15}, {10, 11}, {13, 15}}, groups = {"foobar", "o", "ar"}},
 		},
 	},
+	{
+		`aaa`, `a`, {},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+			{pos = {{1,  2}}, groups = {"a"}},
+			{pos = {{2,  3}}, groups = {"a"}},
+		},
+	},
+	{
+		`aaa`, `\w`, {},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+			{pos = {{1,  2}}, groups = {"a"}},
+			{pos = {{2,  3}}, groups = {"a"}},
+		},
+	},
+	{
+		`aかか`, `.`, {.Unicode},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+			{pos = {{1,  4}}, groups = {"か"}},
+			{pos = {{4,  7}}, groups = {"か"}},
+		},
+	},
+	{
+		`ゆめ.`, `.`, {.Unicode},
+		{
+			{pos = {{0,  3}}, groups = {"ゆ"}},
+			{pos = {{3,  6}}, groups = {"め"}},
+			{pos = {{6,  7}}, groups = {"."}},
+		},
+	},
+	// This pattern is not anchored, so this is valid per the new behavior.
+	{
+		`ababa`, `(?:a|ab)`, {},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+			{pos = {{2,  3}}, groups = {"a"}},
+			{pos = {{4,  5}}, groups = {"a"}},
+		},
+	},
+	// Ensure alternation follows expected ordering of left-higher priority.
+	{
+		`ababa`, `(?:ab|a)`, {},
+		{
+			{pos = {{0,  2}}, groups = {"ab"}},
+			{pos = {{2,  4}}, groups = {"ab"}},
+			{pos = {{4,  5}}, groups = {"a"}},
+		},
+	},
+	// This one is anchored, so only one match.
+	//
+	// This test ensures the behavior of `Assert_Start` is checking against the
+	// start of the string and we haven't just slid the memory string itself.
+	{
+		`ababa`, `^(?:a|b)`, {},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+		},
+	},
+	{
+		`ababa`, `a$`, {},
+		{
+			{pos = {{4,  5}}, groups = {"a"}},
+		},
+	},
+	// A blank pattern on a blank string is valid and must not loop infinitely.
+	{
+		``, ``, {},
+		{
+			{pos = {{0,  0}}, groups = {""}},
+		},
+	},
+	// These too are valid.
+	//
+	// The iterator must bail out when it encounters any situation which would
+	// loop infinitely, but only after giving at least one valid match if one
+	// exists, as this would accord with attempting to singularly match the
+	// pattern against the string and provide a positive result.
+	{
+		`a`, ``, {},
+		{
+			{pos = {{0,  0}}, groups = {""}},
+		},
+	},
+	{
+		``, `$`, {},
+		{
+			{pos = {{0,  0}}, groups = {""}},
+		},
+	},
+	{
+		`aaa`, `$`, {},
+		{
+			{pos = {{3,  3}}, groups = {""}},
+		},
+	},
 }
 
 @test
 test_match_iterator :: proc(t: ^testing.T) {
-	for test in iterator_vectors {
+	vector: for test in iterator_vectors {
 		it, err := regex.create_iterator(test.haystack, test.pattern, test.flags)
 		defer regex.destroy(it)
 
@@ -1128,10 +1232,61 @@ test_match_iterator :: proc(t: ^testing.T) {
 
 		for capture, idx in regex.match(&it) {
 			if idx >= len(test.expected) {
+				log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
+				continue vector
+			}
+			if !check_capture(t, capture, test.expected[idx]) {
+				log.errorf("capture check failed on string %q against pattern %q", test.haystack, test.pattern)
+			}
+		}
+		testing.expectf(t, it.idx == len(test.expected), "expected idx to be %i, got %i on string %q against pattern %q", len(test.expected), it.idx, test.haystack, test.pattern)
+	}
+}
+
+@test
+test_iterator_reset :: proc(t: ^testing.T) {
+	test := Iterator_Test{
+		`aaa`, `a`, {},
+		{
+			{pos = {{0,  1}}, groups = {"a"}},
+			{pos = {{1,  2}}, groups = {"a"}},
+			{pos = {{2,  3}}, groups = {"a"}},
+		},
+	}
+
+	out: {
+		it, err := regex.create_iterator(`aaa`, `a`)
+		defer regex.destroy(it)
+
+		testing.expect_value(t, err, nil)
+		(err == nil) or_break out
+
+		for capture, idx in regex.match(&it) {
+			if idx >= len(test.expected) {
+				log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
 				break
 			}
 			check_capture(t, capture, test.expected[idx])
 		}
 		testing.expect_value(t, it.idx, len(test.expected))
+
+		// Do it again.
+		iterations := 0
+		regex.reset(&it)
+
+		// Mind that this loop can do nothing if it wasn't reset but leave us
+		// with the expected `idx` state.
+		//
+		// That's why we count iterations this time around.
+		for capture, idx in regex.match(&it) {
+			iterations += 1
+			if idx >= len(test.expected) {
+				log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
+				break
+			}
+			check_capture(t, capture, test.expected[idx])
+		}
+		testing.expect_value(t, it.idx, len(test.expected))
+		testing.expect_value(t, iterations, len(test.expected))
 	}
-}
+}