ソースを参照

Make RegEx VM restartable and fix iterator infinite loop

Feoramund 3 ヶ月 前
コミット
fedb9efb41

+ 62 - 5
core/text/regex/regex.odin

@@ -77,6 +77,8 @@ Match_Iterator :: struct {
 	vm:       virtual_machine.Machine,
 	idx:      int,
 	temp:     runtime.Allocator,
+	threads:  int,
+	done:     bool,
 }
 
 /*
@@ -101,7 +103,6 @@ create :: proc(
 	permanent_allocator := context.allocator,
 	temporary_allocator := context.temp_allocator,
 ) -> (result: Regular_Expression, err: Error) {
-
 	// For the sake of speed and simplicity, we first run all the intermediate
 	// processes such as parsing and compilation through the temporary
 	// allocator.
@@ -294,6 +295,7 @@ create_iterator :: proc(
 	result.temp          = temporary_allocator
 	result.vm            = virtual_machine.create(result.regex.program, str)
 	result.vm.class_data = result.regex.class_data
+	result.threads       = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
 
 	return
 }
@@ -457,8 +459,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 	assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
 		"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
 
+	// Guard against situations in which the iterator should finish.
+	if it.done {
+		return
+	}
+
 	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
 
+	if it.idx > 0 {
+		// Reset the state needed to call `virtual_machine.run` again.
+		it.vm.top_thread        = 0
+		it.vm.current_rune      = rune(0)
+		it.vm.current_rune_size = 0
+		for i in 0..<it.threads {
+			it.vm.threads[i]      = {}
+			it.vm.next_threads[i] = {}
+		}
+	}
+
+	// Take note of where the string pointer is before we start.
+	sp_before := it.vm.string_pointer
+
 	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
 	{
 		context.allocator = it.temp
@@ -469,6 +490,28 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 		}
 	}
 
+	if !ok {
+		// Match failed, bail out.
+		return
+	}
+
+	if it.vm.string_pointer == sp_before {
+		// The string pointer did not move, but there was a match.
+		//
+		// At this point, the pattern supplied to the iterator will infinitely
+		// loop if we do not intervene.
+		it.done = true
+	}
+	if it.vm.string_pointer == len(it.vm.memory) {
+		// The VM hit the end of the string.
+		//
+		// We do not check at the start, because a match of pattern `$`
+		// against string "" is valid and must return a match.
+		//
+		// This check prevents a double-match of `$` against a non-empty string.
+		it.done = true
+	}
+
 	str := string(it.vm.memory)
 	num_groups: int
 
@@ -488,9 +531,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
 		num_groups = n
 	}
 
-	defer if ok {
-		it.idx += 1
-	}
+	defer it.idx += 1
 
 	if num_groups > 0 {
 		result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
@@ -504,8 +545,24 @@ match :: proc {
 	match_iterator,
 }
 
+/*
+Reset an iterator, allowing it to be run again as if new.
+
+Inputs:
+- it: The iterator to reset.
+*/
 reset :: proc(it: ^Match_Iterator) {
-	it.idx    = 0
+	it.done                 = false
+	it.idx                  = 0
+	it.vm.string_pointer    = 0
+
+	it.vm.top_thread        = 0
+	it.vm.current_rune      = rune(0)
+	it.vm.current_rune_size = 0
+	for i in 0..<it.threads {
+		it.vm.threads[i]      = {}
+		it.vm.next_threads[i] = {}
+	}
 }
 
 /*

+ 3 - 3
core/text/regex/virtual_machine/virtual_machine.odin

@@ -329,10 +329,10 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 
 run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
 	when UNICODE_MODE {
-		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
+		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:])
 	} else {
 		if len(vm.memory) > 0 {
-			vm.next_rune = cast(rune)vm.memory[0]
+			vm.next_rune = cast(rune)vm.memory[vm.string_pointer]
 			vm.next_rune_size = 1
 		}
 	}
@@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) {
 	delete(vm.busy_map)
 	free(vm.threads)
 	free(vm.next_threads)
-}
+}

+ 3 - 2
tests/core/text/regex/test_core_text_regex.odin

@@ -1119,7 +1119,7 @@ iterator_vectors := []Iterator_Test{
 
 @test
 test_match_iterator :: proc(t: ^testing.T) {
-	for test in iterator_vectors {
+	vector: for test in iterator_vectors {
 		it, err := regex.create_iterator(test.haystack, test.pattern, test.flags)
 		defer regex.destroy(it)
 
@@ -1128,7 +1128,8 @@ test_match_iterator :: proc(t: ^testing.T) {
 
 		for capture, idx in regex.match(&it) {
 			if idx >= len(test.expected) {
-				break
+				log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
+				continue vector
 			}
 			check_capture(t, capture, test.expected[idx])
 		}