Browse Source

Fix multiline RegEx iteration

In `.Multiline` mode:

- `^` is now defined to assert the start of the string or that a "\n" or
  "\r" rune was parsed on the last VM dispatch.

- `$` is now defined to consume a newline sequence of "\n", "\r", or
  "\r\n" or to assert the end of the string.
Feoramund 3 months ago
parent
commit
35b157ac83

+ 7 - 3
core/text/regex/compiler/compiler.odin

@@ -195,8 +195,12 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
 
 
 	case ^Node_Anchor:
 	case ^Node_Anchor:
 		if .Multiline in c.flags {
 		if .Multiline in c.flags {
-			append(&code, Opcode.Multiline_Open)
-			append(&code, Opcode.Multiline_Close)
+			if specific.start {
+				append(&code, Opcode.Assert_Start_Multiline)
+			} else {
+				append(&code, Opcode.Multiline_Open)
+				append(&code, Opcode.Multiline_Close)
+			}
 		} else {
 		} else {
 			if specific.start {
 			if specific.start {
 				append(&code, Opcode.Assert_Start)
 				append(&code, Opcode.Assert_Start)
@@ -439,7 +443,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 			case .Save:
 			case .Save:
 				continue
 				continue
 
 
-			case .Assert_Start:
+			case .Assert_Start, .Assert_Start_Multiline:
 				break optimize_opening
 				break optimize_opening
 
 
 			case:
 			case:

+ 1 - 4
core/text/regex/regex.odin

@@ -282,10 +282,6 @@ create_iterator :: proc(
 	temporary_allocator := context.temp_allocator,
 	temporary_allocator := context.temp_allocator,
 ) -> (result: Match_Iterator, err: Error) {
 ) -> (result: Match_Iterator, err: Error) {
 
 
-	if .Multiline in flags {
-		return {}, .Unsupported_Flag
-	}
-
 	result.regex         = create(pattern, flags, permanent_allocator, temporary_allocator) or_return
 	result.regex         = create(pattern, flags, permanent_allocator, temporary_allocator) or_return
 	result.capture       = preallocate_capture()
 	result.capture       = preallocate_capture()
 	result.temp          = temporary_allocator
 	result.temp          = temporary_allocator
@@ -555,6 +551,7 @@ reset :: proc(it: ^Match_Iterator) {
 	it.vm.top_thread        = 0
 	it.vm.top_thread        = 0
 	it.vm.current_rune      = rune(0)
 	it.vm.current_rune      = rune(0)
 	it.vm.current_rune_size = 0
 	it.vm.current_rune_size = 0
+	it.vm.last_rune         = rune(0)
 	for i in 0..<it.threads {
 	for i in 0..<it.threads {
 		it.vm.threads[i]      = {}
 		it.vm.threads[i]      = {}
 		it.vm.next_threads[i] = {}
 		it.vm.next_threads[i] = {}

+ 24 - 16
core/text/regex/virtual_machine/doc.odin

@@ -109,34 +109,42 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 
 
 	(0x0A) Assert_Start
 	(0x0A) Assert_Start
 
 
-	Asserts that the thread is at the beginning of a string.
+	Asserts that the thread is at the beginning of the string.
 
 
-	(0x0B) Assert_End
+	(0x0B) Assert_Start_Multiline
 
 
-	Asserts that the thread is at the end of a string.
+	This opcode is compiled in only when the `Multiline` flag is present as a
+	replacement for the `^` text anchor.
 
 
-	(0x0C) Assert_Word_Boundary
+	Asserts that the thread is at the beginning of the string or that the
+	previously parsed rune was either a "\n" or a "\r".
+
+	(0x0C) Assert_End
+
+	Asserts that the thread is at the end of the string.
+
+	(0x0D) Assert_Word_Boundary
 
 
 	Asserts that the thread is on a word boundary, which can be the start or
 	Asserts that the thread is on a word boundary, which can be the start or
 	end of the text. This examines both the current rune and the next rune.
 	end of the text. This examines both the current rune and the next rune.
 
 
-	(0x0D) Assert_Non_Word_Boundary
+	(0x0E) Assert_Non_Word_Boundary
 
 
 	A modified version of Assert_Word_Boundary that returns the opposite value.
 	A modified version of Assert_Word_Boundary that returns the opposite value.
 
 
-	(0x0E) Multiline_Open
+	(0x0F) Multiline_Open
 
 
-	This opcode is compiled in only when the `Multiline` flag is present, and
-	it replaces both `^` and `$` text anchors.
+	This opcode is compiled in only when the `Multiline` flag is present as a
+	replacement for the `$` text anchor.
 
 
-	It asserts that either the current thread is on one of the string
-	boundaries, or it consumes a `\n` or `\r` character.
+	It asserts that either the current thread is at the end of the string,
+	or it consumes a `\n` or `\r` character.
 
 
 	If a `\r` character is consumed, the PC will be advanced to the sibling
 	If a `\r` character is consumed, the PC will be advanced to the sibling
 	`Multiline_Close` opcode to optionally consume a `\n` character on the next
 	`Multiline_Close` opcode to optionally consume a `\n` character on the next
 	frame.
 	frame.
 
 
-	(0x0F) Multiline_Close
+	(0x10) Multiline_Close
 
 
 	This opcode is always present after `Multiline_Open`.
 	This opcode is always present after `Multiline_Open`.
 
 
@@ -144,10 +152,10 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 	For example, Windows newlines are represented by the characters `\r\n`,
 	For example, Windows newlines are represented by the characters `\r\n`,
 	whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
 	whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
 
 
-	(0x10) Wait_For_Byte
-	(0x11) Wait_For_Rune
-	(0x12) Wait_For_Rune_Class
-	(0x13) Wait_For_Rune_Class_Negated
+	(0x11) Wait_For_Byte
+	(0x12) Wait_For_Rune
+	(0x13) Wait_For_Rune_Class
+	(0x14) Wait_For_Rune_Class_Negated
 
 
 	These opcodes are an optimization around restarting threads on failed
 	These opcodes are an optimization around restarting threads on failed
 	matches when the beginning to a pattern is predictable and the Global flag
 	matches when the beginning to a pattern is predictable and the Global flag
@@ -156,7 +164,7 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 	They will cause the VM to wait for the next rune to match before splitting,
 	They will cause the VM to wait for the next rune to match before splitting,
 	as would happen in the un-optimized version.
 	as would happen in the un-optimized version.
 
 
-	(0x14) Match_All_And_Escape
+	(0x15) Match_All_And_Escape
 
 
 	This opcode is an optimized version of `.*$` or `.+$` that causes the
 	This opcode is an optimized version of `.*$` or `.+$` that causes the
 	active thread to immediately work on escaping the program by following all
 	active thread to immediately work on escaping the program by following all

+ 2 - 0
core/text/regex/virtual_machine/util.odin

@@ -34,6 +34,7 @@ iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok:
 	case .Split:                       iter.pc += size_of(Opcode) + 2 * size_of(u16)
 	case .Split:                       iter.pc += size_of(Opcode) + 2 * size_of(u16)
 	case .Save:                        iter.pc += size_of(Opcode) + size_of(u8)
 	case .Save:                        iter.pc += size_of(Opcode) + size_of(u8)
 	case .Assert_Start:                iter.pc += size_of(Opcode)
 	case .Assert_Start:                iter.pc += size_of(Opcode)
+	case .Assert_Start_Multiline:      iter.pc += size_of(Opcode)
 	case .Assert_End:                  iter.pc += size_of(Opcode)
 	case .Assert_End:                  iter.pc += size_of(Opcode)
 	case .Assert_Word_Boundary:        iter.pc += size_of(Opcode)
 	case .Assert_Word_Boundary:        iter.pc += size_of(Opcode)
 	case .Assert_Non_Word_Boundary:    iter.pc += size_of(Opcode)
 	case .Assert_Non_Word_Boundary:    iter.pc += size_of(Opcode)
@@ -64,6 +65,7 @@ opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
 	case .Split:                       str = "Split"
 	case .Split:                       str = "Split"
 	case .Save:                        str = "Save"
 	case .Save:                        str = "Save"
 	case .Assert_Start:                str = "Assert_Start"
 	case .Assert_Start:                str = "Assert_Start"
+	case .Assert_Start_Multiline:      str = "Assert_Start_Multiline"
 	case .Assert_End:                  str = "Assert_End"
 	case .Assert_End:                  str = "Assert_End"
 	case .Assert_Word_Boundary:        str = "Assert_Word_Boundary"
 	case .Assert_Word_Boundary:        str = "Assert_Word_Boundary"
 	case .Assert_Non_Word_Boundary:    str = "Assert_Non_Word_Boundary"
 	case .Assert_Non_Word_Boundary:    str = "Assert_Non_Word_Boundary"

+ 25 - 27
core/text/regex/virtual_machine/virtual_machine.odin

@@ -37,16 +37,17 @@ Opcode :: enum u8 {
 	Split                       = 0x08, // | u16, u16
 	Split                       = 0x08, // | u16, u16
 	Save                        = 0x09, // | u8
 	Save                        = 0x09, // | u8
 	Assert_Start                = 0x0A, // |
 	Assert_Start                = 0x0A, // |
-	Assert_End                  = 0x0B, // |
-	Assert_Word_Boundary        = 0x0C, // |
-	Assert_Non_Word_Boundary    = 0x0D, // |
-	Multiline_Open              = 0x0E, // |
-	Multiline_Close             = 0x0F, // |
-	Wait_For_Byte               = 0x10, // | u8
-	Wait_For_Rune               = 0x11, // | i32
-	Wait_For_Rune_Class         = 0x12, // | u8
-	Wait_For_Rune_Class_Negated = 0x13, // | u8
-	Match_All_And_Escape        = 0x14, // |
+	Assert_Start_Multiline      = 0x0B, // |
+	Assert_End                  = 0x0C, // |
+	Assert_Word_Boundary        = 0x0D, // |
+	Assert_Non_Word_Boundary    = 0x0E, // |
+	Multiline_Open              = 0x0F, // |
+	Multiline_Close             = 0x10, // |
+	Wait_For_Byte               = 0x11, // | u8
+	Wait_For_Rune               = 0x12, // | i32
+	Wait_For_Rune_Class         = 0x13, // | u8
+	Wait_For_Rune_Class_Negated = 0x14, // | u8
+	Match_All_And_Escape        = 0x15, // |
 }
 }
 
 
 Thread :: struct {
 Thread :: struct {
@@ -77,6 +78,8 @@ Machine :: struct {
 	current_rune_size: int,
 	current_rune_size: int,
 	next_rune: rune,
 	next_rune: rune,
 	next_rune_size: int,
 	next_rune_size: int,
+
+	last_rune: rune,
 }
 }
 
 
 
 
@@ -169,6 +172,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 				pc += size_of(Opcode)
 				pc += size_of(Opcode)
 				continue
 				continue
 			}
 			}
+		case .Assert_Start_Multiline:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp == 0 || vm.last_rune == '\n' || vm.last_rune == '\r' {
+				pc += size_of(Opcode)
+				continue
+			}
 		case .Assert_End:
 		case .Assert_End:
 			sp := vm.string_pointer+vm.current_rune_size
 			sp := vm.string_pointer+vm.current_rune_size
 			if sp == len(vm.memory) {
 			if sp == len(vm.memory) {
@@ -177,24 +186,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 			}
 			}
 		case .Multiline_Open:
 		case .Multiline_Open:
 			sp := vm.string_pointer+vm.current_rune_size
 			sp := vm.string_pointer+vm.current_rune_size
-			if sp == 0 || sp == len(vm.memory) {
-				if vm.next_rune == '\r' || vm.next_rune == '\n' {
-					// The VM is currently on a newline at the string boundary,
-					// so consume the newline next frame.
-					when common.ODIN_DEBUG_REGEX {
-						io.write_string(common.debug_stream, "*** New thread added [PC:")
-						common.write_padded_hex(common.debug_stream, pc, 4)
-						io.write_string(common.debug_stream, "]\n")
-					}
-					vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
-					vm.top_thread += 1
-				} else {
-					// Skip the `Multiline_Close` opcode.
-					pc += 2 * size_of(Opcode)
-					continue
-				}
+			if sp == len(vm.memory) {
+				// Skip the `Multiline_Close` opcode.
+				pc += 2 * size_of(Opcode)
+				continue
 			} else {
 			} else {
-				// Not on a string boundary.
+				// Not at the end of the string.
 				// Try to consume a newline next frame in the other opcode loop.
 				// Try to consume a newline next frame in the other opcode loop.
 				when common.ODIN_DEBUG_REGEX {
 				when common.ODIN_DEBUG_REGEX {
 					io.write_string(common.debug_stream, "*** New thread added [PC:")
 					io.write_string(common.debug_stream, "*** New thread added [PC:")
@@ -613,6 +610,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU
 			break
 			break
 		}
 		}
 
 
+		vm.last_rune = vm.current_rune
 		vm.string_pointer += vm.current_rune_size
 		vm.string_pointer += vm.current_rune_size
 	}
 	}
 
 

+ 54 - 3
tests/core/text/regex/test_core_text_regex.odin

@@ -699,15 +699,15 @@ test_case_insensitive :: proc(t: ^testing.T) {
 test_multiline :: proc(t: ^testing.T) {
 test_multiline :: proc(t: ^testing.T) {
 	{
 	{
 		EXPR :: `^hellope$world$`
 		EXPR :: `^hellope$world$`
-		check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline })
+		check_expression(t, EXPR, "hellope\nworld\n", "hellope\nworld\n", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline })
 	}
 	}
 	{
 	{
-		EXPR :: `^?.$`
-		check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline })
+		EXPR :: `^.$`
 		check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline })
 		check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline })
+		check_expression(t, EXPR, "h\n", "h\n", extra_flags = { .Multiline })
 	}
 	}
 	{
 	{
 		EXPR :: `^$`
 		EXPR :: `^$`
@@ -1219,6 +1219,57 @@ iterator_vectors := []Iterator_Test{
 			{pos = {{3,  3}}, groups = {""}},
 			{pos = {{3,  3}}, groups = {""}},
 		},
 		},
 	},
 	},
+	// Multiline iteration is supported, but it must follow the `^...$` scheme.
+	//
+	// Any usage outside of this strict syntax will produce predictable but
+	// unusual outputs, as `^` is defined to assert the start of a string or
+	// that a newline sequence was previously consumed, and `$` consumes a
+	// newline sequence or asserts the end of the string.
+	{
+		"foo1\nfoo2\r\nfoo3\rfoo4", `^foo.$`, {.Multiline},
+		{
+			{pos = {{0,  5}}, groups = {"foo1\n"}},
+			{pos = {{5,  11}}, groups = {"foo2\r\n"}},
+			{pos = {{11, 16}}, groups = {"foo3\r"}},
+			{pos = {{16, 20}}, groups = {"foo4"}},
+		},
+	},
+	{
+		"a\nb\n\r", `^$`, {.Multiline},
+		{},
+	},
+	{
+		"a\nb\n", `^$`, {.Multiline},
+		{},
+	},
+	{
+		"a\nb", `^$`, {.Multiline},
+		{},
+	},
+	// Multiline anchors must work within groups, as people are going to end up
+	// using them in there and we do not forbid it.
+	{
+		"a\nb\na\nb", `(?:^a$|^b$)`, {.Multiline},
+		{
+			{pos = {{0, 2}}, groups = {"a\n"}},
+			{pos = {{2, 4}}, groups = {"b\n"}},
+			{pos = {{4, 6}}, groups = {"a\n"}},
+			{pos = {{6, 7}}, groups = {"b"}},
+		},
+	},
+	// The following patterns are valid uses of optional anchors and must match.
+	{
+		"a\nb\na\nb", `^a(?:b|$)`, {.Multiline},
+		{
+			{pos = {{0, 2}}, groups = {"a\n"}},
+		},
+	},
+	{
+		"a\nb\na\nb", `^ab?$?`, {.Multiline},
+		{
+			{pos = {{0, 2}}, groups = {"a\n"}},
+		},
+	},
 }
 }
 
 
 @test
 @test