3 months ago · 35b157ac83
--- a/core/text/regex/compiler/compiler.odin
+++ b/core/text/regex/compiler/compiler.odin
@@ -195,8 +195,12 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
 
															 	case ^Node_Anchor:
														
 
															 		if .Multiline in c.flags {
														
 
															-			append(&code, Opcode.Multiline_Open)
														
 
															-			append(&code, Opcode.Multiline_Close)
														
 
															+			if specific.start {
														
 
															+				append(&code, Opcode.Assert_Start_Multiline)
														
 
															+			} else {
														
 
															+				append(&code, Opcode.Multiline_Open)
														
 
															+				append(&code, Opcode.Multiline_Close)
														
 
															+			}
														
 
															 		} else {
														
 
															 			if specific.start {
														
 
															 				append(&code, Opcode.Assert_Start)
														
@@ -439,7 +443,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 
															 			case .Save:
														
 
															 				continue
														
 
															-			case .Assert_Start:
														
 
															+			case .Assert_Start, .Assert_Start_Multiline:
														
 
															 				break optimize_opening
														
 
															 			case:
														
--- a/core/text/regex/regex.odin
+++ b/core/text/regex/regex.odin
@@ -282,10 +282,6 @@ create_iterator :: proc(
 
															 	temporary_allocator := context.temp_allocator,
														
 
															 ) -> (result: Match_Iterator, err: Error) {
														
 
															-	if .Multiline in flags {
														
 
															-		return {}, .Unsupported_Flag
														
 
															-	}
														
 
															-
														
 
															 	result.regex         = create(pattern, flags, permanent_allocator, temporary_allocator) or_return
														
 
															 	result.capture       = preallocate_capture()
														
 
															 	result.temp          = temporary_allocator
														
@@ -555,6 +551,7 @@ reset :: proc(it: ^Match_Iterator) {
 
															 	it.vm.top_thread        = 0
														
 
															 	it.vm.current_rune      = rune(0)
														
 
															 	it.vm.current_rune_size = 0
														
 
															+	it.vm.last_rune         = rune(0)
														
 
															 	for i in 0..<it.threads {
														
 
															 		it.vm.threads[i]      = {}
														
 
															 		it.vm.next_threads[i] = {}
														
--- a/core/text/regex/virtual_machine/doc.odin
+++ b/core/text/regex/virtual_machine/doc.odin
@@ -109,34 +109,42 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 
															 	(0x0A) Assert_Start
														
 
															-	Asserts that the thread is at the beginning of a string.
														
 
															+	Asserts that the thread is at the beginning of the string.
														
 
															-	(0x0B) Assert_End
														
 
															+	(0x0B) Assert_Start_Multiline
														
 
															-	Asserts that the thread is at the end of a string.
														
 
															+	This opcode is compiled in only when the `Multiline` flag is present as a
														
 
															+	replacement for the `^` text anchor.
														
 
															-	(0x0C) Assert_Word_Boundary
														
 
															+	Asserts that the thread is at the beginning of the string, or that the
														
 
															+	previous rune was either a `\n` or `\r`.
														
 
															+
														
 
															+	(0x0C) Assert_End
														
 
															+
														
 
															+	Asserts that the thread is at the end of the string.
														
 
															+
														
 
															+	(0x0D) Assert_Word_Boundary
														
 
															 	Asserts that the thread is on a word boundary, which can be the start or
														
 
															 	end of the text. This examines both the current rune and the next rune.
														
 
															-	(0x0D) Assert_Non_Word_Boundary
														
 
															+	(0x0E) Assert_Non_Word_Boundary
														
 
															 	A modified version of Assert_Word_Boundary that returns the opposite value.
														
 
															-	(0x0E) Multiline_Open
														
 
															+	(0x0F) Multiline_Open
														
 
															-	This opcode is compiled in only when the `Multiline` flag is present, and
														
 
															-	it replaces both `^` and `$` text anchors.
														
 
															+	This opcode is compiled in only when the `Multiline` flag is present as a
														
 
															+	replacement for the `$` text anchor.
														
 
															-	It asserts that either the current thread is on one of the string
														
 
															-	boundaries, or it consumes a `\n` or `\r` character.
														
 
															+	It asserts that either the current thread is at the end of the string,
														
 
															+	or it consumes a `\n` or `\r` character.
														
 
															 	If a `\r` character is consumed, the PC will be advanced to the sibling
														
 
															 	`Multiline_Close` opcode to optionally consume a `\n` character on the next
														
 
															 	frame.
														
 
															-	(0x0F) Multiline_Close
														
 
															+	(0x10) Multiline_Close
														
 
															 	This opcode is always present after `Multiline_Open`.
														
@@ -144,10 +152,10 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 
															 	For example, Windows newlines are represented by the characters `\r\n`,
														
 
															 	whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
														
 
															-	(0x10) Wait_For_Byte
														
 
															-	(0x11) Wait_For_Rune
														
 
															-	(0x12) Wait_For_Rune_Class
														
 
															-	(0x13) Wait_For_Rune_Class_Negated
														
 
															+	(0x11) Wait_For_Byte
														
 
															+	(0x12) Wait_For_Rune
														
 
															+	(0x13) Wait_For_Rune_Class
														
 
															+	(0x14) Wait_For_Rune_Class_Negated
														
 
															 	These opcodes are an optimization around restarting threads on failed
														
 
															 	matches when the beginning to a pattern is predictable and the Global flag
														
@@ -156,7 +164,7 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
 
															 	They will cause the VM to wait for the next rune to match before splitting,
														
 
															 	as would happen in the un-optimized version.
														
 
															-	(0x14) Match_All_And_Escape
														
 
															+	(0x15) Match_All_And_Escape
														
 
															 	This opcode is an optimized version of `.*$` or `.+$` that causes the
														
 
															 	active thread to immediately work on escaping the program by following all
														
--- a/core/text/regex/virtual_machine/util.odin
+++ b/core/text/regex/virtual_machine/util.odin
@@ -34,6 +34,7 @@ iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok:
 
															 	case .Split:                       iter.pc += size_of(Opcode) + 2 * size_of(u16)
														
 
															 	case .Save:                        iter.pc += size_of(Opcode) + size_of(u8)
														
 
															 	case .Assert_Start:                iter.pc += size_of(Opcode)
														
 
															+	case .Assert_Start_Multiline:      iter.pc += size_of(Opcode)
														
 
															 	case .Assert_End:                  iter.pc += size_of(Opcode)
														
 
															 	case .Assert_Word_Boundary:        iter.pc += size_of(Opcode)
														
 
															 	case .Assert_Non_Word_Boundary:    iter.pc += size_of(Opcode)
														
@@ -64,6 +65,7 @@ opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
 
															 	case .Split:                       str = "Split"
														
 
															 	case .Save:                        str = "Save"
														
 
															 	case .Assert_Start:                str = "Assert_Start"
														
 
															+	case .Assert_Start_Multiline:      str = "Assert_Start_Multiline"
														
 
															 	case .Assert_End:                  str = "Assert_End"
														
 
															 	case .Assert_Word_Boundary:        str = "Assert_Word_Boundary"
														
 
															 	case .Assert_Non_Word_Boundary:    str = "Assert_Non_Word_Boundary"
														
--- a/core/text/regex/virtual_machine/virtual_machine.odin
+++ b/core/text/regex/virtual_machine/virtual_machine.odin
@@ -37,16 +37,17 @@ Opcode :: enum u8 {
 
															 	Split                       = 0x08, // | u16, u16
														
 
															 	Save                        = 0x09, // | u8
														
 
															 	Assert_Start                = 0x0A, // |
														
 
															-	Assert_End                  = 0x0B, // |
														
 
															-	Assert_Word_Boundary        = 0x0C, // |
														
 
															-	Assert_Non_Word_Boundary    = 0x0D, // |
														
 
															-	Multiline_Open              = 0x0E, // |
														
 
															-	Multiline_Close             = 0x0F, // |
														
 
															-	Wait_For_Byte               = 0x10, // | u8
														
 
															-	Wait_For_Rune               = 0x11, // | i32
														
 
															-	Wait_For_Rune_Class         = 0x12, // | u8
														
 
															-	Wait_For_Rune_Class_Negated = 0x13, // | u8
														
 
															-	Match_All_And_Escape        = 0x14, // |
														
 
															+	Assert_Start_Multiline      = 0x0B, // |
														
 
															+	Assert_End                  = 0x0C, // |
														
 
															+	Assert_Word_Boundary        = 0x0D, // |
														
 
															+	Assert_Non_Word_Boundary    = 0x0E, // |
														
 
															+	Multiline_Open              = 0x0F, // |
														
 
															+	Multiline_Close             = 0x10, // |
														
 
															+	Wait_For_Byte               = 0x11, // | u8
														
 
															+	Wait_For_Rune               = 0x12, // | i32
														
 
															+	Wait_For_Rune_Class         = 0x13, // | u8
														
 
															+	Wait_For_Rune_Class_Negated = 0x14, // | u8
														
 
															+	Match_All_And_Escape        = 0x15, // |
														
 
															 }
														
 
															 Thread :: struct {
														
@@ -77,6 +78,8 @@ Machine :: struct {
 
															 	current_rune_size: int,
														
 
															 	next_rune: rune,
														
 
															 	next_rune_size: int,
														
 
															+
														
 
															+	last_rune: rune,
														
 
															 }
														
@@ -169,6 +172,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 
															 				pc += size_of(Opcode)
														
 
															 				continue
														
 
															 			}
														
 
															+		case .Assert_Start_Multiline:
														
 
															+			sp := vm.string_pointer+vm.current_rune_size
														
 
															+			if sp == 0 || vm.last_rune == '\n' || vm.last_rune == '\r' {
														
 
															+				pc += size_of(Opcode)
														
 
															+				continue
														
 
															+			}
														
 
															 		case .Assert_End:
														
 
															 			sp := vm.string_pointer+vm.current_rune_size
														
 
															 			if sp == len(vm.memory) {
														
@@ -177,24 +186,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
 
															 			}
														
 
															 		case .Multiline_Open:
														
 
															 			sp := vm.string_pointer+vm.current_rune_size
														
 
															-			if sp == 0 || sp == len(vm.memory) {
														
 
															-				if vm.next_rune == '\r' || vm.next_rune == '\n' {
														
 
															-					// The VM is currently on a newline at the string boundary,
														
 
															-					// so consume the newline next frame.
														
 
															-					when common.ODIN_DEBUG_REGEX {
														
 
															-						io.write_string(common.debug_stream, "*** New thread added [PC:")
														
 
															-						common.write_padded_hex(common.debug_stream, pc, 4)
														
 
															-						io.write_string(common.debug_stream, "]\n")
														
 
															-					}
														
 
															-					vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
														
 
															-					vm.top_thread += 1
														
 
															-				} else {
														
 
															-					// Skip the `Multiline_Close` opcode.
														
 
															-					pc += 2 * size_of(Opcode)
														
 
															-					continue
														
 
															-				}
														
 
															+			if sp == len(vm.memory) {
														
 
															+				// Skip the `Multiline_Close` opcode.
														
 
															+				pc += 2 * size_of(Opcode)
														
 
															+				continue
														
 
															 			} else {
														
 
															-				// Not on a string boundary.
														
 
															+				// Not at the end of the string.
														
 
															 				// Try to consume a newline next frame in the other opcode loop.
														
 
															 				when common.ODIN_DEBUG_REGEX {
														
 
															 					io.write_string(common.debug_stream, "*** New thread added [PC:")
														
@@ -613,6 +610,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU
 
															 			break
														
 
															 		}
														
 
															+		vm.last_rune = vm.current_rune
														
 
															 		vm.string_pointer += vm.current_rune_size
														
 
															 	}
														
--- a/tests/core/text/regex/test_core_text_regex.odin
+++ b/tests/core/text/regex/test_core_text_regex.odin
@@ -699,15 +699,15 @@ test_case_insensitive :: proc(t: ^testing.T) {
 
															 test_multiline :: proc(t: ^testing.T) {
														
 
															 	{
														
 
															 		EXPR :: `^hellope$world$`
														
 
															-		check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline })
														
 
															+		check_expression(t, EXPR, "hellope\nworld\n", "hellope\nworld\n", extra_flags = { .Multiline })
														
 
															 		check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline })
														
 
															 		check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline })
														
 
															 		check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline })
														
 
															 	}
														
 
															 	{
														
 
															-		EXPR :: `^?.$`
														
 
															-		check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline })
														
 
															+		EXPR :: `^.$`
														
 
															 		check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline })
														
 
															+		check_expression(t, EXPR, "h\n", "h\n", extra_flags = { .Multiline })
														
 
															 	}
														
 
															 	{
														
 
															 		EXPR :: `^$`
														
@@ -1219,6 +1219,57 @@ iterator_vectors := []Iterator_Test{
 
															 			{pos = {{3,  3}}, groups = {""}},
														
 
															 		},
														
 
															 	},
														
 
															+	// Multiline iteration is supported, but it must follow the `^...$` scheme.
														
 
															+	//
														
 
															+	// Any usage outside of this strict syntax will produce predictable but
														
 
															+	// unusual outputs, as `^` is defined to assert the start of a string or
														
 
															+	// that a newline sequence was previously consumed, and `$` consumes a
														
 
															+	// newline sequence or asserts the end of the string.
														
 
															+	{
														
 
															+		"foo1\nfoo2\r\nfoo3\rfoo4", `^foo.$`, {.Multiline},
														
 
															+		{
														
 
															+			{pos = {{0,  5}}, groups = {"foo1\n"}},
														
 
															+			{pos = {{5,  11}}, groups = {"foo2\r\n"}},
														
 
															+			{pos = {{11, 16}}, groups = {"foo3\r"}},
														
 
															+			{pos = {{16, 20}}, groups = {"foo4"}},
														
 
															+		},
														
 
															+	},
														
 
															+	{
														
 
															+		"a\nb\n\r", `^$`, {.Multiline},
														
 
															+		{},
														
 
															+	},
														
 
															+	{
														
 
															+		"a\nb\n", `^$`, {.Multiline},
														
 
															+		{},
														
 
															+	},
														
 
															+	{
														
 
															+		"a\nb", `^$`, {.Multiline},
														
 
															+		{},
														
 
															+	},
														
 
															+	// Multiline anchors must work within groups, as people are going to end up
														
 
															+	// using them in there and we do not forbid it.
														
 
															+	{
														
 
															+		"a\nb\na\nb", `(?:^a$|^b$)`, {.Multiline},
														
 
															+		{
														
 
															+			{pos = {{0, 2}}, groups = {"a\n"}},
														
 
															+			{pos = {{2, 4}}, groups = {"b\n"}},
														
 
															+			{pos = {{4, 6}}, groups = {"a\n"}},
														
 
															+			{pos = {{6, 7}}, groups = {"b"}},
														
 
															+		},
														
 
															+	},
														
 
															+	// The following patterns are valid uses of optional anchors and must match.
														
 
															+	{
														
 
															+		"a\nb\na\nb", `^a(?:b|$)`, {.Multiline},
														
 
															+		{
														
 
															+			{pos = {{0, 2}}, groups = {"a\n"}},
														
 
															+		},
														
 
															+	},
														
 
															+	{
														
 
															+		"a\nb\na\nb", `^ab?$?`, {.Multiline},
														
 
															+		{
														
 
															+			{pos = {{0, 2}}, groups = {"a\n"}},
														
 
															+		},
														
 
															+	},
														
 
															 }
														
 
															 @test