2
0
Эх сурвалжийг харах

Remove `Global` RegEx flag, default to unanchored patterns

Feoramund 3 сар өмнө
parent
commit
37d6491300

+ 0 - 3
core/text/regex/common/common.odin

@@ -15,8 +15,6 @@ MAX_PROGRAM_SIZE   :: int(max(i16))
 MAX_CLASSES        :: int(max(u8))
 
 Flag :: enum u8 {
-	// Global: try to match the pattern anywhere in the string.
-	Global,
 	// Multiline: treat `^` and `$` as if they also match newlines.
 	Multiline,
 	// Case Insensitive: treat `a-z` as if it was also `A-Z`.
@@ -36,7 +34,6 @@ Flags :: bit_set[Flag; u8]
 
 @(rodata)
 Flag_To_Letter := #sparse[Flag]u8 {
-	.Global            = 'g',
 	.Multiline         = 'm',
 	.Case_Insensitive  = 'i',
 	.Ignore_Whitespace = 'x',

+ 9 - 5
core/text/regex/compiler/compiler.odin

@@ -401,7 +401,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 
 	pc_open := 0
 
-	add_global: if .Global in flags {
+	optimize_opening: {
 		// Check if the opening to the pattern is predictable.
 		// If so, use one of the optimized Wait opcodes.
 		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
@@ -412,7 +412,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Rune:
 				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
@@ -420,24 +420,28 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
 				pc_open += size_of(Opcode)
 				inject_raw(&code, pc_open, operand)
 				pc_open += size_of(rune)
-				break add_global
+				break optimize_opening
 
 			case .Rune_Class:
 				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Rune_Class_Negated:
 				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
 				pc_open += size_of(Opcode)
 				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
 				pc_open += size_of(u8)
-				break add_global
+				break optimize_opening
 
 			case .Save:
 				continue
+
+			case .Assert_Start:
+				break optimize_opening
+
 			case:
 				break seek_loop
 			}

+ 0 - 4
core/text/regex/regex.odin

@@ -167,7 +167,6 @@ to escape the delimiter if found in the middle of the string.
 
 All runes after the closing delimiter will be parsed as flags:
 
-- 'g': Global
 - 'm': Multiline
 - 'i': Case_Insensitive
 - 'x': Ignore_Whitespace
@@ -244,7 +243,6 @@ create_by_user :: proc(
 	// to `end` here.
 	for r in pattern[start + end:] {
 		switch r {
-		case 'g': flags += { .Global }
 		case 'm': flags += { .Multiline }
 		case 'i': flags += { .Case_Insensitive }
 		case 'x': flags += { .Ignore_Whitespace }
@@ -283,8 +281,6 @@ create_iterator :: proc(
 	permanent_allocator := context.allocator,
 	temporary_allocator := context.temp_allocator,
 ) -> (result: Match_Iterator, err: Error) {
-	flags := flags
-	flags += {.Global} // We're iterating over a string, so the next match could start anywhere
 
 	if .Multiline in flags {
 		return {}, .Unsupported_Flag

+ 6 - 4
tests/benchmark/text/regex/benchmark_regex.odin

@@ -103,9 +103,11 @@ expensive_for_backtrackers :: proc(t: ^testing.T) {
 
 @test
 global_capture_end_word :: proc(t: ^testing.T) {
+	// NOTE: The previous behavior of `.Global`, which was to automatically
+	// insert `.*?` at the start of the pattern, is now default.
 	EXPR :: `Hellope World!`
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR, { /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -145,7 +147,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
 	EXPR :: `こにちは`
 	needle := string(EXPR)
 
-	rex, err := regex.create(EXPR, { .Global, .Unicode })
+	rex, err := regex.create(EXPR, { /*.Global,*/ .Unicode })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -185,7 +187,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) {
 alternations :: proc(t: ^testing.T) {
 	EXPR :: `a(?:bb|cc|dd|ee|ff)`
 
-	rex, err := regex.create(EXPR, { .No_Capture, .Global })
+	rex, err := regex.create(EXPR, { .No_Capture, /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -219,7 +221,7 @@ classes :: proc(t: ^testing.T) {
 	EXPR :: `[\w\d]+`
 	NEEDLE :: "0123456789abcdef"
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR, { /*.Global*/ })
 	if !testing.expect_value(t, err, nil) {
 		return
 	}

+ 12 - 12
tests/core/text/regex/test_core_text_regex.odin

@@ -51,13 +51,13 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex
 }
 
 check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags := regex.Flags{}, loc := #caller_location) {
-	check_expression_with_flags(t, pattern, { .Global } + extra_flags,
+	check_expression_with_flags(t, pattern, extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags,
+	check_expression_with_flags(t, pattern, { .No_Optimization } + extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags,
+	check_expression_with_flags(t, pattern, { .Unicode } + extra_flags,
 		haystack, ..needles, loc = loc)
-	check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags,
+	check_expression_with_flags(t, pattern, { .Unicode, .No_Optimization } + extra_flags,
 		haystack, ..needles, loc = loc)
 }
 
@@ -516,7 +516,7 @@ test_pos_index_explicitly :: proc(t: ^testing.T) {
 	STR :: "This is an island."
 	EXPR :: `\bis\b`
 
-	rex, err := regex.create(EXPR, { .Global })
+	rex, err := regex.create(EXPR)
 	if !testing.expect_value(t, err, nil) {
 		return
 	}
@@ -642,9 +642,9 @@ test_unicode_explicitly :: proc(t: ^testing.T) {
 	}
 	{
 		EXPR :: "こにちは!"
-		check_expression_with_flags(t, EXPR, { .Global, .Unicode },
+		check_expression_with_flags(t, EXPR, { .Unicode },
 			"Hello こにちは!", "こにちは!")
-		check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization },
+		check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization },
 			"Hello こにちは!", "こにちは!")
 	}
 }
@@ -901,12 +901,12 @@ test_everything_at_once :: proc(t: ^testing.T) {
 @test
 test_creation_from_user_string :: proc(t: ^testing.T) {
 	{
-		USER_EXPR :: `/^hellope$/gmixun-`
+		USER_EXPR :: `/^hellope$/mixun-`
 		STR :: "hellope"
 		rex, err := regex.create_by_user(USER_EXPR)
 		defer regex.destroy(rex)
 		testing.expect_value(t, err, nil)
-		testing.expect_value(t, rex.flags, regex.Flags{ .Global, .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
+		testing.expect_value(t, rex.flags, regex.Flags{ .Multiline, .Case_Insensitive, .Ignore_Whitespace, .Unicode, .No_Capture, .No_Optimization })
 
 		_, ok := regex.match(rex, STR)
 		testing.expectf(t, ok, "expected user-provided RegEx %v to match %q", rex, STR)
@@ -1102,14 +1102,14 @@ Iterator_Test :: struct {
 
 iterator_vectors := []Iterator_Test{
 	{
-		`xxab32ab52xx`, `(ab\d{1})`, {}, // {.Global} implicitly added by the iterator
+		`xxab32ab52xx`, `(ab\d{1})`, {},
 		{
 			{pos = {{2, 5}, {2, 5}}, groups = {"ab3", "ab3"}},
 			{pos = {{6, 9}, {6, 9}}, groups = {"ab5", "ab5"}},
 		},
 	},
 	{
-		`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {.Global},
+		`xxfoobarxfoobarxx`, `f(o)ob(ar)`, {},
 		{
 			{pos = {{2,  8},  {3,  4},  {6,  8}}, groups = {"foobar", "o", "ar"}},
 			{pos = {{9, 15}, {10, 11}, {13, 15}}, groups = {"foobar", "o", "ar"}},
@@ -1135,4 +1135,4 @@ test_match_iterator :: proc(t: ^testing.T) {
 		}
 		testing.expect_value(t, it.idx, len(test.expected))
 	}
-}
+}