Bläddra i källkod

Do not use utf16Reader for ASCII strings when using Go regexp. See #201

Dmitry Panov 5 år sedan
förälder
incheckning
fa030ff283
5 ändrade filer med 60 tillägg och 12 borttagningar
  1. 18 2
      regexp.go
  2. 41 0
      regexp_test.go
  3. 0 1
      string.go
  4. 1 5
      string_ascii.go
  5. 0 4
      string_unicode.go

+ 18 - 2
regexp.go

@@ -138,7 +138,7 @@ func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int
 			return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky)
 		}
 		if limit == 1 {
-			result := p.regexpWrapper.findSubmatchIndex(s, p.unicode)
+			result := p.regexpWrapper.findSubmatchIndexUnicode(s.(unicodeString), p.unicode)
 			if result == nil {
 				return nil
 			}
@@ -445,7 +445,23 @@ func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (
 	return
 }
 
-func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) (result []int) {
+func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int { // findSubmatchIndex picks the matcher that fits the concrete type behind s and returns that matcher's result.
+	switch s := s.(type) { // s is rebound to the concrete string type inside each case
+	case asciiString: // ASCII input is handed over as a plain Go string; fullUnicode is not used on this path
+		return r.findSubmatchIndexASCII(string(s))
+	case unicodeString: // unicode input keeps going through the dedicated unicode matcher, which receives fullUnicode
+		return r.findSubmatchIndexUnicode(s, fullUnicode)
+	default: // any other valueString implementation is rejected with a panic rather than an error return
+		panic("Unsupported string type")
+	}
+}
+
+func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int { // findSubmatchIndexASCII matches s directly as a Go string, without building any rune reader.
+	wrapped := (*regexp.Regexp)(r) // reinterpret the wrapper pointer as the standard library *regexp.Regexp
+	return wrapped.FindStringSubmatchIndex(s) // index pairs of the leftmost match and its submatches, nil when there is no match; NOTE(review): offsets are byte offsets, presumably equal to character positions because callers pass ASCII only - confirm
+}
+
+func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
 	wrapped := (*regexp.Regexp)(r)
 	if fullUnicode {
 		posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)

+ 41 - 0
regexp_test.go

@@ -580,3 +580,44 @@ func BenchmarkRegexpMatchCache(b *testing.B) {
 		b.Fatal("not a function")
 	}
 }
+
+func BenchmarkRegexpSingleExec(b *testing.B) { // BenchmarkRegexpSingleExec times repeated RegExp exec calls for four pattern/input combinations, all sharing one runtime.
+	vm := New()
+	regexp := vm.Get("RegExp") // the RegExp value looked up on the runtime; this local shadows any package named regexp inside this function
+	f := func(reStr, str string, b *testing.B) { // shared body: build a RegExp from reStr, then call its exec on str b.N times
+		r, err := vm.New(regexp, vm.ToValue(reStr))
+		if err != nil {
+			b.Fatal(err)
+		}
+		exec, ok := AssertFunction(r.Get("exec"))
+		if !ok {
+			b.Fatal("RegExp.exec is not a function")
+		}
+		arg := vm.ToValue(str) // convert the input once, outside the timed loop
+		b.ResetTimer() // exclude the construction and lookup above from the measurement
+		b.ReportAllocs()
+		for i := 0; i < b.N; i++ {
+			_, err := exec(r, arg) // the match result is discarded; only an error aborts the run
+			if err != nil {
+				b.Fatal(err)
+			}
+		}
+	}
+
+	b.Run("Re-ASCII", func(b *testing.B) { // plain pattern, input without non-ASCII characters
+		f("test", "aaaaaaaaaaaaaaaaaaaaaaaaa testing", b)
+	})
+
+	b.Run("Re2-ASCII", func(b *testing.B) { // NOTE(review): the empty lookahead (?=) presumably forces the non-Go regexp engine, as the "Re2" name suggests - confirm
+		f("(?=)test", "aaaaaaaaaaaaaaaaaaaaaaaaa testing", b)
+	})
+
+	b.Run("Re-Unicode", func(b *testing.B) { // same plain pattern, input ends with an emoji
+		f("test", "aaaaaaaaaaaaaaaaaaaaaaaaa testing 😀", b)
+	})
+
+	b.Run("Re2-Unicode", func(b *testing.B) { // lookahead pattern combined with the emoji-terminated input
+		f("(?=)test", "aaaaaaaaaaaaaaaaaaaaaaaaa testing 😀", b)
+	})
+
+}

+ 0 - 1
string.go

@@ -57,7 +57,6 @@ type valueString interface {
 	compareTo(valueString) int
 	reader(start int) io.RuneReader
 	utf16Reader(start int) io.RuneReader
-	runes() []rune
 	utf16Runes() []rune
 	index(valueString, int) int
 	lastIndex(valueString, int) int

+ 1 - 5
string_ascii.go

@@ -40,7 +40,7 @@ func (s asciiString) utf16Reader(start int) io.RuneReader {
 	return s.reader(start)
 }
 
-func (s asciiString) runes() []rune {
+func (s asciiString) utf16Runes() []rune {
 	runes := make([]rune, len(s))
 	for i := 0; i < len(s); i++ {
 		runes[i] = rune(s[i])
@@ -48,10 +48,6 @@ func (s asciiString) runes() []rune {
 	return runes
 }
 
-func (s asciiString) utf16Runes() []rune {
-	return s.runes()
-}
-
 // ss must be trimmed
 func strToInt(ss string) (int64, error) {
 	if ss == "" {

+ 0 - 4
string_unicode.go

@@ -290,10 +290,6 @@ func (s unicodeString) utf16Reader(start int) io.RuneReader {
 	}
 }
 
-func (s unicodeString) runes() []rune {
-	return utf16.Decode(s[1:])
-}
-
 func (s unicodeString) utf16Runes() []rune {
 	runes := make([]rune, len(s)-1)
 	for i, ch := range s[1:] {