2
0
Эх сурвалжийг харах

Cache runes and posMap to optimise consecutive regexp.exec() calls on the same string. See #201

Dmitry Panov 5 жил өмнө
parent
commit
b2a8925997
2 өөрчлөгдсөн 259 нэмэгдсэн, 29 устгасан
  1. 106 29
      regexp.go
  2. 153 0
      regexp_test.go

+ 106 - 29
regexp.go

@@ -11,7 +11,17 @@ import (
 	"unicode/utf16"
 )
 
-type regexp2Wrapper regexp2.Regexp
+type regexp2MatchCache struct {
+	target valueString
+	runes  []rune
+	posMap []int
+}
+
+type regexp2Wrapper struct {
+	rx    *regexp2.Regexp
+	cache *regexp2MatchCache
+}
+
 type regexpWrapper regexp.Regexp
 
 type positionMapItem struct {
@@ -68,7 +78,7 @@ func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, er
 		return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
 	}
 
-	return (*regexp2Wrapper)(regexp2Pattern), nil
+	return &regexp2Wrapper{rx: regexp2Pattern}, nil
 }
 
 func (p *regexpPattern) createRegexp2() {
@@ -107,14 +117,14 @@ func buildUTF8PosMap(s valueString) (positionMap, string) {
 
 func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
 	if p.regexpWrapper == nil {
-		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode)
+		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
 	}
 	if start != 0 {
 		// Unfortunately Go's regexp library does not allow starting from an arbitrary position.
 		// If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
 		// work correctly.
 		p.createRegexp2()
-		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode)
+		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
 	}
 	return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
 }
@@ -163,16 +173,41 @@ type regexpObject struct {
 	standard bool
 }
 
-func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode bool) (result []int) {
+func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) {
 	if fullUnicode {
-		return r.findSubmatchIndexUnicode(s, start)
+		return r.findSubmatchIndexUnicode(s, start, doCache)
+	}
+	return r.findSubmatchIndexUTF16(s, start, doCache)
+}
+
+func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
+	wrapped := r.rx
+	cache := r.cache
+	if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
+		runes = cache.runes
+	} else {
+		runes = s.utf16Runes()
+		cache = nil
+	}
+	match, err = wrapped.FindRunesMatchStartingAt(runes, start)
+	if doCache && match != nil && err == nil {
+		if cache == nil {
+			if r.cache == nil {
+				r.cache = new(regexp2MatchCache)
+			}
+			*r.cache = regexp2MatchCache{
+				target: s,
+				runes:  runes,
+			}
+		}
+	} else {
+		r.cache = nil
 	}
-	return r.findSubmatchIndexUTF16(s, start)
+	return
 }
 
-func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int) (result []int) {
-	wrapped := (*regexp2.Regexp)(r)
-	match, err := wrapped.FindRunesMatchStartingAt(s.utf16Runes(), start)
+func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) {
+	match, _, err := r.findUTF16Cached(s, start, doCache)
 	if err != nil {
 		return
 	}
@@ -193,17 +228,55 @@ func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int) (resul
 	return
 }
 
-func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int) (result []int) {
-	wrapped := (*regexp2.Regexp)(r)
-	posMap, runes, mappedStart := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
-	match, err := wrapped.FindRunesMatchStartingAt(runes, mappedStart)
-	if err != nil {
-		return
+func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
+	var (
+		runes       []rune
+		mappedStart int
+		splitPair   bool
+		savedRune   rune
+	)
+	wrapped := r.rx
+	cache := r.cache
+	if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
+		runes, posMap = cache.runes, cache.posMap
+		mappedStart, splitPair = posMapReverseLookup(posMap, start)
+	} else {
+		posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
+		cache = nil
+	}
+	if splitPair {
+		// temporarily set the rune at mappedStart to the second code point of the pair
+		_, second := utf16.EncodeRune(runes[mappedStart])
+		savedRune, runes[mappedStart] = runes[mappedStart], second
+	}
+	match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
+	if doCache && match != nil && err == nil {
+		if splitPair {
+			runes[mappedStart] = savedRune
+		}
+		if cache == nil {
+			if r.cache == nil {
+				r.cache = new(regexp2MatchCache)
+			}
+			*r.cache = regexp2MatchCache{
+				target: s,
+				runes:  runes,
+				posMap: posMap,
+			}
+		}
+	} else {
+		r.cache = nil
 	}
 
-	if match == nil {
+	return
+}
+
+func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) {
+	match, posMap, err := r.findUnicodeCached(s, start, doCache)
+	if match == nil || err != nil {
 		return
 	}
+
 	groups := match.Groups()
 
 	result = make([]int, 0, len(groups)<<1)
@@ -218,10 +291,9 @@ func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int) (res
 }
 
 func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
-	wrapped := (*regexp2.Regexp)(r)
-	runes := s.utf16Runes()
-	match, err := wrapped.FindRunesMatchStartingAt(runes, start)
-	if err != nil {
+	wrapped := r.rx
+	match, runes, err := r.findUTF16Cached(s, start, false)
+	if match == nil || err != nil {
 		return nil
 	}
 	if limit < 0 {
@@ -263,7 +335,7 @@ func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit i
 	return results
 }
 
-func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int) {
+func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
 	posMap = make([]int, 0, l+1)
 	curPos := 0
 	runes = make([]rune, 0, l)
@@ -277,8 +349,7 @@ func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, ma
 			if curPos > start {
 				// start position splits a surrogate pair
 				mappedStart = len(runes) - 1
-				_, second := utf16.EncodeRune(runes[mappedStart])
-				runes[mappedStart] = second
+				splitPair = true
 				startFound = true
 			}
 		}
@@ -294,15 +365,21 @@ func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, ma
 	return
 }
 
+func posMapReverseLookup(posMap []int, pos int) (int, bool) {
+	mapped := sort.SearchInts(posMap, pos)
+	if mapped < len(posMap) && posMap[mapped] != pos {
+		return mapped - 1, true
+	}
+	return mapped, false
+}
+
 func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
-	wrapped := (*regexp2.Regexp)(r)
+	wrapped := r.rx
 	if limit < 0 {
 		limit = len(s) + 1
 	}
 	results := make([][]int, 0, limit)
-	posMap, runes, mappedStart := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
-
-	match, err := wrapped.FindRunesMatchStartingAt(runes, mappedStart)
+	match, posMap, err := r.findUnicodeCached(s, start, false)
 	if err != nil {
 		return nil
 	}
@@ -371,7 +448,7 @@ func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (
 func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) (result []int) {
 	wrapped := (*regexp.Regexp)(r)
 	if fullUnicode {
-		posMap, runes, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
+		posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
 		res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
 		for i, item := range res {
 			res[i] = posMap[item]

+ 153 - 0
regexp_test.go

@@ -366,6 +366,98 @@ func TestRegexpEscapeSource(t *testing.T) {
 	testScript1(SCRIPT, asciiString(`href="(.+?)(\/.*\/\S+?)\/"`), t)
 }
 
+func TestRegexpConsecutiveMatchCache(t *testing.T) {
+	const SCRIPT = `
+	(function test(unicode) {
+		var regex = new RegExp('t(e)(st(\\d?))', unicode?'gu':'g');
+		var string = 'test1test2';
+		var match;
+		var matches = [];
+		while (match = regex.exec(string)) {
+			matches.push(match);
+		}
+		var expectedMatches = [
+		  [
+			'test1',
+			'e',
+			'st1',
+			'1'
+		  ],
+		  [
+			'test2',
+			'e',
+			'st2',
+			'2'
+		  ]
+		];
+		expectedMatches[0].index = 0;
+		expectedMatches[0].input = 'test1test2';
+		expectedMatches[1].index = 5;
+		expectedMatches[1].input = 'test1test2';
+
+		assert(deepEqual(matches, expectedMatches), "#1");
+
+		// try the same regexp with a different string
+		regex.lastIndex = 0;
+		match = regex.exec(' test5');
+		var expectedMatch = [
+		  'test5',
+		  'e',
+		  'st5',
+		  '5'
+		];
+		expectedMatch.index = 1;
+		expectedMatch.input = ' test5';
+		assert(deepEqual(match, expectedMatch), "#2");
+		assert.sameValue(regex.lastIndex, 6, "#3");
+
+		// continue matching with a different string
+		match = regex.exec(' test5test6');
+		expectedMatch = [
+		  'test6',
+		  'e',
+		  'st6',
+		  '6'
+		];
+		expectedMatch.index = 6;
+		expectedMatch.input = ' test5test6';
+		assert(deepEqual(match, expectedMatch), "#4");
+		assert.sameValue(regex.lastIndex, 11, "#5");
+
+		match = regex.exec(' test5test6');
+		assert.sameValue(match, null, "#6");
+		return regex;
+	});
+	`
+	vm := New()
+	v, err := vm.RunString(TESTLIBX + SCRIPT)
+	if err != nil {
+		t.Fatal(err)
+	}
+	var f func(bool) (*Object, error)
+	err = vm.ExportTo(v, &f)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	regex, err := f(false)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil {
+		t.Fatal("Cache is not nil (non-unicode)")
+	}
+
+	regex, err = f(true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if regex.self.(*regexpObject).pattern.regexp2Wrapper.cache != nil {
+		t.Fatal("Cache is not nil (unicode)")
+	}
+
+}
+
 func BenchmarkRegexpSplitWithBackRef(b *testing.B) {
 	const SCRIPT = `
 	"aaaaaaaaaaaaaaaaaaaaaaaaa++bbbbbbbbbbbbbbbbbbbbbb+-ccccccccccccccccccccccc".split(/([+-])\1/)
@@ -407,3 +499,64 @@ func BenchmarkRegexpMatch(b *testing.B) {
 		vm.RunProgram(prg)
 	}
 }
+
+func BenchmarkRegexpMatchCache(b *testing.B) {
+	const SCRIPT = `
+	(function() {
+		var s = "a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+         a\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\ra\nb\r\c\nd\r\e\n\f\rg\nh\r\
+        "
+		var r = /[^\r\n]+/g
+		while(r.exec(s)) {};
+	});
+	`
+	vm := New()
+	v, err := vm.RunString(SCRIPT)
+	if err != nil {
+		b.Fatal(err)
+	}
+	if fn, ok := AssertFunction(v); ok {
+		b.ResetTimer()
+		b.ReportAllocs()
+		for i := 0; i < b.N; i++ {
+			fn(_undefined)
+		}
+	} else {
+		b.Fatal("not a function")
+	}
+}