Browse Source

Unescape non-ascii characters for re2 (closes #23)

Dmitry Panov 8 years ago
parent
commit
c99b0db935
2 changed files with 10 additions and 1 deletion
  1. 2 1
      parser/regexp.go
  2. 8 0
      regexp_test.go

+ 2 - 1
parser/regexp.go

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 )
 
 const (
@@ -315,7 +316,7 @@ func (self *_RegExp_parser) scanEscape(inClass bool) {
 	default:
 		// $ is an identifier character, so we have to have
 		// a special case for it here
-		if self.chr == '$' || !isIdentifierPart(self.chr) {
+		if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) {
 			// A non-identifier character needs escaping
 			err := self.goRegexp.WriteByte('\\')
 			if err != nil {

+ 8 - 0
regexp_test.go

@@ -182,6 +182,14 @@ func TestRegexpSplitWithBackRef(t *testing.T) {
 	testScript1(SCRIPT, asciiString("a $$ + $$ b+-c"), t)
 }
 
+func TestEscapeNonASCII(t *testing.T) {
+	const SCRIPT = `
+	/\⩓/.test("⩓")
+	`
+
+	testScript1(SCRIPT, valueTrue, t)
+}
+
 func BenchmarkRegexpSplitWithBackRef(b *testing.B) {
 	const SCRIPT = `
 	"aaaaaaaaaaaaaaaaaaaaaaaaa++bbbbbbbbbbbbbbbbbbbbbb+-ccccccccccccccccccccccc".split(/([+-])\1/)