Răsfoiți Sursa

Implement workarounds for regex parsing known issues (#1603)

adams85 2 ani în urmă
părinte
comite
6eddc61db4

+ 0 - 4
Jint.Tests.Test262/Test262Harness.settings.json

@@ -44,14 +44,10 @@
     "language/literals/regexp/named-groups/forward-reference.js",
     "language/literals/regexp/named-groups/forward-reference.js",
 
 
     // RegExp handling problems
     // RegExp handling problems
-    "built-ins/RegExp/match-indices/indices-array-unicode-property-names.js",
     "built-ins/RegExp/named-groups/non-unicode-match.js",
     "built-ins/RegExp/named-groups/non-unicode-match.js",
     "built-ins/RegExp/named-groups/non-unicode-property-names-valid.js",
     "built-ins/RegExp/named-groups/non-unicode-property-names-valid.js",
-    "built-ins/RegExp/named-groups/non-unicode-property-names.js",
     "built-ins/RegExp/named-groups/unicode-match.js",
     "built-ins/RegExp/named-groups/unicode-match.js",
     "built-ins/RegExp/named-groups/unicode-property-names-valid.js",
     "built-ins/RegExp/named-groups/unicode-property-names-valid.js",
-    "built-ins/RegExp/named-groups/unicode-property-names.js",
-    "built-ins/RegExp/prototype/Symbol.replace/named-groups.js",
     "built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js",
     "built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js",
     "built-ins/String/prototype/split/separator-regexp.js",
     "built-ins/String/prototype/split/separator-regexp.js",
     "language/literals/regexp/u-case-mapping.js",
     "language/literals/regexp/u-case-mapping.js",

+ 38 - 1
Jint.Tests/Runtime/RegExpTests.cs

@@ -35,7 +35,7 @@ public class RegExpTests
     public void PreventsInfiniteLoop()
     public void PreventsInfiniteLoop()
     {
     {
         var engine = new Engine();
         var engine = new Engine();
-        var result = (ArrayInstance)engine.Evaluate("'x'.match(/|/g);");
+        var result = (ArrayInstance) engine.Evaluate("'x'.match(/|/g);");
         Assert.Equal((uint) 2, result.Length);
         Assert.Equal((uint) 2, result.Length);
         Assert.Equal("", result[0]);
         Assert.Equal("", result[0]);
         Assert.Equal("", result[1]);
         Assert.Equal("", result[1]);
@@ -103,4 +103,41 @@ public class RegExpTests
         var source = engine.Evaluate(@"/\/\//.source");
         var source = engine.Evaluate(@"/\/\//.source");
         Assert.Equal("\\/\\/", source);
         Assert.Equal("\\/\\/", source);
     }
     }
+
+    [Theory]
+    [InlineData("", "/()/ug", new[] { "" }, new[] { 0 })]
+    [InlineData("💩", "/()/ug", new[] { "", "" }, new[] { 0, 2 })]
+    [InlineData("ᴜⁿᵢ𝒸ₒᵈₑ is a 💩", "/i?/ug",
+        new[] { "", "", "", "", "", "", "", "", "i", "", "", "", "", "", "" },
+        new[] { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 })]
+    public void ShouldNotMatchEmptyStringsWithinSurrogatePairsInUnicodeMode(string input, string pattern, string[] expectedCaptures, int[] expectedIndices)
+    {
+        var engine = new Engine();
+        var matches = engine.Evaluate($"[...'{input}'.matchAll({pattern})]").AsArray();
+        Assert.Equal((ulong) expectedCaptures.Length, matches.Length);
+        Assert.Equal(expectedCaptures, matches.Select((m, i) => m.Get(0).AsString()));
+        Assert.Equal(expectedIndices, matches.Select(m => m.Get("index").AsInteger()));
+    }
+
+    [Fact]
+    public void ShouldAllowProblematicGroupNames()
+    {
+        var engine = new Engine();
+
+        var match = engine.Evaluate("'abc'.match(/(?<$group>b)/)").AsArray();
+        var groups = match.Get("groups").AsObject();
+        Assert.Equal(new[] { "$group" }, groups.GetOwnPropertyKeys().Select(k => k.AsString()));
+        Assert.Equal("b", groups["$group"]);
+
+        var result = engine.Evaluate("'abc'.replace(/(?<$group>b)/g, '-$<$group>-')").AsString();
+        Assert.Equal("a-b-c", result);
+    }
+
+    [Fact]
+    public void Issue506()
+    {
+        var engine = new Engine();
+        var result = engine.Evaluate("/[^]?(:[rp][el]a[\\w-]+)[^]/.test(':reagent-')").AsBoolean();
+        Assert.True(result);
+    }
 }
 }

+ 50 - 44
Jint/Native/RegExp/RegExpPrototype.cs

@@ -1,4 +1,5 @@
 using System.Diagnostics.CodeAnalysis;
 using System.Diagnostics.CodeAnalysis;
+using System.Text;
 using System.Text.RegularExpressions;
 using System.Text.RegularExpressions;
 using Jint.Collections;
 using Jint.Collections;
 using Jint.Native.Number;
 using Jint.Native.Number;
@@ -824,7 +825,7 @@ namespace Jint.Native.RegExp
             var exec = r.Get(PropertyExec);
             var exec = r.Get(PropertyExec);
             if (exec is ICallable callable)
             if (exec is ICallable callable)
             {
             {
-                var result = callable.Call(r, new JsValue[]  { s });
+                var result = callable.Call(r, new JsValue[] { s });
                 if (!result.IsNull() && !result.IsObject())
                 if (!result.IsNull() && !result.IsObject())
                 {
                 {
                     ExceptionHelper.ThrowTypeError(r.Engine.Realm);
                     ExceptionHelper.ThrowTypeError(r.Engine.Realm);
@@ -902,31 +903,47 @@ namespace Jint.Native.RegExp
 
 
             // the stateful version
             // the stateful version
             Match match;
             Match match;
-            while (true)
+
+            if (lastIndex > length)
             {
             {
-                if (lastIndex > length)
-                {
-                    R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
-                    return Null;
-                }
+                R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
+                return Null;
+            }
 
 
-                match = R.Value.Match(s, (int) lastIndex);
-                var success = match.Success && (!sticky || match.Index == (int) lastIndex);
-                if (!success)
+            var startAt = (int) lastIndex;
+            while (true)
+            {
+                match = R.Value.Match(s, startAt);
+
+                // The conversion of Unicode regex patterns to .NET Regex has some flaws:
+                // when the pattern may match empty strings, the adapted Regex will return empty-string matches
+                // in the middle of surrogate pairs. As a best-effort solution, we skip these false-positive matches.
+                // (See also: https://github.com/sebastienros/esprima-dotnet/pull/364#issuecomment-1606045259)
+
+                if (match.Success
+                    && fullUnicode
+                    && match.Length == 0
+                    && 0 < match.Index && match.Index < s.Length
+                    && char.IsHighSurrogate(s[match.Index - 1]) && char.IsLowSurrogate(s[match.Index]))
                 {
                 {
-                    R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
-                    return Null;
+                    startAt++;
+                    continue;
                 }
                 }
 
 
                 break;
                 break;
             }
             }
 
 
-            var e = match.Index + match.Length;
-            if (fullUnicode)
+            var success = match.Success && (!sticky || match.Index == (int) lastIndex);
+            if (!success)
             {
             {
-                e = GetStringIndex(s, e);
+                R.Set(JsRegExp.PropertyLastIndex, JsNumber.PositiveZero, true);
+                return Null;
             }
             }
 
 
+            var e = match.Index + match.Length;
+
+            // NOTE: Even in Unicode mode, we don't need to translate indices as .NET regexes always return code unit indices.
+
             if (global || sticky)
             if (global || sticky)
             {
             {
                 R.Set(JsRegExp.PropertyLastIndex, e, true);
                 R.Set(JsRegExp.PropertyLastIndex, e, true);
@@ -935,35 +952,6 @@ namespace Jint.Native.RegExp
             return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices);
             return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices);
         }
         }
 
 
-        /// <summary>
-        /// https://tc39.es/ecma262/#sec-getstringindex
-        /// </summary>
-        private static int GetStringIndex(string s, int codePointIndex)
-        {
-            if (s.Length == 0)
-            {
-                return 0;
-            }
-
-            var len = s.Length;
-            var codeUnitCount = 0;
-            var codePointCount = 0;
-
-            while (codeUnitCount < len)
-            {
-                if (codePointCount == codePointIndex)
-                {
-                    return codeUnitCount;
-                }
-
-                var isSurrogatePair = char.IsSurrogatePair(s, codeUnitCount);
-                codeUnitCount += isSurrogatePair ? 2 : 1;
-                codePointCount += 1;
-            }
-
-            return len;
-        }
-
         private static JsArray CreateReturnValueArray(
         private static JsArray CreateReturnValueArray(
             Engine engine,
             Engine engine,
             Regex regex,
             Regex regex,
@@ -1080,6 +1068,24 @@ namespace Jint.Native.RegExp
                 return null;
                 return null;
 
 
             }
             }
+
+            // The characters allowed in group names differ between the JS and .NET regex engines.
+            // For example, the group name "$group" is valid in JS but invalid in .NET.
+            // As a workaround for this issue, the parser attempts to encode the problematic group names as
+            // names which are valid in .NET and probably won't collide with other group names present in the pattern
+            // (https://github.com/sebastienros/esprima-dotnet/blob/v3.0.0-rc-03/src/Esprima/Scanner.RegExpParser.cs#L942).
+            // We need to decode such group names.
+            const string encodedGroupNamePrefix = "__utf8_";
+            if (groupNameFromNumber.StartsWith(encodedGroupNamePrefix, StringComparison.Ordinal))
+            {
+                try
+                {
+                    var bytes = groupNameFromNumber.AsSpan(encodedGroupNamePrefix.Length).BytesFromHexString();
+                    groupNameFromNumber = Encoding.UTF8.GetString(bytes);
+                }
+                catch { /* intentional no-op */ }
+            }
+
             return groupNameFromNumber;
             return groupNameFromNumber;
         }
         }
 
 

+ 41 - 0
Jint/Shims.cs

@@ -0,0 +1,41 @@
+namespace Jint;
+
+internal static class Shims
+{
+    public static byte[] BytesFromHexString(this ReadOnlySpan<char> value)
+    {
+#if NET6_0_OR_GREATER
+        return Convert.FromHexString(value);
+#else
+        if ((value.Length & 1) != 0)
+        {
+            throw new FormatException();
+        }
+
+        var byteCount = value.Length >> 1;
+        var result = new byte[byteCount];
+        var index = 0;
+        for (var i = 0; i < byteCount; i++)
+        {
+            int hi, lo;
+            if ((hi = GetDigitValue(value[index++])) < 0
+                || (lo = GetDigitValue(value[index++])) < 0)
+            {
+                throw new FormatException();
+            }
+
+            result[i] = (byte) (hi << 4 | lo);
+        }
+
+        return result;
+
+        static int GetDigitValue(char ch) => ch switch
+        {
+            >= '0' and <= '9' => ch - 0x30,
+            >= 'a' and <= 'f' => ch - 0x57,
+            >= 'A' and <= 'F' => ch - 0x37,
+            _ => -1
+        };
+#endif
+    }
+}