浏览代码

Implement RegExp named capture groups (#1309)

Marko Lahma 2 年之前
父节点
当前提交
d7b8ca1224

+ 10 - 2
Jint.Repl/Program.cs

@@ -51,6 +51,8 @@ namespace Jint.Repl
                 AdaptRegexp = true
             };
 
+            var serializer = new JsonSerializer(engine);
+
             while (true)
             {
                 Console.ForegroundColor = defaultColor;
@@ -66,13 +68,19 @@ namespace Jint.Repl
                     var result = engine.Evaluate(input, parserOptions);
                     if (!result.IsPrimitive() && result is not IPrimitiveInstance)
                     {
-                        var serializer = new JsonSerializer(engine);
                         var str = serializer.Serialize(result, Undefined.Instance, "  ");
                         Console.WriteLine(str);
                     }
                     else
                     {
-                        Console.WriteLine(result);
+                        if (result.IsString())
+                        {
+                            Console.WriteLine(serializer.Serialize(result, Undefined.Instance, Undefined.Instance));
+                        }
+                        else
+                        {
+                            Console.WriteLine(result);
+                        }
                     }
                 }
                 catch (JavaScriptException je)

+ 1 - 1
Jint.Tests.Test262/.config/dotnet-tools.json

@@ -3,7 +3,7 @@
   "isRoot": true,
   "tools": {
     "test262harness.console": {
-      "version": "0.0.17",
+      "version": "0.0.22",
       "commands": [
         "test262"
       ]

+ 28 - 34
Jint.Tests.Test262/Test262Harness.settings.json

@@ -23,7 +23,6 @@
     "Promise.any",
     "regexp-duplicate-named-groups",
     "regexp-match-indices",
-    "regexp-named-groups",
     "regexp-lookbehind",
     "regexp-unicode-property-escapes",
     "regexp-v-flag",
@@ -49,31 +48,38 @@
     "language/expressions/assignment/fn-name-lhs-cover.js",
 
     // Unicode support not built-in to .NET the same way, requires more work
-    "built-ins/RegExp/character-class-escape-non-whitespace.js",
-    "built-ins/RegExp/property-escapes/character-class.js",
     "built-ins/RegExp/prototype/Symbol.match/builtin-infer-unicode.js",
-    "built-ins/RegExp/prototype/Symbol.match/builtin-success-u-return-val-groups.js",
-    "built-ins/RegExp/prototype/Symbol.match/u-advance-after-empty.js",
-    "built-ins/RegExp/prototype/Symbol.replace/u-advance-after-empty.js",
     "built-ins/RegExp/prototype/Symbol.search/u-lastindex-advance.js",
-    "built-ins/RegExp/unicode_restricted_quantifiable_assertion.js",
-    "built-ins/RegExp/prototype/exec/u-lastindex-value.js",
-    "built-ins/RegExp/unicode_restricted_character_class_escape.js",
-    "built-ins/RegExp/unicode_restricted_identity_escape.js",
+    "built-ins/RegExp/prototype/exec/u-lastindex-adv.js",
+    "built-ins/RegExp/unicode_character_class_backspace_escape.js",
     "built-ins/RegExp/unicode_restricted_identity_escape_alpha.js",
     "built-ins/RegExp/unicode_restricted_identity_escape_c.js",
     "built-ins/RegExp/unicode_restricted_identity_escape_u.js",
-    "built-ins/RegExp/unicode_identity_escape.js",
-    "built-ins/RegExp/unicode_character_class_backspace_escape.js",
     "built-ins/RegExp/unicode_restricted_identity_escape_x.js",
     "language/literals/regexp/u-astral-char-class-invert.js",
+    "language/literals/regexp/u-astral.js",
+    "language/literals/regexp/u-case-mapping.js",
 
-    // Issue with \r in source string
-    "built-ins/RegExp/dotall/without-dotall.js",
-    "built-ins/RegExp/dotall/without-dotall-unicode.js",
+    // .NET cannot use characters like 𝒜 in a group name, nor group names starting with $; plus other .NET limitations
+    "built-ins/RegExp/named-groups/non-unicode-match.js",
+    "built-ins/RegExp/named-groups/non-unicode-property-names-valid.js",
+    "built-ins/RegExp/named-groups/non-unicode-property-names.js",
+    "built-ins/RegExp/named-groups/unicode-match.js",
+    "built-ins/RegExp/named-groups/unicode-property-names-valid.js",
+    "built-ins/RegExp/named-groups/unicode-property-names.js",
+    "built-ins/RegExp/prototype/Symbol.replace/named-groups.js",
+    "built-ins/RegExp/quantifier-integer-limit.js",
+
+    // more validation and cleanup needed
+    "built-ins/RegExp/S15.10.2.13_A1_T1.js",
+    "built-ins/RegExp/S15.10.2.13_A1_T2.js",
+    "built-ins/RegExp/character-class-escape-non-whitespace.js",
+    "built-ins/RegExp/unicode_character_class_backspace_escape.js",
+    "built-ins/RegExp/unicode_identity_escape.js",
+    "built-ins/RegExp/unicode_restricted_character_class_escape.js",
+    "built-ins/RegExp/unicode_restricted_identity_escape.js",
+    "built-ins/RegExp/unicode_restricted_quantifiable_assertion.js",
 
-    // regex named groups
-    "built-ins/String/prototype/replaceAll/searchValue-replacer-RegExp-call.js",
 
     // requires investigation how to process complex function name evaluation for property
     "built-ins/Function/prototype/toString/method-computed-property-name.js",
@@ -88,7 +94,6 @@
     "built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js",
     "built-ins/RegExp/S15.10.2.5_A1_T4.js",
 
-    "built-ins/RegExp/prototype/exec/u-lastindex-adv.js", // Esprima strips unicode sequence and match logic does not work
     "built-ins/String/raw/special-characters.js", // Windows line ending differences
     "language/expressions/object/method-definition/object-method-returns-promise.js", // Promise not implemented
     "language/statements/class/definition/class-method-returns-promise.js", // Promise not implemented
@@ -148,22 +153,18 @@
     "language/expressions/super/prop-expr-obj-val-from-eval.js",
 
     // Esprima problem
-    "built-ins/RegExp/quantifier-integer-limit.js",
+
+    "language/literals/regexp/u-surrogate-pairs-atom-escape-decimal.js",
+    "language/literals/regexp/u-unicode-esc.js",
     "built-ins/String/prototype/split/separator-regexp.js",
+
+
     "language/expressions/object/method-definition/name-super-prop-param.js",
     "language/expressions/optional-chaining/member-expression.js",
     "language/statements/for-of/dstr-obj-id-init-let.js",
     "language/statements/for/head-lhs-let.js",
     "language/expressions/object/yield-non-strict-access.js",
     "language/expressions/object/yield-non-strict-syntax.js",
-    "built-ins/RegExp/prototype/source/value-u.js",
-    "built-ins/RegExp/prototype/source/value-line-terminator.js",
-    "built-ins/RegExp/S15.10.2.13_A1_T1.js",
-    "built-ins/RegExp/S15.10.2.13_A1_T17.js",
-    "built-ins/RegExp/S15.10.2.13_A1_T2.js",
-    "built-ins/RegExp/S15.10.2.13_A2_T1.js",
-    "built-ins/RegExp/S15.10.2.13_A2_T2.js",
-    "built-ins/RegExp/S15.10.2.13_A2_T8.js",
     "language/expressions/object/let-non-strict-access.js",
     "language/expressions/object/let-non-strict-syntax.js",
     "language/expressions/assignment/dstr-obj-id-identifier-yield-ident-valid.js",
@@ -449,13 +450,6 @@
     "language/expressions/template-literal/tv-line-continuation.js",
     "language/function-code/eval-param-env-with-computed-key.js",
     "language/function-code/eval-param-env-with-prop-initializer.js",
-    "language/literals/regexp/u-astral.js",
-    "language/literals/regexp/u-case-mapping.js",
-    "language/literals/regexp/u-null-character-escape.js",
-    "language/literals/regexp/u-surrogate-pairs-atom-char-class.js",
-    "language/literals/regexp/u-surrogate-pairs-atom-escape-decimal.js",
-    "language/literals/regexp/u-surrogate-pairs.js",
-    "language/literals/regexp/u-unicode-esc.js",
     "language/literals/string/line-separator-eval.js",
     "language/literals/string/line-separator.js",
     "language/literals/string/paragraph-separator-eval.js",

+ 0 - 1
Jint.Tests/Runtime/ModuleTests.cs

@@ -1,4 +1,3 @@
-using Esprima;
 using Jint.Native;
 using Jint.Runtime;
 

+ 1 - 1
Jint/Native/Array/ArrayPrototype.cs

@@ -1474,7 +1474,7 @@ namespace Jint.Native.Array
         {
             var grouping = BuildArrayGrouping(thisObject, arguments, mapMode: false);
 
-            var obj = OrdinaryObjectCreate(null);
+            var obj = OrdinaryObjectCreate(_engine, null);
             foreach (var pair in grouping)
             {
                 obj.FastSetProperty(pair.Key, new PropertyDescriptor(pair.Value, PropertyFlag.ConfigurableEnumerableWritable));

+ 2 - 2
Jint/Native/Object/ObjectInstance.cs

@@ -1243,9 +1243,9 @@ namespace Jint.Native.Object
         /// <summary>
         /// https://tc39.es/ecma262/#sec-ordinaryobjectcreate
         /// </summary>
-        internal ObjectInstance OrdinaryObjectCreate(ObjectInstance? proto)
+        internal static ObjectInstance OrdinaryObjectCreate(Engine engine, ObjectInstance? proto)
         {
-            var prototype = new ObjectInstance(_engine)
+            var prototype = new ObjectInstance(engine)
             {
                 _prototype = proto
             };

+ 1 - 1
Jint/Native/RegExp/RegExpInstance.cs

@@ -8,7 +8,7 @@ namespace Jint.Native.RegExp
     public class RegExpInstance : ObjectInstance
     {
         internal const string regExpForMatchingAllCharacters = "(?:)";
-        internal static readonly JsString PropertyLastIndex = new JsString("lastIndex");
+        internal static readonly JsString PropertyLastIndex = new("lastIndex");
 
         private string _flags = null!;
 

+ 116 - 33
Jint/Native/RegExp/RegExpPrototype.cs

@@ -1,5 +1,4 @@
 using System.Diagnostics.CodeAnalysis;
-using System.Globalization;
 using System.Text.RegularExpressions;
 using Jint.Collections;
 using Jint.Native.Array;
@@ -16,14 +15,15 @@ namespace Jint.Native.RegExp
 {
     public sealed class RegExpPrototype : Prototype
     {
-        private static readonly JsString PropertyExec = new JsString("exec");
-        private static readonly JsString PropertyIndex = new JsString("index");
-        private static readonly JsString PropertyInput = new JsString("input");
-        private static readonly JsString PropertySticky = new JsString("sticky");
-        private static readonly JsString PropertyGlobal = new JsString("global");
-        internal static readonly JsString PropertySource = new JsString("source");
-        private static readonly JsValue DefaultSource = new JsString("(?:)");
-        internal static readonly JsString PropertyFlags = new JsString("flags");
+        private static readonly JsString PropertyExec = new("exec");
+        private static readonly JsString PropertyIndex = new("index");
+        private static readonly JsString PropertyInput = new("input");
+        private static readonly JsString PropertySticky = new("sticky");
+        private static readonly JsString PropertyGlobal = new("global");
+        internal static readonly JsString PropertySource = new("source");
+        private static readonly JsString DefaultSource = new("(?:)");
+        internal static readonly JsString PropertyFlags = new("flags");
+        private static readonly JsString PropertyGroups = new("groups");
 
         private readonly RegExpConstructor _constructor;
         private readonly Func<JsValue, JsValue[], JsValue> _defaultExec;
@@ -115,7 +115,7 @@ namespace Jint.Native.RegExp
                 return JsString.Empty;
             }
 
-            return r.Source.Replace("/", "\\/");
+            return r.Source.Replace("/", "\\/").Replace("\n", "\\n");
         }
 
         /// <summary>
@@ -165,16 +165,27 @@ namespace Jint.Native.RegExp
                         var replacerArgs = new List<JsValue>(match.Groups.Count + 2);
                         replacerArgs.Add(match.Value);
 
+                        ObjectInstance? groups = null;
                         for (var i = 1; i < match.Groups.Count; i++)
                         {
                             var capture = match.Groups[i];
-                            replacerArgs.Add(capture.Value);
+                            replacerArgs.Add(capture.Success ? capture.Value : Undefined);
+
+                            var groupName = GetRegexGroupName(rei.Value, i);
+                            if (!string.IsNullOrWhiteSpace(groupName))
+                            {
+                                groups ??= OrdinaryObjectCreate(_engine, null);
+                                groups.CreateDataPropertyOrThrow(groupName, capture.Success ? capture.Value : Undefined);
+                            }
                         }
 
                         replacerArgs.Add(match.Index);
                         replacerArgs.Add(s);
+                        if (groups is not null)
+                        {
+                            replacerArgs.Add(groups);
+                        }
 
-                        // no named captures
                         return CallFunctionalReplace(replaceValue, replacerArgs);
                     }
 
@@ -218,8 +229,9 @@ namespace Jint.Native.RegExp
             var nextSourcePosition = 0;
 
             var captures = new List<string>();
-            foreach (var result in results)
+            for (var i = 0; i < results.Count; i++)
             {
+                var result = results[i];
                 var nCaptures = (int) result.Length;
                 nCaptures = System.Math.Max(nCaptures - 1, 0);
                 var matched = TypeConverter.ToString(result.Get(0));
@@ -237,7 +249,7 @@ namespace Jint.Native.RegExp
                     n++;
                 }
 
-                var namedCaptures = result.Get("groups");
+                var namedCaptures = result.Get(PropertyGroups);
                 string replacement;
                 if (functionalReplace)
                 {
@@ -254,6 +266,7 @@ namespace Jint.Native.RegExp
                     {
                         replacerArgs.Add(namedCaptures);
                     }
+
                     replacement = CallFunctionalReplace(replaceValue, replacerArgs);
                 }
                 else
@@ -290,6 +303,9 @@ namespace Jint.Native.RegExp
             return TypeConverter.ToString(result);
         }
 
+        /// <summary>
+        /// https://tc39.es/ecma262/#sec-getsubstitution
+        /// </summary>
         internal static string GetSubstitution(
             string matched,
             string str,
@@ -332,6 +348,26 @@ namespace Jint.Native.RegExp
                         case '\'':
                             sb.Append(str.Substring(position + matched.Length));
                             break;
+                        case '<':
+                            var gtPos = replacement.IndexOf('>', i + 1);
+                            if (gtPos == -1 || namedCaptures.IsUndefined())
+                            {
+                                sb.Append('$');
+                                sb.Append(c);
+                            }
+                            else
+                            {
+                                var startIndex = i + 1;
+                                var groupName = replacement.Substring(startIndex, gtPos - startIndex);
+                                var capture = namedCaptures.Get(groupName);
+                                if (!capture.IsUndefined())
+                                {
+                                    sb.Append(TypeConverter.ToString(capture));
+                                }
+
+                                i = gtPos;
+                            }
+                            break;
                         default:
                             {
                                 if (char.IsDigit(c))
@@ -607,6 +643,9 @@ namespace Jint.Native.RegExp
             return !match.IsNull();
         }
 
+        /// <summary>
+        /// https://tc39.es/ecma262/#sec-regexp.prototype-@@search
+        /// </summary>
         private JsValue Search(JsValue thisObj, JsValue[] arguments)
         {
             var rx = AssertThisIsObjectInstance(thisObj, "RegExp.prototype.search");
@@ -633,6 +672,9 @@ namespace Jint.Native.RegExp
             return result.Get(PropertyIndex);
         }
 
+        /// <summary>
+        /// https://tc39.es/ecma262/#sec-regexp.prototype-@@match
+        /// </summary>
         private JsValue Match(JsValue thisObj, JsValue[] arguments)
         {
             var rx = AssertThisIsObjectInstance(thisObj, "RegExp.prototype.match");
@@ -847,7 +889,7 @@ namespace Jint.Native.RegExp
                     return Null;
                 }
 
-                return CreateReturnValueArray(R.Engine, m, s, fullUnicode: false);
+                return CreateReturnValueArray(R.Engine, matcher, m, s, fullUnicode: false);
             }
 
             // the stateful version
@@ -874,24 +916,47 @@ namespace Jint.Native.RegExp
             var e = match.Index + match.Length;
             if (fullUnicode)
             {
-                // e is an index into the Input character list, derived from S, matched by matcher.
-                // Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
-                // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
-                // Set e to eUTF.
-                var indexes = StringInfo.ParseCombiningCharacters(s);
-                if (match.Index < indexes.Length)
+                e = GetStringIndex(s, e);
+            }
+
+            if (global || sticky)
+            {
+                R.Set(RegExpInstance.PropertyLastIndex, e, true);
+            }
+
+            return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode);
+        }
+
+        /// <summary>
+        /// https://tc39.es/ecma262/#sec-getstringindex - maps a code point index within <paramref name="s"/> to the UTF-16 code unit index where that code point starts.
+        /// </summary>
+        private static int GetStringIndex(string s, int codePointIndex)
+        {
+            if (s.Length == 0)
+            {
+                return 0; // empty input: the only possible answer is 0
+            }
+
+            var len = s.Length;
+            var codeUnitCount = 0; // current position in s, counted in UTF-16 code units
+            var codePointCount = 0; // current position in s, counted in code points
+
+            while (codeUnitCount < len)
+            {
+                if (codePointCount == codePointIndex)
                 {
-                    var sub = StringInfo.GetNextTextElement(s, match.Index);
-                    e += sub.Length - 1;
+                    return codeUnitCount; // reached the requested code point; report where it starts
                 }
-            }
 
-            R.Set(RegExpInstance.PropertyLastIndex, e, true);
+                var isSurrogatePair = char.IsSurrogatePair(s, codeUnitCount); // a surrogate pair at this position is one code point spanning two code units
+                codeUnitCount += isSurrogatePair ? 2 : 1;
+                codePointCount += 1;
+            }
 
-            return CreateReturnValueArray(R.Engine, match, s, fullUnicode);
+            return len; // no code point has that index (e.g. it lies past the end): fall back to the code unit length
         }
 
-        private static ArrayInstance CreateReturnValueArray(Engine engine, Match match, string inputValue, bool fullUnicode)
+        private static ArrayInstance CreateReturnValueArray(Engine engine, Regex regex, Match match, string inputValue, bool fullUnicode)
         {
             var array = engine.Realm.Intrinsics.Array.ArrayCreate((ulong) match.Groups.Count);
             array.CreateDataProperty(PropertyIndex, match.Index);
@@ -904,22 +969,40 @@ namespace Jint.Native.RegExp
                 var capturedValue = Undefined;
                 if (capture?.Success == true)
                 {
-                    capturedValue = fullUnicode
-                        ? StringInfo.GetNextTextElement(inputValue, capture.Index)
-                        : capture.Value;
-
+                    capturedValue = capture.Value;
+                }
 
-                    // todo detect captured name
+                var groupName = GetRegexGroupName(regex, (int) i);
+                if (!string.IsNullOrWhiteSpace(groupName))
+                {
+                    groups ??= OrdinaryObjectCreate(engine, null);
+                    groups.CreateDataPropertyOrThrow(groupName, capturedValue);
                 }
 
                 array.SetIndexValue(i, capturedValue, updateLength: false);
             }
 
-            array.CreateDataProperty("groups", groups ?? Undefined);
+            array.CreateDataProperty(PropertyGroups, groups ?? Undefined);
 
             return array;
         }
 
+        /// <summary>Returns the name of capture group <paramref name="index"/>, or null for the whole match (index 0) and for unnamed groups.</summary>
+        private static string? GetRegexGroupName(Regex regex, int index)
+        {
+            if (index == 0)
+            {
+                return null;
+            }
+            var groupNameFromNumber = regex.GroupNameFromNumber(index);
+            // .NET names an unnamed group after its number ("1", "2", ..., "10", ...); compare the whole number so groups 10 and up are recognised as unnamed too
+            if (groupNameFromNumber == index.ToString(System.Globalization.CultureInfo.InvariantCulture))
+            {
+                return null;
+            }
+            return groupNameFromNumber;
+        }
+
         private JsValue Exec(JsValue thisObj, JsValue[] arguments)
         {
             var r = thisObj as RegExpInstance;

+ 3 - 0
Jint/Native/String/StringPrototype.cs

@@ -830,6 +830,9 @@ namespace Jint.Native.String
             return (long) s[position];
         }
 
+        /// <summary>
+        /// https://tc39.es/ecma262/#sec-string.prototype.codepointat
+        /// </summary>
         private JsValue CodePointAt(JsValue thisObj, JsValue[] arguments)
         {
             TypeConverter.CheckObjectCoercible(Engine, thisObj);

+ 1 - 0
README.md

@@ -61,6 +61,7 @@ The entire execution engine was rebuilt with performance in mind, in many cases
 #### ECMAScript 2018
 
 - ✔ `Promise.prototype.finally`
+- ✔ RegExp named capture groups
 - ✔ Rest/spread operators for object literals (`...identifier`),
 
 #### ECMAScript 2019