瀏覽代碼

Adding cultural check to the "ToUpperCase" string prototype. (#1693)

* Adding cultural check to the "ToUpperCase" string prototype to fix some tests.
* Make character filtering criteria work for larger strings instead of for the test characters only.
* Moving culture-specific logic from StringPrototype to the newly created StringInlHelper class.
* Data strings to a class of their own for StringTests clarity. Half way done with all the cases.
* Generate the test strings for all the cases, document and clean up.
* Fixing NET462 incorrect uppercase for specific characters.

---------

Co-authored-by: Luis Merino <[email protected]>
LuisMerinoP 1 年之前
父節點
當前提交
84cd41ab94

+ 18 - 0
Jint.Tests/Runtime/StringTests.cs

@@ -81,4 +81,22 @@ bar += 'bar';
         Assert.True(result.HasOwnProperty("key"));
         Assert.Equal("value", result["key"]);
     }
+
+    /// <summary>
+    /// Supplies the (input, expected) string pairs used by the Lithuanian
+    /// <c>toLocaleUpperCase</c> theory; the data itself is built by <see cref="StringTetsLithuaniaData"/>.
+    /// </summary>
+    public static TheoryData GetLithuaniaTestsData() => new StringTetsLithuaniaData().TestData();
+
+    /// <summary>
+    /// The Test262 suite only checks Lithuanian special casing one character at a time. See:
+    /// https://github.com/tc39/test262/blob/main/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js
+    /// The engine has to handle complete strings rather than isolated characters, which is what this theory verifies.
+    /// </summary>
+    [Theory]
+    [MemberData(nameof(GetLithuaniaTestsData))]
+    public void LithuanianToLocaleUpperCase(string parseStr, string result)
+    {
+        // Embed the input as a single-quoted JavaScript literal and upper-case it with the 'lt' locale.
+        var script = $"('{parseStr}').toLocaleUpperCase('lt')";
+        var actual = _engine.Evaluate(script).AsString();
+
+        Assert.Equal(result, actual);
+    }
 }

+ 97 - 0
Jint.Tests/Runtime/StringTetsLithuaniaData.cs

@@ -0,0 +1,97 @@
+namespace Jint.Tests.Runtime
+{
+    /// <summary>
+    /// Builds the (input, expected upper-cased result) string pairs for the Lithuanian
+    /// <c>toLocaleUpperCase</c> tests, based on the Test262 soft-dotted character list.
+    /// </summary>
+    public class StringTetsLithuaniaData
+    {
+        // Each row holds the non-uppercased string that will be processed by the engine and the expected result.
+        // Rows are appended by AddStringsForChars and handed out by TestData.
+        private readonly TheoryData<string, string> fullSetOfData = new TheoryData<string, string>();
+        // Soft_Dotted characters, taken from:
+        // https://github.com/tc39/test262/blob/main/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js
+        // Supplementary-plane characters are written as UTF-16 surrogate pairs.
+        private readonly string[] softDotted = [
+            "\u0069", "\u006A",   // LATIN SMALL LETTER I..LATIN SMALL LETTER J
+            "\u012F",             // LATIN SMALL LETTER I WITH OGONEK
+            "\u0249",             // LATIN SMALL LETTER J WITH STROKE
+            "\u0268",             // LATIN SMALL LETTER I WITH STROKE
+            "\u029D",             // LATIN SMALL LETTER J WITH CROSSED-TAIL
+            "\u02B2",             // MODIFIER LETTER SMALL J
+            "\u03F3",             // GREEK LETTER YOT
+            "\u0456",             // CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
+            "\u0458",             // CYRILLIC SMALL LETTER JE
+            "\u1D62",             // LATIN SUBSCRIPT SMALL LETTER I
+            "\u1D96",             // LATIN SMALL LETTER I WITH RETROFLEX HOOK
+            "\u1DA4",             // MODIFIER LETTER SMALL I WITH STROKE
+            "\u1DA8",             // MODIFIER LETTER SMALL J WITH CROSSED-TAIL
+            "\u1E2D",             // LATIN SMALL LETTER I WITH TILDE BELOW
+            "\u1ECB",             // LATIN SMALL LETTER I WITH DOT BELOW
+            "\u2071",             // SUPERSCRIPT LATIN SMALL LETTER I
+            "\u2148", "\u2149",   // DOUBLE-STRUCK ITALIC SMALL I..DOUBLE-STRUCK ITALIC SMALL J
+            "\u2C7C",             // LATIN SUBSCRIPT SMALL LETTER J
+            "\uD835\uDC22", "\uD835\uDC23",   // MATHEMATICAL BOLD SMALL I..MATHEMATICAL BOLD SMALL J
+            "\uD835\uDC56", "\uD835\uDC57",   // MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL J
+            "\uD835\uDC8A", "\uD835\uDC8B",   // MATHEMATICAL BOLD ITALIC SMALL I..MATHEMATICAL BOLD ITALIC SMALL J
+            "\uD835\uDCBE", "\uD835\uDCBF",   // MATHEMATICAL SCRIPT SMALL I..MATHEMATICAL SCRIPT SMALL J
+            "\uD835\uDCF2", "\uD835\uDCF3",   // MATHEMATICAL BOLD SCRIPT SMALL I..MATHEMATICAL BOLD SCRIPT SMALL J
+            "\uD835\uDD26", "\uD835\uDD27",   // MATHEMATICAL FRAKTUR SMALL I..MATHEMATICAL FRAKTUR SMALL J
+            "\uD835\uDD5A", "\uD835\uDD5B",   // MATHEMATICAL DOUBLE-STRUCK SMALL I..MATHEMATICAL DOUBLE-STRUCK SMALL J
+            "\uD835\uDD8E", "\uD835\uDD8F",   // MATHEMATICAL BOLD FRAKTUR SMALL I..MATHEMATICAL BOLD FRAKTUR SMALL J
+            "\uD835\uDDC2", "\uD835\uDDC3",   // MATHEMATICAL SANS-SERIF SMALL I..MATHEMATICAL SANS-SERIF SMALL J
+            "\uD835\uDDF6", "\uD835\uDDF7",   // MATHEMATICAL SANS-SERIF BOLD SMALL I..MATHEMATICAL SANS-SERIF BOLD SMALL J
+            "\uD835\uDE2A", "\uD835\uDE2B",   // MATHEMATICAL SANS-SERIF ITALIC SMALL I..MATHEMATICAL SANS-SERIF ITALIC SMALL J
+            "\uD835\uDE5E", "\uD835\uDE5F",   // MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL I..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL J
+            "\uD835\uDE92", "\uD835\uDE93",   // MATHEMATICAL MONOSPACE SMALL I..MATHEMATICAL MONOSPACE SMALL J
+        ];
+
+        // Expected upper-cased form of each softDotted entry; TestData reads both arrays with the same index,
+        // so the two must stay the same length and in the same order.
+        // Results obtained from node -v 18.12.0.
+        private readonly string[] softDottedUpperCased = [
+            "I", "J", "Į", "Ɉ", "Ɨ", "Ʝ", "ʲ", "Ϳ", "І", "Ј",
+            "ᵢ", "ᶖ", "ᶤ", "ᶨ", "Ḭ", "Ị", "ⁱ", "ⅈ", "ⅉ", "ⱼ",
+            "𝐢", "𝐣", "𝑖", "𝑗", "𝒊", "𝒋", "𝒾", "𝒿", "𝓲", "𝓳",
+            "𝔦", "𝔧", "𝕚", "𝕛", "𝖎", "𝖏", "𝗂", "𝗃", "𝗶", "𝗷",
+            "𝘪", "𝘫", "𝙞", "𝙟", "𝚒", "𝚓",
+        ];
+
+        /// <summary>
+        /// Creates and adds the data to <see cref="fullSetOfData"/> that will be used for the tests. Six cases:
+        /// 1.- String with character at the beginning of the string.
+        /// 2.- String with double character at the beginning of the string.
+        /// 3.- String with character at the middle of the string.
+        /// 4.- String with double character at the middle of the string.
+        /// 5.- String with character at the end of the string.
+        /// 6.- String with double character at the end of the string.
+        /// </summary>
+        /// <param name="nonCapChar">Character sequence placed in the input string.</param>
+        /// <param name="toUpperChar">Sequence expected in its place after upper-casing.</param>
+        private void AddStringsForChars(string nonCapChar, string toUpperChar)
+        {
+            fullSetOfData.Add($"{nonCapChar}lorem ipsum", $"{toUpperChar}LOREM IPSUM");
+            fullSetOfData.Add($"{nonCapChar}{nonCapChar}lorem ipsum", $"{toUpperChar}{toUpperChar}LOREM IPSUM");
+            fullSetOfData.Add($"lorem{nonCapChar}ipsum", $"LOREM{toUpperChar}IPSUM");
+            fullSetOfData.Add($"lorem{nonCapChar}{nonCapChar}ipsum", $"LOREM{toUpperChar}{toUpperChar}IPSUM");
+            fullSetOfData.Add($"lorem ipsum{nonCapChar}", $"LOREM IPSUM{toUpperChar}");
+            fullSetOfData.Add($"lorem ipsum{nonCapChar}{nonCapChar}", $"LOREM IPSUM{toUpperChar}{toUpperChar}");
+        }
+
+        /// <summary>
+        /// Fills <see cref="fullSetOfData"/> with every case and returns it.
+        /// All the cases come from https://github.com/tc39/test262/blob/main/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js
+        /// </summary>
+        /// <remarks>
+        /// Rows are appended to an instance field, so calling this twice on the same instance adds every row twice.
+        /// </remarks>
+        /// <returns>The accumulated (input, expected) pairs.</returns>
+        public TheoryData<string, string> TestData()
+        {
+            // COMBINING DOT ABOVE (U+0307) not removed when uppercasing capital I
+            AddStringsForChars("İ", "İ");
+            // COMBINING DOT ABOVE (U+0307) not removed when uppercasing capital J
+            AddStringsForChars("J̇", "J̇");
+            for (int i = 0; i < softDotted.Length; i++)
+            {
+                // COMBINING DOT ABOVE (U+0307) removed when preceded by Soft_Dotted.
+                // Character directly preceded by Soft_Dotted.
+                AddStringsForChars(softDotted[i] + "\u0307", softDottedUpperCased[i]);
+
+                // COMBINING DOT ABOVE (U+0307) removed if preceded by Soft_Dotted.
+                // Character not directly preceded by Soft_Dotted.
+                // - COMBINING DOT BELOW (U+0323), combining class 220 (Below)
+                AddStringsForChars(softDotted[i] + "\u0323\u0307", softDottedUpperCased[i] + "\u0323");
+
+                // COMBINING DOT ABOVE removed if preceded by Soft_Dotted.
+                // Character not directly preceded by Soft_Dotted.
+                // - PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE (U+101FD = D800 DDFD), combining class 220 (Below)
+                AddStringsForChars(softDotted[i] + "\uD800\uDDFD\u0307", softDottedUpperCased[i] + "\uD800\uDDFD");
+            }
+
+            return fullSetOfData;
+        }
+    }
+}

+ 55 - 0
Jint/Native/String/StringInlHelper.cs

@@ -0,0 +1,55 @@
+using System.Text;
+
+namespace Jint.Native.String
+{
+    /// <summary>
+    /// Internationalization logic that is special or specific to a particular culture.
+    /// </summary>
+    internal class StringInlHelper
+    {
+        /// <summary>COMBINING DOT ABOVE, the only character this helper ever strips.</summary>
+        private const char CombiningDotAbove = '\u0307';
+
+        /// <summary>
+        /// Finds the COMBINING DOT ABOVE (U+0307) characters that have to be dropped before
+        /// upper-casing with Lithuanian rules.
+        /// </summary>
+        /// <param name="input">String to scan.</param>
+        /// <returns>
+        /// Ascending indexes of every U+0307 that follows some character other than capital 'I' or 'J'.
+        /// </returns>
+        private static List<int> GetLithuaninanReplaceableCharIdx(string input)
+        {
+            var replaceableCharsIdx = new List<int>();
+
+            // Start at 1: a dot above at index 0 follows no character at all, so it is kept.
+            // Starting here also guarantees that input[i - 1] is always in range.
+            for (var i = 1; i < input.Length; i++)
+            {
+                if (input[i] != CombiningDotAbove)
+                {
+                    continue;
+                }
+
+                // For capital I and J we do not remove the dot above (U+0307).
+                var previous = input[i - 1];
+                if (previous != 'I' && previous != 'J')
+                {
+                    replaceableCharsIdx.Add(i);
+                }
+            }
+
+            return replaceableCharsIdx;
+        }
+
+        /// <summary>
+        /// Lithuanian case is a bit special. For more info see:
+        /// https://github.com/tc39/test262/blob/main/test/intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian.js
+        /// </summary>
+        /// <param name="input">String that is about to be upper-cased with the Lithuanian culture.</param>
+        /// <returns>
+        /// <paramref name="input"/> without the removable COMBINING DOT ABOVE characters, or the same
+        /// instance when there is nothing to remove.
+        /// </returns>
+        public static string LithuanianStringProcessor(string input)
+        {
+            var replaceableCharsIdx = GetLithuaninanReplaceableCharIdx(input);
+            if (replaceableCharsIdx.Count == 0)
+            {
+                return input;
+            }
+
+            var stringBuilder = new StringBuilder(input);
+
+            // Remove characters in reverse order to avoid index shifting; because the indexes are
+            // ascending, each one is still valid when its turn comes.
+            for (var i = replaceableCharsIdx.Count - 1; i >= 0; i--)
+            {
+                stringBuilder.Remove(replaceableCharsIdx[i], 1);
+            }
+
+            return stringBuilder.ToString();
+        }
+    }
+}

+ 27 - 1
Jint/Native/String/StringPrototype.cs

@@ -1,5 +1,6 @@
 #pragma warning disable CA1859 // Use concrete types when possible for improved performance -- most of prototype methods return JsValue
 
+using System;
 using System.Globalization;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
@@ -231,7 +232,32 @@ namespace Jint.Native.String
         {
             TypeConverter.CheckObjectCoercible(_engine, thisObject);
             var s = TypeConverter.ToString(thisObject);
-            return new JsString(s.ToUpper(CultureInfo.InvariantCulture));
+            var culture = CultureInfo.InvariantCulture;
+            if (arguments.Length > 0 && arguments[0].IsString())
+            {
+                try
+                {
+                    var cultureArgument = arguments[0].ToString();
+                    culture = CultureInfo.GetCultureInfo(cultureArgument);
+                }
+                catch (CultureNotFoundException)
+                {
+                    ExceptionHelper.ThrowRangeError(_realm, "Incorrect culture information provided");
+                }
+            }
+            if (string.Equals("lt", culture.Name, StringComparison.OrdinalIgnoreCase))
+            {
+                s = StringInlHelper.LithuanianStringProcessor(s);
+#if NET462
+                // Code specific to .NET Framework 4.6.2.
+                // For no good reason this version does not upper case these characters correctly.
+                return new JsString(s.ToUpper(culture)
+                    .Replace("ϳ", "Ϳ")
+                    .Replace("ʝ", "Ʝ"));
+#endif
+            }
+
+            return new JsString(s.ToUpper(culture));
         }
 
         private JsValue ToUpperCase(JsValue thisObject, JsValue[] arguments)