Browse Source

Merge pull request #84289 from bruvzg/icu741

ICU4C: Update to version 74.1
Rémi Verschelde 1 năm trước
mục cha
commit
9b2686c333
65 tập tin đã thay đổi với 5828 bổ sung và 4251 xóa
  1. 3 1
      modules/text_server_adv/SCsub
  2. 3 1
      modules/text_server_adv/gdextension_build/SConstruct
  3. 3 3
      thirdparty/README.md
  4. 36 43
      thirdparty/icu4c/LICENSE
  5. 93 28
      thirdparty/icu4c/common/brkeng.cpp
  6. 53 6
      thirdparty/icu4c/common/brkeng.h
  7. 13 1
      thirdparty/icu4c/common/brkiter.cpp
  8. 9 1
      thirdparty/icu4c/common/characterproperties.cpp
  9. 2 2
      thirdparty/icu4c/common/dictbe.cpp
  10. 2 1
      thirdparty/icu4c/common/dictbe.h
  11. 29 0
      thirdparty/icu4c/common/loadednormalizer2impl.cpp
  12. 1099 1111
      thirdparty/icu4c/common/localefallback_data.h
  13. 1 1
      thirdparty/icu4c/common/localematcher.cpp
  14. 50 21
      thirdparty/icu4c/common/locid.cpp
  15. 91 663
      thirdparty/icu4c/common/loclikely.cpp
  16. 321 94
      thirdparty/icu4c/common/loclikelysubtags.cpp
  17. 18 10
      thirdparty/icu4c/common/loclikelysubtags.h
  18. 12 8
      thirdparty/icu4c/common/locmap.cpp
  19. 9 5
      thirdparty/icu4c/common/locresdata.cpp
  20. 20 0
      thirdparty/icu4c/common/lsr.cpp
  21. 3 0
      thirdparty/icu4c/common/lsr.h
  22. 1 1
      thirdparty/icu4c/common/norm2_nfc_data.h
  23. 1 0
      thirdparty/icu4c/common/norm2allmodes.h
  24. 2 1
      thirdparty/icu4c/common/normalizer2impl.h
  25. 999 991
      thirdparty/icu4c/common/propname_data.h
  26. 15 0
      thirdparty/icu4c/common/putil.cpp
  27. 34 17
      thirdparty/icu4c/common/rbbi.cpp
  28. 3 2
      thirdparty/icu4c/common/rbbi_cache.cpp
  29. 0 1
      thirdparty/icu4c/common/rbbirb.cpp
  30. 0 3
      thirdparty/icu4c/common/rbbirb.h
  31. 0 2
      thirdparty/icu4c/common/rbbiscan.cpp
  32. 0 15
      thirdparty/icu4c/common/rbbitblb.cpp
  33. 63 64
      thirdparty/icu4c/common/ubidi_props_data.h
  34. 3 40
      thirdparty/icu4c/common/ucase.cpp
  35. 62 63
      thirdparty/icu4c/common/ucase_props_data.h
  36. 10 5
      thirdparty/icu4c/common/ucasemap.cpp
  37. 2 1
      thirdparty/icu4c/common/ucasemap_imp.h
  38. 692 683
      thirdparty/icu4c/common/uchar_props_data.h
  39. 58 32
      thirdparty/icu4c/common/ucurr.cpp
  40. 1 1
      thirdparty/icu4c/common/udata.cpp
  41. 59 55
      thirdparty/icu4c/common/uloc.cpp
  42. 27 83
      thirdparty/icu4c/common/uloc_tag.cpp
  43. 99 0
      thirdparty/icu4c/common/ulocale.cpp
  44. 156 0
      thirdparty/icu4c/common/ulocbuilder.cpp
  45. 7 68
      thirdparty/icu4c/common/ulocimp.h
  46. 1 0
      thirdparty/icu4c/common/unicode/brkiter.h
  47. 1 1
      thirdparty/icu4c/common/unicode/docmain.h
  48. 13 1
      thirdparty/icu4c/common/unicode/locid.h
  49. 24 2
      thirdparty/icu4c/common/unicode/normalizer2.h
  50. 81 1
      thirdparty/icu4c/common/unicode/rbbi.h
  51. 40 4
      thirdparty/icu4c/common/unicode/uchar.h
  52. 229 0
      thirdparty/icu4c/common/unicode/ulocale.h
  53. 441 0
      thirdparty/icu4c/common/unicode/ulocbuilder.h
  54. 24 2
      thirdparty/icu4c/common/unicode/unorm2.h
  55. 57 4
      thirdparty/icu4c/common/unicode/urename.h
  56. 6 6
      thirdparty/icu4c/common/unicode/uvernum.h
  57. 32 10
      thirdparty/icu4c/common/uniquecharstr.h
  58. 63 0
      thirdparty/icu4c/common/uprops.cpp
  59. 2 0
      thirdparty/icu4c/common/uprops.h
  60. 172 53
      thirdparty/icu4c/common/uresbund.cpp
  61. 10 5
      thirdparty/icu4c/common/ustrcase.cpp
  62. 1 12
      thirdparty/icu4c/common/uts46.cpp
  63. 297 10
      thirdparty/icu4c/i18n/unicode/uspoof.h
  64. 170 12
      thirdparty/icu4c/i18n/uspoof.cpp
  65. BIN
      thirdparty/icu4c/icudt74l.dat

+ 3 - 1
modules/text_server_adv/SCsub

@@ -401,6 +401,8 @@ if env["builtin_icu4c"]:
         "common/uloc.cpp",
         "common/uloc_keytype.cpp",
         "common/uloc_tag.cpp",
+        "common/ulocale.cpp",
+        "common/ulocbuilder.cpp",
         "common/umapfile.cpp",
         "common/umath.cpp",
         "common/umutablecptrie.cpp",
@@ -466,7 +468,7 @@ if env["builtin_icu4c"]:
     ]
     thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
 
-    icu_data_name = "icudt73l.dat"
+    icu_data_name = "icudt74l.dat"
 
     if env.editor_build:
         env_icu.Depends("#thirdparty/icu4c/icudata.gen.h", "#thirdparty/icu4c/" + icu_data_name)

+ 3 - 1
modules/text_server_adv/gdextension_build/SConstruct

@@ -623,6 +623,8 @@ thirdparty_icu_sources = [
     "common/uloc.cpp",
     "common/uloc_keytype.cpp",
     "common/uloc_tag.cpp",
+    "common/ulocale.cpp",
+    "common/ulocbuilder.cpp",
     "common/umapfile.cpp",
     "common/umath.cpp",
     "common/umutablecptrie.cpp",
@@ -688,7 +690,7 @@ thirdparty_icu_sources = [
 ]
 thirdparty_icu_sources = [thirdparty_icu_dir + file for file in thirdparty_icu_sources]
 
-icu_data_name = "icudt73l.dat"
+icu_data_name = "icudt74l.dat"
 
 if env["static_icu_data"]:
     env_icu.Depends("../../../thirdparty/icu4c/icudata.gen.h", "../../../thirdparty/icu4c/" + icu_data_name)

+ 3 - 3
thirdparty/README.md

@@ -389,7 +389,7 @@ Files extracted from upstream source:
 ## icu4c
 
 - Upstream: https://github.com/unicode-org/icu
-- Version: 73.2 (680f521746a3bd6a86f25f25ee50a62d88b489cf, 2023)
+- Version: 74.1 (9edac7b78327a1cb58db29e2714b15f9fa14e4d7, 2023)
 - License: Unicode
 
 Files extracted from upstream source:
@@ -401,7 +401,7 @@ Files extracted from upstream source:
 
 Files generated from upstream source:
 
-- The `icudt73l.dat` built with the provided `godot_data.json` config file (see
+- The `icudt74l.dat` built with the provided `godot_data.json` config file (see
   https://github.com/unicode-org/icu/blob/master/docs/userguide/icu_data/buildtool.md
   for instructions).
 
@@ -411,7 +411,7 @@ Files generated from upstream source:
 3. Reconfigure ICU with custom data config:
    `ICU_DATA_FILTER_FILE={GODOT_SOURCE}/thirdparty/icu4c/godot_data.json ./runConfigureICU {PLATFORM} --with-data-packaging=common`
 4. Delete `data/out` folder and rebuild data: `cd data && rm -rf ./out && make`
-5. Copy `source/data/out/icudt73l.dat` to the `{GODOT_SOURCE}/thirdparty/icu4c/icudt73l.dat`
+5. Copy `source/data/out/icudt74l.dat` to the `{GODOT_SOURCE}/thirdparty/icu4c/icudt74l.dat`
 
 
 ## jpeg-compressor

+ 36 - 43
thirdparty/icu4c/LICENSE

@@ -1,49 +1,42 @@
-UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
-
-See Terms of Use <https://www.unicode.org/copyright.html>
-for definitions of Unicode Inc.’s Data Files and Software.
-
-NOTICE TO USER: Carefully read the following legal agreement.
-BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
-DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
-YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
-TERMS AND CONDITIONS OF THIS AGREEMENT.
-IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
-THE DATA FILES OR SOFTWARE.
+UNICODE LICENSE V3
 
 COPYRIGHT AND PERMISSION NOTICE
 
-Copyright © 1991-2023 Unicode, Inc. All rights reserved.
-Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of the Unicode data files and any associated documentation
-(the "Data Files") or Unicode software and any associated documentation
-(the "Software") to deal in the Data Files or Software
-without restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, and/or sell copies of
-the Data Files or Software, and to permit persons to whom the Data Files
-or Software are furnished to do so, provided that either
-(a) this copyright and permission notice appear with all copies
-of the Data Files or Software, or
-(b) this copyright and permission notice appear in associated
-Documentation.
-
-THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
-ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT OF THIRD PARTY RIGHTS.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
-NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
-DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-PERFORMANCE OF THE DATA FILES OR SOFTWARE.
-
-Except as contained in this notice, the name of a copyright holder
-shall not be used in advertising or otherwise to promote the sale,
-use or other dealings in these Data Files or Software without prior
-written authorization of the copyright holder.
+Copyright © 2016-2023 Unicode, Inc.
+
+NOTICE TO USER: Carefully read the following legal agreement. BY
+DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of data files and any associated documentation (the "Data Files") or
+software and any associated documentation (the "Software") to deal in the
+Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Data Files or Software, and to permit persons to whom the
+Data Files or Software are furnished to do so, provided that either (a)
+this copyright and permission notice appear with all copies of the Data
+Files or Software, or (b) this copyright and permission notice appear in
+associated Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS.
+
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
 
 ----------------------------------------------------------------------
 

+ 93 - 28
thirdparty/icu4c/common/brkeng.cpp

@@ -21,6 +21,7 @@
 #include "unicode/uscript.h"
 #include "unicode/ucharstrie.h"
 #include "unicode/bytestrie.h"
+#include "unicode/rbbi.h"
 
 #include "brkeng.h"
 #include "cmemory.h"
@@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
 }
 
 UBool
-UnhandledEngine::handles(UChar32 c) const {
+UnhandledEngine::handles(UChar32 c, const char* locale) const {
+    (void)locale; // Unused
     return fHandled && fHandled->contains(c);
 }
 
 int32_t
 UnhandledEngine::findBreaks( UText *text,
-                             int32_t /* startPos */,
+                             int32_t startPos,
                              int32_t endPos,
                              UVector32 &/*foundBreaks*/,
                              UBool /* isPhraseBreaking */,
                              UErrorCode &status) const {
     if (U_FAILURE(status)) return 0;
-    UChar32 c = utext_current32(text); 
+    utext_setNativeIndex(text, startPos);
+    UChar32 c = utext_current32(text);
     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
         c = utext_current32(text);
@@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
     }
 }
 
-U_NAMESPACE_END
-U_CDECL_BEGIN
-static void U_CALLCONV _deleteEngine(void *obj) {
-    delete (const icu::LanguageBreakEngine *) obj;
+void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
+    static UMutex gBreakEngineMutex;
+    Mutex m(&gBreakEngineMutex);
+    if (fEngines == nullptr) {
+        LocalPointer<UStack>  engines(new UStack(uprv_deleteUObject, nullptr, status), status);
+        if (U_SUCCESS(status)) {
+            fEngines = engines.orphan();
+        }
+    }
 }
-U_CDECL_END
-U_NAMESPACE_BEGIN
 
 const LanguageBreakEngine *
-ICULanguageBreakFactory::getEngineFor(UChar32 c) {
+ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
     const LanguageBreakEngine *lbe = nullptr;
     UErrorCode  status = U_ZERO_ERROR;
+    ensureEngines(status);
+    if (U_FAILURE(status) ) {
+        // Note: no way to return error code to caller.
+        return nullptr;
+    }
 
     static UMutex gBreakEngineMutex;
     Mutex m(&gBreakEngineMutex);
-
-    if (fEngines == nullptr) {
-        LocalPointer<UStack>  engines(new UStack(_deleteEngine, nullptr, status), status);
-        if (U_FAILURE(status) ) {
-            // Note: no way to return error code to caller.
-            return nullptr;
-        }
-        fEngines = engines.orphan();
-    } else {
-        int32_t i = fEngines->size();
-        while (--i >= 0) {
-            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
-            if (lbe != nullptr && lbe->handles(c)) {
-                return lbe;
-            }
+    int32_t i = fEngines->size();
+    while (--i >= 0) {
+        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
+        if (lbe != nullptr && lbe->handles(c, locale)) {
+            return lbe;
         }
     }
-    
+
     // We didn't find an engine. Create one.
-    lbe = loadEngineFor(c);
+    lbe = loadEngineFor(c, locale);
     if (lbe != nullptr) {
         fEngines->push((void *)lbe, status);
     }
@@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
 }
 
 const LanguageBreakEngine *
-ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
+ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
     UErrorCode status = U_ZERO_ERROR;
     UScriptCode code = uscript_getScript(c, &status);
     if (U_SUCCESS(status)) {
@@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
     return nullptr;
 }
 
+
+void ICULanguageBreakFactory::addExternalEngine(
+        ExternalBreakEngine* external, UErrorCode& status) {
+    LocalPointer<ExternalBreakEngine> engine(external, status);
+    ensureEngines(status);
+    LocalPointer<BreakEngineWrapper> wrapper(
+        new BreakEngineWrapper(engine.orphan(), status), status);
+    static UMutex gBreakEngineMutex;
+    Mutex m(&gBreakEngineMutex);
+    fEngines->push(wrapper.getAlias(), status);
+    wrapper.orphan();
+}
+
+BreakEngineWrapper::BreakEngineWrapper(
+    ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
+}
+
+BreakEngineWrapper::~BreakEngineWrapper() {
+}
+
+UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
+    return delegate->isFor(c, locale);
+}
+
+int32_t BreakEngineWrapper::findBreaks(
+    UText *text,
+    int32_t startPos,
+    int32_t endPos,
+    UVector32 &foundBreaks,
+    UBool /* isPhraseBreaking */,
+    UErrorCode &status) const {
+    if (U_FAILURE(status)) return 0;
+    int32_t result = 0;
+
+    // Find the span of characters included in the set.
+    //   The span to break begins at the current position in the text, and
+    //   extends towards the start or end of the text, depending on 'reverse'.
+
+    utext_setNativeIndex(text, startPos);
+    int32_t start = (int32_t)utext_getNativeIndex(text);
+    int32_t current;
+    int32_t rangeStart;
+    int32_t rangeEnd;
+    UChar32 c = utext_current32(text);
+    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
+        utext_next32(text);         // TODO:  recast loop for postincrement
+        c = utext_current32(text);
+    }
+    rangeStart = start;
+    rangeEnd = current;
+    int32_t beforeSize = foundBreaks.size();
+    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
+    // enlarge to contains (rangeEnd-rangeStart+1) more items
+    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
+    if (U_FAILURE(status)) return 0;
+    foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity);
+    result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
+                                  additionalCapacity, status);
+    if (U_FAILURE(status)) return 0;
+    foundBreaks.setSize(beforeSize + result);
+    utext_setNativeIndex(text, current);
+    return result;
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

+ 53 - 6
thirdparty/icu4c/common/brkeng.h

@@ -10,6 +10,7 @@
 #ifndef BRKENG_H
 #define BRKENG_H
 
+#include "unicode/umisc.h"
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/utext.h"
@@ -21,6 +22,7 @@ class UnicodeSet;
 class UStack;
 class UVector32;
 class DictionaryMatcher;
+class ExternalBreakEngine;
 
 /*******************************************************************
  * LanguageBreakEngine
@@ -35,7 +37,7 @@ class DictionaryMatcher;
  * <p>LanguageBreakEngines should normally be implemented so as to
  * be shared between threads without locking.</p>
  */
-class LanguageBreakEngine : public UMemory {
+class LanguageBreakEngine : public UObject {
  public:
 
   /**
@@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
   * a particular kind of break.</p>
   *
   * @param c A character which begins a run that the engine might handle
+  * @param locale The locale.
   * @return true if this engine handles the particular character and break
   * type.
   */
-  virtual UBool handles(UChar32 c) const = 0;
+  virtual UBool handles(UChar32 c, const char* locale) const = 0;
 
  /**
   * <p>Find any breaks within a run in the supplied text.</p>
@@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
 
 };
 
+/*******************************************************************
+ * BreakEngineWrapper
+ */
+
+/**
+ * <p>BreakEngineWrapper implement LanguageBreakEngine by
+ * a thin wrapper that delegate the task to ExternalBreakEngine
+ * </p>
+ */
+class BreakEngineWrapper : public  LanguageBreakEngine {
+ public:
+
+  BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
+
+  virtual ~BreakEngineWrapper();
+
+  virtual UBool handles(UChar32 c, const char* locale) const override;
+
+  virtual int32_t findBreaks( UText *text,
+                              int32_t startPos,
+                              int32_t endPos,
+                              UVector32 &foundBreaks,
+                              UBool isPhraseBreaking,
+                              UErrorCode &status) const override;
+
+ private:
+  LocalPointer<ExternalBreakEngine> delegate;
+};
+
 /*******************************************************************
  * LanguageBreakFactory
  */
@@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
+  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
 
 };
 
@@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
   * a particular kind of break.</p>
   *
   * @param c A character which begins a run that the engine might handle
+  * @param locale The locale.
   * @return true if this engine handles the particular character and break
   * type.
   */
-  virtual UBool handles(UChar32 c) const override;
+  virtual UBool handles(UChar32 c, const char* locale) const override;
 
  /**
   * <p>Find any breaks within a run in the supplied text.</p>
@@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
+  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
+
+  /**
+   * Add and adopt the engine and return an URegistryKey.
+   * @param engine The ExternalBreakEngine to be added and adopt. The caller
+   *     pass the ownership and should not release the memory after this.
+   * @param status the error code.
+   */
+  virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
 
 protected:
  /**
@@ -258,9 +301,10 @@ protected:
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
+  virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
 
   /**
    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
@@ -269,6 +313,9 @@ protected:
    * @return A DictionaryMatcher with the desired characteristics, or nullptr.
    */
   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
+
+ private:
+  void ensureEngines(UErrorCode& status);
 };
 
 U_NAMESPACE_END

+ 13 - 1
thirdparty/icu4c/common/brkiter.cpp

@@ -27,6 +27,7 @@
 #include "unicode/rbbi.h"
 #include "unicode/brkiter.h"
 #include "unicode/udata.h"
+#include "unicode/uloc.h"
 #include "unicode/ures.h"
 #include "unicode/ustring.h"
 #include "unicode/filteredbrk.h"
@@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != nullptr) {
         U_LOCALE_BASED(locBased, *(BreakIterator*)result);
+
         locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), 
                               actualLocale.data());
+        uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
+        result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
     }
 
     ures_close(b);
@@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
 
 BreakIterator::BreakIterator()
 {
-    *validLocale = *actualLocale = 0;
+    *validLocale = *actualLocale = *requestLocale = 0;
 }
 
 BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
     uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
     uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
 }
 
 BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
     if (this != &other) {
         uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
         uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+        uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
     }
     return *this;
 }
@@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
 
 Locale
 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
+    if (type == ULOC_REQUESTED_LOCALE) {
+        return Locale(requestLocale);
+    }
     U_LOCALE_BASED(locBased, *this);
     return locBased.getLocale(type, status);
 }
 
 const char *
 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
+    if (type == ULOC_REQUESTED_LOCALE) {
+        return requestLocale;
+    }
     U_LOCALE_BASED(locBased, *this);
     return locBased.getLocaleID(type, status);
 }

+ 9 - 1
thirdparty/icu4c/common/characterproperties.cpp

@@ -169,7 +169,7 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
     case UPROPS_SRC_INPC:
     case UPROPS_SRC_INSC:
     case UPROPS_SRC_VO:
-        uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
+        uprops_addPropertyStarts(src, &sa, &errorCode);
         break;
     case UPROPS_SRC_EMOJI: {
         const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
@@ -178,6 +178,14 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
         }
         break;
     }
+    case UPROPS_SRC_IDSU:
+        // New in Unicode 15.1 for just two characters.
+        sa.add(sa.set, 0x2FFE);
+        sa.add(sa.set, 0x2FFF + 1);
+        break;
+    case UPROPS_SRC_ID_COMPAT_MATH:
+        uprops_addPropertyStarts(src, &sa, &errorCode);
+        break;
     default:
         errorCode = U_INTERNAL_PROGRAM_ERROR;
         break;

+ 2 - 2
thirdparty/icu4c/common/dictbe.cpp

@@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
 }
 
 UBool
-DictionaryBreakEngine::handles(UChar32 c) const {
+DictionaryBreakEngine::handles(UChar32 c, const char*) const {
     return fSet.contains(c);
 }
 
@@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
                                  UBool isPhraseBreaking,
                                  UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
-    (void)startPos;            // TODO: remove this param?
     int32_t result = 0;
 
     // Find the span of characters included in the set.
     //   The span to break begins at the current position in the text, and
     //   extends towards the start or end of the text, depending on 'reverse'.
 
+    utext_setNativeIndex(text, startPos);
     int32_t start = (int32_t)utext_getNativeIndex(text);
     int32_t current;
     int32_t rangeStart;

+ 2 - 1
thirdparty/icu4c/common/dictbe.h

@@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
    * a particular kind of break.</p>
    *
    * @param c A character which begins a run that the engine might handle
+   * @param locale The locale.
    * @return true if this engine handles the particular character and break
    * type.
    */
-  virtual UBool handles(UChar32 c) const override;
+  virtual UBool handles(UChar32 c, const char* locale) const override;
 
   /**
    * <p>Find any breaks within a run in the supplied text.</p>

+ 29 - 0
thirdparty/icu4c/common/loadednormalizer2impl.cpp

@@ -143,6 +143,9 @@ static icu::UInitOnce nfkcInitOnce {};
 static Norm2AllModes *nfkc_cfSingleton;
 static icu::UInitOnce nfkc_cfInitOnce {};
 
+static Norm2AllModes *nfkc_scfSingleton;
+static icu::UInitOnce nfkc_scfInitOnce {};
+
 static UHashtable    *cache=nullptr;
 
 // UInitOnce singleton initialization function
@@ -156,6 +159,8 @@ static void U_CALLCONV initSingletons(const char *what, UErrorCode &errorCode) {
         nfkcSingleton    = Norm2AllModes::createInstance(nullptr, "nfkc", errorCode);
     } else if (uprv_strcmp(what, "nfkc_cf") == 0) {
         nfkc_cfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_cf", errorCode);
+    } else if (uprv_strcmp(what, "nfkc_scf") == 0) {
+        nfkc_scfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_scf", errorCode);
     } else {
         UPRV_UNREACHABLE_EXIT;   // Unknown singleton
     }
@@ -183,6 +188,10 @@ static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup() {
     nfkc_cfSingleton = nullptr;
     nfkc_cfInitOnce.reset();
 
+    delete nfkc_scfSingleton;
+    nfkc_scfSingleton = nullptr;
+    nfkc_scfInitOnce.reset();
+
     uhash_close(cache);
     cache=nullptr;
     return true;
@@ -213,6 +222,13 @@ Norm2AllModes::getNFKC_CFInstance(UErrorCode &errorCode) {
     return nfkc_cfSingleton;
 }
 
+const Norm2AllModes *
+Norm2AllModes::getNFKC_SCFInstance(UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return nullptr; }
+    umtx_initOnce(nfkc_scfInitOnce, &initSingletons, "nfkc_scf", errorCode);
+    return nfkc_scfSingleton;
+}
+
 #if !NORM2_HARDCODE_NFC_DATA
 const Normalizer2 *
 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
@@ -261,6 +277,12 @@ Normalizer2::getNFKCCasefoldInstance(UErrorCode &errorCode) {
     return allModes!=nullptr ? &allModes->comp : nullptr;
 }
 
+const Normalizer2 *
+Normalizer2::getNFKCSimpleCasefoldInstance(UErrorCode &errorCode) {
+    const Norm2AllModes *allModes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
+    return allModes!=nullptr ? &allModes->comp : nullptr;
+}
+
 const Normalizer2 *
 Normalizer2::getInstance(const char *packageName,
                          const char *name,
@@ -281,6 +303,8 @@ Normalizer2::getInstance(const char *packageName,
             allModes=Norm2AllModes::getNFKCInstance(errorCode);
         } else if(0==uprv_strcmp(name, "nfkc_cf")) {
             allModes=Norm2AllModes::getNFKC_CFInstance(errorCode);
+        } else if(0==uprv_strcmp(name, "nfkc_scf")) {
+            allModes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
         }
     }
     if(allModes==nullptr && U_SUCCESS(errorCode)) {
@@ -393,6 +417,11 @@ unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) {
     return (const UNormalizer2 *)Normalizer2::getNFKCCasefoldInstance(*pErrorCode);
 }
 
+U_CAPI const UNormalizer2 * U_EXPORT2
+unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode) {
+    return (const UNormalizer2 *)Normalizer2::getNFKCSimpleCasefoldInstance(*pErrorCode);
+}
+
 U_CAPI const UNormalizer2 * U_EXPORT2
 unorm2_getInstance(const char *packageName,
                    const char *name,

Những thay đổi đã bị ẩn vì chúng quá lớn
+ 1099 - 1111
thirdparty/icu4c/common/localefallback_data.h


+ 1 - 1
thirdparty/icu4c/common/localematcher.cpp

@@ -307,7 +307,7 @@ LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale
     if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) {
         return UND_LSR;
     } else {
-        return likelySubtags.makeMaximizedLsrFrom(locale, errorCode);
+        return likelySubtags.makeMaximizedLsrFrom(locale, false, errorCode);
     }
 }
 

+ 50 - 21
thirdparty/icu4c/common/locid.cpp

@@ -563,7 +563,7 @@ private:
                    LocalMemory<int32_t>& replacementIndexes,
                    int32_t &length,
                    void (*checkType)(const char* type),
-                   void (*checkReplacement)(const UnicodeString& replacement),
+                   void (*checkReplacement)(const UChar* replacement),
                    UErrorCode &status);
 
     // Read the languageAlias data from alias to
@@ -700,7 +700,7 @@ AliasDataBuilder::readAlias(
         LocalMemory<int32_t>& replacementIndexes,
         int32_t &length,
         void (*checkType)(const char* type),
-        void (*checkReplacement)(const UnicodeString& replacement),
+        void (*checkReplacement)(const UChar* replacement),
         UErrorCode &status) {
     if (U_FAILURE(status)) {
         return;
@@ -720,8 +720,8 @@ AliasDataBuilder::readAlias(
         LocalUResourceBundlePointer res(
             ures_getNextResource(alias, nullptr, &status));
         const char* aliasFrom = ures_getKey(res.getAlias());
-        UnicodeString aliasTo =
-            ures_getUnicodeStringByKey(res.getAlias(), "replacement", &status);
+        const UChar* aliasTo =
+            ures_getStringByKey(res.getAlias(), "replacement", nullptr, &status);
         if (U_FAILURE(status)) return;
 
         checkType(aliasFrom);
@@ -766,7 +766,7 @@ AliasDataBuilder::readLanguageAlias(
 #else
         [](const char*) {},
 #endif
-        [](const UnicodeString&) {}, status);
+        [](const UChar*) {}, status);
 }
 
 /**
@@ -790,12 +790,12 @@ AliasDataBuilder::readScriptAlias(
         [](const char* type) {
             U_ASSERT(uprv_strlen(type) == 4);
         },
-        [](const UnicodeString& replacement) {
-            U_ASSERT(replacement.length() == 4);
+        [](const UChar* replacement) {
+            U_ASSERT(u_strlen(replacement) == 4);
         },
 #else
         [](const char*) {},
-        [](const UnicodeString&) { },
+        [](const UChar*) { },
 #endif
         status);
 }
@@ -824,7 +824,7 @@ AliasDataBuilder::readTerritoryAlias(
 #else
         [](const char*) {},
 #endif
-        [](const UnicodeString&) { },
+        [](const UChar*) { },
         status);
 }
 
@@ -851,15 +851,16 @@ AliasDataBuilder::readVariantAlias(
             U_ASSERT(uprv_strlen(type) != 4 ||
                      (type[0] >= '0' && type[0] <= '9'));
         },
-        [](const UnicodeString& replacement) {
-            U_ASSERT(replacement.length() >= 4 && replacement.length() <= 8);
-            U_ASSERT(replacement.length() != 4 ||
-                     (replacement.charAt(0) >= u'0' &&
-                      replacement.charAt(0) <= u'9'));
+        [](const UChar* replacement) {
+            int32_t len = u_strlen(replacement);
+            U_ASSERT(len >= 4 && len <= 8);
+            U_ASSERT(len != 4 ||
+                     (*replacement >= u'0' &&
+                      *replacement <= u'9'));
         },
 #else
         [](const char*) {},
-        [](const UnicodeString&) { },
+        [](const UChar*) { },
 #endif
         status);
 }
@@ -888,7 +889,7 @@ AliasDataBuilder::readSubdivisionAlias(
 #else
         [](const char*) {},
 #endif
-        [](const UnicodeString&) { },
+        [](const UChar*) { },
         status);
 }
 
@@ -1066,7 +1067,13 @@ class AliasReplacer {
 public:
     AliasReplacer(UErrorCode status) :
             language(nullptr), script(nullptr), region(nullptr),
-            extensions(nullptr), variants(status),
+            extensions(nullptr),
+            // store value in variants only once
+            variants(nullptr,
+                     ([](UElement e1, UElement e2) -> UBool {
+                       return 0==uprv_strcmp((const char*)e1.pointer,
+                                             (const char*)e2.pointer);}),
+                     status),
             data(nullptr) {
     }
     ~AliasReplacer() {
@@ -1652,10 +1659,16 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
         while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
                U_SUCCESS(status)) {
             *end = NULL_CHAR;  // null terminate inside variantsBuff
-            variants.addElement(start, status);
+            // do not add "" or duplicate data to variants
+            if (*start && !variants.contains(start)) {
+                variants.addElement(start, status);
+            }
             start = end + 1;
         }
-        variants.addElement(start, status);
+        // do not add "" or duplicate data to variants
+        if (*start && !variants.contains(start)) {
+            variants.addElement(start, status);
+        }
     }
     if (U_FAILURE(status)) { return false; }
 
@@ -2079,6 +2092,10 @@ Locale::addLikelySubtags(UErrorCode& status) {
 
 void
 Locale::minimizeSubtags(UErrorCode& status) {
+    Locale::minimizeSubtags(false, status);
+}
+void
+Locale::minimizeSubtags(bool favorScript, UErrorCode& status) {
     if (U_FAILURE(status)) {
         return;
     }
@@ -2086,7 +2103,7 @@ Locale::minimizeSubtags(UErrorCode& status) {
     CharString minimizedLocaleID;
     {
         CharStringByteSink sink(&minimizedLocaleID);
-        ulocimp_minimizeSubtags(fullName, sink, &status);
+        ulocimp_minimizeSubtags(fullName, sink, favorScript, &status);
     }
 
     if (U_FAILURE(status)) {
@@ -2402,8 +2419,9 @@ Locale::getLocaleCache()
 }
 
 class KeywordEnumeration : public StringEnumeration {
-private:
+protected:
     char *keywords;
+private:
     char *current;
     int32_t length;
     UnicodeString currUSKey;
@@ -2510,6 +2528,17 @@ public:
         if (resultLength != nullptr) *resultLength = 0;
         return nullptr;
     }
+    virtual int32_t count(UErrorCode &/*status*/) const override {
+        char *kw = keywords;
+        int32_t result = 0;
+        while(*kw) {
+            if (uloc_toUnicodeLocaleKey(kw) != nullptr) {
+                result++;
+            }
+            kw += uprv_strlen(kw)+1;
+        }
+        return result;
+    }
 };
 
 // Out-of-line virtual destructor to serve as the "key function".

+ 91 - 663
thirdparty/icu4c/common/loclikely.cpp

@@ -31,82 +31,10 @@
 #include "charstr.h"
 #include "cmemory.h"
 #include "cstring.h"
+#include "loclikelysubtags.h"
 #include "ulocimp.h"
 #include "ustr_imp.h"
 
-/**
- * These are the canonical strings for unknown languages, scripts and regions.
- **/
-static const char* const unknownLanguage = "und";
-static const char* const unknownScript = "Zzzz";
-static const char* const unknownRegion = "ZZ";
-
-/**
- * This function looks for the localeID in the likelySubtags resource.
- *
- * @param localeID The tag to find.
- * @param buffer A buffer to hold the matching entry
- * @param bufferLength The length of the output buffer
- * @return A pointer to "buffer" if found, or a null pointer if not.
- */
-static const char*  U_CALLCONV
-findLikelySubtags(const char* localeID,
-                  char* buffer,
-                  int32_t bufferLength,
-                  UErrorCode* err) {
-    const char* result = nullptr;
-
-    if (!U_FAILURE(*err)) {
-        int32_t resLen = 0;
-        const char16_t* s = nullptr;
-        UErrorCode tmpErr = U_ZERO_ERROR;
-        icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr));
-        if (U_SUCCESS(tmpErr)) {
-            icu::CharString und;
-            if (localeID != nullptr) {
-                if (*localeID == '\0') {
-                    localeID = unknownLanguage;
-                } else if (*localeID == '_') {
-                    und.append(unknownLanguage, *err);
-                    und.append(localeID, *err);
-                    if (U_FAILURE(*err)) {
-                        return nullptr;
-                    }
-                    localeID = und.data();
-                }
-            }
-            s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
-
-            if (U_FAILURE(tmpErr)) {
-                /*
-                 * If a resource is missing, it's not really an error, it's
-                 * just that we don't have any data for that particular locale ID.
-                 */
-                if (tmpErr != U_MISSING_RESOURCE_ERROR) {
-                    *err = tmpErr;
-                }
-            }
-            else if (resLen >= bufferLength) {
-                /* The buffer should never overflow. */
-                *err = U_INTERNAL_PROGRAM_ERROR;
-            }
-            else {
-                u_UCharsToChars(s, buffer, resLen + 1);
-                if (resLen >= 3 &&
-                    uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
-                    (resLen == 3 || buffer[3] == '_')) {
-                    uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
-                }
-                result = buffer;
-            }
-        } else {
-            *err = tmpErr;
-        }
-    }
-
-    return result;
-}
-
 /**
  * Append a tag to a buffer, adding the separator if necessary.  The buffer
  * must be large enough to contain the resulting tag plus any separator
@@ -360,57 +288,6 @@ error:
     }
 }
 
-/**
- * Create a tag string from the supplied parameters.  The lang, script and region
- * parameters may be nullptr pointers. If they are, their corresponding length parameters
- * must be less than or equal to 0.  If the lang parameter is an empty string, the
- * default value for an unknown language is written to the output buffer.
- *
- * If the length of the new string exceeds the capacity of the output buffer, 
- * the function copies as many bytes to the output buffer as it can, and returns
- * the error U_BUFFER_OVERFLOW_ERROR.
- *
- * If an illegal argument is provided, the function returns the error
- * U_ILLEGAL_ARGUMENT_ERROR.
- *
- * @param lang The language tag to use.
- * @param langLength The length of the language tag.
- * @param script The script tag to use.
- * @param scriptLength The length of the script tag.
- * @param region The region tag to use.
- * @param regionLength The length of the region tag.
- * @param trailing Any trailing data to append to the new tag.
- * @param trailingLength The length of the trailing data.
- * @param sink The output sink receiving the tag string.
- * @param err A pointer to a UErrorCode for error reporting.
- **/
-static void U_CALLCONV
-createTagString(
-    const char* lang,
-    int32_t langLength,
-    const char* script,
-    int32_t scriptLength,
-    const char* region,
-    int32_t regionLength,
-    const char* trailing,
-    int32_t trailingLength,
-    icu::ByteSink& sink,
-    UErrorCode* err)
-{
-    createTagStringWithAlternates(
-                lang,
-                langLength,
-                script,
-                scriptLength,
-                region,
-                regionLength,
-                trailing,
-                trailingLength,
-                nullptr,
-                sink,
-                err);
-}
-
 /**
  * Parse the language, script, and region subtags from a tag string, and copy the
  * results into the corresponding output parameters. The buffers are null-terminated,
@@ -494,13 +371,6 @@ parseTagString(
     *scriptLength = subtagLength;
 
     if (*scriptLength > 0) {
-        if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
-            /**
-             * If the script part is the "unknown" script, then don't return it.
-             **/
-            *scriptLength = 0;
-        }
-
         /*
          * Move past any separator.
          */
@@ -517,14 +387,7 @@ parseTagString(
 
     *regionLength = subtagLength;
 
-    if (*regionLength > 0) {
-        if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
-            /**
-             * If the region part is the "unknown" region, then don't return it.
-             **/
-            *regionLength = 0;
-        }
-    } else if (*position != 0 && *position != '@') {
+    if (*regionLength <= 0 && *position != 0 && *position != '@') {
         /* back up over consumed trailing separator */
         --position;
     }
@@ -546,264 +409,6 @@ error:
     goto exit;
 }
 
-static UBool U_CALLCONV
-createLikelySubtagsString(
-    const char* lang,
-    int32_t langLength,
-    const char* script,
-    int32_t scriptLength,
-    const char* region,
-    int32_t regionLength,
-    const char* variants,
-    int32_t variantsLength,
-    icu::ByteSink& sink,
-    UErrorCode* err) {
-    /**
-     * ULOC_FULLNAME_CAPACITY will provide enough capacity
-     * that we can build a string that contains the language,
-     * script and region code without worrying about overrunning
-     * the user-supplied buffer.
-     **/
-    char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
-
-    if(U_FAILURE(*err)) {
-        goto error;
-    }
-
-    /**
-     * Try the language with the script and region first.
-     **/
-    if (scriptLength > 0 && regionLength > 0) {
-
-        const char* likelySubtags = nullptr;
-
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink sink(&tagBuffer);
-            createTagString(
-                lang,
-                langLength,
-                script,
-                scriptLength,
-                region,
-                regionLength,
-                nullptr,
-                0,
-                sink,
-                err);
-        }
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        likelySubtags =
-            findLikelySubtags(
-                tagBuffer.data(),
-                likelySubtagsBuffer,
-                sizeof(likelySubtagsBuffer),
-                err);
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        if (likelySubtags != nullptr) {
-            /* Always use the language tag from the
-               maximal string, since it may be more
-               specific than the one provided. */
-            createTagStringWithAlternates(
-                        nullptr,
-                        0,
-                        nullptr,
-                        0,
-                        nullptr,
-                        0,
-                        variants,
-                        variantsLength,
-                        likelySubtags,
-                        sink,
-                        err);
-            return true;
-        }
-    }
-
-    /**
-     * Try the language with just the script.
-     **/
-    if (scriptLength > 0) {
-
-        const char* likelySubtags = nullptr;
-
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink sink(&tagBuffer);
-            createTagString(
-                lang,
-                langLength,
-                script,
-                scriptLength,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                sink,
-                err);
-        }
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        likelySubtags =
-            findLikelySubtags(
-                tagBuffer.data(),
-                likelySubtagsBuffer,
-                sizeof(likelySubtagsBuffer),
-                err);
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        if (likelySubtags != nullptr) {
-            /* Always use the language tag from the
-               maximal string, since it may be more
-               specific than the one provided. */
-            createTagStringWithAlternates(
-                        nullptr,
-                        0,
-                        nullptr,
-                        0,
-                        region,
-                        regionLength,
-                        variants,
-                        variantsLength,
-                        likelySubtags,
-                        sink,
-                        err);
-            return true;
-        }
-    }
-
-    /**
-     * Try the language with just the region.
-     **/
-    if (regionLength > 0) {
-
-        const char* likelySubtags = nullptr;
-
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink sink(&tagBuffer);
-            createTagString(
-                lang,
-                langLength,
-                nullptr,
-                0,
-                region,
-                regionLength,
-                nullptr,
-                0,
-                sink,
-                err);
-        }
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        likelySubtags =
-            findLikelySubtags(
-                tagBuffer.data(),
-                likelySubtagsBuffer,
-                sizeof(likelySubtagsBuffer),
-                err);
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        if (likelySubtags != nullptr) {
-            /* Always use the language tag from the
-               maximal string, since it may be more
-               specific than the one provided. */
-            createTagStringWithAlternates(
-                        nullptr,
-                        0,
-                        script,
-                        scriptLength,
-                        nullptr,
-                        0,
-                        variants,
-                        variantsLength,
-                        likelySubtags,
-                        sink,
-                        err);
-            return true;
-        }
-    }
-
-    /**
-     * Finally, try just the language.
-     **/
-    {
-        const char* likelySubtags = nullptr;
-
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink sink(&tagBuffer);
-            createTagString(
-                lang,
-                langLength,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                sink,
-                err);
-        }
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        likelySubtags =
-            findLikelySubtags(
-                tagBuffer.data(),
-                likelySubtagsBuffer,
-                sizeof(likelySubtagsBuffer),
-                err);
-        if(U_FAILURE(*err)) {
-            goto error;
-        }
-
-        if (likelySubtags != nullptr) {
-            /* Always use the language tag from the
-               maximal string, since it may be more
-               specific than the one provided. */
-            createTagStringWithAlternates(
-                        nullptr,
-                        0,
-                        script,
-                        scriptLength,
-                        region,
-                        regionLength,
-                        variants,
-                        variantsLength,
-                        likelySubtags,
-                        sink,
-                        err);
-            return true;
-        }
-    }
-
-    return false;
-
-error:
-
-    if (!U_FAILURE(*err)) {
-        *err = U_ILLEGAL_ARGUMENT_ERROR;
-    }
-
-    return false;
-}
-
 #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
     int32_t count = 0; \
     int32_t i; \
@@ -836,7 +441,6 @@ _uloc_addLikelySubtags(const char* localeID,
     const char* trailing = "";
     int32_t trailingLength = 0;
     int32_t trailingIndex = 0;
-    UBool success = false;
 
     if(U_FAILURE(*err)) {
         goto error;
@@ -862,6 +466,9 @@ _uloc_addLikelySubtags(const char* localeID,
 
         goto error;
     }
+    if (langLength > 3) {
+        goto error;
+    }
 
     /* Find the length of the trailing portion. */
     while (_isIDSeparator(localeID[trailingIndex])) {
@@ -871,30 +478,42 @@ _uloc_addLikelySubtags(const char* localeID,
     trailingLength = (int32_t)uprv_strlen(trailing);
 
     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
-
-    success =
-        createLikelySubtagsString(
-            lang,
-            langLength,
-            script,
-            scriptLength,
-            region,
-            regionLength,
+    {
+        const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
+        if(U_FAILURE(*err)) {
+            goto error;
+        }
+        // We need to keep l on the stack because lsr may point into internal
+        // memory of l.
+        icu::Locale l = icu::Locale::createFromName(localeID);
+        if (l.isBogus()) {
+            goto error;
+        }
+        icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, *err);
+        if(U_FAILURE(*err)) {
+            goto error;
+        }
+        const char* language = lsr.language;
+        if (uprv_strcmp(language, "und") == 0) {
+            language = "";
+        }
+        createTagStringWithAlternates(
+            language,
+            (int32_t)uprv_strlen(language),
+            lsr.script,
+            (int32_t)uprv_strlen(lsr.script),
+            lsr.region,
+            (int32_t)uprv_strlen(lsr.region),
             trailing,
             trailingLength,
+            nullptr,
             sink,
             err);
-
-    if (!success) {
-        const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
-
-        /*
-         * If we get here, we need to return localeID.
-         */
-        sink.Append(localeID, localIDLength);
+        if(U_FAILURE(*err)) {
+            goto error;
+        }
     }
-
-    return success;
+    return true;
 
 error:
 
@@ -913,6 +532,7 @@ static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*)
 static void
 _uloc_minimizeSubtags(const char* localeID,
                       icu::ByteSink& sink,
+                      bool favorScript,
                       UErrorCode* err) {
     icu::CharString maximizedTagBuffer;
 
@@ -925,7 +545,6 @@ _uloc_minimizeSubtags(const char* localeID,
     const char* trailing = "";
     int32_t trailingLength = 0;
     int32_t trailingIndex = 0;
-    UBool successGetMax = false;
 
     if(U_FAILURE(*err)) {
         goto error;
@@ -964,213 +583,38 @@ _uloc_minimizeSubtags(const char* localeID,
     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
 
     {
-        icu::CharString base;
-        {
-            icu::CharStringByteSink baseSink(&base);
-            createTagString(
-                lang,
-                langLength,
-                script,
-                scriptLength,
-                region,
-                regionLength,
-                nullptr,
-                0,
-                baseSink,
-                err);
-        }
-
-        /**
-         * First, we need to first get the maximization
-         * from AddLikelySubtags.
-         **/
-        {
-            icu::CharStringByteSink maxSink(&maximizedTagBuffer);
-            successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
-        }
-    }
-
-    if(U_FAILURE(*err)) {
-        goto error;
-    }
-
-    if (!successGetMax) {
-        /**
-         * If we got here, return the locale ID parameter unchanged.
-         **/
-        const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
-        sink.Append(localeID, localeIDLength);
-        return;
-    }
-
-    // In the following, the lang, script, region are referring to those in
-    // the maximizedTagBuffer, not the one in the localeID.
-    langLength = sizeof(lang);
-    scriptLength = sizeof(script);
-    regionLength = sizeof(region);
-    parseTagString(
-        maximizedTagBuffer.data(),
-        lang,
-        &langLength,
-        script,
-        &scriptLength,
-        region,
-        &regionLength,
-        err);
-    if(U_FAILURE(*err)) {
-        goto error;
-    }
-
-    /**
-     * Start first with just the language.
-     **/
-    {
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink tagSink(&tagBuffer);
-            createLikelySubtagsString(
-                lang,
-                langLength,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                tagSink,
-                err);
-        }
-
+        const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
         if(U_FAILURE(*err)) {
             goto error;
         }
-        else if (!tagBuffer.isEmpty() &&
-                 uprv_strnicmp(
-                    maximizedTagBuffer.data(),
-                    tagBuffer.data(),
-                    tagBuffer.length()) == 0) {
-
-            createTagString(
-                        lang,
-                        langLength,
-                        nullptr,
-                        0,
-                        nullptr,
-                        0,
-                        trailing,
-                        trailingLength,
-                        sink,
-                        err);
-            return;
-        }
-    }
-
-    /**
-     * Next, try the language and region.
-     **/
-    if (regionLength > 0) {
-
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink tagSink(&tagBuffer);
-            createLikelySubtagsString(
-                lang,
-                langLength,
-                nullptr,
-                0,
-                region,
-                regionLength,
-                nullptr,
-                0,
-                tagSink,
-                err);
-        }
-
+        icu::LSR lsr = likelySubtags->minimizeSubtags(
+            {lang, langLength},
+            {script, scriptLength},
+            {region, regionLength},
+            favorScript,
+            *err);
         if(U_FAILURE(*err)) {
             goto error;
         }
-        else if (!tagBuffer.isEmpty() &&
-                 uprv_strnicmp(
-                    maximizedTagBuffer.data(),
-                    tagBuffer.data(),
-                    tagBuffer.length()) == 0) {
-
-            createTagString(
-                        lang,
-                        langLength,
-                        nullptr,
-                        0,
-                        region,
-                        regionLength,
-                        trailing,
-                        trailingLength,
-                        sink,
-                        err);
-            return;
-        }
-    }
-
-    /**
-     * Finally, try the language and script.  This is our last chance,
-     * since trying with all three subtags would only yield the
-     * maximal version that we already have.
-     **/
-    if (scriptLength > 0) {
-        icu::CharString tagBuffer;
-        {
-            icu::CharStringByteSink tagSink(&tagBuffer);
-            createLikelySubtagsString(
-                lang,
-                langLength,
-                script,
-                scriptLength,
-                nullptr,
-                0,
-                nullptr,
-                0,
-                tagSink,
-                err);
-        }
-
+        const char* language = lsr.language;
+        if (uprv_strcmp(language, "und") == 0) {
+            language = "";
+        }
+        createTagStringWithAlternates(
+            language,
+            (int32_t)uprv_strlen(language),
+            lsr.script,
+            (int32_t)uprv_strlen(lsr.script),
+            lsr.region,
+            (int32_t)uprv_strlen(lsr.region),
+            trailing,
+            trailingLength,
+            nullptr,
+            sink,
+            err);
         if(U_FAILURE(*err)) {
             goto error;
         }
-        else if (!tagBuffer.isEmpty() &&
-                 uprv_strnicmp(
-                    maximizedTagBuffer.data(),
-                    tagBuffer.data(),
-                    tagBuffer.length()) == 0) {
-
-            createTagString(
-                        lang,
-                        langLength,
-                        script,
-                        scriptLength,
-                        nullptr,
-                        0,
-                        trailing,
-                        trailingLength,
-                        sink,
-                        err);
-            return;
-        }
-    }
-
-    {
-        /**
-         * If we got here, return the max + trail.
-         **/
-        createTagString(
-                    lang,
-                    langLength,
-                    script,
-                    scriptLength,
-                    region,
-                    regionLength,
-                    trailing,
-                    trailingLength,
-                    sink,
-                    err);
         return;
     }
 
@@ -1181,31 +625,6 @@ error:
     }
 }
 
-static int32_t
-do_canonicalize(const char*    localeID,
-         char* buffer,
-         int32_t bufferCapacity,
-         UErrorCode* err)
-{
-    int32_t canonicalizedSize = uloc_canonicalize(
-        localeID,
-        buffer,
-        bufferCapacity,
-        err);
-
-    if (*err == U_STRING_NOT_TERMINATED_WARNING ||
-        *err == U_BUFFER_OVERFLOW_ERROR) {
-        return canonicalizedSize;
-    }
-    else if (U_FAILURE(*err)) {
-
-        return -1;
-    }
-    else {
-        return canonicalizedSize;
-    }
-}
-
 U_CAPI int32_t U_EXPORT2
 uloc_addLikelySubtags(const char* localeID,
                       char* maximizedLocaleID,
@@ -1239,14 +658,13 @@ static UBool
 _ulocimp_addLikelySubtags(const char* localeID,
                           icu::ByteSink& sink,
                           UErrorCode* status) {
-    PreflightingLocaleIDBuffer localeBuffer;
-    do {
-        localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
-            localeBuffer.getCapacity(), status);
-    } while (localeBuffer.needToTryAgain(status));
-    
+    icu::CharString localeBuffer;
+    {
+        icu::CharStringByteSink localeSink(&localeBuffer);
+        ulocimp_canonicalize(localeID, localeSink, status);
+    }
     if (U_SUCCESS(*status)) {
-        return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
+        return _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
     } else {
         return false;
     }
@@ -1271,7 +689,7 @@ uloc_minimizeSubtags(const char* localeID,
     icu::CheckedArrayByteSink sink(
             minimizedLocaleID, minimizedLocaleIDCapacity);
 
-    ulocimp_minimizeSubtags(localeID, sink, status);
+    ulocimp_minimizeSubtags(localeID, sink, false, status);
     int32_t reslen = sink.NumberOfBytesAppended();
 
     if (U_FAILURE(*status)) {
@@ -1291,14 +709,14 @@ uloc_minimizeSubtags(const char* localeID,
 U_CAPI void U_EXPORT2
 ulocimp_minimizeSubtags(const char* localeID,
                         icu::ByteSink& sink,
+                        bool favorScript,
                         UErrorCode* status) {
-    PreflightingLocaleIDBuffer localeBuffer;
-    do {
-        localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
-            localeBuffer.getCapacity(), status);
-    } while (localeBuffer.needToTryAgain(status));
-    
-    _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
+    icu::CharString localeBuffer;
+    {
+        icu::CharStringByteSink localeSink(&localeBuffer);
+        ulocimp_canonicalize(localeID, localeSink, status);
+    }
+    _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
 }
 
 // Pairs of (language subtag, + or -) for finding out fast if common languages
@@ -1374,16 +792,26 @@ ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
     UErrorCode rgStatus = U_ZERO_ERROR;
 
     // First check for rg keyword value
-    int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
-    if (U_FAILURE(rgStatus) || rgLen != 6) {
+    icu::CharString rg;
+    {
+        icu::CharStringByteSink sink(&rg);
+        ulocimp_getKeywordValue(localeID, "rg", sink, &rgStatus);
+    }
+    int32_t rgLen = rg.length();
+    if (U_FAILURE(rgStatus) || rgLen < 3 || rgLen > 7) {
         rgLen = 0;
     } else {
-        // rgBuf guaranteed to be zero terminated here, with text len 6
-        char *rgPtr = rgBuf;
-        for (; *rgPtr!= 0; rgPtr++) {
-            *rgPtr = uprv_toupper(*rgPtr);
+        // chop off the subdivision code (which will generally be "zzzz" anyway)
+        const char* const data = rg.data();
+        if (uprv_isASCIILetter(data[0])) {
+            rgLen = 2;
+            rgBuf[0] = uprv_toupper(data[0]);
+            rgBuf[1] = uprv_toupper(data[1]);
+        } else {
+            // assume three-digit region code
+            rgLen = 3;
+            uprv_memcpy(rgBuf, data, rgLen);
         }
-        rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
     }
 
     if (rgLen == 0) {

+ 321 - 94
thirdparty/icu4c/common/loclikelysubtags.cpp

@@ -11,6 +11,7 @@
 #include "unicode/locid.h"
 #include "unicode/uobject.h"
 #include "unicode/ures.h"
+#include "unicode/uscript.h"
 #include "charstr.h"
 #include "cstring.h"
 #include "loclikelysubtags.h"
@@ -23,6 +24,7 @@
 #include "uniquecharstr.h"
 #include "uresdata.h"
 #include "uresimp.h"
+#include "uvector.h"
 
 U_NAMESPACE_BEGIN
 
@@ -81,11 +83,18 @@ struct XLikelySubtagsData {
         // Read all strings in the resource bundle and convert them to invariant char *.
         LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
         int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
+        ResourceArray m49Array;
+        if (likelyTable.findValue("m49", value)) {
+            m49Array = value.getArray(errorCode);
+        } else {
+            errorCode = U_MISSING_RESOURCE_ERROR;
+            return;
+        }
         if (!readStrings(likelyTable, "languageAliases", value,
                          languageIndexes, languagesLength, errorCode) ||
                 !readStrings(likelyTable, "regionAliases", value,
                              regionIndexes, regionsLength, errorCode) ||
-                !readStrings(likelyTable, "lsrs", value,
+                !readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
                              lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
             return;
         }
@@ -136,7 +145,7 @@ struct XLikelySubtagsData {
 
             if (!readStrings(matchTable, "partitions", value,
                              partitionIndexes, partitionsLength, errorCode) ||
-                    !readStrings(matchTable, "paradigms", value,
+                    !readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
                                  paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
                 return;
             }
@@ -233,10 +242,96 @@ private:
                 return false;
             }
             for (int i = 0; i < length; ++i) {
-                stringArray.getValue(i, value);  // returns true because i < length
-                rawIndexes[i] = strings.add(value.getUnicodeString(errorCode), errorCode);
+                if (stringArray.getValue(i, value)) {  // returns true because i < length
+                    int32_t strLength = 0;
+                    rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
+                    if (U_FAILURE(errorCode)) { return false; }
+                }
+            }
+        }
+        return true;
+    }
+    UnicodeString toLanguage(int encoded) {
+        if (encoded == 0) {
+            return UNICODE_STRING_SIMPLE("");
+        }
+        if (encoded == 1) {
+            return UNICODE_STRING_SIMPLE("skip");
+        }
+        encoded &= 0x00ffffff;
+        encoded %= 27*27*27;
+        char lang[3];
+        lang[0] = 'a' + ((encoded % 27) - 1);
+        lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
+        if (encoded / (27 * 27) == 0) {
+            return UnicodeString(lang, 2, US_INV);
+        }
+        lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
+        return UnicodeString(lang, 3, US_INV);
+    }
+    UnicodeString toScript(int encoded) {
+        if (encoded == 0) {
+            return UNICODE_STRING_SIMPLE("");
+        }
+        if (encoded == 1) {
+            return UNICODE_STRING_SIMPLE("script");
+        }
+        encoded = (encoded >> 24) & 0x000000ff;
+        const char* script = uscript_getShortName(static_cast<UScriptCode>(encoded));
+        if (script == nullptr) {
+            return UNICODE_STRING_SIMPLE("");
+        }
+        U_ASSERT(uprv_strlen(script) == 4);
+        return UnicodeString(script, 4, US_INV);
+    }
+    UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
+        if (U_FAILURE(errorCode)) {
+            return UNICODE_STRING_SIMPLE("");
+        }
+        if (m49Array.getValue(index, value)) {
+            return value.getUnicodeString(errorCode);
+        }
+        // "m49" does not include the index.
+        errorCode = U_MISSING_RESOURCE_ERROR;
+        return UNICODE_STRING_SIMPLE("");
+    }
+
+    UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
+        if (encoded == 0 || encoded == 1) {
+            return UNICODE_STRING_SIMPLE("");
+        }
+        encoded &= 0x00ffffff;
+        encoded /= 27 * 27 * 27;
+        encoded %= 27 * 27;
+        if (encoded < 27) {
+            // Selected M49 code index, find the code from "m49" resource.
+            return  m49IndexToCode(m49Array, value, encoded, errorCode);
+        }
+        char region[2];
+        region[0] = 'A' + ((encoded % 27) - 1);
+        region[1] = 'A' + (((encoded / 27) % 27) - 1);
+        return UnicodeString(region, 2, US_INV);
+    }
+
+    bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
+                     LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
+        if (table.findValue(key, value)) {
+            const int32_t* vectors = value.getIntVector(length, errorCode);
+            if (U_FAILURE(errorCode)) { return false; }
+            if (length == 0) { return true; }
+            int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
+            if (rawIndexes == nullptr) {
+                errorCode = U_MEMORY_ALLOCATION_ERROR;
+                return false;
+            }
+            for (int i = 0; i < length; ++i) {
+                rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
+                rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
+                rawIndexes[i*3+2] = strings.addByValue(
+                    toRegion(m49Array, value, vectors[i], errorCode), errorCode);
                 if (U_FAILURE(errorCode)) { return false; }
             }
+            length *= 3;
         }
         return true;
     }
@@ -245,15 +340,52 @@ private:
 namespace {
 
 XLikelySubtags *gLikelySubtags = nullptr;
+UVector *gMacroregions = nullptr;
 UInitOnce gInitOnce {};
 
 UBool U_CALLCONV cleanup() {
     delete gLikelySubtags;
     gLikelySubtags = nullptr;
+    delete gMacroregions;
+    gMacroregions = nullptr;
     gInitOnce.reset();
     return true;
 }
 
+static const char16_t RANGE_MARKER = 0x7E; /* '~' */
+UVector* loadMacroregions(UErrorCode &status) {
+    LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
+
+    LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));
+    LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
+    LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status));
+    LocalUResourceBundlePointer regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status));
+
+    if (U_FAILURE(status)) {
+        return nullptr;
+    }
+
+    while (U_SUCCESS(status) && ures_hasNext(regionMacro.getAlias())) {
+        UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
+        int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
+        char16_t buf[6];
+        regionName.extract(buf,6,status);
+        if ( rangeMarkerLocation > 0 ) {
+            char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
+            buf[rangeMarkerLocation] = 0;
+            while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
+                LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
+                newMacroRegions->adoptElement(newRegion.orphan(),status);
+                buf[rangeMarkerLocation-1]++;
+            }
+        } else {
+            LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
+            newMacroRegions->adoptElement(newRegion.orphan(),status);
+        }
+    }
+    return newMacroRegions.orphan();
+}
+
 }  // namespace
 
 void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
@@ -263,10 +395,14 @@ void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
     data.load(errorCode);
     if (U_FAILURE(errorCode)) { return; }
     gLikelySubtags = new XLikelySubtags(data);
-    if (gLikelySubtags == nullptr) {
+    gMacroregions = loadMacroregions(errorCode);
+    if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
+        delete gLikelySubtags;
+        delete gMacroregions;
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
+
     ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
 }
 
@@ -317,15 +453,32 @@ XLikelySubtags::~XLikelySubtags() {
     delete[] lsrs;
 }
 
-LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const {
+LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
+                                         bool returnInputIfUnmatch,
+                                         UErrorCode &errorCode) const {
+    if (locale.isBogus()) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return LSR("", "", "", LSR::EXPLICIT_LSR);
+    }
     const char *name = locale.getName();
     if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') {  // name.startsWith("@x=")
         // Private use language tag x-subtag-subtag... which CLDR changes to
         // und-x-subtag-subtag...
         return LSR(name, "", "", LSR::EXPLICIT_LSR);
     }
-    return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
-                            locale.getVariant(), errorCode);
+    LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
+                            locale.getVariant(), returnInputIfUnmatch, errorCode);
+
+    if (uprv_strlen(max.language) == 0 &&
+        uprv_strlen(max.script) == 0 &&
+        uprv_strlen(max.region) == 0) {
+        // No match. The ICU API mandates that
+        // if the provided ULocale instance is already in the maximal form, or
+        // there is no data available for maximization, it will be
+        // returned.
+        return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
+    }
+    return max;
 }
 
 namespace {
@@ -338,7 +491,9 @@ const char *getCanonical(const CharStringMap &aliases, const char *alias) {
 }  // namespace
 
 LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
-                                     const char *variant, UErrorCode &errorCode) const {
+                                     const char *variant,
+                                     bool returnInputIfUnmatch,
+                                     UErrorCode &errorCode) const {
     // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
     // They should match only themselves,
     // not other locales with what looks like the same language and script subtags.
@@ -378,64 +533,91 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c
     language = getCanonical(languageAliases, language);
     // (We have no script mappings.)
     region = getCanonical(regionAliases, region);
-    return maximize(language, script, region);
+    return maximize(language, script, region, returnInputIfUnmatch, errorCode);
 }
 
-LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const {
-    if (uprv_strcmp(language, "und") == 0) {
+LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region,
+                             bool returnInputIfUnmatch,
+                             UErrorCode &errorCode) const {
+    return maximize({language, (int32_t)uprv_strlen(language)},
+                    {script, (int32_t)uprv_strlen(script)},
+                    {region, (int32_t)uprv_strlen(region)},
+                    returnInputIfUnmatch,
+                    errorCode);
+}
+
+bool XLikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
+    // In Java, we use the Region class. In C++, since Region is under i18n,
+    // we read the same data used by Region into gMacroregions to avoid a
+    // dependency from common on i18n/region.cpp.
+    if (U_FAILURE(errorCode)) { return false; }
+    umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
+    if (U_FAILURE(errorCode)) { return false; }
+    UnicodeString str(UnicodeString::fromUTF8(region));
+    return gMacroregions->contains((void *)&str);
+}
+
+LSR XLikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
+                             bool returnInputIfUnmatch,
+                             UErrorCode &errorCode) const {
+    if (U_FAILURE(errorCode)) {
+        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
+    }
+    if (language.compare("und") == 0) {
         language = "";
     }
-    if (uprv_strcmp(script, "Zzzz") == 0) {
+    if (script.compare("Zzzz") == 0) {
         script = "";
     }
-    if (uprv_strcmp(region, "ZZ") == 0) {
+    if (region.compare("ZZ") == 0) {
         region = "";
     }
-    if (*script != 0 && *region != 0 && *language != 0) {
-        return LSR(language, script, region, LSR::EXPLICIT_LSR);  // already maximized
+    if (!script.empty() && !region.empty() && !language.empty()) {
+        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);  // already maximized
     }
+    bool retainLanguage = false;
+    bool retainScript = false;
+    bool retainRegion = false;
 
-    uint32_t retainOldMask = 0;
     BytesTrie iter(trie);
     uint64_t state;
     int32_t value;
     // Small optimization: Array lookup for first language letter.
     int32_t c0;
-    if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
-            language[1] != 0 &&  // language.length() >= 2
+    if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
+            language.length() >= 2 &&
             (state = trieFirstLetterStates[c0]) != 0) {
         value = trieNext(iter.resetToState64(state), language, 1);
     } else {
         value = trieNext(iter, language, 0);
     }
+    bool matchLanguage = (value >= 0);
+    bool matchScript = false;
     if (value >= 0) {
-        if (*language != 0) {
-            retainOldMask |= 4;
-        }
+        retainLanguage = !language.empty();
         state = iter.getState64();
     } else {
-        retainOldMask |= 4;
+        retainLanguage = true;
         iter.resetToState64(trieUndState);  // "und" ("*")
         state = 0;
     }
 
+    if (value >= 0 && !script.empty()) {
+        matchScript = true;
+    }
     if (value > 0) {
         // Intermediate or final value from just language.
         if (value == SKIP_SCRIPT) {
             value = 0;
         }
-        if (*script != 0) {
-            retainOldMask |= 2;
-        }
+        retainScript = !script.empty();
     } else {
         value = trieNext(iter, script, 0);
         if (value >= 0) {
-            if (*script != 0) {
-                retainOldMask |= 2;
-            }
+            retainScript = !script.empty();
             state = iter.getState64();
         } else {
-            retainOldMask |= 2;
+            retainScript = true;
             if (state == 0) {
                 iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
             } else {
@@ -447,19 +629,19 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
         }
     }
 
+    bool matchRegion = false;
     if (value > 0) {
         // Final value from just language or language+script.
-        if (*region != 0) {
-            retainOldMask |= 1;
-        }
+        retainRegion = !region.empty();
     } else {
         value = trieNext(iter, region, 0);
         if (value >= 0) {
-            if (*region != 0) {
-                retainOldMask |= 1;
+            if (!region.empty() && !isMacroregion(region, errorCode)) {
+                retainRegion = true;
+                matchRegion = true;
             }
         } else {
-            retainOldMask |= 1;
+            retainRegion = true;
             if (state == 0) {
                 value = defaultLsrIndex;
             } else {
@@ -470,28 +652,33 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
         }
     }
     U_ASSERT(value < lsrsLength);
-    const LSR &result = lsrs[value];
+    const LSR &matched = lsrs[value];
 
-    if (*language == 0) {
-        language = "und";
+    if (returnInputIfUnmatch &&
+        (!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
+      return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode);  // no matching.
+    }
+    if (language.empty()) {
+        language = StringPiece("und");
     }
 
-    if (retainOldMask == 0) {
+    if (!(retainLanguage || retainScript || retainRegion)) {
         // Quickly return a copy of the lookup-result LSR
         // without new allocation of the subtags.
-        return LSR(result.language, result.script, result.region, result.flags);
+        return LSR(matched.language, matched.script, matched.region, matched.flags);
     }
-    if ((retainOldMask & 4) == 0) {
-        language = result.language;
+    if (!retainLanguage) {
+        language = matched.language;
     }
-    if ((retainOldMask & 2) == 0) {
-        script = result.script;
+    if (!retainScript) {
+        script = matched.script;
     }
-    if ((retainOldMask & 1) == 0) {
-        region = result.region;
+    if (!retainRegion) {
+        region = matched.region;
     }
+    int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
     // retainOldMask flags = LSR explicit-subtag flags
-    return LSR(language, script, region, retainOldMask);
+    return LSR(language, script, region, retainMask, errorCode);
 }
 
 int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
@@ -627,57 +814,97 @@ int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
     default: return -1;
     }
 }
+int32_t XLikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
+    UStringTrieResult result;
+    uint8_t c;
+    if (s.length() == i) {
+        result = iter.next(u'*');
+    } else {
+        c = s.data()[i];
+        for (;;) {
+            c = uprv_invCharToAscii(c);
+            // EBCDIC: If s[i] is not an invariant character,
+            // then c is now 0 and will simply not match anything, which is harmless.
+            if (i+1 != s.length()) {
+                if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
+                    return -1;
+                }
+                c = s.data()[++i];
+            } else {
+                // last character of this subtag
+                result = iter.next(c | 0x80);
+                break;
+            }
+        }
+    }
+    switch (result) {
+    case USTRINGTRIE_NO_MATCH: return -1;
+    case USTRINGTRIE_NO_VALUE: return 0;
+    case USTRINGTRIE_INTERMEDIATE_VALUE:
+        U_ASSERT(iter.getValue() == SKIP_SCRIPT);
+        return SKIP_SCRIPT;
+    case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
+    default: return -1;
+    }
+}
 
-// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
-// in loclikely.cpp to this new code, including activating this
-// minimizeSubtags() function. The LocaleMatcher does not minimize.
-#if 0
-LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn,
-                                    const char *regionIn, ULocale.Minimize fieldToFavor,
+LSR XLikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
+                                    StringPiece region,
+                                    bool favorScript,
                                     UErrorCode &errorCode) const {
-    LSR result = maximize(languageIn, scriptIn, regionIn);
-
-    // We could try just a series of checks, like:
-    // LSR result2 = addLikelySubtags(languageIn, "", "");
-    // if result.equals(result2) return result2;
-    // However, we can optimize 2 of the cases:
-    //   (languageIn, "", "")
-    //   (languageIn, "", regionIn)
-
-    // value00 = lookup(result.language, "", "")
-    BytesTrie iter = new BytesTrie(trie);
-    int value = trieNext(iter, result.language, 0);
-    U_ASSERT(value >= 0);
-    if (value == 0) {
-        value = trieNext(iter, "", 0);
-        U_ASSERT(value >= 0);
-        if (value == 0) {
-            value = trieNext(iter, "", 0);
-        }
-    }
-    U_ASSERT(value > 0);
-    LSR value00 = lsrs[value];
-    boolean favorRegionOk = false;
-    if (result.script.equals(value00.script)) { //script is default
-        if (result.region.equals(value00.region)) {
-            return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
-        } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
-            return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
-        } else {
-            favorRegionOk = true;
+    LSR max = maximize(language, script, region, true, errorCode);
+    if (U_FAILURE(errorCode)) {
+        return max;
+    }
+    // If nothing matched, return the input subtags as they were given.
+    if (uprv_strlen(max.language) == 0 &&
+        uprv_strlen(max.script) == 0 &&
+        uprv_strlen(max.region) == 0) {
+        // No match. The ICU API mandates that:
+        // "If this Locale is already in the minimal form, or not valid, or
+        // there is no data available for minimization, the Locale will be
+        // unchanged."
+        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
+    }
+    // try language
+    LSR test = maximize(max.language, "", "", true, errorCode);
+    if (U_FAILURE(errorCode)) {
+        return max;
+    }
+    if (test.isEquivalentTo(max)) {
+        return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
+    }
+
+    if (!favorScript) {
+        // favor Region
+        // try language and region
+        test = maximize(max.language, "", max.region, true, errorCode);
+        if (U_FAILURE(errorCode)) {
+            return max;
+        }
+        if (test.isEquivalentTo(max)) {
+            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
         }
     }
-
-    // The last case is not as easy to optimize.
-    // Maybe do later, but for now use the straightforward code.
-    LSR result2 = maximize(languageIn, scriptIn, "");
-    if (result2.equals(result)) {
-        return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
-    } else if (favorRegionOk) {
-        return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
+    // try language and script
+    test = maximize(max.language, max.script, "", true, errorCode);
+    if (U_FAILURE(errorCode)) {
+        return max;
+    }
+    if (test.isEquivalentTo(max)) {
+        return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
+    }
+    if (favorScript) {
+        // try language and region
+        test = maximize(max.language, "", max.region, true, errorCode);
+        if (U_FAILURE(errorCode)) {
+            return max;
+        }
+        if (test.isEquivalentTo(max)) {
+            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
+        }
     }
-    return result;
+    return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
 }
-#endif
 
 U_NAMESPACE_END

+ 18 - 10
thirdparty/icu4c/common/loclikelysubtags.h

@@ -11,6 +11,7 @@
 #include "unicode/utypes.h"
 #include "unicode/bytestrie.h"
 #include "unicode/locid.h"
+#include "unicode/stringpiece.h"
 #include "unicode/uobject.h"
 #include "unicode/ures.h"
 #include "charstrmap.h"
@@ -47,7 +48,9 @@ public:
     static const XLikelySubtags *getSingleton(UErrorCode &errorCode);
 
     // VisibleForTesting
-    LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
+    LSR makeMaximizedLsrFrom(const Locale &locale,
+                             bool returnInputIfUnmatch,
+                             UErrorCode &errorCode) const;
 
     /**
      * Tests whether lsr is "more likely" than other.
@@ -61,13 +64,9 @@ public:
      */
     int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
 
-    // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
-    // in loclikely.cpp to this new code, including activating this
-    // minimizeSubtags() function. The LocaleMatcher does not minimize.
-#if 0
-    LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn,
-                        ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const;
-#endif
+    LSR minimizeSubtags(StringPiece language, StringPiece script, StringPiece region,
+                        bool favorScript,
+                        UErrorCode &errorCode) const;
 
     // visible for LocaleDistance
     const LocaleDistanceData &getDistanceData() const { return distanceData; }
@@ -80,16 +79,25 @@ private:
     static void initLikelySubtags(UErrorCode &errorCode);
 
     LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
-                         const char *variant, UErrorCode &errorCode) const;
+                         const char *variant,
+                         bool returnInputIfUnmatch,
+                         UErrorCode &errorCode) const;
 
     /**
      * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
      */
-    LSR maximize(const char *language, const char *script, const char *region) const;
+    LSR maximize(const char *language, const char *script, const char *region,
+                 bool returnInputIfUnmatch,
+                 UErrorCode &errorCode) const;
+    LSR maximize(StringPiece language, StringPiece script, StringPiece region,
+                 bool returnInputIfUnmatch,
+                 UErrorCode &errorCode) const;
 
     int32_t getLikelyIndex(const char *language, const char *script) const;
+    bool isMacroregion(StringPiece& region, UErrorCode &errorCode) const;
 
     static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
+    static int32_t trieNext(BytesTrie &iter, StringPiece s, int32_t i);
 
     UResourceBundle *langInfoBundle;
     // We could store the strings by value, except that if there were few enough strings,

+ 12 - 8
thirdparty/icu4c/common/locmap.cpp

@@ -1170,7 +1170,7 @@ uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
     // conversion functionality when available.
 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
     int32_t len;
-    char baseName[ULOC_FULLNAME_CAPACITY] = {};
+    icu::CharString baseName;
     const char * mylocaleID = localeID;
 
     // Check any for keywords.
@@ -1189,19 +1189,23 @@ uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
         else
         {
             // If the locale ID contains keywords other than collation, just use the base name.
-            len = uloc_getBaseName(localeID, baseName, UPRV_LENGTHOF(baseName) - 1, status);
-
-            if (U_SUCCESS(*status) && len > 0)
             {
-                baseName[len] = 0;
-                mylocaleID = baseName;
+                icu::CharStringByteSink sink(&baseName);
+                ulocimp_getBaseName(localeID, sink, status);
+            }
+            if (U_SUCCESS(*status) && !baseName.isEmpty())
+            {
+                mylocaleID = baseName.data();
             }
         }
     }
 
-    char asciiBCP47Tag[LOCALE_NAME_MAX_LENGTH] = {};
     // this will change it from de_DE@collation=phonebook to de-DE-u-co-phonebk form
-    (void)uloc_toLanguageTag(mylocaleID, asciiBCP47Tag, UPRV_LENGTHOF(asciiBCP47Tag), false, status);
+    icu::CharString asciiBCP47Tag;
+    {
+        icu::CharStringByteSink sink(&asciiBCP47Tag);
+        ulocimp_toLanguageTag(mylocaleID, sink, false, status);
+    }
 
     if (U_SUCCESS(*status))
     {

+ 9 - 5
thirdparty/icu4c/common/locresdata.cpp

@@ -24,6 +24,8 @@
 #include "unicode/putil.h"
 #include "unicode/uloc.h"
 #include "unicode/ures.h"
+#include "bytesinkutil.h"
+#include "charstr.h"
 #include "cstring.h"
 #include "ulocimp.h"
 #include "uresimp.h"
@@ -156,16 +158,18 @@ _uloc_getOrientationHelper(const char* localeId,
     ULayoutType result = ULOC_LAYOUT_UNKNOWN;
 
     if (!U_FAILURE(*status)) {
-        int32_t length = 0;
-        char localeBuffer[ULOC_FULLNAME_CAPACITY];
-
-        uloc_canonicalize(localeId, localeBuffer, sizeof(localeBuffer), status);
+        icu::CharString localeBuffer;
+        {
+            icu::CharStringByteSink sink(&localeBuffer);
+            ulocimp_canonicalize(localeId, sink, status);
+        }
 
         if (!U_FAILURE(*status)) {
+            int32_t length = 0;
             const char16_t* const value =
                 uloc_getTableStringWithFallback(
                     nullptr,
-                    localeBuffer,
+                    localeBuffer.data(),
                     "layout",
                     nullptr,
                     key,

+ 20 - 0
thirdparty/icu4c/common/lsr.cpp

@@ -31,6 +31,26 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t
     }
 }
 
+LSR::LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
+         UErrorCode &errorCode) :
+        language(nullptr), script(nullptr), region(nullptr),
+        regionIndex(indexForRegion(r.data())), flags(f) {
+    if (U_SUCCESS(errorCode)) {
+        CharString data;
+        data.append(lang, errorCode).append('\0', errorCode);
+        int32_t scriptOffset = data.length();
+        data.append(scr, errorCode).append('\0', errorCode);
+        int32_t regionOffset = data.length();
+        data.append(r, errorCode);
+        owned = data.cloneData(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            language = owned;
+            script = owned + scriptOffset;
+            region = owned + regionOffset;
+        }
+    }
+}
+
 LSR::LSR(LSR &&other) noexcept :
         language(other.language), script(other.script), region(other.region), owned(other.owned),
         regionIndex(other.regionIndex), flags(other.flags),

+ 3 - 0
thirdparty/icu4c/common/lsr.h

@@ -7,6 +7,7 @@
 #ifndef __LSR_H__
 #define __LSR_H__
 
+#include "unicode/stringpiece.h"
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "cstring.h"
@@ -45,6 +46,8 @@ struct LSR final : public UMemory {
      */
     LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
         UErrorCode &errorCode);
+    LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
+        UErrorCode &errorCode);
     LSR(LSR &&other) noexcept;
     LSR(const LSR &other) = delete;
     inline ~LSR() {

+ 1 - 1
thirdparty/icu4c/common/norm2_nfc_data.h

@@ -10,7 +10,7 @@
 #ifdef INCLUDED_FROM_NORMALIZER2_CPP
 
 static const UVersionInfo norm2_nfc_data_formatVersion={4,0,0,0};
-static const UVersionInfo norm2_nfc_data_dataVersion={0xf,0,0,0};
+static const UVersionInfo norm2_nfc_data_dataVersion={0xf,1,0,0};
 
 static const int32_t norm2_nfc_data_indexes[Normalizer2Impl::IX_COUNT]={
 0x50,0x4cb8,0x8920,0x8a20,0x8a20,0x8a20,0x8a20,0x8a20,0xc0,0x300,0xae2,0x29e0,0x3c66,0xfc00,0x1288,0x3b9c,

+ 1 - 0
thirdparty/icu4c/common/norm2allmodes.h

@@ -391,6 +391,7 @@ struct Norm2AllModes : public UMemory {
     static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode);
     static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode);
     static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode);
+    static const Norm2AllModes *getNFKC_SCFInstance(UErrorCode &errorCode);
 
     Normalizer2Impl *impl;
     ComposeNormalizer2 comp;

+ 2 - 1
thirdparty/icu4c/common/normalizer2impl.h

@@ -789,7 +789,8 @@ unorm_getFCD16(UChar32 c);
  *
  * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
  * ICU ships with data files for standard Unicode Normalization Forms
- * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
+ * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
+ * NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
  * Custom (application-specific) data can be built into additional .nrm files
  * with the gennorm2 build tool.
  * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.

Những thay đổi đã bị ẩn vì nó quá lớn
+ 999 - 991
thirdparty/icu4c/common/propname_data.h


+ 15 - 0
thirdparty/icu4c/common/putil.cpp

@@ -1175,6 +1175,21 @@ uprv_tzname(int n)
         if (ret != nullptr && uprv_strcmp(TZDEFAULT, gTimeZoneBuffer) != 0) {
             int32_t tzZoneInfoTailLen = uprv_strlen(TZZONEINFOTAIL);
             const char *tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
+            // MacOS14 has the realpath as something like
+            // /usr/share/zoneinfo.default/Australia/Melbourne
+            // which will not have "/zoneinfo/" in the path.
+            // Therefore if we fail, we fall back to read the link which is
+            // /var/db/timezone/zoneinfo/Australia/Melbourne
+            // We also fall back to reading the link if the realpath leads to something like
+            // /usr/share/zoneinfo/posixrules
+            if (tzZoneInfoTailPtr == nullptr ||
+                    uprv_strcmp(tzZoneInfoTailPtr + tzZoneInfoTailLen, "posixrules") == 0) {
+                ssize_t size = readlink(TZDEFAULT, gTimeZoneBuffer, sizeof(gTimeZoneBuffer)-1);
+                if (size > 0) {
+                    gTimeZoneBuffer[size] = 0;
+                    tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
+                }
+            }
             if (tzZoneInfoTailPtr != nullptr) {
                 tzZoneInfoTailPtr += tzZoneInfoTailLen;
                 skipZoneIDPrefix(&tzZoneInfoTailPtr);

+ 34 - 17
thirdparty/icu4c/common/rbbi.cpp

@@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr;
 static const icu::UnicodeString *gEmptyString = nullptr;
 static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
 static icu::UInitOnce gRBBIInitOnce {};
+static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
 
 /**
  * Release all static memory held by breakiterator.
@@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() {
     ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
 }
 
-static void U_CALLCONV initLanguageFactories() {
-    UErrorCode status = U_ZERO_ERROR;
+static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
     U_ASSERT(gLanguageBreakFactories == nullptr);
     gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
     if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
-        ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
-        gLanguageBreakFactories->push(builtIn, status);
+        LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
+        if (U_SUCCESS(status)) {
+            gICULanguageBreakFactory = factory.orphan();
+            gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
 #ifdef U_LOCAL_SERVICE_HOOK
-        LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
-        if (extra != nullptr) {
-            gLanguageBreakFactories->push(extra, status);
-        }
+            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
+            if (extra != nullptr) {
+                gLanguageBreakFactories->push(extra, status);
+            }
 #endif
+        }
     }
     ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
 }
 
+void ensureLanguageFactories(UErrorCode& status) {
+    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
+}
 
 static const LanguageBreakEngine*
-getLanguageBreakEngineFromFactory(UChar32 c)
+getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
 {
-    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
-    if (gLanguageBreakFactories == nullptr) {
-        return nullptr;
-    }
+    UErrorCode status = U_ZERO_ERROR;
+    ensureLanguageFactories(status);
+    if (U_FAILURE(status)) return nullptr;
 
     int32_t i = gLanguageBreakFactories->size();
     const LanguageBreakEngine *lbe = nullptr;
     while (--i >= 0) {
         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
-        lbe = factory->getEngineFor(c);
+        lbe = factory->getEngineFor(c, locale);
         if (lbe != nullptr) {
             break;
         }
@@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c)
 //
 //-------------------------------------------------------------------------------
 const LanguageBreakEngine *
-RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
+RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
     const LanguageBreakEngine *lbe = nullptr;
     UErrorCode status = U_ZERO_ERROR;
 
@@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
     int32_t i = fLanguageBreakEngines->size();
     while (--i >= 0) {
         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
-        if (lbe->handles(c)) {
+        if (lbe->handles(c, locale)) {
             return lbe;
         }
     }
 
     // No existing dictionary took the character. See if a factory wants to
     // give us a new LanguageBreakEngine for this character.
-    lbe = getLanguageBreakEngineFromFactory(c);
+    lbe = getLanguageBreakEngineFromFactory(c, locale);
 
     // If we got one, use it and push it on our stack.
     if (lbe != nullptr) {
@@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
     return fUnhandledBreakEngine;
 }
 
+#ifndef U_HIDE_DRAFT_API
+void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
+                  ExternalBreakEngine* toAdopt, UErrorCode& status) {
+    LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
+    if (U_FAILURE(status)) return;
+    ensureLanguageFactories(status);
+    if (U_FAILURE(status)) return;
+    gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
+}
+#endif  /* U_HIDE_DRAFT_API */
+
+
 void RuleBasedBreakIterator::dumpCache() {
     fBreakCache->dumpCache();
 }

+ 3 - 2
thirdparty/icu4c/common/rbbi_cache.cpp

@@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
 
         // We now have a dictionary character. Get the appropriate language object
         // to deal with it.
-        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
+        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
+            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
 
         // Ask the language object if there are any breaks. It will add them to the cache and
         // leave the text pointer on the other side of its range, ready to search for the next one.
         if (lbe != nullptr) {
-            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
         }
 
         // Reload the loop variables for the next go-round

+ 0 - 1
thirdparty/icu4c/common/rbbirb.cpp

@@ -66,7 +66,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     fForwardTable       = nullptr;
     fRuleStatusVals     = nullptr;
     fChainRules         = false;
-    fLBCMNoChain        = false;
     fLookAheadHardBreak = false;
     fUSetNodes          = nullptr;
     fRuleStatusVals     = nullptr;

+ 0 - 3
thirdparty/icu4c/common/rbbirb.h

@@ -159,9 +159,6 @@ public:
     UBool                         fChainRules;       // True for chained Unicode TR style rules.
                                                      // False for traditional regexp rules.
 
-    UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
-                                                     //   chars with LineBreak property == CM.
-
     UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
                                                      // immediate break, no continuing for the
                                                      // longest match.

+ 0 - 2
thirdparty/icu4c/common/rbbiscan.cpp

@@ -547,8 +547,6 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
             UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
             if (opt == UNICODE_STRING("chain", 5)) {
                 fRB->fChainRules = true;
-            } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
-                fRB->fLBCMNoChain = true;
             } else if (opt == UNICODE_STRING("forward", 7)) {
                 fRB->fDefaultTree   = &fRB->fForwardTree;
             } else if (opt == UNICODE_STRING("reverse", 7)) {

+ 0 - 15
thirdparty/icu4c/common/rbbitblb.cpp

@@ -458,21 +458,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNod
 
         // We've got a node that can end a match.
 
-        // !!LBCMNoChain implementation:  If this node's val correspond to
-        // the Line Break $CM char class, don't chain from it.
-        // TODO:  Remove this. !!LBCMNoChain is deprecated, and is not used
-        //        by any of the standard ICU rules.
-        if (fRB->fLBCMNoChain) {
-            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
-            if (c != -1) {
-                // c == -1 occurs with sets containing only the {eof} marker string.
-                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
-                if (cLBProp == U_LB_COMBINING_MARK) {
-                    continue;
-                }
-            }
-        }
-
         // Now iterate over the nodes that can start a match, looking for ones
         //   with the same char class as our ending node.
         RBBINode *startNode;

+ 63 - 64
thirdparty/icu4c/common/ubidi_props_data.h

@@ -9,11 +9,11 @@
 
 #ifdef INCLUDED_FROM_UBIDI_PROPS_C
 
-static const UVersionInfo ubidi_props_dataVersion={0xf,0,0,0};
+static const UVersionInfo ubidi_props_dataVersion={0xf,1,0,0};
 
-static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x6bc0,0x65d0,0x28,0x620,0x8cc,0x10ac0,0x10d24,0,0,0,0,0,0,0,0x6702b6};
+static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x6ba0,0x65b0,0x28,0x620,0x8cc,0x10ac0,0x10d24,0,0,0,0,0,0,0,0x6702b6};
 
-static const uint16_t ubidi_props_trieIndex[13024]={
+static const uint16_t ubidi_props_trieIndex[13008]={
 0x387,0x38f,0x397,0x39f,0x3b7,0x3bf,0x3c7,0x3cf,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,
 0x3a7,0x3af,0x3a7,0x3af,0x3d5,0x3dd,0x3e5,0x3ed,0x3f5,0x3fd,0x3f9,0x401,0x409,0x411,0x40c,0x414,
 0x3a7,0x3af,0x3a7,0x3af,0x41c,0x424,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,0x42a,0x432,0x43a,0x442,
@@ -38,8 +38,8 @@ static const uint16_t ubidi_props_trieIndex[13024]={
 0x7e8,0x7f0,0x7f8,0x7ff,0x806,0x80e,0x812,0x7e0,0x67c,0x67c,0x67c,0x81a,0x820,0x67c,0x67c,0x826,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x82e,0x3a7,0x3a7,0x3a7,0x836,0x3a7,0x3a7,0x3a7,0x3f5,
 0x83e,0x846,0x849,0x3a7,0x851,0x67c,0x67c,0x67f,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x858,0x85e,
-0x86e,0x866,0x3a7,0x3a7,0x876,0x61f,0x3a7,0x3ce,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x67c,0x835,
-0x3dc,0x3a7,0x87e,0x886,0x3a7,0x88e,0x896,0x3a7,0x3a7,0x3a7,0x3a7,0x89a,0x3a7,0x3a7,0x674,0x3cd,
+0x86e,0x866,0x3a7,0x3a7,0x876,0x61f,0x3a7,0x3ce,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x67c,0x87e,
+0x3dc,0x3a7,0x85e,0x882,0x3a7,0x88a,0x892,0x3a7,0x3a7,0x3a7,0x3a7,0x896,0x3a7,0x3a7,0x674,0x3cd,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
@@ -96,10 +96,10 @@ static const uint16_t ubidi_props_trieIndex[13024]={
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x87e,0x67c,0x595,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x8a1,0x3a7,0x3a7,0x8a6,0x8ae,0x3a7,0x3a7,0x5cb,0x67c,0x673,0x3a7,0x3a7,0x8b6,0x3a7,0x3a7,0x3a7,
-0x8be,0x8c5,0x645,0x8cd,0x3a7,0x3a7,0x5a1,0x8d5,0x3a7,0x8dd,0x8e4,0x3a7,0x501,0x8e9,0x3a7,0x51a,
-0x3a7,0x8f1,0x8f9,0x51c,0x3a7,0x8fd,0x51b,0x905,0x3a7,0x3a7,0x3a7,0x90b,0x3a7,0x3a7,0x3a7,0x912,
+0x3a7,0x3a7,0x3a7,0x3a7,0x85e,0x67c,0x595,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x89d,0x3a7,0x3a7,0x8a2,0x8aa,0x3a7,0x3a7,0x5cb,0x67c,0x673,0x3a7,0x3a7,0x8b2,0x3a7,0x3a7,0x3a7,
+0x8ba,0x8c1,0x645,0x8c9,0x3a7,0x3a7,0x5a1,0x8d1,0x3a7,0x8d9,0x8e0,0x3a7,0x501,0x8e5,0x3a7,0x51a,
+0x3a7,0x8ed,0x8f5,0x51c,0x3a7,0x8f9,0x51b,0x901,0x3a7,0x3a7,0x3a7,0x907,0x3a7,0x3a7,0x3a7,0x90e,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
@@ -139,9 +139,9 @@ static const uint16_t ubidi_props_trieIndex[13024]={
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x926,0x91a,0x91e,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
-0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x92e,0x936,0x4a6,0x4a6,0x4a6,0x93b,0x93f,
-0x947,0x94f,0x953,0x95b,0x4a6,0x4a6,0x4a6,0x95f,0x967,0x397,0x96f,0x977,0x3a7,0x3a7,0x3a7,0x97f,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x922,0x916,0x91a,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
+0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x92a,0x932,0x4a6,0x4a6,0x4a6,0x937,0x93b,
+0x943,0x94b,0x94f,0x957,0x4a6,0x4a6,0x4a6,0x95b,0x963,0x397,0x96b,0x973,0x3a7,0x3a7,0x3a7,0x97b,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0xe9c,0xe9c,0xedc,0xf1c,0xe9c,0xe9c,0xe9c,0xe9c,0xe9c,0xe9c,0xf54,0xf94,0xfd4,0xfe4,0x1024,0x1030,
@@ -178,68 +178,68 @@ static const uint16_t ubidi_props_trieIndex[13024]={
 0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0xd89,
 0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,
 0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0xd89,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x987,0x3a7,0x67c,0x67c,0x98f,0x61f,0x3a7,0x514,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x997,0x3a7,0x3a7,0x3a7,0x99e,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x983,0x3a7,0x67c,0x67c,0x98b,0x61f,0x3a7,0x514,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x993,0x3a7,0x3a7,0x3a7,0x99a,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9a6,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
-0x9ae,0x9b2,0x43c,0x43c,0x43c,0x43c,0x9c2,0x9ba,0x43c,0x9ca,0x43c,0x43c,0x9d2,0x9d8,0x43c,0x43c,
-0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9e8,0x9e0,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
-0x43c,0x43c,0x43c,0x9f0,0x43c,0x9f8,0x4a6,0xa00,0x43c,0xa08,0xa0f,0xa15,0xa1d,0xa21,0xa29,0x43c,
-0x51b,0xa31,0xa38,0xa3f,0x41e,0xa47,0x569,0x3a7,0x501,0xa4e,0x3a7,0xa54,0x41e,0xa59,0xa61,0x3a7,
-0x3a7,0xa66,0x51b,0x3a7,0x3a7,0x3a7,0x836,0xa6e,0x41e,0x5a3,0x57e,0xa75,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0xa31,0xa7d,0x3a7,0x3a7,0xa85,0xa8d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa91,0xa99,0x3a7,
-0x3a7,0xaa1,0x57e,0xaa9,0x3a7,0xaaf,0x3a7,0x3a7,0x60f,0xab7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0xabc,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xac3,0xacb,0x3a7,0x3a7,0x3a7,0xace,0x57e,0xad6,
-0xada,0xae2,0x3a7,0xae9,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0xaf0,0x3a7,0x3a7,0xafe,0xaf8,0x3a7,0x3a7,0x3a7,0xb06,0xb0e,0x3a7,0xb12,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x5a5,0x41e,0x99e,0xb1a,0x3a7,0x3a7,0x3a7,0xb27,0xb22,0x3a7,
+0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9a2,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
+0x9aa,0x9ae,0x43c,0x43c,0x43c,0x43c,0x9be,0x9b6,0x43c,0x9c6,0x43c,0x43c,0x9ce,0x9d4,0x43c,0x43c,
+0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9e4,0x9dc,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
+0x43c,0x43c,0x43c,0x9ec,0x43c,0x9f4,0x4a6,0x9fc,0x43c,0xa04,0xa0b,0xa11,0xa19,0xa1d,0xa25,0x43c,
+0x51b,0xa2d,0xa34,0xa3b,0x41e,0xa43,0x569,0x3a7,0x501,0xa4a,0x3a7,0xa50,0x41e,0xa55,0xa5d,0x3a7,
+0x3a7,0xa62,0x51b,0x3a7,0x3a7,0x3a7,0x836,0xa6a,0x41e,0x5a3,0x57e,0xa71,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0xa2d,0xa79,0x3a7,0x3a7,0xa81,0xa89,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa8d,0xa95,0x3a7,
+0x3a7,0xa9d,0x57e,0xaa5,0x3a7,0xaab,0x3a7,0x3a7,0x60f,0xab3,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0xab8,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xabf,0xac7,0x3a7,0x3a7,0x3a7,0xaca,0x57e,0xad2,
+0xad6,0xade,0x3a7,0xae5,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0xaec,0x3a7,0x3a7,0xafa,0xaf4,0x3a7,0x3a7,0x3a7,0xb02,0xb0a,0x3a7,0xb0e,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x5a5,0x41e,0x99a,0xb16,0x3a7,0x3a7,0x3a7,0xb23,0xb1e,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0xb2f,0xb37,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb3d,
-0x3a7,0xb43,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0xb2b,0xb33,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb39,
+0x3a7,0xb3f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0xa55,0x3a7,0xb49,0x3a7,0x3a7,0xb51,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0xa51,0x3a7,0xb45,0x3a7,0x3a7,0xb4d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x535,0xb59,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x535,0xb55,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xb61,0x500,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0xb69,0xb71,0xb77,0x3a7,0xb7d,0x67c,0x67c,0xb85,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x67c,0x67c,0xb8d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb93,
-0x3a7,0xb9a,0x3a7,0xb96,0x3a7,0xb9d,0x3a7,0xba5,0xba9,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xbb1,0x3f5,0xbb8,0xbbf,0xbc7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xb5d,0x500,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0xb65,0xb6d,0xb73,0x3a7,0xb79,0x67c,0x67c,0xb81,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x67c,0x67c,0xb89,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb8f,
+0x3a7,0xb96,0x3a7,0xb92,0x3a7,0xb99,0x3a7,0xba1,0xba5,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xbad,0x3f5,0xbb4,0xbbb,0xbc3,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xbcf,0xbd7,0x3a7,0x3a7,0xa55,0x3a7,0x3a7,
-0x3a7,0x3a7,0xb43,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa81,0x3a7,
-0xbdc,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0xbe4,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0xbec,
-0x43c,0xbf4,0xbf4,0xbfb,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
-0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x91e,0x4a6,0x4a6,0x43c,
-0x43c,0x4a6,0x4a6,0xc03,0x43c,0x43c,0x43c,0x43c,0x43c,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
-0xc0b,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x67c,0xc13,0x67c,0x67c,0x67f,0xc18,0xc1c,
-0x858,0xc24,0x3c9,0x3a7,0xc2a,0x3a7,0xc2f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x783,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xbcb,0xbd3,0x3a7,0x3a7,0xa51,0x3a7,0x3a7,
+0x3a7,0x3a7,0xb3f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa7d,0x3a7,
+0xbd8,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0xbe0,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0xbe8,
+0x43c,0xbf0,0xbf0,0xbf7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
+0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x91a,0x4a6,0x4a6,0x43c,
+0x43c,0x4a6,0x4a6,0xbff,0x43c,0x43c,0x43c,0x43c,0x43c,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
+0xc07,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x67c,0xc0f,0x67c,0x67c,0x67f,0xc14,0xc18,
+0x858,0xc20,0x3c9,0x3a7,0xc26,0x3a7,0xc2b,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x783,0x3a7,0x3a7,0x3a7,
 0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,
-0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0xc37,
-0x98f,0x67c,0x67c,0x67c,0xc3e,0x67c,0x67c,0xc45,0xc4d,0xc13,0x67c,0xc55,0x67c,0xc5d,0xc62,0x3a7,
-0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67f,0xc6a,0xc73,0xc77,0xc7f,
-0xc6f,0x67c,0x67c,0x67c,0x67c,0xc87,0x67c,0x792,0xc8f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0xc33,
+0x98b,0x67c,0x67c,0x67c,0xc3a,0x67c,0x67c,0xc41,0xc49,0xc0f,0x67c,0xc51,0x67c,0xc59,0xc5e,0x3a7,
+0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67f,0xc66,0xc6f,0xc73,0xc7b,
+0xc6b,0x67c,0x67c,0x67c,0x67c,0xc83,0x67c,0x792,0xc8b,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc96,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc92,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
 0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
-0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc96,0xca6,0xc9e,0xc9e,0xc9e,0xca7,0xca7,0xca7,
-0xca7,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0xcaf,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
-0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
-0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
-0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
-0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0x386,0x386,0x386,0x12,0x12,0x12,0x12,
+0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc92,0xca2,0xc9a,0xc9a,0xc9a,0xca3,0xca3,0xca3,
+0xca3,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0xcab,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
+0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
+0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
+0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
+0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0x386,0x386,0x386,0x12,0x12,0x12,0x12,
 0x12,0x12,0x12,0x12,0x12,8,7,8,9,7,0x12,0x12,0x12,0x12,0x12,0x12,
 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,7,7,7,8,9,0xa,0xa,4,
 4,4,0xa,0xa,0x310a,0xf20a,0xa,3,6,3,6,6,2,2,2,2,
@@ -551,15 +551,14 @@ static const uint16_t ubidi_props_trieIndex[13024]={
 0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
 0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
-0xa,0xa,0xa,0xa,0,0,0,0,0xa,0,0,0,0,0,0,0,
+0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,
 0,0,0xb1,0xb1,0xb1,0xb1,0,0,0xa,0,0,0,0,0,0xa,0xa,
 0,0,0,0,0,0xa,0xa,0xa,9,0xa,0xa,0xa,0xa,0,0,0,
 0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0xa,0xa,0x310a,0xf20a,0x310a,0xf20a,
 0x310a,0xf20a,0x310a,0xf20a,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0xb1,0xb1,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
-0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
+0,0xb1,0xb1,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,0,0,0,0,
+0,0,0,0,0,0,0,0xa,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
@@ -935,13 +934,13 @@ static const UBiDiProps ubidi_props_singleton={
     ubidi_props_trieIndex+3612,
     nullptr,
     3612,
-    9412,
+    9396,
     0x1a0,
     0xe9c,
     0x0,
     0x0,
     0x110000,
-    0x32dc,
+    0x32cc,
     nullptr, 0, false, false, 0, nullptr
   },
   { 2,2,0,0 }

+ 3 - 40
thirdparty/icu4c/common/ucase.cpp

@@ -317,43 +317,6 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
     }
 }
 
-namespace {
-
-/**
- * Add the simple case closure mapping,
- * except if there is not actually an scf relationship between the two characters.
- * TODO: Unicode should probably add the corresponding scf mappings.
- * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
- * If & when those scf mappings are added, we should be able to remove all of these exceptions.
- */
-void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
-    switch (c) {
-    case 0x0390:
-        if (t == 0x1FD3) { return; }
-        break;
-    case 0x03B0:
-        if (t == 0x1FE3) { return; }
-        break;
-    case 0x1FD3:
-        if (t == 0x0390) { return; }
-        break;
-    case 0x1FE3:
-        if (t == 0x03B0) { return; }
-        break;
-    case 0xFB05:
-        if (t == 0xFB06) { return; }
-        break;
-    case 0xFB06:
-        if (t == 0xFB05) { return; }
-        break;
-    default:
-        break;
-    }
-    sa->add(sa->set, t);
-}
-
-}  // namespace
-
 U_CFUNC void U_EXPORT2
 ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
@@ -397,7 +360,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
                 pe=pe0;
                 UChar32 mapping;
                 GET_SLOT_VALUE(excWord, idx, pe, mapping);
-                addOneSimpleCaseClosure(c, mapping, sa);
+                sa->add(sa->set, mapping);
             }
         }
         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
@@ -405,7 +368,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
             int32_t delta;
             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
             UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
-            addOneSimpleCaseClosure(c, mapping, sa);
+            sa->add(sa->set, mapping);
         }
 
         /* get the closure string pointer & length */
@@ -448,7 +411,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
         for(int32_t idx=0; idx<closureLength;) {
             UChar32 mapping;
             U16_NEXT_UNSAFE(closure, idx, mapping);
-            addOneSimpleCaseClosure(c, mapping, sa);
+            sa->add(sa->set, mapping);
         }
     }
 }

+ 62 - 63
thirdparty/icu4c/common/ucase_props_data.h

@@ -9,9 +9,9 @@
 
 #ifdef INCLUDED_FROM_UCASE_CPP
 
-static const UVersionInfo ucase_props_dataVersion={0xf,0,0,0};
+static const UVersionInfo ucase_props_dataVersion={0xf,1,0,0};
 
-static const int32_t ucase_props_indexes[UCASE_IX_TOP]={0x10,0x76f2,0x66c8,0x683,0x172,0,0,0,0,0,0,0,0,0,0,3};
+static const int32_t ucase_props_indexes[UCASE_IX_TOP]={0x10,0x76ec,0x66c8,0x680,0x172,0,0,0,0,0,0,0,0,0,0,3};
 
 static const uint16_t ucase_props_trieIndex[13148]={
 0x355,0x35d,0x365,0x36d,0x37b,0x383,0x38b,0x393,0x39b,0x3a3,0x3aa,0x3b2,0x3ba,0x3c2,0x3ca,0x3d2,
@@ -509,9 +509,9 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0x39b9,0x3a29,0x3a99,0x3b09,0x3b7b,0x3beb,0x3c5b,0x3ccb,0x3d3b,0x3dab,0x3e1b,0x3e8b,0x411,0x411,0x3ef9,0x3f79,
 0x3fe9,0,0x4069,0x40e9,0xfc12,0xfc12,0xdb12,0xdb12,0x419b,4,0x4209,4,4,4,0x4259,0x42d9,
 0x4349,0,0x43c9,0x4449,0xd512,0xd512,0xd512,0xd512,0x44fb,4,4,4,0x411,0x411,0x4569,0x4619,
-0,0,0x46e9,0x4769,0xfc12,0xfc12,0xce12,0xce12,0,4,4,4,0x411,0x411,0x4819,0x48c9,
-0x4999,0x391,0x4a19,0x4a99,0xfc12,0xfc12,0xc812,0xc812,0xfc92,4,4,4,0,0,0x4b49,0x4bc9,
-0x4c39,0,0x4cb9,0x4d39,0xc012,0xc012,0xc112,0xc112,0x4deb,4,4,0,0,0,0,0,
+0,0,0x46d9,0x4759,0xfc12,0xfc12,0xce12,0xce12,0,4,4,4,0x411,0x411,0x4809,0x48b9,
+0x4979,0x391,0x49f9,0x4a79,0xfc12,0xfc12,0xc812,0xc812,0xfc92,4,4,4,0,0,0x4b29,0x4ba9,
+0x4c19,0,0x4c99,0x4d19,0xc012,0xc012,0xc112,0xc112,0x4dcb,4,4,0,0,0,0,0,
 0,0,0,0,0,0,0,4,4,4,4,4,0,0,0,0,
 0,0,0,0,4,4,0,0,0,0,0,0,4,0,0,4,
 0,0,4,4,4,4,4,0,0,0,0,0,0,0,0,0,
@@ -525,8 +525,8 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0x64,0x44,0x64,0x64,0x64,0x64,0x64,0x64,0x44,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,
 0,0,1,2,2,2,1,1,2,2,2,1,0,2,0,0,
-0,2,2,2,2,2,0,0,0,0,0,0,2,0,0x4e5a,0,
-2,0,0x4e9a,0x4eda,2,2,0,1,2,2,0xe12,2,1,0,0,0,
+0,2,2,2,2,2,0,0,0,0,0,0,2,0,0x4e3a,0,
+2,0,0x4e7a,0x4eba,2,2,0,1,2,2,0xe12,2,1,0,0,0,
 0,1,0,0,1,1,2,2,0,0,0,0,0,2,1,1,
 0x21,0x21,0,0,0,0,0xf211,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0x812,0x812,0x812,0x812,0x812,0x812,0x812,0x812,
@@ -541,13 +541,13 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,
 0x1812,0x1812,0x1812,0x1812,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,
 0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,
-0xe811,0xe811,0xe811,0xe811,0x92,0xff91,0x4f1a,0x4f3a,0x4f5a,0x4f79,0x4f99,0x92,0xff91,0x92,0xff91,0x92,
-0xff91,0x4fba,0x4fda,0x4ffa,0x501a,1,0x92,0xff91,1,0x92,0xff91,1,1,1,1,1,
-0x25,5,0x503a,0x503a,0x92,0xff91,0x92,0xff91,1,0,0,0,0,0,0,0x92,
+0xe811,0xe811,0xe811,0xe811,0x92,0xff91,0x4efa,0x4f1a,0x4f3a,0x4f59,0x4f79,0x92,0xff91,0x92,0xff91,0x92,
+0xff91,0x4f9a,0x4fba,0x4fda,0x4ffa,1,0x92,0xff91,1,0x92,0xff91,1,1,1,1,1,
+0x25,5,0x501a,0x501a,0x92,0xff91,0x92,0xff91,1,0,0,0,0,0,0,0x92,
 0xff91,0x92,0xff91,0x44,0x44,0x44,0x92,0xff91,0,0,0,0,0,0,0,0,
-0,0,0,0,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,
-0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,
-0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0,0x5059,0,0,0,0,0,0x5059,0,0,
+0,0,0,0,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,
+0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,
+0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0,0x5039,0,0,0,0,0,0x5039,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0x64,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,
@@ -562,7 +562,7 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
-0x92,0xff91,0x507a,0x50b9,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
+0x92,0xff91,0x505a,0x5099,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
 0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0,0x44,4,4,4,0,
 0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0,4,0x92,0xff91,0x92,0xff91,
 0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
@@ -573,11 +573,11 @@ static const uint16_t ucase_props_trieIndex[13148]={
 4,4,4,4,4,4,4,4,4,4,4,4,4,4,0x92,0xff91,
 0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,1,1,0x92,0xff91,
 0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
-5,1,1,1,1,1,1,1,1,0x92,0xff91,0x92,0xff91,0x50fa,0x92,0xff91,
-0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,4,4,4,0x92,0xff91,0x511a,1,0,
+5,1,1,1,1,1,1,1,1,0x92,0xff91,0x92,0xff91,0x50da,0x92,0xff91,
+0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,4,4,4,0x92,0xff91,0x50fa,1,0,
 0x92,0xff91,0x92,0xff91,0x1811,1,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
-0x92,0xff91,0x513a,0x515a,0x517a,0x519a,0x513a,1,0x51ba,0x51da,0x51fa,0x521a,0x92,0xff91,0x92,0xff91,
-0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0xe812,0x523a,0x525a,0x92,0xff91,0x92,0xff91,0,
+0x92,0xff91,0x511a,0x513a,0x515a,0x517a,0x511a,1,0x519a,0x51ba,0x51da,0x51fa,0x92,0xff91,0x92,0xff91,
+0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0xe812,0x521a,0x523a,0x92,0xff91,0x92,0xff91,0,
 0,0,0,0,0x92,0xff91,0,1,0,1,0x92,0xff91,0x92,0xff91,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,5,5,5,0x92,0xff91,0,5,5,1,0,0,0,0,0,
@@ -607,17 +607,17 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,0,
 0,0,0,4,4,0,0x64,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,
-1,1,1,1,1,1,1,1,1,1,1,0x5279,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,0x5259,1,1,1,1,
 1,1,1,4,5,5,5,5,1,1,1,1,1,1,1,1,
-1,5,4,4,0,0,0,0,0x5299,0x52c9,0x52f9,0x5329,0x5359,0x5389,0x53b9,0x53e9,
-0x5419,0x5449,0x5479,0x54a9,0x54d9,0x5509,0x5539,0x5569,0x5b99,0x5bc9,0x5bf9,0x5c29,0x5c59,0x5c89,0x5cb9,0x5ce9,
-0x5d19,0x5d49,0x5d79,0x5da9,0x5dd9,0x5e09,0x5e39,0x5e69,0x5e99,0x5ec9,0x5ef9,0x5f29,0x5f59,0x5f89,0x5fb9,0x5fe9,
-0x6019,0x6049,0x6079,0x60a9,0x60d9,0x6109,0x6139,0x6169,0x5599,0x55c9,0x55f9,0x5629,0x5659,0x5689,0x56b9,0x56e9,
-0x5719,0x5749,0x5779,0x57a9,0x57d9,0x5809,0x5839,0x5869,0x5899,0x58c9,0x58f9,0x5929,0x5959,0x5989,0x59b9,0x59e9,
-0x5a19,0x5a49,0x5a79,0x5aa9,0x5ad9,0x5b09,0x5b39,0x5b69,0,0,0,0,0,4,0,0,
+1,5,4,4,0,0,0,0,0x5279,0x52a9,0x52d9,0x5309,0x5339,0x5369,0x5399,0x53c9,
+0x53f9,0x5429,0x5459,0x5489,0x54b9,0x54e9,0x5519,0x5549,0x5b79,0x5ba9,0x5bd9,0x5c09,0x5c39,0x5c69,0x5c99,0x5cc9,
+0x5cf9,0x5d29,0x5d59,0x5d89,0x5db9,0x5de9,0x5e19,0x5e49,0x5e79,0x5ea9,0x5ed9,0x5f09,0x5f39,0x5f69,0x5f99,0x5fc9,
+0x5ff9,0x6029,0x6059,0x6089,0x60b9,0x60e9,0x6119,0x6149,0x5579,0x55a9,0x55d9,0x5609,0x5639,0x5669,0x5699,0x56c9,
+0x56f9,0x5729,0x5759,0x5789,0x57b9,0x57e9,0x5819,0x5849,0x5879,0x58a9,0x58d9,0x5909,0x5939,0x5969,0x5999,0x59c9,
+0x59f9,0x5a29,0x5a59,0x5a89,0x5ab9,0x5ae9,0x5b19,0x5b49,0,0,0,0,0,4,0,0,
 4,0,0,0,0,0x64,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0x6199,0x6219,0x6299,0x6319,0x63c9,0x6479,0x6519,0,
-0,0,0,0,0,0,0,0,0,0,0,0x65b9,0x6639,0x66b9,0x6739,0x67b9,
+0,0,0,0,0,0,0,0,0x6179,0x61f9,0x6279,0x62f9,0x63a9,0x6459,0x64e9,0,
+0,0,0,0,0,0,0,0,0,0,0,0x6589,0x6609,0x6689,0x6709,0x6789,
 0,0,0,0,0,0,0x64,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,
 4,4,4,4,4,4,4,4,4,4,4,4,0,0,0,4,
@@ -838,7 +838,7 @@ static const uint16_t ucase_props_trieIndex[13148]={
 0,0,0,0,0,0,0,0,0,0,0,0
 };
 
-static const uint16_t ucase_props_exceptions[1667]={
+static const uint16_t ucase_props_exceptions[1664]={
 0xc850,0x20,2,0x130,0x131,0x4810,0x20,0x841,0x6b,1,0x212a,0x841,0x73,1,0x17f,0x5c50,
 0x20,2,0x130,0x131,0x844,0x4b,1,0x212a,0x844,0x53,1,0x17f,0x806,0x3bc,0x39c,0x841,
 0xe5,1,0x212b,0x8c0,1,0x2220,0x73,0x73,0x53,0x53,0x53,0x73,0x1e9e,0x844,0xc5,1,
@@ -909,41 +909,40 @@ static const uint16_t ucase_props_exceptions[1667]={
 0x3b7,0x3b9,0x397,0x399,0x880,0x2220,0x3ae,0x3b9,0x389,0x399,0x389,0x345,0x880,0x2220,0x3b7,0x342,
 0x397,0x342,0x397,0x342,0x880,0x3330,0x3b7,0x342,0x3b9,0x397,0x342,0x399,0x397,0x342,0x345,0xc90,
 9,0x220,0x3b7,0x3b9,0x397,0x399,0x880,0x3330,0x3b9,0x308,0x300,0x399,0x308,0x300,0x399,0x308,
-0x300,0x8c0,1,0x3330,0x3b9,0x308,0x301,0x399,0x308,0x301,0x399,0x308,0x301,0x390,0x880,0x2220,
-0x3b9,0x342,0x399,0x342,0x399,0x342,0x880,0x3330,0x3b9,0x308,0x342,0x399,0x308,0x342,0x399,0x308,
-0x342,0x880,0x3330,0x3c5,0x308,0x300,0x3a5,0x308,0x300,0x3a5,0x308,0x300,0x8c0,1,0x3330,0x3c5,
-0x308,0x301,0x3a5,0x308,0x301,0x3a5,0x308,0x301,0x3b0,0x880,0x2220,0x3c1,0x313,0x3a1,0x313,0x3a1,
-0x313,0x880,0x2220,0x3c5,0x342,0x3a5,0x342,0x3a5,0x342,0x880,0x3330,0x3c5,0x308,0x342,0x3a5,0x308,
-0x342,0x3a5,0x308,0x342,0x880,0x2220,0x1f7c,0x3b9,0x1ffa,0x399,0x1ffa,0x345,0x890,9,0x220,0x3c9,
-0x3b9,0x3a9,0x399,0x880,0x2220,0x3ce,0x3b9,0x38f,0x399,0x38f,0x345,0x880,0x2220,0x3c9,0x342,0x3a9,
-0x342,0x3a9,0x342,0x880,0x3330,0x3c9,0x342,0x3b9,0x3a9,0x342,0x399,0x3a9,0x342,0x345,0xc90,9,
-0x220,0x3c9,0x3b9,0x3a9,0x399,0xc50,0x1d5d,1,0x3a9,0xc50,0x20bf,1,0x4b,0xc50,0x2046,1,
-0xc5,0xc10,0x29f7,0xc10,0xee6,0xc10,0x29e7,0xc10,0x2a2b,0xc10,0x2a28,0xc10,0x2a1c,0xc10,0x29fd,0xc10,
-0x2a1f,0xc10,0x2a1e,0xc10,0x2a3f,0xc10,0x1c60,0x841,0xa64b,1,0x1c88,0x844,0xa64a,1,0x1c88,0xc10,
-0x8a04,0xc10,0xa528,0xc10,0xa544,0xc10,0xa54f,0xc10,0xa54b,0xc10,0xa541,0xc10,0xa512,0xc10,0xa52a,0xc10,
-0xa515,0x810,0x3a0,0xc10,0xa543,0xc10,0x8a38,0xc10,0x3a0,0x806,0x13a0,0x13a0,0x806,0x13a1,0x13a1,0x806,
-0x13a2,0x13a2,0x806,0x13a3,0x13a3,0x806,0x13a4,0x13a4,0x806,0x13a5,0x13a5,0x806,0x13a6,0x13a6,0x806,0x13a7,
-0x13a7,0x806,0x13a8,0x13a8,0x806,0x13a9,0x13a9,0x806,0x13aa,0x13aa,0x806,0x13ab,0x13ab,0x806,0x13ac,0x13ac,
-0x806,0x13ad,0x13ad,0x806,0x13ae,0x13ae,0x806,0x13af,0x13af,0x806,0x13b0,0x13b0,0x806,0x13b1,0x13b1,0x806,
-0x13b2,0x13b2,0x806,0x13b3,0x13b3,0x806,0x13b4,0x13b4,0x806,0x13b5,0x13b5,0x806,0x13b6,0x13b6,0x806,0x13b7,
-0x13b7,0x806,0x13b8,0x13b8,0x806,0x13b9,0x13b9,0x806,0x13ba,0x13ba,0x806,0x13bb,0x13bb,0x806,0x13bc,0x13bc,
-0x806,0x13bd,0x13bd,0x806,0x13be,0x13be,0x806,0x13bf,0x13bf,0x806,0x13c0,0x13c0,0x806,0x13c1,0x13c1,0x806,
-0x13c2,0x13c2,0x806,0x13c3,0x13c3,0x806,0x13c4,0x13c4,0x806,0x13c5,0x13c5,0x806,0x13c6,0x13c6,0x806,0x13c7,
-0x13c7,0x806,0x13c8,0x13c8,0x806,0x13c9,0x13c9,0x806,0x13ca,0x13ca,0x806,0x13cb,0x13cb,0x806,0x13cc,0x13cc,
-0x806,0x13cd,0x13cd,0x806,0x13ce,0x13ce,0x806,0x13cf,0x13cf,0x806,0x13d0,0x13d0,0x806,0x13d1,0x13d1,0x806,
-0x13d2,0x13d2,0x806,0x13d3,0x13d3,0x806,0x13d4,0x13d4,0x806,0x13d5,0x13d5,0x806,0x13d6,0x13d6,0x806,0x13d7,
-0x13d7,0x806,0x13d8,0x13d8,0x806,0x13d9,0x13d9,0x806,0x13da,0x13da,0x806,0x13db,0x13db,0x806,0x13dc,0x13dc,
-0x806,0x13dd,0x13dd,0x806,0x13de,0x13de,0x806,0x13df,0x13df,0x806,0x13e0,0x13e0,0x806,0x13e1,0x13e1,0x806,
-0x13e2,0x13e2,0x806,0x13e3,0x13e3,0x806,0x13e4,0x13e4,0x806,0x13e5,0x13e5,0x806,0x13e6,0x13e6,0x806,0x13e7,
-0x13e7,0x806,0x13e8,0x13e8,0x806,0x13e9,0x13e9,0x806,0x13ea,0x13ea,0x806,0x13eb,0x13eb,0x806,0x13ec,0x13ec,
-0x806,0x13ed,0x13ed,0x806,0x13ee,0x13ee,0x806,0x13ef,0x13ef,0x880,0x2220,0x66,0x66,0x46,0x46,0x46,
-0x66,0x880,0x2220,0x66,0x69,0x46,0x49,0x46,0x69,0x880,0x2220,0x66,0x6c,0x46,0x4c,0x46,
-0x6c,0x880,0x3330,0x66,0x66,0x69,0x46,0x46,0x49,0x46,0x66,0x69,0x880,0x3330,0x66,0x66,
-0x6c,0x46,0x46,0x4c,0x46,0x66,0x6c,0x8c0,1,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,
-0xfb06,0x8c0,1,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0xfb05,0x880,0x2220,0x574,0x576,0x544,
-0x546,0x544,0x576,0x880,0x2220,0x574,0x565,0x544,0x535,0x544,0x565,0x880,0x2220,0x574,0x56b,0x544,
-0x53b,0x544,0x56b,0x880,0x2220,0x57e,0x576,0x54e,0x546,0x54e,0x576,0x880,0x2220,0x574,0x56d,0x544,
-0x53d,0x544,0x56d
+0x300,0x882,0x390,0x3330,0x3b9,0x308,0x301,0x399,0x308,0x301,0x399,0x308,0x301,0x880,0x2220,0x3b9,
+0x342,0x399,0x342,0x399,0x342,0x880,0x3330,0x3b9,0x308,0x342,0x399,0x308,0x342,0x399,0x308,0x342,
+0x880,0x3330,0x3c5,0x308,0x300,0x3a5,0x308,0x300,0x3a5,0x308,0x300,0x882,0x3b0,0x3330,0x3c5,0x308,
+0x301,0x3a5,0x308,0x301,0x3a5,0x308,0x301,0x880,0x2220,0x3c1,0x313,0x3a1,0x313,0x3a1,0x313,0x880,
+0x2220,0x3c5,0x342,0x3a5,0x342,0x3a5,0x342,0x880,0x3330,0x3c5,0x308,0x342,0x3a5,0x308,0x342,0x3a5,
+0x308,0x342,0x880,0x2220,0x1f7c,0x3b9,0x1ffa,0x399,0x1ffa,0x345,0x890,9,0x220,0x3c9,0x3b9,0x3a9,
+0x399,0x880,0x2220,0x3ce,0x3b9,0x38f,0x399,0x38f,0x345,0x880,0x2220,0x3c9,0x342,0x3a9,0x342,0x3a9,
+0x342,0x880,0x3330,0x3c9,0x342,0x3b9,0x3a9,0x342,0x399,0x3a9,0x342,0x345,0xc90,9,0x220,0x3c9,
+0x3b9,0x3a9,0x399,0xc50,0x1d5d,1,0x3a9,0xc50,0x20bf,1,0x4b,0xc50,0x2046,1,0xc5,0xc10,
+0x29f7,0xc10,0xee6,0xc10,0x29e7,0xc10,0x2a2b,0xc10,0x2a28,0xc10,0x2a1c,0xc10,0x29fd,0xc10,0x2a1f,0xc10,
+0x2a1e,0xc10,0x2a3f,0xc10,0x1c60,0x841,0xa64b,1,0x1c88,0x844,0xa64a,1,0x1c88,0xc10,0x8a04,0xc10,
+0xa528,0xc10,0xa544,0xc10,0xa54f,0xc10,0xa54b,0xc10,0xa541,0xc10,0xa512,0xc10,0xa52a,0xc10,0xa515,0x810,
+0x3a0,0xc10,0xa543,0xc10,0x8a38,0xc10,0x3a0,0x806,0x13a0,0x13a0,0x806,0x13a1,0x13a1,0x806,0x13a2,0x13a2,
+0x806,0x13a3,0x13a3,0x806,0x13a4,0x13a4,0x806,0x13a5,0x13a5,0x806,0x13a6,0x13a6,0x806,0x13a7,0x13a7,0x806,
+0x13a8,0x13a8,0x806,0x13a9,0x13a9,0x806,0x13aa,0x13aa,0x806,0x13ab,0x13ab,0x806,0x13ac,0x13ac,0x806,0x13ad,
+0x13ad,0x806,0x13ae,0x13ae,0x806,0x13af,0x13af,0x806,0x13b0,0x13b0,0x806,0x13b1,0x13b1,0x806,0x13b2,0x13b2,
+0x806,0x13b3,0x13b3,0x806,0x13b4,0x13b4,0x806,0x13b5,0x13b5,0x806,0x13b6,0x13b6,0x806,0x13b7,0x13b7,0x806,
+0x13b8,0x13b8,0x806,0x13b9,0x13b9,0x806,0x13ba,0x13ba,0x806,0x13bb,0x13bb,0x806,0x13bc,0x13bc,0x806,0x13bd,
+0x13bd,0x806,0x13be,0x13be,0x806,0x13bf,0x13bf,0x806,0x13c0,0x13c0,0x806,0x13c1,0x13c1,0x806,0x13c2,0x13c2,
+0x806,0x13c3,0x13c3,0x806,0x13c4,0x13c4,0x806,0x13c5,0x13c5,0x806,0x13c6,0x13c6,0x806,0x13c7,0x13c7,0x806,
+0x13c8,0x13c8,0x806,0x13c9,0x13c9,0x806,0x13ca,0x13ca,0x806,0x13cb,0x13cb,0x806,0x13cc,0x13cc,0x806,0x13cd,
+0x13cd,0x806,0x13ce,0x13ce,0x806,0x13cf,0x13cf,0x806,0x13d0,0x13d0,0x806,0x13d1,0x13d1,0x806,0x13d2,0x13d2,
+0x806,0x13d3,0x13d3,0x806,0x13d4,0x13d4,0x806,0x13d5,0x13d5,0x806,0x13d6,0x13d6,0x806,0x13d7,0x13d7,0x806,
+0x13d8,0x13d8,0x806,0x13d9,0x13d9,0x806,0x13da,0x13da,0x806,0x13db,0x13db,0x806,0x13dc,0x13dc,0x806,0x13dd,
+0x13dd,0x806,0x13de,0x13de,0x806,0x13df,0x13df,0x806,0x13e0,0x13e0,0x806,0x13e1,0x13e1,0x806,0x13e2,0x13e2,
+0x806,0x13e3,0x13e3,0x806,0x13e4,0x13e4,0x806,0x13e5,0x13e5,0x806,0x13e6,0x13e6,0x806,0x13e7,0x13e7,0x806,
+0x13e8,0x13e8,0x806,0x13e9,0x13e9,0x806,0x13ea,0x13ea,0x806,0x13eb,0x13eb,0x806,0x13ec,0x13ec,0x806,0x13ed,
+0x13ed,0x806,0x13ee,0x13ee,0x806,0x13ef,0x13ef,0x880,0x2220,0x66,0x66,0x46,0x46,0x46,0x66,0x880,
+0x2220,0x66,0x69,0x46,0x49,0x46,0x69,0x880,0x2220,0x66,0x6c,0x46,0x4c,0x46,0x6c,0x880,
+0x3330,0x66,0x66,0x69,0x46,0x46,0x49,0x46,0x66,0x69,0x880,0x3330,0x66,0x66,0x6c,0x46,
+0x46,0x4c,0x46,0x66,0x6c,0x882,0xfb06,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0x8c0,1,
+0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0xfb05,0x880,0x2220,0x574,0x576,0x544,0x546,0x544,0x576,
+0x880,0x2220,0x574,0x565,0x544,0x535,0x544,0x565,0x880,0x2220,0x574,0x56b,0x544,0x53b,0x544,0x56b,
+0x880,0x2220,0x57e,0x576,0x54e,0x546,0x54e,0x576,0x880,0x2220,0x574,0x56d,0x544,0x53d,0x544,0x56d
 };
 
 static const uint16_t ucase_props_unfold[370]={

+ 10 - 5
thirdparty/icu4c/common/ucasemap.cpp

@@ -679,14 +679,18 @@ void toUpper(uint32_t options,
             // Adding one only to the final vowel in a longer sequence
             // (which does not occur in normal writing) would require lookahead.
             // Set the same flag as for preserving an existing dialytika.
-            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
-                    (upper == 0x399 || upper == 0x3A5)) {
-                data |= HAS_DIALYTIKA;
+            if ((data & HAS_VOWEL) != 0 &&
+                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
+                    0 &&
+                (upper == 0x399 || upper == 0x3A5)) {
+                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
+                                                                           : HAS_COMBINING_DIALYTIKA;
             }
             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
             if ((data & HAS_YPOGEGRAMMENI) != 0) {
                 numYpogegrammeni = 1;
             }
+            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
             // Skip combining diacritics after this Greek letter.
             int32_t nextNextIndex = nextIndex;
             while (nextIndex < srcLength) {
@@ -704,7 +708,8 @@ void toUpper(uint32_t options,
                 }
             }
             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
-                nextState |= AFTER_VOWEL_WITH_ACCENT;
+                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
+                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
             }
             // Map according to Greek rules.
             UBool addTonos = false;
@@ -715,7 +720,7 @@ void toUpper(uint32_t options,
                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                 // Keep disjunctive "or" with (only) a tonos.
                 // We use the same "word boundary" conditions as for the Final_Sigma test.
-                if (i == nextIndex) {
+                if (hasPrecomposedAccent) {
                     upper = 0x389;  // Preserve the precomposed form.
                 } else {
                     addTonos = true;

+ 2 - 1
thirdparty/icu4c/common/ucasemap_imp.h

@@ -263,7 +263,8 @@ static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALY
 
 // State bits.
 static const uint32_t AFTER_CASED = 1;
-static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
+static const uint32_t AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
+static const uint32_t AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;
 
 uint32_t getLetterData(UChar32 c);
 

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 692 - 683
thirdparty/icu4c/common/uchar_props_data.h


+ 58 - 32
thirdparty/icu4c/common/ucurr.cpp

@@ -11,6 +11,8 @@
 
 #if !UCONFIG_NO_FORMATTING
 
+#include <utility>
+
 #include "unicode/ucurr.h"
 #include "unicode/locid.h"
 #include "unicode/ures.h"
@@ -20,6 +22,7 @@
 #include "unicode/usetiter.h"
 #include "unicode/utf16.h"
 #include "ustr_imp.h"
+#include "bytesinkutil.h"
 #include "charstr.h"
 #include "cmemory.h"
 #include "cstring.h"
@@ -520,14 +523,18 @@ ucurr_forLocale(const char* locale,
         return 0;
     }
 
-    char currency[4];  // ISO currency codes are alpha3 codes.
     UErrorCode localStatus = U_ZERO_ERROR;
-    int32_t resLen = uloc_getKeywordValue(locale, "currency",
-                                          currency, UPRV_LENGTHOF(currency), &localStatus);
-    if (U_SUCCESS(localStatus) && resLen == 3 && uprv_isInvariantString(currency, resLen)) {
+    CharString currency;
+    {
+        CharStringByteSink sink(&currency);
+        ulocimp_getKeywordValue(locale, "currency", sink, &localStatus);
+    }
+    int32_t resLen = currency.length();
+
+    if (U_SUCCESS(localStatus) && resLen == 3 && uprv_isInvariantString(currency.data(), resLen)) {
         if (resLen < buffCapacity) {
-            T_CString_toUpperCase(currency);
-            u_charsToUChars(currency, buff, resLen);
+            T_CString_toUpperCase(currency.data());
+            u_charsToUChars(currency.data(), buff, resLen);
         }
         return u_terminateUChars(buff, buffCapacity, resLen, ec);
     }
@@ -597,11 +604,15 @@ ucurr_forLocale(const char* locale,
 
     if ((U_FAILURE(localStatus)) && strchr(id, '_') != 0) {
         // We don't know about it.  Check to see if we support the variant.
-        uloc_getParent(locale, id, UPRV_LENGTHOF(id), ec);
+        CharString parent;
+        {
+            CharStringByteSink sink(&parent);
+            ulocimp_getParent(locale, sink, ec);
+        }
         *ec = U_USING_FALLBACK_WARNING;
-        // TODO: Loop over the shortened id rather than recursing and
+        // TODO: Loop over the parent rather than recursing and
         // looking again for a currency keyword.
-        return ucurr_forLocale(id, buff, buffCapacity, ec);
+        return ucurr_forLocale(parent.data(), buff, buffCapacity, ec);
     }
     if (*ec == U_ZERO_ERROR || localStatus != U_ZERO_ERROR) {
         // There is nothing to fallback to. Report the failure/warning if possible.
@@ -624,20 +635,22 @@ ucurr_forLocale(const char* locale,
  * @return true if the fallback happened; false if locale is already
  * root ("").
  */
-static UBool fallback(char *loc) {
-    if (!*loc) {
+static UBool fallback(CharString& loc) {
+    if (loc.isEmpty()) {
         return false;
     }
     UErrorCode status = U_ZERO_ERROR;
-    if (uprv_strcmp(loc, "en_GB") == 0) {
+    if (loc == "en_GB") {
         // HACK: See #13368.  We need "en_GB" to fall back to "en_001" instead of "en"
         // in order to consume the correct data strings.  This hack will be removed
         // when proper data sink loading is implemented here.
-        // NOTE: "001" adds 1 char over "GB".  However, both call sites allocate
-        // arrays with length ULOC_FULLNAME_CAPACITY (plenty of room for en_001).
-        uprv_strcpy(loc + 3, "001");
+        loc.truncate(3);
+        loc.append("001", status);
     } else {
-        uloc_getParent(loc, loc, (int32_t)uprv_strlen(loc), &status);
+        CharString tmp;
+        CharStringByteSink sink(&tmp);
+        ulocimp_getParent(loc.data(), sink, &status);
+        loc = std::move(tmp);
     }
  /*
     char *i = uprv_strrchr(loc, '_');
@@ -692,9 +705,12 @@ ucurr_getName(const char16_t* currency,
     // this function.
     UErrorCode ec2 = U_ZERO_ERROR;
 
-    char loc[ULOC_FULLNAME_CAPACITY];
-    uloc_getName(locale, loc, sizeof(loc), &ec2);
-    if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
+    CharString loc;
+    {
+        CharStringByteSink sink(&loc);
+        ulocimp_getName(locale, sink, &ec2);
+    }
+    if (U_FAILURE(ec2)) {
         *ec = U_ILLEGAL_ARGUMENT_ERROR;
         return 0;
     }
@@ -707,7 +723,7 @@ ucurr_getName(const char16_t* currency,
     
     const char16_t* s = nullptr;
     ec2 = U_ZERO_ERROR;
-    LocalUResourceBundlePointer rb(ures_open(U_ICUDATA_CURR, loc, &ec2));
+    LocalUResourceBundlePointer rb(ures_open(U_ICUDATA_CURR, loc.data(), &ec2));
 
     if (nameStyle == UCURR_NARROW_SYMBOL_NAME || nameStyle == UCURR_FORMAL_SYMBOL_NAME || nameStyle == UCURR_VARIANT_SYMBOL_NAME) {
         CharString key;
@@ -791,9 +807,12 @@ ucurr_getPluralName(const char16_t* currency,
     // this function.
     UErrorCode ec2 = U_ZERO_ERROR;
 
-    char loc[ULOC_FULLNAME_CAPACITY];
-    uloc_getName(locale, loc, sizeof(loc), &ec2);
-    if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
+    CharString loc;
+    {
+        CharStringByteSink sink(&loc);
+        ulocimp_getName(locale, sink, &ec2);
+    }
+    if (U_FAILURE(ec2)) {
         *ec = U_ILLEGAL_ARGUMENT_ERROR;
         return 0;
     }
@@ -803,7 +822,7 @@ ucurr_getPluralName(const char16_t* currency,
 
     const char16_t* s = nullptr;
     ec2 = U_ZERO_ERROR;
-    UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc, &ec2);
+    UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc.data(), &ec2);
 
     rb = ures_getByKey(rb, CURRENCYPLURALS, rb, &ec2);
 
@@ -904,13 +923,17 @@ getCurrencyNameCount(const char* loc, int32_t* total_currency_name_count, int32_
     *total_currency_name_count = 0;
     *total_currency_symbol_count = 0;
     const char16_t* s = nullptr;
-    char locale[ULOC_FULLNAME_CAPACITY] = "";
-    uprv_strcpy(locale, loc);
+    CharString locale;
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        locale.append(loc, status);
+        if (U_FAILURE(status)) { return; }
+    }
     const icu::Hashtable *currencySymbolsEquiv = getCurrSymbolsEquiv();
     for (;;) {
         UErrorCode ec2 = U_ZERO_ERROR;
         // TODO: ures_openDirect?
-        UResourceBundle* rb = ures_open(U_ICUDATA_CURR, locale, &ec2);
+        UResourceBundle* rb = ures_open(U_ICUDATA_CURR, locale.data(), &ec2);
         UResourceBundle* curr = ures_getByKey(rb, CURRENCIES, nullptr, &ec2);
         int32_t n = ures_getSize(curr);
         for (int32_t i=0; i<n; ++i) {
@@ -979,14 +1002,17 @@ collectCurrencyNames(const char* locale,
     // Look up the Currencies resource for the given locale.
     UErrorCode ec2 = U_ZERO_ERROR;
 
-    char loc[ULOC_FULLNAME_CAPACITY] = "";
-    uloc_getName(locale, loc, sizeof(loc), &ec2);
-    if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
+    CharString loc;
+    {
+        CharStringByteSink sink(&loc);
+        ulocimp_getName(locale, sink, &ec2);
+    }
+    if (U_FAILURE(ec2)) {
         ec = U_ILLEGAL_ARGUMENT_ERROR;
     }
 
     // Get maximum currency name count first.
-    getCurrencyNameCount(loc, total_currency_name_count, total_currency_symbol_count);
+    getCurrencyNameCount(loc.data(), total_currency_name_count, total_currency_symbol_count);
 
     *currencyNames = (CurrencyNameStruct*)uprv_malloc
         (sizeof(CurrencyNameStruct) * (*total_currency_name_count));
@@ -1014,7 +1040,7 @@ collectCurrencyNames(const char* locale,
     for (int32_t localeLevel = 0; ; ++localeLevel) {
         ec2 = U_ZERO_ERROR;
         // TODO: ures_openDirect
-        UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc, &ec2);
+        UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc.data(), &ec2);
         UResourceBundle* curr = ures_getByKey(rb, CURRENCIES, nullptr, &ec2);
         int32_t n = ures_getSize(curr);
         for (int32_t i=0; i<n; ++i) {

+ 1 - 1
thirdparty/icu4c/common/udata.cpp

@@ -1196,7 +1196,7 @@ doOpenChoice(const char *path, const char *type, const char *name,
                 *p = U_FILE_SEP_CHAR;
             }
 #if defined (UDATA_DEBUG)
-            fprintf(stderr, "Changed path from [%s] to [%s]\n", path, altSepPath.s);
+            fprintf(stderr, "Changed path from [%s] to [%s]\n", path, altSepPath.data());
 #endif
             path = altSepPath.data();
         }

+ 59 - 55
thirdparty/icu4c/common/uloc.cpp

@@ -103,12 +103,12 @@ static const char * const LANGUAGES[] = {
     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
     "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
-    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
+    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
     "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
-    "cs",  "csb", "cu",  "cv",  "cy",
+    "cs",  "csb", "csw", "cu",  "cv",  "cy",
     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
     "dyo", "dyu", "dz",  "dzg",
@@ -135,7 +135,7 @@ static const char * const LANGUAGES[] = {
     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
-    "kv",  "kw",  "ky",
+    "kv",  "kw",  "kxv", "ky",
     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
@@ -169,14 +169,14 @@ static const char * const LANGUAGES[] = {
     "sv",  "sw",  "swb", "syc", "syr", "szl",
     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
-    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
+    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
-    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
-    "vot", "vro", "vun",
+    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
+    "vo", "vot", "vro", "vun",
     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
-    "xal", "xh",  "xmf", "xog",
+    "xal", "xh",  "xmf", "xnr", "xog",
     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
     "zun", "zxx", "zza",
@@ -220,12 +220,12 @@ static const char * const LANGUAGES_3[] = {
     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
     "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
-    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
+    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
     "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
-    "ces", "csb", "chu", "chv", "cym",
+    "ces", "csb", "csw", "chu", "chv", "cym",
     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
     "dyo", "dyu", "dzo", "dzg",
@@ -252,7 +252,7 @@ static const char * const LANGUAGES_3[] = {
     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
-    "kom", "cor", "kir",
+    "kom", "cor", "kxv", "kir",
     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
@@ -286,14 +286,14 @@ static const char * const LANGUAGES_3[] = {
     "swe", "swa", "swb", "syc", "syr", "szl",
     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
-    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
+    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
-    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
-    "vot", "vro", "vun",
+    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
+    "vol", "vot", "vro", "vun",
     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
-    "xal", "xho", "xmf", "xog",
+    "xal", "xho", "xmf", "xnr", "xog",
     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
     "zun", "zxx", "zza",
@@ -477,25 +477,6 @@ static const CanonicalizationMap CANONICALIZE_MAP[] = {
 /* ### BCP47 Conversion *******************************************/
 /* Test if the locale id has BCP47 u extension and does not have '@' */
 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == nullptr && getShortestSubtagLength(localeID) == 1)
-/* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
-static const char* _ConvertBCP47(
-        const char* id, char* buffer, int32_t length,
-        UErrorCode* err, int32_t* pLocaleIdSize) {
-    const char* finalID;
-    int32_t localeIDSize = uloc_forLanguageTag(id, buffer, length, nullptr, err);
-    if (localeIDSize <= 0 || U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) {
-        finalID=id;
-        if (*err == U_STRING_NOT_TERMINATED_WARNING) {
-            *err = U_BUFFER_OVERFLOW_ERROR;
-        }
-    } else {
-        finalID=buffer;
-    }
-    if (pLocaleIdSize != nullptr) {
-        *pLocaleIdSize = localeIDSize;
-    }
-    return finalID;
-}
 /* Gets the size of the shortest subtag in the given localeID. */
 static int32_t getShortestSubtagLength(const char *localeID) {
     int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
@@ -762,7 +743,7 @@ ulocimp_getKeywordValue(const char* localeID,
     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 
     if(status && U_SUCCESS(*status) && localeID) {
-      char tempBuffer[ULOC_FULLNAME_CAPACITY];
+      CharString tempBuffer;
       const char* tmpLocaleID;
 
       if (keywordName == nullptr || keywordName[0] == 0) {
@@ -776,8 +757,9 @@ ulocimp_getKeywordValue(const char* localeID,
       }
 
       if (_hasBCP47Extension(localeID)) {
-          tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
-                                      sizeof(tempBuffer), status, nullptr);
+        CharStringByteSink sink(&tempBuffer);
+        ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
+        tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
       } else {
           tmpLocaleID=localeID;
       }
@@ -1406,7 +1388,7 @@ U_CAPI UEnumeration* U_EXPORT2
 uloc_openKeywords(const char* localeID,
                         UErrorCode* status)
 {
-    char tempBuffer[ULOC_FULLNAME_CAPACITY];
+    CharString tempBuffer;
     const char* tmpLocaleID;
 
     if(status==nullptr || U_FAILURE(*status)) {
@@ -1414,8 +1396,9 @@ uloc_openKeywords(const char* localeID,
     }
 
     if (_hasBCP47Extension(localeID)) {
-        tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
-                                    sizeof(tempBuffer), status, nullptr);
+        CharStringByteSink sink(&tempBuffer);
+        ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
+        tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
     } else {
         if (localeID==nullptr) {
             localeID=uloc_getDefault();
@@ -1489,7 +1472,7 @@ _canonicalize(const char* localeID,
     }
 
     int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
-    PreflightingLocaleIDBuffer tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
+    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
     CharString localeIDWithHyphens;  // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
     const char* origLocaleID;
     const char* tmpLocaleID;
@@ -1512,13 +1495,9 @@ _canonicalize(const char* localeID,
             }
         }
 
-        do {
-            // After this call tmpLocaleID may point to localeIDPtr which may
-            // point to either localeID or localeIDWithHyphens.data().
-            tmpLocaleID = _ConvertBCP47(localeIDPtr, tempBuffer.getBuffer(),
-                                        tempBuffer.getCapacity(), err,
-                                        &(tempBuffer.requestedCapacity));
-        } while (tempBuffer.needToTryAgain(err));
+        CharStringByteSink tempSink(&tempBuffer);
+        ulocimp_forLanguageTag(localeIDPtr, -1, tempSink, nullptr, err);
+        tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
     } else {
         if (localeID==nullptr) {
            localeID=uloc_getDefault();
@@ -1676,12 +1655,39 @@ uloc_getParent(const char*    localeID,
                char* parent,
                int32_t parentCapacity,
                UErrorCode* err)
+{
+    if (U_FAILURE(*err)) {
+        return 0;
+    }
+
+    CheckedArrayByteSink sink(parent, parentCapacity);
+    ulocimp_getParent(localeID, sink, err);
+
+    int32_t reslen = sink.NumberOfBytesAppended();
+
+    if (U_FAILURE(*err)) {
+        return reslen;
+    }
+
+    if (sink.Overflowed()) {
+        *err = U_BUFFER_OVERFLOW_ERROR;
+    } else {
+        u_terminateChars(parent, parentCapacity, reslen, err);
+    }
+
+    return reslen;
+}
+
+U_CAPI void U_EXPORT2
+ulocimp_getParent(const char* localeID,
+                  icu::ByteSink& sink,
+                  UErrorCode* err)
 {
     const char *lastUnderscore;
     int32_t i;
 
     if (U_FAILURE(*err))
-        return 0;
+        return;
 
     if (localeID == nullptr)
         localeID = uloc_getDefault();
@@ -1697,13 +1703,9 @@ uloc_getParent(const char*    localeID,
         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
             localeID += 3;
             i -= 3;
-            uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
-        } else if (parent != localeID) {
-            uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
         }
+        sink.Append(localeID, i);
     }
-
-    return u_terminateChars(parent, parentCapacity, i, err);
 }
 
 U_CAPI int32_t U_EXPORT2
@@ -1795,7 +1797,7 @@ uloc_getVariant(const char* localeID,
                 int32_t variantCapacity,
                 UErrorCode* err)
 {
-    char tempBuffer[ULOC_FULLNAME_CAPACITY];
+    CharString tempBuffer;
     const char* tmpLocaleID;
     int32_t i=0;
 
@@ -1804,7 +1806,9 @@ uloc_getVariant(const char* localeID,
     }
 
     if (_hasBCP47Extension(localeID)) {
-        tmpLocaleID =_ConvertBCP47(localeID, tempBuffer, sizeof(tempBuffer), err, nullptr);
+        CharStringByteSink sink(&tempBuffer);
+        ulocimp_forLanguageTag(localeID, -1, sink, nullptr, err);
+        tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
     } else {
         if (localeID==nullptr) {
            localeID=uloc_getDefault();

+ 27 - 83
thirdparty/icu4c/common/uloc_tag.cpp

@@ -1326,14 +1326,23 @@ _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st
                         attrBufLength = 0;
                         for (; i < len; i++) {
                             if (buf[i] != '-') {
-                                attrBuf[attrBufLength++] = buf[i];
+                                if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) {
+                                    attrBuf[attrBufLength++] = buf[i];
+                                } else {
+                                    *status = U_ILLEGAL_ARGUMENT_ERROR;
+                                    return;
+                                }
                             } else {
                                 i++;
                                 break;
                             }
                         }
                         if (attrBufLength > 0) {
-                            attrBuf[attrBufLength] = 0;
+                            if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) {
+                                attrBuf[attrBufLength] = 0;
+                            } else {
+                                *status = U_STRING_NOT_TERMINATED_WARNING;
+                            }
 
                         } else if (i >= len){
                             break;
@@ -1879,11 +1888,8 @@ static void
 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
     (void)hadPosix;
     char buf[ULOC_FULLNAME_CAPACITY];
-    char tmpAppend[ULOC_FULLNAME_CAPACITY];
     UErrorCode tmpStatus = U_ZERO_ERROR;
     int32_t len, i;
-    int32_t reslen = 0;
-    int32_t capacity = sizeof tmpAppend;
 
     if (U_FAILURE(*status)) {
         return;
@@ -1936,37 +1942,18 @@ _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool
                     }
 
                     if (writeValue) {
-                        if (reslen < capacity) {
-                            tmpAppend[reslen++] = SEP;
-                        }
+                        sink.Append("-", 1);
 
                         if (firstValue) {
-                            if (reslen < capacity) {
-                                tmpAppend[reslen++] = *PRIVATEUSE_KEY;
-                            }
-
-                            if (reslen < capacity) {
-                                tmpAppend[reslen++] = SEP;
-                            }
-
-                            len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
-                            if (reslen < capacity) {
-                                uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
-                            }
-                            reslen += len;
-
-                            if (reslen < capacity) {
-                                tmpAppend[reslen++] = SEP;
-                            }
-
+                            sink.Append(PRIVATEUSE_KEY, UPRV_LENGTHOF(PRIVATEUSE_KEY) - 1);
+                            sink.Append("-", 1);
+                            sink.Append(PRIVUSE_VARIANT_PREFIX, UPRV_LENGTHOF(PRIVUSE_VARIANT_PREFIX) - 1);
+                            sink.Append("-", 1);
                             firstValue = false;
                         }
 
                         len = (int32_t)uprv_strlen(pPriv);
-                        if (reslen < capacity) {
-                            uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
-                        }
-                        reslen += len;
+                        sink.Append(pPriv, len);
                     }
                 }
                 /* reset private use starting position */
@@ -1976,15 +1963,6 @@ _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool
             }
             p++;
         }
-
-        if (U_FAILURE(*status)) {
-            return;
-        }
-    }
-
-    if (U_SUCCESS(*status)) {
-        len = reslen;
-        sink.Append(tmpAppend, len);
     }
 }
 
@@ -2092,12 +2070,13 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
             int32_t oldTagLength = tagLen;
             if (tagLen < newTagLength) {
                 uprv_free(tagBuf);
-                tagBuf = (char*)uprv_malloc(newTagLength + 1);
+                // Change t->buf after the free and before return to avoid the second double free in
+                // the destructor of t when t is out of scope.
+                t->buf = tagBuf = (char*)uprv_malloc(newTagLength + 1);
                 if (tagBuf == nullptr) {
                     *status = U_MEMORY_ALLOCATION_ERROR;
                     return nullptr;
                 }
-                t->buf = tagBuf;
                 tagLen = newTagLength;
             }
             parsedLenDelta = checkLegacyLen - replacementLen;
@@ -2646,53 +2625,18 @@ ulocimp_toLanguageTag(const char* localeID,
                       UBool strict,
                       UErrorCode* status) {
     icu::CharString canonical;
-    int32_t reslen;
     UErrorCode tmpStatus = U_ZERO_ERROR;
     UBool hadPosix = false;
     const char* pKeywordStart;
 
     /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
-    int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
-    if (resultCapacity > 0) {
-        char* buffer;
-
-        for (;;) {
-            buffer = canonical.getAppendBuffer(
-                    /*minCapacity=*/resultCapacity,
-                    /*desiredCapacityHint=*/resultCapacity,
-                    resultCapacity,
-                    tmpStatus);
-
-            if (U_FAILURE(tmpStatus)) {
-                *status = tmpStatus;
-                return;
-            }
-
-            reslen =
-                uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
-
-            if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
-                break;
-            }
-
-            resultCapacity = reslen;
-            tmpStatus = U_ZERO_ERROR;
-        }
-
-        if (U_FAILURE(tmpStatus)) {
-            *status = U_ILLEGAL_ARGUMENT_ERROR;
-            return;
-        }
-
-        canonical.append(buffer, reslen, tmpStatus);
-        if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
-            tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
-        }
-
-        if (U_FAILURE(tmpStatus)) {
-            *status = tmpStatus;
-            return;
-        }
+    {
+        icu::CharStringByteSink canonicalSink(&canonical);
+        ulocimp_canonicalize(localeID, canonicalSink, &tmpStatus);
+    }
+    if (U_FAILURE(tmpStatus)) {
+        *status = tmpStatus;
+        return;
     }
 
     /* For handling special case - private use only tag */

+ 99 - 0
thirdparty/icu4c/common/ulocale.cpp

@@ -0,0 +1,99 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+//
+#include "unicode/errorcode.h"
+#include "unicode/stringpiece.h"
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "unicode/ulocale.h"
+#include "unicode/locid.h"
+
+#include "charstr.h"
+#include "cmemory.h"
+#include "ustr_imp.h"
+
+U_NAMESPACE_USE
+#define EXTERNAL(i) (reinterpret_cast<ULocale*>(i))
+#define CONST_INTERNAL(e) (reinterpret_cast<const icu::Locale*>(e))
+#define INTERNAL(e) (reinterpret_cast<icu::Locale*>(e))
+
+/**
+ * Creates a heap-allocated icu::Locale from a locale ID and returns it as an
+ * opaque ULocale handle.
+ *
+ * @param localeID the locale ID text
+ * @param length   number of bytes in localeID, or negative if it is NUL-terminated
+ * @param err      in/out error code; on failure the function returns nullptr
+ * @return the new handle, or nullptr on failure
+ */
+ULocale*
+ulocale_openForLocaleID(const char* localeID, int32_t length, UErrorCode* err) {
+    // Do not touch the input at all when a failure is already pending.
+    if (U_FAILURE(*err)) return nullptr;
+    // Copy into a CharString so that str.data() is NUL-terminated even when
+    // the caller passed an explicit length.
+    CharString str(length < 0 ? StringPiece(localeID) : StringPiece(localeID, length), *err);
+    if (U_FAILURE(*err)) return nullptr;
+    icu::Locale* result = icu::Locale::createFromName(str.data()).clone();
+    if (result == nullptr) {
+        // Treat a null clone() as an allocation failure so the caller can tell
+        // why no object was returned.
+        *err = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return EXTERNAL(result);
+}
+
+/**
+ * Creates a heap-allocated icu::Locale from a language tag via
+ * icu::Locale::forLanguageTag() and returns it as an opaque ULocale handle.
+ *
+ * @param tag    the language tag text
+ * @param length number of bytes in tag, or negative if it is NUL-terminated
+ * @param err    in/out error code; on failure the function returns nullptr
+ * @return the new handle, or nullptr on failure
+ */
+ULocale*
+ulocale_openForLanguageTag(const char* tag, int32_t length, UErrorCode* err) {
+    // Do not touch the input at all when a failure is already pending.
+    if (U_FAILURE(*err)) return nullptr;
+    Locale l = icu::Locale::forLanguageTag(length < 0 ? StringPiece(tag) : StringPiece(tag, length), *err);
+    if (U_FAILURE(*err)) return nullptr;
+    Locale* result = l.clone();
+    if (result == nullptr) {
+        // Treat a null clone() as an allocation failure so the caller can tell
+        // why no object was returned.
+        *err = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return EXTERNAL(result);
+}
+
+// Deletes the icu::Locale behind an opaque ULocale handle. A null handle is a
+// no-op because deleting a null pointer does nothing.
+void
+ulocale_close(ULocale* locale) {
+    icu::Locale* impl = INTERNAL(locale);
+    delete impl;
+}
+
+// Defines `const char* ulocale_get<N1>(const ULocale*)`, which forwards to
+// icu::Locale::get<N2>() and returns nullptr for a null handle.
+#define IMPL_ULOCALE_STRING_GETTER(N1, N2) \
+const char* ulocale_get ## N1(const ULocale* locale) { \
+    return (locale != nullptr) ? CONST_INTERNAL(locale)->get ## N2() : nullptr; \
+}
+
+// Shorthand for getters whose C-API suffix and icu::Locale method suffix match.
+#define IMPL_ULOCALE_STRING_IDENTICAL_GETTER(N) IMPL_ULOCALE_STRING_GETTER(N, N)
+
+// Defines `int32_t ulocale_get<N>(...)`: asks icu::Locale::get<N>() for the
+// value of `keyword`, writes it into valueBuffer through a
+// CheckedArrayByteSink, and returns the number of bytes the sink accepted.
+// keywordLength < 0 means `keyword` is NUL-terminated. A null locale sets
+// U_ILLEGAL_ARGUMENT_ERROR; an overflowing sink sets U_BUFFER_OVERFLOW_ERROR.
+#define IMPL_ULOCALE_GET_KEYWORD_VALUE(N) \
+int32_t ulocale_get ##N ( \
+    const ULocale* locale, const char* keyword, int32_t keywordLength, \
+    char* valueBuffer, int32_t bufferCapacity, UErrorCode *err) { \
+    if (U_FAILURE(*err)) return 0; \
+    if (locale == nullptr) { \
+        *err = U_ILLEGAL_ARGUMENT_ERROR; \
+        return 0; \
+    } \
+    CheckedArrayByteSink out(valueBuffer, bufferCapacity); \
+    const StringPiece key = \
+        keywordLength < 0 ? StringPiece(keyword) : StringPiece(keyword, keywordLength); \
+    CONST_INTERNAL(locale)->get ## N(key, out, *err); \
+    const int32_t written = out.NumberOfBytesAppended(); \
+    if (U_SUCCESS(*err)) { \
+        if (out.Overflowed()) { \
+            *err = U_BUFFER_OVERFLOW_ERROR; \
+        } else { \
+            u_terminateChars(valueBuffer, bufferCapacity, written, err); \
+        } \
+    } \
+    return written; \
+}
+
+// Defines `UEnumeration* ulocale_get<N>(const ULocale*, UErrorCode*)`: wraps
+// the result of icu::Locale::create<N>() with
+// uenum_openFromStringEnumeration(). A null locale sets
+// U_ILLEGAL_ARGUMENT_ERROR and returns nullptr.
+#define IMPL_ULOCALE_GET_KEYWORDS(N) \
+UEnumeration* ulocale_get ## N(const ULocale* locale, UErrorCode *err) { \
+    if (U_FAILURE(*err)) return nullptr; \
+    if (locale == nullptr) { \
+        *err = U_ILLEGAL_ARGUMENT_ERROR; \
+        return nullptr; \
+    } \
+    auto* keywords = CONST_INTERNAL(locale)->create ## N(*err); \
+    return uenum_openFromStringEnumeration(keywords, err); \
+}
+
+IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Language)  // ulocale_getLanguage -> Locale::getLanguage
+IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Script)  // ulocale_getScript -> Locale::getScript
+IMPL_ULOCALE_STRING_GETTER(Region, Country)  // ulocale_getRegion -> Locale::getCountry (names differ)
+IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Variant)  // ulocale_getVariant -> Locale::getVariant
+IMPL_ULOCALE_STRING_GETTER(LocaleID, Name)  // ulocale_getLocaleID -> Locale::getName (names differ)
+IMPL_ULOCALE_STRING_IDENTICAL_GETTER(BaseName)  // ulocale_getBaseName -> Locale::getBaseName
+IMPL_ULOCALE_GET_KEYWORD_VALUE(KeywordValue)  // ulocale_getKeywordValue -> Locale::getKeywordValue
+IMPL_ULOCALE_GET_KEYWORD_VALUE(UnicodeKeywordValue)  // ulocale_getUnicodeKeywordValue -> Locale::getUnicodeKeywordValue
+IMPL_ULOCALE_GET_KEYWORDS(Keywords)  // ulocale_getKeywords -> Locale::createKeywords
+IMPL_ULOCALE_GET_KEYWORDS(UnicodeKeywords)  // ulocale_getUnicodeKeywords -> Locale::createUnicodeKeywords
+
+// Reports whether the wrapped icu::Locale is bogus; a null handle yields false.
+bool ulocale_isBogus(const ULocale* locale) {
+    return locale != nullptr && CONST_INTERNAL(locale)->isBogus();
+}
+
+/*eof*/

+ 156 - 0
thirdparty/icu4c/common/ulocbuilder.cpp

@@ -0,0 +1,156 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include <utility>
+
+#include "unicode/bytestream.h"
+#include "unicode/localebuilder.h"
+#include "unicode/locid.h"
+#include "unicode/stringpiece.h"
+#include "unicode/umachine.h"
+#include "unicode/ulocbuilder.h"
+#include "cstring.h"
+#include "ustr_imp.h"
+
+using icu::CheckedArrayByteSink;
+using icu::StringPiece;
+
+#define EXTERNAL(i) (reinterpret_cast<ULocaleBuilder*>(i))
+#define INTERNAL(e) (reinterpret_cast<icu::LocaleBuilder*>(e))
+#define CONST_INTERNAL(e) (reinterpret_cast<const icu::LocaleBuilder*>(e))
+
+// Allocates a new icu::LocaleBuilder and returns it as an opaque
+// ULocaleBuilder handle.
+ULocaleBuilder* ulocbld_open() {
+    icu::LocaleBuilder* impl = new icu::LocaleBuilder();
+    return EXTERNAL(impl);
+}
+
+// Deletes the icu::LocaleBuilder behind the handle; a null handle is ignored.
+void ulocbld_close(ULocaleBuilder* builder) {
+    if (builder != nullptr) {
+        delete INTERNAL(builder);
+    }
+}
+
+// Builds an icu::Locale from `locale` and passes it to
+// LocaleBuilder::setLocale(). `length` is the byte length of `locale`, or
+// negative when it is NUL-terminated. A null builder is ignored.
+void ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length) {
+    if (builder == nullptr) return;
+    icu::Locale parsed;
+    if (length < 0 || locale[length] == '\0') {
+        // Already NUL-terminated: hand the pointer to Locale directly.
+        parsed = icu::Locale(locale);
+    } else if (length < ULOC_FULLNAME_CAPACITY) {
+        // locale is not NUL-terminated but the Locale API requires a
+        // terminator, so build a NUL-terminated copy on the stack.
+        char terminated[ULOC_FULLNAME_CAPACITY];
+        uprv_memcpy(terminated, locale, length);
+        terminated[length] = '\0';
+        parsed = icu::Locale(terminated);
+    } else {
+        // Too long for the stack copy: mark the locale as bogus instead.
+        parsed.setToBogus();
+    }
+    INTERNAL(builder)->setLocale(parsed);
+}
+
+// Passes `*locale` to LocaleBuilder::setLocale() and then closes `locale`
+// with ulocale_close(). When `builder` is null the function returns before
+// closing `locale`, so the handle is left untouched in that case.
+void
+ulocbld_adoptULocale(ULocaleBuilder* builder, ULocale* locale) {
+    if (builder == nullptr) return;
+    // Dereferencing a null handle would be undefined behavior; there is
+    // nothing to adopt, so leave the builder unchanged.
+    if (locale == nullptr) return;
+    INTERNAL(builder)->setLocale(*(reinterpret_cast<const icu::Locale*>(locale)));
+    ulocale_close(locale);
+}
+
+// Builds a StringPiece from (s, l); a negative l means s is NUL-terminated.
+#define STRING_PIECE(s, l) (((l) >= 0) ? StringPiece((s), (l)) : StringPiece(s))
+
+// Defines `void ulocbld_<N>(ULocaleBuilder*, const char*, int32_t)`, which
+// forwards the string to icu::LocaleBuilder::<N>() and ignores a null builder.
+#define IMPL_ULOCBLD_SETTER(N) \
+void ulocbld_##N(ULocaleBuilder* bld, const char* s, int32_t l) { \
+    if (bld != nullptr) { \
+        INTERNAL(bld)->N(STRING_PIECE(s, l)); \
+    } \
+}
+
+IMPL_ULOCBLD_SETTER(setLanguageTag)
+IMPL_ULOCBLD_SETTER(setLanguage)
+IMPL_ULOCBLD_SETTER(setScript)
+IMPL_ULOCBLD_SETTER(setRegion)
+IMPL_ULOCBLD_SETTER(setVariant)
+IMPL_ULOCBLD_SETTER(addUnicodeLocaleAttribute)
+IMPL_ULOCBLD_SETTER(removeUnicodeLocaleAttribute)
+
+// Forwards (key, value) to LocaleBuilder::setExtension(). A negative length
+// means `value` is NUL-terminated; a null builder is ignored.
+void ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length) {
+    if (builder != nullptr) {
+        const StringPiece valuePiece = STRING_PIECE(value, length);
+        INTERNAL(builder)->setExtension(key, valuePiece);
+    }
+}
+
+// Forwards (key, type) to LocaleBuilder::setUnicodeLocaleKeyword(). A negative
+// length means the matching string is NUL-terminated; a null builder is ignored.
+void ulocbld_setUnicodeLocaleKeyword(
+    ULocaleBuilder* builder, const char* key, int32_t keyLength,
+    const char* type, int32_t typeLength) {
+    if (builder == nullptr) return;
+    const StringPiece keyPiece = STRING_PIECE(key, keyLength);
+    const StringPiece typePiece = STRING_PIECE(type, typeLength);
+    INTERNAL(builder)->setUnicodeLocaleKeyword(keyPiece, typePiece);
+}
+
+// Calls LocaleBuilder::clear() on the wrapped builder; a null handle is ignored.
+void ulocbld_clear(ULocaleBuilder* builder) {
+    if (builder != nullptr) {
+        INTERNAL(builder)->clear();
+    }
+}
+
+// Calls LocaleBuilder::clearExtensions() on the wrapped builder; a null handle
+// is ignored.
+void ulocbld_clearExtensions(ULocaleBuilder* builder) {
+    if (builder != nullptr) {
+        INTERNAL(builder)->clearExtensions();
+    }
+}
+
+
+// Builds a Locale from the builder and returns a heap-allocated clone of it as
+// an opaque ULocale handle. Returns nullptr and sets *err when the builder is
+// null (U_ILLEGAL_ARGUMENT_ERROR), when build() fails, or when clone() yields
+// nullptr (U_MEMORY_ALLOCATION_ERROR).
+ULocale* ulocbld_buildULocale(ULocaleBuilder* builder, UErrorCode* err) {
+    if (builder == nullptr) {
+        *err = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    icu::Locale built = INTERNAL(builder)->build(*err);
+    if (U_FAILURE(*err)) return nullptr;
+    icu::Locale* heapCopy = built.clone();
+    if (heapCopy == nullptr) {
+        *err = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return reinterpret_cast<ULocale*>(heapCopy);
+}
+
+// Builds a Locale from the builder and copies its getName() string into
+// `buffer` when it fits. Returns the value of u_terminateChars() for the full
+// name length, or 0 when the builder is null or build() fails.
+int32_t ulocbld_buildLocaleID(ULocaleBuilder* builder,
+                              char* buffer, int32_t bufferCapacity, UErrorCode* err) {
+    if (builder == nullptr) {
+        *err = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    icu::Locale built = INTERNAL(builder)->build(*err);
+    if (U_FAILURE(*err)) return 0;
+    const char* name = built.getName();
+    const int32_t nameLength = (int32_t)(uprv_strlen(name));
+    // Copy only a non-empty name that fits; termination and overflow
+    // reporting are left to u_terminateChars().
+    if (nameLength > 0 && nameLength <= bufferCapacity) {
+        uprv_memcpy(buffer, name, nameLength);
+    }
+    return u_terminateChars(buffer, bufferCapacity, nameLength, err);
+}
+
+// Builds a Locale from the builder and writes its language tag into `buffer`
+// through a CheckedArrayByteSink. Returns the number of bytes the sink
+// accepted, or 0 when the builder is null or build() fails. An overflowing
+// sink sets U_BUFFER_OVERFLOW_ERROR.
+int32_t ulocbld_buildLanguageTag(ULocaleBuilder* builder,
+                  char* buffer, int32_t bufferCapacity, UErrorCode* err) {
+    if (builder == nullptr) {
+        *err = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    icu::Locale built = INTERNAL(builder)->build(*err);
+    if (U_FAILURE(*err)) return 0;
+    CheckedArrayByteSink out(buffer, bufferCapacity);
+    built.toLanguageTag(out, *err);
+    const int32_t written = out.NumberOfBytesAppended();
+    if (U_SUCCESS(*err)) {
+        if (out.Overflowed()) {
+            *err = U_BUFFER_OVERFLOW_ERROR;
+        } else {
+            u_terminateChars(buffer, bufferCapacity, written, err);
+        }
+    }
+    return written;
+}
+
+// Forwards to LocaleBuilder::copyErrorTo(). A null builder sets
+// U_ILLEGAL_ARGUMENT_ERROR in *outErrorCode and returns true.
+UBool ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode) {
+    if (builder != nullptr) {
+        return CONST_INTERNAL(builder)->copyErrorTo(*outErrorCode);
+    }
+    *outErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+    return true;
+}

+ 7 - 68
thirdparty/icu4c/common/ulocimp.h

@@ -92,6 +92,11 @@ ulocimp_getKeywordValue(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode* status);
 
+U_CAPI void U_EXPORT2
+ulocimp_getParent(const char* localeID,
+                  icu::ByteSink& sink,
+                  UErrorCode* err);
+
 /**
  * Writes a well-formed language tag for this locale ID.
  *
@@ -237,6 +242,7 @@ ulocimp_addLikelySubtags(const char* localeID,
  *
  * @param localeID The locale to minimize
  * @param sink The output sink receiving the maximized locale
+ * @param favorScript favor to keep script if true, region if false.
  * @param err Error information if minimizing the locale failed.  If the length
  * of the localeID and the null-terminator is greater than the maximum allowed size,
  * or the localeId is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR.
@@ -245,6 +251,7 @@ ulocimp_addLikelySubtags(const char* localeID,
 U_CAPI void U_EXPORT2
 ulocimp_minimizeSubtags(const char* localeID,
                         icu::ByteSink& sink,
+                        bool favorScript,
                         UErrorCode* err);
 
 U_CAPI const char * U_EXPORT2
@@ -307,72 +314,4 @@ U_CAPI const char* const* ulocimp_getKnownCanonicalizedLocaleForTest(int32_t* le
 // Return true if the value is already canonicalized.
 U_CAPI bool ulocimp_isCanonicalizedLocaleForTest(const char* localeName);
 
-/**
- * A utility class for handling locale IDs that may be longer than ULOC_FULLNAME_CAPACITY.
- * This encompasses all of the logic to allocate a temporary locale ID buffer on the stack,
- * and then, if it's not big enough, reallocate it on the heap and try again.
- *
- * You use it like this:
- * UErrorCode err = U_ZERO_ERROR;
- *
- * PreflightingLocaleIDBuffer tempBuffer;
- * do {
- *     tempBuffer.requestedCapacity = uloc_doSomething(localeID, tempBuffer.getBuffer(), tempBuffer.getCapacity(), &err);
- * } while (tempBuffer.needToTryAgain(&err));
- * if (U_SUCCESS(err)) {
- *     uloc_doSomethingWithTheResult(tempBuffer.getBuffer());
- * }
- */
-class PreflightingLocaleIDBuffer {
-private:
-    char stackBuffer[ULOC_FULLNAME_CAPACITY];
-    char* heapBuffer = nullptr;
-    int32_t capacity = ULOC_FULLNAME_CAPACITY;
-    
-public:
-    int32_t requestedCapacity = ULOC_FULLNAME_CAPACITY;
-
-    // No heap allocation. Use only on the stack.
-    static void* U_EXPORT2 operator new(size_t) noexcept = delete;
-    static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
-#if U_HAVE_PLACEMENT_NEW
-    static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
-#endif
-
-    PreflightingLocaleIDBuffer() {}
-    
-    ~PreflightingLocaleIDBuffer() { uprv_free(heapBuffer); }
-    
-    char* getBuffer() {
-        if (heapBuffer == nullptr) {
-            return stackBuffer;
-        } else {
-            return heapBuffer;
-        }
-    }
-    
-    int32_t getCapacity() {
-        return capacity;
-    }
-    
-    bool needToTryAgain(UErrorCode* err) {
-        if (heapBuffer != nullptr) {
-            return false;
-        }
-    
-        if (*err == U_BUFFER_OVERFLOW_ERROR || *err == U_STRING_NOT_TERMINATED_WARNING) {
-            int32_t newCapacity = requestedCapacity + 2;    // one for the terminating null, one just for paranoia
-            heapBuffer = static_cast<char*>(uprv_malloc(newCapacity));
-            if (heapBuffer == nullptr) {
-                *err = U_MEMORY_ALLOCATION_ERROR;
-            } else {
-                *err = U_ZERO_ERROR;
-                capacity = newCapacity;
-            }
-            return U_SUCCESS(*err);
-        }
-        return false;
-    }
-};
-
 #endif

+ 1 - 0
thirdparty/icu4c/common/unicode/brkiter.h

@@ -649,6 +649,7 @@ private:
     /** @internal (private) */
     char actualLocale[ULOC_FULLNAME_CAPACITY];
     char validLocale[ULOC_FULLNAME_CAPACITY];
+    char requestLocale[ULOC_FULLNAME_CAPACITY];
 };
 
 #ifndef U_HIDE_DEPRECATED_API

+ 1 - 1
thirdparty/icu4c/common/unicode/docmain.h

@@ -114,7 +114,7 @@
  *   </tr>
  *   <tr>
  *     <td>Locales </td>
- *     <td>uloc.h</a></td>
+ *     <td>uloc.h, ulocale.h, ulocbuilder.h</a></td>
  *     <td>icu::Locale, icu::LocaleBuilder, icu::LocaleMatcher</td>
  *   </tr>
  *   <tr>

+ 13 - 1
thirdparty/icu4c/common/unicode/locid.h

@@ -984,7 +984,10 @@ public:
     static const char* const* U_EXPORT2 getISOCountries();
 
     /**
-     * Gets a list of all available language codes defined in ISO 639.  This is a pointer
+     * Returns a list of all unique language codes defined in ISO 639.
+     * They can be 2 or 3 letter codes, as defined by
+     * <a href="https://www.ietf.org/rfc/bcp/bcp47.html#section-2.2.1">
+     * BCP 47, section 2.2.1</a>. This is a pointer
      * to an array of pointers to arrays of char.  All of these pointers are owned
      * by ICU-- do not delete them, and do not write through them.  The array is
      * terminated with a null pointer.
@@ -1110,6 +1113,15 @@ protected: /* only protected for testing purposes. DO NOT USE. */
      * @internal
      */
     void setFromPOSIXID(const char *posixID);
+    /**
+     * Minimize the subtags for this Locale, per the algorithm described
+     * in the CLDR technical report: http://www.unicode.org/reports/tr35/#Likely_Subtags
+     * @param favorScript favor to keep script if true, to keep region if false.
+     * @param status  error information if minimizing this Locale failed.
+     *                If this Locale is not well-formed, the error code is
+     *                U_ILLEGAL_ARGUMENT_ERROR.
+     * @internal
+     */
+    void minimizeSubtags(bool favorScript, UErrorCode& status);
 #endif  /* U_HIDE_INTERNAL_API */
 
 private:

+ 24 - 2
thirdparty/icu4c/common/unicode/normalizer2.h

@@ -147,7 +147,10 @@ public:
     getNFKDInstance(UErrorCode &errorCode);
 
     /**
-     * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
+     * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
+     * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
+     * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
+     *
      * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
      * Returns an unmodifiable singleton instance. Do not delete it.
      * @param errorCode Standard ICU error code. Its input value must
@@ -160,6 +163,25 @@ public:
     static const Normalizer2 *
     getNFKCCasefoldInstance(UErrorCode &errorCode);
 
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
+     * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
+     * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
+     *
+     * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
+     * Returns an unmodifiable singleton instance. Do not delete it.
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @return the requested Normalizer2, if successful
+     * @draft ICU 74
+     */
+    static const Normalizer2 *
+    getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
+#endif  // U_HIDE_DRAFT_API
+
     /**
      * Returns a Normalizer2 instance which uses the specified data file
      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
@@ -172,7 +194,7 @@ public:
      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
      *
      * @param packageName nullptr for ICU built-in data, otherwise application data package name
-     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
+     * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
      * @param mode normalization mode (compose or decompose etc.)
      * @param errorCode Standard ICU error code. Its input value must
      *                  pass the U_SUCCESS() test, or else the function returns

+ 81 - 1
thirdparty/icu4c/common/unicode/rbbi.h

@@ -43,6 +43,69 @@ class  RBBIDataWrapper;
 class  UnhandledEngine;
 class  UStack;
 
+
+#ifndef U_HIDE_DRAFT_API
+/**
+ * The ExternalBreakEngine class defines an abstract interface that lets the host environment
+ * provide a low-level facility for breaking Unicode text in scripts whose text boundaries
+ * cannot be handled by the upper-level rule-based logic, for example Chinese and Japanese
+ * word breaking, and Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
+ * The host environment implements one or more subclasses of ExternalBreakEngine and
+ * registers them at initialization time by calling
+ * RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopts and owns the engine and will
+ * delete the registered external engine at the proper time during the clean-up
+ * event.
+ * @internal ICU 74 technology preview
+ */
+class ExternalBreakEngine : public UObject {
+  public:
+    /**
+     * Destructor.
+     * @internal ICU 74 technology preview
+     */
+    virtual ~ExternalBreakEngine() {}
+
+    /**
+     * <p>Indicates whether this engine handles a particular character when
+     * the RuleBasedBreakIterator is used for a particular locale. This method is used
+     * by the RuleBasedBreakIterator to find a break engine.</p>
+     * @param c A character which begins a run that the engine might handle.
+     * @param locale    The locale.
+     * @return true if this engine handles the particular character for that locale.
+     * @internal ICU 74 technology preview
+     */
+    virtual bool isFor(UChar32 c, const char* locale) const = 0;
+
+    /**
+     * <p>Indicates whether this engine handles a particular character. This method is
+     * used by the RuleBasedBreakIterator, after it has already found a break engine, to see which
+     * characters after the first one can be handled by this break engine.</p>
+     * @param c A character that the engine might handle.
+     * @return true if this engine handles the particular character.
+     * @internal ICU 74 technology preview
+     */
+    virtual bool handles(UChar32 c) const = 0;
+
+    /**
+     * <p>Divides up a range of text handled by this break engine.</p>
+     *
+     * @param text A UText representing the text
+     * @param start The start of the range of known characters
+     * @param end The end of the range of known characters
+     * @param foundBreaks Output C array of int32_t break positions, or
+     * nullptr
+     * @param foundBreaksCapacity The capacity of foundBreaks
+     * @param status Information on any errors encountered.
+     * @return The number of breaks found
+     * @internal ICU 74 technology preview
+     */
+     virtual int32_t fillBreaks(UText* text,  int32_t start, int32_t end,
+                               int32_t* foundBreaks, int32_t foundBreaksCapacity,
+                               UErrorCode& status) const = 0;
+};
+#endif  /* U_HIDE_DRAFT_API */
+
+
 /**
  *
  * A subclass of BreakIterator whose behavior is specified using a list of rules.
@@ -716,9 +779,10 @@ private:
      * This function returns the appropriate LanguageBreakEngine for a
      * given character c.
      * @param c         A character in the dictionary set
+     * @param locale    The locale.
      * @internal (private)
      */
-    const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
+    const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
 
   public:
 #ifndef U_HIDE_INTERNAL_API
@@ -734,8 +798,24 @@ private:
      */
     void dumpTables();
 #endif  /* U_HIDE_INTERNAL_API */
+
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Register a new external break engine. The external break engine will be adopted.
+     * Because ICU may choose to cache break engine internally, this must
+     * be called at application startup, prior to any calls to
+     * object methods of RuleBasedBreakIterator to avoid undefined behavior.
+     * @param toAdopt the ExternalBreakEngine instance to be adopted
+     * @param status the in/out status code, no special meanings are assigned
+     * @internal ICU 74 technology preview
+     */
+    static void U_EXPORT2 registerExternalBreakEngine(
+                  ExternalBreakEngine* toAdopt, UErrorCode& status);
+#endif  /* U_HIDE_DRAFT_API */
+
 };
 
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

+ 40 - 4
thirdparty/icu4c/common/unicode/uchar.h

@@ -60,7 +60,7 @@ U_CDECL_BEGIN
  * @see u_getUnicodeVersion
  * @stable ICU 2.0
  */
-#define U_UNICODE_VERSION "15.0"
+#define U_UNICODE_VERSION "15.1"
 
 /**
  * \file
@@ -532,12 +532,33 @@ typedef enum UProperty {
      * @stable ICU 70
      */
     UCHAR_RGI_EMOJI=71,
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Binary property IDS_Unary_Operator.
+     * For programmatic determination of Ideographic Description Sequences.
+     *
+     * @draft ICU 74
+     */
+    UCHAR_IDS_UNARY_OPERATOR=72,
+    /**
+     * Binary property ID_Compat_Math_Start.
+     * Used in mathematical identifier profile in UAX #31.
+     * @draft ICU 74
+     */
+    UCHAR_ID_COMPAT_MATH_START=73,
+    /**
+     * Binary property ID_Compat_Math_Continue.
+     * Used in mathematical identifier profile in UAX #31.
+     * @draft ICU 74
+     */
+    UCHAR_ID_COMPAT_MATH_CONTINUE=74,
+#endif  // U_HIDE_DRAFT_API
 #ifndef U_HIDE_DEPRECATED_API
     /**
      * One more than the last constant for binary Unicode properties.
      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
      */
-    UCHAR_BINARY_LIMIT=72,
+    UCHAR_BINARY_LIMIT=75,
 #endif  // U_HIDE_DEPRECATED_API
 
     /** Enumerated property Bidi_Class.
@@ -1900,6 +1921,11 @@ enum UBlockCode {
     /** @stable ICU 72 */
     UBLOCK_NAG_MUNDARI = 327, /*[1E4D0]*/
 
+    // New block in Unicode 15.1
+
+    /** @stable ICU 74 */
+    UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I = 328, /*[2EBF0]*/
+
 #ifndef U_HIDE_DEPRECATED_API
     /**
      * One more than the highest normal UBlockCode value.
@@ -1907,7 +1933,7 @@ enum UBlockCode {
      *
      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
      */
-    UBLOCK_COUNT = 328,
+    UBLOCK_COUNT = 329,
 #endif  // U_HIDE_DEPRECATED_API
 
     /** @stable ICU 2.0 */
@@ -2439,6 +2465,16 @@ typedef enum ULineBreak {
     U_LB_E_MODIFIER = 41,        /*[EM]*/
     /** @stable ICU 58 */
     U_LB_ZWJ = 42,               /*[ZWJ]*/
+    /** @stable ICU 74 */
+    U_LB_AKSARA = 43,            /*[AK]*/
+    /** @stable ICU 74 */
+    U_LB_AKSARA_PREBASE = 44,    /*[AP]*/
+    /** @stable ICU 74 */
+    U_LB_AKSARA_START = 45,      /*[AS]*/
+    /** @stable ICU 74 */
+    U_LB_VIRAMA_FINAL = 46,      /*[VF]*/
+    /** @stable ICU 74 */
+    U_LB_VIRAMA = 47,            /*[VI]*/
 #ifndef U_HIDE_DEPRECATED_API
     /**
      * One more than the highest normal ULineBreak value.
@@ -2446,7 +2482,7 @@ typedef enum ULineBreak {
      *
      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
      */
-    U_LB_COUNT = 43
+    U_LB_COUNT = 48
 #endif  // U_HIDE_DEPRECATED_API
 } ULineBreak;
 

+ 229 - 0
thirdparty/icu4c/common/unicode/ulocale.h

@@ -0,0 +1,229 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef ULOCALE_H
+#define ULOCALE_H
+
+#include "unicode/localpointer.h"
+#include "unicode/uenum.h"
+#include "unicode/utypes.h"
+
+/**
+ * \file
+ * \brief C API: Locale ID functionality similar to C++ class Locale
+ */
+
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Opaque C service object type for the locale API
+ * @draft ICU 74
+ */
+struct ULocale;
+
+/**
+ * C typedef for struct ULocale.
+ * @draft ICU 74
+ */
+typedef struct ULocale ULocale;
+
+/**
+ * Constructs an ULocale from the locale ID.
+ * The created ULocale should be destroyed by calling
+ * ulocale_close();
+ * @param localeID the locale, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the locale; if negative, then the locale need to be
+ *               null terminated.
+ * @param err the error code
+ * @return the locale.
+ *
+ * @draft ICU 74
+ */
+U_CAPI ULocale* U_EXPORT2
+ulocale_openForLocaleID(const char* localeID, int32_t length, UErrorCode* err);
+
+/**
+ * Constructs an ULocale from the provided IETF BCP 47 language tag.
+ * The created ULocale should be destroyed by calling
+ * ulocale_close();
+ * @param tag the language tag, defined as IETF BCP 47 language tag, const
+ *            char* pointer (need not be terminated when the length is non-negative)
+ * @param length the length of the tag; if negative, then the tag need to be
+ *               null terminated.
+ * @param err the error code
+ * @return the locale.
+ *
+ * @draft ICU 74
+ */
+U_CAPI ULocale* U_EXPORT2
+ulocale_openForLanguageTag(const char* tag, int32_t length, UErrorCode* err);
+
+/**
+ * Close the locale and destroy its internal state.
+ *
+ * @param locale the locale
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocale_close(ULocale* locale);
+
+/**
+ * Returns the locale's ISO-639 language code.
+ *
+ * @param locale the locale
+ * @return      the language code of the locale.
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getLanguage(const ULocale* locale);
+
+/**
+ * Returns the locale's ISO-15924 abbreviation script code.
+ *
+ * @param locale the locale
+ * @return      A pointer to the script.
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getScript(const ULocale* locale);
+
+/**
+ * Returns the locale's ISO-3166 region code.
+ *
+ * @param locale the locale
+ * @return      A pointer to the region.
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getRegion(const ULocale* locale);
+
+/**
+ * Returns the locale's variant code.
+ *
+ * @param locale the locale
+ * @return      A pointer to the variant.
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getVariant(const ULocale* locale);
+
+/**
+ * Returns the programmatic name of the entire locale, with the language,
+ * country and variant separated by underbars. If a field is missing, up
+ * to two leading underbars will occur. Example: "en", "de_DE", "en_US_WIN",
+ * "de__POSIX", "fr__MAC", "__MAC", "_MT", "_FR_EURO"
+ *
+ * @param locale the locale
+ * @return      A pointer to "name".
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getLocaleID(const ULocale* locale);
+
+/**
+ * Returns the programmatic name of the entire locale as ulocale_getLocaleID()
+ * would return, but without keywords.
+ *
+ * @param locale the locale
+ * @return      A pointer to "base name".
+ * @draft ICU 74
+ */
+U_CAPI const char* U_EXPORT2
+ulocale_getBaseName(const ULocale* locale);
+
+/**
+ * Gets the bogus state. Locale object can be bogus if it doesn't exist
+ *
+ * @param locale the locale
+ * @return false if it is a real locale, true if it is a bogus locale
+ * @draft ICU 74
+ */
+U_CAPI bool U_EXPORT2
+ulocale_isBogus(const ULocale* locale);
+
+/**
+ * Gets the list of keywords for the specified locale.
+ *
+ * @param locale the locale
+ * @param err the error code
+ * @return pointer to UEnumeration, or nullptr if there are no keywords.
+ * Client must call uenum_close() to dispose the returned value.
+ * @draft ICU 74
+ */
+U_CAPI UEnumeration* U_EXPORT2
+ulocale_getKeywords(const ULocale* locale, UErrorCode *err);
+
+/**
+ * Gets the list of unicode keywords for the specified locale.
+ *
+ * @param locale the locale
+ * @param err the error code
+ * @return pointer to UEnumeration, or nullptr if there are no keywords.
+ * Client must call uenum_close() to dispose the returned value.
+ * @draft ICU 74
+ */
+U_CAPI UEnumeration* U_EXPORT2
+ulocale_getUnicodeKeywords(const ULocale* locale, UErrorCode *err);
+
+/**
+ * Gets the value for a keyword.
+ *
+ * This uses legacy keyword=value pairs, like "collation=phonebook".
+ *
+ * @param locale the locale
+ * @param keyword the keyword, a const char * pointer (need not be
+ *                terminated when the length is non-negative)
+ * @param keywordLength the length of the keyword; if negative, then the
+ *                      keyword need to be null terminated.
+ * @param valueBuffer The buffer to receive the value.
+ * @param valueBufferCapacity The capacity of receiving valueBuffer.
+ * @param err the error code
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2
+ulocale_getKeywordValue(
+    const ULocale* locale, const char* keyword, int32_t keywordLength,
+    char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
+
+/**
+ * Gets the Unicode value for a Unicode keyword.
+ *
+ * This uses Unicode key-value pairs, like "co-phonebk".
+ *
+ * @param locale the locale
+ * @param keyword the Unicode keyword, a const char * pointer (need not be
+ *                terminated when the length is non-negative)
+ * @param keywordLength the length of the Unicode keyword; if negative,
+ *                      then the keyword need to be null terminated.
+ * @param valueBuffer The buffer to receive the Unicode value.
+ * @param valueBufferCapacity The capacity of receiving valueBuffer.
+ * @param err the error code
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2
+ulocale_getUnicodeKeywordValue(
+    const ULocale* locale, const char* keyword, int32_t keywordLength,
+    char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalULocalePointer
+ * "Smart pointer" class, closes a ULocale via ulocale_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ * @draft ICU 74
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalULocalePointer, ULocale, ulocale_close);
+
+U_NAMESPACE_END
+
+#endif  /* U_SHOW_CPLUSPLUS_API */
+
+#endif /* U_HIDE_DRAFT_API */
+
+#endif /* ULOCALE_H */

+ 441 - 0
thirdparty/icu4c/common/unicode/ulocbuilder.h

@@ -0,0 +1,441 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+#ifndef __ULOCBUILDER_H__
+#define __ULOCBUILDER_H__
+
+#include "unicode/localpointer.h"
+#include "unicode/ulocale.h"
+#include "unicode/utypes.h"
+
+/**
+ * \file
+ * \brief C API: Builder API for Locale
+ */
+
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Opaque C service object type for the locale builder API
+ * @draft ICU 74
+ */
+struct ULocaleBuilder;
+
+/**
+ * C typedef for struct ULocaleBuilder.
+ * @draft ICU 74
+ */
+typedef struct ULocaleBuilder ULocaleBuilder;
+
+/**
+ * <code>ULocaleBuilder</code> is used to build valid <code>locale</code> id
+ * string or IETF BCP 47 language tag from values configured by the setters.
+ * The <code>ULocaleBuilder</code> checks if a value configured by a
+ * setter satisfies the syntax requirements defined by the <code>Locale</code>
+ * class.  A string of Locale created by a <code>ULocaleBuilder</code> is
+ * well-formed and can be transformed to a well-formed IETF BCP 47 language tag
+ * without losing information.
+ *
+ * <p>The following example shows how to create a <code>locale</code> string
+ * with the <code>ULocaleBuilder</code>.
+ * <blockquote>
+ * <pre>
+ *     UErrorCode err = U_ZERO_ERROR;
+ *     char buffer[ULOC_FULLNAME_CAPACITY];
+ *     ULocaleBuilder* builder = ulocbld_open();
+ *     ulocbld_setLanguage(builder, "sr", -1);
+ *     ulocbld_setScript(builder, "Latn", -1);
+ *     ulocbld_setRegion(builder, "RS", -1);
+ *     int32_t length = ulocbld_buildLocaleID(
+ *         builder, buffer, ULOC_FULLNAME_CAPACITY, &err);
+ *     ulocbld_close(builder);
+ * </pre>
+ * </blockquote>
+ *
+ * <p>ULocaleBuilders can be reused; <code>ulocbld_clear()</code> resets all
+ * fields to their default values.
+ *
+ * <p>ULocaleBuilder tracks errors in an internal UErrorCode. All setters
+ * except ulocbld_setLanguageTag() and ulocbld_setLocale() return immediately
+ * if the internal UErrorCode is in an error state.
+ * To reset the internal state and error code, call ulocbld_clear().
+ * ulocbld_setLanguageTag() and ulocbld_setLocale() first clear the internal
+ * UErrorCode, then record any error from validating the input parameter
+ * in the internal UErrorCode.
+ *
+ * @draft ICU 74
+ */
+
+/**
+ * Constructs an empty ULocaleBuilder. The default value of all
+ * fields, extensions, and private use information is the
+ * empty string. The created builder should be destroyed by calling
+ * ulocbld_close();
+ *
+ * @draft ICU 74
+ */
+U_CAPI ULocaleBuilder* U_EXPORT2
+ulocbld_open();
+
+/**
+ * Close the builder and destroy its internal state.
+ * @param builder the builder
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_close(ULocaleBuilder* builder);
+
+/**
+ * Resets the <code>ULocaleBuilder</code> to match the provided
+ * <code>locale</code>.  Existing state is discarded.
+ *
+ * <p>All fields of the locale must be well-formed.
+ * <p>This method clears the internal UErrorCode.
+ *
+ * @param builder the builder
+ * @param locale the locale, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the locale; if negative, then the locale need to be
+ *               null terminated,
+ *
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length);
+
+/**
+ * Resets the <code>ULocaleBuilder</code> to match the provided
+ * <code>ULocale</code>. Existing state is discarded.
+ *
+ * <p>The locale must not be bogus.
+ * <p>This method clears the internal UErrorCode.
+ *
+ * @param builder the builder.
+ * @param locale the locale, a ULocale* pointer. The builder adopts the locale
+ *               after the call and the client must not delete it.
+ *
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_adoptULocale(ULocaleBuilder* builder, ULocale* locale);
+
+/**
+ * Resets the ULocaleBuilder to match the provided IETF BCP 47 language tag.
+ * Discards the existing state.
+ * The empty string causes the builder to be reset, like {@link #ulocbld_clear}.
+ * Legacy language tags (marked as “Type: grandfathered” in BCP 47)
+ * are converted to their canonical form before being processed.
+ * Otherwise, the <code>language tag</code> must be well-formed,
+ * or else the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods
+ * will later report an U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p>This method clears the internal UErrorCode.
+ *
+ * @param builder the builder
+ * @param tag the language tag, defined as IETF BCP 47 language tag, a
+ *               const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the tag; if negative, then the tag need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setLanguageTag(ULocaleBuilder* builder, const char* tag, int32_t length);
+
+/**
+ * Sets the language.  If <code>language</code> is the empty string, the
+ * language in this <code>ULocaleBuilder</code> is removed. Otherwise, the
+ * <code>language</code> must be well-formed, or else the ulocbld_buildLocaleID()
+ * and ulocbld_buildLanguageTag() methods will
+ * later report an U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p>The syntax of language value is defined as
+ * [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag).
+ *
+ * @param builder the builder
+ * @param language the language, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the language; if negative, then the language need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setLanguage(ULocaleBuilder* builder, const char* language, int32_t length);
+
+/**
+ * Sets the script. If <code>script</code> is the empty string, the script in
+ * this <code>ULocaleBuilder</code> is removed.
+ * Otherwise, the <code>script</code> must be well-formed, or else the
+ * ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
+ * report an U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p>The script value is a four-letter script code as
+ * [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag)
+ * defined by ISO 15924
+ *
+ * @param builder the builder
+ * @param script the script, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the script; if negative, then the script need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setScript(ULocaleBuilder* builder, const char* script, int32_t length);
+
+/**
+ * Sets the region.  If region is the empty string, the region in this
+ * <code>ULocaleBuilder</code> is removed. Otherwise, the <code>region</code>
+ * must be well-formed, or else the ulocbld_buildLocaleID() and
+ * ulocbld_buildLanguageTag() methods will later report an
+ * U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p>The region value is defined by
+ *  [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag)
+ * as a two-letter ISO 3166 code or a three-digit UN M.49 area code.
+ *
+ * <p>The region value in the <code>Locale</code> created by the
+ * <code>ULocaleBuilder</code> is always normalized to upper case.
+ *
+ * @param builder the builder
+ * @param region the region, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the region; if negative, then the region need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setRegion(ULocaleBuilder* builder, const char* region, int32_t length);
+
+/**
+ * Sets the variant.  If variant is the empty string, the variant in this
+ * <code>ULocaleBuilder</code> is removed.  Otherwise, the <code>variant</code>
+ * must be well-formed, or else the ulocbld_buildLocaleID() and
+ * ulocbld_buildLanguageTag() methods will later report an
+ * U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p><b>Note:</b> This method checks if <code>variant</code>
+ * satisfies the
+ * [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag)
+ * syntax requirements, and normalizes the value to lowercase letters. However,
+ * the <code>Locale</code> class does not impose any syntactic
+ * restriction on variant. To set an ill-formed variant, use a Locale constructor.
+ * If there are multiple unicode_variant_subtag, the caller must concatenate
+ * them with '-' as separator (ex: "foobar-fibar").
+ *
+ * @param builder the builder
+ * @param variant the variant, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the variant; if negative, then the variant need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setVariant(ULocaleBuilder* builder, const char* variant, int32_t length);
+
+/**
+ * Sets the extension for the given key. If the value is the empty string,
+ * the extension is removed.  Otherwise, the <code>key</code> and
+ * <code>value</code> must be well-formed, or else the ulocbld_buildLocaleID()
+ * and ulocbld_buildLanguageTag() methods will
+ * later report an U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p><b>Note:</b> The key ('u') is used for the Unicode locale extension.
+ * Setting a value for this key replaces any existing Unicode locale key/type
+ * pairs with those defined in the extension.
+ *
+ * <p><b>Note:</b> The key ('x') is used for the private use code. To be
+ * well-formed, the value for this key needs only to have subtags of one to
+ * eight alphanumeric characters, not two to eight as in the general case.
+ *
+ * @param builder the builder
+ * @param key the extension key
+ * @param value the value, a const char * pointer (need not be terminated when
+ *               the length is non-negative)
+ * @param length the length of the value; if negative, then the value need to be
+ *               null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length);
+
+/**
+ * Sets the Unicode locale keyword type for the given key. If the type
+ * is a nullptr (i.e. the underlying StringPiece is constructed with a
+ * nullptr), the keyword is removed.
+ * If the type is the empty string, the keyword is set without type subtags.
+ * Otherwise, the key and type must be well-formed, or else the
+ * ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
+ * report an U_ILLEGAL_ARGUMENT_ERROR.
+ *
+ * <p>Keys and types are converted to lower case.
+ *
+ * <p><b>Note</b>:Setting the 'u' extension via {@link #ulocbld_setExtension}
+ * replaces all Unicode locale keywords with those defined in the
+ * extension.
+ *
+ * @param builder the builder
+ * @param key the Unicode locale key, a const char * pointer (need not be
+ *               terminated when the length is non-negative)
+ * @param keyLength the length of the key; if negative, then the key need to be
+ *               null terminated,
+ * @param type the Unicode locale type, a const char * pointer (need not be
+ *               terminated when the length is non-negative)
+ * @param typeLength the length of the type; if negative, then the type need to
+ *               be null terminated,
+ * @return This builder.
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setUnicodeLocaleKeyword(ULocaleBuilder* builder,
+        const char* key, int32_t keyLength, const char* type, int32_t typeLength);
+
+/**
+ * Adds a unicode locale attribute, if not already present, otherwise
+ * has no effect.  The attribute must not be empty string and must be
+ * well-formed or U_ILLEGAL_ARGUMENT_ERROR will be set to status
+ * during the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() calls.
+ *
+ * @param builder the builder
+ * @param attribute the attribute, a const char * pointer (need not be
+ *               terminated when the length is non-negative)
+ * @param length the length of the attribute; if negative, then the attribute
+ *               need to be null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_addUnicodeLocaleAttribute(
+    ULocaleBuilder* builder, const char* attribute, int32_t length);
+
+/**
+ * Removes a unicode locale attribute, if present, otherwise has no
+ * effect.  The attribute must not be empty string and must be well-formed
+ * or U_ILLEGAL_ARGUMENT_ERROR will be set to status during the ulocbld_buildLocaleID()
+ * and ulocbld_buildLanguageTag() calls.
+ *
+ * <p>Attribute comparison for removal is case-insensitive.
+ *
+ * @param builder the builder
+ * @param attribute the attribute, a const char * pointer (need not be
+ *               terminated when the length is non-negative)
+ * @param length the length of the attribute; if negative, then the attribute
+ *               need to be null terminated,
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_removeUnicodeLocaleAttribute(
+    ULocaleBuilder* builder, const char* attribute, int32_t length);
+
+/**
+ * Resets the builder to its initial, empty state.
+ * <p>This method clears the internal UErrorCode.
+ *
+ * @param builder the builder
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_clear(ULocaleBuilder* builder);
+
+/**
+ * Resets the extensions to their initial, empty state.
+ * Language, script, region and variant are unchanged.
+ *
+ * @param builder the builder
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_clearExtensions(ULocaleBuilder* builder);
+
+/**
+ * Build the LocaleID string from the fields set on this builder.
+ * If any setter, or the ulocbld_buildLocaleID() call itself, requires memory
+ * allocation and that allocation fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
+ * If any of the fields set by the setters are not well-formed, the status
+ * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
+ * not change after the ulocbld_buildLocaleID() call and the caller is
+ * free to keep using the same builder to build more locales.
+ *
+ * @param builder the builder
+ * @param locale the locale id
+ * @param localeCapacity the size of the locale buffer to store the locale id
+ * @param err the error code
+ * @return the length of the locale id in buffer
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2
+ulocbld_buildLocaleID(ULocaleBuilder* builder, char* locale,
+                      int32_t localeCapacity, UErrorCode* err);
+
+/**
+ * Build the ULocale object from the fields set on this builder.
+ * If any setter, or the ulocbld_buildULocale() call itself, requires memory
+ * allocation and that allocation fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
+ * If any of the fields set by the setters are not well-formed, the status
+ * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
+ * not change after the ulocbld_buildULocale() call and the caller is
+ * free to keep using the same builder to build more locales.
+ *
+ * @param builder the builder.
+ * @param err the error code.
+ * @return the locale, a ULocale* pointer. The created ULocale must be
+ *          destroyed by calling {@link ulocale_close}.
+ * @draft ICU 74
+ */
+U_CAPI ULocale* U_EXPORT2
+ulocbld_buildULocale(ULocaleBuilder* builder, UErrorCode* err);
+
+/**
+ * Build the IETF BCP 47 language tag string from the fields set on this builder.
+ * If any setter, or the ulocbld_buildLanguageTag() call itself, requires memory
+ * allocation and that allocation fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
+ * If any of the fields set by the setters are not well-formed, the status
+ * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
+ * not change after the ulocbld_buildLanguageTag() call and the caller is free
+ * to keep using the same builder to build more locales.
+ *
+ * @param builder the builder
+ * @param language the language tag
+ * @param languageCapacity the size of the language buffer to store the language
+ * tag
+ * @param err the error code
+ * @return the length of the language tag in buffer
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2
+ulocbld_buildLanguageTag(ULocaleBuilder* builder, char* language,
+                      int32_t languageCapacity, UErrorCode* err);
+
+/**
+ * Sets the UErrorCode if an error occurred while recording sets.
+ * Preserves older error codes in the outErrorCode.
+ *
+ * @param builder the builder
+ * @param outErrorCode Set to an error code that occurred while setting subtags.
+ *                  Unchanged if there is no such error or if outErrorCode
+ *                  already contained an error.
+ * @return true if U_FAILURE(*outErrorCode)
+ * @draft ICU 74
+ */
+U_CAPI UBool U_EXPORT2
+ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalULocaleBuilderPointer
+ * "Smart pointer" class, closes a ULocaleBuilder via ulocbld_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ * @draft ICU 74
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalULocaleBuilderPointer, ULocaleBuilder, ulocbld_close);
+
+U_NAMESPACE_END
+
+#endif  /* U_SHOW_CPLUSPLUS_API */
+
+#endif  /* U_HIDE_DRAFT_API */
+
+#endif  // __ULOCBUILDER_H__

+ 24 - 2
thirdparty/icu4c/common/unicode/unorm2.h

@@ -181,7 +181,10 @@ U_CAPI const UNormalizer2 * U_EXPORT2
 unorm2_getNFKDInstance(UErrorCode *pErrorCode);
 
 /**
- * Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
+ * Returns a UNormalizer2 instance for Unicode toNFKC_Casefold() normalization
+ * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
+ * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
+ *
  * Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
  * Returns an unmodifiable singleton instance. Do not delete it.
  * @param pErrorCode Standard ICU error code. Its input value must
@@ -194,6 +197,25 @@ unorm2_getNFKDInstance(UErrorCode *pErrorCode);
 U_CAPI const UNormalizer2 * U_EXPORT2
 unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Returns a UNormalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
+ * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
+ * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
+ *
+ * Same as unorm2_getInstance(NULL, "nfkc_scf", UNORM2_COMPOSE, pErrorCode).
+ * Returns an unmodifiable singleton instance. Do not delete it.
+ * @param pErrorCode Standard ICU error code. Its input value must
+ *                  pass the U_SUCCESS() test, or else the function returns
+ *                  immediately. Check for U_FAILURE() on output or use with
+ *                  function chaining. (See User Guide for details.)
+ * @return the requested Normalizer2, if successful
+ * @draft ICU 74
+ */
+U_CAPI const UNormalizer2 * U_EXPORT2
+unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Returns a UNormalizer2 instance which uses the specified data file
  * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
@@ -206,7 +228,7 @@ unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
  * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
  *
  * @param packageName NULL for ICU built-in data, otherwise application data package name
- * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
+ * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
  * @param mode normalization mode (compose or decompose etc.)
  * @param pErrorCode Standard ICU error code. Its input value must
  *                  pass the U_SUCCESS() test, or else the function returns

+ 57 - 4
thirdparty/icu4c/common/unicode/urename.h

@@ -138,8 +138,8 @@
 #define locale_getKeywordsStart U_ICU_ENTRY_POINT_RENAME(locale_getKeywordsStart)
 #define locale_get_default U_ICU_ENTRY_POINT_RENAME(locale_get_default)
 #define locale_set_default U_ICU_ENTRY_POINT_RENAME(locale_set_default)
+#define mixedMeasuresToMicros U_ICU_ENTRY_POINT_RENAME(mixedMeasuresToMicros)
 #define numSysCleanup U_ICU_ENTRY_POINT_RENAME(numSysCleanup)
-#define rbbi_cleanup U_ICU_ENTRY_POINT_RENAME(rbbi_cleanup)
 #define pl_addFontRun U_ICU_ENTRY_POINT_RENAME(pl_addFontRun)
 #define pl_addLocaleRun U_ICU_ENTRY_POINT_RENAME(pl_addLocaleRun)
 #define pl_addValueRun U_ICU_ENTRY_POINT_RENAME(pl_addValueRun)
@@ -193,6 +193,7 @@
 #define pl_resetFontRuns U_ICU_ENTRY_POINT_RENAME(pl_resetFontRuns)
 #define pl_resetLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_resetLocaleRuns)
 #define pl_resetValueRuns U_ICU_ENTRY_POINT_RENAME(pl_resetValueRuns)
+#define rbbi_cleanup U_ICU_ENTRY_POINT_RENAME(rbbi_cleanup)
 #define res_countArrayItems U_ICU_ENTRY_POINT_RENAME(res_countArrayItems)
 #define res_findResource U_ICU_ENTRY_POINT_RENAME(res_findResource)
 #define res_getAlias U_ICU_ENTRY_POINT_RENAME(res_getAlias)
@@ -512,9 +513,6 @@
 #define ubrk_setText U_ICU_ENTRY_POINT_RENAME(ubrk_setText)
 #define ubrk_setUText U_ICU_ENTRY_POINT_RENAME(ubrk_setUText)
 #define ubrk_swap U_ICU_ENTRY_POINT_RENAME(ubrk_swap)
-#define ucache_compareKeys U_ICU_ENTRY_POINT_RENAME(ucache_compareKeys)
-#define ucache_deleteKey U_ICU_ENTRY_POINT_RENAME(ucache_deleteKey)
-#define ucache_hashKeys U_ICU_ENTRY_POINT_RENAME(ucache_hashKeys)
 #define ucal_add U_ICU_ENTRY_POINT_RENAME(ucal_add)
 #define ucal_clear U_ICU_ENTRY_POINT_RENAME(ucal_clear)
 #define ucal_clearField U_ICU_ENTRY_POINT_RENAME(ucal_clearField)
@@ -532,6 +530,7 @@
 #define ucal_getFieldDifference U_ICU_ENTRY_POINT_RENAME(ucal_getFieldDifference)
 #define ucal_getGregorianChange U_ICU_ENTRY_POINT_RENAME(ucal_getGregorianChange)
 #define ucal_getHostTimeZone U_ICU_ENTRY_POINT_RENAME(ucal_getHostTimeZone)
+#define ucal_getIanaTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getIanaTimeZoneID)
 #define ucal_getKeywordValuesForLocale U_ICU_ENTRY_POINT_RENAME(ucal_getKeywordValuesForLocale)
 #define ucal_getLimit U_ICU_ENTRY_POINT_RENAME(ucal_getLimit)
 #define ucal_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ucal_getLocaleByType)
@@ -587,6 +586,7 @@
 #define ucasemap_getLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_getLocale)
 #define ucasemap_getOptions U_ICU_ENTRY_POINT_RENAME(ucasemap_getOptions)
 #define ucasemap_internalUTF8ToTitle U_ICU_ENTRY_POINT_RENAME(ucasemap_internalUTF8ToTitle)
+#define ucasemap_mapUTF8 U_ICU_ENTRY_POINT_RENAME(ucasemap_mapUTF8)
 #define ucasemap_open U_ICU_ENTRY_POINT_RENAME(ucasemap_open)
 #define ucasemap_setBreakIterator U_ICU_ENTRY_POINT_RENAME(ucasemap_setBreakIterator)
 #define ucasemap_setLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_setLocale)
@@ -955,9 +955,16 @@
 #define ufieldpositer_close U_ICU_ENTRY_POINT_RENAME(ufieldpositer_close)
 #define ufieldpositer_next U_ICU_ENTRY_POINT_RENAME(ufieldpositer_next)
 #define ufieldpositer_open U_ICU_ENTRY_POINT_RENAME(ufieldpositer_open)
+#define ufile_close_translit U_ICU_ENTRY_POINT_RENAME(ufile_close_translit)
+#define ufile_fill_uchar_buffer U_ICU_ENTRY_POINT_RENAME(ufile_fill_uchar_buffer)
+#define ufile_flush_io U_ICU_ENTRY_POINT_RENAME(ufile_flush_io)
+#define ufile_flush_translit U_ICU_ENTRY_POINT_RENAME(ufile_flush_translit)
 #define ufile_getch U_ICU_ENTRY_POINT_RENAME(ufile_getch)
 #define ufile_getch32 U_ICU_ENTRY_POINT_RENAME(ufile_getch32)
+#define ufmt_64tou U_ICU_ENTRY_POINT_RENAME(ufmt_64tou)
 #define ufmt_close U_ICU_ENTRY_POINT_RENAME(ufmt_close)
+#define ufmt_defaultCPToUnicode U_ICU_ENTRY_POINT_RENAME(ufmt_defaultCPToUnicode)
+#define ufmt_digitvalue U_ICU_ENTRY_POINT_RENAME(ufmt_digitvalue)
 #define ufmt_getArrayItemByIndex U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayItemByIndex)
 #define ufmt_getArrayLength U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayLength)
 #define ufmt_getDate U_ICU_ENTRY_POINT_RENAME(ufmt_getDate)
@@ -969,7 +976,11 @@
 #define ufmt_getType U_ICU_ENTRY_POINT_RENAME(ufmt_getType)
 #define ufmt_getUChars U_ICU_ENTRY_POINT_RENAME(ufmt_getUChars)
 #define ufmt_isNumeric U_ICU_ENTRY_POINT_RENAME(ufmt_isNumeric)
+#define ufmt_isdigit U_ICU_ENTRY_POINT_RENAME(ufmt_isdigit)
 #define ufmt_open U_ICU_ENTRY_POINT_RENAME(ufmt_open)
+#define ufmt_ptou U_ICU_ENTRY_POINT_RENAME(ufmt_ptou)
+#define ufmt_uto64 U_ICU_ENTRY_POINT_RENAME(ufmt_uto64)
+#define ufmt_utop U_ICU_ENTRY_POINT_RENAME(ufmt_utop)
 #define ufmtval_getString U_ICU_ENTRY_POINT_RENAME(ufmtval_getString)
 #define ufmtval_nextPosition U_ICU_ENTRY_POINT_RENAME(ufmtval_nextPosition)
 #define ugender_getInstance U_ICU_ENTRY_POINT_RENAME(ugender_getInstance)
@@ -1133,6 +1144,39 @@
 #define uloc_toLegacyType U_ICU_ENTRY_POINT_RENAME(uloc_toLegacyType)
 #define uloc_toUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleKey)
 #define uloc_toUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleType)
+#define ulocale_close U_ICU_ENTRY_POINT_RENAME(ulocale_close)
+#define ulocale_getBaseName U_ICU_ENTRY_POINT_RENAME(ulocale_getBaseName)
+#define ulocale_getKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocale_getKeywordValue)
+#define ulocale_getKeywords U_ICU_ENTRY_POINT_RENAME(ulocale_getKeywords)
+#define ulocale_getLanguage U_ICU_ENTRY_POINT_RENAME(ulocale_getLanguage)
+#define ulocale_getLocaleID U_ICU_ENTRY_POINT_RENAME(ulocale_getLocaleID)
+#define ulocale_getRegion U_ICU_ENTRY_POINT_RENAME(ulocale_getRegion)
+#define ulocale_getScript U_ICU_ENTRY_POINT_RENAME(ulocale_getScript)
+#define ulocale_getUnicodeKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocale_getUnicodeKeywordValue)
+#define ulocale_getUnicodeKeywords U_ICU_ENTRY_POINT_RENAME(ulocale_getUnicodeKeywords)
+#define ulocale_getVariant U_ICU_ENTRY_POINT_RENAME(ulocale_getVariant)
+#define ulocale_isBogus U_ICU_ENTRY_POINT_RENAME(ulocale_isBogus)
+#define ulocale_openForLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocale_openForLanguageTag)
+#define ulocale_openForLocaleID U_ICU_ENTRY_POINT_RENAME(ulocale_openForLocaleID)
+#define ulocbld_addUnicodeLocaleAttribute U_ICU_ENTRY_POINT_RENAME(ulocbld_addUnicodeLocaleAttribute)
+#define ulocbld_adoptULocale U_ICU_ENTRY_POINT_RENAME(ulocbld_adoptULocale)
+#define ulocbld_buildLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocbld_buildLanguageTag)
+#define ulocbld_buildLocaleID U_ICU_ENTRY_POINT_RENAME(ulocbld_buildLocaleID)
+#define ulocbld_buildULocale U_ICU_ENTRY_POINT_RENAME(ulocbld_buildULocale)
+#define ulocbld_clear U_ICU_ENTRY_POINT_RENAME(ulocbld_clear)
+#define ulocbld_clearExtensions U_ICU_ENTRY_POINT_RENAME(ulocbld_clearExtensions)
+#define ulocbld_close U_ICU_ENTRY_POINT_RENAME(ulocbld_close)
+#define ulocbld_copyErrorTo U_ICU_ENTRY_POINT_RENAME(ulocbld_copyErrorTo)
+#define ulocbld_open U_ICU_ENTRY_POINT_RENAME(ulocbld_open)
+#define ulocbld_removeUnicodeLocaleAttribute U_ICU_ENTRY_POINT_RENAME(ulocbld_removeUnicodeLocaleAttribute)
+#define ulocbld_setExtension U_ICU_ENTRY_POINT_RENAME(ulocbld_setExtension)
+#define ulocbld_setLanguage U_ICU_ENTRY_POINT_RENAME(ulocbld_setLanguage)
+#define ulocbld_setLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocbld_setLanguageTag)
+#define ulocbld_setLocale U_ICU_ENTRY_POINT_RENAME(ulocbld_setLocale)
+#define ulocbld_setRegion U_ICU_ENTRY_POINT_RENAME(ulocbld_setRegion)
+#define ulocbld_setScript U_ICU_ENTRY_POINT_RENAME(ulocbld_setScript)
+#define ulocbld_setUnicodeLocaleKeyword U_ICU_ENTRY_POINT_RENAME(ulocbld_setUnicodeLocaleKeyword)
+#define ulocbld_setVariant U_ICU_ENTRY_POINT_RENAME(ulocbld_setVariant)
 #define ulocdata_close U_ICU_ENTRY_POINT_RENAME(ulocdata_close)
 #define ulocdata_getCLDRVersion U_ICU_ENTRY_POINT_RENAME(ulocdata_getCLDRVersion)
 #define ulocdata_getDelimiter U_ICU_ENTRY_POINT_RENAME(ulocdata_getDelimiter)
@@ -1213,6 +1257,7 @@
 #define unorm2_getNFDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFDInstance)
 #define unorm2_getNFKCCasefoldInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCCasefoldInstance)
 #define unorm2_getNFKCInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCInstance)
+#define unorm2_getNFKCSimpleCasefoldInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCSimpleCasefoldInstance)
 #define unorm2_getNFKDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKDInstance)
 #define unorm2_getRawDecomposition U_ICU_ENTRY_POINT_RENAME(unorm2_getRawDecomposition)
 #define unorm2_hasBoundaryAfter U_ICU_ENTRY_POINT_RENAME(unorm2_hasBoundaryAfter)
@@ -1349,6 +1394,7 @@
 #define uprv_convertToPosix U_ICU_ENTRY_POINT_RENAME(uprv_convertToPosix)
 #define uprv_copyAscii U_ICU_ENTRY_POINT_RENAME(uprv_copyAscii)
 #define uprv_copyEbcdic U_ICU_ENTRY_POINT_RENAME(uprv_copyEbcdic)
+#define uprv_currencyLeads U_ICU_ENTRY_POINT_RENAME(uprv_currencyLeads)
 #define uprv_decContextClearStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextClearStatus)
 #define uprv_decContextDefault U_ICU_ENTRY_POINT_RENAME(uprv_decContextDefault)
 #define uprv_decContextGetRounding U_ICU_ENTRY_POINT_RENAME(uprv_decContextGetRounding)
@@ -1367,6 +1413,7 @@
 #define uprv_decNumberAbs U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAbs)
 #define uprv_decNumberAdd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAdd)
 #define uprv_decNumberAnd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAnd)
+#define uprv_decNumberClass U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClass)
 #define uprv_decNumberClassToString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClassToString)
 #define uprv_decNumberCompare U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompare)
 #define uprv_decNumberCompareSignal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompareSignal)
@@ -1763,6 +1810,9 @@
 #define usnumf_formatInt64 U_ICU_ENTRY_POINT_RENAME(usnumf_formatInt64)
 #define usnumf_openForLocale U_ICU_ENTRY_POINT_RENAME(usnumf_openForLocale)
 #define usnumf_openForLocaleAndGroupingStrategy U_ICU_ENTRY_POINT_RENAME(usnumf_openForLocaleAndGroupingStrategy)
+#define uspoof_areBidiConfusable U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusable)
+#define uspoof_areBidiConfusableUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusableUTF8)
+#define uspoof_areBidiConfusableUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusableUnicodeString)
 #define uspoof_areConfusable U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusable)
 #define uspoof_areConfusableUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUTF8)
 #define uspoof_areConfusableUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUnicodeString)
@@ -1778,6 +1828,9 @@
 #define uspoof_getAllowedChars U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedChars)
 #define uspoof_getAllowedLocales U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedLocales)
 #define uspoof_getAllowedUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedUnicodeSet)
+#define uspoof_getBidiSkeleton U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeleton)
+#define uspoof_getBidiSkeletonUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeletonUTF8)
+#define uspoof_getBidiSkeletonUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeletonUnicodeString)
 #define uspoof_getCheckResultChecks U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultChecks)
 #define uspoof_getCheckResultNumerics U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultNumerics)
 #define uspoof_getCheckResultRestrictionLevel U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultRestrictionLevel)

+ 6 - 6
thirdparty/icu4c/common/unicode/uvernum.h

@@ -53,13 +53,13 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION_MAJOR_NUM 73
+#define U_ICU_VERSION_MAJOR_NUM 74
 
 /** The current ICU minor version as an integer.
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.6
  */
-#define U_ICU_VERSION_MINOR_NUM 2
+#define U_ICU_VERSION_MINOR_NUM 1
 
 /** The current ICU patchlevel version as an integer.
  *  This value will change in the subsequent releases of ICU
@@ -79,7 +79,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.6
  */
-#define U_ICU_VERSION_SUFFIX _73
+#define U_ICU_VERSION_SUFFIX _74
 
 /**
  * \def U_DEF2_ICU_ENTRY_POINT_RENAME
@@ -132,7 +132,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION "73.2"
+#define U_ICU_VERSION "74.1"
 
 /**
  * The current ICU library major version number as a string, for library name suffixes.
@@ -145,13 +145,13 @@
  *
  * @stable ICU 2.6
  */
-#define U_ICU_VERSION_SHORT "73"
+#define U_ICU_VERSION_SHORT "74"
 
 #ifndef U_HIDE_INTERNAL_API
 /** Data version in ICU4C.
  * @internal ICU 4.4 Internal Use Only
  **/
-#define U_ICU_DATA_VERSION "73.2"
+#define U_ICU_DATA_VERSION "74.1"
 #endif  /* U_HIDE_INTERNAL_API */
 
 /*===========================================================================

+ 32 - 10
thirdparty/icu4c/common/uniquecharstr.h

@@ -10,6 +10,7 @@
 #include "charstr.h"
 #include "uassert.h"
 #include "uhash.h"
+#include "cmemory.h"
 
 U_NAMESPACE_BEGIN
 
@@ -47,22 +48,20 @@ public:
     }
 
     /**
-     * Adds a string and returns a unique number for it.
-     * The string's buffer contents must not change, nor move around in memory,
+     * Adds a NUL-terminated string and returns a unique number for it.
+     * The string must not change, nor move around in memory,
      * while this UniqueCharStrings is in use.
-     * The string contents must be NUL-terminated exactly at s.length().
      *
-     * Best used with read-only-alias UnicodeString objects that point to
-     * stable storage, such as strings returned by resource bundle functions.
+     * Best used with string data in a stable storage, such as strings returned
+     * by resource bundle functions.
      */
-    int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
-        if (U_FAILURE(errorCode)) { return 0; }
+    int32_t add(const char16_t*p, UErrorCode &errorCode) {
+        if (U_FAILURE(errorCode)) { return -1; }
         if (isFrozen) {
             errorCode = U_NO_WRITE_PERMISSION;
-            return 0;
+            return -1;
         }
         // The string points into the resource bundle.
-        const char16_t *p = s.getBuffer();
         int32_t oldIndex = uhash_geti(&map, p);
         if (oldIndex != 0) {  // found duplicate
             return oldIndex;
@@ -71,11 +70,33 @@ public:
         // The strings object is also terminated with one implicit NUL.
         strings->append(0, errorCode);
         int32_t newIndex = strings->length();
-        strings->appendInvariantChars(s, errorCode);
+        strings->appendInvariantChars(p, u_strlen(p), errorCode);
         uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
         return newIndex;
     }
 
+    /**
+     * Adds a copy of the given UnicodeString and returns a unique number for it.
+     * The copy is owned by keyStore, so the caller's string need not outlive this call.
+     * Returns -1 if errorCode already failed, if this object is frozen, or on allocation failure.
+     */
+    int32_t addByValue(UnicodeString s, UErrorCode &errorCode) {
+        if (U_FAILURE(errorCode)) { return -1; }
+        if (isFrozen) {
+            errorCode = U_NO_WRITE_PERMISSION;
+            return -1;
+        }
+        const int32_t existingIndex = uhash_geti(&map, s.getTerminatedBuffer());
+        if (existingIndex != 0) { return existingIndex; }  // found duplicate
+        // Keep a pool-owned copy so that the buffer handed to add() stays valid.
+        UnicodeString *const stored = keyStore.create(s);
+        if (stored != nullptr) {
+            return add(stored->getTerminatedBuffer(), errorCode);
+        }
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return -1;
+    }
+
     void freeze() { isFrozen = true; }
 
     /**
@@ -90,6 +111,7 @@ public:
 private:
     UHashtable map;
     CharString *strings;
+    MemoryPool<UnicodeString> keyStore;
     bool isFrozen = false;
 };
 

+ 63 - 0
thirdparty/icu4c/common/uprops.cpp

@@ -328,6 +328,53 @@ static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProper
     return EmojiProps::hasBinaryProperty(c, which);
 }
 
+static UBool isIDSUnaryOperator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
+    // New in Unicode 15.1; the property covers just the two characters U+2FFE..U+2FFF.
+    return c>=0x2FFE && c<=0x2FFF;
+}
+
+/** Ascending ranges (start/exclusive-limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */
+static constexpr UChar32 ID_COMPAT_MATH_CONTINUE[] = {
+    0x00B2, 0x00B3 + 1,  // U+00B2..U+00B3
+    0x00B9, 0x00B9 + 1,  // U+00B9
+    0x2070, 0x2070 + 1,  // U+2070
+    0x2074, 0x207E + 1,  // U+2074..U+207E
+    0x2080, 0x208E + 1   // U+2080..U+208E
+};
+
+/** ID_Compat_Math_Start characters in ascending order, from UCD PropList.txt. */
+static constexpr UChar32 ID_COMPAT_MATH_START[] = {
+    0x2202,  // lowest entry: isIDCompatMathStart() rejects anything below it up front
+    0x2207,
+    0x221E,
+    0x1D6C1,
+    0x1D6DB,
+    0x1D6FB,
+    0x1D715,
+    0x1D735,
+    0x1D74F,
+    0x1D76F,
+    0x1D789,
+    0x1D7A9,
+    0x1D7C3
+};
+
+static UBool isIDCompatMathStart(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
+    if (c < ID_COMPAT_MATH_START[0]) { return false; }  // fastpath for common scripts
+    const UChar32 *entry = ID_COMPAT_MATH_START;
+    const UChar32 *const limit = entry + UPRV_LENGTHOF(ID_COMPAT_MATH_START);
+    while (entry != limit && *entry != c) { ++entry; }  // linear scan of the short table
+    return entry != limit;  // stopped before the end only if c was found
+}
+
+static UBool isIDCompatMathContinue(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) {
+    int32_t i = 0;
+    // Skip every start/limit pair whose (exclusive) limit is at or below c.
+    while (i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE) && c >= ID_COMPAT_MATH_CONTINUE[i + 1]) { i += 2; }
+    if (i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE)) { return c >= ID_COMPAT_MATH_CONTINUE[i]; }  // at or above range start?
+    return isIDCompatMathStart(prop, c, UCHAR_ID_COMPAT_MATH_START);  // above all ranges: try the Start table
+}
+
 static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
     /*
      * column and mask values for binary properties from u_getUnicodeProperties().
@@ -409,6 +456,9 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
     { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_TAG_SEQUENCE
     { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE
     { UPROPS_SRC_EMOJI, 0, hasEmojiProperty },  // UCHAR_RGI_EMOJI
+    { UPROPS_SRC_IDSU, 0, isIDSUnaryOperator }, // UCHAR_IDS_UNARY_OPERATOR
+    { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathStart }, // UCHAR_ID_COMPAT_MATH_START
+    { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathContinue }, // UCHAR_ID_COMPAT_MATH_CONTINUE
 };
 
 U_CAPI UBool U_EXPORT2
@@ -759,6 +809,19 @@ uprops_getSource(UProperty which) {
 
 U_CFUNC void U_EXPORT2
 uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) { return; }
+    if (src == UPROPS_SRC_ID_COMPAT_MATH) {
+        // range limits
+        for (UChar32 c : ID_COMPAT_MATH_CONTINUE) {
+            sa->add(sa->set, c);
+        }
+        // single characters
+        for (UChar32 c : ID_COMPAT_MATH_START) {
+            sa->add(sa->set, c);
+            sa->add(sa->set, c + 1);
+        }
+        return;
+    }
     if (!ulayout_ensureData(*pErrorCode)) { return; }
     const UCPTrie *trie;
     switch (src) {

+ 2 - 0
thirdparty/icu4c/common/uprops.h

@@ -379,6 +379,8 @@ enum UPropertySource {
     UPROPS_SRC_INSC,
     UPROPS_SRC_VO,
     UPROPS_SRC_EMOJI,
+    UPROPS_SRC_IDSU,
+    UPROPS_SRC_ID_COMPAT_MATH,
     /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
     UPROPS_SRC_COUNT
 };

+ 172 - 53
thirdparty/icu4c/common/uresbund.cpp

@@ -24,6 +24,7 @@
 #include "unicode/ures.h"
 #include "unicode/ustring.h"
 #include "unicode/ucnv.h"
+#include "bytesinkutil.h"
 #include "charstr.h"
 #include "uresimp.h"
 #include "ustr_imp.h"
@@ -2351,7 +2352,66 @@ struct GetAllChildrenSink : public ResourceSink {
                     aliasedValue.setData(aliasRB->getResData());
                     aliasedValue.setValidLocaleDataEntry(aliasRB->fValidLocaleDataEntry);
                     aliasedValue.setResource(aliasRB->fRes, ResourceTracer(aliasRB));
-                    dest.put(key, aliasedValue, isRoot, errorCode);
+                    
+                    if (aliasedValue.getType() != URES_TABLE) {
+                        dest.put(key, aliasedValue, isRoot, errorCode);
+                    } else {
+                        // if the resource we're aliasing over to is a table, the sink might iterate over its contents.
+                        // If it does, it'll get only the things defined in the actual alias target, not the things
+                        // the target inherits from its parent resources.  So we walk the parent chain for the *alias target*,
+                        // calling dest.put() for each of the parent tables we could be inheriting from.  This means
+                        // that dest.put() has to iterate over the children of multiple tables to get all of the inherited
+                        // resource values, but it already has to do that to handle normal vertical inheritance.
+                        UResType aliasedValueType = URES_TABLE;
+                        CharString tablePath;
+                        tablePath.append(aliasRB->fResPath, errorCode);
+                        const char* parentKey = key; // dest.put() changes the key
+                        dest.put(parentKey, aliasedValue, isRoot, errorCode);
+                        UResourceDataEntry* entry = aliasRB->fData;
+                        Resource res = aliasRB->fRes;
+                        while (aliasedValueType == URES_TABLE && entry->fParent != nullptr) {
+                            CharString localPath;
+                            localPath.copyFrom(tablePath, errorCode);
+                            char* localPathAsCharPtr = localPath.data();
+                            const char* childKey;
+                            entry = entry->fParent;
+                            res = entry->fData.rootRes;
+                            Resource newRes = res_findResource(&entry->fData, res, &localPathAsCharPtr, &childKey);
+                            if (newRes != RES_BOGUS) {
+                                aliasedValue.setData(entry->fData);
+                                // TODO: do I also need to call aliasedValue.setValueLocaleDataEntry() ?
+                                aliasedValue.setResource(newRes, ResourceTracer(aliasRB)); // probably wrong to use aliasRB here
+                                aliasedValueType = aliasedValue.getType();
+                                if (aliasedValueType == URES_ALIAS) {
+                                    // in a few rare cases, when we get to the root resource bundle, the resource in question
+                                    // won't be an actual table, but will instead be an alias to a table.  That is, we have
+                                    // two aliases in the inheritance path.  (For some locales, such as Zulu, we see this with
+                                    // children of the "fields" resource: "day-narrow" aliases to "day-short", which aliases
+                                    // to "day".)  When this happens, we need to make sure we follow all the aliases.
+                                    ResourceDataValue& rdv2 = static_cast<ResourceDataValue&>(aliasedValue);
+                                    aliasRB = getAliasTargetAsResourceBundle(rdv2.getData(), rdv2.getResource(), nullptr, -1,
+                                                                             rdv2.getValidLocaleDataEntry(), nullptr, 0,
+                                                                             stackTempBundle.getAlias(), &errorCode);
+                                    tablePath.clear();
+                                    tablePath.append(aliasRB->fResPath, errorCode);
+                                    entry = aliasRB->fData;
+                                    res = aliasRB->fRes;
+                                    aliasedValue.setData(entry->fData);
+                                    // TODO: do I also need to call aliasedValue.setValueLocaleDataEntry() ?
+                                    aliasedValue.setResource(res, ResourceTracer(aliasRB)); // probably wrong to use aliasRB here
+                                    aliasedValueType = aliasedValue.getType();
+                                }
+                                if (aliasedValueType == URES_TABLE) {
+                                    dest.put(parentKey, aliasedValue, isRoot, errorCode);
+                                } else {
+                                    // once we've followed the alias, the resource we're looking at really should
+                                    // be a table
+                                    errorCode = U_INTERNAL_PROGRAM_ERROR;
+                                    return;
+                                }
+                            }
+                        }
+                    }
                 }
             } else {
                 dest.put(key, value, isRoot, errorCode);
@@ -2657,13 +2717,16 @@ ures_openWithType(UResourceBundle *r, const char* path, const char* localeID,
     UResourceDataEntry *entry;
     if(openType != URES_OPEN_DIRECT) {
         /* first "canonicalize" the locale ID */
-        char canonLocaleID[ULOC_FULLNAME_CAPACITY];
-        uloc_getBaseName(localeID, canonLocaleID, UPRV_LENGTHOF(canonLocaleID), status);
-        if(U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) {
+        CharString canonLocaleID;
+        {
+            CharStringByteSink sink(&canonLocaleID);
+            ulocimp_getBaseName(localeID, sink, status);
+        }
+        if(U_FAILURE(*status)) {
             *status = U_ILLEGAL_ARGUMENT_ERROR;
             return nullptr;
         }
-        entry = entryOpen(path, canonLocaleID, openType, status);
+        entry = entryOpen(path, canonLocaleID.data(), openType, status);
     } else {
         entry = entryOpenDirect(path, localeID, status);
     }
@@ -2974,15 +3037,39 @@ static UBool isLocaleInList(UEnumeration *locEnum, const char *locToSearch, UErr
     return false;
 }
 
+static void getParentForFunctionalEquivalent(const char*      localeID,
+                                             UResourceBundle* res,
+                                             UResourceBundle* bund1,
+                                             char*            parent,
+                                             int32_t          parentCapacity) {
+    // Writes the parent locale of localeID into parent (capacity parentCapacity).
+    // A %%Parent resource in res is tried first, because resource trees such as
+    // collation may have different parents than parentLocales; bund1 receives that lookup.
+    parent[0] = 0;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    if (res != nullptr) {
+        ures_getByKey(res, "%%Parent", bund1, &errorCode);
+        if (U_SUCCESS(errorCode)) {
+            int32_t length = parentCapacity;
+            ures_getUTF8String(bund1, parent, &length, true, &errorCode);
+        }
+    }
+
+    // Done if the %%Parent lookup succeeded and produced a non-empty value.
+    if (U_SUCCESS(errorCode) && parent[0] != 0) { return; }
+    // If none there, use normal truncation parent
+    errorCode = U_ZERO_ERROR;
+    uloc_getParent(localeID, parent, parentCapacity, &errorCode);
+}
+
 U_CAPI int32_t U_EXPORT2
 ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
                              const char *path, const char *resName, const char *keyword, const char *locid,
                              UBool *isAvailable, UBool omitDefault, UErrorCode *status)
 {
-    char kwVal[1024] = ""; /* value of keyword 'keyword' */
     char defVal[1024] = ""; /* default value for given locale */
     char defLoc[1024] = ""; /* default value for given locale */
-    char base[1024] = ""; /* base locale */
+    CharString base; /* base locale */
     char found[1024] = "";
     char parent[1024] = "";
     char full[1024] = "";
@@ -2991,23 +3078,29 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
     UErrorCode subStatus = U_ZERO_ERROR;
     int32_t length = 0;
     if(U_FAILURE(*status)) return 0;
-    uloc_getKeywordValue(locid, keyword, kwVal, 1024-1,&subStatus);
-    if(!uprv_strcmp(kwVal, DEFAULT_TAG)) {
-        kwVal[0]=0;
+    CharString kwVal;
+    {
+        CharStringByteSink sink(&kwVal);
+        ulocimp_getKeywordValue(locid, keyword, sink, &subStatus);
+    }
+    if(kwVal == DEFAULT_TAG) {
+        kwVal.clear();
+    }
+    {
+        CharStringByteSink sink(&base);
+        ulocimp_getBaseName(locid, sink, &subStatus);
     }
-    uloc_getBaseName(locid, base, 1024-1,&subStatus);
 #if defined(URES_TREE_DEBUG)
     fprintf(stderr, "getFunctionalEquivalent: \"%s\" [%s=%s] in %s - %s\n", 
-            locid, keyword, kwVal, base, u_errorName(subStatus));
+            locid, keyword, kwVal.data(), base.data(), u_errorName(subStatus));
 #endif
     ures_initStackObject(&bund1);
     ures_initStackObject(&bund2);
-    
-    
-    uprv_strcpy(parent, base);
-    uprv_strcpy(found, base);
 
-    if(isAvailable) { 
+    base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
+    base.extract(found, UPRV_LENGTHOF(found), subStatus);
+
+    if(isAvailable) {
         UEnumeration *locEnum = ures_openAvailableLocales(path, &subStatus);
         *isAvailable = true;
         if (U_SUCCESS(subStatus)) {
@@ -3054,11 +3147,11 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
                         path?path:"ICUDATA", parent, keyword, defVal, u_errorName(subStatus));
 #endif
                     uprv_strcpy(defLoc, parent);
-                    if(kwVal[0]==0) {
-                        uprv_strcpy(kwVal, defVal);
+                    if(kwVal.isEmpty()) {
+                        kwVal.append(defVal, defLen, subStatus);
 #if defined(URES_TREE_DEBUG)
                         fprintf(stderr, "%s;%s -> kwVal =  %s\n", 
-                            path?path:"ICUDATA", parent, keyword, kwVal);
+                            path?path:"ICUDATA", parent, keyword, kwVal.data());
 #endif
                     }
                 }
@@ -3071,16 +3164,19 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
             uprv_strcpy(found, ures_getLocaleByType(res, ULOC_VALID_LOCALE, &subStatus));
         }
 
-        uloc_getParent(found,parent,sizeof(parent),&subStatus);
+        if (uprv_strcmp(found, parent) != 0) {
+            uprv_strcpy(parent, found);
+        } else {
+            getParentForFunctionalEquivalent(found,res,&bund1,parent,sizeof(parent));
+        }
         ures_close(res);
     } while(!defVal[0] && *found && uprv_strcmp(found, "root") != 0 && U_SUCCESS(*status));
     
     /* Now, see if we can find the kwVal collator.. start the search over.. */
-    uprv_strcpy(parent, base);
-    uprv_strcpy(found, base);
-    
+    base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
+    base.extract(found, UPRV_LENGTHOF(found), subStatus);
+
     do {
-        subStatus = U_ZERO_ERROR;
         res = ures_open(path, parent, &subStatus);
         if((subStatus == U_USING_FALLBACK_WARNING) && isAvailable) {
             *isAvailable = false;
@@ -3089,7 +3185,7 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
         
 #if defined(URES_TREE_DEBUG)
         fprintf(stderr, "%s;%s -> %s (looking for %s)\n", 
-            path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal);
+            path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal.data());
 #endif
         if(U_FAILURE(subStatus)) {
             *status = subStatus;
@@ -3099,14 +3195,14 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
 /**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, resName, u_errorName(subStatus));
 #endif
             if(subStatus == U_ZERO_ERROR) {
-                ures_getByKey(&bund1, kwVal, &bund2, &subStatus);
+                ures_getByKey(&bund1, kwVal.data(), &bund2, &subStatus);
 #if defined(URES_TREE_DEBUG)
-/**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, kwVal, u_errorName(subStatus));
+/**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, kwVal.data(), u_errorName(subStatus));
 #endif
                 if(subStatus == U_ZERO_ERROR) {
 #if defined(URES_TREE_DEBUG)
                     fprintf(stderr, "%s;%s -> full0 %s=%s,  %s\n", 
-                        path?path:"ICUDATA", parent, keyword, kwVal, u_errorName(subStatus));
+                        path?path:"ICUDATA", parent, keyword, kwVal.data(), u_errorName(subStatus));
 #endif
                     uprv_strcpy(full, parent);
                     if(*full == 0) {
@@ -3139,29 +3235,52 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
                 } else {
 #if defined(URES_TREE_DEBUG)
                     fprintf(stderr, "err=%s in %s looking for %s\n", 
-                        u_errorName(subStatus), parent, kwVal);
+                        u_errorName(subStatus), parent, kwVal.data());
 #endif
                 }
             }
         }
         
-        subStatus = U_ZERO_ERROR;
-        
-        uprv_strcpy(found, parent);
-        uloc_getParent(found,parent,1023,&subStatus);
+        UBool haveFound = false;
+        // At least for collations which may be aliased, we need to use the VALID locale
+        // as the parent instead of just truncating, as long as the VALID locale is not
+        // root and has a different language than the parent. Use of the VALID locale
+        // here is similar to the procedure used at the end of the previous do-while loop
+        // for all resource types.
+        if (res != NULL && uprv_strcmp(resName, "collations") == 0) {
+            subStatus = U_ZERO_ERROR;
+            const char *validLoc = ures_getLocaleByType(res, ULOC_VALID_LOCALE, &subStatus);
+            if (U_SUCCESS(subStatus) && validLoc != NULL && validLoc[0] != 0 && uprv_strcmp(validLoc, "root") != 0) {
+                char validLang[ULOC_LANG_CAPACITY];
+                char parentLang[ULOC_LANG_CAPACITY];
+                uloc_getLanguage(validLoc, validLang, ULOC_LANG_CAPACITY, &subStatus);
+                uloc_getLanguage(parent, parentLang, ULOC_LANG_CAPACITY, &subStatus);
+                if (U_SUCCESS(subStatus) && uprv_strcmp(validLang, parentLang) != 0) {
+                    // validLoc is not root and has a different language than parent, use it instead
+                    uprv_strcpy(found, validLoc);
+                    haveFound = true;
+                }
+            }
+            subStatus = U_ZERO_ERROR;
+        }
+        if (!haveFound) {
+            uprv_strcpy(found, parent);
+        }
+
+        getParentForFunctionalEquivalent(found,res,&bund1,parent,1023);
         ures_close(res);
+        subStatus = U_ZERO_ERROR;
     } while(!full[0] && *found && U_SUCCESS(*status));
-    
-    if((full[0]==0) && uprv_strcmp(kwVal, defVal)) {
+
+    if((full[0]==0) && kwVal != defVal) {
 #if defined(URES_TREE_DEBUG)
-        fprintf(stderr, "Failed to locate kw %s - try default %s\n", kwVal, defVal);
+        fprintf(stderr, "Failed to locate kw %s - try default %s\n", kwVal.data(), defVal);
 #endif
-        uprv_strcpy(kwVal, defVal);
-        uprv_strcpy(parent, base);
-        uprv_strcpy(found, base);
-        
+        kwVal.clear().append(defVal, subStatus);
+        base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
+        base.extract(found, UPRV_LENGTHOF(found), subStatus);
+
         do { /* search for 'default' named item */
-            subStatus = U_ZERO_ERROR;
             res = ures_open(path, parent, &subStatus);
             if((subStatus == U_USING_FALLBACK_WARNING) && isAvailable) {
                 *isAvailable = false;
@@ -3170,18 +3289,18 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
             
 #if defined(URES_TREE_DEBUG)
             fprintf(stderr, "%s;%s -> %s (looking for default %s)\n",
-                path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal);
+                path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal.data());
 #endif
             if(U_FAILURE(subStatus)) {
                 *status = subStatus;
             } else if(subStatus == U_ZERO_ERROR) {
                 ures_getByKey(res,resName,&bund1, &subStatus);
                 if(subStatus == U_ZERO_ERROR) {
-                    ures_getByKey(&bund1, kwVal, &bund2, &subStatus);
+                    ures_getByKey(&bund1, kwVal.data(), &bund2, &subStatus);
                     if(subStatus == U_ZERO_ERROR) {
 #if defined(URES_TREE_DEBUG)
                         fprintf(stderr, "%s;%s -> full1 %s=%s,  %s\n", path?path:"ICUDATA",
-                            parent, keyword, kwVal, u_errorName(subStatus));
+                            parent, keyword, kwVal.data(), u_errorName(subStatus));
 #endif
                         uprv_strcpy(full, parent);
                         if(*full == 0) {
@@ -3215,18 +3334,18 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
                     }
                 }
             }
-            subStatus = U_ZERO_ERROR;
             
             uprv_strcpy(found, parent);
-            uloc_getParent(found,parent,1023,&subStatus);
+            getParentForFunctionalEquivalent(found,res,&bund1,parent,1023);
             ures_close(res);
+            subStatus = U_ZERO_ERROR;
         } while(!full[0] && *found && U_SUCCESS(*status));
     }
     
     if(U_SUCCESS(*status)) {
         if(!full[0]) {
 #if defined(URES_TREE_DEBUG)
-          fprintf(stderr, "Still could not load keyword %s=%s\n", keyword, kwVal);
+          fprintf(stderr, "Still could not load keyword %s=%s\n", keyword, kwVal.data());
 #endif
           *status = U_MISSING_RESOURCE_ERROR;
         } else if(omitDefault) {
@@ -3235,21 +3354,21 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
 #endif        
           if(uprv_strlen(defLoc) <= uprv_strlen(full)) {
             /* found the keyword in a *child* of where the default tag was present. */
-            if(!uprv_strcmp(kwVal, defVal)) { /* if the requested kw is default, */
+            if(kwVal == defVal) { /* if the requested kw is default, */
               /* and the default is in or in an ancestor of the current locale */
 #if defined(URES_TREE_DEBUG)
-              fprintf(stderr, "Removing unneeded var %s=%s\n", keyword, kwVal);
+              fprintf(stderr, "Removing unneeded var %s=%s\n", keyword, kwVal.data());
 #endif
-              kwVal[0]=0;
+              kwVal.clear();
             }
           }
         }
         uprv_strcpy(found, full);
-        if(kwVal[0]) {
+        if(!kwVal.isEmpty()) {
             uprv_strcat(found, "@");
             uprv_strcat(found, keyword);
             uprv_strcat(found, "=");
-            uprv_strcat(found, kwVal);
+            uprv_strcat(found, kwVal.data());
         } else if(!omitDefault) {
             uprv_strcat(found, "@");
             uprv_strcat(found, keyword);

+ 10 - 5
thirdparty/icu4c/common/ustrcase.cpp

@@ -1130,14 +1130,18 @@ int32_t toUpper(uint32_t options,
             // Adding one only to the final vowel in a longer sequence
             // (which does not occur in normal writing) would require lookahead.
             // Set the same flag as for preserving an existing dialytika.
-            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
-                    (upper == 0x399 || upper == 0x3A5)) {
-                data |= HAS_DIALYTIKA;
+            if ((data & HAS_VOWEL) != 0 &&
+                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
+                    0 &&
+                (upper == 0x399 || upper == 0x3A5)) {
+                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
+                                                                      : HAS_COMBINING_DIALYTIKA;
             }
             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
             if ((data & HAS_YPOGEGRAMMENI) != 0) {
                 numYpogegrammeni = 1;
             }
+            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
             // Skip combining diacritics after this Greek letter.
             while (nextIndex < srcLength) {
                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
@@ -1152,7 +1156,8 @@ int32_t toUpper(uint32_t options,
                 }
             }
             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
-                nextState |= AFTER_VOWEL_WITH_ACCENT;
+                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
+                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
             }
             // Map according to Greek rules.
             UBool addTonos = false;
@@ -1163,7 +1168,7 @@ int32_t toUpper(uint32_t options,
                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                 // Keep disjunctive "or" with (only) a tonos.
                 // We use the same "word boundary" conditions as for the Final_Sigma test.
-                if (i == nextIndex) {
+                if (hasPrecomposedAccent) {
                     upper = 0x389;  // Preserve the precomposed form.
                 } else {
                     addTonos = true;

+ 1 - 12
thirdparty/icu4c/common/uts46.cpp

@@ -669,14 +669,6 @@ UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart
     return length;
 }
 
-// Some non-ASCII characters are equivalent to sequences with
-// non-LDH ASCII characters. To find them:
-// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
-static inline UBool
-isNonASCIIDisallowedSTD3Valid(UChar32 c) {
-    return c==0x2260 || c==0x226E || c==0x226F;
-}
-
 // Replace the label in dest with the label string, if the label was modified.
 // If &label==&dest then the label was modified in-place and labelLength
 // is the new label length, different from label.length().
@@ -820,10 +812,7 @@ UTS46::processLabel(UnicodeString &dest,
             }
         } else {
             oredChars|=c;
-            if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
-                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
-                *s=0xfffd;
-            } else if(c==0xfffd) {
+            if(c==0xfffd) {
                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
             }
         }

+ 297 - 10
thirdparty/icu4c/i18n/unicode/uspoof.h

@@ -19,6 +19,7 @@
 #ifndef USPOOF_H
 #define USPOOF_H
 
+#include "unicode/ubidi.h"
 #include "unicode/utypes.h"
 #include "unicode/uset.h"
 #include "unicode/parseerr.h"
@@ -83,6 +84,25 @@
  * the instance should be created once (e.g., upon application startup), and the efficient
  * {@link uspoof_areConfusable} method can be used at runtime.
  *
+ * If the paragraph direction used to display the strings is known, the bidi function should be used instead:
+ *
+ * \code{.c}
+ * UErrorCode status = U_ZERO_ERROR;
+ * // These strings look identical when rendered in a left-to-right context.
+ * // They look distinct in a right-to-left context.
+ * UChar* str1 = (UChar*) u"A1\u05D0";  // A1א
+ * UChar* str2 = (UChar*) u"A\u05D01";  // Aא1
+ *
+ * USpoofChecker* sc = uspoof_open(&status);
+ * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
+ *
+ * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
+ * UBool result = bitmask != 0;
+ * // areBidiConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
+ * uspoof_close(sc);
+ * \endcode
+ *
  * <p>
  * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
  * {@link uspoof_close} when the object goes out of scope:
@@ -339,6 +359,51 @@
  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
  * scripts.
  *
+ * <h2>Advanced bidirectional usage</h2>
+ * If the paragraph direction with which the identifiers will be displayed is not known, there are
+ * multiple options for confusable detection depending on the circumstances.
+ *
+ * <p>
+ * In some circumstances, the only concern is confusion between identifiers displayed with the same
+ * paragraph direction.
+ *
+ * <p>
+ * An example is the case where identifiers are usernames prefixed with the @ symbol.
+ * That symbol will appear to the left in a left-to-right context, and to the right in a
+ * right-to-left context, so that an identifier displayed in a left-to-right context can never be
+ * confused with an identifier displayed in a right-to-left context:
+ * <ul>
+ * <li>
+ * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
+ * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
+ * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
+ * confusable, since they both appear as A_1א@ in a right-to-left context.
+ * </li>
+ * <li>
+ * The username "Mark_" would not be considered confusable with the username "_Mark",
+ * even though the latter would appear as Mark_@ in a right-to-left context, and the
+ * former as \@Mark_ in a left-to-right context.
+ * </li>
+ * </ul>
+ * <p>
+ * In that case, the caller should check for both LTR-confusability and RTL-confusability:
+ *
+ * \code{.cpp}
+ * bool confusableInEitherDirection =
+ *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
+ *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
+ * \endcode
+ *
+ * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
+ * with LTR and RTL with RTL.
+ *
+ * <p>
+ * In cases where confusability between the visual appearances of an identifier displayed in a
+ * left-to-right context with another identifier displayed in a right-to-left context is a concern,
+ * the LTR skeleton of one can be compared with the RTL skeleton of the other.  However, this
+ * very broad definition of confusability may have unexpected results; for instance, it treats the
+ * ASCII identifiers "Mark_" and "_Mark" as confusable.
+ *
  * <h2>Additional Information</h2>
  *
  * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
@@ -519,7 +584,7 @@ typedef enum USpoofChecks {
 
 
     /**
-     * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
+     * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
      * for returned identifier restriction levels in check results.
      *
      * @stable ICU 51
@@ -633,8 +698,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
 /**
   * Open a Spoof Checker from the source form of the spoof data.
   * The input corresponds to the Unicode data file confusables.txt
-  * as described in Unicode UAX #39.  The syntax of the source data
-  * is as described in UAX #39 for this file, and the content of
+  * as described in Unicode Technical Standard #39.  The syntax of the source data
+  * is as described in UTS #39 for this file, and the content of
   * this file is acceptable input.
   *
   * The character encoding of the (char *) input text is UTF-8.
@@ -1111,7 +1176,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
 
 
 /**
- * Check the whether two specified strings are visually confusable.
+ * Check whether two specified strings are visually confusable.
  *
  * If the strings are confusable, the return value will be nonzero, as long as
  * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
@@ -1159,7 +1224,58 @@ uspoof_areConfusable(const USpoofChecker *sc,
                      const UChar *id2, int32_t length2,
                      UErrorCode *status);
 
-
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Check whether two specified strings are visually confusable when
+ * displayed in a context with the given paragraph direction.
+ *
+ * If the strings are confusable, the return value will be nonzero, as long as
+ * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
+ *
+ * The bits in the return value correspond to flags for each of the classes of
+ * confusables applicable to the two input strings.  According to UTS 39
+ * section 4, the possible flags are:
+ *
+ * <ul>
+ *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
+ *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
+ *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
+ * </ul>
+ *
+ * If one or more of the above flags were not listed in uspoof_setChecks(), this
+ * function will never report that class of confusable.  The check
+ * {@link USPOOF_CONFUSABLE} enables all three flags.
+ *
+ *
+ * @param sc      The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id1     The first of the two identifiers to be compared for
+ *                confusability.  The strings are in UTF-16 format.
+ * @param length1 the length of the first identifier, expressed in
+ *                16 bit UTF-16 code units, or -1 if the string is
+ *                nul terminated.
+ * @param id2     The second of the two identifiers to be compared for
+ *                confusability.  The identifiers are in UTF-16 format.
+ * @param length2 The length of the second identifier, expressed in
+ *                16 bit UTF-16 code units, or -1 if the string is
+ *                nul terminated.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.
+ *                Confusability of the identifiers is not reported here,
+ *                but through this function's return value.
+ * @return        An integer value with bit(s) set corresponding to
+ *                the type of confusability found, as defined by
+ *                enum USpoofChecks.  Zero is returned if the identifiers
+ *                are not confusable.
+ *
+ * @draft ICU 74
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
+                                                  const UChar *id1, int32_t length1,
+                                                  const UChar *id2, int32_t length2,
+                                                  UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
 
 /**
  * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
@@ -1192,14 +1308,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
                          const char *id2, int32_t length2,
                          UErrorCode *status);
 
-
-
+#ifndef U_HIDE_DRAFT_API
+/**
+ * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
+ *
+ * @param sc      The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id1     The first of the two identifiers to be compared for
+ *                confusability.  The strings are in UTF-8 format.
+ * @param length1 The length of the first identifier, in bytes, or -1
+ *                if the string is nul terminated.
+ * @param id2     The second of the two identifiers to be compared for
+ *                confusability.  The strings are in UTF-8 format.
+ * @param length2 The length of the second string in bytes, or -1
+ *                if the string is nul terminated.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.
+ *                Confusability of the strings is not reported here,
+ *                but through this function's return value.
+ * @return        An integer value with bit(s) set corresponding to
+ *                the type of confusability found, as defined by
+ *                enum USpoofChecks.  Zero is returned if the strings
+ *                are not confusable.
+ *
+ * @draft ICU 74
+ *
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
+                                                      const char *id1, int32_t length1,
+                                                      const char *id2, int32_t length2,
+                                                      UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
 
 /**
  *  Get the "skeleton" for an identifier.
  *  Skeletons are a transformation of the input identifier;
  * Two identifiers are confusable if their skeletons are identical.
- *  See Unicode UAX #39 for additional information.
+ *  See Unicode Technical Standard #39 for additional information.
  *
  *  Using skeletons directly makes it possible to quickly check
  *  whether an identifier is confusable with any of some large
@@ -1233,11 +1380,50 @@ uspoof_getSkeleton(const USpoofChecker *sc,
                    UChar *dest, int32_t destCapacity,
                    UErrorCode *status);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ *  Get the "bidiSkeleton" for an identifier and a direction.
+ *  Skeletons are a transformation of the input identifier;
+ *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ *  they are RTL-confusable if their RTL bidiSkeletons are identical.
+ *  See Unicode Technical Standard #39 for additional information:
+ *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ *  Using skeletons directly makes it possible to quickly check
+ *  whether an identifier is confusable with any of some large
+ *  set of existing identifiers, by creating an efficiently
+ *  searchable collection of the skeletons.
+ *
+ * @param sc      The USpoofChecker.
+ * @param direction The context direction with which the identifier will be
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id      The input identifier whose skeleton will be computed.
+ * @param length  The length of the input identifier, expressed in 16 bit
+ *                UTF-16 code units, or -1 if the string is zero terminated.
+ * @param dest    The output buffer, to receive the skeleton string.
+ * @param destCapacity  The length of the output buffer, in 16 bit units.
+ *                The destCapacity may be zero, in which case the function will
+ *                return the actual length of the skeleton.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.
+ * @return        The length of the skeleton string.  The returned length
+ *                is always that of the complete skeleton, even when the
+ *                supplied buffer is too small (or of zero length)
+ *
+ * @draft ICU 74
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
+                                                UBiDiDirection direction,
+                                                const UChar *id, int32_t length,
+                                                UChar *dest, int32_t destCapacity, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
 /**
  *  Get the "skeleton" for an identifier.
  *  Skeletons are a transformation of the input identifier;
  *  Two identifiers are confusable if their skeletons are identical.
- *  See Unicode UAX #39 for additional information.
+ *  See Unicode Technical Standard #39 for additional information.
  *
  *  Using skeletons directly makes it possible to quickly check
  *  whether an identifier is confusable with any of some large
@@ -1273,6 +1459,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
                        char *dest, int32_t destCapacity,
                        UErrorCode *status);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ *  Get the "bidiSkeleton" for an identifier and a direction.
+ *  Skeletons are a transformation of the input identifier;
+ *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ *  they are RTL-confusable if their RTL bidiSkeletons are identical.
+ *  See Unicode Technical Standard #39 for additional information:
+ *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ *  Using skeletons directly makes it possible to quickly check
+ *  whether an identifier is confusable with any of some large
+ *  set of existing identifiers, by creating an efficiently
+ *  searchable collection of the skeletons.
+ *
+ * @param sc      The USpoofChecker
+ * @param direction The context direction with which the identifier will be
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id      The UTF-8 format identifier whose skeleton will be computed.
+ * @param length  The length of the input string, in bytes,
+ *                or -1 if the string is zero terminated.
+ * @param dest    The output buffer, to receive the skeleton string.
+ * @param destCapacity  The length of the output buffer, in bytes.
+ *                The destCapacity may be zero, in which case the function will
+ *                return the actual length of the skeleton.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.  Possible Errors include U_INVALID_CHAR_FOUND
+ *                for invalid UTF-8 sequences, and
+ *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
+ *                to hold the complete skeleton.
+ * @return        The length of the skeleton string, in bytes.  The returned length
+ *                is always that of the complete skeleton, even when the
+ *                supplied buffer is too small (or of zero length)
+ *
+ * @draft ICU 74
+ */
+U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
+                                                    const char *id, int32_t length, char *dest,
+                                                    int32_t destCapacity, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
 /**
   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
   * in http://unicode.org/Public/security/latest/xidmodifications.txt
@@ -1510,11 +1736,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
                                   const icu::UnicodeString &s2,
                                   UErrorCode *status);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
+ *
+ * @param sc      The USpoofChecker
+ * @param direction The paragraph direction with which the identifiers are
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param s1     The first of the two identifiers to be compared for
+ *                confusability, as a UnicodeString.
+ * @param s2     The second of the two identifiers to be compared for
+ *                confusability, as a UnicodeString.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.
+ *                Confusability of the identifiers is not reported here,
+ *                but through this function's return value.
+ * @return        An integer value with bit(s) set corresponding to
+ *                the type of confusability found, as defined by
+ *                enum USpoofChecks.  Zero is returned if the identifiers
+ *                are not confusable.
+ *
+ * @draft ICU 74
+ *
+ * @see uspoof_areBidiConfusable
+ */
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
+                                                               UBiDiDirection direction,
+                                                               const icu::UnicodeString &s1,
+                                                               const icu::UnicodeString &s2,
+                                                               UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
 /**
  *  Get the "skeleton" for an identifier.
  *  Skeletons are a transformation of the input identifier;
  *  Two identifiers are confusable if their skeletons are identical.
- *  See Unicode UAX #39 for additional information.
+ *  See Unicode Technical Standard #39 for additional information.
  *
  *  Using skeletons directly makes it possible to quickly check
  *  whether an identifier is confusable with any of some large
@@ -1540,6 +1797,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                 icu::UnicodeString &dest,
                                 UErrorCode *status);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ *  Get the "bidiSkeleton" for an identifier and a direction.
+ *  Skeletons are a transformation of the input identifier;
+ *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
+ *  they are RTL-confusable if their RTL bidiSkeletons are identical.
+ *  See Unicode Technical Standard #39 for additional information:
+ *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
+ *
+ *  Using skeletons directly makes it possible to quickly check
+ *  whether an identifier is confusable with any of some large
+ *  set of existing identifiers, by creating an efficiently
+ *  searchable collection of the skeletons.
+ *
+ * @param sc      The USpoofChecker.
+ * @param direction The context direction with which the identifier will be
+ *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
+ * @param id      The input identifier whose bidiSkeleton will be computed.
+ * @param dest    The output string, to receive the skeleton.
+ * @param status  The error code, set if an error occurred while attempting to
+ *                perform the check.
+ * @return        A reference to the destination (skeleton) string.
+ *
+ * @draft ICU 74
+ */
+U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
+    const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
+    icu::UnicodeString &dest, UErrorCode *status);
+#endif /* U_HIDE_DRAFT_API */
+
 /**
   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
   * in http://unicode.org/Public/security/latest/xidmodifications.txt

+ 170 - 12
thirdparty/icu4c/i18n/uspoof.cpp

@@ -15,6 +15,7 @@
 *
 *   Unicode Spoof Detection
 */
+#include "unicode/ubidi.h"
 #include "unicode/utypes.h"
 #include "unicode/normalizer2.h"
 #include "unicode/uspoof.h"
@@ -141,8 +142,8 @@ void U_CALLCONV initializeStatics(UErrorCode &status) {
         u"\\U0001DF00-\\U0001DF1E\\U0001DF25-\\U0001DF2A\\U0001E08F\\U0001E7E0-"
         u"\\U0001E7E6\\U0001E7E8-\\U0001E7EB\\U0001E7ED\\U0001E7EE\\U0001E7F0-"
         u"\\U0001E7FE\\U00020000-\\U0002A6DF\\U0002A700-\\U0002B739\\U0002B740-"
-        u"\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0\\U00030000-"
-        u"\\U0003134A\\U00031350-\\U000323AF]";
+        u"\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0\\U0002EBF0-"
+        u"\\U0002EE5D\\U00030000-\\U0003134A\\U00031350-\\U000323AF]";
 
     gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat), status);
     if (gRecommendedSet == nullptr) {
@@ -538,6 +539,90 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
     return result;
 }
 
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction, // char16_t entry point: wraps both inputs and defers to the UnicodeString overload
+                                                  const char16_t *id1, int32_t length1,
+                                                  const char16_t *id2, int32_t length2,
+                                                   UErrorCode *status) {
+    UnicodeString id1Str((length1 == -1), id1, length1); // Aliasing constructor; first argument is true exactly when the caller passed length -1
+    UnicodeString id2Str((length2 == -1), id2, length2); // Aliasing constructor; same convention as id1Str
+    if (id1Str.isBogus() || id2Str.isBogus()) { // presumably bogus means the constructor rejected the pointer/length pair -- confirm against UnicodeString docs
+        *status = U_ILLEGAL_ARGUMENT_ERROR; // NOTE(review): written without first checking the incoming *status
+        return 0; // 0 is also the "not confusable" value; callers must inspect *status
+    }
+    return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status); // result and status come straight from the UnicodeString overload
+}
+
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction, // char* entry point: converts both inputs with fromUTF8 and defers to the UnicodeString overload
+                                                      const char *id1, int32_t length1, const char *id2,
+                                                      int32_t length2, UErrorCode *status) {
+    if (length1 < -1 || length2 < -1) { // -1 is the only accepted negative length; it selects the uprv_strlen path below
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    UnicodeString id1Str = UnicodeString::fromUTF8(
+        StringPiece(id1, length1 >= 0 ? length1 : static_cast<int32_t>(uprv_strlen(id1)))); // NOTE(review): id1 reaches uprv_strlen with no null check visible here -- confirm callers never pass null with length -1
+    UnicodeString id2Str = UnicodeString::fromUTF8(
+        StringPiece(id2, length2 >= 0 ? length2 : static_cast<int32_t>(uprv_strlen(id2)))); // NOTE(review): same null-pointer question as for id1
+    return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status); // result and status come straight from the UnicodeString overload
+}
+
+U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc, // returns a mask of USPOOF_*_CONFUSABLE bits filtered by the checker's fChecks; 0 when the skeletons differ or on error
+                                                               UBiDiDirection direction,
+                                                               const icu::UnicodeString &id1,
+                                                               const icu::UnicodeString &id2,
+                                                               UErrorCode *status) {
+    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); // sc is only used below after the status check passes
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+    //
+    // See section 4 of UTS 39 for the algorithm for checking whether two strings are confusable,
+    //   and for definitions of the types (single, whole, mixed-script) of confusables.
+
+    // We only care about a few of the check flags.  Ignore the others.
+    // If no tests relevant to this function have been specified, return an error.
+    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
+    //        but logically we would just return 0 (no error).
+    if ((This->fChecks & USPOOF_CONFUSABLE) == 0) { // none of the confusable bits are enabled on this checker
+        *status = U_INVALID_STATE_ERROR;
+        return 0;
+    }
+
+    // Compute the skeletons and check for confusability.
+    UnicodeString id1Skeleton;
+    uspoof_getBidiSkeletonUnicodeString(sc, direction, id1, id1Skeleton, status);
+    UnicodeString id2Skeleton;
+    uspoof_getBidiSkeletonUnicodeString(sc, direction, id2, id2Skeleton, status);
+    if (U_FAILURE(*status)) { // a single check covers failures from either skeleton call above
+        return 0;
+    }
+    if (id1Skeleton != id2Skeleton) { // different bidi skeletons for this direction: not confusable
+        return 0;
+    }
+
+    // If we get here, the strings are confusable.  Now we just need to set the flags for the appropriate
+    // classes of confusables according to UTS 39 section 4. Start by computing the resolved script sets
+    // of id1 and id2.
+    ScriptSet id1RSS;
+    This->getResolvedScriptSet(id1, id1RSS, *status); // script sets are computed from the original identifiers, not from the skeletons
+    ScriptSet id2RSS;
+    This->getResolvedScriptSet(id2, id2RSS, *status); // NOTE(review): *status is not re-checked after these two calls -- confirm that is intended
+
+    // Turn on all applicable flags
+    uint32_t result = 0;
+    if (id1RSS.intersects(id2RSS)) { // the resolved script sets overlap
+        result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
+    } else { // no overlap: always mixed-script, and additionally whole-script when neither set is empty
+        result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
+        if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
+            result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
+        }
+    }
+
+    // Turn off flags that the user doesn't want
+    return result & This->fChecks; // only bits that were enabled on the checker survive
+}
+
 
 U_CAPI int32_t U_EXPORT2
 uspoof_checkUnicodeString(const USpoofChecker *sc,
@@ -697,6 +782,60 @@ uspoof_getSkeleton(const USpoofChecker *sc,
     return destStr.length();
 }
 
+// C API entry point for the bidi skeleton of a UTF-16 identifier.
+//
+// Wraps id/length in a UnicodeString, delegates the computation to
+// uspoof_getBidiSkeletonUnicodeString(), and extracts the result into dest.
+// When the id/length pair produces a bogus UnicodeString, *status is set to
+// U_ILLEGAL_ARGUMENT_ERROR and 0 is returned; otherwise the return value is
+// whatever UnicodeString::extract() reports for dest/destCapacity.
+U_CAPI int32_t U_EXPORT2
+uspoof_getBidiSkeleton(const USpoofChecker *sc,
+                       UBiDiDirection direction,
+                       const UChar *id, int32_t length,
+                       UChar *dest, int32_t destCapacity,
+                       UErrorCode *status) {
+    // Aliasing constructor: wraps the caller's buffer instead of copying it.
+    UnicodeString source((length == -1), id, length);
+    if (!source.isBogus()) {
+        UnicodeString skeleton;
+        uspoof_getBidiSkeletonUnicodeString(sc, direction, source, skeleton, status);
+        return skeleton.extract(dest, destCapacity, *status);
+    }
+    *status = U_ILLEGAL_ARGUMENT_ERROR;
+    return 0;
+}
+
+
+
+// Computes the bidirectional skeleton of `id` into `dest`.
+//
+// The text of `id` is run through the bidi algorithm with `direction` as the
+// paragraph level, written out in reordered form (keeping base+combining order
+// and mirroring characters), and the reordered text is then passed to
+// uspoof_getSkeletonUnicodeString().
+//
+// `direction` must be UBIDI_LTR or UBIDI_RTL; any other value sets
+// U_ILLEGAL_ARGUMENT_ERROR. `dest` is cleared up front, so it is empty on every
+// early-return path of this function. If *status already holds a failure on
+// entry, it is left untouched and no work is done.
+//
+// Returns a reference to `dest`.
+U_I18N_API UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(const USpoofChecker *sc,
+                                                                        UBiDiDirection direction,
+                                                                        const UnicodeString &id,
+                                                                        UnicodeString &dest,
+                                                                        UErrorCode *status) {
+    dest.remove();
+    // Never replace an error the caller already holds with one of our own.
+    if (U_FAILURE(*status)) {
+        return dest;
+    }
+    if (direction != UBIDI_LTR && direction != UBIDI_RTL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return dest;
+    }
+    // The smart pointer calls ubidi_close() on every exit path below.
+    LocalUBiDiPointer bidi(ubidi_open());
+    if (bidi.isNull()) {
+        // Report the allocation failure as such rather than passing a null
+        // UBiDi on to ubidi_setPara().
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return dest;
+    }
+    ubidi_setPara(bidi.getAlias(), id.getBuffer(), id.length(), direction,
+                  /*embeddingLevels*/ nullptr, status);
+    if (U_FAILURE(*status)) {
+        return dest;
+    }
+    UnicodeString reordered;
+    int32_t const size = ubidi_getProcessedLength(bidi.getAlias());
+    UChar* const reorderedBuffer = reordered.getBuffer(size);
+    if (reorderedBuffer == nullptr) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return dest;
+    }
+    ubidi_writeReordered(bidi.getAlias(), reorderedBuffer, size,
+                         UBIDI_KEEP_BASE_COMBINING | UBIDI_DO_MIRRORING, status);
+    reordered.releaseBuffer(size);
+
+    if (U_FAILURE(*status)) {
+        return dest;
+    }
+
+    // The type parameter is deprecated since ICU 58; any number may be passed.
+    constexpr uint32_t deprecatedType = 58;
+    return uspoof_getSkeletonUnicodeString(sc, deprecatedType, reordered, dest, status);
+}
+
 
 
 U_I18N_API UnicodeString &  U_EXPORT2
@@ -721,19 +860,17 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
     for (inputIndex=0; inputIndex < normalizedLen; ) {
         UChar32 c = nfdId.char32At(inputIndex);
         inputIndex += U16_LENGTH(c);
-        This->fSpoofData->confusableLookup(c, skelStr);
+        if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
+            This->fSpoofData->confusableLookup(c, skelStr);
+        }
     }
 
     gNfdNormalizer->normalize(skelStr, dest, *status);
     return dest;
 }
 
-
-U_CAPI int32_t U_EXPORT2
-uspoof_getSkeletonUTF8(const USpoofChecker *sc,
-                       uint32_t type,
-                       const char *id,  int32_t length,
-                       char *dest, int32_t destCapacity,
+U_CAPI int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, const char *id,
+                                                int32_t length, char *dest, int32_t destCapacity,
                        UErrorCode *status) {
     SpoofImpl::validateThis(sc, *status);
     if (U_FAILURE(*status)) {
@@ -744,7 +881,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
         return 0;
     }
 
-    UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id))));
+    UnicodeString srcStr = UnicodeString::fromUTF8(
+        StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
     UnicodeString destStr;
     uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
     if (U_FAILURE(*status)) {
@@ -752,8 +890,28 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
     }
 
     int32_t lengthInUTF8 = 0;
-    u_strToUTF8(dest, destCapacity, &lengthInUTF8,
-                destStr.getBuffer(), destStr.length(), status);
+    u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
+    return lengthInUTF8;
+}
+
+// UTF-8 flavour of the bidi skeleton C API.
+//
+// Converts id from UTF-8 to a UnicodeString, delegates to
+// uspoof_getBidiSkeletonUnicodeString(), and converts the resulting skeleton
+// back to UTF-8 in dest. A length below -1 sets U_ILLEGAL_ARGUMENT_ERROR; a
+// negative length otherwise means the byte count is taken from uprv_strlen(id).
+// Returns the UTF-8 length reported by u_strToUTF8(), or 0 when the skeleton
+// computation failed.
+U_CAPI int32_t U_EXPORT2
+uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc,
+                           UBiDiDirection direction,
+                           const char *id, int32_t length,
+                           char *dest, int32_t destCapacity,
+                           UErrorCode *status) {
+    if (length < -1) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // Resolve the byte count before building the UTF-16 copy of the input.
+    const int32_t byteCount = (length < 0) ? static_cast<int32_t>(uprv_strlen(id)) : length;
+    const UnicodeString source = UnicodeString::fromUTF8(StringPiece(id, byteCount));
+
+    UnicodeString skeleton;
+    uspoof_getBidiSkeletonUnicodeString(sc, direction, source, skeleton, status);
+
+    // Stays 0 unless the skeleton was computed and converted.
+    int32_t lengthInUTF8 = 0;
+    if (U_SUCCESS(*status)) {
+        u_strToUTF8(dest, destCapacity, &lengthInUTF8, skeleton.getBuffer(), skeleton.length(), status);
+    }
     return lengthInUTF8;
 }
 

BIN
thirdparty/icu4c/icudt73l.dat → thirdparty/icu4c/icudt74l.dat


Một số tệp không được hiển thị vì có quá nhiều tệp thay đổi trong diff này