Browse Source

Update ICU to 75.1
904da4ae1c86fc5542eac7f1cd18d97b72eb8517

romankoshelev 10 months ago
parent
commit
5b22fadb0f

+ 38 - 43
contrib/libs/icu/LICENSE

@@ -1,49 +1,44 @@
-UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
-
-See Terms of Use <https://www.unicode.org/copyright.html>
-for definitions of Unicode Inc.’s Data Files and Software.
-
-NOTICE TO USER: Carefully read the following legal agreement.
-BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
-DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
-YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
-TERMS AND CONDITIONS OF THIS AGREEMENT.
-IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
-THE DATA FILES OR SOFTWARE.
+UNICODE LICENSE V3
 
 COPYRIGHT AND PERMISSION NOTICE
 
-Copyright © 1991-2023 Unicode, Inc. All rights reserved.
-Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of the Unicode data files and any associated documentation
-(the "Data Files") or Unicode software and any associated documentation
-(the "Software") to deal in the Data Files or Software
-without restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, and/or sell copies of
-the Data Files or Software, and to permit persons to whom the Data Files
-or Software are furnished to do so, provided that either
-(a) this copyright and permission notice appear with all copies
-of the Data Files or Software, or
-(b) this copyright and permission notice appear in associated
-Documentation.
-
-THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
-ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT OF THIRD PARTY RIGHTS.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
-NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
-DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-PERFORMANCE OF THE DATA FILES OR SOFTWARE.
-
-Except as contained in this notice, the name of a copyright holder
-shall not be used in advertising or otherwise to promote the sale,
-use or other dealings in these Data Files or Software without prior
-written authorization of the copyright holder.
+Copyright © 2016-2024 Unicode, Inc.
+
+NOTICE TO USER: Carefully read the following legal agreement. BY
+DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of data files and any associated documentation (the "Data Files") or
+software and any associated documentation (the "Software") to deal in the
+Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Data Files or Software, and to permit persons to whom the
+Data Files or Software are furnished to do so, provided that either (a)
+this copyright and permission notice appear with all copies of the Data
+Files or Software, or (b) this copyright and permission notice appear in
+associated Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS.
+
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
+
+SPDX-License-Identifier: Unicode-3.0
 
 ----------------------------------------------------------------------
 

+ 95 - 32
contrib/libs/icu/common/brkeng.cpp

@@ -21,6 +21,7 @@
 #include "unicode/uscript.h"
 #include "unicode/ucharstrie.h"
 #include "unicode/bytestrie.h"
+#include "unicode/rbbi.h"
 
 #include "brkeng.h"
 #include "cmemory.h"
@@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
 }
 
 UBool
-UnhandledEngine::handles(UChar32 c) const {
+UnhandledEngine::handles(UChar32 c, const char* locale) const {
+    (void)locale; // Unused
     return fHandled && fHandled->contains(c);
 }
 
 int32_t
 UnhandledEngine::findBreaks( UText *text,
-                             int32_t /* startPos */,
+                             int32_t startPos,
                              int32_t endPos,
                              UVector32 &/*foundBreaks*/,
                              UBool /* isPhraseBreaking */,
                              UErrorCode &status) const {
     if (U_FAILURE(status)) return 0;
-    UChar32 c = utext_current32(text); 
+    utext_setNativeIndex(text, startPos);
+    UChar32 c = utext_current32(text);
     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
         c = utext_current32(text);
@@ -111,50 +114,46 @@ UnhandledEngine::handleCharacter(UChar32 c) {
  */
 
 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
-    fEngines = 0;
+    fEngines = nullptr;
 }
 
 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
-    if (fEngines != 0) {
-        delete fEngines;
-    }
+    delete fEngines;
 }
 
-U_NAMESPACE_END
-U_CDECL_BEGIN
-static void U_CALLCONV _deleteEngine(void *obj) {
-    delete (const icu::LanguageBreakEngine *) obj;
+void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
+    static UMutex gBreakEngineMutex;
+    Mutex m(&gBreakEngineMutex);
+    if (fEngines == nullptr) {
+        LocalPointer<UStack>  engines(new UStack(uprv_deleteUObject, nullptr, status), status);
+        if (U_SUCCESS(status)) {
+            fEngines = engines.orphan();
+        }
+    }
 }
-U_CDECL_END
-U_NAMESPACE_BEGIN
 
 const LanguageBreakEngine *
-ICULanguageBreakFactory::getEngineFor(UChar32 c) {
+ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
     const LanguageBreakEngine *lbe = nullptr;
     UErrorCode  status = U_ZERO_ERROR;
+    ensureEngines(status);
+    if (U_FAILURE(status) ) {
+        // Note: no way to return error code to caller.
+        return nullptr;
+    }
 
     static UMutex gBreakEngineMutex;
     Mutex m(&gBreakEngineMutex);
-
-    if (fEngines == nullptr) {
-        LocalPointer<UStack>  engines(new UStack(_deleteEngine, nullptr, status), status);
-        if (U_FAILURE(status) ) {
-            // Note: no way to return error code to caller.
-            return nullptr;
-        }
-        fEngines = engines.orphan();
-    } else {
-        int32_t i = fEngines->size();
-        while (--i >= 0) {
-            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
-            if (lbe != nullptr && lbe->handles(c)) {
-                return lbe;
-            }
+    int32_t i = fEngines->size();
+    while (--i >= 0) {
+        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
+        if (lbe != nullptr && lbe->handles(c, locale)) {
+            return lbe;
         }
     }
-    
+
     // We didn't find an engine. Create one.
-    lbe = loadEngineFor(c);
+    lbe = loadEngineFor(c, locale);
     if (lbe != nullptr) {
         fEngines->push((void *)lbe, status);
     }
@@ -162,7 +161,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
 }
 
 const LanguageBreakEngine *
-ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
+ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
     UErrorCode status = U_ZERO_ERROR;
     UScriptCode code = uscript_getScript(c, &status);
     if (U_SUCCESS(status)) {
@@ -299,6 +298,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
     return nullptr;
 }
 
+
+/**
+ * Adopts an externally supplied break engine and pushes it, wrapped in a
+ * BreakEngineWrapper, onto this factory's engine stack.
+ *
+ * @param external The engine to adopt. It is placed in a LocalPointer by the
+ *                 first statement, so the early-exit paths below do not need
+ *                 to release it by hand.
+ * @param status   In/out error code. On failure no engine is added.
+ */
+void ICULanguageBreakFactory::addExternalEngine(
+        ExternalBreakEngine* external, UErrorCode& status) {
+    LocalPointer<ExternalBreakEngine> engine(external, status);
+    ensureEngines(status);
+    if (U_FAILURE(status)) {
+        // ensureEngines() only assigns fEngines on success, so fEngines can
+        // still be nullptr here and must not be dereferenced.
+        return;
+    }
+    LocalPointer<BreakEngineWrapper> wrapper(
+        new BreakEngineWrapper(engine.orphan(), status), status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // NOTE(review): this function-local mutex is a different object from the
+    // gBreakEngineMutex instances declared inside ensureEngines() and
+    // getEngineFor(), so it does not serialize against those functions.
+    static UMutex gBreakEngineMutex;
+    Mutex m(&gBreakEngineMutex);
+    fEngines->push(wrapper.getAlias(), status);
+    // The stack was handed the pointer above; drop our claim on it.
+    wrapper.orphan();
+}
+
+// Stores `engine` in the `delegate` member that the other methods forward to.
+BreakEngineWrapper::BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status)
+    : delegate(engine, status) {}
+
+// No explicit work: member destructors (including `delegate`) run as usual.
+BreakEngineWrapper::~BreakEngineWrapper() = default;
+
+// Forwards the question to the delegate's isFor(c, locale).
+UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
+    const UBool claimed = delegate->isFor(c, locale);
+    return claimed;
+}
+
+/**
+ * Hands the run of text starting at startPos that the delegate handles to
+ * the delegate's fillBreaks() and appends the reported breaks to foundBreaks.
+ *
+ * @param text        The text to examine. On success its native index is left
+ *                    at the end of the run that was passed to the delegate.
+ * @param startPos    Native index at which the run starts.
+ * @param endPos      Native index at which the scan stops at the latest.
+ * @param foundBreaks Vector the delegate's results are appended to. It is
+ *                    left at its original size when this function fails.
+ * @param status      In/out error code.
+ * @return The value returned by the delegate's fillBreaks(), or 0 on failure.
+ */
+int32_t BreakEngineWrapper::findBreaks(
+    UText *text,
+    int32_t startPos,
+    int32_t endPos,
+    UVector32 &foundBreaks,
+    UBool /* isPhraseBreaking */,
+    UErrorCode &status) const {
+    if (U_FAILURE(status)) return 0;
+    int32_t result = 0;
+
+    // Find the span of characters handled by the delegate.
+    //   The span begins at startPos and extends forward until endPos or
+    //   the first character the delegate does not handle.
+
+    utext_setNativeIndex(text, startPos);
+    const int32_t rangeStart = static_cast<int32_t>(utext_getNativeIndex(text));
+    int32_t current;
+    UChar32 c = utext_current32(text);
+    while((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && delegate->handles(c)) {
+        utext_next32(text);         // TODO:  recast loop for postincrement
+        c = utext_current32(text);
+    }
+    const int32_t rangeEnd = current;
+    const int32_t beforeSize = foundBreaks.size();
+    const int32_t additionalCapacity = rangeEnd - rangeStart + 1;
+    // Make room for (rangeEnd-rangeStart+1) more items.
+    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
+    if (U_FAILURE(status)) return 0;
+    // Expose exactly the reserved slots to the delegate. (The size used to be
+    // beforeSize + beforeSize + additionalCapacity, which counted the existing
+    // entries twice.)
+    foundBreaks.setSize(beforeSize + additionalCapacity);
+    result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
+                                  additionalCapacity, status);
+    if (U_FAILURE(status)) {
+        // Drop the scratch slots again so the caller does not see entries
+        // that the delegate never filled in.
+        foundBreaks.setSize(beforeSize);
+        return 0;
+    }
+    foundBreaks.setSize(beforeSize + result);
+    utext_setNativeIndex(text, current);
+    return result;
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

+ 53 - 6
contrib/libs/icu/common/brkeng.h

@@ -10,6 +10,7 @@
 #ifndef BRKENG_H
 #define BRKENG_H
 
+#include "unicode/umisc.h"
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/utext.h"
@@ -21,6 +22,7 @@ class UnicodeSet;
 class UStack;
 class UVector32;
 class DictionaryMatcher;
+class ExternalBreakEngine;
 
 /*******************************************************************
  * LanguageBreakEngine
@@ -35,7 +37,7 @@ class DictionaryMatcher;
  * <p>LanguageBreakEngines should normally be implemented so as to
  * be shared between threads without locking.</p>
  */
-class LanguageBreakEngine : public UMemory {
+class LanguageBreakEngine : public UObject {
  public:
 
   /**
@@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
   * a particular kind of break.</p>
   *
   * @param c A character which begins a run that the engine might handle
+  * @param locale The locale.
   * @return true if this engine handles the particular character and break
   * type.
   */
-  virtual UBool handles(UChar32 c) const = 0;
+  virtual UBool handles(UChar32 c, const char* locale) const = 0;
 
  /**
   * <p>Find any breaks within a run in the supplied text.</p>
@@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
 
 };
 
+/*******************************************************************
+ * BreakEngineWrapper
+ */
+
+/**
+ * <p>BreakEngineWrapper implements LanguageBreakEngine as a thin wrapper
+ * that delegates the work to an ExternalBreakEngine.
+ * </p>
+ */
+class BreakEngineWrapper : public  LanguageBreakEngine {
+ public:
+
+  // Wraps `engine`, which is kept in the `delegate` member below.
+  BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
+
+  virtual ~BreakEngineWrapper();
+
+  // Overrides LanguageBreakEngine::handles().
+  virtual UBool handles(UChar32 c, const char* locale) const override;
+
+  // Overrides LanguageBreakEngine::findBreaks().
+  virtual int32_t findBreaks( UText *text,
+                              int32_t startPos,
+                              int32_t endPos,
+                              UVector32 &foundBreaks,
+                              UBool isPhraseBreaking,
+                              UErrorCode &status) const override;
+
+ private:
+  // The wrapped external engine.
+  LocalPointer<ExternalBreakEngine> delegate;
+};
+
 /*******************************************************************
  * LanguageBreakFactory
  */
@@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
+  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
 
 };
 
@@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
   * a particular kind of break.</p>
   *
   * @param c A character which begins a run that the engine might handle
+  * @param locale The locale.
   * @return true if this engine handles the particular character and break
   * type.
   */
-  virtual UBool handles(UChar32 c) const override;
+  virtual UBool handles(UChar32 c, const char* locale) const override;
 
  /**
   * <p>Find any breaks within a run in the supplied text.</p>
@@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
+  virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
+
+  /**
+   * Add and adopt the engine.
+   * @param engine The ExternalBreakEngine to be added and adopted. The caller
+   *     passes ownership and must not release the memory afterwards.
+   * @param status the error code.
+   */
+  virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
 
 protected:
  /**
@@ -258,9 +301,10 @@ protected:
   *
   * @param c A character that begins a run for which a LanguageBreakEngine is
   * sought.
+  * @param locale The locale.
   * @return A LanguageBreakEngine with the desired characteristics, or 0.
   */
-  virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
+  virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
 
   /**
    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
@@ -269,6 +313,9 @@ protected:
    * @return A DictionaryMatcher with the desired characteristics, or nullptr.
    */
   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
+
+ private:
+  void ensureEngines(UErrorCode& status);
 };
 
 U_NAMESPACE_END

+ 15 - 6
contrib/libs/icu/common/brkiter.cpp

@@ -27,6 +27,7 @@
 #include "unicode/rbbi.h"
 #include "unicode/brkiter.h"
 #include "unicode/udata.h"
+#include "unicode/uloc.h"
 #include "unicode/ures.h"
 #include "unicode/ustring.h"
 #include "unicode/filteredbrk.h"
@@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != nullptr) {
         U_LOCALE_BASED(locBased, *(BreakIterator*)result);
+
         locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), 
                               actualLocale.data());
+        uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
+        result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
     }
 
     ures_close(b);
@@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
 
 BreakIterator::BreakIterator()
 {
-    *validLocale = *actualLocale = 0;
+    *validLocale = *actualLocale = *requestLocale = 0;
 }
 
 BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
     uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
     uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
 }
 
 BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
     if (this != &other) {
         uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
         uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+        uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
     }
     return *this;
 }
@@ -432,17 +438,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
             uprv_strcpy(lb_lw, "line");
             UErrorCode kvStatus = U_ZERO_ERROR;
-            CharString value;
-            CharStringByteSink valueSink(&value);
-            loc.getKeywordValue("lb", valueSink, kvStatus);
+            auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
             if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
                 uprv_strcat(lb_lw, "_");
                 uprv_strcat(lb_lw, value.data());
             }
             // lw=phrase is only supported in Japanese and Korean
             if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
-                value.clear();
-                loc.getKeywordValue("lw", valueSink, kvStatus);
+                value = loc.getKeywordValue<CharString>("lw", kvStatus);
                 if (U_SUCCESS(kvStatus) && value == "phrase") {
                     uprv_strcat(lb_lw, "_");
                     uprv_strcat(lb_lw, value.data());
@@ -493,12 +496,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
 
 Locale
 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
+    if (type == ULOC_REQUESTED_LOCALE) {
+        return {requestLocale};
+    }
     U_LOCALE_BASED(locBased, *this);
     return locBased.getLocale(type, status);
 }
 
 const char *
 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
+    if (type == ULOC_REQUESTED_LOCALE) {
+        return requestLocale;
+    }
     U_LOCALE_BASED(locBased, *this);
     return locBased.getLocaleID(type, status);
 }

+ 90 - 22
contrib/libs/icu/common/bytesinkutil.h

@@ -7,18 +7,52 @@
 #ifndef BYTESINKUTIL_H
 #define BYTESINKUTIL_H
 
+#include <type_traits>
+
 #include "unicode/utypes.h"
 #include "unicode/bytestream.h"
 #include "unicode/edits.h"
+#include "charstr.h"
 #include "cmemory.h"
 #include "uassert.h"
+#include "ustr_imp.h"
 
 U_NAMESPACE_BEGIN
 
 class ByteSink;
-class CharString;
 class Edits;
 
+// A ByteSink bound to a caller-supplied CharString, which it keeps as a
+// reference in dest_. Only declarations appear here; the member functions
+// are not defined inline.
+class U_COMMON_API CharStringByteSink : public ByteSink {
+public:
+    // `dest` is the CharString this sink is bound to.
+    CharStringByteSink(CharString* dest);
+    ~CharStringByteSink() override;
+
+    // A sink always needs a destination and cannot be copied.
+    CharStringByteSink() = delete;
+    CharStringByteSink(const CharStringByteSink&) = delete;
+    CharStringByteSink& operator=(const CharStringByteSink&) = delete;
+
+    // ByteSink override.
+    void Append(const char* bytes, int32_t n) override;
+
+    // ByteSink override.
+    char* GetAppendBuffer(int32_t min_capacity,
+                          int32_t desired_capacity_hint,
+                          char* scratch,
+                          int32_t scratch_capacity,
+                          int32_t* result_capacity) override;
+
+private:
+    CharString& dest_;
+};
+
+// CharString doesn't provide the public API that StringByteSink requires a
+// string class to have, so this template specialization replaces the default
+// implementation of StringByteSink<CharString> with CharStringByteSink.
+template<>
+class StringByteSink<CharString> : public CharStringByteSink {
+ public:
+  StringByteSink(CharString* dest) : CharStringByteSink(dest) { }
+  // The capacity argument is accepted but not used.
+  StringByteSink(CharString* dest, int32_t /*initialAppendCapacity*/) : CharStringByteSink(dest) { }
+};
+
 class U_COMMON_API ByteSinkUtil {
 public:
     ByteSinkUtil() = delete;  // all static
@@ -57,30 +91,64 @@ public:
                                  ByteSink &sink, uint32_t options, Edits *edits,
                                  UErrorCode &errorCode);
 
-private:
-    static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
-                                        ByteSink &sink, uint32_t options, Edits *edits);
-};
-
-class U_COMMON_API CharStringByteSink : public ByteSink {
-public:
-    CharStringByteSink(CharString* dest);
-    ~CharStringByteSink() override;
-
-    CharStringByteSink() = delete;
-    CharStringByteSink(const CharStringByteSink&) = delete;
-    CharStringByteSink& operator=(const CharStringByteSink&) = delete;
-
-    void Append(const char* bytes, int32_t n) override;
+    /**
+     * Bridges a ByteSink-based C++ API to the classic ICU4C C convention of
+     * writing into a caller-supplied, fixed-size buffer: the lambda writes
+     * to a CheckedArrayByteSink over the buffer, and the result is then
+     * passed through u_terminateChars().
+     *
+     * @param buffer receiving buffer
+     * @param capacity capacity of receiving buffer
+     * @param lambda that gets called with the sink as an argument
+     * @param status set to U_BUFFER_OVERFLOW_ERROR on overflow
+     * @return number of bytes written, or needed (in case of overflow);
+     *         0 if status is a failure on entry or after the lambda has run
+     * @internal
+     */
+    template <typename F,
+              typename = std::enable_if_t<
+                  std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
+    static int32_t viaByteSinkToTerminatedChars(char* buffer, int32_t capacity,
+                                                F&& lambda,
+                                                UErrorCode& status) {
+        if (U_FAILURE(status)) {
+            return 0;
+        }
+
+        CheckedArrayByteSink sink(buffer, capacity);
+        lambda(sink, status);
+        if (U_FAILURE(status)) {
+            return 0;
+        }
+
+        const int32_t appended = sink.NumberOfBytesAppended();
+        if (!sink.Overflowed()) {
+            // Everything fit; let u_terminateChars() finish the job.
+            return u_terminateChars(buffer, capacity, appended, &status);
+        }
+
+        status = U_BUFFER_OVERFLOW_ERROR;
+        return appended;
+    }
 
-    char* GetAppendBuffer(int32_t min_capacity,
-                          int32_t desired_capacity_hint,
-                          char* scratch,
-                          int32_t scratch_capacity,
-                          int32_t* result_capacity) override;
+    /**
+     * Bridges a C/C++ compatibility ByteSink API to a contemporary C++ API:
+     * the lambda writes to a CharStringByteSink and the collected bytes are
+     * returned as a CharString.
+     *
+     * @param lambda that gets called with the sink as an argument
+     * @param status to check and report
+     * @return the resulting string; an empty string if status is already a
+     *         failure on entry. The string is returned as the lambda left
+     *         it, also when the lambda itself reports a failure.
+     * @internal
+     */
+    template <typename F,
+              typename = std::enable_if_t<
+                  std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
+    static CharString viaByteSinkToCharString(F&& lambda, UErrorCode& status) {
+        CharString result;
+        if (U_SUCCESS(status)) {
+            CharStringByteSink sink(&result);
+            lambda(sink, status);
+        }
+        return result;
+    }
 
 private:
-    CharString& dest_;
+    static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
+                                        ByteSink &sink, uint32_t options, Edits *edits);
 };
 
 U_NAMESPACE_END

+ 29 - 21
contrib/libs/icu/common/caniter.cpp

@@ -64,6 +64,7 @@ U_NAMESPACE_BEGIN
 
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
 
+
 /**
  *@param source string to get results for
  */
@@ -73,10 +74,10 @@ CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode
     pieces_lengths(nullptr),
     current(nullptr),
     current_length(0),
-    nfd(*Normalizer2::getNFDInstance(status)),
-    nfcImpl(*Normalizer2Factory::getNFCImpl(status))
+    nfd(Normalizer2::getNFDInstance(status)),
+    nfcImpl(Normalizer2Factory::getNFCImpl(status))
 {
-    if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {
+    if(U_SUCCESS(status) && nfcImpl->ensureCanonIterData(status)) {
       setSource(sourceStr, status);
     }
 }
@@ -172,7 +173,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     int32_t i = 0;
     UnicodeString *list = nullptr;
 
-    nfd.normalize(newSource, source, status);
+    nfd->normalize(newSource, source, status);
     if(U_FAILURE(status)) {
       return;
     }
@@ -194,7 +195,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
         current[0] = 0;
         pieces[0] = new UnicodeString[1];
         pieces_lengths[0] = 1;
-        if (pieces[0] == 0) {
+        if (pieces[0] == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             goto CleanPartialInitialization;
         }
@@ -203,7 +204,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
 
 
     list = new UnicodeString[source.length()];
-    if (list == 0) {
+    if (list == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         goto CleanPartialInitialization;
     }
@@ -219,7 +220,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     // on the NFD form - see above).
     for (; i < source.length(); i += U16_LENGTH(cp)) {
         cp = source.char32At(i);
-        if (nfcImpl.isCanonSegmentStarter(cp)) {
+        if (nfcImpl->isCanonSegmentStarter(cp)) {
             source.extract(start, i-start, list[list_length++]); // add up to i
             start = i;
         }
@@ -252,9 +253,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     return;
 // Common section to cleanup all local variables and reset object variables.
 CleanPartialInitialization:
-    if (list != nullptr) {
-        delete[] list;
-    }
+    delete[] list;
     cleanPieces();
 }
 
@@ -264,10 +263,19 @@ CleanPartialInitialization:
  * @param source the string to find permutations for
  * @return the results in a set.
  */
-void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
+void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {
     if(U_FAILURE(status)) {
         return;
     }
+    // To avoid an infinite loop caused by permute, we limit the depth of the
+    // recursive calls to permute and return U_UNSUPPORTED_ERROR when exceeded.
+    // We know that some unit tests need a depth of at least 4. Set the limit
+    // to 8 to leave headroom for unforeseen use cases.
+    constexpr int32_t kPermuteDepthLimit = 8;
+    if (depth > kPermuteDepthLimit) {
+        status = U_UNSUPPORTED_ERROR;
+        return;
+    }
     //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
     int32_t i = 0;
 
@@ -277,7 +285,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros
     if (source.length() <= 2 && source.countChar32() <= 1) {
         UnicodeString *toPut = new UnicodeString(source);
         /* test for nullptr */
-        if (toPut == 0) {
+        if (toPut == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return;
         }
@@ -311,7 +319,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros
 
         // see what the permutations of the characters before and after this one are
         //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
-        permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);
+        permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);
         /* Test for buffer overflows */
         if(U_FAILURE(status)) {
             return;
@@ -346,7 +354,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
     Hashtable permutations(status);
     Hashtable basic(status);
     if (U_FAILURE(status)) {
-        return 0;
+        return nullptr;
     }
     result.setValueDeleter(uprv_deleteUObject);
     permutations.setValueDeleter(uprv_deleteUObject);
@@ -381,7 +389,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
             //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
             UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
             UnicodeString attempt;
-            nfd.normalize(possible, attempt, status);
+            nfd->normalize(possible, attempt, status);
 
             // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
             if (attempt==segment) {
@@ -399,7 +407,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
 
     /* Test for buffer overflows */
     if(U_FAILURE(status)) {
-        return 0;
+        return nullptr;
     }
     // convert into a String[] to clean up storage
     //String[] finalResult = new String[result.size()];
@@ -407,7 +415,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
     int32_t resultCount;
     if((resultCount = result.count()) != 0) {
         finalResult = new UnicodeString[resultCount];
-        if (finalResult == 0) {
+        if (finalResult == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return nullptr;
         }
@@ -448,7 +456,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
     for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
         // see if any character is at the start of some decomposition
         U16_GET(segment, 0, i, segLen, cp);
-        if (!nfcImpl.getCanonStartSet(cp, starts)) {
+        if (!nfcImpl->getCanonStartSet(cp, starts)) {
             continue;
         }
         // if so, see which decompositions match
@@ -471,7 +479,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
                 UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                 UnicodeString *toAdd = new UnicodeString(prefix);
                 /* test for nullptr */
-                if (toAdd == 0) {
+                if (toAdd == nullptr) {
                     status = U_MEMORY_ALLOCATION_ERROR;
                     return nullptr;
                 }
@@ -509,7 +517,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
     UnicodeString temp(comp);
     int32_t inputLen=temp.length();
     UnicodeString decompString;
-    nfd.normalize(temp, decompString, status);
+    nfd->normalize(temp, decompString, status);
     if (U_FAILURE(status)) {
         return nullptr;
     }
@@ -573,7 +581,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
     // brute force approach
     // check to make sure result is canonically equivalent
     UnicodeString trial;
-    nfd.normalize(temp, trial, status);
+    nfd->normalize(temp, trial, status);
     if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
         return nullptr;
     }

+ 9 - 1
contrib/libs/icu/common/characterproperties.cpp

@@ -169,7 +169,7 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
     case UPROPS_SRC_INPC:
     case UPROPS_SRC_INSC:
     case UPROPS_SRC_VO:
-        uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
+        uprops_addPropertyStarts(src, &sa, &errorCode);
         break;
     case UPROPS_SRC_EMOJI: {
         const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
@@ -178,6 +178,14 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
         }
         break;
     }
+    case UPROPS_SRC_IDSU:
+        // New in Unicode 15.1 for just two characters.
+        sa.add(sa.set, 0x2FFE);
+        sa.add(sa.set, 0x2FFF + 1);
+        break;
+    case UPROPS_SRC_ID_COMPAT_MATH:
+        uprops_addPropertyStarts(src, &sa, &errorCode);
+        break;
     default:
         errorCode = U_INTERNAL_PROGRAM_ERROR;
         break;

+ 7 - 0
contrib/libs/icu/common/charstr.h

@@ -104,6 +104,13 @@ public:
      */
     int32_t extract(char *dest, int32_t capacity, UErrorCode &errorCode) const;
 
+    /** Equal when the lengths match and, unless empty, the bytes match too. */
+    bool operator==(const CharString& other) const {
+        if (len != other.length()) {
+            return false;
+        }
+        return len == 0 || uprv_memcmp(data(), other.data(), len) == 0;
+    }
+    /** Negation of operator==(const CharString&). */
+    bool operator!=(const CharString& other) const {
+        return !(*this == other);
+    }
+
     bool operator==(StringPiece other) const {
         return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0);
     }

+ 2 - 2
contrib/libs/icu/common/dictbe.cpp

@@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
 }
 
 UBool
-DictionaryBreakEngine::handles(UChar32 c) const {
+DictionaryBreakEngine::handles(UChar32 c, const char*) const {
     return fSet.contains(c);
 }
 
@@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
                                  UBool isPhraseBreaking,
                                  UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
-    (void)startPos;            // TODO: remove this param?
     int32_t result = 0;
 
     // Find the span of characters included in the set.
     //   The span to break begins at the current position in the text, and
     //   extends towards the start or end of the text, depending on 'reverse'.
 
+    utext_setNativeIndex(text, startPos);
     int32_t start = (int32_t)utext_getNativeIndex(text);
     int32_t current;
     int32_t rangeStart;

+ 2 - 1
contrib/libs/icu/common/dictbe.h

@@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
    * a particular kind of break.</p>
    *
    * @param c A character which begins a run that the engine might handle
+   * @param locale The locale.
    * @return true if this engine handles the particular character and break
    * type.
    */
-  virtual UBool handles(UChar32 c) const override;
+  virtual UBool handles(UChar32 c, const char* locale) const override;
 
   /**
    * <p>Find any breaks within a run in the supplied text.</p>

Some files were not shown because too many files changed in this diff