1 year ago · 4823c43486
--- a/contrib/libs/utf8proc/NEWS.md
+++ b/contrib/libs/utf8proc/NEWS.md
@@ -1,5 +1,11 @@
 
				 # utf8proc release history #
			
 
				 
			
 
				+## Version 2.9.0 ##
			
 
				+
			
 
				+2023-10-20
			
 
				+
			
 
				+ - Unicode 15.1 support ([#253]).
			
 
				+
			
 
				 ## Version 2.8.0 ##
			
 
				 
			
 
				 2022-10-30
			
@@ -436,3 +442,4 @@ Release of version 1.0.1
 
				 [#224]: https://github.com/JuliaStrings/utf8proc/issues/224
			
 
				 [#233]: https://github.com/JuliaStrings/utf8proc/issues/233
			
 
				 [#247]: https://github.com/JuliaStrings/utf8proc/issues/247
			
 
				+[#253]: https://github.com/JuliaStrings/utf8proc/issues/253
			
--- a/contrib/libs/utf8proc/README.md
+++ b/contrib/libs/utf8proc/README.md
@@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
 
				 and is named `libutf8proc.a` (for the static library) and
			
 
				 `libutf8proc.so` (for the dynamic library).
			
 
				 
			
 
				-The Unicode version supported is 15.0.0.
			
 
				+The Unicode version supported is 15.1.0.
			
 
				 
			
 
				 For Unicode normalizations, the following options are used:
			
 
				 
			
--- a/contrib/libs/utf8proc/utf8proc.c
+++ b/contrib/libs/utf8proc/utf8proc.c
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
 
				 }
			
 
				 
			
 
				 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
			
 
				-  return "15.0.0";
			
 
				+  return "15.1.0";
			
 
				 }
			
 
				 
			
 
				 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
			
@@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
 
				     true; // GB999
			
 
				 }
			
 
				 
			
 
				-static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
			
 
				+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
			
 
				 {
			
 
				   if (state) {
			
 
				-    int lbc_override;
			
 
				-    if (*state == UTF8PROC_BOUNDCLASS_START)
			
 
				-      *state = lbc_override = lbc;
			
 
				-    else
			
 
				-      lbc_override = *state;
			
 
				-    utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
			
 
				+    int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
			
 
				+    if (*state == 0) { /* state initialization */
			
 
				+      state_bc = lbc;
			
 
				+      state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
			
 
				+    }
			
 
				+    else { /* lbc and licb are already encoded in *state */
			
 
				+      state_bc = *state & 0xff;  // 1st byte of state is bound class
			
 
				+      state_icb = *state >> 8;   // 2nd byte of state is indic conjunct break
			
 
				+    }
			
 
				+
			
 
				+    utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
			
 
				+       !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
			
 
				+        && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
			
 
				+
			
 
				+    // Special support for GB9c.  Don't break between two consonants
			
 
				+    // separated by 1+ linker characters and 0+ extend characters in any order.
			
 
				+    // After a consonant, we enter LINKER state after at least one linker.
			
 
				+    if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
			
 
				+        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
			
 
				+        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
			
 
				+      state_icb = ticb;
			
 
				+    else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
			
 
				+      state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
			
 
				+                  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
			
 
				 
			
 
				     // Special support for GB 12/13 made possible by GB999. After two RI
			
 
				     // class codepoints we want to force a break. Do this by resetting the
			
 
				     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
			
 
				     // after that character according to GB999 (unless of course such a break is
			
 
				     // forbidden by a different rule such as GB9).
			
 
				-    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
			
 
				-      *state = UTF8PROC_BOUNDCLASS_OTHER;
			
 
				+    if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
			
 
				+      state_bc = UTF8PROC_BOUNDCLASS_OTHER;
			
 
				     // Special support for GB11 (emoji extend* zwj / emoji)
			
 
				-    else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
			
 
				+    else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
			
 
				       if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
			
 
				-        *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
			
 
				+        state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
			
 
				       else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
			
 
				-        *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
			
 
				+        state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
			
 
				       else
			
 
				-        *state = tbc;
			
 
				+        state_bc = tbc;
			
 
				     }
			
 
				     else
			
 
				-      *state = tbc;
			
 
				+      state_bc = tbc;
			
 
				 
			
 
				+    *state = state_bc + (state_icb << 8);
			
 
				     return break_permitted;
			
 
				   }
			
 
				   else
			
@@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
 
				 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
			
 
				     utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
			
 
				 
			
 
				-  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
			
 
				-                                 utf8proc_get_property(c2)->boundclass,
			
 
				+  const utf8proc_property_t *p1 = utf8proc_get_property(c1);
			
 
				+  const utf8proc_property_t *p2 = utf8proc_get_property(c2);
			
 
				+  return grapheme_break_extended(p1->boundclass,
			
 
				+                                 p2->boundclass,
			
 
				+                                 p1->indic_conjunct_break,
			
 
				+                                 p2->indic_conjunct_break,
			
 
				                                  state);
			
 
				 }
			
 
				 
			
@@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
 
				   }
			
 
				   if (options & UTF8PROC_CHARBOUND) {
			
 
				     utf8proc_bool boundary;
			
 
				-    int tbc = property->boundclass;
			
 
				-    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
			
 
				+    boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
			
 
				+                                       last_boundclass);
			
 
				     if (boundary) {
			
 
				       if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
			
 
				       if (bufsize >= 2) dst[1] = uc;
			
--- a/contrib/libs/utf8proc/utf8proc.h
+++ b/contrib/libs/utf8proc/utf8proc.h
@@ -71,7 +71,7 @@
 
				 /** The MAJOR version number (increased when backwards API compatibility is broken). */
			
 
				 #define UTF8PROC_VERSION_MAJOR 2
			
 
				 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
			
 
				-#define UTF8PROC_VERSION_MINOR 8
			
 
				+#define UTF8PROC_VERSION_MINOR 9
			
 
				 /** The PATCH version (increased for fixes that do not change the API). */
			
 
				 #define UTF8PROC_VERSION_PATCH 0
			
 
				 /** @} */
			
@@ -259,7 +259,8 @@ typedef struct utf8proc_property_struct {
 
				    * Boundclass.
			
 
				    * @see utf8proc_boundclass_t.
			
 
				    */
			
 
				-  unsigned boundclass:8;
			
 
				+  unsigned boundclass:6;
			
 
				+  unsigned indic_conjunct_break:2;
			
 
				 } utf8proc_property_t;
			
 
				 
			
 
				 /** Unicode categories. */
			
@@ -374,6 +375,14 @@ typedef enum {
 
				   UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
			
 
				 } utf8proc_boundclass_t;
			
 
				 
			
 
				+/** Indic_Conjunct_Break property. (TR44) */
			
 
				+typedef enum {
			
 
				+  UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
			
 
				+  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
			
 
				+  UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
			
 
				+  UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
			
 
				+} utf8proc_indic_conjunct_break_t;
			
 
				+
			
 
				 /**
			
 
				  * Function pointer type passed to @ref utf8proc_map_custom and
			
 
				  * @ref utf8proc_decompose_custom, which is used to specify a user-defined
			
@@ -467,8 +476,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
 
				  * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
			
 
				  * @param last_boundclass
			
 
				  * Pointer to an integer variable containing
			
 
				- * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
			
 
				- * option is used.  Otherwise, this parameter is ignored.
			
 
				+ * the previous codepoint's (boundclass + (indic_conjunct_break << 8)) if the @ref UTF8PROC_CHARBOUND
			
 
				+ * option is used.  If the string is being processed in order, this can be initialized to 0 for
			
 
				+ * the beginning of the string, and is thereafter updated automatically.  Otherwise, this parameter is ignored.
			
 
				  *
			
 
				  * @return
			
 
				  * In case of success, the number of codepoints written is returned; in case
			
--- a/contrib/libs/utf8proc/utf8proc_data.c
+++ b/contrib/libs/utf8proc/utf8proc_data.c
--- a/contrib/libs/utf8proc/ya.make
+++ b/contrib/libs/utf8proc/ya.make
@@ -1,4 +1,4 @@
 
				-# Generated by devtools/yamaker from nixpkgs 22.05.
			
 
				+# Generated by devtools/yamaker from nixpkgs 22.11.
			
 
				 
			
 
				 LIBRARY()
			
 
				 
			
@@ -9,9 +9,9 @@ LICENSE(
 
				 
			
 
				 LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
			
 
				 
			
 
				-VERSION(2.8.0)
			
 
				+VERSION(2.9.0)
			
 
				 
			
 
				-ORIGINAL_SOURCE(https://github.com/JuliaStrings/utf8proc/archive/v2.8.0.tar.gz)
			
 
				+ORIGINAL_SOURCE(https://github.com/JuliaStrings/utf8proc/archive/v2.9.0.tar.gz)
			
 
				 
			
 
				 NO_COMPILER_WARNINGS()