|
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
|
|
|
}
|
|
|
|
|
|
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
|
|
|
- return "15.0.0";
|
|
|
+ return "15.1.0";
|
|
|
}
|
|
|
|
|
|
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
|
|
@@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
|
|
true; // GB999
|
|
|
}
|
|
|
|
|
|
-static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
|
|
|
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
|
|
|
{
|
|
|
if (state) {
|
|
|
- int lbc_override;
|
|
|
- if (*state == UTF8PROC_BOUNDCLASS_START)
|
|
|
- *state = lbc_override = lbc;
|
|
|
- else
|
|
|
- lbc_override = *state;
|
|
|
- utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
|
|
|
+ int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
|
|
|
+ if (*state == 0) { /* state initialization */
|
|
|
+ state_bc = lbc;
|
|
|
+ state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
|
|
|
+ }
|
|
|
+ else { /* lbc and licb are already encoded in *state */
|
|
|
+ state_bc = *state & 0xff; // 1st byte of state is bound class
|
|
|
+ state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
|
|
|
+ }
|
|
|
+
|
|
|
+ utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
|
|
|
+ !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
|
|
|
+ && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
|
|
|
+
|
|
|
+ // Special support for GB9c. Don't break between two consonants
|
|
|
+ // separated 1+ linker characters and 0+ extend characters in any order.
|
|
|
+ // After a consonant, we enter LINKER state after at least one linker.
|
|
|
+ if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|
|
|
+ || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|
|
|
+ || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
|
|
|
+ state_icb = ticb;
|
|
|
+ else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
|
|
|
+ state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
|
|
|
+ UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
|
|
|
|
|
|
// Special support for GB 12/13 made possible by GB999. After two RI
|
|
|
// class codepoints we want to force a break. Do this by resetting the
|
|
|
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
|
|
|
// after that character according to GB999 (unless of course such a break is
|
|
|
// forbidden by a different rule such as GB9).
|
|
|
- if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
|
|
- *state = UTF8PROC_BOUNDCLASS_OTHER;
|
|
|
+ if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
|
|
+ state_bc = UTF8PROC_BOUNDCLASS_OTHER;
|
|
|
// Special support for GB11 (emoji extend* zwj / emoji)
|
|
|
- else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
|
|
|
+ else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
|
|
|
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
|
|
|
- *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
|
|
|
+ state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
|
|
|
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
|
|
|
- *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
|
|
|
+ state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
|
|
|
else
|
|
|
- *state = tbc;
|
|
|
+ state_bc = tbc;
|
|
|
}
|
|
|
else
|
|
|
- *state = tbc;
|
|
|
+ state_bc = tbc;
|
|
|
|
|
|
+ *state = state_bc + (state_icb << 8);
|
|
|
return break_permitted;
|
|
|
}
|
|
|
else
|
|
@@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
|
|
|
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
|
|
|
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
|
|
|
|
|
|
- return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
|
|
|
- utf8proc_get_property(c2)->boundclass,
|
|
|
+ const utf8proc_property_t *p1 = utf8proc_get_property(c1);
|
|
|
+ const utf8proc_property_t *p2 = utf8proc_get_property(c2);
|
|
|
+ return grapheme_break_extended(p1->boundclass,
|
|
|
+ p2->boundclass,
|
|
|
+ p1->indic_conjunct_break,
|
|
|
+ p2->indic_conjunct_break,
|
|
|
state);
|
|
|
}
|
|
|
|
|
@@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|
|
}
|
|
|
if (options & UTF8PROC_CHARBOUND) {
|
|
|
utf8proc_bool boundary;
|
|
|
- int tbc = property->boundclass;
|
|
|
- boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
|
|
|
+ boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
|
|
|
+ last_boundclass);
|
|
|
if (boundary) {
|
|
|
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
|
|
|
if (bufsize >= 2) dst[1] = uc;
|