ucase.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2004-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: ucase.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2004aug30
  16. * created by: Markus W. Scherer
  17. *
  18. * Low-level Unicode character/string case mapping code.
  19. */
  20. #ifndef __UCASE_H__
  21. #define __UCASE_H__
  22. #include "unicode/utypes.h"
  23. #include "unicode/uset.h"
  24. #include "putilimp.h"
  25. #include "uset_imp.h"
  26. #include "udataswp.h"
  27. #include "utrie2.h"
  28. #ifdef __cplusplus
      /* Forward declaration only: FullCaseFoldingIterator::next() in this header takes a UnicodeString by reference. */
  29. U_NAMESPACE_BEGIN
  30. class UnicodeString;
  31. U_NAMESPACE_END
  32. #endif
  33. /* library API -------------------------------------------------------------- */
      /**
       * Takes a set adder and a UErrorCode pointer; returns nothing.
       * NOTE(review): presumably adds the code points at which case properties
       * change to the set behind sa -- confirm against the implementation.
       */
  34. U_CFUNC void U_EXPORT2
  35. ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
  36. /**
  37. * Requires non-NULL locale ID but otherwise does the equivalent of
  38. * checking for language codes as if uloc_getLanguage() were called:
  39. * Accepts both 2- and 3-letter codes and accepts case variants.
      *
      * @param locale Locale ID string; must not be NULL.
      * @return One of the UCASE_LOC_... casing locale types declared right after this function.
  40. */
  41. U_CFUNC int32_t
  42. ucase_getCaseLocale(const char *locale);
  43. /*
      * Casing locale types for ucase_getCaseLocale.
      * No enumerator has an explicit value, so they number 0..6 in declaration order.
      */
  44. enum {
  45. UCASE_LOC_UNKNOWN,
  46. UCASE_LOC_ROOT,
  47. UCASE_LOC_TURKISH,
  48. UCASE_LOC_LITHUANIAN,
  49. UCASE_LOC_GREEK,
  50. UCASE_LOC_DUTCH,
  51. UCASE_LOC_ARMENIAN
  52. };
  53. /**
  54. * Bit mask for getting just the options from a string compare options word
  55. * that are relevant for case-insensitive string comparison.
  56. * See stringoptions.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
      * The value 0xffff selects the low 16 bits of the options word.
  57. * @internal
  58. */
  59. #define _STRCASECMP_OPTIONS_MASK 0xffff
  60. /**
  61. * Bit mask for getting just the options from a string compare options word
  62. * that are relevant for case folding (of a single string or code point).
  63. *
  64. * Currently only bit 0 for U_FOLD_CASE_EXCLUDE_SPECIAL_I.
  65. * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
  66. * It is conceivable that at some point we might want the option to use only simple case foldings
  67. * when operating on strings.
      * The mask value 7 already covers bits 0..2, i.e. two bits beyond the one currently used.
  68. *
  69. * See stringoptions.h.
  70. * @internal
  71. */
  72. #define _FOLD_CASE_OPTIONS_MASK 7
  73. /*
      * single-code point functions
      *
      * Each function takes one code point and returns one code point (UChar32).
      * ucase_fold() additionally takes an options word.
      * NOTE(review): presumably the options are the case folding bits selected by
      * _FOLD_CASE_OPTIONS_MASK in this header -- confirm against the implementation.
      */
  74. U_CAPI UChar32 U_EXPORT2
  75. ucase_tolower(UChar32 c);
  76. U_CAPI UChar32 U_EXPORT2
  77. ucase_toupper(UChar32 c);
  78. U_CAPI UChar32 U_EXPORT2
  79. ucase_totitle(UChar32 c);
  80. U_CAPI UChar32 U_EXPORT2
  81. ucase_fold(UChar32 c, uint32_t options);
  82. /**
  83. * Adds all simple case mappings and the full case folding for c to sa,
  84. * and also adds special case closure mappings.
  85. * c itself is not added.
  86. * For example, the mappings
  87. * - for s include long s
  88. * - for sharp s include ss
  89. * - for k include the Kelvin sign
      *
      * @param c Code point whose case closure is added.
      * @param sa Set adder that receives the mappings.
  90. */
  91. U_CFUNC void U_EXPORT2
  92. ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
  93. /** Case closure with only scf=Simple_Case_Folding. Same parameters as ucase_addCaseClosure(). */
  94. U_CFUNC void U_EXPORT2
  95. ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa);
  96. /**
  97. * Maps the string to single code points and adds the associated case closure
  98. * mappings.
  99. * The string is mapped to code points if it is their full case folding string.
  100. * In other words, this performs a reverse full case folding and then
  101. * adds the case closure items of the resulting code points.
  102. * If the string is found and its closure applied, then
  103. * the string itself is added as well as part of its code points' closure.
  104. * The length must be >=0.
  105. *
       * @param s The string to look up.
       * @param length Length of s; must be >=0.
       * @param sa Set adder that receives the closure items.
  106. * @return true if the string was found
  107. */
  108. U_CFUNC UBool U_EXPORT2
  109. ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa);
  110. #ifdef __cplusplus
  111. U_NAMESPACE_BEGIN
  112. /**
  113. * Iterator over characters with more than one code point in the full default Case_Folding.
       * C++ only: the class is declared inside the __cplusplus section of this header.
       * Copy construction and assignment are deleted.
  114. */
  115. class U_COMMON_API FullCaseFoldingIterator {
  116. public:
  117. /** Constructor. */
  118. FullCaseFoldingIterator();
  119. /**
  120. * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
  121. * Returns a negative cp value at the end of the iteration.
       *
       * @param full Output parameter for the "full" half of the pair.
       * @return The cp half of the pair, or a negative value when the iteration is done.
  122. */
  123. UChar32 next(UnicodeString &full);
  124. private:
  125. FullCaseFoldingIterator(const FullCaseFoldingIterator &) = delete; // no copy
  126. FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &) = delete; // no assignment
       /*
        * Iteration state.
        * NOTE(review): the names suggest that unfold points at the reverse case folding
        * ("unfold") data and that unfoldRows, unfoldRowWidth and unfoldStringWidth cache its
        * UCASE_UNFOLD_ROWS, UCASE_UNFOLD_ROW_WIDTH and UCASE_UNFOLD_STRING_WIDTH values,
        * with currentRow/rowCpIndex as the cursor -- confirm against the implementation.
        */
  127. const char16_t *unfold;
  128. int32_t unfoldRows;
  129. int32_t unfoldRowWidth;
  130. int32_t unfoldStringWidth;
  131. int32_t currentRow;
  132. int32_t rowCpIndex;
  133. };
  134. /**
  135. * Fast case mapping data for ASCII/Latin.
  136. * Linear arrays of delta bytes: 0=no mapping; EXC=exception.
  137. * Deltas must not cross the ASCII boundary, or else they cannot be easily used
  138. * in simple UTF-8 code.
       * The tables are only declared here (extern); their definitions live elsewhere.
  139. */
  140. namespace LatinCase {
  141. /** Case mapping/folding data for code points up to U+017F. LIMIT is one past that, so each table has 0x180 entries. */
  142. constexpr char16_t LIMIT = 0x180;
  143. /** U+017F case-folds and uppercases crossing the ASCII boundary. */
  144. constexpr char16_t LONG_S = 0x17f;
  145. /** Exception: Complex mapping, or too-large delta. -0x80 is -128, the smallest int8_t value. */
  146. constexpr int8_t EXC = -0x80;
  147. /** Deltas for lowercasing for most locales, and default case folding. */
  148. extern const int8_t TO_LOWER_NORMAL[LIMIT];
  149. /** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */
  150. extern const int8_t TO_LOWER_TR_LT[LIMIT];
  151. /** Deltas for uppercasing for most locales. */
  152. extern const int8_t TO_UPPER_NORMAL[LIMIT];
  153. /** Deltas for uppercasing for tr/az. */
  154. extern const int8_t TO_UPPER_TR[LIMIT];
  155. } // namespace LatinCase
  156. U_NAMESPACE_END
  157. #endif
  158. /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE (the 2-bit type constants defined later in this header) */
  159. U_CAPI int32_t U_EXPORT2
  160. ucase_getType(UChar32 c);
  161. /** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */
  162. U_CAPI int32_t U_EXPORT2
  163. ucase_getTypeOrIgnorable(UChar32 c);
       /**
        * Boolean query on a single code point.
        * NOTE(review): presumably true when c's dot bits equal UCASE_SOFT_DOTTED -- confirm.
        */
  164. U_CAPI UBool U_EXPORT2
  165. ucase_isSoftDotted(UChar32 c);
       /**
        * Boolean query on a single code point.
        * NOTE(review): presumably reflects the UCASE_SENSITIVE / UCASE_EXC_SENSITIVE bit -- confirm.
        */
  166. U_CAPI UBool U_EXPORT2
  167. ucase_isCaseSensitive(UChar32 c);
  168. /* string case mapping functions */
  169. U_CDECL_BEGIN
  170. /**
  171. * Iterator function for string case mappings, which need to look at the
  172. * context (surrounding text) of a given character for conditional mappings.
  173. *
  174. * The iterator only needs to go backward or forward away from the
  175. * character in question. It does not use any indexes on this interface.
  176. * It does not support random access or an arbitrary change of
  177. * iteration direction.
  178. *
  179. * The code point being case-mapped itself is never returned by
  180. * this iterator.
  181. *
  182. * @param context A pointer to the iterator's working data.
  183. * @param dir If <0 then start iterating backward from the character;
  184. * if >0 then start iterating forward from the character;
  185. * if 0 then continue iterating in the current direction.
  186. * @return Next code point, or <0 when the iteration is done.
  187. */
  188. typedef UChar32 U_CALLCONV
  189. UCaseContextIterator(void *context, int8_t dir);
  190. /**
  191. * Sample struct which may be used by some implementations of
  192. * UCaseContextIterator.
       *
       * NOTE(review): this header does not define what the fields mean; they are
       * up to whichever iterator implementation uses the struct. dir presumably
       * tracks the iterator's current direction -- confirm against the callers.
  193. */
  194. struct UCaseContext {
  195. void *p;
  196. int32_t start, index, limit;
  197. int32_t cpStart, cpLimit;
  198. int8_t dir;
  199. int8_t b1, b2, b3;
  200. };
  201. typedef struct UCaseContext UCaseContext;
  202. U_CDECL_END
       /*
        * Aggregate initializer for UCaseContext: one value per member in declaration
        * order (p, start, index, limit, cpStart, cpLimit, dir, b1, b2, b3), all NULL/0.
        */
  203. #define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  204. enum {
  205. /**
  206. * For string case mappings, a single character (a code point) is mapped
  207. * either to itself (in which case in-place mapping functions do nothing),
  208. * or to another single code point, or to a string.
  209. * Aside from the string contents, these are indicated with a single int32_t
  210. * value as follows:
  211. *
  212. * Mapping to self: Negative values (~self instead of -self to support U+0000)
  213. *
  214. * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
  215. *
  216. * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
  217. * returned. Note that the string result may indeed have zero length.
       *
       * The limit is 0x1f (31), so results 0..31 are string lengths and
       * results of 32 and above are code points.
  218. */
  219. UCASE_MAX_STRING_LENGTH=0x1f
  220. };
  221. /**
  222. * Get the full lowercase mapping for c.
  223. *
  225. * @param c Character to be mapped.
  226. * @param iter Character iterator, used for context-sensitive mappings.
  227. * See UCaseContextIterator for details.
  228. * If iter==NULL then a context-independent result is returned.
  229. * @param context Pointer to be passed into iter.
  230. * @param pString If the mapping result is a string, then the pointer is
  231. * written to *pString.
  232. * @param caseLocale Case locale value from ucase_getCaseLocale().
  233. * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
  234. *
  235. * @see UCaseContextIterator
  236. * @see UCASE_MAX_STRING_LENGTH
  237. * @internal
  238. */
  239. U_CAPI int32_t U_EXPORT2
  240. ucase_toFullLower(UChar32 c,
  241. UCaseContextIterator *iter, void *context,
  242. const UChar **pString,
  243. int32_t caseLocale);
       /**
        * Same parameter list and return type as ucase_toFullLower().
        * NOTE(review): presumably the full uppercase mapping with the same
        * result encoding (see UCASE_MAX_STRING_LENGTH) -- confirm.
        * @internal
        */
  244. U_CAPI int32_t U_EXPORT2
  245. ucase_toFullUpper(UChar32 c,
  246. UCaseContextIterator *iter, void *context,
  247. const UChar **pString,
  248. int32_t caseLocale);
       /**
        * Same parameter list and return type as ucase_toFullLower().
        * NOTE(review): presumably the full titlecase mapping with the same
        * result encoding (see UCASE_MAX_STRING_LENGTH) -- confirm.
        * @internal
        */
  249. U_CAPI int32_t U_EXPORT2
  250. ucase_toFullTitle(UChar32 c,
  251. UCaseContextIterator *iter, void *context,
  252. const UChar **pString,
  253. int32_t caseLocale);
       /**
        * Unlike the three functions above, this one takes no context iterator and
        * no case locale; it takes an options word instead.
        * NOTE(review): presumably the full case folding of c, with options limited
        * to the bits in _FOLD_CASE_OPTIONS_MASK and the same result encoding as
        * ucase_toFullLower() -- confirm.
        * @internal
        */
  254. U_CAPI int32_t U_EXPORT2
  255. ucase_toFullFolding(UChar32 c,
  256. const UChar **pString,
  257. uint32_t options);
       /**
        * Queries a property of code point c selected by which.
        * Note that the declared return type is int32_t, not UBool.
        * NOTE(review): presumably nonzero means "c has the property" -- confirm.
        */
  258. U_CFUNC int32_t U_EXPORT2
  259. ucase_hasBinaryProperty(UChar32 c, UProperty which);
  260. U_CDECL_BEGIN
  261. /**
       * Function type with the same parameter list and return type as
       * ucase_toFullLower(), ucase_toFullUpper() and ucase_toFullTitle().
  262. * @internal
  263. */
  264. typedef int32_t U_CALLCONV
  265. UCaseMapFull(UChar32 c,
  266. UCaseContextIterator *iter, void *context,
  267. const UChar **pString,
  268. int32_t caseLocale);
  269. U_CDECL_END
  270. /* for icuexportdata -------------------------------------------------------- */
       /*
        * Pointers into the case properties data plus the lookup trie.
        * indexes[] is addressed with the UCASE_IX_... constants defined in this header.
        * NOTE(review): exceptions and unfold presumably point at the exceptions array
        * and the reverse case folding ("unfold") data -- confirm against the loader.
        */
  271. struct UCaseProps {
  272. void *mem; // TODO: was unused, and type UDataMemory -- remove
  273. const int32_t *indexes;
  274. const uint16_t *exceptions;
  275. const uint16_t *unfold;
  276. UTrie2 trie;
  277. uint8_t formatVersion[4];
  278. };
       /*
        * Returns a pointer to a const UCaseProps and takes two int32_t output pointers.
        * NOTE(review): presumably they receive the lengths of the exceptions and unfold
        * arrays; whether NULL is accepted is not visible here -- confirm.
        */
  279. U_CAPI const struct UCaseProps * U_EXPORT2
  280. ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength);
  281. /* file definitions --------------------------------------------------------- */
  282. #define UCASE_DATA_NAME "ucase"
  283. #define UCASE_DATA_TYPE "icu"
  284. /* format "cAsE": the four bytes below are the ASCII codes of 'c', 'A', 'S', 'E' */
  285. #define UCASE_FMT_0 0x63
  286. #define UCASE_FMT_1 0x41
  287. #define UCASE_FMT_2 0x53
  288. #define UCASE_FMT_3 0x45
  289. /*
       * indexes into indexes[]
       * The first five enumerators take the implicit values 0..4; indexes 5..14 have
       * no named constant here; UCASE_IX_TOP (16) is one past the last named index.
       */
  290. enum {
  291. UCASE_IX_INDEX_TOP,
  292. UCASE_IX_LENGTH,
  293. UCASE_IX_TRIE_SIZE,
  294. UCASE_IX_EXC_LENGTH,
  295. UCASE_IX_UNFOLD_LENGTH,
  296. UCASE_IX_MAX_FULL_LENGTH=15,
  297. UCASE_IX_TOP=16
  298. };
  299. /* definitions for 16-bit case properties word ------------------------------ */
       /**
        * Returns a pointer to a const UTrie2; takes no arguments.
        * Declared with (void) so that C translation units get a real prototype
        * instead of an unspecified parameter list; in C++ this is the same as ().
        * NOTE(review): presumably the trie holding the 16-bit case properties words
        * described in this section -- confirm against the implementation.
        */
  300. U_CFUNC const UTrie2 * U_EXPORT2
  301. ucase_getTrie(void);
  302. /* 2-bit constants for types of cased characters (bits 1..0 of the properties word) */
  303. #define UCASE_TYPE_MASK 3
  304. enum {
  305. UCASE_NONE,
  306. UCASE_LOWER,
  307. UCASE_UPPER,
  308. UCASE_TITLE
  309. };
       /* Extracts the 2-bit type (UCASE_NONE..UCASE_TITLE, i.e. 0..3). */
  310. #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
       /* Extracts the type together with the UCASE_IGNORABLE bit: 7 == UCASE_TYPE_MASK|UCASE_IGNORABLE. */
  311. #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
       /* Nonzero for UCASE_UPPER (2) and UCASE_TITLE (3): both have bit 1 set. The result is 0 or 2, not 0 or 1. */
  312. #define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)
       /* Single-bit flags in the properties word: bit 2, bit 3 and bit 4. */
  313. #define UCASE_IGNORABLE 4
  314. #define UCASE_EXCEPTION 8
  315. #define UCASE_SENSITIVE 0x10
       /* Nonzero (8) if the UCASE_EXCEPTION bit is set in props. */
  316. #define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
       /* Bits 6..5 hold one of the four dot/accent values below. */
  317. #define UCASE_DOT_MASK 0x60
  318. enum {
  319. UCASE_NO_DOT=0, /* normal characters with cc=0 */
  320. UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
  321. UCASE_ABOVE=0x40, /* "above" accents with cc=230 */
  322. UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
  323. };
  324. /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
  325. #define UCASE_DELTA_SHIFT 7
  326. #define UCASE_DELTA_MASK 0xff80
       /* Range of a 9-bit two's-complement value: 0xff (255) down to -0x100 (-256). */
  327. #define UCASE_MAX_DELTA 0xff
  328. #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1)
       /*
        * Extracts the signed delta from a 16-bit properties word.
        * The first form casts to int16_t and relies on an arithmetic (sign-propagating)
        * right shift. The fallback sign-extends by hand: when bit 15 is set it ORs
        * 0xfe00 into the shifted value before converting to int16_t.
        * props is evaluated more than once in the fallback form, so avoid arguments
        * with side effects.
        */
  329. #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
  330. # define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
  331. #else
  332. # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
  333. #endif
  334. /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
  335. #define UCASE_EXC_SHIFT 4
  336. #define UCASE_EXC_MASK 0xfff0
       /* (0xfff0>>4)+1 == 0x1000: the number of distinct 12-bit index values. */
  337. #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
  338. /* definitions for 16-bit main exceptions word ------------------------------ */
  339. /* first 8 bits indicate values in optional slots; the enumerators below are the slot numbers 0..7 */
  340. enum {
  341. UCASE_EXC_LOWER,
  342. UCASE_EXC_FOLD,
  343. UCASE_EXC_UPPER,
  344. UCASE_EXC_TITLE,
  345. UCASE_EXC_DELTA,
  346. UCASE_EXC_5, /* reserved */
  347. UCASE_EXC_CLOSURE,
  348. UCASE_EXC_FULL_MAPPINGS,
  349. UCASE_EXC_ALL_SLOTS /* one past the last slot */
  350. };
  351. /* each slot is 2 uint16_t instead of 1 (flag is bit 8 of the exceptions word) */
  352. #define UCASE_EXC_DOUBLE_SLOTS 0x100
       /* Single-bit flags in the exceptions word: bits 9, 10 and 11. */
  353. enum {
  354. UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200,
  355. UCASE_EXC_DELTA_IS_NEGATIVE=0x400,
  356. UCASE_EXC_SENSITIVE=0x800
  357. };
  358. /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT, i.e. 0x60<<7 == 0x3000 */
  359. #define UCASE_EXC_DOT_SHIFT 7
  360. /* normally stored in the main word, but pushed out for larger exception indexes */
  361. #define UCASE_EXC_DOT_MASK 0x3000
       /* Each value is the corresponding main-word UCASE_... dot value shifted left by UCASE_EXC_DOT_SHIFT. */
  362. enum {
  363. UCASE_EXC_NO_DOT=0,
  364. UCASE_EXC_SOFT_DOTTED=0x1000,
  365. UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */
  366. UCASE_EXC_OTHER_ACCENT=0x3000 /* other accent character (0<cc!=230) */
  367. };
  368. /* complex/conditional mappings (bits 14 and 15 of the exceptions word) */
  369. #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000
  370. #define UCASE_EXC_CONDITIONAL_FOLD 0x8000
  371. /* definitions for lengths word for full case mappings: four 4-bit fields, one mask each */
  372. #define UCASE_FULL_LOWER 0xf
  373. #define UCASE_FULL_FOLDING 0xf0
  374. #define UCASE_FULL_UPPER 0xf00
  375. #define UCASE_FULL_TITLE 0xf000
  376. /* maximum lengths: 4*0xf == 60 is four fields at the 4-bit maximum of 15 each */
  377. #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
  378. #define UCASE_CLOSURE_MAX_LENGTH 0xf
  379. /*
       * constants for reverse case folding ("unfold") data
       * The enumerators take the implicit values 0, 1, 2.
       * NOTE(review): presumably indexes of header values at the start of the
       * unfold array -- confirm against the code that reads it.
       */
  380. enum {
  381. UCASE_UNFOLD_ROWS,
  382. UCASE_UNFOLD_ROW_WIDTH,
  383. UCASE_UNFOLD_STRING_WIDTH
  384. };
  385. #endif