uset.h 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uset.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar07
  16. * created by: Markus W. Scherer
  17. *
  18. * C version of UnicodeSet.
  19. */
  20. /**
  21. * \file
  22. * \brief C API: Unicode Set
  23. *
  24. * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
  25. */
  26. #ifndef __USET_H__
  27. #define __USET_H__
  28. #include "unicode/utypes.h"
  29. #include "unicode/uchar.h"
  30. #if U_SHOW_CPLUSPLUS_API
  31. #include "unicode/localpointer.h"
  32. #endif // U_SHOW_CPLUSPLUS_API
  33. #ifndef USET_DEFINED
  34. #ifndef U_IN_DOXYGEN
  35. #define USET_DEFINED
  36. #endif
  37. /**
  38. * USet is the C API type corresponding to C++ class UnicodeSet.
  39. * Use the uset_* API to manipulate. Create with
  40. * uset_open*, and destroy with uset_close.
  41. * @stable ICU 2.4
  42. */
  43. typedef struct USet USet;
  44. #endif
  45. /**
  46. * Bitmask values to be passed to uset_openPatternOptions() or
  47. * uset_applyPattern() taking an option parameter.
  48. *
  49. * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  50. * These case options are mutually exclusive.
  51. *
  52. * Undefined options bits are ignored, and reserved for future use.
  53. *
  54. * @stable ICU 2.4
  55. */
  56. enum {
  57. /**
  58. * Ignore white space within patterns unless quoted or escaped.
  59. * @stable ICU 2.4
  60. */
  61. USET_IGNORE_SPACE = 1,
  62. /**
  63. * Enable case insensitive matching. E.g., "[ab]" with this flag
  64. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  65. * match all except 'a', 'A', 'b', and 'B'. This performs a full
  66. * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
  67. *
  68. * The resulting set is a superset of the input for the code points but
  69. * not for the strings.
  70. * It performs a case mapping closure of the code points and adds
  71. * full case folding strings for the code points, and reduces strings of
  72. * the original set to their full case folding equivalents.
  73. *
  74. * This is designed for case-insensitive matches, for example
  75. * in regular expressions. The full code point case closure allows checking of
  76. * an input character directly against the closure set.
  77. * Strings are matched by comparing the case-folded form from the closure
  78. * set with an incremental case folding of the string in question.
  79. *
  80. * The closure set will also contain single code points if the original
  81. * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
  82. * This is not necessary (that is, redundant) for the above matching method
  83. * but results in the same closure sets regardless of whether the original
  84. * set contained the code point or a string.
  85. *
  86. * @stable ICU 2.4
  87. */
  88. USET_CASE_INSENSITIVE = 2,
  89. /**
  90. * Adds all case mappings for each element in the set.
  91. * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
  92. * of each existing element in the set.
  93. *
  94. * Unlike the “case insensitive” options, this does not perform a closure.
  95. * For example, it does not add 'ſ' (U+017F long s) for 's',
  96. * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
  97. *
  98. * @stable ICU 3.2
  99. */
  100. USET_ADD_CASE_MAPPINGS = 4,
  101. #ifndef U_HIDE_DRAFT_API
  102. /**
  103. * Enable case insensitive matching.
  104. * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
  105. * which map each code point to one code point,
  106. * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
  107. *
  108. * This is designed for case-insensitive matches, for example in certain
  109. * regular expression implementations where only Simple_Case_Folding mappings are used,
  110. * such as in ECMAScript (JavaScript) regular expressions.
  111. *
  112. * @draft ICU 73
  113. */
  114. USET_SIMPLE_CASE_INSENSITIVE = 6
  115. #endif // U_HIDE_DRAFT_API
  116. };
  117. /**
  118. * Argument values for whether span() and similar functions continue while
  119. * the current character is contained vs. not contained in the set.
  120. *
  121. * The functionality is straightforward for sets with only single code points,
  122. * without strings (which is the common case):
  123. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
  124. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
  125. * - span() and spanBack() partition any string the same way when
  126. * alternating between span(USET_SPAN_NOT_CONTAINED) and
  127. * span(either "contained" condition).
  128. * - Using a complemented (inverted) set and the opposite span conditions
  129. * yields the same results.
  130. *
  131. * When a set contains multi-code point strings, then these statements may not
  132. * be true, depending on the strings in the set (for example, whether they
  133. * overlap with each other) and the string that is processed.
  134. * For a set with strings:
  135. * - The complement of the set contains the opposite set of code points,
  136. * but the same set of strings.
  137. * Therefore, complementing both the set and the span conditions
  138. * may yield different results.
  139. * - When starting spans at different positions in a string
  140. * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
  141. * because a set string may start before the later position.
  142. * - span(USET_SPAN_SIMPLE) may be shorter than
  143. * span(USET_SPAN_CONTAINED) because it will not recursively try
  144. * all possible paths.
  145. * For example, with a set which contains the three strings "xy", "xya" and "ax",
  146. * span("xyax", USET_SPAN_CONTAINED) will return 4 but
  147. * span("xyax", USET_SPAN_SIMPLE) will return 3.
  148. * span(USET_SPAN_SIMPLE) will never be longer than
  149. * span(USET_SPAN_CONTAINED).
  150. * - With either "contained" condition, span() and spanBack() may partition
  151. * a string in different ways.
  152. * For example, with a set which contains the two strings "ab" and "ba",
  153. * and when processing the string "aba",
  154. * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
  155. * while spanBack() will yield boundaries of { 0, 1, 3 }.
  156. *
  157. * Note: If it is important to get the same boundaries whether iterating forward
  158. * or backward through a string, then either only span() should be used and
  159. * the boundaries cached for backward operation, or an ICU BreakIterator
  160. * could be used.
  161. *
  162. * Note: Unpaired surrogates are treated like surrogate code points.
  163. * Similarly, set strings match only on code point boundaries,
  164. * never in the middle of a surrogate pair.
  165. * Illegal UTF-8 sequences are treated like U+FFFD.
  166. * When processing UTF-8 strings, malformed set strings
  167. * (strings with unpaired surrogates which cannot be converted to UTF-8)
  168. * are ignored.
  169. *
  170. * @stable ICU 3.8
  171. */
  172. typedef enum USetSpanCondition {
  173. /**
  174. * Continues a span() while there is no set element at the current position.
  175. * Increments by one code point at a time.
  176. * Stops before the first set element (character or string).
  177. * (For code points only, this is like while contains(current)==false).
  178. *
  179. * When span() returns, the substring between where it started and the position
  180. * it returned consists only of characters that are not in the set,
  181. * and none of its strings overlap with the span.
  182. *
  183. * @stable ICU 3.8
  184. */
  185. USET_SPAN_NOT_CONTAINED = 0,
  186. /**
  187. * Spans the longest substring that is a concatenation of set elements (characters or strings).
  188. * (For characters only, this is like while contains(current)==true).
  189. *
  190. * When span() returns, the substring between where it started and the position
  191. * it returned consists only of set elements (characters or strings) that are in the set.
  192. *
  193. * If a set contains strings, then the span will be the longest substring for which there
  194. * exists at least one non-overlapping concatenation of set elements (characters or strings).
  195. * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
  196. * (Java/ICU/Perl regex stops at the first match of an OR.)
  197. *
  198. * @stable ICU 3.8
  199. */
  200. USET_SPAN_CONTAINED = 1,
  201. /**
  202. * Continues a span() while there is a set element at the current position.
  203. * Increments by the longest matching element at each position.
  204. * (For characters only, this is like while contains(current)==true).
  205. *
  206. * When span() returns, the substring between where it started and the position
  207. * it returned consists only of set elements (characters or strings) that are in the set.
  208. *
  209. * If a set only contains single characters, then this is the same
  210. * as USET_SPAN_CONTAINED.
  211. *
  212. * If a set contains strings, then the span will be the longest substring
  213. * with a match at each position with the longest single set element (character or string).
  214. *
  215. * Use this span condition together with other longest-match algorithms,
  216. * such as ICU converters (ucnv_getUnicodeSet()).
  217. *
  218. * @stable ICU 3.8
  219. */
  220. USET_SPAN_SIMPLE = 2,
  221. #ifndef U_HIDE_DEPRECATED_API
  222. /**
  223. * One more than the last span condition.
  224. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  225. */
  226. USET_SPAN_CONDITION_COUNT
  227. #endif // U_HIDE_DEPRECATED_API
  228. } USetSpanCondition;
  229. enum {
  230. /**
  231. * Capacity of USerializedSet::staticArray.
  232. * Enough for any single-code point set.
  233. * Also provides padding for nice sizeof(USerializedSet).
  234. * @stable ICU 2.4
  235. */
  236. USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
  237. };
  238. /**
  239. * A serialized form of a Unicode set. Limited manipulations are
  240. * possible directly on a serialized set. See below.
  241. * @stable ICU 2.4
  242. */
  243. typedef struct USerializedSet {
  244. /**
  245. * The serialized Unicode Set.
  246. * @stable ICU 2.4
  247. */
  248. const uint16_t *array;
  249. /**
  250. * The length of the array that contains BMP characters.
  251. * @stable ICU 2.4
  252. */
  253. int32_t bmpLength;
  254. /**
  255. * The total length of the array.
  256. * @stable ICU 2.4
  257. */
  258. int32_t length;
  259. /**
  260. * A small buffer for the array to reduce memory allocations.
  261. * @stable ICU 2.4
  262. */
  263. uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
  264. } USerializedSet;
  265. /*********************************************************************
  266. * USet API
  267. *********************************************************************/
  268. /**
  269. * Create an empty USet object.
  270. * Equivalent to uset_open(1, 0).
  271. * @return a newly created USet. The caller must call uset_close() on
  272. * it when done.
  273. * @stable ICU 4.2
  274. */
  275. U_CAPI USet* U_EXPORT2
  276. uset_openEmpty(void);
  277. /**
  278. * Creates a USet object that contains the range of characters
  279. * start..end, inclusive. If <code>start > end</code>
  280. * then an empty set is created (same as using uset_openEmpty()).
  281. * @param start first character of the range, inclusive
  282. * @param end last character of the range, inclusive
  283. * @return a newly created USet. The caller must call uset_close() on
  284. * it when done.
  285. * @stable ICU 2.4
  286. */
  287. U_CAPI USet* U_EXPORT2
  288. uset_open(UChar32 start, UChar32 end);
  289. /**
  290. * Creates a set from the given pattern. See the UnicodeSet class
  291. * description for the syntax of the pattern language.
  292. * @param pattern a string specifying what characters are in the set
  293. * @param patternLength the length of the pattern, or -1 if null
  294. * terminated
  295. * @param ec the error code
  296. * @stable ICU 2.4
  297. */
  298. U_CAPI USet* U_EXPORT2
  299. uset_openPattern(const UChar* pattern, int32_t patternLength,
  300. UErrorCode* ec);
  301. /**
  302. * Creates a set from the given pattern. See the UnicodeSet class
  303. * description for the syntax of the pattern language.
  304. * @param pattern a string specifying what characters are in the set
  305. * @param patternLength the length of the pattern, or -1 if null
  306. * terminated
  307. * @param options bitmask for options to apply to the pattern.
  308. * Valid options are USET_IGNORE_SPACE and
  309. * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  310. * These case options are mutually exclusive.
  311. * @param ec the error code
  312. * @stable ICU 2.4
  313. */
  314. U_CAPI USet* U_EXPORT2
  315. uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
  316. uint32_t options,
  317. UErrorCode* ec);
  318. /**
  319. * Disposes of the storage used by a USet object. This function should
  320. * be called exactly once for each object returned by one of the uset_open*() functions.
  321. * @param set the object to dispose of
  322. * @stable ICU 2.4
  323. */
  324. U_CAPI void U_EXPORT2
  325. uset_close(USet* set);
  326. #if U_SHOW_CPLUSPLUS_API
  327. U_NAMESPACE_BEGIN
  328. /**
  329. * \class LocalUSetPointer
  330. * "Smart pointer" class, closes a USet via uset_close().
  331. * For most methods see the LocalPointerBase base class.
  332. *
  333. * @see LocalPointerBase
  334. * @see LocalPointer
  335. * @stable ICU 4.4
  336. */
  337. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
  338. U_NAMESPACE_END
  339. #endif
  340. /**
  341. * Returns a copy of this object.
  342. * If this set is frozen, then the clone will be frozen as well.
  343. * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
  344. * @param set the original set
  345. * @return the newly allocated copy of the set
  346. * @see uset_cloneAsThawed
  347. * @stable ICU 3.8
  348. */
  349. U_CAPI USet * U_EXPORT2
  350. uset_clone(const USet *set);
  351. /**
  352. * Determines whether the set has been frozen (made immutable) or not.
  353. * See the ICU4J Freezable interface for details.
  354. * @param set the set
  355. * @return true/false for whether the set has been frozen
  356. * @see uset_freeze
  357. * @see uset_cloneAsThawed
  358. * @stable ICU 3.8
  359. */
  360. U_CAPI UBool U_EXPORT2
  361. uset_isFrozen(const USet *set);
  362. /**
  363. * Freeze the set (make it immutable).
  364. * Once frozen, it cannot be unfrozen and is therefore thread-safe
  365. * until it is deleted.
  366. * See the ICU4J Freezable interface for details.
  367. * Freezing the set may also make some operations faster, for example
  368. * uset_contains() and uset_span().
  369. * A frozen set will not be modified. (It remains frozen.)
  370. * @param set the set to freeze
  371. * @note This function is declared void and returns no value.
  372. * @see uset_isFrozen
  373. * @see uset_cloneAsThawed
  374. * @stable ICU 3.8
  375. */
  376. U_CAPI void U_EXPORT2
  377. uset_freeze(USet *set);
  378. /**
  379. * Clone the set and make the clone mutable.
  380. * See the ICU4J Freezable interface for details.
  381. * @param set the set
  382. * @return the mutable clone
  383. * @see uset_freeze
  384. * @see uset_isFrozen
  385. * @see uset_clone
  386. * @stable ICU 3.8
  387. */
  388. U_CAPI USet * U_EXPORT2
  389. uset_cloneAsThawed(const USet *set);
  390. /**
  391. * Causes the USet object to represent the range <code>start - end</code>.
  392. * If <code>start > end</code> then this USet is set to an empty range.
  393. * A frozen set will not be modified.
  394. * @param set the object to set to the given range
  395. * @param start first character in the set, inclusive
  396. * @param end last character in the set, inclusive
  397. * @stable ICU 3.2
  398. */
  399. U_CAPI void U_EXPORT2
  400. uset_set(USet* set,
  401. UChar32 start, UChar32 end);
  402. /**
  403. * Modifies the set to represent the set specified by the given
  404. * pattern. See the UnicodeSet class description for the syntax of
  405. * the pattern language. See also the User Guide chapter about UnicodeSet.
  406. * <em>Empties the set passed before applying the pattern.</em>
  407. * A frozen set will not be modified.
  408. * @param set The set to which the pattern is to be applied.
  409. * @param pattern A pointer to UChar string specifying what characters are in the set.
  410. * The character at pattern[0] must be a '['.
  411. * @param patternLength The length of the UChar string. -1 if NUL terminated.
  412. * @param options A bitmask for options to apply to the pattern.
  413. * Valid options are USET_IGNORE_SPACE and
  414. * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
  415. * USET_SIMPLE_CASE_INSENSITIVE.
  416. * These case options are mutually exclusive.
  417. * @param status Returns an error if the pattern cannot be parsed.
  418. * @return Upon successful parse, the value is
  419. * the index of the character after the closing ']'
  420. * of the parsed pattern.
  421. * If the status code indicates failure, then the return value
  422. * is the index of the error in the source.
  423. *
  424. * @stable ICU 2.8
  425. */
  426. U_CAPI int32_t U_EXPORT2
  427. uset_applyPattern(USet *set,
  428. const UChar *pattern, int32_t patternLength,
  429. uint32_t options,
  430. UErrorCode *status);
  431. /**
  432. * Modifies the set to contain those code points which have the given value
  433. * for the given binary or enumerated property, as returned by
  434. * u_getIntPropertyValue. Prior contents of this set are lost.
  435. * A frozen set will not be modified.
  436. *
  437. * @param set the object to contain the code points defined by the property
  438. *
  439. * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
  440. * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
  441. * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
  442. *
  443. * @param value a value in the range u_getIntPropertyMinValue(prop)..
  444. * u_getIntPropertyMaxValue(prop), with one exception. If prop is
  445. * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
  446. * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
  447. * categories such as [:L:] to be represented.
  448. *
  449. * @param ec error code input/output parameter
  450. *
  451. * @stable ICU 3.2
  452. */
  453. U_CAPI void U_EXPORT2
  454. uset_applyIntPropertyValue(USet* set,
  455. UProperty prop, int32_t value, UErrorCode* ec);
  456. /**
  457. * Modifies the set to contain those code points which have the
  458. * given value for the given property. Prior contents of this
  459. * set are lost.
  460. * A frozen set will not be modified.
  461. *
  462. * @param set the object to contain the code points defined by the given
  463. * property and value alias
  464. *
  465. * @param prop a string specifying a property alias, either short or long.
  466. * The name is matched loosely. See PropertyAliases.txt for names and a
  467. * description of loose matching. If the value string is empty, then this
  468. * string is interpreted as either a General_Category value alias, a Script
  469. * value alias, a binary property alias, or a special ID. Special IDs are
  470. * matched loosely and correspond to the following sets:
  471. *
  472. * "ANY" = [\\u0000-\\U0010FFFF],
  473. * "ASCII" = [\\u0000-\\u007F],
  474. * "Assigned" = [:^Cn:].
  475. *
  476. * @param propLength the length of the prop string, or -1 if it is NUL-terminated
  477. *
  478. * @param value a string specifying a value alias, either short or long.
  479. * The name is matched loosely. See PropertyValueAliases.txt for names
  480. * and a description of loose matching. In addition to aliases listed,
  481. * numeric values and canonical combining classes may be expressed
  482. * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
  483. * may also be empty.
  484. *
  485. * @param valueLength the length of the value string, or -1 if it is NUL-terminated
  486. *
  487. * @param ec error code input/output parameter
  488. *
  489. * @stable ICU 3.2
  490. */
  491. U_CAPI void U_EXPORT2
  492. uset_applyPropertyAlias(USet* set,
  493. const UChar *prop, int32_t propLength,
  494. const UChar *value, int32_t valueLength,
  495. UErrorCode* ec);
  496. /**
  497. * Return true if the given position, in the given pattern, appears
  498. * to be the start of a UnicodeSet pattern.
  499. *
  500. * @param pattern a string specifying the pattern
  501. * @param patternLength the length of the pattern, or -1 if it is NUL-terminated
  502. * @param pos the given position
  503. * @stable ICU 3.2
  504. */
  505. U_CAPI UBool U_EXPORT2
  506. uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
  507. int32_t pos);
  508. /**
  509. * Returns a string representation of this set. If the result of
  510. * calling this function is passed to a uset_openPattern(), it
  511. * will produce another set that is equal to this one.
  512. * @param set the set
  513. * @param result the string to receive the rules, may be NULL
  514. * @param resultCapacity the capacity of result, may be 0 if result is NULL
  515. * @param escapeUnprintable if true then convert unprintable
  516. * characters to their hex escape representations, \\uxxxx or
  517. * \\Uxxxxxxxx. Unprintable characters are those other than
  518. * U+000A, U+0020..U+007E.
  519. * @param ec error code.
  520. * @return length of string, possibly larger than resultCapacity
  521. * @stable ICU 2.4
  522. */
  523. U_CAPI int32_t U_EXPORT2
  524. uset_toPattern(const USet* set,
  525. UChar* result, int32_t resultCapacity,
  526. UBool escapeUnprintable,
  527. UErrorCode* ec);
  528. /**
  529. * Adds the given character to the given USet. After this call,
  530. * uset_contains(set, c) will return true.
  531. * A frozen set will not be modified.
  532. * @param set the object to which to add the character
  533. * @param c the character to add
  534. * @stable ICU 2.4
  535. */
  536. U_CAPI void U_EXPORT2
  537. uset_add(USet* set, UChar32 c);
  538. /**
  539. * Adds all of the elements in the specified set to this set if
  540. * they're not already present. This operation effectively
  541. * modifies this set so that its value is the <i>union</i> of the two
  542. * sets. The behavior of this operation is unspecified if the specified
  543. * collection is modified while the operation is in progress.
  544. * A frozen set will not be modified.
  545. *
  546. * @param set the object to which to add the set
  547. * @param additionalSet the source set whose elements are to be added to this set.
  548. * @stable ICU 2.6
  549. */
  550. U_CAPI void U_EXPORT2
  551. uset_addAll(USet* set, const USet *additionalSet);
  552. /**
  553. * Adds the given range of characters to the given USet. After this call,
  554. * uset_contains(set, c) will return true for every c in start..end.
  555. * A frozen set will not be modified.
  556. * @param set the object to which to add the range
  557. * @param start the first character of the range to add, inclusive
  558. * @param end the last character of the range to add, inclusive
  559. * @stable ICU 2.2
  560. */
  561. U_CAPI void U_EXPORT2
  562. uset_addRange(USet* set, UChar32 start, UChar32 end);
  563. /**
  564. * Adds the given string to the given USet. After this call,
  565. * uset_containsString(set, str, strLen) will return true.
  566. * A frozen set will not be modified.
  567. * @param set the object to which to add the string
  568. * @param str the string to add
  569. * @param strLen the length of the string or -1 if null terminated.
  570. * @stable ICU 2.4
  571. */
  572. U_CAPI void U_EXPORT2
  573. uset_addString(USet* set, const UChar* str, int32_t strLen);
  574. /**
  575. * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
  576. * If this set already contains any particular character, it has no effect on that character.
  577. * A frozen set will not be modified.
  578. * @param set the object to which to add the characters
  579. * @param str the source string
  580. * @param strLen the length of the string or -1 if null terminated.
  581. * @stable ICU 3.4
  582. */
  583. U_CAPI void U_EXPORT2
  584. uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
  585. /**
  586. * Removes the given character from the given USet. After this call,
  587. * uset_contains(set, c) will return false.
  588. * A frozen set will not be modified.
  589. * @param set the object from which to remove the character
  590. * @param c the character to remove
  591. * @stable ICU 2.4
  592. */
  593. U_CAPI void U_EXPORT2
  594. uset_remove(USet* set, UChar32 c);
  595. /**
  596. * Removes the given range of characters from the given USet. After this call,
  597. * uset_contains(set, c) will return false for every c in start..end.
  598. * A frozen set will not be modified.
  599. * @param set the object from which to remove the range
  600. * @param start the first character of the range to remove, inclusive
  601. * @param end the last character of the range to remove, inclusive
  602. * @stable ICU 2.2
  603. */
  604. U_CAPI void U_EXPORT2
  605. uset_removeRange(USet* set, UChar32 start, UChar32 end);
  606. /**
  607. * Removes the given string from the given USet. After this call,
  608. * uset_containsString(set, str, strLen) will return false.
  609. * A frozen set will not be modified.
  610. * @param set the object from which to remove the string
  611. * @param str the string to remove
  612. * @param strLen the length of the string or -1 if null terminated.
  613. * @stable ICU 2.4
  614. */
  615. U_CAPI void U_EXPORT2
  616. uset_removeString(USet* set, const UChar* str, int32_t strLen);
  617. /**
  618. * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
  619. * A frozen set will not be modified.
  620. *
  621. * @param set the object to be modified
  622. * @param str the string
  623. * @param length the length of the string, or -1 if NUL-terminated
  624. * @stable ICU 69
  625. */
  626. U_CAPI void U_EXPORT2
  627. uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
  628. /**
  629. * Removes from this set all of its elements that are contained in the
  630. * specified set. This operation effectively modifies this
  631. * set so that its value is the <i>asymmetric set difference</i> of
  632. * the two sets.
  633. * A frozen set will not be modified.
  634. * @param set the object from which the elements are to be removed
  635. * @param removeSet the object that defines which elements will be
  636. * removed from this set
  637. * @stable ICU 3.2
  638. */
  639. U_CAPI void U_EXPORT2
  640. uset_removeAll(USet* set, const USet* removeSet);
  641. /**
  642. * Retain only the elements in this set that are contained in the
  643. * specified range. If <code>start > end</code> then an empty range is
  644. * retained, leaving the set empty. This is equivalent to
  645. * a boolean logic AND, or a set INTERSECTION.
  646. * A frozen set will not be modified.
  647. *
  648. * @param set the object for which to retain only the specified range
  649. * @param start first character, inclusive, of range
  650. * @param end last character, inclusive, of range
  651. * @stable ICU 3.2
  652. */
  653. U_CAPI void U_EXPORT2
  654. uset_retain(USet* set, UChar32 start, UChar32 end);
  655. /**
  656. * Retains only the specified string from this set if it is present.
  657. * Upon return this set will be empty if it did not contain str, or
  658. * will only contain str if it did contain str.
  659. * A frozen set will not be modified.
  660. *
  661. * @param set the object to be modified
  662. * @param str the string to retain
  663. * @param length the length of the string, or -1 if NUL-terminated
  664. * @stable ICU 69
  665. */
  666. U_CAPI void U_EXPORT2
  667. uset_retainString(USet *set, const UChar *str, int32_t length);
  668. /**
  669. * Retains EACH of the characters in this string. Note: "ch" is treated as {"c", "h"}.
  670. * A frozen set will not be modified.
  671. *
  672. * @param set the object to be modified
  673. * @param str the string whose characters are to be retained
  674. * @param length the length of the string, or -1 if NUL-terminated
  675. * @stable ICU 69
  676. */
  677. U_CAPI void U_EXPORT2
  678. uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
  679. /**
  680. * Retains only the elements in this set that are contained in the
  681. * specified set. In other words, removes from this set all of
  682. * its elements that are not contained in the specified set. This
  683. * operation effectively modifies this set so that its value is
  684. * the <i>intersection</i> of the two sets.
  685. * A frozen set will not be modified.
  686. *
  687. * @param set the object on which to perform the retain
  688. * @param retain set that defines which elements this set will retain
  689. * @stable ICU 3.2
  690. */
  691. U_CAPI void U_EXPORT2
  692. uset_retainAll(USet* set, const USet* retain);
  693. /**
  694. * Reallocate this object's internal structures to take up the least
  695. * possible space, without changing this object's value.
  696. * A frozen set will not be modified.
  697. *
  698. * @param set the object on which to perform the compact
  699. * @stable ICU 3.2
  700. */
  701. U_CAPI void U_EXPORT2
  702. uset_compact(USet* set);
  703. /**
  704. * This is equivalent to
  705. * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
  706. *
  707. * <strong>Note:</strong> This performs a symmetric difference with all code points
  708. * <em>and thus retains all multicharacter strings</em>.
  709. * In order to achieve a “code point complement” (all code points minus this set),
  710. * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>.
  711. *
  712. * A frozen set will not be modified.
  713. * @param set the set
  714. * @stable ICU 2.4
  715. */
  716. U_CAPI void U_EXPORT2
  717. uset_complement(USet* set);
  718. /**
  719. * Complements the specified range in this set. Any character in
  720. * the range will be removed if it is in this set, or will be
  721. * added if it is not in this set. If <code>start > end</code>
  722. * then an empty range is complemented, leaving the set unchanged.
  723. * This is equivalent to a boolean logic XOR.
  724. * A frozen set will not be modified.
  725. *
  726. * @param set the object to be modified
  727. * @param start first character, inclusive, of range
  728. * @param end last character, inclusive, of range
  729. * @stable ICU 69
  730. */
  731. U_CAPI void U_EXPORT2
  732. uset_complementRange(USet *set, UChar32 start, UChar32 end);
  733. /**
  734. * Complements the specified string in this set.
  735. * The string will be removed if it is in this set, or will be added if it is not in this set.
  736. * A frozen set will not be modified.
  737. *
  738. * @param set the object to be modified
  739. * @param str the string to complement
  740. * @param length the length of the string, or -1 if NUL-terminated
  741. * @stable ICU 69
  742. */
  743. U_CAPI void U_EXPORT2
  744. uset_complementString(USet *set, const UChar *str, int32_t length);
  745. /**
  746. * Complements EACH of the characters in this string. Note: "ch" is treated as {"c", "h"}.
  747. * A frozen set will not be modified.
  748. *
  749. * @param set the object to be modified
  750. * @param str the string whose characters are to be complemented
  751. * @param length the length of the string, or -1 if NUL-terminated
  752. * @stable ICU 69
  753. */
  754. U_CAPI void U_EXPORT2
  755. uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
  756. /**
  757. * Complements in this set all elements contained in the specified
  758. * set. Any character in the other set will be removed if it is
  759. * in this set, or will be added if it is not in this set.
  760. * A frozen set will not be modified.
  761. *
  762. * @param set the set to be modified
  763. * @param complement set that defines which elements will be xor'ed
  764. * with this set.
  765. * @stable ICU 3.2
  766. */
  767. U_CAPI void U_EXPORT2
  768. uset_complementAll(USet* set, const USet* complement);
  769. /**
  770. * Removes all of the elements from this set. This set will be
  771. * empty after this call returns.
  772. * A frozen set will not be modified.
  773. * @param set the set
  774. * @stable ICU 2.4
  775. */
  776. U_CAPI void U_EXPORT2
  777. uset_clear(USet* set);
  778. /**
  779. * Close this set over the given attribute. For the attribute
  780. * USET_CASE_INSENSITIVE, the result is to modify this set so that:
  781. *
  782. * 1. For each character or string 'a' in this set, all strings or
  783. * characters 'b' such that foldCase(a) == foldCase(b) are added
  784. * to this set.
  785. *
  786. * 2. For each string 'e' in the resulting set, if e !=
  787. * foldCase(e), 'e' will be removed.
  788. *
  789. * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  790. *
  791. * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  792. * == b denotes that the contents are the same, not pointer
  793. * comparison.)
  794. *
  795. * A frozen set will not be modified.
  796. *
  797. * @param set the set
  798. *
  799. * @param attributes bitmask for attributes to close over.
  800. * Valid options:
  801. * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  802. * These case options are mutually exclusive.
  803. * Unrelated option bits are ignored.
  804. * @stable ICU 4.2
  805. */
  806. U_CAPI void U_EXPORT2
  807. uset_closeOver(USet* set, int32_t attributes);
  808. /**
  809. * Remove all strings from this set.
  810. *
  811. * @param set the set
  812. * @stable ICU 4.2
  813. */
  814. U_CAPI void U_EXPORT2
  815. uset_removeAllStrings(USet* set);
  816. /**
  817. * Returns true if the given USet contains no characters and no
  818. * strings.
  819. * @param set the set
  820. * @return true if set is empty
  821. * @stable ICU 2.4
  822. */
  823. U_CAPI UBool U_EXPORT2
  824. uset_isEmpty(const USet* set);
  825. /**
  826. * @param set the set
  827. * @return true if this set contains multi-character strings or the empty string.
  828. * @stable ICU 70
  829. */
  830. U_CAPI UBool U_EXPORT2
  831. uset_hasStrings(const USet *set);
  832. /**
  833. * Returns true if the given USet contains the given character.
  834. * This function works faster with a frozen set.
  835. * @param set the set
  836. * @param c The codepoint to check for within the set
  837. * @return true if set contains c
  838. * @stable ICU 2.4
  839. */
  840. U_CAPI UBool U_EXPORT2
  841. uset_contains(const USet* set, UChar32 c);
  842. /**
  843. * Returns true if the given USet contains all characters c
  844. * where start <= c && c <= end.
  845. * @param set the set
  846. * @param start the first character of the range to test, inclusive
  847. * @param end the last character of the range to test, inclusive
  848. * @return true if set contains the range
  849. * @stable ICU 2.2
  850. */
  851. U_CAPI UBool U_EXPORT2
  852. uset_containsRange(const USet* set, UChar32 start, UChar32 end);
  853. /**
  854. * Returns true if the given USet contains the given string.
  855. * @param set the set
  856. * @param str the string
  857. * @param strLen the length of the string or -1 if null terminated.
  858. * @return true if set contains str
  859. * @stable ICU 2.4
  860. */
  861. U_CAPI UBool U_EXPORT2
  862. uset_containsString(const USet* set, const UChar* str, int32_t strLen);
  863. /**
  864. * Returns the index of the given character within this set, where
  865. * the set is ordered by ascending code point. If the character
  866. * is not in this set, returns -1. The inverse of this function is
  867. * <code>uset_charAt()</code>.
  868. * @param set the set
  869. * @param c the character to obtain the index for
  870. * @return an index from 0..size()-1, or -1
  871. * @stable ICU 3.2
  872. */
  873. U_CAPI int32_t U_EXPORT2
  874. uset_indexOf(const USet* set, UChar32 c);
  875. /**
  876. * Returns the character at the given index within this set, where
  877. * the set is ordered by ascending code point. If the index is
  878. * out of range for characters, returns (UChar32)-1.
  879. * The inverse of this function is <code>uset_indexOf()</code>.
  880. *
  881. * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
  882. * with uset_getItem(), because for each call it skips linearly over <code>index</code>
  883. * characters in the ranges.
  884. *
  885. * @param set the set
  886. * @param charIndex an index from 0..size()-1 to obtain the char for
  887. * @return the character at the given index, or (UChar32)-1.
  888. * @stable ICU 3.2
  889. */
  890. U_CAPI UChar32 U_EXPORT2
  891. uset_charAt(const USet* set, int32_t charIndex);
  892. /**
  893. * Returns the number of characters and strings contained in this set.
  894. * The last (uset_getItemCount() - uset_getRangeCount()) items are strings.
  895. *
  896. * This is slower than uset_getRangeCount() and uset_getItemCount() because
  897. * it counts the code points of all ranges.
  898. *
  899. * @param set the set
  900. * @return a non-negative integer counting the characters and strings
  901. * contained in set
  902. * @stable ICU 2.4
  903. * @see uset_getRangeCount
  904. */
  905. U_CAPI int32_t U_EXPORT2
  906. uset_size(const USet* set);
  907. /**
  908. * @param set the set
  909. * @return the number of ranges in this set.
  910. * @stable ICU 70
  911. * @see uset_getItemCount
  912. * @see uset_getItem
  913. * @see uset_size
  914. */
  915. U_CAPI int32_t U_EXPORT2
  916. uset_getRangeCount(const USet *set);
  917. /**
  918. * Returns the number of items in this set. An item is either a range
  919. * of characters or a single multicharacter string (which can be the empty string).
  920. * @param set the set
  921. * @return a non-negative integer counting the character ranges
  922. * and/or strings contained in set
  923. * @stable ICU 2.4
  924. */
  925. U_CAPI int32_t U_EXPORT2
  926. uset_getItemCount(const USet* set);
  927. /**
  928. * Returns an item of this set. An item is either a range of
  929. * characters or a single multicharacter string (which can be the empty string).
  930. *
  931. * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
  932. * and the range is <code>*start</code>..<code>*end</code>.
  933. *
  934. * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
  935. * this function copies the string into <code>str[strCapacity]</code> and
  936. * returns the length of the string (0 for the empty string).
  937. *
  938. * If <code>itemIndex</code> is out of range, then this function returns -1.
  939. *
  940. * Note that 0 is returned for each range as well as for the empty string.
  941. *
  942. * @param set the set
  943. * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
  944. * @param start pointer to variable to receive first character in range, inclusive;
  945. * can be NULL for a string item
  946. * @param end pointer to variable to receive last character in range, inclusive;
  947. * can be NULL for a string item
  948. * @param str buffer to receive the string, may be NULL
  949. * @param strCapacity capacity of str, or 0 if str is NULL
  950. * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
  951. * @return the length of the string (0 or >= 2), or 0 if the item is a range,
  952. * or -1 if the itemIndex is out of range
  953. * @stable ICU 2.4
  954. */
  955. U_CAPI int32_t U_EXPORT2
  956. uset_getItem(const USet* set, int32_t itemIndex,
  957. UChar32* start, UChar32* end,
  958. UChar* str, int32_t strCapacity,
  959. UErrorCode* ec);
  960. /**
  961. * Returns true if set1 contains all the characters and strings
  962. * of set2. It answers the question, 'Is set1 a superset of set2?'
  963. * @param set1 the candidate superset
  964. * @param set2 the set whose characters and strings must all be in set1
  965. * @return true if the test condition is met
  966. * @stable ICU 3.2
  967. */
  968. U_CAPI UBool U_EXPORT2
  969. uset_containsAll(const USet* set1, const USet* set2);
  970. /**
  971. * Returns true if this set contains all the characters
  972. * of the given string. This does not check containment of grapheme
  973. * clusters, like uset_containsString.
  974. * @param set set of characters to be checked for containment
  975. * @param str string containing codepoints to be checked for containment
  976. * @param strLen the length of the string or -1 if null terminated.
  977. * @return true if the test condition is met
  978. * @stable ICU 3.4
  979. */
  980. U_CAPI UBool U_EXPORT2
  981. uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
  982. /**
  983. * Returns true if set1 contains none of the characters and strings
  984. * of set2. It answers the question, 'Are set1 and set2 disjoint?'
  985. * @param set1 the first set to compare
  986. * @param set2 the second set to compare
  987. * @return true if the test condition is met
  988. * @stable ICU 3.2
  989. */
  990. U_CAPI UBool U_EXPORT2
  991. uset_containsNone(const USet* set1, const USet* set2);
  992. /**
  993. * Returns true if set1 contains some of the characters and strings
  994. * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
  995. * @param set1 the first set to compare
  996. * @param set2 the second set to compare
  997. * @return true if the test condition is met
  998. * @stable ICU 3.2
  999. */
  1000. U_CAPI UBool U_EXPORT2
  1001. uset_containsSome(const USet* set1, const USet* set2);
  1002. /**
  1003. * Returns the length of the initial substring of the input string which
  1004. * consists only of characters and strings that are contained in this set
  1005. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1006. * or only of characters and strings that are not contained
  1007. * in this set (USET_SPAN_NOT_CONTAINED).
  1008. * See USetSpanCondition for details.
  1009. * Similar to the strspn() C library function.
  1010. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  1011. * This function works faster with a frozen set and with a non-negative string length argument.
  1012. * @param set the set
  1013. * @param s start of the string
  1014. * @param length the length of the string; can be -1 for NUL-terminated
  1015. * @param spanCondition specifies the containment condition
  1016. * @return the length of the initial substring according to the spanCondition;
  1017. * 0 if the start of the string does not fit the spanCondition
  1018. * @stable ICU 3.8
  1019. * @see USetSpanCondition
  1020. */
  1021. U_CAPI int32_t U_EXPORT2
  1022. uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  1023. /**
  1024. * Returns the start of the trailing substring of the input string which
  1025. * consists only of characters and strings that are contained in this set
  1026. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1027. * or only of characters and strings that are not contained
  1028. * in this set (USET_SPAN_NOT_CONTAINED).
  1029. * See USetSpanCondition for details.
  1030. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  1031. * This function works faster with a frozen set and with a non-negative string length argument.
  1032. * @param set the set
  1033. * @param s start of the string
  1034. * @param length the length of the string; can be -1 for NUL-terminated
  1035. * @param spanCondition specifies the containment condition
  1036. * @return the start of the trailing substring according to the spanCondition;
  1037. * the string length if the end of the string does not fit the spanCondition
  1038. * @stable ICU 3.8
  1039. * @see USetSpanCondition
  1040. */
  1041. U_CAPI int32_t U_EXPORT2
  1042. uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  1043. /**
  1044. * Returns the length of the initial substring of the input string which
  1045. * consists only of characters and strings that are contained in this set
  1046. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1047. * or only of characters and strings that are not contained
  1048. * in this set (USET_SPAN_NOT_CONTAINED).
  1049. * See USetSpanCondition for details.
  1050. * Similar to the strspn() C library function.
  1051. * Malformed byte sequences are treated according to contains(0xfffd).
  1052. * This function works faster with a frozen set and with a non-negative string length argument.
  1053. * @param set the set
  1054. * @param s start of the string (UTF-8)
  1055. * @param length the length of the string; can be -1 for NUL-terminated
  1056. * @param spanCondition specifies the containment condition
  1057. * @return the length of the initial substring according to the spanCondition;
  1058. * 0 if the start of the string does not fit the spanCondition
  1059. * @stable ICU 3.8
  1060. * @see USetSpanCondition
  1061. */
  1062. U_CAPI int32_t U_EXPORT2
  1063. uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  1064. /**
  1065. * Returns the start of the trailing substring of the input string which
  1066. * consists only of characters and strings that are contained in this set
  1067. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1068. * or only of characters and strings that are not contained
  1069. * in this set (USET_SPAN_NOT_CONTAINED).
  1070. * See USetSpanCondition for details.
  1071. * Malformed byte sequences are treated according to contains(0xfffd).
  1072. * This function works faster with a frozen set and with a non-negative string length argument.
  1073. * @param set the set
  1074. * @param s start of the string (UTF-8)
  1075. * @param length the length of the string; can be -1 for NUL-terminated
  1076. * @param spanCondition specifies the containment condition
  1077. * @return the start of the trailing substring according to the spanCondition;
  1078. * the string length if the end of the string does not fit the spanCondition
  1079. * @stable ICU 3.8
  1080. * @see USetSpanCondition
  1081. */
  1082. U_CAPI int32_t U_EXPORT2
  1083. uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  1084. /**
  1085. * Returns true if set1 contains all of the characters and strings
  1086. * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
  1087. * @param set1 the first set to compare
  1088. * @param set2 the second set to compare
  1089. * @return true if the test condition is met
  1090. * @stable ICU 3.2
  1091. */
  1092. U_CAPI UBool U_EXPORT2
  1093. uset_equals(const USet* set1, const USet* set2);
  1094. /*********************************************************************
  1095. * Serialized set API
  1096. *********************************************************************/
  1097. /**
  1098. * Serializes this set into an array of 16-bit integers. Serialization
  1099. * (currently) only records the characters in the set; multicharacter
  1100. * strings are ignored.
  1101. *
  1102. * The array
  1103. * has the following format (each line is one 16-bit integer):
  1104. *
  1105. * length = (n+2*m) | (m!=0?0x8000:0)
  1106. * bmpLength = n; present if m!=0
  1107. * bmp[0]
  1108. * bmp[1]
  1109. * ...
  1110. * bmp[n-1]
  1111. * supp-high[0]
  1112. * supp-low[0]
  1113. * supp-high[1]
  1114. * supp-low[1]
  1115. * ...
  1116. * supp-high[m-1]
  1117. * supp-low[m-1]
  1118. *
  1119. * The array starts with a header. After the header are n bmp
  1120. * code points, then m supplementary code points. Either n or m
  1121. * or both may be zero. n+2*m is always <= 0x7FFF.
  1122. *
  1123. * If there are no supplementary characters (if m==0) then the
  1124. * header is one 16-bit integer, 'length', with value n.
  1125. *
  1126. * If there are supplementary characters (if m!=0) then the header
  1127. * is two 16-bit integers. The first, 'length', has value
  1128. * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
  1129. *
  1130. * After the header the code points are stored in ascending order.
  1131. * Supplementary code points are stored as most significant 16
  1132. * bits followed by least significant 16 bits.
  1133. *
  1134. * @param set the set
  1135. * @param dest pointer to buffer of destCapacity 16-bit integers.
  1136. * May be NULL only if destCapacity is zero.
  1137. * @param destCapacity size of dest, or zero. Must not be negative.
  1138. * @param pErrorCode pointer to the error code. Will be set to
  1139. * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
  1140. * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
  1141. * @return the total length of the serialized format, including
  1142. * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  1143. * than U_BUFFER_OVERFLOW_ERROR.
  1144. * @stable ICU 2.4
  1145. */
  1146. U_CAPI int32_t U_EXPORT2
  1147. uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
  1148. /**
  1149. * Given a serialized array, fill in the given serialized set object.
  1150. * @param fillSet pointer to result
  1151. * @param src pointer to start of array
  1152. * @param srcLength length of array
  1153. * @return true if the given array is valid, otherwise false
  1154. * @stable ICU 2.4
  1155. */
  1156. U_CAPI UBool U_EXPORT2
  1157. uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
  1158. /**
  1159. * Set the USerializedSet to contain the given character (and nothing
  1160. * else).
  1161. * @param fillSet pointer to result
  1162. * @param c The codepoint to set
  1163. * @stable ICU 2.4
  1164. */
  1165. U_CAPI void U_EXPORT2
  1166. uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
  1167. /**
  1168. * Returns true if the given USerializedSet contains the given
  1169. * character.
  1170. * @param set the serialized set
  1171. * @param c The codepoint to check for within the set
  1172. * @return true if set contains c
  1173. * @stable ICU 2.4
  1174. */
  1175. U_CAPI UBool U_EXPORT2
  1176. uset_serializedContains(const USerializedSet* set, UChar32 c);
  1177. /**
  1178. * Returns the number of disjoint ranges of characters contained in
  1179. * the given serialized set. Ignores any strings contained in the
  1180. * set.
  1181. * @param set the serialized set
  1182. * @return a non-negative integer counting the character ranges
  1183. * contained in set
  1184. * @stable ICU 2.4
  1185. */
  1186. U_CAPI int32_t U_EXPORT2
  1187. uset_getSerializedRangeCount(const USerializedSet* set);
  1188. /**
  1189. * Returns a range of characters contained in the given serialized
  1190. * set.
  1191. * @param set the serialized set
  1192. * @param rangeIndex a non-negative integer in the range 0..
  1193. * uset_getSerializedRangeCount(set)-1
  1194. * @param pStart pointer to variable to receive first character
  1195. * in range, inclusive
  1196. * @param pEnd pointer to variable to receive last character in range,
  1197. * inclusive
  1198. * @return true if rangeIndex is valid, otherwise false
  1199. * @stable ICU 2.4
  1200. */
  1201. U_CAPI UBool U_EXPORT2
  1202. uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
  1203. UChar32* pStart, UChar32* pEnd);
  1204. #endif