uset.h 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uset.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar07
  16. * created by: Markus W. Scherer
  17. *
  18. * C version of UnicodeSet.
  19. */
  20. /**
  21. * \file
  22. * \brief C API: Unicode Set
  23. *
  24. * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
  25. */
  26. #ifndef __USET_H__
  27. #define __USET_H__
  28. #include "unicode/utypes.h"
  29. #include "unicode/uchar.h"
  30. #if U_SHOW_CPLUSPLUS_API
  31. #include <string_view>
  32. #include "unicode/char16ptr.h"
  33. #include "unicode/localpointer.h"
  34. #include "unicode/unistr.h"
  35. #endif // U_SHOW_CPLUSPLUS_API
  36. #ifndef USET_DEFINED
  37. #ifndef U_IN_DOXYGEN
  38. #define USET_DEFINED
  39. #endif // U_IN_DOXYGEN
  40. /**
  41. * USet is the C API type corresponding to C++ class UnicodeSet.
  42. * Use the uset_* API to manipulate it. Create with
  43. * uset_open*(), and destroy with uset_close().
  44. * @stable ICU 2.4
  45. */
  46. typedef struct USet USet;
  47. #endif // USET_DEFINED
  48. /**
  49. * Bitmask values to be passed to uset_openPatternOptions() or
  50. * uset_applyPattern() taking an option parameter.
  51. *
  52. * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  53. * These case options are mutually exclusive: USET_SIMPLE_CASE_INSENSITIVE (6) is the bitwise OR of the other two (2 and 4).
  54. *
  55. * Undefined option bits are ignored, and are reserved for future use.
  56. *
  57. * @stable ICU 2.4
  58. */
  59. enum {
  60. /**
  61. * Ignore white space within patterns unless quoted or escaped.
  62. * @stable ICU 2.4
  63. */
  64. USET_IGNORE_SPACE = 1,
  65. /**
  66. * Enable case insensitive matching. E.g., "[ab]" with this flag
  67. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  68. * match all except 'a', 'A', 'b', and 'B'. This performs a full
  69. * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
  70. *
  71. * The resulting set is a superset of the input for the code points but
  72. * not for the strings.
  73. * It performs a case mapping closure of the code points and adds
  74. * full case folding strings for the code points, and reduces strings of
  75. * the original set to their full case folding equivalents.
  76. *
  77. * This is designed for case-insensitive matches, for example
  78. * in regular expressions. The full code point case closure allows checking of
  79. * an input character directly against the closure set.
  80. * Strings are matched by comparing the case-folded form from the closure
  81. * set with an incremental case folding of the string in question.
  82. *
  83. * The closure set will also contain single code points if the original
  84. * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
  85. * This is not necessary (that is, redundant) for the above matching method
  86. * but results in the same closure sets regardless of whether the original
  87. * set contained the code point or a string.
  88. *
  89. * @stable ICU 2.4
  90. */
  91. USET_CASE_INSENSITIVE = 2,
  92. /**
  93. * Adds all case mappings for each element in the set.
  94. * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
  95. * of each existing element in the set.
  96. *
  97. * Unlike the “case insensitive” options, this does not perform a closure.
  98. * For example, it does not add 'ſ' (U+017F long s) for 's',
  99. * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
  100. *
  101. * @stable ICU 3.2
  102. */
  103. USET_ADD_CASE_MAPPINGS = 4,
  104. /**
  105. * Enable case insensitive matching.
  106. * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
  107. * which map each code point to one code point,
  108. * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
  109. *
  110. * This is designed for case-insensitive matches, for example in certain
  111. * regular expression implementations where only Simple_Case_Folding mappings are used,
  112. * such as in ECMAScript (JavaScript) regular expressions.
  113. *
  114. * @stable ICU 73
  115. */
  116. USET_SIMPLE_CASE_INSENSITIVE = 6
  117. };
  118. /**
  119. * Argument values for whether span() and similar functions continue while
  120. * the current character is contained vs. not contained in the set.
  121. *
  122. * The functionality is straightforward for sets with only single code points,
  123. * without strings (which is the common case):
  124. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
  125. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
  126. * - span() and spanBack() partition any string the same way when
  127. * alternating between span(USET_SPAN_NOT_CONTAINED) and
  128. * span(either "contained" condition).
  129. * - Using a complemented (inverted) set and the opposite span conditions
  130. * yields the same results.
  131. *
  132. * When a set contains multi-code point strings, then these statements may not
  133. * be true, depending on the strings in the set (for example, whether they
  134. * overlap with each other) and the string that is processed.
  135. * For a set with strings:
  136. * - The complement of the set contains the opposite set of code points,
  137. * but the same set of strings.
  138. * Therefore, complementing both the set and the span conditions
  139. * may yield different results.
  140. * - When starting spans at different positions in a string
  141. * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
  142. * because a set string may start before the later position.
  143. * - span(USET_SPAN_SIMPLE) may be shorter than
  144. * span(USET_SPAN_CONTAINED) because it will not recursively try
  145. * all possible paths.
  146. * For example, with a set which contains the three strings "xy", "xya" and "ax",
  147. * span("xyax", USET_SPAN_CONTAINED) will return 4 but
  148. * span("xyax", USET_SPAN_SIMPLE) will return 3.
  149. * span(USET_SPAN_SIMPLE) will never be longer than
  150. * span(USET_SPAN_CONTAINED).
  151. * - With either "contained" condition, span() and spanBack() may partition
  152. * a string in different ways.
  153. * For example, with a set which contains the two strings "ab" and "ba",
  154. * and when processing the string "aba",
  155. * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
  156. * while spanBack() will yield boundaries of { 0, 1, 3 }.
  157. *
  158. * Note: If it is important to get the same boundaries whether iterating forward
  159. * or backward through a string, then either only span() should be used and
  160. * the boundaries cached for backward operation, or an ICU BreakIterator
  161. * could be used.
  162. *
  163. * Note: Unpaired surrogates are treated like surrogate code points.
  164. * Similarly, set strings match only on code point boundaries,
  165. * never in the middle of a surrogate pair.
  166. * Illegal UTF-8 sequences are treated like U+FFFD.
  167. * When processing UTF-8 strings, malformed set strings
  168. * (strings with unpaired surrogates which cannot be converted to UTF-8)
  169. * are ignored.
  170. *
  171. * @stable ICU 3.8
  172. */
  173. typedef enum USetSpanCondition {
  174. /**
  175. * Continues a span() while there is no set element at the current position.
  176. * Increments by one code point at a time.
  177. * Stops before the first set element (character or string).
  178. * (For code points only, this is like while contains(current)==false).
  179. *
  180. * When span() returns, the substring between where it started and the position
  181. * it returned consists only of characters that are not in the set,
  182. * and none of its strings overlap with the span.
  183. *
  184. * @stable ICU 3.8
  185. */
  186. USET_SPAN_NOT_CONTAINED = 0,
  187. /**
  188. * Spans the longest substring that is a concatenation of set elements (characters or strings).
  189. * (For characters only, this is like while contains(current)==true).
  190. *
  191. * When span() returns, the substring between where it started and the position
  192. * it returned consists only of set elements (characters or strings) that are in the set.
  193. *
  194. * If a set contains strings, then the span will be the longest substring for which there
  195. * exists at least one non-overlapping concatenation of set elements (characters or strings).
  196. * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
  197. * (Java/ICU/Perl regex stops at the first match of an OR.)
  198. *
  199. * @stable ICU 3.8
  200. */
  201. USET_SPAN_CONTAINED = 1,
  202. /**
  203. * Continues a span() while there is a set element at the current position.
  204. * Increments by the longest matching element at each position.
  205. * (For characters only, this is like while contains(current)==true).
  206. *
  207. * When span() returns, the substring between where it started and the position
  208. * it returned consists only of set elements (characters or strings) that are in the set.
  209. *
  210. * If a set only contains single characters, then this is the same
  211. * as USET_SPAN_CONTAINED.
  212. *
  213. * If a set contains strings, then the span will be the longest substring
  214. * with a match at each position with the longest single set element (character or string).
  215. *
  216. * Use this span condition together with other longest-match algorithms,
  217. * such as ICU converters (ucnv_getUnicodeSet()).
  218. *
  219. * @stable ICU 3.8
  220. */
  221. USET_SPAN_SIMPLE = 2,
  222. #ifndef U_HIDE_DEPRECATED_API
  223. /**
  224. * One more than the last span condition (currently USET_SPAN_SIMPLE).
  225. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  226. */
  227. USET_SPAN_CONDITION_COUNT
  228. #endif // U_HIDE_DEPRECATED_API
  229. } USetSpanCondition;
  230. enum {
  231. /**
  232. * Capacity of USerializedSet::staticArray, counted in uint16_t units.
  233. * Enough for any single-code point set.
  234. * Also provides padding for nice sizeof(USerializedSet).
  235. * @stable ICU 2.4
  236. */
  237. USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
  238. };
  239. /**
  240. * A serialized form of a Unicode set. Limited manipulations are
  241. * possible directly on a serialized set. See below.
  242. * @stable ICU 2.4
  243. */
  244. typedef struct USerializedSet {
  245. /**
  246. * The serialized Unicode Set.
  247. * @stable ICU 2.4
  248. */
  249. const uint16_t *array;
  250. /**
  251. * The length of the array that contains BMP characters.
  252. * @stable ICU 2.4
  253. */
  254. int32_t bmpLength;
  255. /**
  256. * The total length of the array.
  257. * @stable ICU 2.4
  258. */
  259. int32_t length;
  260. /**
  261. * A small buffer (USET_SERIALIZED_STATIC_ARRAY_CAPACITY units) for the array to reduce memory allocations.
  262. * @stable ICU 2.4
  263. */
  264. uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
  265. } USerializedSet;
  266. /*********************************************************************
  267. * USet API
  268. *********************************************************************/
  269. /**
  270. * Create an empty USet object.
  271. * Equivalent to uset_open(1, 0), which yields an empty set because start > end.
  272. * @return a newly created USet. The caller must call uset_close() on
  273. * it when done.
  274. * @stable ICU 4.2
  275. */
  276. U_CAPI USet* U_EXPORT2
  277. uset_openEmpty(void);
  278. /**
  279. * Creates a USet object that contains the range of characters
  280. * start..end, inclusive. If <code>start > end</code>
  281. * then an empty set is created (same result as uset_openEmpty()).
  282. * @param start first character of the range, inclusive
  283. * @param end last character of the range, inclusive
  284. * @return a newly created USet. The caller must call uset_close() on
  285. * it when done.
  286. * @stable ICU 2.4
  287. */
  288. U_CAPI USet* U_EXPORT2
  289. uset_open(UChar32 start, UChar32 end);
  290. /**
  291. * Creates a set from the given pattern. See the UnicodeSet class
  292. * description for the syntax of the pattern language.
  293. * @param pattern a string specifying what characters are in the set
  294. * @param patternLength the length of the pattern, or -1 if the pattern is
  295. * NUL-terminated
  296. * @param ec the error code
  297. * @stable ICU 2.4
  298. */
  299. U_CAPI USet* U_EXPORT2
  300. uset_openPattern(const UChar* pattern, int32_t patternLength,
  301. UErrorCode* ec);
  302. /**
  303. * Creates a set from the given pattern. See the UnicodeSet class
  304. * description for the syntax of the pattern language.
  305. * @param pattern a string specifying what characters are in the set
  306. * @param patternLength the length of the pattern, or -1 if the pattern is
  307. * NUL-terminated
  308. * @param options bitmask for options to apply to the pattern.
  309. * Valid options are USET_IGNORE_SPACE and
  310. * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  311. * These case options are mutually exclusive.
  312. * @param ec the error code
  313. * @stable ICU 2.4
  314. */
  315. U_CAPI USet* U_EXPORT2
  316. uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
  317. uint32_t options,
  318. UErrorCode* ec);
  319. /**
  320. * Disposes of the storage used by a USet object. This function should
  321. * be called exactly once for objects returned by the uset_open*() functions.
  322. * @param set the object to dispose of
  323. * @stable ICU 2.4
  324. */
  325. U_CAPI void U_EXPORT2
  326. uset_close(USet* set);
  327. #if U_SHOW_CPLUSPLUS_API
  328. U_NAMESPACE_BEGIN
  329. /**
  330. * \class LocalUSetPointer
  331. * "Smart pointer" class, closes a USet via uset_close().
  332. * For most methods see the LocalPointerBase base class.
  333. *
  334. * @see LocalPointerBase
  335. * @see LocalPointer
  336. * @stable ICU 4.4
  337. */
  338. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
  339. U_NAMESPACE_END
  340. #endif // U_SHOW_CPLUSPLUS_API
  341. /**
  342. * Returns a copy of the given set.
  343. * If the set is frozen, then the clone will be frozen as well.
  344. * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
  345. * @param set the original set
  346. * @return the newly allocated copy of the set
  347. * @see uset_cloneAsThawed
  348. * @stable ICU 3.8
  349. */
  350. U_CAPI USet * U_EXPORT2
  351. uset_clone(const USet *set);
  352. /**
  353. * Determines whether the set has been frozen (made immutable) or not.
  354. * See the ICU4J Freezable interface for details.
  355. * @param set the set
  356. * @return true if the set has been frozen, false otherwise
  357. * @see uset_freeze
  358. * @see uset_cloneAsThawed
  359. * @stable ICU 3.8
  360. */
  361. U_CAPI UBool U_EXPORT2
  362. uset_isFrozen(const USet *set);
  363. /**
  364. * Freeze the set (make it immutable).
  365. * Once frozen, it cannot be unfrozen and is therefore thread-safe
  366. * until it is deleted.
  367. * See the ICU4J Freezable interface for details.
  368. * Freezing the set may also make some operations faster, for example
  369. * uset_contains() and uset_span().
  370. * A frozen set will not be modified. (It remains frozen.)
  371. * The set passed in is itself frozen; this function does not return a value.
  372. * @param set the set to freeze
  373. * @see uset_isFrozen
  374. * @see uset_cloneAsThawed
  375. * @stable ICU 3.8
  376. */
  377. U_CAPI void U_EXPORT2
  378. uset_freeze(USet *set);
  379. /**
  380. * Clone the set and make the clone mutable.
  381. * See the ICU4J Freezable interface for details.
  382. * @param set the set to clone (may be frozen)
  383. * @return the mutable clone
  384. * @see uset_freeze
  385. * @see uset_isFrozen
  386. * @see uset_clone
  387. * @stable ICU 3.8
  388. */
  389. U_CAPI USet * U_EXPORT2
  390. uset_cloneAsThawed(const USet *set);
  391. /**
  392. * Causes the USet object to represent the range <code>start..end</code>, inclusive.
  393. * If <code>start > end</code> then this USet is set to an empty range.
  394. * A frozen set will not be modified.
  395. * @param set the object to set to the given range
  396. * @param start first character in the set, inclusive
  397. * @param end last character in the set, inclusive
  398. * @stable ICU 3.2
  399. */
  400. U_CAPI void U_EXPORT2
  401. uset_set(USet* set,
  402. UChar32 start, UChar32 end);
  403. /**
  404. * Modifies the set to represent the set specified by the given
  405. * pattern. See the UnicodeSet class description for the syntax of
  406. * the pattern language. See also the User Guide chapter about UnicodeSet.
  407. * <em>Empties the set passed before applying the pattern.</em>
  408. * A frozen set will not be modified.
  409. * @param set The set to which the pattern is to be applied.
  410. * @param pattern A pointer to UChar string specifying what characters are in the set.
  411. * The character at pattern[0] must be a '['.
  412. * @param patternLength The length of the UChar string. -1 if NUL-terminated.
  413. * @param options A bitmask for options to apply to the pattern.
  414. * Valid options are USET_IGNORE_SPACE and
  415. * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
  416. * USET_SIMPLE_CASE_INSENSITIVE.
  417. * These case options are mutually exclusive.
  418. * @param status Returns an error if the pattern cannot be parsed.
  419. * @return Upon successful parse, the value is
  420. * the index of the character after the closing ']'
  421. * of the parsed pattern.
  422. * If the status code indicates failure, then the return value
  423. * is the index of the error in the source.
  424. *
  425. * @stable ICU 2.8
  426. */
  427. U_CAPI int32_t U_EXPORT2
  428. uset_applyPattern(USet *set,
  429. const UChar *pattern, int32_t patternLength,
  430. uint32_t options,
  431. UErrorCode *status);
  432. /**
  433. * Modifies the set to contain those code points which have the given value
  434. * for the given binary or enumerated property, as returned by
  435. * u_getIntPropertyValue(). Prior contents of this set are lost.
  436. * A frozen set will not be modified.
  437. *
  438. * @param set the object to contain the code points defined by the property
  439. *
  440. * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
  441. * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
  442. * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
  443. *
  444. * @param value a value in the range u_getIntPropertyMinValue(prop)..
  445. * u_getIntPropertyMaxValue(prop), with one exception. If prop is
  446. * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
  447. * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
  448. * categories such as [:L:] to be represented.
  449. *
  450. * @param ec error code input/output parameter
  451. *
  452. * @stable ICU 3.2
  453. */
  454. U_CAPI void U_EXPORT2
  455. uset_applyIntPropertyValue(USet* set,
  456. UProperty prop, int32_t value, UErrorCode* ec);
  457. /**
  458. * Modifies the set to contain those code points which have the
  459. * given value for the given property. Prior contents of this
  460. * set are lost.
  461. * A frozen set will not be modified.
  462. *
  463. * @param set the object to contain the code points defined by the given
  464. * property and value alias
  465. *
  466. * @param prop a string specifying a property alias, either short or long.
  467. * The name is matched loosely. See PropertyAliases.txt for names and a
  468. * description of loose matching. If the value string is empty, then this
  469. * string is interpreted as either a General_Category value alias, a Script
  470. * value alias, a binary property alias, or a special ID. Special IDs are
  471. * matched loosely and correspond to the following sets:
  472. *
  473. * "ANY" = [\\u0000-\\U0010FFFF],
  474. * "ASCII" = [\\u0000-\\u007F],
  475. * "Assigned" = [:^Cn:].
  476. *
  477. * @param propLength the length of the prop, or -1 if NUL-terminated
  478. *
  479. * @param value a string specifying a value alias, either short or long.
  480. * The name is matched loosely. See PropertyValueAliases.txt for names
  481. * and a description of loose matching. In addition to aliases listed,
  482. * numeric values and canonical combining classes may be expressed
  483. * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
  484. * may also be empty.
  485. *
  486. * @param valueLength the length of the value, or -1 if NUL-terminated
  487. *
  488. * @param ec error code input/output parameter
  489. *
  490. * @stable ICU 3.2
  491. */
  492. U_CAPI void U_EXPORT2
  493. uset_applyPropertyAlias(USet* set,
  494. const UChar *prop, int32_t propLength,
  495. const UChar *value, int32_t valueLength,
  496. UErrorCode* ec);
  497. /**
  498. * Return true if the given position, in the given pattern, appears
  499. * to be the start of a UnicodeSet pattern.
  500. *
  501. * @param pattern a string specifying the pattern
  502. * @param patternLength the length of the pattern, or -1 if NUL-terminated
  503. * @param pos the given position
  504. * @stable ICU 3.2
  505. */
  506. U_CAPI UBool U_EXPORT2
  507. uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
  508. int32_t pos);
  509. /**
  510. * Returns a string representation of this set. If the result of
  511. * calling this function is passed to uset_openPattern(), it
  512. * will produce another set that is equal to this one.
  513. * @param set the set
  514. * @param result the string to receive the rules, may be NULL
  515. * @param resultCapacity the capacity of result, may be 0 if result is NULL
  516. * @param escapeUnprintable if true then convert unprintable
  517. * characters to their hex escape representations, \\uxxxx or
  518. * \\Uxxxxxxxx. Unprintable characters are those other than
  519. * U+000A, U+0020..U+007E.
  520. * @param ec error code.
  521. * @return length of string, possibly larger than resultCapacity
  522. * @stable ICU 2.4
  523. */
  524. U_CAPI int32_t U_EXPORT2
  525. uset_toPattern(const USet* set,
  526. UChar* result, int32_t resultCapacity,
  527. UBool escapeUnprintable,
  528. UErrorCode* ec);
  529. /**
  530. * Adds the given character to the given USet. After this call (on a set that
  531. * is not frozen), uset_contains(set, c) will return true.
  532. * A frozen set will not be modified.
  533. * @param set the object to which to add the character
  534. * @param c the character to add
  535. * @stable ICU 2.4
  536. */
  537. U_CAPI void U_EXPORT2
  538. uset_add(USet* set, UChar32 c);
  539. /**
  540. * Adds all of the elements in the specified set to this set if
  541. * they're not already present. This operation effectively
  542. * modifies this set so that its value is the <i>union</i> of the two
  543. * sets. The behavior of this operation is unspecified if the specified
  544. * set is modified while the operation is in progress.
  545. * A frozen set will not be modified.
  546. *
  547. * @param set the object to which to add the set
  548. * @param additionalSet the source set whose elements are to be added to this set.
  549. * @stable ICU 2.6
  550. */
  551. U_CAPI void U_EXPORT2
  552. uset_addAll(USet* set, const USet *additionalSet);
  553. /**
  554. * Adds the given range of characters to the given USet. After this call,
  555. * uset_containsRange(set, start, end) will return true.
  556. * A frozen set will not be modified.
  557. * @param set the object to which to add the range
  558. * @param start the first character of the range to add, inclusive
  559. * @param end the last character of the range to add, inclusive
  560. * @stable ICU 2.2
  561. */
  562. U_CAPI void U_EXPORT2
  563. uset_addRange(USet* set, UChar32 start, UChar32 end);
  564. /**
  565. * Adds the given string to the given USet. After this call,
  566. * uset_containsString(set, str, strLen) will return true.
  567. * A frozen set will not be modified.
  568. * @param set the object to which to add the string
  569. * @param str the string to add
  570. * @param strLen the length of the string or -1 if NUL-terminated.
  571. * @stable ICU 2.4
  572. */
  573. U_CAPI void U_EXPORT2
  574. uset_addString(USet* set, const UChar* str, int32_t strLen);
  575. /**
  576. * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
  577. * If this set already contains any particular character, it has no effect on that character.
  578. * A frozen set will not be modified.
  579. * @param set the object to which to add the characters
  580. * @param str the source string
  581. * @param strLen the length of the string or -1 if NUL-terminated.
  582. * @stable ICU 3.4
  583. */
  584. U_CAPI void U_EXPORT2
  585. uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
  586. /**
  587. * Removes the given character from the given USet. After this call (on a set
  588. * that is not frozen), uset_contains(set, c) will return false.
  589. * A frozen set will not be modified.
  590. * @param set the object from which to remove the character
  591. * @param c the character to remove
  592. * @stable ICU 2.4
  593. */
  594. U_CAPI void U_EXPORT2
  595. uset_remove(USet* set, UChar32 c);
  596. /**
  597. * Removes the given range of characters from the given USet. After this call,
  598. * uset_contains(set, c) will return false for every c in start..end.
  599. * A frozen set will not be modified.
  600. * @param set the object from which to remove the range
  601. * @param start the first character of the range to remove, inclusive
  602. * @param end the last character of the range to remove, inclusive
  603. * @stable ICU 2.2
  604. */
  605. U_CAPI void U_EXPORT2
  606. uset_removeRange(USet* set, UChar32 start, UChar32 end);
  607. /**
  608. * Removes the given string from the given USet. After this call,
  609. * uset_containsString(set, str, strLen) will return false.
  610. * A frozen set will not be modified.
  611. * @param set the object from which to remove the string
  612. * @param str the string to remove
  613. * @param strLen the length of the string or -1 if null terminated.
  614. * @stable ICU 2.4
  615. */
  616. U_CAPI void U_EXPORT2
  617. uset_removeString(USet* set, const UChar* str, int32_t strLen);
  618. /**
  619. * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
  620. * A frozen set will not be modified.
  621. *
  622. * @param set the object to be modified
  623. * @param str the string
  624. * @param length the length of the string, or -1 if NUL-terminated
  625. * @stable ICU 69
  626. */
  627. U_CAPI void U_EXPORT2
  628. uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
  629. /**
  630. * Removes from this set all of its elements that are contained in the
  631. * specified set. This operation effectively modifies this
  632. * set so that its value is the <i>asymmetric set difference</i> of
  633. * the two sets.
  634. * A frozen set will not be modified.
  635. * @param set the object from which the elements are to be removed
  636. * @param removeSet the object that defines which elements will be
  637. * removed from this set
  638. * @stable ICU 3.2
  639. */
  640. U_CAPI void U_EXPORT2
  641. uset_removeAll(USet* set, const USet* removeSet);
  642. /**
  643. * Retain only the elements in this set that are contained in the
  644. * specified range. If <code>start > end</code> then an empty range is
  645. * retained, leaving the set empty. This is equivalent to
  646. * a boolean logic AND, or a set INTERSECTION.
  647. * A frozen set will not be modified.
  648. *
  649. * @param set the object for which to retain only the specified range
  650. * @param start first character, inclusive, of range
  651. * @param end last character, inclusive, of range
  652. * @stable ICU 3.2
  653. */
  654. U_CAPI void U_EXPORT2
  655. uset_retain(USet* set, UChar32 start, UChar32 end);
  656. /**
  657. * Retains only the specified string from this set if it is present.
  658. * Upon return this set will be empty if it did not contain s, or
  659. * will only contain s if it did contain s.
  660. * A frozen set will not be modified.
  661. *
  662. * @param set the object to be modified
  663. * @param str the string
  664. * @param length the length of the string, or -1 if NUL-terminated
  665. * @stable ICU 69
  666. */
  667. U_CAPI void U_EXPORT2
  668. uset_retainString(USet *set, const UChar *str, int32_t length);
  669. /**
  670. * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
  671. * A frozen set will not be modified.
  672. *
  673. * @param set the object to be modified
  674. * @param str the string
  675. * @param length the length of the string, or -1 if NUL-terminated
  676. * @stable ICU 69
  677. */
  678. U_CAPI void U_EXPORT2
  679. uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
  680. /**
  681. * Retains only the elements in this set that are contained in the
  682. * specified set. In other words, removes from this set all of
  683. * its elements that are not contained in the specified set. This
  684. * operation effectively modifies this set so that its value is
  685. * the <i>intersection</i> of the two sets.
  686. * A frozen set will not be modified.
  687. *
  688. * @param set the object on which to perform the retain
  689. * @param retain set that defines which elements this set will retain
  690. * @stable ICU 3.2
  691. */
  692. U_CAPI void U_EXPORT2
  693. uset_retainAll(USet* set, const USet* retain);
  694. /**
  695. * Reallocate this object's internal structures to take up the least
  696. * possible space, without changing this object's value.
  697. * A frozen set will not be modified.
  698. *
  699. * @param set the object on which to perform the compact
  700. * @stable ICU 3.2
  701. */
  702. U_CAPI void U_EXPORT2
  703. uset_compact(USet* set);
  704. /**
  705. * This is equivalent to
  706. * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
  707. *
  708. * <strong>Note:</strong> This performs a symmetric difference with all code points
  709. * <em>and thus retains all multicharacter strings</em>.
  710. * In order to achieve a “code point complement” (all code points minus this set),
  711. * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>.
  712. *
  713. * A frozen set will not be modified.
  714. * @param set the set
  715. * @stable ICU 2.4
  716. */
  717. U_CAPI void U_EXPORT2
  718. uset_complement(USet* set);
  719. /**
  720. * Complements the specified range in this set. Any character in
  721. * the range will be removed if it is in this set, or will be
  722. * added if it is not in this set. If <code>start > end</code>
  723. * then an empty range is complemented, leaving the set unchanged.
  724. * This is equivalent to a boolean logic XOR.
  725. * A frozen set will not be modified.
  726. *
  727. * @param set the object to be modified
  728. * @param start first character, inclusive, of range
  729. * @param end last character, inclusive, of range
  730. * @stable ICU 69
  731. */
  732. U_CAPI void U_EXPORT2
  733. uset_complementRange(USet *set, UChar32 start, UChar32 end);
  734. /**
  735. * Complements the specified string in this set.
  736. * The string will be removed if it is in this set, or will be added if it is not in this set.
  737. * A frozen set will not be modified.
  738. *
  739. * @param set the object to be modified
  740. * @param str the string
  741. * @param length the length of the string, or -1 if NUL-terminated
  742. * @stable ICU 69
  743. */
  744. U_CAPI void U_EXPORT2
  745. uset_complementString(USet *set, const UChar *str, int32_t length);
  746. /**
  747. * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
  748. * A frozen set will not be modified.
  749. *
  750. * @param set the object to be modified
  751. * @param str the string
  752. * @param length the length of the string, or -1 if NUL-terminated
  753. * @stable ICU 69
  754. */
  755. U_CAPI void U_EXPORT2
  756. uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
  757. /**
  758. * Complements in this set all elements contained in the specified
  759. * set. Any character in the other set will be removed if it is
  760. * in this set, or will be added if it is not in this set.
  761. * A frozen set will not be modified.
  762. *
  763. * @param set the set with which to complement
  764. * @param complement set that defines which elements will be xor'ed
  765. * with this set.
  766. * @stable ICU 3.2
  767. */
  768. U_CAPI void U_EXPORT2
  769. uset_complementAll(USet* set, const USet* complement);
  770. /**
  771. * Removes all of the elements from this set. This set will be
  772. * empty after this call returns.
  773. * A frozen set will not be modified.
  774. * @param set the set
  775. * @stable ICU 2.4
  776. */
  777. U_CAPI void U_EXPORT2
  778. uset_clear(USet* set);
  779. /**
  780. * Close this set over the given attribute. For the attribute
  781. * USET_CASE_INSENSITIVE, the result is to modify this set so that:
  782. *
  783. * 1. For each character or string 'a' in this set, all strings or
  784. * characters 'b' such that foldCase(a) == foldCase(b) are added
  785. * to this set.
  786. *
  787. * 2. For each string 'e' in the resulting set, if e !=
  788. * foldCase(e), 'e' will be removed.
  789. *
  790. * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  791. *
  792. * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  793. * == b denotes that the contents are the same, not pointer
  794. * comparison.)
  795. *
  796. * A frozen set will not be modified.
  797. *
  798. * @param set the set
  799. *
  800. * @param attributes bitmask for attributes to close over.
  801. * Valid options:
  802. * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
  803. * These case options are mutually exclusive.
  804. * Unrelated options bits are ignored.
  805. * @stable ICU 4.2
  806. */
  807. U_CAPI void U_EXPORT2
  808. uset_closeOver(USet* set, int32_t attributes);
  809. /**
  810. * Remove all strings from this set.
  811. *
  812. * @param set the set
  813. * @stable ICU 4.2
  814. */
  815. U_CAPI void U_EXPORT2
  816. uset_removeAllStrings(USet* set);
  817. /**
  818. * Returns true if the given USet contains no characters and no
  819. * strings.
  820. * @param set the set
  821. * @return true if set is empty
  822. * @stable ICU 2.4
  823. */
  824. U_CAPI UBool U_EXPORT2
  825. uset_isEmpty(const USet* set);
  826. /**
  827. * @param set the set
  828. * @return true if this set contains multi-character strings or the empty string.
  829. * @stable ICU 70
  830. */
  831. U_CAPI UBool U_EXPORT2
  832. uset_hasStrings(const USet *set);
  833. /**
  834. * Returns true if the given USet contains the given character.
  835. * This function works faster with a frozen set.
  836. * @param set the set
  837. * @param c The codepoint to check for within the set
  838. * @return true if set contains c
  839. * @stable ICU 2.4
  840. */
  841. U_CAPI UBool U_EXPORT2
  842. uset_contains(const USet* set, UChar32 c);
  843. /**
  844. * Returns true if the given USet contains all characters c
  845. * where start <= c && c <= end.
  846. * @param set the set
  847. * @param start the first character of the range to test, inclusive
  848. * @param end the last character of the range to test, inclusive
  849. * @return true if set contains the range
  850. * @stable ICU 2.2
  851. */
  852. U_CAPI UBool U_EXPORT2
  853. uset_containsRange(const USet* set, UChar32 start, UChar32 end);
  854. /**
  855. * Returns true if the given USet contains the given string.
  856. * @param set the set
  857. * @param str the string
  858. * @param strLen the length of the string or -1 if null terminated.
  859. * @return true if set contains str
  860. * @stable ICU 2.4
  861. */
  862. U_CAPI UBool U_EXPORT2
  863. uset_containsString(const USet* set, const UChar* str, int32_t strLen);
  864. /**
  865. * Returns the index of the given character within this set, where
  866. * the set is ordered by ascending code point. If the character
  867. * is not in this set, return -1. The inverse of this method is
  868. * <code>charAt()</code>.
  869. * @param set the set
  870. * @param c the character to obtain the index for
  871. * @return an index from 0..size()-1, or -1
  872. * @stable ICU 3.2
  873. */
  874. U_CAPI int32_t U_EXPORT2
  875. uset_indexOf(const USet* set, UChar32 c);
  876. /**
  877. * Returns the character at the given index within this set, where
  878. * the set is ordered by ascending code point. If the index is
  879. * out of range for characters, returns (UChar32)-1.
  880. * The inverse of this method is <code>indexOf()</code>.
  881. *
  882. * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
  883. * with uset_getItem(), because for each call it skips linearly over <code>index</code>
  884. * characters in the ranges.
  885. *
  886. * @param set the set
  887. * @param charIndex an index from 0..size()-1 to obtain the char for
  888. * @return the character at the given index, or (UChar32)-1.
  889. * @stable ICU 3.2
  890. */
  891. U_CAPI UChar32 U_EXPORT2
  892. uset_charAt(const USet* set, int32_t charIndex);
  893. /**
  894. * Returns the number of characters and strings contained in this set.
  895. * The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings.
  896. *
  897. * This is slower than uset_getRangeCount() and uset_getItemCount() because
  898. * it counts the code points of all ranges.
  899. *
  900. * @param set the set
  901. * @return a non-negative integer counting the characters and strings
  902. * contained in set
  903. * @stable ICU 2.4
  904. * @see uset_getRangeCount
  905. * @see uset_getStringCount
  906. * @see uset_getItemCount
  907. */
  908. U_CAPI int32_t U_EXPORT2
  909. uset_size(const USet* set);
  910. /**
  911. * @param set the set
  912. * @return the number of ranges in this set.
  913. * @stable ICU 70
  914. * @see uset_getItemCount
  915. * @see uset_getItem
  916. * @see uset_getStringCount
  917. * @see uset_size
  918. */
  919. U_CAPI int32_t U_EXPORT2
  920. uset_getRangeCount(const USet *set);
  921. #ifndef U_HIDE_DRAFT_API
  922. /**
  923. * @param set the set
  924. * @return the number of strings in this set.
  925. * @draft ICU 76
  926. * @see uset_getRangeCount
  927. * @see uset_getItemCount
  928. * @see uset_size
  929. */
  930. U_CAPI int32_t U_EXPORT2
  931. uset_getStringCount(const USet *set);
  932. /**
  933. * Returns the index-th string (empty or multi-character) in the set.
  934. * The string may not be NUL-terminated.
  935. * The output length must be used, and the caller must not read more than that many UChars.
  936. *
  937. * @param set the set
  938. * @param index the string index, 0 .. uset_getStringCount() - 1
  939. * @param pLength the output string length; must not be NULL
  940. * @return the pointer to the string; NULL if the index is out of range or pLength is NULL
  941. * @draft ICU 76
  942. * @see uset_getStringCount
  943. */
  944. U_CAPI const UChar* U_EXPORT2
  945. uset_getString(const USet *set, int32_t index, int32_t *pLength);
  946. #endif // U_HIDE_DRAFT_API
  947. /**
  948. * Returns the number of items in this set. An item is either a range
  949. * of characters or a single multicharacter string.
  950. * @param set the set
  951. * @return a non-negative integer counting the character ranges
  952. * and/or strings contained in set
  953. * @stable ICU 2.4
  954. * @see uset_getRangeCount
  955. * @see uset_getStringCount
  956. */
  957. U_CAPI int32_t U_EXPORT2
  958. uset_getItemCount(const USet* set);
  959. /**
  960. * Returns an item of this set. An item is either a range of
  961. * characters or a single multicharacter string (which can be the empty string).
  962. *
  963. * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
  964. * and the range is <code>*start</code>..<code>*end</code>.
  965. *
  966. * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
  967. * this function copies the string into <code>str[strCapacity]</code> and
  968. * returns the length of the string (0 for the empty string).
  969. * See uset_getString() for a function that does not copy the string contents.
  970. *
  971. * If <code>itemIndex</code> is out of range, then this function returns -1.
  972. *
  973. * Note that 0 is returned for each range as well as for the empty string.
  974. *
  975. * @param set the set
  976. * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
  977. * @param start pointer to variable to receive first character in range, inclusive;
  978. * can be NULL for a string item
  979. * @param end pointer to variable to receive last character in range, inclusive;
  980. * can be NULL for a string item
  981. * @param str buffer to receive the string, may be NULL
  982. * @param strCapacity capacity of str, or 0 if str is NULL
  983. * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
  984. * @return the length of the string (0 or >= 2), or 0 if the item is a range,
  985. * or -1 if the itemIndex is out of range
  986. * @stable ICU 2.4
  987. * @see uset_getString
  988. */
  989. U_CAPI int32_t U_EXPORT2
  990. uset_getItem(const USet* set, int32_t itemIndex,
  991. UChar32* start, UChar32* end,
  992. UChar* str, int32_t strCapacity,
  993. UErrorCode* ec);
  994. /**
  995. * Returns true if set1 contains all the characters and strings
  996. * of set2. It answers the question, 'Is set1 a superset of set2?'
  997. * @param set1 set to be checked for containment
  998. * @param set2 set to be checked for containment
  999. * @return true if the test condition is met
  1000. * @stable ICU 3.2
  1001. */
  1002. U_CAPI UBool U_EXPORT2
  1003. uset_containsAll(const USet* set1, const USet* set2);
  1004. /**
  1005. * Returns true if this set contains all the characters
  1006. * of the given string. This does not check containment of grapheme
  1007. * clusters, as uset_containsString does.
  1008. * @param set set of characters to be checked for containment
  1009. * @param str string containing codepoints to be checked for containment
  1010. * @param strLen the length of the string or -1 if null terminated.
  1011. * @return true if the test condition is met
  1012. * @stable ICU 3.4
  1013. */
  1014. U_CAPI UBool U_EXPORT2
  1015. uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
  1016. /**
  1017. * Returns true if set1 contains none of the characters and strings
  1018. * of set2. It answers the question, 'Are set1 and set2 disjoint?'
  1019. * @param set1 set to be checked for containment
  1020. * @param set2 set to be checked for containment
  1021. * @return true if the test condition is met
  1022. * @stable ICU 3.2
  1023. */
  1024. U_CAPI UBool U_EXPORT2
  1025. uset_containsNone(const USet* set1, const USet* set2);
  1026. /**
  1027. * Returns true if set1 contains some of the characters and strings
  1028. * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
  1029. * @param set1 set to be checked for containment
  1030. * @param set2 set to be checked for containment
  1031. * @return true if the test condition is met
  1032. * @stable ICU 3.2
  1033. */
  1034. U_CAPI UBool U_EXPORT2
  1035. uset_containsSome(const USet* set1, const USet* set2);
  1036. /**
  1037. * Returns the length of the initial substring of the input string which
  1038. * consists only of characters and strings that are contained in this set
  1039. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1040. * or only of characters and strings that are not contained
  1041. * in this set (USET_SPAN_NOT_CONTAINED).
  1042. * See USetSpanCondition for details.
  1043. * Similar to the strspn() C library function.
  1044. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  1045. * This function works faster with a frozen set and with a non-negative string length argument.
  1046. * @param set the set
  1047. * @param s start of the string
  1048. * @param length of the string; can be -1 for NUL-terminated
  1049. * @param spanCondition specifies the containment condition
  1050. * @return the length of the initial substring according to the spanCondition;
  1051. * 0 if the start of the string does not fit the spanCondition
  1052. * @stable ICU 3.8
  1053. * @see USetSpanCondition
  1054. */
  1055. U_CAPI int32_t U_EXPORT2
  1056. uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  1057. /**
  1058. * Returns the start of the trailing substring of the input string which
  1059. * consists only of characters and strings that are contained in this set
  1060. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1061. * or only of characters and strings that are not contained
  1062. * in this set (USET_SPAN_NOT_CONTAINED).
  1063. * See USetSpanCondition for details.
  1064. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  1065. * This function works faster with a frozen set and with a non-negative string length argument.
  1066. * @param set the set
  1067. * @param s start of the string
  1068. * @param length of the string; can be -1 for NUL-terminated
  1069. * @param spanCondition specifies the containment condition
  1070. * @return the start of the trailing substring according to the spanCondition;
  1071. * the string length if the end of the string does not fit the spanCondition
  1072. * @stable ICU 3.8
  1073. * @see USetSpanCondition
  1074. */
  1075. U_CAPI int32_t U_EXPORT2
  1076. uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  1077. /**
  1078. * Returns the length of the initial substring of the input string which
  1079. * consists only of characters and strings that are contained in this set
  1080. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1081. * or only of characters and strings that are not contained
  1082. * in this set (USET_SPAN_NOT_CONTAINED).
  1083. * See USetSpanCondition for details.
  1084. * Similar to the strspn() C library function.
  1085. * Malformed byte sequences are treated according to contains(0xfffd).
  1086. * This function works faster with a frozen set and with a non-negative string length argument.
  1087. * @param set the set
  1088. * @param s start of the string (UTF-8)
  1089. * @param length of the string; can be -1 for NUL-terminated
  1090. * @param spanCondition specifies the containment condition
  1091. * @return the length of the initial substring according to the spanCondition;
  1092. * 0 if the start of the string does not fit the spanCondition
  1093. * @stable ICU 3.8
  1094. * @see USetSpanCondition
  1095. */
  1096. U_CAPI int32_t U_EXPORT2
  1097. uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  1098. /**
  1099. * Returns the start of the trailing substring of the input string which
  1100. * consists only of characters and strings that are contained in this set
  1101. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  1102. * or only of characters and strings that are not contained
  1103. * in this set (USET_SPAN_NOT_CONTAINED).
  1104. * See USetSpanCondition for details.
  1105. * Malformed byte sequences are treated according to contains(0xfffd).
  1106. * This function works faster with a frozen set and with a non-negative string length argument.
  1107. * @param set the set
  1108. * @param s start of the string (UTF-8)
  1109. * @param length of the string; can be -1 for NUL-terminated
  1110. * @param spanCondition specifies the containment condition
  1111. * @return the start of the trailing substring according to the spanCondition;
  1112. * the string length if the end of the string does not fit the spanCondition
  1113. * @stable ICU 3.8
  1114. * @see USetSpanCondition
  1115. */
  1116. U_CAPI int32_t U_EXPORT2
  1117. uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  1118. /**
  1119. * Returns true if set1 contains all of the characters and strings
  1120. * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
  1121. * @param set1 set to be checked for containment
  1122. * @param set2 set to be checked for containment
  1123. * @return true if the test condition is met
  1124. * @stable ICU 3.2
  1125. */
  1126. U_CAPI UBool U_EXPORT2
  1127. uset_equals(const USet* set1, const USet* set2);
  1128. /*********************************************************************
  1129. * Serialized set API
  1130. *********************************************************************/
  1131. /**
  1132. * Serializes this set into an array of 16-bit integers. Serialization
  1133. * (currently) only records the characters in the set; multicharacter
  1134. * strings are ignored.
  1135. *
  1136. * The array
  1137. * has the following format (each line is one 16-bit integer):
  1138. *
  1139. * length = (n+2*m) | (m!=0?0x8000:0)
  1140. * bmpLength = n; present if m!=0
  1141. * bmp[0]
  1142. * bmp[1]
  1143. * ...
  1144. * bmp[n-1]
  1145. * supp-high[0]
  1146. * supp-low[0]
  1147. * supp-high[1]
  1148. * supp-low[1]
  1149. * ...
  1150. * supp-high[m-1]
  1151. * supp-low[m-1]
  1152. *
  1153. * The array starts with a header. After the header are n bmp
  1154. * code points, then m supplementary code points. Either n or m
  1155. * or both may be zero. n+2*m is always <= 0x7FFF.
  1156. *
  1157. * If there are no supplementary characters (if m==0) then the
  1158. * header is one 16-bit integer, 'length', with value n.
  1159. *
  1160. * If there are supplementary characters (if m!=0) then the header
  1161. * is two 16-bit integers. The first, 'length', has value
  1162. * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
  1163. *
  1164. * After the header the code points are stored in ascending order.
  1165. * Supplementary code points are stored as most significant 16
  1166. * bits followed by least significant 16 bits.
  1167. *
  1168. * @param set the set
  1169. * @param dest pointer to buffer of destCapacity 16-bit integers.
  1170. * May be NULL only if destCapacity is zero.
  1171. * @param destCapacity size of dest, or zero. Must not be negative.
  1172. * @param pErrorCode pointer to the error code. Will be set to
  1173. * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
  1174. * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
  1175. * @return the total length of the serialized format, including
  1176. * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  1177. * than U_BUFFER_OVERFLOW_ERROR.
  1178. * @stable ICU 2.4
  1179. */
  1180. U_CAPI int32_t U_EXPORT2
  1181. uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
  1182. /**
  1183. * Given a serialized array, fill in the given serialized set object.
  1184. * @param fillSet pointer to result
  1185. * @param src pointer to start of array
  1186. * @param srcLength length of array
  1187. * @return true if the given array is valid, otherwise false
  1188. * @stable ICU 2.4
  1189. */
  1190. U_CAPI UBool U_EXPORT2
  1191. uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
  1192. /**
  1193. * Set the USerializedSet to contain the given character (and nothing
  1194. * else).
  1195. * @param fillSet pointer to result
  1196. * @param c The codepoint to set
  1197. * @stable ICU 2.4
  1198. */
  1199. U_CAPI void U_EXPORT2
  1200. uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
  1201. /**
  1202. * Returns true if the given USerializedSet contains the given
  1203. * character.
  1204. * @param set the serialized set
  1205. * @param c The codepoint to check for within the set
  1206. * @return true if set contains c
  1207. * @stable ICU 2.4
  1208. */
  1209. U_CAPI UBool U_EXPORT2
  1210. uset_serializedContains(const USerializedSet* set, UChar32 c);
  1211. /**
  1212. * Returns the number of disjoint ranges of characters contained in
  1213. * the given serialized set. Ignores any strings contained in the
  1214. * set.
  1215. * @param set the serialized set
  1216. * @return a non-negative integer counting the character ranges
  1217. * contained in set
  1218. * @stable ICU 2.4
  1219. */
  1220. U_CAPI int32_t U_EXPORT2
  1221. uset_getSerializedRangeCount(const USerializedSet* set);
  1222. /**
  1223. * Returns a range of characters contained in the given serialized
  1224. * set.
  1225. * @param set the serialized set
  1226. * @param rangeIndex a non-negative integer in the range 0..
  1227. * uset_getSerializedRangeCount(set)-1
  1228. * @param pStart pointer to variable to receive first character
  1229. * in range, inclusive
  1230. * @param pEnd pointer to variable to receive last character in range,
  1231. * inclusive
  1232. * @return true if rangeIndex is valid, otherwise false
  1233. * @stable ICU 2.4
  1234. */
  1235. U_CAPI UBool U_EXPORT2
  1236. uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
  1237. UChar32* pStart, UChar32* pEnd);
  1238. #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
  1239. #ifndef U_HIDE_DRAFT_API
  1240. namespace U_HEADER_ONLY_NAMESPACE {
  1241. // Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class,
  1242. // not intended to be used via export from the ICU DLL.
  1243. /**
  1244. * Iterator returned by USetCodePoints: visits each code point of the set's ranges, loading one range at a time via uset_getItem().
  1245. * @draft ICU 76
  1246. */
  1247. class USetCodePointIterator {
  1248. public:
  1249. /** Copy constructor. @draft ICU 76 */
  1250. USetCodePointIterator(const USetCodePointIterator &other) = default;
  1251. /** Equal when both iterators refer to the same USet and the same current code point. @draft ICU 76 */
  1252. bool operator==(const USetCodePointIterator &other) const {
  1253. // No need to compare rangeCount & end given private constructor
  1254. // and assuming we don't compare iterators across the set being modified.
  1255. // And comparing rangeIndex is redundant with comparing c.
  1256. // We might even skip comparing uset.
  1257. // Unless we want operator==() to be "correct" for more than iteration.
  1258. return uset == other.uset && c == other.c;
  1259. }
  1260. /** Negation of operator==(). @draft ICU 76 */
  1261. bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); }
  1262. /** Returns the current code point; this is U_SENTINEL once iteration has reached the end. @draft ICU 76 */
  1263. UChar32 operator*() const { return c; }
  1264. /**
  1265. * Pre-increment: steps to the next code point of the current range, or loads the next range; at the end, c and end become U_SENTINEL.
  1266. * @draft ICU 76
  1267. */
  1268. USetCodePointIterator &operator++() {
  1269. if (c < end) { // more code points remain in the current range
  1270. ++c;
  1271. } else if (rangeIndex < rangeCount) { // current range used up: load the next one
  1272. UErrorCode errorCode = U_ZERO_ERROR;
  1273. int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode);
  1274. if (U_SUCCESS(errorCode) && result == 0) { // uset_getItem() returns 0 for a range item
  1275. ++rangeIndex;
  1276. } else { // could not fetch a range: go to the end state
  1277. c = end = U_SENTINEL;
  1278. }
  1279. } else { // no ranges left: end state
  1280. c = end = U_SENTINEL;
  1281. }
  1282. return *this;
  1283. }
  1284. /**
  1285. * Post-increment: advances this iterator and returns a copy of its previous state.
  1286. * @draft ICU 76
  1287. */
  1288. USetCodePointIterator operator++(int) {
  1289. USetCodePointIterator result(*this);
  1290. operator++();
  1291. return result;
  1292. }
  1293. private:
  1294. friend class USetCodePoints;
  1295. USetCodePointIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
  1296. : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount),
  1297. c(U_SENTINEL), end(U_SENTINEL) {
  1298. // Fetch the first range. With c == end, operator++() goes straight to loading range rangeIndex, or stays in the end state if rangeIndex is not less than rangeCount.
  1299. operator++();
  1300. }
  1301. const USet *uset; // the set being iterated
  1302. int32_t rangeIndex; // index of the next range to load with uset_getItem()
  1303. int32_t rangeCount; // number of ranges; rangeIndex stops here
  1304. UChar32 c, end; // current code point and inclusive end of the current range
  1305. };
  1306. /**
  1307. * C++ "range" for iterating over the code points of a USet; begin() and end() return USetCodePointIterator objects.
  1308. *
  1309. * \code
  1310. * using U_HEADER_NESTED_NAMESPACE::USetCodePoints;
  1311. * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
  1312. * for (UChar32 c : USetCodePoints(uset.getAlias())) {
  1313. * printf("uset.codePoint U+%04lx\n", (long)c);
  1314. * }
  1315. * \endcode
  1316. *
  1317. * C++ UnicodeSet has member functions for iteration, including codePoints().
  1318. *
  1319. * @draft ICU 76
  1320. * @see USetRanges
  1321. * @see USetStrings
  1322. * @see USetElements
  1323. */
  1324. class USetCodePoints {
  1325. public:
  1326. /**
  1327. * Constructs a C++ "range" object over the code points of the USet; the range count is read once, here, via uset_getRangeCount().
  1328. * @draft ICU 76
  1329. */
  1330. USetCodePoints(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
  1331. /** Copy constructor. @draft ICU 76 */
  1332. USetCodePoints(const USetCodePoints &other) = default;
  1333. /** Returns an iterator at the first code point of the first range; it equals end() when there are no ranges. @draft ICU 76 */
  1334. USetCodePointIterator begin() const {
  1335. return USetCodePointIterator(uset, 0, rangeCount);
  1336. }
  1337. /** Returns the past-the-end iterator, whose current code point is U_SENTINEL. @draft ICU 76 */
  1338. USetCodePointIterator end() const {
  1339. return USetCodePointIterator(uset, rangeCount, rangeCount);
  1340. }
  1341. private:
  1342. const USet *uset; // the set whose code points are iterated
  1343. int32_t rangeCount; // number of ranges, as returned by uset_getRangeCount() at construction
  1344. };
  1345. /**
  1346. * A contiguous range of code points in a USet/UnicodeSet.
  1347. * Returned by USetRangeIterator which is returned by USetRanges.
  1348. * Both the rangeStart and rangeEnd are in the range.
  1349. * (end() returns an iterator corresponding to rangeEnd+1.)
  1350. * @draft ICU 76
  1351. */
  1352. struct CodePointRange {
  1353. /** @draft ICU 76 */
  1354. struct iterator {
  1355. /** @draft ICU 76 */
  1356. iterator(UChar32 c) : c(c) {}
  1357. /** @draft ICU 76 */
  1358. bool operator==(const iterator &other) const { return c == other.c; }
  1359. /** @draft ICU 76 */
  1360. bool operator!=(const iterator &other) const { return !operator==(other); }
  1361. /** @draft ICU 76 */
  1362. UChar32 operator*() const { return c; }
  1363. /**
  1364. * Pre-increment.
  1365. * @draft ICU 76
  1366. */
  1367. iterator &operator++() {
  1368. ++c;
  1369. return *this;
  1370. }
  1371. /**
  1372. * Post-increment.
  1373. * @draft ICU 76
  1374. */
  1375. iterator operator++(int) {
  1376. return c++;
  1377. }
  1378. /**
  1379. * The current code point in the range.
  1380. * @draft ICU 76
  1381. */
  1382. UChar32 c;
  1383. };
  1384. /** @draft ICU 76 */
  1385. CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {}
  1386. /** @draft ICU 76 */
  1387. CodePointRange(const CodePointRange &other) = default;
  1388. /** @draft ICU 76 */
  1389. size_t size() const { return (rangeEnd + 1) - rangeStart; }
  1390. /** @draft ICU 76 */
  1391. iterator begin() const { return rangeStart; }
  1392. /** @draft ICU 76 */
  1393. iterator end() const { return rangeEnd + 1; }
  1394. /**
  1395. * Start of a USet/UnicodeSet range of code points.
  1396. * @draft ICU 76
  1397. */
  1398. UChar32 rangeStart;
  1399. /**
  1400. * Inclusive end of a USet/UnicodeSet range of code points.
  1401. * @draft ICU 76
  1402. */
  1403. UChar32 rangeEnd;
  1404. };
  1405. /**
  1406. * Iterator returned by USetRanges.
  1407. * @draft ICU 76
  1408. */
  1409. class USetRangeIterator {
  1410. public:
  1411. /** @draft ICU 76 */
  1412. USetRangeIterator(const USetRangeIterator &other) = default;
  1413. /** @draft ICU 76 */
  1414. bool operator==(const USetRangeIterator &other) const {
  1415. // No need to compare rangeCount given private constructor
  1416. // and assuming we don't compare iterators across the set being modified.
  1417. // We might even skip comparing uset.
  1418. // Unless we want operator==() to be "correct" for more than iteration.
  1419. return uset == other.uset && rangeIndex == other.rangeIndex;
  1420. }
  1421. /** @draft ICU 76 */
  1422. bool operator!=(const USetRangeIterator &other) const { return !operator==(other); }
  1423. /** @draft ICU 76 */
  1424. CodePointRange operator*() const {
  1425. if (rangeIndex < rangeCount) {
  1426. UChar32 start, end;
  1427. UErrorCode errorCode = U_ZERO_ERROR;
  1428. int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode);
  1429. if (U_SUCCESS(errorCode) && result == 0) {
  1430. return CodePointRange(start, end);
  1431. }
  1432. }
  1433. return CodePointRange(U_SENTINEL, U_SENTINEL);
  1434. }
  1435. /**
  1436. * Pre-increment.
  1437. * @draft ICU 76
  1438. */
  1439. USetRangeIterator &operator++() {
  1440. ++rangeIndex;
  1441. return *this;
  1442. }
  1443. /**
  1444. * Post-increment.
  1445. * @draft ICU 76
  1446. */
  1447. USetRangeIterator operator++(int) {
  1448. USetRangeIterator result(*this);
  1449. ++rangeIndex;
  1450. return result;
  1451. }
  1452. private:
  1453. friend class USetRanges;
  1454. USetRangeIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
  1455. : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount) {}
  1456. const USet *uset;
  1457. int32_t rangeIndex;
  1458. int32_t rangeCount;
  1459. };
  1460. /**
  1461. * C++ "range" for iterating over the code point ranges of a USet.
  1462. *
  1463. * \code
  1464. * using U_HEADER_NESTED_NAMESPACE::USetRanges;
  1465. * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
  1466. * for (auto [start, end] : USetRanges(uset.getAlias())) {
  1467. * printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end);
  1468. * }
  1469. * for (auto range : USetRanges(uset.getAlias())) {
  1470. * for (UChar32 c : range) {
  1471. * printf("uset.range.c U+%04lx\n", (long)c);
  1472. * }
  1473. * }
  1474. * \endcode
  1475. *
  1476. * C++ UnicodeSet has member functions for iteration, including ranges().
  1477. *
  1478. * @draft ICU 76
  1479. * @see USetCodePoints
  1480. * @see USetStrings
  1481. * @see USetElements
  1482. */
  1483. class USetRanges {
  1484. public:
  1485. /**
  1486. * Constructs a C++ "range" object over the code point ranges of the USet.
  1487. * @draft ICU 76
  1488. */
  1489. USetRanges(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
  1490. /** @draft ICU 76 */
  1491. USetRanges(const USetRanges &other) = default;
  1492. /** @draft ICU 76 */
  1493. USetRangeIterator begin() const {
  1494. return USetRangeIterator(uset, 0, rangeCount);
  1495. }
  1496. /** @draft ICU 76 */
  1497. USetRangeIterator end() const {
  1498. return USetRangeIterator(uset, rangeCount, rangeCount);
  1499. }
  1500. private:
  1501. const USet *uset;
  1502. int32_t rangeCount;
  1503. };
  1504. /**
  1505. * Iterator returned by USetStrings.
  1506. * @draft ICU 76
  1507. */
  1508. class USetStringIterator {
  1509. public:
  1510. /** @draft ICU 76 */
  1511. USetStringIterator(const USetStringIterator &other) = default;
  1512. /** @draft ICU 76 */
  1513. bool operator==(const USetStringIterator &other) const {
  1514. // No need to compare count given private constructor
  1515. // and assuming we don't compare iterators across the set being modified.
  1516. // We might even skip comparing uset.
  1517. // Unless we want operator==() to be "correct" for more than iteration.
  1518. return uset == other.uset && index == other.index;
  1519. }
  1520. /** @draft ICU 76 */
  1521. bool operator!=(const USetStringIterator &other) const { return !operator==(other); }
  1522. /** @draft ICU 76 */
  1523. std::u16string_view operator*() const {
  1524. if (index < count) {
  1525. int32_t length;
  1526. const UChar *uchars = uset_getString(uset, index, &length);
  1527. // assert uchars != nullptr;
  1528. return {ConstChar16Ptr(uchars), static_cast<uint32_t>(length)};
  1529. }
  1530. return {};
  1531. }
  1532. /**
  1533. * Pre-increment.
  1534. * @draft ICU 76
  1535. */
  1536. USetStringIterator &operator++() {
  1537. ++index;
  1538. return *this;
  1539. }
  1540. /**
  1541. * Post-increment.
  1542. * @draft ICU 76
  1543. */
  1544. USetStringIterator operator++(int) {
  1545. USetStringIterator result(*this);
  1546. ++index;
  1547. return result;
  1548. }
  1549. private:
  1550. friend class USetStrings;
  1551. USetStringIterator(const USet *uset, int32_t index, int32_t count)
  1552. : uset(uset), index(index), count(count) {}
  1553. const USet *uset;
  1554. int32_t index;
  1555. int32_t count;
  1556. };
  1557. /**
  1558. * C++ "range" for iterating over the empty and multi-character strings of a USet.
  1559. *
  1560. * \code
  1561. * using U_HEADER_NESTED_NAMESPACE::USetStrings;
  1562. * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
  1563. * for (auto s : USetStrings(uset.getAlias())) {
  1564. * UnicodeString us(s);
  1565. * std::string u8;
  1566. * printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
  1567. * }
  1568. * \endcode
  1569. *
  1570. * C++ UnicodeSet has member functions for iteration, including strings().
  1571. *
  1572. * @draft ICU 76
  1573. * @see USetCodePoints
  1574. * @see USetRanges
  1575. * @see USetElements
  1576. */
  1577. class USetStrings {
  1578. public:
  1579. /**
  1580. * Constructs a C++ "range" object over the strings of the USet.
  1581. * @draft ICU 76
  1582. */
  1583. USetStrings(const USet *uset) : uset(uset), count(uset_getStringCount(uset)) {}
  1584. /** @draft ICU 76 */
  1585. USetStrings(const USetStrings &other) = default;
  1586. /** @draft ICU 76 */
  1587. USetStringIterator begin() const {
  1588. return USetStringIterator(uset, 0, count);
  1589. }
  1590. /** @draft ICU 76 */
  1591. USetStringIterator end() const {
  1592. return USetStringIterator(uset, count, count);
  1593. }
  1594. private:
  1595. const USet *uset;
  1596. int32_t count;
  1597. };
  1598. /**
  1599. * Iterator returned by USetElements.
  1600. * @draft ICU 76
  1601. */
  1602. class USetElementIterator {
  1603. public:
  1604. /** @draft ICU 76 */
  1605. USetElementIterator(const USetElementIterator &other) = default;
  1606. /** @draft ICU 76 */
  1607. bool operator==(const USetElementIterator &other) const {
  1608. // No need to compare rangeCount & end given private constructor
  1609. // and assuming we don't compare iterators across the set being modified.
  1610. // We might even skip comparing uset.
  1611. // Unless we want operator==() to be "correct" for more than iteration.
  1612. return uset == other.uset && c == other.c && index == other.index;
  1613. }
  1614. /** @draft ICU 76 */
  1615. bool operator!=(const USetElementIterator &other) const { return !operator==(other); }
  1616. /** @draft ICU 76 */
  1617. UnicodeString operator*() const {
  1618. if (c >= 0) {
  1619. return UnicodeString(c);
  1620. } else if (index < totalCount) {
  1621. int32_t length;
  1622. const UChar *uchars = uset_getString(uset, index - rangeCount, &length);
  1623. // assert uchars != nullptr;
  1624. return UnicodeString(uchars, length);
  1625. } else {
  1626. return UnicodeString();
  1627. }
  1628. }
  1629. /**
  1630. * Pre-increment.
  1631. * @draft ICU 76
  1632. */
  1633. USetElementIterator &operator++() {
  1634. if (c < end) {
  1635. ++c;
  1636. } else if (index < rangeCount) {
  1637. UErrorCode errorCode = U_ZERO_ERROR;
  1638. int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode);
  1639. if (U_SUCCESS(errorCode) && result == 0) {
  1640. ++index;
  1641. } else {
  1642. c = end = U_SENTINEL;
  1643. }
  1644. } else if (c >= 0) {
  1645. // assert index == rangeCount;
  1646. // Switch from the last range to the first string.
  1647. c = end = U_SENTINEL;
  1648. } else {
  1649. ++index;
  1650. }
  1651. return *this;
  1652. }
  1653. /**
  1654. * Post-increment.
  1655. * @draft ICU 76
  1656. */
  1657. USetElementIterator operator++(int) {
  1658. USetElementIterator result(*this);
  1659. operator++();
  1660. return result;
  1661. }
  1662. private:
  1663. friend class USetElements;
  1664. USetElementIterator(const USet *uset, int32_t index, int32_t rangeCount, int32_t totalCount)
  1665. : uset(uset), index(index), rangeCount(rangeCount), totalCount(totalCount),
  1666. c(U_SENTINEL), end(U_SENTINEL) {
  1667. if (index < rangeCount) {
  1668. // Fetch the first range.
  1669. operator++();
  1670. }
  1671. // Otherwise don't move beyond the (index - rangeCount)-th string.
  1672. }
  1673. const USet *uset;
  1674. int32_t index;
  1675. /** Number of UnicodeSet/USet code point ranges. */
  1676. int32_t rangeCount;
  1677. /**
  1678. * Number of code point ranges plus number of strings.
  1679. * index starts from 0, counts ranges while less than rangeCount,
  1680. * then counts strings while at least rangeCount and less than totalCount.
  1681. *
  1682. * Note that totalCount is the same as uset_getItemCount(), but usually
  1683. * smaller than the number of elements returned by this iterator
  1684. * because we return each code point of each range.
  1685. */
  1686. int32_t totalCount;
  1687. UChar32 c, end;
  1688. };
  1689. /**
  1690. * A C++ "range" for iterating over all of the elements of a USet.
   * Convenient all-in-one iteration, but creates a UnicodeString for each
  1692. * code point or string.
  1693. *
  1694. * Code points are returned first, then empty and multi-character strings.
  1695. *
  1696. * \code
  1697. * using U_HEADER_NESTED_NAMESPACE::USetElements;
  1698. * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
  1699. * for (auto el : USetElements(uset.getAlias())) {
  1700. * std::string u8;
  1701. * printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
  1702. * }
  1703. * \endcode
  1704. *
  1705. * C++ UnicodeSet has member functions for iteration, including begin() and end().
  1706. *
  1707. * @return an all-elements iterator.
  1708. * @draft ICU 76
  1709. * @see USetCodePoints
  1710. * @see USetRanges
  1711. * @see USetStrings
  1712. */
  1713. class USetElements {
  1714. public:
  1715. /**
  1716. * Constructs a C++ "range" object over all of the elements of the USet.
  1717. * @draft ICU 76
  1718. */
  1719. USetElements(const USet *uset)
  1720. : uset(uset), rangeCount(uset_getRangeCount(uset)),
  1721. stringCount(uset_getStringCount(uset)) {}
  1722. /** @draft ICU 76 */
  1723. USetElements(const USetElements &other) = default;
  1724. /** @draft ICU 76 */
  1725. USetElementIterator begin() const {
  1726. return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount);
  1727. }
  1728. /** @draft ICU 76 */
  1729. USetElementIterator end() const {
  1730. return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount);
  1731. }
  1732. private:
  1733. const USet *uset;
  1734. int32_t rangeCount, stringCount;
  1735. };
  1736. } // namespace U_HEADER_ONLY_NAMESPACE
  1737. #endif // U_HIDE_DRAFT_API
  1738. #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
  1739. #endif // __USET_H__