ustring.h 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1998-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. *
  9. * File ustring.h
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 12/07/98 bertrand Creation.
  15. ******************************************************************************
  16. */
  17. #ifndef USTRING_H
  18. #define USTRING_H
  19. #include "unicode/utypes.h"
  20. #include "unicode/putil.h"
  21. #include "unicode/uiter.h"
  22. /**
  23. * \def UBRK_TYPEDEF_UBREAK_ITERATOR
  24. * @internal
  25. */
  26. #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  27. # define UBRK_TYPEDEF_UBREAK_ITERATOR
  28. /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
  29. typedef struct UBreakIterator UBreakIterator;
  30. #endif
  31. /**
  32. * \file
  33. * \brief C API: Unicode string handling functions
  34. *
  35. * These C API functions provide general Unicode string handling.
  36. *
  37. * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
  38. * functions. (For example, they do not check for bad arguments like NULL string pointers.)
  39. * In some cases, only the thread-safe variant of such a function is implemented here
  40. * (see u_strtok_r()).
  41. *
  42. * Other functions provide more Unicode-specific functionality like locale-specific
  43. * upper/lower-casing and string comparison in code point order.
  44. *
  45. * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
  46. * UTF-16 encodes each Unicode code point with either one or two UChar code units.
  47. * (This is the default form of Unicode, and a forward-compatible extension of the original,
  48. * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
  49. * in 1996.)
  50. *
  51. * Some APIs accept a 32-bit UChar32 value for a single code point.
  52. *
  53. * ICU also handles 16-bit Unicode text with unpaired surrogates.
  54. * Such text is not well-formed UTF-16.
  55. * Code-point-related functions treat unpaired surrogates as surrogate code points,
  56. * i.e., as separate units.
  57. *
  58. * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
  59. * it is much more efficient even for random access because the code unit values
  60. * for single-unit characters vs. lead units vs. trail units are completely disjoint.
  61. * This means that it is easy to determine character (code point) boundaries from
  62. * random offsets in the string.
  63. *
  64. * Unicode (UTF-16) string processing is optimized for the single-unit case.
  65. * Although it is important to support supplementary characters
  66. * (which use pairs of lead/trail code units called "surrogates"),
  67. * their occurrence is rare. Almost all characters in modern use require only
  68. * a single UChar code unit (i.e., their code point values are <=0xffff).
  69. *
  70. * For more details see the User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings/).
  71. * For a discussion of the handling of unpaired surrogates see also
  72. * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
  73. */
  74. /**
  75. * \defgroup ustring_ustrlen String Length
  76. * \ingroup ustring_strlen
  77. */
  78. /*@{*/
  79. /**
  80. * Determine the length of an array of UChar.
  81. *
  82. * @param s The array of UChars, NUL (U+0000) terminated.
  83. * @return The number of UChars in <code>s</code>, not counting the terminator.
  84. * @stable ICU 2.0
  85. */
  86. U_CAPI int32_t U_EXPORT2
  87. u_strlen(const UChar *s);
  88. /*@}*/
  89. /**
  90. * Count Unicode code points in the length UChar code units of the string.
  91. * A code point may occupy either one or two UChar code units.
  92. * Counting code points involves reading all code units.
  93. *
  94. * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
  95. *
  96. * @param s The input string.
  97. * @param length The number of UChar code units to be checked, or -1 to count all
  98. * code points before the first NUL (U+0000).
  99. * @return The number of code points in the specified code units.
  100. * @stable ICU 2.0
  101. */
  102. U_CAPI int32_t U_EXPORT2
  103. u_countChar32(const UChar *s, int32_t length);
  104. /**
  105. * Check if the string contains more Unicode code points than a certain number.
  106. * This is more efficient than counting all code points in the entire string
  107. * and comparing that number with a threshold.
  108. * This function may not need to scan the string at all if the length is known
  109. * (not -1 for NUL-termination) and falls within a certain range, and
  110. * never needs to count more than 'number+1' code points.
  111. * Logically equivalent to (u_countChar32(s, length)>number).
  112. * A Unicode code point may occupy either one or two UChar code units.
  113. *
  114. * @param s The input string.
  115. * @param length The length of the string, or -1 if it is NUL-terminated.
  116. * @param number The number of code points in the string is compared against
  117. * the 'number' parameter.
  118. * @return Boolean value for whether the string contains more Unicode code points
  119. * than 'number'. Same as (u_countChar32(s, length)>number).
  120. * @stable ICU 2.4
  121. */
  122. U_CAPI UBool U_EXPORT2
  123. u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
  124. /**
  125. * Concatenate two ustrings. Appends a copy of <code>src</code>,
  126. * including the null terminator, to <code>dst</code>. The initial copied
  127. * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
  128. *
  129. * @param dst The destination string.
  130. * @param src The source string.
  131. * @return A pointer to <code>dst</code>.
  132. * @stable ICU 2.0
  133. */
  134. U_CAPI UChar* U_EXPORT2
  135. u_strcat(UChar *dst,
  136. const UChar *src);
  137. /**
  138. * Concatenate two ustrings.
  139. * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
  140. * Adds a terminating NUL.
  141. * If src is too long, then only <code>n-1</code> characters will be copied
  142. * before the terminating NUL.
  143. * If <code>n&lt;=0</code> then dst is not modified.
  144. *
  145. * @param dst The destination string.
  146. * @param src The source string (can be NULL/invalid if n<=0).
  147. * @param n The maximum number of characters to append; no-op if <=0.
  148. * @return A pointer to <code>dst</code>.
  149. * @stable ICU 2.0
  150. */
  151. U_CAPI UChar* U_EXPORT2
  152. u_strncat(UChar *dst,
  153. const UChar *src,
  154. int32_t n);
  155. /**
  156. * Find the first occurrence of a substring in a string.
  157. * The substring is found at code point boundaries.
  158. * That means that if the substring begins with
  159. * a trail surrogate or ends with a lead surrogate,
  160. * then it is found only if these surrogates stand alone in the text.
  161. * Otherwise, the substring edge units would be matched against
  162. * halves of surrogate pairs.
  163. *
  164. * @param s The string to search (NUL-terminated).
  165. * @param substring The substring to find (NUL-terminated).
  166. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  167. * or <code>s</code> itself if the <code>substring</code> is empty,
  168. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  169. * @stable ICU 2.0
  170. *
  171. * @see u_strrstr
  172. * @see u_strFindFirst
  173. * @see u_strFindLast
  174. */
  175. U_CAPI UChar * U_EXPORT2
  176. u_strstr(const UChar *s, const UChar *substring);
  177. /**
  178. * Find the first occurrence of a substring in a string.
  179. * The substring is found at code point boundaries.
  180. * That means that if the substring begins with
  181. * a trail surrogate or ends with a lead surrogate,
  182. * then it is found only if these surrogates stand alone in the text.
  183. * Otherwise, the substring edge units would be matched against
  184. * halves of surrogate pairs.
  185. *
  186. * @param s The string to search.
  187. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  188. * @param substring The substring to find (NUL-terminated).
  189. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  190. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  191. * or <code>s</code> itself if the <code>substring</code> is empty,
  192. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  193. * @stable ICU 2.4
  194. *
  195. * @see u_strstr
  196. * @see u_strFindLast
  197. */
  198. U_CAPI UChar * U_EXPORT2
  199. u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  200. /**
  201. * Find the first occurrence of a BMP code point in a string.
  202. * A surrogate code point is found only if its match in the text is not
  203. * part of a surrogate pair.
  204. * A NUL character is found at the string terminator.
  205. *
  206. * @param s The string to search (NUL-terminated).
  207. * @param c The BMP code point to find.
  208. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  209. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  210. * @stable ICU 2.0
  211. *
  212. * @see u_strchr32
  213. * @see u_memchr
  214. * @see u_strstr
  215. * @see u_strFindFirst
  216. */
  217. U_CAPI UChar * U_EXPORT2
  218. u_strchr(const UChar *s, UChar c);
  219. /**
  220. * Find the first occurrence of a code point in a string.
  221. * A surrogate code point is found only if its match in the text is not
  222. * part of a surrogate pair.
  223. * A NUL character is found at the string terminator.
  224. *
  225. * @param s The string to search (NUL-terminated).
  226. * @param c The code point to find.
  227. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  228. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  229. * @stable ICU 2.0
  230. *
  231. * @see u_strchr
  232. * @see u_memchr32
  233. * @see u_strstr
  234. * @see u_strFindFirst
  235. */
  236. U_CAPI UChar * U_EXPORT2
  237. u_strchr32(const UChar *s, UChar32 c);
  238. /**
  239. * Find the last occurrence of a substring in a string.
  240. * The substring is found at code point boundaries.
  241. * That means that if the substring begins with
  242. * a trail surrogate or ends with a lead surrogate,
  243. * then it is found only if these surrogates stand alone in the text.
  244. * Otherwise, the substring edge units would be matched against
  245. * halves of surrogate pairs.
  246. *
  247. * @param s The string to search (NUL-terminated).
  248. * @param substring The substring to find (NUL-terminated).
  249. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  250. * or <code>s</code> itself if the <code>substring</code> is empty,
  251. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  252. * @stable ICU 2.4
  253. *
  254. * @see u_strstr
  255. * @see u_strFindFirst
  256. * @see u_strFindLast
  257. */
  258. U_CAPI UChar * U_EXPORT2
  259. u_strrstr(const UChar *s, const UChar *substring);
  260. /**
  261. * Find the last occurrence of a substring in a string.
  262. * The substring is found at code point boundaries.
  263. * That means that if the substring begins with
  264. * a trail surrogate or ends with a lead surrogate,
  265. * then it is found only if these surrogates stand alone in the text.
  266. * Otherwise, the substring edge units would be matched against
  267. * halves of surrogate pairs.
  268. *
  269. * @param s The string to search.
  270. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  271. * @param substring The substring to find (NUL-terminated).
  272. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  273. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  274. * or <code>s</code> itself if the <code>substring</code> is empty,
  275. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  276. * @stable ICU 2.4
  277. *
  278. * @see u_strrstr
  279. * @see u_strFindFirst
  280. */
  281. U_CAPI UChar * U_EXPORT2
  282. u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  283. /**
  284. * Find the last occurrence of a BMP code point in a string.
  285. * A surrogate code point is found only if its match in the text is not
  286. * part of a surrogate pair.
  287. * A NUL character is found at the string terminator.
  288. *
  289. * @param s The string to search (NUL-terminated).
  290. * @param c The BMP code point to find.
  291. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  292. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  293. * @stable ICU 2.4
  294. *
  295. * @see u_strrchr32
  296. * @see u_memrchr
  297. * @see u_strrstr
  298. * @see u_strFindLast
  299. */
  300. U_CAPI UChar * U_EXPORT2
  301. u_strrchr(const UChar *s, UChar c);
  302. /**
  303. * Find the last occurrence of a code point in a string.
  304. * A surrogate code point is found only if its match in the text is not
  305. * part of a surrogate pair.
  306. * A NUL character is found at the string terminator.
  307. *
  308. * @param s The string to search (NUL-terminated).
  309. * @param c The code point to find.
  310. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  311. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  312. * @stable ICU 2.4
  313. *
  314. * @see u_strrchr
  315. * @see u_memrchr32
  316. * @see u_strrstr
  317. * @see u_strFindLast
  318. */
  319. U_CAPI UChar * U_EXPORT2
  320. u_strrchr32(const UChar *s, UChar32 c);
  321. /**
  322. * Locates the first occurrence in the string <code>string</code> of any of the characters
  323. * in the string <code>matchSet</code>.
  324. * Works just like C's strpbrk but with Unicode.
  325. *
  326. * @param string The string in which to search, NUL-terminated.
  327. * @param matchSet A NUL-terminated string defining a set of code points
  328. * for which to search in the text string.
  329. * @return A pointer to the character in <code>string</code> that matches one of the
  330. * characters in <code>matchSet</code>, or NULL if no such character is found.
  331. * @stable ICU 2.0
  332. */
  333. U_CAPI UChar * U_EXPORT2
  334. u_strpbrk(const UChar *string, const UChar *matchSet);
  335. /**
  336. * Returns the number of consecutive characters in <code>string</code>,
  337. * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
  338. * Works just like C's strcspn but with Unicode.
  339. *
  340. * @param string The string in which to search, NUL-terminated.
  341. * @param matchSet A NUL-terminated string defining a set of code points
  342. * for which to search in the text string.
  343. * @return The number of initial characters in <code>string</code> that do not
  344. * occur in <code>matchSet</code>.
  345. * @see u_strspn
  346. * @stable ICU 2.0
  347. */
  348. U_CAPI int32_t U_EXPORT2
  349. u_strcspn(const UChar *string, const UChar *matchSet);
  350. /**
  351. * Returns the number of consecutive characters in <code>string</code>,
  352. * beginning with the first, that occur somewhere in <code>matchSet</code>.
  353. * Works just like C's strspn but with Unicode.
  354. *
  355. * @param string The string in which to search, NUL-terminated.
  356. * @param matchSet A NUL-terminated string defining a set of code points
  357. * for which to search in the text string.
  358. * @return The number of initial characters in <code>string</code> that do
  359. * occur in <code>matchSet</code>.
  360. * @see u_strcspn
  361. * @stable ICU 2.0
  362. */
  363. U_CAPI int32_t U_EXPORT2
  364. u_strspn(const UChar *string, const UChar *matchSet);
  365. /**
  366. * The string tokenizer API allows an application to break a string into
  367. * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
  368. * original string) is maintained in the caller-supplied saveState. In the first call, the
  369. * argument src is a pointer to the string. In subsequent calls to
  370. * return successive tokens of that string, src must be specified as
  371. * NULL. The value saveState is set by this function to maintain the
  372. * function's position within the string, and on each subsequent call
  373. * you must give this argument the same variable. This function does
  374. * handle surrogate pairs. This function is similar to strtok_r() from
  375. * the POSIX Threads Extension (1003.1c-1995).
  376. *
  377. * @param src String containing token(s). This string will be modified.
  378. * After the first call to u_strtok_r(), this argument must
  379. * be NULL to get to the next token.
  380. * @param delim Set of delimiter characters (Unicode code points).
  381. * @param saveState The current pointer within the original string,
  382. * which is set by this function. The saveState
  383. * parameter should be the address of a local variable of type
  384. * UChar *. (i.e. define "UChar *myLocalSaveState" and use
  385. * &myLocalSaveState for this parameter).
  386. * @return A pointer to the next token found in src, or NULL
  387. * when there are no more tokens.
  388. * @stable ICU 2.0
  389. */
  390. U_CAPI UChar * U_EXPORT2
  391. u_strtok_r(UChar *src,
  392. const UChar *delim,
  393. UChar **saveState);
  394. /**
  395. * Compare two Unicode strings for bitwise equality (code unit order).
  396. *
  397. * @param s1 A string to compare.
  398. * @param s2 A string to compare.
  399. * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
  400. * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
  401. * value if <code>s1</code> is bitwise greater than <code>s2</code>.
  402. * @stable ICU 2.0
  403. */
  404. U_CAPI int32_t U_EXPORT2
  405. u_strcmp(const UChar *s1,
  406. const UChar *s2);
  407. /**
  408. * Compare two Unicode strings in code point order.
  409. * See u_strCompare for details.
  410. *
  411. * @param s1 A string to compare.
  412. * @param s2 A string to compare.
  413. * @return a negative/zero/positive integer corresponding to whether
  414. * the first string is less than/equal to/greater than the second one
  415. * in code point order
  416. * @stable ICU 2.0
  417. */
  418. U_CAPI int32_t U_EXPORT2
  419. u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
  420. /**
  421. * Compare two Unicode strings (binary order).
  422. *
  423. * The comparison can be done in code unit order or in code point order.
  424. * They differ only in UTF-16 when
  425. * comparing supplementary code points (U+10000..U+10ffff)
  426. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  427. * In code unit order, high BMP code points sort after supplementary code points
  428. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  429. *
  430. * This function works with strings of different explicitly specified lengths
  431. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  432. * NUL-terminated strings are possible with length arguments of -1.
  433. *
  434. * @param s1 First source string.
  435. * @param length1 Length of first source string, or -1 if NUL-terminated.
  436. *
  437. * @param s2 Second source string.
  438. * @param length2 Length of second source string, or -1 if NUL-terminated.
  439. *
  440. * @param codePointOrder Choose between code unit order (false)
  441. * and code point order (true).
  442. *
  443. * @return <0 or 0 or >0 as usual for string comparisons
  444. *
  445. * @stable ICU 2.2
  446. */
  447. U_CAPI int32_t U_EXPORT2
  448. u_strCompare(const UChar *s1, int32_t length1,
  449. const UChar *s2, int32_t length2,
  450. UBool codePointOrder);
  451. /**
  452. * Compare two Unicode strings (binary order)
  453. * as presented by UCharIterator objects.
  454. * Works otherwise just like u_strCompare().
  455. *
  456. * Both iterators are reset to their start positions.
  457. * When the function returns, it is undefined where the iterators
  458. * have stopped.
  459. *
  460. * @param iter1 First source string iterator.
  461. * @param iter2 Second source string iterator.
  462. * @param codePointOrder Choose between code unit order (false)
  463. * and code point order (true).
  464. *
  465. * @return <0 or 0 or >0 as usual for string comparisons
  466. *
  467. * @see u_strCompare
  468. *
  469. * @stable ICU 2.6
  470. */
  471. U_CAPI int32_t U_EXPORT2
  472. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
  473. /**
  474. * Compare two strings case-insensitively using full case folding.
  475. * This is equivalent to
  476. * u_strCompare(u_strFoldCase(s1, options),
  477. * u_strFoldCase(s2, options),
  478. * (options&U_COMPARE_CODE_POINT_ORDER)!=0).
  479. *
  480. * The comparison can be done in UTF-16 code unit order or in code point order.
  481. * They differ only when comparing supplementary code points (U+10000..U+10ffff)
  482. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  483. * In code unit order, high BMP code points sort after supplementary code points
  484. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  485. *
  486. * This function works with strings of different explicitly specified lengths
  487. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  488. * NUL-terminated strings are possible with length arguments of -1.
  489. *
  490. * @param s1 First source string.
  491. * @param length1 Length of first source string, or -1 if NUL-terminated.
  492. *
  493. * @param s2 Second source string.
  494. * @param length2 Length of second source string, or -1 if NUL-terminated.
  495. *
  496. * @param options A bit set of options:
  497. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  498. * Comparison in code unit order with default case folding.
  499. *
  500. * - U_COMPARE_CODE_POINT_ORDER
  501. * Set to choose code point order instead of code unit order
  502. * (see u_strCompare for details).
  503. *
  504. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  505. *
  506. * @param pErrorCode Must be a valid pointer to an error code value,
  507. * which must not indicate a failure before the function call.
  508. *
  509. * @return <0 or 0 or >0 as usual for string comparisons
  510. *
  511. * @stable ICU 2.2
  512. */
  513. U_CAPI int32_t U_EXPORT2
  514. u_strCaseCompare(const UChar *s1, int32_t length1,
  515. const UChar *s2, int32_t length2,
  516. uint32_t options,
  517. UErrorCode *pErrorCode);
  518. /**
  519. * Compare two ustrings for bitwise equality.
  520. * Compares at most <code>n</code> characters.
  521. *
  522. * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
  523. * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
  524. * @param n The maximum number of characters to compare; always returns 0 if n<=0.
  525. * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
  526. * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
  527. * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
  528. * @stable ICU 2.0
  529. */
  530. U_CAPI int32_t U_EXPORT2
  531. u_strncmp(const UChar *ucs1,
  532. const UChar *ucs2,
  533. int32_t n);
  534. /**
  535. * Compare two Unicode strings in code point order.
  536. * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
  537. * For details, see u_strCompare().
  538. *
  539. * @param s1 A string to compare.
  540. * @param s2 A string to compare.
  541. * @param n The maximum number of characters to compare.
  542. * @return a negative/zero/positive integer corresponding to whether
  543. * the first string is less than/equal to/greater than the second one
  544. * in code point order
  545. * @stable ICU 2.0
  546. */
  547. U_CAPI int32_t U_EXPORT2
  548. u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
  549. /**
  550. * Compare two strings case-insensitively using full case folding.
  551. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
  552. *
  553. * @param s1 A string to compare.
  554. * @param s2 A string to compare.
  555. * @param options A bit set of options:
  556. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  557. * Comparison in code unit order with default case folding.
  558. *
  559. * - U_COMPARE_CODE_POINT_ORDER
  560. * Set to choose code point order instead of code unit order
  561. * (see u_strCompare for details).
  562. *
  563. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  564. *
  565. * @return A negative, zero, or positive integer indicating the comparison result.
  566. * @stable ICU 2.0
  567. */
  568. U_CAPI int32_t U_EXPORT2
  569. u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
  570. /**
  571. * Compare two strings case-insensitively using full case folding.
  572. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
  573. * u_strFoldCase(s2, at most n, options)).
  574. *
  575. * @param s1 A string to compare.
  576. * @param s2 A string to compare.
  577. * @param n The maximum number of characters in each string to case-fold and then compare.
  578. * @param options A bit set of options:
  579. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  580. * Comparison in code unit order with default case folding.
  581. *
  582. * - U_COMPARE_CODE_POINT_ORDER
  583. * Set to choose code point order instead of code unit order
  584. * (see u_strCompare for details).
  585. *
  586. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  587. *
  588. * @return A negative, zero, or positive integer indicating the comparison result.
  589. * @stable ICU 2.0
  590. */
  591. U_CAPI int32_t U_EXPORT2
  592. u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
  593. /**
  594. * Compare two strings case-insensitively using full case folding.
  595. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options),
  596. * u_strFoldCase(s2, length, options)).
  597. *
  598. * @param s1 A string to compare.
  599. * @param s2 A string to compare.
  600. * @param length The number of characters in each string to case-fold and then compare.
  601. * @param options A bit set of options:
  602. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  603. * Comparison in code unit order with default case folding.
  604. *
  605. * - U_COMPARE_CODE_POINT_ORDER
  606. * Set to choose code point order instead of code unit order
  607. * (see u_strCompare for details).
  608. *
  609. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  610. *
  611. * @return A negative, zero, or positive integer indicating the comparison result.
  612. * @stable ICU 2.0
  613. */
  614. U_CAPI int32_t U_EXPORT2
  615. u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
  616. /**
  617. * Copy a ustring. Adds a null terminator.
  618. *
  619. * @param dst The destination string.
  620. * @param src The source string.
  621. * @return A pointer to <code>dst</code>.
  622. * @stable ICU 2.0
  623. */
  624. U_CAPI UChar* U_EXPORT2
  625. u_strcpy(UChar *dst,
  626. const UChar *src);
  627. /**
  628. * Copy a ustring.
  629. * Copies at most <code>n</code> characters. The result will be null terminated
  630. * if the length of <code>src</code> is less than <code>n</code>.
  631. *
  632. * @param dst The destination string.
  633. * @param src The source string (can be NULL/invalid if n<=0).
  634. * @param n The maximum number of characters to copy; no-op if <=0.
  635. * @return A pointer to <code>dst</code>.
  636. * @stable ICU 2.0
  637. */
  638. U_CAPI UChar* U_EXPORT2
  639. u_strncpy(UChar *dst,
  640. const UChar *src,
  641. int32_t n);
  642. #if !UCONFIG_NO_CONVERSION
  643. /**
  644. * Copy a byte string encoded in the default codepage to a ustring.
  645. * Adds a null terminator.
  646. * Performs a host byte to UChar conversion
  647. *
  648. * @param dst The destination string.
  649. * @param src The source string.
  650. * @return A pointer to <code>dst</code>.
  651. * @stable ICU 2.0
  652. */
  653. U_CAPI UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
  654. const char *src );
  655. /**
  656. * Copy a byte string encoded in the default codepage to a ustring.
  657. * Copies at most <code>n</code> characters. The result will be null terminated
  658. * if the length of <code>src</code> is less than <code>n</code>.
  659. * Performs a host byte to UChar conversion
  660. *
  661. * @param dst The destination string.
  662. * @param src The source string.
  663. * @param n The maximum number of characters to copy.
  664. * @return A pointer to <code>dst</code>.
  665. * @stable ICU 2.0
  666. */
  667. U_CAPI UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
  668. const char *src,
  669. int32_t n);
  670. /**
  671. * Copy ustring to a byte string encoded in the default codepage.
  672. * Adds a null terminator.
  673. * Performs a UChar to host byte conversion
  674. *
  675. * @param dst The destination string.
  676. * @param src The source string.
  677. * @return A pointer to <code>dst</code>.
  678. * @stable ICU 2.0
  679. */
  680. U_CAPI char* U_EXPORT2 u_austrcpy(char *dst,
  681. const UChar *src );
  682. /**
  683. * Copy ustring to a byte string encoded in the default codepage.
  684. * Copies at most <code>n</code> characters. The result will be null terminated
  685. * if the length of <code>src</code> is less than <code>n</code>.
  686. * Performs a UChar to host byte conversion
  687. *
  688. * @param dst The destination string.
  689. * @param src The source string.
  690. * @param n The maximum number of characters to copy.
  691. * @return A pointer to <code>dst</code>.
  692. * @stable ICU 2.0
  693. */
  694. U_CAPI char* U_EXPORT2 u_austrncpy(char *dst,
  695. const UChar *src,
  696. int32_t n );
  697. #endif
  698. /**
  699. * Synonym for memcpy(), but with UChars only.
  700. * @param dest The destination string
  701. * @param src The source string (can be NULL/invalid if count<=0)
  702. * @param count The number of characters to copy; no-op if <=0
  703. * @return A pointer to <code>dest</code>
  704. * @stable ICU 2.0
  705. */
  706. U_CAPI UChar* U_EXPORT2
  707. u_memcpy(UChar *dest, const UChar *src, int32_t count);
  708. /**
  709. * Synonym for memmove(), but with UChars only.
  710. * @param dest The destination string
  711. * @param src The source string (can be NULL/invalid if count<=0)
  712. * @param count The number of characters to move; no-op if <=0
  713. * @return A pointer to <code>dest</code>
  714. * @stable ICU 2.0
  715. */
  716. U_CAPI UChar* U_EXPORT2
  717. u_memmove(UChar *dest, const UChar *src, int32_t count);
  718. /**
  719. * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
  720. *
  721. * @param dest The destination string.
  722. * @param c The character to initialize the string.
  723. * @param count The number of characters to set.
  724. * @return A pointer to <code>dest</code>.
  725. * @stable ICU 2.0
  726. */
  727. U_CAPI UChar* U_EXPORT2
  728. u_memset(UChar *dest, UChar c, int32_t count);
  729. /**
  730. * Compare the first <code>count</code> UChars of each buffer.
  731. *
  732. * @param buf1 The first string to compare.
  733. * @param buf2 The second string to compare.
  734. * @param count The maximum number of UChars to compare.
  735. * @return When buf1 < buf2, a negative number is returned.
  736. * When buf1 == buf2, 0 is returned.
  737. * When buf1 > buf2, a positive number is returned.
  738. * @stable ICU 2.0
  739. */
  740. U_CAPI int32_t U_EXPORT2
  741. u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
  742. /**
  743. * Compare two Unicode strings in code point order.
  744. * This is different in UTF-16 from u_memcmp() if supplementary characters are present.
  745. * For details, see u_strCompare().
  746. *
  747. * @param s1 A string to compare.
  748. * @param s2 A string to compare.
  749. * @param count The maximum number of characters to compare.
  750. * @return a negative/zero/positive integer corresponding to whether
  751. * the first string is less than/equal to/greater than the second one
  752. * in code point order
  753. * @stable ICU 2.0
  754. */
  755. U_CAPI int32_t U_EXPORT2
  756. u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
  757. /**
  758. * Find the first occurrence of a BMP code point in a string.
  759. * A surrogate code point is found only if its match in the text is not
  760. * part of a surrogate pair.
  761. * A NUL character is found at the string terminator.
  762. *
  763. * @param s The string to search (contains <code>count</code> UChars).
  764. * @param c The BMP code point to find.
  765. * @param count The length of the string.
  766. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  767. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  768. * @stable ICU 2.0
  769. *
  770. * @see u_strchr
  771. * @see u_memchr32
  772. * @see u_strFindFirst
  773. */
  774. U_CAPI UChar* U_EXPORT2
  775. u_memchr(const UChar *s, UChar c, int32_t count);
  776. /**
  777. * Find the first occurrence of a code point in a string.
  778. * A surrogate code point is found only if its match in the text is not
  779. * part of a surrogate pair.
  780. * A NUL character is found at the string terminator.
  781. *
  782. * @param s The string to search (contains <code>count</code> UChars).
  783. * @param c The code point to find.
  784. * @param count The length of the string.
  785. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  786. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  787. * @stable ICU 2.0
  788. *
  789. * @see u_strchr32
  790. * @see u_memchr
  791. * @see u_strFindFirst
  792. */
  793. U_CAPI UChar* U_EXPORT2
  794. u_memchr32(const UChar *s, UChar32 c, int32_t count);
  795. /**
  796. * Find the last occurrence of a BMP code point in a string.
  797. * A surrogate code point is found only if its match in the text is not
  798. * part of a surrogate pair.
  799. * A NUL character is found at the string terminator.
  800. *
  801. * @param s The string to search (contains <code>count</code> UChars).
  802. * @param c The BMP code point to find.
  803. * @param count The length of the string.
  804. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  805. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  806. * @stable ICU 2.4
  807. *
  808. * @see u_strrchr
  809. * @see u_memrchr32
  810. * @see u_strFindLast
  811. */
  812. U_CAPI UChar* U_EXPORT2
  813. u_memrchr(const UChar *s, UChar c, int32_t count);
  814. /**
  815. * Find the last occurrence of a code point in a string.
  816. * A surrogate code point is found only if its match in the text is not
  817. * part of a surrogate pair.
  818. * A NUL character is found at the string terminator.
  819. *
  820. * @param s The string to search (contains <code>count</code> UChars).
  821. * @param c The code point to find.
  822. * @param count The length of the string.
  823. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  824. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  825. * @stable ICU 2.4
  826. *
  827. * @see u_strrchr32
  828. * @see u_memrchr
  829. * @see u_strFindLast
  830. */
  831. U_CAPI UChar* U_EXPORT2
  832. u_memrchr32(const UChar *s, UChar32 c, int32_t count);
  833. /**
  834. * Unicode String literals in C.
  835. * We need one macro to declare a variable for the string
  836. * and to statically preinitialize it if possible,
  837. * and a second macro to dynamically initialize such a string variable if necessary.
  838. *
  839. * The macros are defined for maximum performance.
  840. * They work only for strings that contain "invariant characters", i.e.,
  841. * only Latin letters, digits, and some punctuation.
  842. * See utypes.h for details.
  843. *
  844. * A pair of macros for a single string must be used with the same
  845. * parameters.
  846. * The string parameter must be a C string literal.
  847. * The length of the string, not including the terminating
  848. * `NUL`, must be specified as a constant.
  849. * The U_STRING_DECL macro should be invoked exactly once for one
  850. * such string variable before it is used.
  851. *
  852. * Usage:
  853. *
  854. * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
  855. * U_STRING_DECL(ustringVar2, "jumps 5%", 8);
  856. * static UBool didInit=false;
  857. *
  858. * int32_t function() {
  859. * if(!didInit) {
  860. * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
  861. * U_STRING_INIT(ustringVar2, "jumps 5%", 8);
  862. * didInit=true;
  863. * }
  864. * return u_strcmp(ustringVar1, ustringVar2);
  865. * }
  866. *
  867. * Note that the macros will NOT work consistently if their argument is another `#define`.
  868. * The following will not work on all platforms; do not use it.
  869. *
  870. * #define GLUCK "Mr. Gluck"
  871. * U_STRING_DECL(var, GLUCK, 9)
  872. * U_STRING_INIT(var, GLUCK, 9)
  873. *
  874. * Instead, use the string literal "Mr. Gluck" as the argument to both macro
  875. * calls.
  876. *
  877. *
  878. * @stable ICU 2.0
  879. */
  880. #if defined(U_DECLARE_UTF16)
  881. # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
  882. /** No-op in this configuration: U_STRING_DECL already initializes the variable statically. @stable ICU 2.0 */
  883. # define U_STRING_INIT(var, cs, length)
  884. #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || defined(U_WCHAR_IS_UTF16))
  885. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
  886. /** No-op in this configuration: U_STRING_DECL already initializes the variable statically. @stable ICU 2.0 */
  887. # define U_STRING_INIT(var, cs, length)
  888. #else
  889. # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
  890. /** Fills var at run time via u_charsToUChars(); (length)+1 is parenthesized to match the array size declared by U_STRING_DECL. @stable ICU 2.0 */
  891. # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1)
  892. #endif
  893. /**
  894. * Unescape a string of characters and write the resulting
  895. * Unicode characters to the destination buffer. The following escape
  896. * sequences are recognized:
  897. *
  898. * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
  899. * \\Uhhhhhhhh 8 hex digits
  900. * \\xhh 1-2 hex digits
  901. * \\x{h...} 1-8 hex digits
  902. * \\ooo 1-3 octal digits; o in [0-7]
  903. * \\cX control-X; X is masked with 0x1F
  904. *
  905. * as well as the standard ANSI C escapes:
  906. *
  907. * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
  908. * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
  909. * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
  910. *
  911. * Anything else following a backslash is generically escaped. For
  912. * example, "[a\\-z]" returns "[a-z]".
  913. *
  914. * If an escape sequence is ill-formed, an empty string is stored
  915. * in dest (if possible). An example of an ill-formed sequence is "\\u" followed by
  916. * fewer than 4 hex digits.
  917. *
  918. * The above characters are recognized in the compiler's codepage,
  919. * that is, they are coded as 'u', '\\', etc. Characters that are
  920. * not parts of escape sequences are converted using u_charsToUChars().
  921. *
  922. * This function is similar to UnicodeString::unescape() but not
  923. * identical to it. The latter takes a source UnicodeString, so it
  924. * does escape recognition but no conversion.
  925. *
  926. * @param src a zero-terminated string of invariant characters
  927. * @param dest pointer to buffer to receive converted and unescaped
  928. * text and, if there is room, a zero terminator. May be NULL for
  929. * preflighting, in which case no UChars will be written, but the
  930. * return value will still be valid. On error, an empty string is
  931. * stored here (if possible).
  932. * @param destCapacity the number of UChars that may be written at
  933. * dest. Ignored if dest == NULL.
  934. * @return the length of unescaped string.
  935. * @see u_unescapeAt
  936. * @see UnicodeString#unescape()
  937. * @see UnicodeString#unescapeAt()
  938. * @stable ICU 2.0
  939. */
  940. U_CAPI int32_t U_EXPORT2
  941. u_unescape(const char *src,
  942. UChar *dest, int32_t destCapacity);
  943. U_CDECL_BEGIN
  944. /**
  945. * Callback function for u_unescapeAt() that returns a character of
  946. * the source text given an offset and a context pointer. The context
  947. * pointer will be whatever is passed into u_unescapeAt().
  948. *
  949. * @param offset the offset into the source text of the character to return
  950. * @param context an opaque pointer passed directly into u_unescapeAt()
  951. * @return the UChar of the source text at the given
  952. * offset
  953. * @see u_unescapeAt
  954. * @stable ICU 2.0
  955. */
  956. typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
  957. U_CDECL_END
  958. /**
  959. * Unescape a single sequence. The character at offset-1 is assumed
  960. * (without checking) to be a backslash. This method takes a callback
  961. * pointer to a function that returns the UChar at a given offset. By
  962. * varying this callback, ICU functions are able to unescape char*
  963. * strings, UnicodeString objects, and UFILE pointers.
  964. *
  965. * If offset is out of range, or if the escape sequence is ill-formed,
  966. * (UChar32)0xFFFFFFFF is returned. See documentation of u_unescape()
  967. * for a list of recognized sequences.
  968. *
  969. * @param charAt callback function that returns a UChar of the source
  970. * text given an offset and a context pointer.
  971. * @param offset pointer to the offset that will be passed to charAt.
  972. * The offset value will be updated upon return to point after the
  973. * last parsed character of the escape sequence. On error the offset
  974. * is unchanged.
  975. * @param length the number of characters in the source text. The
  976. * last character of the source text is considered to be at offset
  977. * length-1.
  978. * @param context an opaque pointer passed directly into charAt.
  979. * @return the character represented by the escape sequence at
  980. * offset, or (UChar32)0xFFFFFFFF on error.
  981. * @see u_unescape()
  982. * @see UnicodeString#unescape()
  983. * @see UnicodeString#unescapeAt()
  984. * @stable ICU 2.0
  985. */
  986. U_CAPI UChar32 U_EXPORT2
  987. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  988. int32_t *offset,
  989. int32_t length,
  990. void *context);
  991. /**
  992. * Uppercase the characters in a string.
  993. * Casing is locale-dependent and context-sensitive.
  994. * The result may be longer or shorter than the original.
  995. * The source string and the destination buffer are allowed to overlap.
  996. *
  997. * @param dest A buffer for the result string. The result will be zero-terminated if
  998. * the buffer is large enough.
  999. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1000. * dest may be NULL and the function will only return the length of the result
  1001. * without writing any of the result string.
  1002. * @param src The original string
  1003. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1004. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1005. * @param pErrorCode Must be a valid pointer to an error code value,
  1006. * which must not indicate a failure before the function call.
  1007. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1008. * only some of the result was written to the destination buffer.
  1009. * @stable ICU 2.0
  1010. */
  1011. U_CAPI int32_t U_EXPORT2
  1012. u_strToUpper(UChar *dest, int32_t destCapacity,
  1013. const UChar *src, int32_t srcLength,
  1014. const char *locale,
  1015. UErrorCode *pErrorCode);
  1016. /**
  1017. * Lowercase the characters in a string.
  1018. * Casing is locale-dependent and context-sensitive.
  1019. * The result may be longer or shorter than the original.
  1020. * The source string and the destination buffer are allowed to overlap.
  1021. *
  1022. * @param dest A buffer for the result string. The result will be zero-terminated if
  1023. * the buffer is large enough.
  1024. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1025. * dest may be NULL and the function will only return the length of the result
  1026. * without writing any of the result string.
  1027. * @param src The original string
  1028. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1029. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1030. * @param pErrorCode Must be a valid pointer to an error code value,
  1031. * which must not indicate a failure before the function call.
  1032. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1033. * only some of the result was written to the destination buffer.
  1034. * @stable ICU 2.0
  1035. */
  1036. U_CAPI int32_t U_EXPORT2
  1037. u_strToLower(UChar *dest, int32_t destCapacity,
  1038. const UChar *src, int32_t srcLength,
  1039. const char *locale,
  1040. UErrorCode *pErrorCode);
  1041. #if !UCONFIG_NO_BREAK_ITERATION
  1042. /**
  1043. * Titlecase a string.
  1044. * Casing is locale-dependent and context-sensitive.
  1045. * Titlecasing uses a break iterator to find the first characters of words
  1046. * that are to be titlecased. It titlecases those characters and lowercases
  1047. * all others.
  1048. *
  1049. * The titlecase break iterator can be provided to customize for arbitrary
  1050. * styles, using rules and dictionaries beyond the standard iterators.
  1051. * It may be more efficient to always provide an iterator to avoid
  1052. * opening and closing one for each string.
  1053. * If the break iterator passed in is null, the default Unicode algorithm
  1054. * will be used to determine the titlecase positions.
  1055. *
  1056. * This function uses only the setText(), first() and next() methods of the
  1057. * provided break iterator.
  1058. *
  1059. * The result may be longer or shorter than the original.
  1060. * The source string and the destination buffer are allowed to overlap.
  1061. *
  1062. * @param dest A buffer for the result string. The result will be zero-terminated if
  1063. * the buffer is large enough.
  1064. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1065. * dest may be NULL and the function will only return the length of the result
  1066. * without writing any of the result string.
  1067. * @param src The original string
  1068. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1069. * @param titleIter A break iterator to find the first characters of words
  1070. * that are to be titlecased.
  1071. * If none is provided (NULL), then a standard titlecase
  1072. * break iterator is opened.
  1073. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1074. * @param pErrorCode Must be a valid pointer to an error code value,
  1075. * which must not indicate a failure before the function call.
  1076. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1077. * only some of the result was written to the destination buffer.
  1078. * @stable ICU 2.1
  1079. */
  1080. U_CAPI int32_t U_EXPORT2
  1081. u_strToTitle(UChar *dest, int32_t destCapacity,
  1082. const UChar *src, int32_t srcLength,
  1083. UBreakIterator *titleIter,
  1084. const char *locale,
  1085. UErrorCode *pErrorCode);
  1086. #endif
  1087. /**
  1088. * Case-folds the characters in a string.
  1089. *
  1090. * Case-folding is locale-independent and not context-sensitive,
  1091. * but there is an option for whether to include or exclude mappings for dotted I
  1092. * and dotless i that are marked with 'T' in CaseFolding.txt.
  1093. *
  1094. * The result may be longer or shorter than the original.
  1095. * The source string and the destination buffer are allowed to overlap.
  1096. *
  1097. * @param dest A buffer for the result string. The result will be zero-terminated if
  1098. * the buffer is large enough.
  1099. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1100. * dest may be NULL and the function will only return the length of the result
  1101. * without writing any of the result string.
  1102. * @param src The original string
  1103. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1104. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
  1105. * @param pErrorCode Must be a valid pointer to an error code value,
  1106. * which must not indicate a failure before the function call.
  1107. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1108. * only some of the result was written to the destination buffer.
  1109. * @stable ICU 2.0
  1110. */
  1111. U_CAPI int32_t U_EXPORT2
  1112. u_strFoldCase(UChar *dest, int32_t destCapacity,
  1113. const UChar *src, int32_t srcLength,
  1114. uint32_t options,
  1115. UErrorCode *pErrorCode);
  1116. #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  1117. /**
  1118. * Convert a UTF-16 string to a wchar_t string.
  1119. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1120. * this function simply calls the fast, dedicated function for that.
  1121. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
  1122. *
  1123. * @param dest A buffer for the result string. The result will be zero-terminated if
  1124. * the buffer is large enough.
  1125. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
  1126. * dest may be NULL and the function will only return the length of the
  1127. * result without writing any of the result string (pre-flighting).
  1128. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1129. * pDestLength!=NULL then *pDestLength is always set to the
  1130. * number of output units corresponding to the transformation of
  1131. * all the input units, even in case of a buffer overflow.
  1132. * @param src The original source string
  1133. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1134. * @param pErrorCode Must be a valid pointer to an error code value,
  1135. * which must not indicate a failure before the function call.
  1136. * @return The pointer to destination buffer.
  1137. * @stable ICU 2.0
  1138. */
  1139. U_CAPI wchar_t* U_EXPORT2
  1140. u_strToWCS(wchar_t *dest,
  1141. int32_t destCapacity,
  1142. int32_t *pDestLength,
  1143. const UChar *src,
  1144. int32_t srcLength,
  1145. UErrorCode *pErrorCode);
  1146. /**
  1147. * Convert a wchar_t string to UTF-16.
  1148. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1149. * this function simply calls the fast, dedicated function for that.
  1150. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
  1151. *
  1152. * @param dest A buffer for the result string. The result will be zero-terminated if
  1153. * the buffer is large enough.
  1154. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1155. * dest may be NULL and the function will only return the length of the
  1156. * result without writing any of the result string (pre-flighting).
  1157. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1158. * pDestLength!=NULL then *pDestLength is always set to the
  1159. * number of output units corresponding to the transformation of
  1160. * all the input units, even in case of a buffer overflow.
  1161. * @param src The original source string
  1162. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1163. * @param pErrorCode Must be a valid pointer to an error code value,
  1164. * which must not indicate a failure before the function call.
  1165. * @return The pointer to destination buffer.
  1166. * @stable ICU 2.0
  1167. */
  1168. U_CAPI UChar* U_EXPORT2
  1169. u_strFromWCS(UChar *dest,
  1170. int32_t destCapacity,
  1171. int32_t *pDestLength,
  1172. const wchar_t *src,
  1173. int32_t srcLength,
  1174. UErrorCode *pErrorCode);
  1175. #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  1176. /**
  1177. * Convert a UTF-16 string to UTF-8.
  1178. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1179. *
  1180. * @param dest A buffer for the result string. The result will be zero-terminated if
  1181. * the buffer is large enough.
  1182. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1183. * dest may be NULL and the function will only return the length of the
  1184. * result without writing any of the result string (pre-flighting).
  1185. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1186. * pDestLength!=NULL then *pDestLength is always set to the
  1187. * number of output units corresponding to the transformation of
  1188. * all the input units, even in case of a buffer overflow.
  1189. * @param src The original source string
  1190. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1191. * @param pErrorCode Must be a valid pointer to an error code value,
  1192. * which must not indicate a failure before the function call.
  1193. * @return The pointer to destination buffer.
  1194. * @stable ICU 2.0
  1195. * @see u_strToUTF8WithSub
  1196. * @see u_strFromUTF8
  1197. */
  1198. U_CAPI char* U_EXPORT2
  1199. u_strToUTF8(char *dest,
  1200. int32_t destCapacity,
  1201. int32_t *pDestLength,
  1202. const UChar *src,
  1203. int32_t srcLength,
  1204. UErrorCode *pErrorCode);
  1205. /**
  1206. * Convert a UTF-8 string to UTF-16.
  1207. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1208. *
  1209. * @param dest A buffer for the result string. The result will be zero-terminated if
  1210. * the buffer is large enough.
  1211. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1212. * dest may be NULL and the function will only return the length of the
  1213. * result without writing any of the result string (pre-flighting).
  1214. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1215. * pDestLength!=NULL then *pDestLength is always set to the
  1216. * number of output units corresponding to the transformation of
  1217. * all the input units, even in case of a buffer overflow.
  1218. * @param src The original source string
  1219. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1220. * @param pErrorCode Must be a valid pointer to an error code value,
  1221. * which must not indicate a failure before the function call.
  1222. * @return The pointer to destination buffer.
  1223. * @stable ICU 2.0
  1224. * @see u_strFromUTF8WithSub
  1225. * @see u_strFromUTF8Lenient
  1226. */
  1227. U_CAPI UChar* U_EXPORT2
  1228. u_strFromUTF8(UChar *dest,
  1229. int32_t destCapacity,
  1230. int32_t *pDestLength,
  1231. const char *src,
  1232. int32_t srcLength,
  1233. UErrorCode *pErrorCode);
  1234. /**
  1235. * Convert a UTF-16 string to UTF-8.
  1236. *
  1237. * Same as u_strToUTF8() except for the additional subchar which is output for
  1238. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1239. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
  1240. *
  1241. * @param dest A buffer for the result string. The result will be zero-terminated if
  1242. * the buffer is large enough.
  1243. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1244. * dest may be NULL and the function will only return the length of the
  1245. * result without writing any of the result string (pre-flighting).
  1246. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1247. * pDestLength!=NULL then *pDestLength is always set to the
  1248. * number of output units corresponding to the transformation of
  1249. * all the input units, even in case of a buffer overflow.
  1250. * @param src The original source string
  1251. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1252. * @param subchar The substitution character to use in place of an illegal input sequence,
  1253. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1254. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1255. * except for surrogate code points (U+D800..U+DFFF).
  1256. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1257. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1258. * Set to 0 if no substitutions occur or subchar<0.
  1259. * pNumSubstitutions can be NULL.
  1260. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1261. * pass the U_SUCCESS() test, or else the function returns
  1262. * immediately. Check for U_FAILURE() on output or use with
  1263. * function chaining. (See User Guide for details.)
  1264. * @return The pointer to destination buffer.
  1265. * @see u_strToUTF8
  1266. * @see u_strFromUTF8WithSub
  1267. * @stable ICU 3.6
  1268. */
  1269. U_CAPI char* U_EXPORT2
  1270. u_strToUTF8WithSub(char *dest,
  1271. int32_t destCapacity,
  1272. int32_t *pDestLength,
  1273. const UChar *src,
  1274. int32_t srcLength,
  1275. UChar32 subchar, int32_t *pNumSubstitutions,
  1276. UErrorCode *pErrorCode);
  1277. /**
  1278. * Convert a UTF-8 string to UTF-16.
  1279. *
  1280. * Same as u_strFromUTF8() except for the additional subchar which is output for
  1281. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1282. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
  1283. *
  1284. * @param dest A buffer for the result string. The result will be zero-terminated if
  1285. * the buffer is large enough.
  1286. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1287. * dest may be NULL and the function will only return the length of the
  1288. * result without writing any of the result string (pre-flighting).
  1289. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1290. * pDestLength!=NULL then *pDestLength is always set to the
  1291. * number of output units corresponding to the transformation of
  1292. * all the input units, even in case of a buffer overflow.
  1293. * @param src The original source string
  1294. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1295. * @param subchar The substitution character to use in place of an illegal input sequence,
  1296. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1297. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1298. * except for surrogate code points (U+D800..U+DFFF).
  1299. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1300. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1301. * Set to 0 if no substitutions occur or subchar<0.
  1302. * pNumSubstitutions can be NULL.
  1303. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1304. * pass the U_SUCCESS() test, or else the function returns
  1305. * immediately. Check for U_FAILURE() on output or use with
  1306. * function chaining. (See User Guide for details.)
  1307. * @return The pointer to destination buffer.
  1308. * @see u_strFromUTF8
  1309. * @see u_strFromUTF8Lenient
  1310. * @see u_strToUTF8WithSub
  1311. * @stable ICU 3.6
  1312. */
  1313. U_CAPI UChar* U_EXPORT2
  1314. u_strFromUTF8WithSub(UChar *dest,
  1315. int32_t destCapacity,
  1316. int32_t *pDestLength,
  1317. const char *src,
  1318. int32_t srcLength,
  1319. UChar32 subchar, int32_t *pNumSubstitutions,
  1320. UErrorCode *pErrorCode);
  1321. /**
  1322. * Convert a UTF-8 string to UTF-16.
  1323. *
  1324. * Same as u_strFromUTF8() except that this function is designed to be very fast,
  1325. * which it achieves by being lenient about malformed UTF-8 sequences.
  1326. * This function is intended for use in environments where UTF-8 text is
  1327. * expected to be well-formed.
  1328. *
  1329. * Its semantics are:
  1330. * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  1331. * - The function will not read beyond the input string, nor write beyond
  1332. * the destCapacity.
  1333. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
  1334. * be well-formed UTF-16.
  1335. * The function will resynchronize to valid code point boundaries
  1336. * within a small number of code points after an illegal sequence.
  1337. * - Non-shortest forms are not detected and will result in "spoofing" output.
  1338. *
  1339. * For further performance improvement, if srcLength is given (>=0),
  1340. * then it must be destCapacity>=srcLength.
  1341. *
  1342. * There is no inverse u_strToUTF8Lenient() function because there is practically
  1343. * no performance gain from not checking that a UTF-16 string is well-formed.
  1344. *
  1345. * @param dest A buffer for the result string. The result will be zero-terminated if
  1346. * the buffer is large enough.
  1347. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1348. * dest may be NULL and the function will only return the length of the
  1349. * result without writing any of the result string (pre-flighting).
  1350. * Unlike for other ICU functions, if srcLength>=0 then it
  1351. * must be destCapacity>=srcLength.
  1352. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1353. * pDestLength!=NULL then *pDestLength is always set to the
  1354. * number of output units corresponding to the transformation of
  1355. * all the input units, even in case of a buffer overflow.
  1356. * Unlike for other ICU functions, if srcLength>=0 but
  1357. * destCapacity<srcLength, then *pDestLength will be set to srcLength
  1358. * (and U_BUFFER_OVERFLOW_ERROR will be set)
  1359. * regardless of the actual result length.
  1360. * @param src The original source string
  1361. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1362. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1363. * pass the U_SUCCESS() test, or else the function returns
  1364. * immediately. Check for U_FAILURE() on output or use with
  1365. * function chaining. (See User Guide for details.)
  1366. * @return The pointer to destination buffer.
  1367. * @see u_strFromUTF8
  1368. * @see u_strFromUTF8WithSub
  1369. * @see u_strToUTF8WithSub
  1370. * @stable ICU 3.6
  1371. */
  1372. U_CAPI UChar * U_EXPORT2
  1373. u_strFromUTF8Lenient(UChar *dest,
  1374. int32_t destCapacity,
  1375. int32_t *pDestLength,
  1376. const char *src,
  1377. int32_t srcLength,
  1378. UErrorCode *pErrorCode);
  1379. /**
  1380. * Convert a UTF-16 string to UTF-32.
  1381. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1382. *
  1383. * @param dest A buffer for the result string. The result will be zero-terminated if
  1384. * the buffer is large enough.
  1385. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1386. * dest may be NULL and the function will only return the length of the
  1387. * result without writing any of the result string (pre-flighting).
  1388. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1389. * pDestLength!=NULL then *pDestLength is always set to the
  1390. * number of output units corresponding to the transformation of
  1391. * all the input units, even in case of a buffer overflow.
  1392. * @param src The original source string
  1393. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1394. * @param pErrorCode Must be a valid pointer to an error code value,
  1395. * which must not indicate a failure before the function call.
  1396. * @return The pointer to destination buffer.
  1397. * @see u_strToUTF32WithSub
  1398. * @see u_strFromUTF32
  1399. * @stable ICU 2.0
  1400. */
  1401. U_CAPI UChar32* U_EXPORT2
  1402. u_strToUTF32(UChar32 *dest,
  1403. int32_t destCapacity,
  1404. int32_t *pDestLength,
  1405. const UChar *src,
  1406. int32_t srcLength,
  1407. UErrorCode *pErrorCode);
  1408. /**
  1409. * Convert a UTF-32 string to UTF-16.
  1410. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1411. *
  1412. * @param dest A buffer for the result string. The result will be zero-terminated if
  1413. * the buffer is large enough.
  1414. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1415. * dest may be NULL and the function will only return the length of the
  1416. * result without writing any of the result string (pre-flighting).
  1417. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1418. * pDestLength!=NULL then *pDestLength is always set to the
  1419. * number of output units corresponding to the transformation of
  1420. * all the input units, even in case of a buffer overflow.
  1421. * @param src The original source string
  1422. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1423. * @param pErrorCode Must be a valid pointer to an error code value,
  1424. * which must not indicate a failure before the function call.
  1425. * @return The pointer to destination buffer.
  1426. * @see u_strFromUTF32WithSub
  1427. * @see u_strToUTF32
  1428. * @stable ICU 2.0
  1429. */
  1430. U_CAPI UChar* U_EXPORT2
  1431. u_strFromUTF32(UChar *dest,
  1432. int32_t destCapacity,
  1433. int32_t *pDestLength,
  1434. const UChar32 *src,
  1435. int32_t srcLength,
  1436. UErrorCode *pErrorCode);
  1437. /**
  1438. * Convert a UTF-16 string to UTF-32.
  1439. *
  1440. * Same as u_strToUTF32() except for the additional subchar which is output for
  1441. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1442. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
  1443. *
  1444. * @param dest A buffer for the result string. The result will be zero-terminated if
  1445. * the buffer is large enough.
  1446. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1447. * dest may be NULL and the function will only return the length of the
  1448. * result without writing any of the result string (pre-flighting).
  1449. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1450. * pDestLength!=NULL then *pDestLength is always set to the
  1451. * number of output units corresponding to the transformation of
  1452. * all the input units, even in case of a buffer overflow.
  1453. * @param src The original source string
  1454. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1455. * @param subchar The substitution character to use in place of an illegal input sequence,
  1456. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1457. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1458. * except for surrogate code points (U+D800..U+DFFF).
  1459. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1460. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1461. * Set to 0 if no substitutions occur or subchar<0.
  1462. * pNumSubstitutions can be NULL.
  1463. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1464. * pass the U_SUCCESS() test, or else the function returns
  1465. * immediately. Check for U_FAILURE() on output or use with
  1466. * function chaining. (See User Guide for details.)
  1467. * @return The pointer to destination buffer.
  1468. * @see u_strToUTF32
  1469. * @see u_strFromUTF32WithSub
  1470. * @stable ICU 4.2
  1471. */
  1472. U_CAPI UChar32* U_EXPORT2
  1473. u_strToUTF32WithSub(UChar32 *dest,
  1474. int32_t destCapacity,
  1475. int32_t *pDestLength,
  1476. const UChar *src,
  1477. int32_t srcLength,
  1478. UChar32 subchar, int32_t *pNumSubstitutions,
  1479. UErrorCode *pErrorCode);
  1480. /**
  1481. * Convert a UTF-32 string to UTF-16.
  1482. *
  1483. * Same as u_strFromUTF32() except for the additional subchar which is output for
  1484. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1485. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
  1486. *
  1487. * @param dest A buffer for the result string. The result will be zero-terminated if
  1488. * the buffer is large enough.
  1489. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1490. * dest may be NULL and the function will only return the length of the
  1491. * result without writing any of the result string (pre-flighting).
  1492. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1493. * pDestLength!=NULL then *pDestLength is always set to the
  1494. * number of output units corresponding to the transformation of
  1495. * all the input units, even in case of a buffer overflow.
  1496. * @param src The original source string
  1497. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1498. * @param subchar The substitution character to use in place of an illegal input sequence,
  1499. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1500. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1501. * except for surrogate code points (U+D800..U+DFFF).
  1502. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1503. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1504. * Set to 0 if no substitutions occur or subchar<0.
  1505. * pNumSubstitutions can be NULL.
  1506. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1507. * pass the U_SUCCESS() test, or else the function returns
  1508. * immediately. Check for U_FAILURE() on output or use with
  1509. * function chaining. (See User Guide for details.)
  1510. * @return The pointer to destination buffer.
  1511. * @see u_strFromUTF32
  1512. * @see u_strToUTF32WithSub
  1513. * @stable ICU 4.2
  1514. */
  1515. U_CAPI UChar* U_EXPORT2
  1516. u_strFromUTF32WithSub(UChar *dest,
  1517. int32_t destCapacity,
  1518. int32_t *pDestLength,
  1519. const UChar32 *src,
  1520. int32_t srcLength,
  1521. UChar32 subchar, int32_t *pNumSubstitutions,
  1522. UErrorCode *pErrorCode);
  1523. /**
  1524. * Convert a 16-bit Unicode string to Java Modified UTF-8.
  1525. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
  1526. *
  1527. * This function behaves according to the documentation for Java DataOutput.writeUTF()
  1528. * except that it does not encode the output length in the destination buffer
  1529. * and does not have an output length restriction.
  1530. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
  1531. *
  1532. * The input string need not be well-formed UTF-16.
  1533. * (Therefore there is no subchar parameter.)
  1534. *
  1535. * @param dest A buffer for the result string. The result will be zero-terminated if
  1536. * the buffer is large enough.
  1537. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1538. * dest may be NULL and the function will only return the length of the
  1539. * result without writing any of the result string (pre-flighting).
  1540. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1541. * pDestLength!=NULL then *pDestLength is always set to the
  1542. * number of output units corresponding to the transformation of
  1543. * all the input units, even in case of a buffer overflow.
  1544. * @param src The original source string
  1545. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1546. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1547. * pass the U_SUCCESS() test, or else the function returns
  1548. * immediately. Check for U_FAILURE() on output or use with
  1549. * function chaining. (See User Guide for details.)
  1550. * @return The pointer to destination buffer.
  1551. * @stable ICU 4.4
  1552. * @see u_strToUTF8WithSub
  1553. * @see u_strFromJavaModifiedUTF8WithSub
  1554. */
  1555. U_CAPI char* U_EXPORT2
  1556. u_strToJavaModifiedUTF8(
  1557. char *dest,
  1558. int32_t destCapacity,
  1559. int32_t *pDestLength,
  1560. const UChar *src,
  1561. int32_t srcLength,
  1562. UErrorCode *pErrorCode);
  1563. /**
  1564. * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
  1565. * If the input string is not well-formed and no substitution char is specified,
  1566. * then the U_INVALID_CHAR_FOUND error code is set.
  1567. *
  1568. * This function behaves according to the documentation for Java DataInput.readUTF()
  1569. * except that it takes a length parameter rather than
  1570. * interpreting the first two input bytes as the length.
  1571. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
  1572. *
  1573. * The output string may not be well-formed UTF-16.
  1574. *
  1575. * @param dest A buffer for the result string. The result will be zero-terminated if
  1576. * the buffer is large enough.
  1577. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1578. * dest may be NULL and the function will only return the length of the
  1579. * result without writing any of the result string (pre-flighting).
  1580. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1581. * pDestLength!=NULL then *pDestLength is always set to the
  1582. * number of output units corresponding to the transformation of
  1583. * all the input units, even in case of a buffer overflow.
  1584. * @param src The original source string
  1585. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1586. * @param subchar The substitution character to use in place of an illegal input sequence,
  1587. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1588. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1589. * except for surrogate code points (U+D800..U+DFFF).
  1590. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1591. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1592. * Set to 0 if no substitutions occur or subchar<0.
  1593. * pNumSubstitutions can be NULL.
  1594. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1595. * pass the U_SUCCESS() test, or else the function returns
  1596. * immediately. Check for U_FAILURE() on output or use with
  1597. * function chaining. (See User Guide for details.)
  1598. * @return The pointer to destination buffer.
  1599. * @see u_strFromUTF8WithSub
  1600. * @see u_strFromUTF8Lenient
  1601. * @see u_strToJavaModifiedUTF8
  1602. * @stable ICU 4.4
  1603. */
  1604. U_CAPI UChar* U_EXPORT2
  1605. u_strFromJavaModifiedUTF8WithSub(
  1606. UChar *dest,
  1607. int32_t destCapacity,
  1608. int32_t *pDestLength,
  1609. const char *src,
  1610. int32_t srcLength,
  1611. UChar32 subchar, int32_t *pNumSubstitutions,
  1612. UErrorCode *pErrorCode);
  1613. #endif