// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/
  /**
   * \file
   * \brief C API: 16-bit Unicode handling macros
   *
   * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
   *
   * For more information see utf.h and the ICU User Guide Strings chapter
   * (https://unicode-org.github.io/icu/userguide/strings).
   *
   * <em>Usage:</em>
   * ICU coding guidelines for if() statements should be followed when using these macros.
   * Compound statements (curly braces {}) must be used for if-else-while...
   * bodies, and all macro statements should be terminated with a semicolon.
   */
  32. #ifndef __UTF16_H__
  33. #define __UTF16_H__
  34. #include <stdbool.h>
  35. #include "unicode/umachine.h"
  36. #ifndef __UTF_H__
  37. # include "unicode/utf.h"
  38. #endif
  /* single-code point definitions -------------------------------------------- */
  /**
   * Does this code unit alone encode a code point (BMP, not a surrogate)?
   * The expansion is wrapped in parentheses so that it behaves as a single
   * operand wherever the macro is used.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 2.4
   */
  #define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
  /**
   * Is this code unit a lead surrogate (U+d800..U+dbff)?
   * Every bit above the low ten is compared against 0xd800, so a value
   * wider than 16 bits never matches.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 2.4
   */
  #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
  /**
   * Is this code unit a trail surrogate (U+dc00..U+dfff)?
   * Every bit above the low ten is compared against 0xdc00, so a value
   * wider than 16 bits never matches.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 2.4
   */
  #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
  /**
   * Is this code unit a surrogate (U+d800..U+dfff)?
   * UTF-16 spelling of U_IS_SURROGATE; the argument is forwarded unchanged.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 2.4
   */
  #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
  /**
   * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
   * is it a lead surrogate?
   * Only bit 10 (0x400) is inspected, so the answer is meaningful solely
   * for values already known to be surrogates.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 2.4
   */
  #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
  /**
   * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
   * is it a trail surrogate?
   * Only bit 10 (0x400) is inspected, so the answer is meaningful solely
   * for values already known to be surrogates.
   * @param c 16-bit code unit
   * @return true or false
   * @stable ICU 4.2
   */
  #define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
  /**
   * Helper constant for U16_GET_SUPPLEMENTARY.
   * It is the bias of the lowest lead surrogate (shifted into position) plus
   * the bias of the lowest trail surrogate, less 0x10000; the value is 0x35fdc00.
   * @internal
   */
  #define U16_SURROGATE_OFFSET ((0xd800<<10)+0xdc00-0x10000)
  /**
   * Get a supplementary code point value (U+10000..U+10ffff)
   * from its lead and trail surrogates.
   * The result is undefined if the input values are not
   * lead and trail surrogates.
   *
   * Both arguments are converted to UChar32 before the arithmetic, so
   * 16-bit inputs are widened before the shift.
   *
   * @param lead lead surrogate (U+d800..U+dbff)
   * @param trail trail surrogate (U+dc00..U+dfff)
   * @return supplementary code point (U+10000..U+10ffff)
   * @stable ICU 2.4
   */
  #define U16_GET_SUPPLEMENTARY(lead, trail) \
      (((UChar32)(lead)<<10)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
  /**
   * Get the lead surrogate (0xd800..0xdbff) for a
   * supplementary code point (0x10000..0x10ffff).
   * The cast expression is enclosed in parentheses so that the macro is a
   * single primary expression at the call site.
   * @param supplementary 32-bit code point (U+10000..U+10ffff)
   * @return lead surrogate (U+d800..U+dbff) for supplementary
   * @stable ICU 2.4
   */
  #define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
  /**
   * Get the trail surrogate (0xdc00..0xdfff) for a
   * supplementary code point (0x10000..0x10ffff).
   * The cast expression is enclosed in parentheses so that the macro is a
   * single primary expression at the call site.
   * @param supplementary 32-bit code point (U+10000..U+10ffff)
   * @return trail surrogate (U+dc00..U+dfff) for supplementary
   * @stable ICU 2.4
   */
  #define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
  /**
   * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
   * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
   * The comparison is done on the value converted to uint32_t.
   * @param c 32-bit code point
   * @return 1 or 2
   * @stable ICU 2.4
   */
  #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
  /**
   * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
   * @return 2
   * @stable ICU 2.4
   */
  #define U16_MAX_LENGTH 2
  /**
   * Get a code point from a string at a random-access offset,
   * without changing the offset.
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * The offset may point to either the lead or trail surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the adjacent matching surrogate as well.
   * The result is undefined if the offset points to a single, unpaired surrogate.
   * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
   *
   * No bounds are checked: a lead surrogate causes a read at i+1 and a trail
   * surrogate causes a read at i-1.
   *
   * @param s const UChar * string
   * @param i string offset
   * @param c output UChar32 variable
   * @see U16_GET
   * @stable ICU 2.4
   */
  #define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[i]; \
      if(U16_IS_SURROGATE(c)) { \
          if(U16_IS_SURROGATE_LEAD(c)) { \
              (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
          } else { \
              (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Get a code point from a string at a random-access offset,
   * without changing the offset.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The offset may point to either the lead or trail surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the adjacent matching surrogate as well.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * If the offset points to a single, unpaired surrogate, then
   * c is set to that unpaired surrogate.
   * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
   *
   * The unit after a lead surrogate is only read when i+1 differs from length,
   * and the unit before a trail surrogate is only read when i is greater than start.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<=i<length
   * @param length string length
   * @param c output UChar32 variable
   * @see U16_GET_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[i]; \
      if(U16_IS_SURROGATE(c)) { \
          uint16_t __c2; \
          if(U16_IS_SURROGATE_LEAD(c)) { \
              if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                  (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
              } \
          } else { \
              if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                  (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
              } \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Get a code point from a string at a random-access offset,
   * without changing the offset.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The offset may point to either the lead or trail surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the adjacent matching surrogate as well.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * If the offset points to a single, unpaired surrogate, then
   * c is set to U+FFFD.
   * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
   *
   * The unit after a lead surrogate is only read when i+1 differs from length,
   * and the unit before a trail surrogate is only read when i is greater than start.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<=i<length
   * @param length string length
   * @param c output UChar32 variable
   * @see U16_GET_UNSAFE
   * @stable ICU 60
   */
  #define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[i]; \
      if(U16_IS_SURROGATE(c)) { \
          uint16_t __c2; \
          if(U16_IS_SURROGATE_LEAD(c)) { \
              if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                  (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
              } else { \
                  (c)=0xfffd; \
              } \
          } else { \
              if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                  (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
              } else { \
                  (c)=0xfffd; \
              } \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /* definitions with forward iteration --------------------------------------- */
  /**
   * Get a code point from a string at a code point boundary offset,
   * and advance the offset to the next code point boundary.
   * (Post-incrementing forward iteration.)
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * The offset may point to the lead surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the following trail surrogate as well.
   * If the offset points to a trail surrogate, then that itself
   * will be returned as the code point.
   * The result is undefined if the offset points to a single, unpaired lead surrogate.
   *
   * @param s const UChar * string
   * @param i string offset
   * @param c output UChar32 variable
   * @see U16_NEXT
   * @stable ICU 2.4
   */
  #define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[(i)++]; \
      if(U16_IS_LEAD(c)) { \
          (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Get a code point from a string at a code point boundary offset,
   * and advance the offset to the next code point boundary.
   * (Post-incrementing forward iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * The offset may point to the lead surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the following trail surrogate as well.
   * If the offset points to a trail surrogate or
   * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
   *
   * The unit after a lead surrogate is only examined when the advanced
   * offset differs from length.
   *
   * @param s const UChar * string
   * @param i string offset, must be i<length
   * @param length string length
   * @param c output UChar32 variable
   * @see U16_NEXT_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[(i)++]; \
      if(U16_IS_LEAD(c)) { \
          uint16_t __c2; \
          if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
              ++(i); \
              (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Get a code point from a string at a code point boundary offset,
   * and advance the offset to the next code point boundary.
   * (Post-incrementing forward iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * The offset may point to the lead surrogate unit
   * for a supplementary code point, in which case the macro will read
   * the following trail surrogate as well.
   * If the offset points to a trail surrogate or
   * to a single, unpaired lead surrogate, then c is set to U+FFFD.
   *
   * An unpaired surrogate advances the offset by exactly one unit.
   *
   * @param s const UChar * string
   * @param i string offset, must be i<length
   * @param length string length
   * @param c output UChar32 variable
   * @see U16_NEXT_UNSAFE
   * @stable ICU 60
   */
  #define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[(i)++]; \
      if(U16_IS_SURROGATE(c)) { \
          uint16_t __c2; \
          if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
              ++(i); \
              (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
          } else { \
              (c)=0xfffd; \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Append a code point to a string, overwriting 1 or 2 code units.
   * The offset points to the current end of the string contents
   * and is advanced (post-increment).
   * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
   * Otherwise, the result is undefined.
   *
   * Values up to 0xffff are stored as one unit; anything larger is stored
   * as a lead/trail surrogate pair.
   *
   * @param s UChar * string buffer (written to)
   * @param i string offset
   * @param c code point to append
   * @see U16_APPEND
   * @stable ICU 2.4
   */
  #define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      if((uint32_t)(c)<=0xffff) { \
          (s)[(i)++]=(uint16_t)(c); \
      } else { \
          (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
          (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Append a code point to a string, overwriting 1 or 2 code units.
   * The offset points to the current end of the string contents
   * and is advanced (post-increment).
   * "Safe" macro, checks for a valid code point.
   * If a surrogate pair is written, checks for sufficient space in the string.
   * If the code point is not valid or a trail surrogate does not fit,
   * then isError is set to true.
   *
   * The single-unit path (values up to 0xffff) performs no capacity check;
   * the caller must guarantee i<capacity. On error nothing is written and
   * the offset is left unchanged.
   *
   * @param s UChar * string buffer (written to)
   * @param i string offset, must be i<capacity
   * @param capacity size of the string buffer
   * @param c code point to append
   * @param isError output UBool set to true if an error occurs, otherwise not modified
   * @see U16_APPEND_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
      if((uint32_t)(c)<=0xffff) { \
          (s)[(i)++]=(uint16_t)(c); \
      } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
          (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
          (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
      } else /* c>0x10ffff or not enough space */ { \
          (isError)=true; \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Advance the string offset from one code point boundary to the next.
   * (Post-incrementing iteration.)
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * A lead surrogate advances the offset by two units without inspecting
   * the second unit.
   *
   * @param s const UChar * string
   * @param i string offset
   * @see U16_FWD_1
   * @stable ICU 2.4
   */
  #define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_LEAD((s)[(i)++])) { \
          ++(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Advance the string offset from one code point boundary to the next.
   * (Post-incrementing iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * The offset moves by two units only when a lead surrogate is followed,
   * before length, by a trail surrogate; otherwise it moves by one.
   *
   * @param s const UChar * string
   * @param i string offset, must be i<length
   * @param length string length
   * @see U16_FWD_1_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
          ++(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Advance the string offset from one code point boundary to the n-th next one,
   * i.e., move forward by n code points.
   * (Post-incrementing iteration.)
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * n is evaluated once into an int32_t counter; a value of zero or less
   * leaves the offset unchanged. No string boundary is checked.
   *
   * @param s const UChar * string
   * @param i string offset
   * @param n number of code points to skip
   * @see U16_FWD_N
   * @stable ICU 2.4
   */
  #define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t __N=(n); \
      while(__N>0) { \
          U16_FWD_1_UNSAFE(s, i); \
          --__N; \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Advance the string offset from one code point boundary to the n-th next one,
   * i.e., move forward by n code points.
   * (Post-incrementing iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * Iteration stops early when the offset reaches length or, for a negative
   * length, when the unit at the offset is NUL. n is evaluated once into an
   * int32_t counter; a value of zero or less leaves the offset unchanged.
   *
   * @param s const UChar * string
   * @param i int32_t string offset, must be i<length
   * @param length int32_t string length
   * @param n number of code points to skip
   * @see U16_FWD_N_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t __N=(n); \
      while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
          U16_FWD_1(s, i, length); \
          --__N; \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Adjust a random-access offset to a code point boundary
   * at the start of a code point.
   * If the offset points to the trail surrogate of a surrogate pair,
   * then the offset is decremented.
   * Otherwise, it is not modified.
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * The preceding unit is not inspected: any trail surrogate at the offset
   * causes the decrement.
   *
   * @param s const UChar * string
   * @param i string offset
   * @see U16_SET_CP_START
   * @stable ICU 2.4
   */
  #define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_TRAIL((s)[i])) { \
          --(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Adjust a random-access offset to a code point boundary
   * at the start of a code point.
   * If the offset points to the trail surrogate of a surrogate pair,
   * then the offset is decremented.
   * Otherwise, it is not modified.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The unit before the offset is only read when i is greater than start,
   * and the offset moves only if that unit is a lead surrogate.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<=i
   * @see U16_SET_CP_START_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
          --(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /* definitions with backward iteration -------------------------------------- */
  /**
   * Move the string offset from one code point boundary to the previous one
   * and get the code point between them.
   * (Pre-decrementing backward iteration.)
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * The input offset may be the same as the string length.
   * If the offset is behind a trail surrogate unit
   * for a supplementary code point, then the macro will read
   * the preceding lead surrogate as well.
   * If the offset is behind a lead surrogate, then that itself
   * will be returned as the code point.
   * The result is undefined if the offset is behind a single, unpaired trail surrogate.
   *
   * @param s const UChar * string
   * @param i string offset
   * @param c output UChar32 variable
   * @see U16_PREV
   * @stable ICU 2.4
   */
  #define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[--(i)]; \
      if(U16_IS_TRAIL(c)) { \
          (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the previous one
   * and get the code point between them.
   * (Pre-decrementing backward iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The input offset may be the same as the string length.
   * If the offset is behind a trail surrogate unit
   * for a supplementary code point, then the macro will read
   * the preceding lead surrogate as well.
   * If the offset is behind a lead surrogate or behind a single, unpaired
   * trail surrogate, then c is set to that unpaired surrogate.
   *
   * The unit before a trail surrogate is only read when the decremented
   * offset is still greater than start.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<i
   * @param c output UChar32 variable
   * @see U16_PREV_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[--(i)]; \
      if(U16_IS_TRAIL(c)) { \
          uint16_t __c2; \
          if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
              --(i); \
              (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the previous one
   * and get the code point between them.
   * (Pre-decrementing backward iteration.)
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The input offset may be the same as the string length.
   * If the offset is behind a trail surrogate unit
   * for a supplementary code point, then the macro will read
   * the preceding lead surrogate as well.
   * If the offset is behind a lead surrogate or behind a single, unpaired
   * trail surrogate, then c is set to U+FFFD.
   *
   * An unpaired surrogate moves the offset back by exactly one unit.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<i
   * @param c output UChar32 variable
   * @see U16_PREV_UNSAFE
   * @stable ICU 60
   */
  #define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
      (c)=(s)[--(i)]; \
      if(U16_IS_SURROGATE(c)) { \
          uint16_t __c2; \
          if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
              --(i); \
              (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
          } else { \
              (c)=0xfffd; \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the previous one.
   * (Pre-decrementing backward iteration.)
   * The input offset may be the same as the string length.
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * A trail surrogate moves the offset back by two units without inspecting
   * the unit before it.
   *
   * @param s const UChar * string
   * @param i string offset
   * @see U16_BACK_1
   * @stable ICU 2.4
   */
  #define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_TRAIL((s)[--(i)])) { \
          --(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the previous one.
   * (Pre-decrementing backward iteration.)
   * The input offset may be the same as the string length.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The offset moves back by two units only when a trail surrogate is
   * preceded, after start, by a lead surrogate; otherwise it moves by one.
   *
   * @param s const UChar * string
   * @param start starting string offset (usually 0)
   * @param i string offset, must be start<i
   * @see U16_BACK_1_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
          --(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the n-th one before it,
   * i.e., move backward by n code points.
   * (Pre-decrementing backward iteration.)
   * The input offset may be the same as the string length.
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * n is evaluated once into an int32_t counter; a value of zero or less
   * leaves the offset unchanged. No string boundary is checked.
   *
   * @param s const UChar * string
   * @param i string offset
   * @param n number of code points to skip
   * @see U16_BACK_N
   * @stable ICU 2.4
   */
  #define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t __N=(n); \
      while(__N>0) { \
          U16_BACK_1_UNSAFE(s, i); \
          --__N; \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Move the string offset from one code point boundary to the n-th one before it,
   * i.e., move backward by n code points.
   * (Pre-decrementing backward iteration.)
   * The input offset may be the same as the string length.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * Iteration stops early once the offset is no longer greater than start.
   * n is evaluated once into an int32_t counter; a value of zero or less
   * leaves the offset unchanged.
   *
   * @param s const UChar * string
   * @param start start of string
   * @param i string offset, must be start<i
   * @param n number of code points to skip
   * @see U16_BACK_N_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t __N=(n); \
      while(__N>0 && (i)>(start)) { \
          U16_BACK_1(s, start, i); \
          --__N; \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Adjust a random-access offset to a code point boundary after a code point.
   * If the offset is behind the lead surrogate of a surrogate pair,
   * then the offset is incremented.
   * Otherwise, it is not modified.
   * The input offset may be the same as the string length.
   * "Unsafe" macro, assumes well-formed UTF-16.
   *
   * The unit at i-1 is always read, and the unit at the offset itself is not
   * inspected: any lead surrogate before the offset causes the increment.
   *
   * @param s const UChar * string
   * @param i string offset
   * @see U16_SET_CP_LIMIT
   * @stable ICU 2.4
   */
  #define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
      if(U16_IS_LEAD((s)[(i)-1])) { \
          ++(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  /**
   * Adjust a random-access offset to a code point boundary after a code point.
   * If the offset is behind the lead surrogate of a surrogate pair,
   * then the offset is incremented.
   * Otherwise, it is not modified.
   * The input offset may be the same as the string length.
   * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   *
   * The length can be negative for a NUL-terminated string.
   *
   * Neighbouring units are only read when start<i and either i<length or
   * length is negative; the offset moves only if the unit before it is a
   * lead surrogate and the unit at it is a trail surrogate.
   *
   * @param s const UChar * string
   * @param start int32_t starting string offset (usually 0)
   * @param i int32_t string offset, start<=i<=length
   * @param length int32_t string length
   * @see U16_SET_CP_LIMIT_UNSAFE
   * @stable ICU 2.4
   */
  #define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
      if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
          ++(i); \
      } \
  } UPRV_BLOCK_MACRO_END
  691. #endif