uchar.cpp 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ********************************************************************************
  5. * Copyright (C) 1996-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ********************************************************************************
  8. *
  9. * File UCHAR.C
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 04/02/97 aliu Creation.
  15. * 4/15/99 Madhu Updated all the function definitions for C Implementation
  16. * 5/20/99 Madhu Added the function u_getVersion()
  17. * 8/19/1999 srl Upgraded scripts to Unicode3.0
  18. * 11/11/1999 weiv added u_isalnum(), cleaned comments
  19. * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion.
  20. * 06/20/2000 helena OS/400 port changes; mostly typecast.
  21. ******************************************************************************
  22. */
  23. #include "unicode/utypes.h"
  24. #include "unicode/uchar.h"
  25. #include "unicode/ucptrie.h"
  26. #include "unicode/uscript.h"
  27. #include "unicode/udata.h"
  28. #include "uassert.h"
  29. #include "cmemory.h"
  30. #include "ucln_cmn.h"
  31. #include "utrie2.h"
  32. #include "udataswp.h"
  33. #include "uprops.h"
  34. #include "ustr_imp.h"
  35. /* uchar_props_data.h is machine-generated by genprops --csource */
  36. #define INCLUDED_FROM_UCHAR_C
  37. #include "uchar_props_data.h"
  38. /* constants and macros for access to the data ------------------------------ */
  39. /* getting a uint32_t properties word from the data */
  40. #define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c))
  41. /* API functions ------------------------------------------------------------ */
  42. /* Gets the Unicode character's general category.*/
  43. U_CAPI int8_t U_EXPORT2
  44. u_charType(UChar32 c) {
  45. uint32_t props;
  46. GET_PROPS(c, props);
  47. return (int8_t)GET_CATEGORY(props);
  48. }
/*
 * Enumerate all code points with their general categories.
 *
 * Bundle handed through utrie2_enum() by u_enumCharTypes(): it carries the
 * caller's range callback together with the caller's opaque context pointer
 * so that _enumTypeRange() can forward each range to the caller.
 */
struct _EnumTypeCallback {
    /* caller-supplied function invoked once per enumerated range */
    UCharEnumTypeRange *enumRange;
    /* caller-supplied pointer, passed back to enumRange unchanged */
    const void *context;
};
  54. static uint32_t U_CALLCONV
  55. _enumTypeValue(const void *context, uint32_t value) {
  56. (void)context;
  57. return GET_CATEGORY(value);
  58. }
  59. static UBool U_CALLCONV
  60. _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  61. /* just cast the value to UCharCategory */
  62. return static_cast<const _EnumTypeCallback*>(context)->
  63. enumRange(static_cast<const _EnumTypeCallback*>(context)->context,
  64. start, end + 1, static_cast<UCharCategory>(value));
  65. }
  66. U_CAPI void U_EXPORT2
  67. u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
  68. struct _EnumTypeCallback callback;
  69. if(enumRange==nullptr) {
  70. return;
  71. }
  72. callback.enumRange=enumRange;
  73. callback.context=context;
  74. utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback);
  75. }
  76. /* Checks if ch is a lower case letter.*/
  77. U_CAPI UBool U_EXPORT2
  78. u_islower(UChar32 c) {
  79. uint32_t props;
  80. GET_PROPS(c, props);
  81. return GET_CATEGORY(props)==U_LOWERCASE_LETTER;
  82. }
  83. /* Checks if ch is an upper case letter.*/
  84. U_CAPI UBool U_EXPORT2
  85. u_isupper(UChar32 c) {
  86. uint32_t props;
  87. GET_PROPS(c, props);
  88. return GET_CATEGORY(props)==U_UPPERCASE_LETTER;
  89. }
  90. /* Checks if ch is a title case letter; usually upper case letters.*/
  91. U_CAPI UBool U_EXPORT2
  92. u_istitle(UChar32 c) {
  93. uint32_t props;
  94. GET_PROPS(c, props);
  95. return GET_CATEGORY(props)==U_TITLECASE_LETTER;
  96. }
  97. /* Checks if ch is a decimal digit. */
  98. U_CAPI UBool U_EXPORT2
  99. u_isdigit(UChar32 c) {
  100. uint32_t props;
  101. GET_PROPS(c, props);
  102. return GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER;
  103. }
  104. U_CAPI UBool U_EXPORT2
  105. u_isxdigit(UChar32 c) {
  106. uint32_t props;
  107. /* check ASCII and Fullwidth ASCII a-fA-F */
  108. if(
  109. (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
  110. (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
  111. ) {
  112. return true;
  113. }
  114. GET_PROPS(c, props);
  115. return GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER;
  116. }
  117. /* Checks if the Unicode character is a letter.*/
  118. U_CAPI UBool U_EXPORT2
  119. u_isalpha(UChar32 c) {
  120. uint32_t props;
  121. GET_PROPS(c, props);
  122. return (CAT_MASK(props)&U_GC_L_MASK)!=0;
  123. }
  124. U_CAPI UBool U_EXPORT2
  125. u_isUAlphabetic(UChar32 c) {
  126. return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
  127. }
  128. /* Checks if c is a letter or a decimal digit */
  129. U_CAPI UBool U_EXPORT2
  130. u_isalnum(UChar32 c) {
  131. uint32_t props;
  132. GET_PROPS(c, props);
  133. return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0;
  134. }
  135. /**
  136. * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
  137. * @internal
  138. */
  139. U_CFUNC UBool
  140. u_isalnumPOSIX(UChar32 c) {
  141. return u_isUAlphabetic(c) || u_isdigit(c);
  142. }
  143. /* Checks if ch is a unicode character with assigned character type.*/
  144. U_CAPI UBool U_EXPORT2
  145. u_isdefined(UChar32 c) {
  146. uint32_t props;
  147. GET_PROPS(c, props);
  148. return GET_CATEGORY(props)!=0;
  149. }
  150. /* Checks if the Unicode character is a base form character that can take a diacritic.*/
  151. U_CAPI UBool U_EXPORT2
  152. u_isbase(UChar32 c) {
  153. uint32_t props;
  154. GET_PROPS(c, props);
  155. return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0;
  156. }
  157. /* Checks if the Unicode character is a control character.*/
  158. U_CAPI UBool U_EXPORT2
  159. u_iscntrl(UChar32 c) {
  160. uint32_t props;
  161. GET_PROPS(c, props);
  162. return (CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0;
  163. }
  164. U_CAPI UBool U_EXPORT2
  165. u_isISOControl(UChar32 c) {
  166. return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f);
  167. }
  168. /* Some control characters that are used as space. */
  169. #define IS_THAT_CONTROL_SPACE(c) \
  170. (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==0x85))
  171. /* Java has decided that U+0085 New Line is not whitespace any more. */
  172. #define IS_THAT_ASCII_CONTROL_SPACE(c) \
  173. (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c))
  174. /* Checks if the Unicode character is a space character.*/
  175. U_CAPI UBool U_EXPORT2
  176. u_isspace(UChar32 c) {
  177. uint32_t props;
  178. GET_PROPS(c, props);
  179. return (CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c);
  180. }
  181. U_CAPI UBool U_EXPORT2
  182. u_isJavaSpaceChar(UChar32 c) {
  183. uint32_t props;
  184. GET_PROPS(c, props);
  185. return (CAT_MASK(props)&U_GC_Z_MASK)!=0;
  186. }
  187. /* Checks if the Unicode character is a whitespace character.*/
  188. U_CAPI UBool U_EXPORT2
  189. u_isWhitespace(UChar32 c) {
  190. uint32_t props;
  191. GET_PROPS(c, props);
  192. return ((CAT_MASK(props)&U_GC_Z_MASK)!=0 &&
  193. c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */
  194. IS_THAT_ASCII_CONTROL_SPACE(c);
  195. }
  196. U_CAPI UBool U_EXPORT2
  197. u_isblank(UChar32 c) {
  198. if((uint32_t)c<=0x9f) {
  199. return c==9 || c==0x20; /* TAB or SPACE */
  200. } else {
  201. /* Zs */
  202. uint32_t props;
  203. GET_PROPS(c, props);
  204. return GET_CATEGORY(props)==U_SPACE_SEPARATOR;
  205. }
  206. }
  207. U_CAPI UBool U_EXPORT2
  208. u_isUWhiteSpace(UChar32 c) {
  209. return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
  210. }
  211. /* Checks if the Unicode character is printable.*/
  212. U_CAPI UBool U_EXPORT2
  213. u_isprint(UChar32 c) {
  214. uint32_t props;
  215. GET_PROPS(c, props);
  216. /* comparing ==0 returns false for the categories mentioned */
  217. return (CAT_MASK(props)&U_GC_C_MASK)==0;
  218. }
  219. /**
  220. * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
  221. * Implements UCHAR_POSIX_PRINT.
  222. * @internal
  223. */
  224. U_CFUNC UBool
  225. u_isprintPOSIX(UChar32 c) {
  226. uint32_t props;
  227. GET_PROPS(c, props);
  228. /*
  229. * The only cntrl character in graph+blank is TAB (in blank).
  230. * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
  231. */
  232. return (GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c);
  233. }
  234. U_CAPI UBool U_EXPORT2
  235. u_isgraph(UChar32 c) {
  236. uint32_t props;
  237. GET_PROPS(c, props);
  238. /* comparing ==0 returns false for the categories mentioned */
  239. return (CAT_MASK(props)&
  240. (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
  241. ==0;
  242. }
  243. /**
  244. * Checks if c is in
  245. * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
  246. * with space=\p{Whitespace} and Control=Cc.
  247. * Implements UCHAR_POSIX_GRAPH.
  248. * @internal
  249. */
  250. U_CFUNC UBool
  251. u_isgraphPOSIX(UChar32 c) {
  252. uint32_t props;
  253. GET_PROPS(c, props);
  254. /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
  255. /* comparing ==0 returns false for the categories mentioned */
  256. return (CAT_MASK(props)&
  257. (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
  258. ==0;
  259. }
  260. U_CAPI UBool U_EXPORT2
  261. u_ispunct(UChar32 c) {
  262. uint32_t props;
  263. GET_PROPS(c, props);
  264. return (CAT_MASK(props)&U_GC_P_MASK)!=0;
  265. }
  266. /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
  267. U_CAPI UBool U_EXPORT2
  268. u_isIDIgnorable(UChar32 c) {
  269. if(c<=0x9f) {
  270. return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c);
  271. } else {
  272. uint32_t props;
  273. GET_PROPS(c, props);
  274. return GET_CATEGORY(props)==U_FORMAT_CHAR;
  275. }
  276. }
  277. /*Checks if the Unicode character can start a Java identifier.*/
  278. U_CAPI UBool U_EXPORT2
  279. u_isJavaIDStart(UChar32 c) {
  280. uint32_t props;
  281. GET_PROPS(c, props);
  282. return (CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0;
  283. }
  284. /*Checks if the Unicode character can be a Java identifier part other than starting the
  285. * identifier.
  286. */
  287. U_CAPI UBool U_EXPORT2
  288. u_isJavaIDPart(UChar32 c) {
  289. uint32_t props;
  290. GET_PROPS(c, props);
  291. return (CAT_MASK(props)&
  292. (U_GC_ND_MASK|U_GC_NL_MASK|
  293. U_GC_L_MASK|
  294. U_GC_SC_MASK|U_GC_PC_MASK|
  295. U_GC_MC_MASK|U_GC_MN_MASK)
  296. )!=0 ||
  297. u_isIDIgnorable(c);
  298. }
  299. U_CAPI int32_t U_EXPORT2
  300. u_charDigitValue(UChar32 c) {
  301. uint32_t props;
  302. int32_t value;
  303. GET_PROPS(c, props);
  304. value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START;
  305. if(value<=9) {
  306. return value;
  307. } else {
  308. return -1;
  309. }
  310. }
  311. U_CAPI double U_EXPORT2
  312. u_getNumericValue(UChar32 c) {
  313. uint32_t props;
  314. int32_t ntv;
  315. GET_PROPS(c, props);
  316. ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props);
  317. if(ntv==UPROPS_NTV_NONE) {
  318. return U_NO_NUMERIC_VALUE;
  319. } else if(ntv<UPROPS_NTV_DIGIT_START) {
  320. /* decimal digit */
  321. return ntv-UPROPS_NTV_DECIMAL_START;
  322. } else if(ntv<UPROPS_NTV_NUMERIC_START) {
  323. /* other digit */
  324. return ntv-UPROPS_NTV_DIGIT_START;
  325. } else if(ntv<UPROPS_NTV_FRACTION_START) {
  326. /* small integer */
  327. return ntv-UPROPS_NTV_NUMERIC_START;
  328. } else if(ntv<UPROPS_NTV_LARGE_START) {
  329. /* fraction */
  330. int32_t numerator=(ntv>>4)-12;
  331. int32_t denominator=(ntv&0xf)+1;
  332. return (double)numerator/denominator;
  333. } else if(ntv<UPROPS_NTV_BASE60_START) {
  334. /* large, single-significant-digit integer */
  335. double numValue;
  336. int32_t mant=(ntv>>5)-14;
  337. int32_t exp=(ntv&0x1f)+2;
  338. numValue=mant;
  339. /* multiply by 10^exp without math.h */
  340. while(exp>=4) {
  341. numValue*=10000.;
  342. exp-=4;
  343. }
  344. switch(exp) {
  345. case 3:
  346. numValue*=1000.;
  347. break;
  348. case 2:
  349. numValue*=100.;
  350. break;
  351. case 1:
  352. numValue*=10.;
  353. break;
  354. case 0:
  355. default:
  356. break;
  357. }
  358. return numValue;
  359. } else if(ntv<UPROPS_NTV_FRACTION20_START) {
  360. /* sexagesimal (base 60) integer */
  361. int32_t numValue=(ntv>>2)-0xbf;
  362. int32_t exp=(ntv&3)+1;
  363. switch(exp) {
  364. case 4:
  365. numValue*=60*60*60*60;
  366. break;
  367. case 3:
  368. numValue*=60*60*60;
  369. break;
  370. case 2:
  371. numValue*=60*60;
  372. break;
  373. case 1:
  374. numValue*=60;
  375. break;
  376. case 0:
  377. default:
  378. break;
  379. }
  380. return numValue;
  381. } else if(ntv<UPROPS_NTV_FRACTION32_START) {
  382. // fraction-20 e.g. 3/80
  383. int32_t frac20=ntv-UPROPS_NTV_FRACTION20_START; // 0..0x17
  384. int32_t numerator=2*(frac20&3)+1;
  385. int32_t denominator=20<<(frac20>>2);
  386. return (double)numerator/denominator;
  387. } else if(ntv<UPROPS_NTV_RESERVED_START) {
  388. // fraction-32 e.g. 3/64
  389. int32_t frac32=ntv-UPROPS_NTV_FRACTION32_START; // 0..15
  390. int32_t numerator=2*(frac32&3)+1;
  391. int32_t denominator=32<<(frac32>>2);
  392. return (double)numerator/denominator;
  393. } else {
  394. /* reserved */
  395. return U_NO_NUMERIC_VALUE;
  396. }
  397. }
  398. U_CAPI int32_t U_EXPORT2
  399. u_digit(UChar32 ch, int8_t radix) {
  400. int8_t value;
  401. if((uint8_t)(radix-2)<=(36-2)) {
  402. value=(int8_t)u_charDigitValue(ch);
  403. if(value<0) {
  404. /* ch is not a decimal digit, try latin letters */
  405. if(ch>=0x61 && ch<=0x7A) {
  406. value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */
  407. } else if(ch>=0x41 && ch<=0x5A) {
  408. value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */
  409. } else if(ch>=0xFF41 && ch<=0xFF5A) {
  410. value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */
  411. } else if(ch>=0xFF21 && ch<=0xFF3A) {
  412. value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */
  413. }
  414. }
  415. } else {
  416. value=-1; /* invalid radix */
  417. }
  418. return (int8_t)((value<radix) ? value : -1);
  419. }
  420. U_CAPI UChar32 U_EXPORT2
  421. u_forDigit(int32_t digit, int8_t radix) {
  422. if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) {
  423. return 0;
  424. } else if(digit<10) {
  425. return (UChar32)(0x30+digit);
  426. } else {
  427. return (UChar32)((0x61-10)+digit);
  428. }
  429. }
  430. /* miscellaneous, and support for uprops.cpp -------------------------------- */
  431. U_CAPI void U_EXPORT2
  432. u_getUnicodeVersion(UVersionInfo versionArray) {
  433. if(versionArray!=nullptr) {
  434. uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
  435. }
  436. }
  437. U_CFUNC uint32_t
  438. u_getMainProperties(UChar32 c) {
  439. uint32_t props;
  440. GET_PROPS(c, props);
  441. return props;
  442. }
  443. U_CFUNC uint32_t
  444. u_getUnicodeProperties(UChar32 c, int32_t column) {
  445. U_ASSERT(column>=0);
  446. if(column>=propsVectorsColumns) {
  447. return 0;
  448. } else {
  449. uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
  450. return propsVectors[vecIndex+column];
  451. }
  452. }
  453. U_CFUNC int32_t
  454. uprv_getMaxValues(int32_t column) {
  455. switch(column) {
  456. case 0:
  457. return indexes[UPROPS_MAX_VALUES_INDEX];
  458. case 2:
  459. return indexes[UPROPS_MAX_VALUES_2_INDEX];
  460. case UPROPS_MAX_VALUES_OTHER_INDEX:
  461. return indexes[column];
  462. default:
  463. return 0;
  464. }
  465. }
  466. U_CAPI void U_EXPORT2
  467. u_charAge(UChar32 c, UVersionInfo versionArray) {
  468. if(versionArray!=nullptr) {
  469. uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
  470. versionArray[0]=(uint8_t)(version>>2);
  471. versionArray[1]=(uint8_t)(version&3);
  472. versionArray[2]=versionArray[3]=0;
  473. }
  474. }
  475. U_CAPI UScriptCode U_EXPORT2
  476. uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
  477. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  478. return USCRIPT_INVALID_CODE;
  479. }
  480. if((uint32_t)c>0x10ffff) {
  481. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  482. return USCRIPT_INVALID_CODE;
  483. }
  484. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  485. uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
  486. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  487. return (UScriptCode)codeOrIndex;
  488. } else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
  489. return USCRIPT_COMMON;
  490. } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
  491. return USCRIPT_INHERITED;
  492. } else {
  493. return (UScriptCode)scriptExtensions[codeOrIndex];
  494. }
  495. }
  496. U_CAPI UBool U_EXPORT2
  497. uscript_hasScript(UChar32 c, UScriptCode sc) UPRV_NO_SANITIZE_UNDEFINED {
  498. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  499. uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
  500. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  501. return sc==(UScriptCode)codeOrIndex;
  502. }
  503. const uint16_t *scx=scriptExtensions+codeOrIndex;
  504. if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
  505. scx=scriptExtensions+scx[1];
  506. }
  507. uint32_t sc32=sc;
  508. if(sc32>0x7fff) {
  509. /* Guard against bogus input that would make us go past the Script_Extensions terminator. */
  510. return false;
  511. }
  512. while(sc32>*scx) {
  513. ++scx;
  514. }
  515. return sc32==(*scx&0x7fff);
  516. }
  517. U_CAPI int32_t U_EXPORT2
  518. uscript_getScriptExtensions(UChar32 c,
  519. UScriptCode *scripts, int32_t capacity,
  520. UErrorCode *pErrorCode) {
  521. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  522. return 0;
  523. }
  524. if(capacity<0 || (capacity>0 && scripts==nullptr)) {
  525. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  526. return 0;
  527. }
  528. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  529. uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
  530. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  531. if(capacity==0) {
  532. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  533. } else {
  534. scripts[0]=(UScriptCode)codeOrIndex;
  535. }
  536. return 1;
  537. }
  538. const uint16_t *scx=scriptExtensions+codeOrIndex;
  539. if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
  540. scx=scriptExtensions+scx[1];
  541. }
  542. int32_t length=0;
  543. uint16_t sx;
  544. do {
  545. sx=*scx++;
  546. if(length<capacity) {
  547. scripts[length]=(UScriptCode)(sx&0x7fff);
  548. }
  549. ++length;
  550. } while(sx<0x8000);
  551. if(length>capacity) {
  552. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  553. }
  554. return length;
  555. }
  556. U_CAPI UBlockCode U_EXPORT2
  557. ublock_getCode(UChar32 c) {
  558. // We store Block values indexed by the code point shifted right 4 bits
  559. // and use a "small" UCPTrie=CodePointTrie for minimal data size.
  560. // This works because blocks have xxx0..xxxF ranges.
  561. uint32_t c4 = c; // unsigned so that shifting right does not worry the compiler
  562. // Shift unless out of range, in which case we fetch the trie's error value.
  563. if (c4 <= 0x10ffff) {
  564. c4 >>= 4;
  565. }
  566. return (UBlockCode)ucptrie_get(&block_trie, c4);
  567. }
  568. /* property starts for UnicodeSet ------------------------------------------- */
  569. static UBool U_CALLCONV
  570. _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  571. /* add the start code point to the USet */
  572. const USetAdder* sa = static_cast<const USetAdder*>(context);
  573. sa->add(sa->set, start);
  574. (void)end;
  575. (void)value;
  576. return true;
  577. }
  578. #define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1)
  579. U_CFUNC void U_EXPORT2
  580. uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
  581. if(U_FAILURE(*pErrorCode)) {
  582. return;
  583. }
  584. /* add the start code point of each same-value range of the main trie */
  585. utrie2_enum(&propsTrie, nullptr, _enumPropertyStartsRange, sa);
  586. /* add code points with hardcoded properties, plus the ones following them */
  587. /* add for u_isblank() */
  588. USET_ADD_CP_AND_NEXT(sa, TAB);
  589. /* add for IS_THAT_CONTROL_SPACE() */
  590. sa->add(sa->set, CR+1); /* range TAB..CR */
  591. sa->add(sa->set, 0x1c);
  592. sa->add(sa->set, 0x1f+1);
  593. USET_ADD_CP_AND_NEXT(sa, 0x85); // NEXT LINE (NEL)
  594. /* add for u_isIDIgnorable() what was not added above */
  595. sa->add(sa->set, 0x7f); /* range DEL..NBSP-1, NBSP added below */
  596. sa->add(sa->set, HAIRSP);
  597. sa->add(sa->set, RLM+1);
  598. sa->add(sa->set, 0x206a); // INHIBIT SYMMETRIC SWAPPING
  599. sa->add(sa->set, 0x206f+1); // NOMINAL DIGIT SHAPES
  600. USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
  601. /* add no-break spaces for u_isWhitespace() what was not added above */
  602. USET_ADD_CP_AND_NEXT(sa, NBSP);
  603. USET_ADD_CP_AND_NEXT(sa, FIGURESP);
  604. USET_ADD_CP_AND_NEXT(sa, NNBSP);
  605. /* add for u_digit() */
  606. sa->add(sa->set, u'a');
  607. sa->add(sa->set, u'z'+1);
  608. sa->add(sa->set, u'A');
  609. sa->add(sa->set, u'Z'+1);
  610. // fullwidth
  611. sa->add(sa->set, u'a');
  612. sa->add(sa->set, u'z'+1);
  613. sa->add(sa->set, u'A');
  614. sa->add(sa->set, u'Z'+1);
  615. /* add for u_isxdigit() */
  616. sa->add(sa->set, u'f'+1);
  617. sa->add(sa->set, u'F'+1);
  618. // fullwidth
  619. sa->add(sa->set, u'f'+1);
  620. sa->add(sa->set, u'F'+1);
  621. /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
  622. sa->add(sa->set, 0x2060); /* range 2060..206f */
  623. sa->add(sa->set, 0xfff0);
  624. sa->add(sa->set, 0xfffb+1);
  625. sa->add(sa->set, 0xe0000);
  626. sa->add(sa->set, 0xe0fff+1);
  627. /* add for UCHAR_GRAPHEME_BASE and others */
  628. USET_ADD_CP_AND_NEXT(sa, CGJ);
  629. }
  630. U_CFUNC void U_EXPORT2
  631. upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
  632. if(U_FAILURE(*pErrorCode)) {
  633. return;
  634. }
  635. /* add the start code point of each same-value range of the properties vectors trie */
  636. utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa);
  637. }
  638. U_CFUNC void U_EXPORT2
  639. ublock_addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) {
  640. // Add the start code point of each same-value range of the trie.
  641. // We store Block values indexed by the code point shifted right 4 bits;
  642. // see ublock_getCode().
  643. UChar32 start = 0, end;
  644. uint32_t value;
  645. while (start < 0x11000 && // limit: (max code point + 1) >> 4
  646. (end = ucptrie_getRange(&block_trie, start, UCPMAP_RANGE_NORMAL, 0,
  647. nullptr, nullptr, &value)) >= 0) {
  648. sa->add(sa->set, start << 4);
  649. start = end + 1;
  650. }
  651. }