uchar.cpp 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ********************************************************************************
  5. * Copyright (C) 1996-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ********************************************************************************
  8. *
  9. * File UCHAR.C
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 04/02/97 aliu Creation.
  15. * 4/15/99 Madhu Updated all the function definitions for C Implementation
  16. * 5/20/99 Madhu Added the function u_getVersion()
  17. * 8/19/1999 srl Upgraded scripts to Unicode3.0
  18. * 11/11/1999 weiv added u_isalnum(), cleaned comments
  19. * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion.
  20. * 06/20/2000 helena OS/400 port changes; mostly typecast.
  21. ******************************************************************************
  22. */
  23. #include "unicode/utypes.h"
  24. #include "unicode/uchar.h"
  25. #include "unicode/uscript.h"
  26. #include "unicode/udata.h"
  27. #include "uassert.h"
  28. #include "cmemory.h"
  29. #include "ucln_cmn.h"
  30. #include "utrie2.h"
  31. #include "udataswp.h"
  32. #include "uprops.h"
  33. #include "ustr_imp.h"
  34. /* uchar_props_data.h is machine-generated by genprops --csource */
  35. #define INCLUDED_FROM_UCHAR_C
  36. #include "uchar_props_data.h"
  37. /* constants and macros for access to the data ------------------------------ */
/*
 * Look up code point c in propsTrie with UTRIE2_GET16() and assign the
 * value to result. The functions in this file pass a uint32_t lvalue as
 * result. The macro expands to the assignment expression itself.
 */
#define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c))
  40. /* API functions ------------------------------------------------------------ */
  41. /* Gets the Unicode character's general category.*/
  42. U_CAPI int8_t U_EXPORT2
  43. u_charType(UChar32 c) {
  44. uint32_t props;
  45. GET_PROPS(c, props);
  46. return (int8_t)GET_CATEGORY(props);
  47. }
/*
 * Enumerate all code points with their general categories.
 *
 * Bundle that u_enumCharTypes() hands to utrie2_enum() as its context:
 * the caller's range callback plus the opaque pointer that
 * _enumTypeRange() passes back to that callback.
 */
struct _EnumTypeCallback {
    UCharEnumTypeRange *enumRange;  /* caller-supplied range callback */
    const void *context;            /* caller-supplied pointer, forwarded unchanged */
};
  53. static uint32_t U_CALLCONV
  54. _enumTypeValue(const void *context, uint32_t value) {
  55. (void)context;
  56. return GET_CATEGORY(value);
  57. }
  58. static UBool U_CALLCONV
  59. _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  60. /* just cast the value to UCharCategory */
  61. return ((struct _EnumTypeCallback *)context)->
  62. enumRange(((struct _EnumTypeCallback *)context)->context,
  63. start, end+1, (UCharCategory)value);
  64. }
  65. U_CAPI void U_EXPORT2
  66. u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
  67. struct _EnumTypeCallback callback;
  68. if(enumRange==nullptr) {
  69. return;
  70. }
  71. callback.enumRange=enumRange;
  72. callback.context=context;
  73. utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback);
  74. }
  75. /* Checks if ch is a lower case letter.*/
  76. U_CAPI UBool U_EXPORT2
  77. u_islower(UChar32 c) {
  78. uint32_t props;
  79. GET_PROPS(c, props);
  80. return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
  81. }
  82. /* Checks if ch is an upper case letter.*/
  83. U_CAPI UBool U_EXPORT2
  84. u_isupper(UChar32 c) {
  85. uint32_t props;
  86. GET_PROPS(c, props);
  87. return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
  88. }
  89. /* Checks if ch is a title case letter; usually upper case letters.*/
  90. U_CAPI UBool U_EXPORT2
  91. u_istitle(UChar32 c) {
  92. uint32_t props;
  93. GET_PROPS(c, props);
  94. return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
  95. }
  96. /* Checks if ch is a decimal digit. */
  97. U_CAPI UBool U_EXPORT2
  98. u_isdigit(UChar32 c) {
  99. uint32_t props;
  100. GET_PROPS(c, props);
  101. return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
  102. }
  103. U_CAPI UBool U_EXPORT2
  104. u_isxdigit(UChar32 c) {
  105. uint32_t props;
  106. /* check ASCII and Fullwidth ASCII a-fA-F */
  107. if(
  108. (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
  109. (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
  110. ) {
  111. return true;
  112. }
  113. GET_PROPS(c, props);
  114. return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
  115. }
  116. /* Checks if the Unicode character is a letter.*/
  117. U_CAPI UBool U_EXPORT2
  118. u_isalpha(UChar32 c) {
  119. uint32_t props;
  120. GET_PROPS(c, props);
  121. return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0);
  122. }
  123. U_CAPI UBool U_EXPORT2
  124. u_isUAlphabetic(UChar32 c) {
  125. return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
  126. }
  127. /* Checks if c is a letter or a decimal digit */
  128. U_CAPI UBool U_EXPORT2
  129. u_isalnum(UChar32 c) {
  130. uint32_t props;
  131. GET_PROPS(c, props);
  132. return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
  133. }
  134. /**
  135. * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
  136. * @internal
  137. */
  138. U_CFUNC UBool
  139. u_isalnumPOSIX(UChar32 c) {
  140. return (UBool)(u_isUAlphabetic(c) || u_isdigit(c));
  141. }
  142. /* Checks if ch is a unicode character with assigned character type.*/
  143. U_CAPI UBool U_EXPORT2
  144. u_isdefined(UChar32 c) {
  145. uint32_t props;
  146. GET_PROPS(c, props);
  147. return (UBool)(GET_CATEGORY(props)!=0);
  148. }
  149. /* Checks if the Unicode character is a base form character that can take a diacritic.*/
  150. U_CAPI UBool U_EXPORT2
  151. u_isbase(UChar32 c) {
  152. uint32_t props;
  153. GET_PROPS(c, props);
  154. return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0);
  155. }
  156. /* Checks if the Unicode character is a control character.*/
  157. U_CAPI UBool U_EXPORT2
  158. u_iscntrl(UChar32 c) {
  159. uint32_t props;
  160. GET_PROPS(c, props);
  161. return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0);
  162. }
  163. U_CAPI UBool U_EXPORT2
  164. u_isISOControl(UChar32 c) {
  165. return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f);
  166. }
  167. /* Some control characters that are used as space. */
  168. #define IS_THAT_CONTROL_SPACE(c) \
  169. (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==0x85))
  170. /* Java has decided that U+0085 New Line is not whitespace any more. */
  171. #define IS_THAT_ASCII_CONTROL_SPACE(c) \
  172. (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c))
  173. /* Checks if the Unicode character is a space character.*/
  174. U_CAPI UBool U_EXPORT2
  175. u_isspace(UChar32 c) {
  176. uint32_t props;
  177. GET_PROPS(c, props);
  178. return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c));
  179. }
  180. U_CAPI UBool U_EXPORT2
  181. u_isJavaSpaceChar(UChar32 c) {
  182. uint32_t props;
  183. GET_PROPS(c, props);
  184. return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0);
  185. }
  186. /* Checks if the Unicode character is a whitespace character.*/
  187. U_CAPI UBool U_EXPORT2
  188. u_isWhitespace(UChar32 c) {
  189. uint32_t props;
  190. GET_PROPS(c, props);
  191. return (UBool)(
  192. ((CAT_MASK(props)&U_GC_Z_MASK)!=0 &&
  193. c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */
  194. IS_THAT_ASCII_CONTROL_SPACE(c)
  195. );
  196. }
  197. U_CAPI UBool U_EXPORT2
  198. u_isblank(UChar32 c) {
  199. if((uint32_t)c<=0x9f) {
  200. return c==9 || c==0x20; /* TAB or SPACE */
  201. } else {
  202. /* Zs */
  203. uint32_t props;
  204. GET_PROPS(c, props);
  205. return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
  206. }
  207. }
  208. U_CAPI UBool U_EXPORT2
  209. u_isUWhiteSpace(UChar32 c) {
  210. return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
  211. }
  212. /* Checks if the Unicode character is printable.*/
  213. U_CAPI UBool U_EXPORT2
  214. u_isprint(UChar32 c) {
  215. uint32_t props;
  216. GET_PROPS(c, props);
  217. /* comparing ==0 returns false for the categories mentioned */
  218. return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
  219. }
  220. /**
  221. * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
  222. * Implements UCHAR_POSIX_PRINT.
  223. * @internal
  224. */
  225. U_CFUNC UBool
  226. u_isprintPOSIX(UChar32 c) {
  227. uint32_t props;
  228. GET_PROPS(c, props);
  229. /*
  230. * The only cntrl character in graph+blank is TAB (in blank).
  231. * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
  232. */
  233. return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
  234. }
  235. U_CAPI UBool U_EXPORT2
  236. u_isgraph(UChar32 c) {
  237. uint32_t props;
  238. GET_PROPS(c, props);
  239. /* comparing ==0 returns false for the categories mentioned */
  240. return (UBool)((CAT_MASK(props)&
  241. (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
  242. ==0);
  243. }
  244. /**
  245. * Checks if c is in
  246. * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
  247. * with space=\p{Whitespace} and Control=Cc.
  248. * Implements UCHAR_POSIX_GRAPH.
  249. * @internal
  250. */
  251. U_CFUNC UBool
  252. u_isgraphPOSIX(UChar32 c) {
  253. uint32_t props;
  254. GET_PROPS(c, props);
  255. /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
  256. /* comparing ==0 returns false for the categories mentioned */
  257. return (UBool)((CAT_MASK(props)&
  258. (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
  259. ==0);
  260. }
  261. U_CAPI UBool U_EXPORT2
  262. u_ispunct(UChar32 c) {
  263. uint32_t props;
  264. GET_PROPS(c, props);
  265. return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
  266. }
  267. /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
  268. U_CAPI UBool U_EXPORT2
  269. u_isIDIgnorable(UChar32 c) {
  270. if(c<=0x9f) {
  271. return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c);
  272. } else {
  273. uint32_t props;
  274. GET_PROPS(c, props);
  275. return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR);
  276. }
  277. }
  278. /*Checks if the Unicode character can start a Java identifier.*/
  279. U_CAPI UBool U_EXPORT2
  280. u_isJavaIDStart(UChar32 c) {
  281. uint32_t props;
  282. GET_PROPS(c, props);
  283. return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0);
  284. }
  285. /*Checks if the Unicode character can be a Java identifier part other than starting the
  286. * identifier.
  287. */
  288. U_CAPI UBool U_EXPORT2
  289. u_isJavaIDPart(UChar32 c) {
  290. uint32_t props;
  291. GET_PROPS(c, props);
  292. return (UBool)(
  293. (CAT_MASK(props)&
  294. (U_GC_ND_MASK|U_GC_NL_MASK|
  295. U_GC_L_MASK|
  296. U_GC_SC_MASK|U_GC_PC_MASK|
  297. U_GC_MC_MASK|U_GC_MN_MASK)
  298. )!=0 ||
  299. u_isIDIgnorable(c));
  300. }
  301. U_CAPI int32_t U_EXPORT2
  302. u_charDigitValue(UChar32 c) {
  303. uint32_t props;
  304. int32_t value;
  305. GET_PROPS(c, props);
  306. value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START;
  307. if(value<=9) {
  308. return value;
  309. } else {
  310. return -1;
  311. }
  312. }
  313. U_CAPI double U_EXPORT2
  314. u_getNumericValue(UChar32 c) {
  315. uint32_t props;
  316. int32_t ntv;
  317. GET_PROPS(c, props);
  318. ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props);
  319. if(ntv==UPROPS_NTV_NONE) {
  320. return U_NO_NUMERIC_VALUE;
  321. } else if(ntv<UPROPS_NTV_DIGIT_START) {
  322. /* decimal digit */
  323. return ntv-UPROPS_NTV_DECIMAL_START;
  324. } else if(ntv<UPROPS_NTV_NUMERIC_START) {
  325. /* other digit */
  326. return ntv-UPROPS_NTV_DIGIT_START;
  327. } else if(ntv<UPROPS_NTV_FRACTION_START) {
  328. /* small integer */
  329. return ntv-UPROPS_NTV_NUMERIC_START;
  330. } else if(ntv<UPROPS_NTV_LARGE_START) {
  331. /* fraction */
  332. int32_t numerator=(ntv>>4)-12;
  333. int32_t denominator=(ntv&0xf)+1;
  334. return (double)numerator/denominator;
  335. } else if(ntv<UPROPS_NTV_BASE60_START) {
  336. /* large, single-significant-digit integer */
  337. double numValue;
  338. int32_t mant=(ntv>>5)-14;
  339. int32_t exp=(ntv&0x1f)+2;
  340. numValue=mant;
  341. /* multiply by 10^exp without math.h */
  342. while(exp>=4) {
  343. numValue*=10000.;
  344. exp-=4;
  345. }
  346. switch(exp) {
  347. case 3:
  348. numValue*=1000.;
  349. break;
  350. case 2:
  351. numValue*=100.;
  352. break;
  353. case 1:
  354. numValue*=10.;
  355. break;
  356. case 0:
  357. default:
  358. break;
  359. }
  360. return numValue;
  361. } else if(ntv<UPROPS_NTV_FRACTION20_START) {
  362. /* sexagesimal (base 60) integer */
  363. int32_t numValue=(ntv>>2)-0xbf;
  364. int32_t exp=(ntv&3)+1;
  365. switch(exp) {
  366. case 4:
  367. numValue*=60*60*60*60;
  368. break;
  369. case 3:
  370. numValue*=60*60*60;
  371. break;
  372. case 2:
  373. numValue*=60*60;
  374. break;
  375. case 1:
  376. numValue*=60;
  377. break;
  378. case 0:
  379. default:
  380. break;
  381. }
  382. return numValue;
  383. } else if(ntv<UPROPS_NTV_FRACTION32_START) {
  384. // fraction-20 e.g. 3/80
  385. int32_t frac20=ntv-UPROPS_NTV_FRACTION20_START; // 0..0x17
  386. int32_t numerator=2*(frac20&3)+1;
  387. int32_t denominator=20<<(frac20>>2);
  388. return (double)numerator/denominator;
  389. } else if(ntv<UPROPS_NTV_RESERVED_START) {
  390. // fraction-32 e.g. 3/64
  391. int32_t frac32=ntv-UPROPS_NTV_FRACTION32_START; // 0..15
  392. int32_t numerator=2*(frac32&3)+1;
  393. int32_t denominator=32<<(frac32>>2);
  394. return (double)numerator/denominator;
  395. } else {
  396. /* reserved */
  397. return U_NO_NUMERIC_VALUE;
  398. }
  399. }
  400. U_CAPI int32_t U_EXPORT2
  401. u_digit(UChar32 ch, int8_t radix) {
  402. int8_t value;
  403. if((uint8_t)(radix-2)<=(36-2)) {
  404. value=(int8_t)u_charDigitValue(ch);
  405. if(value<0) {
  406. /* ch is not a decimal digit, try latin letters */
  407. if(ch>=0x61 && ch<=0x7A) {
  408. value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */
  409. } else if(ch>=0x41 && ch<=0x5A) {
  410. value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */
  411. } else if(ch>=0xFF41 && ch<=0xFF5A) {
  412. value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */
  413. } else if(ch>=0xFF21 && ch<=0xFF3A) {
  414. value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */
  415. }
  416. }
  417. } else {
  418. value=-1; /* invalid radix */
  419. }
  420. return (int8_t)((value<radix) ? value : -1);
  421. }
  422. U_CAPI UChar32 U_EXPORT2
  423. u_forDigit(int32_t digit, int8_t radix) {
  424. if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) {
  425. return 0;
  426. } else if(digit<10) {
  427. return (UChar32)(0x30+digit);
  428. } else {
  429. return (UChar32)((0x61-10)+digit);
  430. }
  431. }
  432. /* miscellaneous, and support for uprops.cpp -------------------------------- */
  433. U_CAPI void U_EXPORT2
  434. u_getUnicodeVersion(UVersionInfo versionArray) {
  435. if(versionArray!=nullptr) {
  436. uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
  437. }
  438. }
  439. U_CFUNC uint32_t
  440. u_getMainProperties(UChar32 c) {
  441. uint32_t props;
  442. GET_PROPS(c, props);
  443. return props;
  444. }
  445. U_CFUNC uint32_t
  446. u_getUnicodeProperties(UChar32 c, int32_t column) {
  447. U_ASSERT(column>=0);
  448. if(column>=propsVectorsColumns) {
  449. return 0;
  450. } else {
  451. uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
  452. return propsVectors[vecIndex+column];
  453. }
  454. }
  455. U_CFUNC int32_t
  456. uprv_getMaxValues(int32_t column) {
  457. switch(column) {
  458. case 0:
  459. return indexes[UPROPS_MAX_VALUES_INDEX];
  460. case 2:
  461. return indexes[UPROPS_MAX_VALUES_2_INDEX];
  462. default:
  463. return 0;
  464. }
  465. }
  466. U_CAPI void U_EXPORT2
  467. u_charAge(UChar32 c, UVersionInfo versionArray) {
  468. if(versionArray!=nullptr) {
  469. uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
  470. versionArray[0]=(uint8_t)(version>>4);
  471. versionArray[1]=(uint8_t)(version&0xf);
  472. versionArray[2]=versionArray[3]=0;
  473. }
  474. }
  475. U_CAPI UScriptCode U_EXPORT2
  476. uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
  477. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  478. return USCRIPT_INVALID_CODE;
  479. }
  480. if((uint32_t)c>0x10ffff) {
  481. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  482. return USCRIPT_INVALID_CODE;
  483. }
  484. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  485. uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
  486. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  487. return (UScriptCode)codeOrIndex;
  488. } else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
  489. return USCRIPT_COMMON;
  490. } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
  491. return USCRIPT_INHERITED;
  492. } else {
  493. return (UScriptCode)scriptExtensions[codeOrIndex];
  494. }
  495. }
  496. U_CAPI UBool U_EXPORT2
  497. uscript_hasScript(UChar32 c, UScriptCode sc) UPRV_NO_SANITIZE_UNDEFINED {
  498. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  499. uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
  500. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  501. return sc==(UScriptCode)codeOrIndex;
  502. }
  503. const uint16_t *scx=scriptExtensions+codeOrIndex;
  504. if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
  505. scx=scriptExtensions+scx[1];
  506. }
  507. uint32_t sc32=sc;
  508. if(sc32>0x7fff) {
  509. /* Guard against bogus input that would make us go past the Script_Extensions terminator. */
  510. return false;
  511. }
  512. while(sc32>*scx) {
  513. ++scx;
  514. }
  515. return sc32==(*scx&0x7fff);
  516. }
  517. U_CAPI int32_t U_EXPORT2
  518. uscript_getScriptExtensions(UChar32 c,
  519. UScriptCode *scripts, int32_t capacity,
  520. UErrorCode *pErrorCode) {
  521. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  522. return 0;
  523. }
  524. if(capacity<0 || (capacity>0 && scripts==nullptr)) {
  525. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  526. return 0;
  527. }
  528. uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
  529. uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
  530. if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
  531. if(capacity==0) {
  532. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  533. } else {
  534. scripts[0]=(UScriptCode)codeOrIndex;
  535. }
  536. return 1;
  537. }
  538. const uint16_t *scx=scriptExtensions+codeOrIndex;
  539. if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
  540. scx=scriptExtensions+scx[1];
  541. }
  542. int32_t length=0;
  543. uint16_t sx;
  544. do {
  545. sx=*scx++;
  546. if(length<capacity) {
  547. scripts[length]=(UScriptCode)(sx&0x7fff);
  548. }
  549. ++length;
  550. } while(sx<0x8000);
  551. if(length>capacity) {
  552. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  553. }
  554. return length;
  555. }
  556. U_CAPI UBlockCode U_EXPORT2
  557. ublock_getCode(UChar32 c) {
  558. return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
  559. }
  560. /* property starts for UnicodeSet ------------------------------------------- */
  561. static UBool U_CALLCONV
  562. _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  563. /* add the start code point to the USet */
  564. const USetAdder *sa=(const USetAdder *)context;
  565. sa->add(sa->set, start);
  566. (void)end;
  567. (void)value;
  568. return true;
  569. }
  570. #define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1)
  571. U_CFUNC void U_EXPORT2
  572. uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
  573. if(U_FAILURE(*pErrorCode)) {
  574. return;
  575. }
  576. /* add the start code point of each same-value range of the main trie */
  577. utrie2_enum(&propsTrie, nullptr, _enumPropertyStartsRange, sa);
  578. /* add code points with hardcoded properties, plus the ones following them */
  579. /* add for u_isblank() */
  580. USET_ADD_CP_AND_NEXT(sa, TAB);
  581. /* add for IS_THAT_CONTROL_SPACE() */
  582. sa->add(sa->set, CR+1); /* range TAB..CR */
  583. sa->add(sa->set, 0x1c);
  584. sa->add(sa->set, 0x1f+1);
  585. USET_ADD_CP_AND_NEXT(sa, 0x85); // NEXT LINE (NEL)
  586. /* add for u_isIDIgnorable() what was not added above */
  587. sa->add(sa->set, 0x7f); /* range DEL..NBSP-1, NBSP added below */
  588. sa->add(sa->set, HAIRSP);
  589. sa->add(sa->set, RLM+1);
  590. sa->add(sa->set, 0x206a); // INHIBIT SYMMETRIC SWAPPING
  591. sa->add(sa->set, 0x206f+1); // NOMINAL DIGIT SHAPES
  592. USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
  593. /* add no-break spaces for u_isWhitespace() what was not added above */
  594. USET_ADD_CP_AND_NEXT(sa, NBSP);
  595. USET_ADD_CP_AND_NEXT(sa, FIGURESP);
  596. USET_ADD_CP_AND_NEXT(sa, NNBSP);
  597. /* add for u_digit() */
  598. sa->add(sa->set, u'a');
  599. sa->add(sa->set, u'z'+1);
  600. sa->add(sa->set, u'A');
  601. sa->add(sa->set, u'Z'+1);
  602. // fullwidth
  603. sa->add(sa->set, u'a');
  604. sa->add(sa->set, u'z'+1);
  605. sa->add(sa->set, u'A');
  606. sa->add(sa->set, u'Z'+1);
  607. /* add for u_isxdigit() */
  608. sa->add(sa->set, u'f'+1);
  609. sa->add(sa->set, u'F'+1);
  610. // fullwidth
  611. sa->add(sa->set, u'f'+1);
  612. sa->add(sa->set, u'F'+1);
  613. /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
  614. sa->add(sa->set, 0x2060); /* range 2060..206f */
  615. sa->add(sa->set, 0xfff0);
  616. sa->add(sa->set, 0xfffb+1);
  617. sa->add(sa->set, 0xe0000);
  618. sa->add(sa->set, 0xe0fff+1);
  619. /* add for UCHAR_GRAPHEME_BASE and others */
  620. USET_ADD_CP_AND_NEXT(sa, CGJ);
  621. }
  622. U_CFUNC void U_EXPORT2
  623. upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
  624. if(U_FAILURE(*pErrorCode)) {
  625. return;
  626. }
  627. /* add the start code point of each same-value range of the properties vectors trie */
  628. utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa);
  629. }