usprep.cpp 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2003-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: usprep.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2003jul2
  16. * created by: Ram Viswanadha
  17. */
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_IDNA
  20. #include "unicode/usprep.h"
  21. #include "unicode/normalizer2.h"
  22. #include "unicode/ustring.h"
  23. #include "unicode/uchar.h"
  24. #include "unicode/uversion.h"
  25. #include "umutex.h"
  26. #include "cmemory.h"
  27. #include "sprpimpl.h"
  28. #include "ustr_imp.h"
  29. #include "uhash.h"
  30. #include "cstring.h"
  31. #include "udataswp.h"
  32. #include "ucln_cmn.h"
  33. #include "ubidi_props.h"
  34. #include "uprops.h"
  35. U_NAMESPACE_USE
  36. U_CDECL_BEGIN
  37. /*
  38. Static cache for already opened StringPrep profiles
  39. */
  40. static UHashtable *SHARED_DATA_HASHTABLE = nullptr;
  41. static icu::UInitOnce gSharedDataInitOnce {};
  42. static UMutex usprepMutex;
  43. /* format version of spp file */
  44. //static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
  45. /* the Unicode version of the sprep data */
  46. static UVersionInfo dataVersion={ 0, 0, 0, 0 };
  47. /* Profile names must be aligned to UStringPrepProfileType */
  48. static const char * const PROFILE_NAMES[] = {
  49. "rfc3491", /* USPREP_RFC3491_NAMEPREP */
  50. "rfc3530cs", /* USPREP_RFC3530_NFS4_CS_PREP */
  51. "rfc3530csci", /* USPREP_RFC3530_NFS4_CS_PREP_CI */
  52. "rfc3491", /* USPREP_RFC3530_NSF4_CIS_PREP */
  53. "rfc3530mixp", /* USPREP_RFC3530_NSF4_MIXED_PREP_PREFIX */
  54. "rfc3491", /* USPREP_RFC3530_NSF4_MIXED_PREP_SUFFIX */
  55. "rfc3722", /* USPREP_RFC3722_ISCSI */
  56. "rfc3920node", /* USPREP_RFC3920_NODEPREP */
  57. "rfc3920res", /* USPREP_RFC3920_RESOURCEPREP */
  58. "rfc4011", /* USPREP_RFC4011_MIB */
  59. "rfc4013", /* USPREP_RFC4013_SASLPREP */
  60. "rfc4505", /* USPREP_RFC4505_TRACE */
  61. "rfc4518", /* USPREP_RFC4518_LDAP */
  62. "rfc4518ci", /* USPREP_RFC4518_LDAP_CI */
  63. };
  64. static UBool U_CALLCONV
  65. isSPrepAcceptable(void * /* context */,
  66. const char * /* type */,
  67. const char * /* name */,
  68. const UDataInfo *pInfo) {
  69. if(
  70. pInfo->size>=20 &&
  71. pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  72. pInfo->charsetFamily==U_CHARSET_FAMILY &&
  73. pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
  74. pInfo->dataFormat[1]==0x50 &&
  75. pInfo->dataFormat[2]==0x52 &&
  76. pInfo->dataFormat[3]==0x50 &&
  77. pInfo->formatVersion[0]==3 &&
  78. pInfo->formatVersion[2]==UTRIE_SHIFT &&
  79. pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
  80. ) {
  81. //uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
  82. uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
  83. return true;
  84. } else {
  85. return false;
  86. }
  87. }
  88. static int32_t U_CALLCONV
  89. getSPrepFoldingOffset(uint32_t data) {
  90. return (int32_t)data;
  91. }
  92. /* hashes an entry */
  93. static int32_t U_CALLCONV
  94. hashEntry(const UHashTok parm) {
  95. UStringPrepKey *b = (UStringPrepKey *)parm.pointer;
  96. UHashTok namekey, pathkey;
  97. namekey.pointer = b->name;
  98. pathkey.pointer = b->path;
  99. uint32_t unsignedHash = static_cast<uint32_t>(uhash_hashChars(namekey)) +
  100. 37u * static_cast<uint32_t>(uhash_hashChars(pathkey));
  101. return static_cast<int32_t>(unsignedHash);
  102. }
  103. /* compares two entries */
  104. static UBool U_CALLCONV
  105. compareEntries(const UHashTok p1, const UHashTok p2) {
  106. UStringPrepKey *b1 = (UStringPrepKey *)p1.pointer;
  107. UStringPrepKey *b2 = (UStringPrepKey *)p2.pointer;
  108. UHashTok name1, name2, path1, path2;
  109. name1.pointer = b1->name;
  110. name2.pointer = b2->name;
  111. path1.pointer = b1->path;
  112. path2.pointer = b2->path;
  113. return uhash_compareChars(name1, name2) & uhash_compareChars(path1, path2);
  114. }
  115. static void
  116. usprep_unload(UStringPrepProfile* data){
  117. udata_close(data->sprepData);
  118. }
  119. static int32_t
  120. usprep_internal_flushCache(UBool noRefCount){
  121. UStringPrepProfile *profile = nullptr;
  122. UStringPrepKey *key = nullptr;
  123. int32_t pos = UHASH_FIRST;
  124. int32_t deletedNum = 0;
  125. const UHashElement *e;
  126. /*
  127. * if shared data hasn't even been lazy evaluated yet
  128. * return 0
  129. */
  130. umtx_lock(&usprepMutex);
  131. if (SHARED_DATA_HASHTABLE == nullptr) {
  132. umtx_unlock(&usprepMutex);
  133. return 0;
  134. }
  135. /*creates an enumeration to iterate through every element in the table */
  136. while ((e = uhash_nextElement(SHARED_DATA_HASHTABLE, &pos)) != nullptr)
  137. {
  138. profile = (UStringPrepProfile *) e->value.pointer;
  139. key = (UStringPrepKey *) e->key.pointer;
  140. if ((noRefCount== false && profile->refCount == 0) ||
  141. noRefCount) {
  142. deletedNum++;
  143. uhash_removeElement(SHARED_DATA_HASHTABLE, e);
  144. /* unload the data */
  145. usprep_unload(profile);
  146. if(key->name != nullptr) {
  147. uprv_free(key->name);
  148. key->name=nullptr;
  149. }
  150. if(key->path != nullptr) {
  151. uprv_free(key->path);
  152. key->path=nullptr;
  153. }
  154. uprv_free(profile);
  155. uprv_free(key);
  156. }
  157. }
  158. umtx_unlock(&usprepMutex);
  159. return deletedNum;
  160. }
  161. /* Works just like ucnv_flushCache()
  162. static int32_t
  163. usprep_flushCache(){
  164. return usprep_internal_flushCache(false);
  165. }
  166. */
  167. static UBool U_CALLCONV usprep_cleanup(){
  168. if (SHARED_DATA_HASHTABLE != nullptr) {
  169. usprep_internal_flushCache(true);
  170. if (SHARED_DATA_HASHTABLE != nullptr && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
  171. uhash_close(SHARED_DATA_HASHTABLE);
  172. SHARED_DATA_HASHTABLE = nullptr;
  173. }
  174. }
  175. gSharedDataInitOnce.reset();
  176. return (SHARED_DATA_HASHTABLE == nullptr);
  177. }
  178. U_CDECL_END
  179. /** Initializes the cache for resources */
  180. static void U_CALLCONV
  181. createCache(UErrorCode &status) {
  182. SHARED_DATA_HASHTABLE = uhash_open(hashEntry, compareEntries, nullptr, &status);
  183. if (U_FAILURE(status)) {
  184. SHARED_DATA_HASHTABLE = nullptr;
  185. }
  186. ucln_common_registerCleanup(UCLN_COMMON_USPREP, usprep_cleanup);
  187. }
  188. static void
  189. initCache(UErrorCode *status) {
  190. umtx_initOnce(gSharedDataInitOnce, &createCache, *status);
  191. }
  192. static UBool U_CALLCONV
  193. loadData(UStringPrepProfile* profile,
  194. const char* path,
  195. const char* name,
  196. const char* type,
  197. UErrorCode* errorCode) {
  198. /* load Unicode SPREP data from file */
  199. UTrie _sprepTrie = {nullptr, nullptr, nullptr, 0, 0, 0, 0};
  200. UDataMemory *dataMemory;
  201. const int32_t *p=nullptr;
  202. const uint8_t *pb;
  203. UVersionInfo normUnicodeVersion;
  204. int32_t normUniVer, sprepUniVer, normCorrVer;
  205. if(errorCode==nullptr || U_FAILURE(*errorCode)) {
  206. return 0;
  207. }
  208. /* open the data outside the mutex block */
  209. //TODO: change the path
  210. dataMemory=udata_openChoice(path, type, name, isSPrepAcceptable, nullptr, errorCode);
  211. if(U_FAILURE(*errorCode)) {
  212. return false;
  213. }
  214. p = static_cast<const int32_t*>(udata_getMemory(dataMemory));
  215. pb = reinterpret_cast<const uint8_t*>(p + _SPREP_INDEX_TOP);
  216. utrie_unserialize(&_sprepTrie, pb, p[_SPREP_INDEX_TRIE_SIZE], errorCode);
  217. _sprepTrie.getFoldingOffset=getSPrepFoldingOffset;
  218. if(U_FAILURE(*errorCode)) {
  219. udata_close(dataMemory);
  220. return false;
  221. }
  222. /* in the mutex block, set the data for this process */
  223. umtx_lock(&usprepMutex);
  224. if(profile->sprepData==nullptr) {
  225. profile->sprepData=dataMemory;
  226. dataMemory=nullptr;
  227. uprv_memcpy(&profile->indexes, p, sizeof(profile->indexes));
  228. uprv_memcpy(&profile->sprepTrie, &_sprepTrie, sizeof(UTrie));
  229. } else {
  230. p = static_cast<const int32_t*>(udata_getMemory(profile->sprepData));
  231. }
  232. umtx_unlock(&usprepMutex);
  233. /* initialize some variables */
  234. profile->mappingData = reinterpret_cast<const uint16_t*>(reinterpret_cast<const uint8_t*>(p + _SPREP_INDEX_TOP) + profile->indexes[_SPREP_INDEX_TRIE_SIZE]);
  235. u_getUnicodeVersion(normUnicodeVersion);
  236. normUniVer = (normUnicodeVersion[0] << 24) + (normUnicodeVersion[1] << 16) +
  237. (normUnicodeVersion[2] << 8 ) + (normUnicodeVersion[3]);
  238. sprepUniVer = (dataVersion[0] << 24) + (dataVersion[1] << 16) +
  239. (dataVersion[2] << 8 ) + (dataVersion[3]);
  240. normCorrVer = profile->indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION];
  241. if(U_FAILURE(*errorCode)){
  242. udata_close(dataMemory);
  243. return false;
  244. }
  245. if( normUniVer < sprepUniVer && /* the Unicode version of SPREP file must be less than the Unicode Version of the normalization data */
  246. normUniVer < normCorrVer && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Version of the normalization data */
  247. ((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0) /* normalization turned on*/
  248. ){
  249. *errorCode = U_INVALID_FORMAT_ERROR;
  250. udata_close(dataMemory);
  251. return false;
  252. }
  253. profile->isDataLoaded = true;
  254. /* if a different thread set it first, then close the extra data */
  255. if(dataMemory!=nullptr) {
  256. udata_close(dataMemory); /* nullptr if it was set correctly */
  257. }
  258. return profile->isDataLoaded;
  259. }
  260. static UStringPrepProfile*
  261. usprep_getProfile(const char* path,
  262. const char* name,
  263. UErrorCode *status){
  264. UStringPrepProfile* profile = nullptr;
  265. initCache(status);
  266. if(U_FAILURE(*status)){
  267. return nullptr;
  268. }
  269. UStringPrepKey stackKey;
  270. /*
  271. * const is cast way to save malloc, strcpy and free calls
  272. * we use the passed in pointers for fetching the data from the
  273. * hash table which is safe
  274. */
  275. stackKey.name = const_cast<char*>(name);
  276. stackKey.path = const_cast<char*>(path);
  277. /* fetch the data from the cache */
  278. umtx_lock(&usprepMutex);
  279. profile = static_cast<UStringPrepProfile*>(uhash_get(SHARED_DATA_HASHTABLE, &stackKey));
  280. if(profile != nullptr) {
  281. profile->refCount++;
  282. }
  283. umtx_unlock(&usprepMutex);
  284. if(profile == nullptr) {
  285. /* else load the data and put the data in the cache */
  286. LocalMemory<UStringPrepProfile> newProfile;
  287. if(newProfile.allocateInsteadAndReset() == nullptr) {
  288. *status = U_MEMORY_ALLOCATION_ERROR;
  289. return nullptr;
  290. }
  291. /* load the data */
  292. if(!loadData(newProfile.getAlias(), path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
  293. return nullptr;
  294. }
  295. /* get the options */
  296. newProfile->doNFKC = static_cast<UBool>((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
  297. newProfile->checkBiDi = static_cast<UBool>((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
  298. LocalMemory<UStringPrepKey> key;
  299. LocalMemory<char> keyName;
  300. LocalMemory<char> keyPath;
  301. if( key.allocateInsteadAndReset() == nullptr ||
  302. keyName.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(name)+1)) == nullptr ||
  303. (path != nullptr &&
  304. keyPath.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(path)+1)) == nullptr)
  305. ) {
  306. *status = U_MEMORY_ALLOCATION_ERROR;
  307. usprep_unload(newProfile.getAlias());
  308. return nullptr;
  309. }
  310. umtx_lock(&usprepMutex);
  311. // If another thread already inserted the same key/value, refcount and cleanup our thread data
  312. profile = static_cast<UStringPrepProfile*>(uhash_get(SHARED_DATA_HASHTABLE, &stackKey));
  313. if(profile != nullptr) {
  314. profile->refCount++;
  315. usprep_unload(newProfile.getAlias());
  316. }
  317. else {
  318. /* initialize the key members */
  319. key->name = keyName.orphan();
  320. uprv_strcpy(key->name, name);
  321. if(path != nullptr){
  322. key->path = keyPath.orphan();
  323. uprv_strcpy(key->path, path);
  324. }
  325. profile = newProfile.orphan();
  326. /* add the data object to the cache */
  327. profile->refCount = 1;
  328. uhash_put(SHARED_DATA_HASHTABLE, key.orphan(), profile, status);
  329. }
  330. umtx_unlock(&usprepMutex);
  331. }
  332. return profile;
  333. }
  334. U_CAPI UStringPrepProfile* U_EXPORT2
  335. usprep_open(const char* path,
  336. const char* name,
  337. UErrorCode* status){
  338. if(status == nullptr || U_FAILURE(*status)){
  339. return nullptr;
  340. }
  341. /* initialize the profile struct members */
  342. return usprep_getProfile(path,name,status);
  343. }
  344. U_CAPI UStringPrepProfile* U_EXPORT2
  345. usprep_openByType(UStringPrepProfileType type,
  346. UErrorCode* status) {
  347. if(status == nullptr || U_FAILURE(*status)){
  348. return nullptr;
  349. }
  350. int32_t index = (int32_t)type;
  351. if (index < 0 || index >= UPRV_LENGTHOF(PROFILE_NAMES)) {
  352. *status = U_ILLEGAL_ARGUMENT_ERROR;
  353. return nullptr;
  354. }
  355. return usprep_open(nullptr, PROFILE_NAMES[index], status);
  356. }
  357. U_CAPI void U_EXPORT2
  358. usprep_close(UStringPrepProfile* profile){
  359. if(profile==nullptr){
  360. return;
  361. }
  362. umtx_lock(&usprepMutex);
  363. /* decrement the ref count*/
  364. if(profile->refCount > 0){
  365. profile->refCount--;
  366. }
  367. umtx_unlock(&usprepMutex);
  368. }
  369. U_CFUNC void
  370. uprv_syntaxError(const char16_t* rules,
  371. int32_t pos,
  372. int32_t rulesLen,
  373. UParseError* parseError){
  374. if(parseError == nullptr){
  375. return;
  376. }
  377. parseError->offset = pos;
  378. parseError->line = 0 ; // we are not using line numbers
  379. // for pre-context
  380. int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
  381. int32_t limit = pos;
  382. u_memcpy(parseError->preContext,rules+start,limit-start);
  383. //null terminate the buffer
  384. parseError->preContext[limit-start] = 0;
  385. // for post-context; include error rules[pos]
  386. start = pos;
  387. limit = start + (U_PARSE_CONTEXT_LEN-1);
  388. if (limit > rulesLen) {
  389. limit = rulesLen;
  390. }
  391. if (start < rulesLen) {
  392. u_memcpy(parseError->postContext,rules+start,limit-start);
  393. }
  394. //null terminate the buffer
  395. parseError->postContext[limit-start]= 0;
  396. }
  397. static inline UStringPrepType
  398. getValues(uint16_t trieWord, int16_t& value, UBool& isIndex){
  399. UStringPrepType type;
  400. if(trieWord == 0){
  401. /*
  402. * Initial value stored in the mapping table
  403. * just return USPREP_TYPE_LIMIT .. so that
  404. * the source codepoint is copied to the destination
  405. */
  406. type = USPREP_TYPE_LIMIT;
  407. isIndex =false;
  408. value = 0;
  409. }else if(trieWord >= _SPREP_TYPE_THRESHOLD){
  410. type = static_cast<UStringPrepType>(trieWord - _SPREP_TYPE_THRESHOLD);
  411. isIndex =false;
  412. value = 0;
  413. }else{
  414. /* get the type */
  415. type = USPREP_MAP;
  416. /* ascertain if the value is index or delta */
  417. if(trieWord & 0x02){
  418. isIndex = true;
  419. value = trieWord >> 2; //mask off the lower 2 bits and shift
  420. }else{
  421. isIndex = false;
  422. value = static_cast<int16_t>(trieWord);
  423. value = (value >> 2);
  424. }
  425. if((trieWord>>2) == _SPREP_MAX_INDEX_VALUE){
  426. type = USPREP_DELETE;
  427. isIndex =false;
  428. value = 0;
  429. }
  430. }
  431. return type;
  432. }
  433. // TODO: change to writing to UnicodeString not char16_t *
  434. static int32_t
  435. usprep_map( const UStringPrepProfile* profile,
  436. const char16_t* src, int32_t srcLength,
  437. char16_t* dest, int32_t destCapacity,
  438. int32_t options,
  439. UParseError* parseError,
  440. UErrorCode* status ){
  441. uint16_t result;
  442. int32_t destIndex=0;
  443. int32_t srcIndex;
  444. UBool allowUnassigned = static_cast<UBool>((options & USPREP_ALLOW_UNASSIGNED) > 0);
  445. UStringPrepType type;
  446. int16_t value;
  447. UBool isIndex;
  448. const int32_t* indexes = profile->indexes;
  449. // no error checking the caller check for error and arguments
  450. // no string length check the caller finds out the string length
  451. for(srcIndex=0;srcIndex<srcLength;){
  452. UChar32 ch;
  453. U16_NEXT(src,srcIndex,srcLength,ch);
  454. result=0;
  455. UTRIE_GET16(&profile->sprepTrie,ch,result);
  456. type = getValues(result, value, isIndex);
  457. // check if the source codepoint is unassigned
  458. if(type == USPREP_UNASSIGNED && allowUnassigned == false){
  459. uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
  460. *status = U_STRINGPREP_UNASSIGNED_ERROR;
  461. return 0;
  462. }else if(type == USPREP_MAP){
  463. int32_t index, length;
  464. if(isIndex){
  465. index = value;
  466. if(index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
  467. index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
  468. length = 1;
  469. }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
  470. index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
  471. length = 2;
  472. }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
  473. index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
  474. length = 3;
  475. }else{
  476. length = profile->mappingData[index++];
  477. }
  478. /* copy mapping to destination */
  479. for(int32_t i=0; i< length; i++){
  480. if(destIndex < destCapacity ){
  481. dest[destIndex] = profile->mappingData[index+i];
  482. }
  483. destIndex++; /* for pre-flighting */
  484. }
  485. continue;
  486. }else{
  487. // subtract the delta to arrive at the code point
  488. ch -= value;
  489. }
  490. }else if(type==USPREP_DELETE){
  491. // just consume the codepoint and continue
  492. continue;
  493. }
  494. //copy the code point into destination
  495. if(ch <= 0xFFFF){
  496. if(destIndex < destCapacity ){
  497. dest[destIndex] = static_cast<char16_t>(ch);
  498. }
  499. destIndex++;
  500. }else{
  501. if(destIndex+1 < destCapacity ){
  502. dest[destIndex] = U16_LEAD(ch);
  503. dest[destIndex+1] = U16_TRAIL(ch);
  504. }
  505. destIndex +=2;
  506. }
  507. }
  508. return u_terminateUChars(dest, destCapacity, destIndex, status);
  509. }
  510. /*
  511. 1) Map -- For each character in the input, check if it has a mapping
  512. and, if so, replace it with its mapping.
  513. 2) Normalize -- Possibly normalize the result of step 1 using Unicode
  514. normalization.
  515. 3) Prohibit -- Check for any characters that are not allowed in the
  516. output. If any are found, return an error.
  517. 4) Check bidi -- Possibly check for right-to-left characters, and if
  518. any are found, make sure that the whole string satisfies the
  519. requirements for bidirectional strings. If the string does not
  520. satisfy the requirements for bidirectional strings, return an
  521. error.
  522. [Unicode3.2] defines several bidirectional categories; each character
  523. has one bidirectional category assigned to it. For the purposes of
  524. the requirements below, an "RandALCat character" is a character that
  525. has Unicode bidirectional categories "R" or "AL"; an "LCat character"
  526. is a character that has Unicode bidirectional category "L". Note
  527. that there are many characters which fall in neither of the above
  528. definitions; Latin digits (<U+0030> through <U+0039>) are examples of
  529. this because they have bidirectional category "EN".
  530. In any profile that specifies bidirectional character handling, all
  531. three of the following requirements MUST be met:
  532. 1) The characters in section 5.8 MUST be prohibited.
  533. 2) If a string contains any RandALCat character, the string MUST NOT
  534. contain any LCat character.
  535. 3) If a string contains any RandALCat character, a RandALCat
  536. character MUST be the first character of the string, and a
  537. RandALCat character MUST be the last character of the string.
  538. */
  539. U_CAPI int32_t U_EXPORT2
  540. usprep_prepare( const UStringPrepProfile* profile,
  541. const char16_t* src, int32_t srcLength,
  542. char16_t* dest, int32_t destCapacity,
  543. int32_t options,
  544. UParseError* parseError,
  545. UErrorCode* status ){
  546. // check error status
  547. if(U_FAILURE(*status)){
  548. return 0;
  549. }
  550. //check arguments
  551. if(profile==nullptr ||
  552. (src==nullptr ? srcLength!=0 : srcLength<-1) ||
  553. (dest==nullptr ? destCapacity!=0 : destCapacity<0)) {
  554. *status=U_ILLEGAL_ARGUMENT_ERROR;
  555. return 0;
  556. }
  557. //get the string length
  558. if(srcLength < 0){
  559. srcLength = u_strlen(src);
  560. }
  561. // map
  562. UnicodeString s1;
  563. char16_t *b1 = s1.getBuffer(srcLength);
  564. if(b1==nullptr){
  565. *status = U_MEMORY_ALLOCATION_ERROR;
  566. return 0;
  567. }
  568. int32_t b1Len = usprep_map(profile, src, srcLength,
  569. b1, s1.getCapacity(), options, parseError, status);
  570. s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
  571. if(*status == U_BUFFER_OVERFLOW_ERROR){
  572. // redo processing of string
  573. /* we do not have enough room so grow the buffer*/
  574. b1 = s1.getBuffer(b1Len);
  575. if(b1==nullptr){
  576. *status = U_MEMORY_ALLOCATION_ERROR;
  577. return 0;
  578. }
  579. *status = U_ZERO_ERROR; // reset error
  580. b1Len = usprep_map(profile, src, srcLength,
  581. b1, s1.getCapacity(), options, parseError, status);
  582. s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
  583. }
  584. if(U_FAILURE(*status)){
  585. return 0;
  586. }
  587. // normalize
  588. UnicodeString s2;
  589. if(profile->doNFKC){
  590. const Normalizer2 *n2 = Normalizer2::getNFKCInstance(*status);
  591. FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*status));
  592. if(U_FAILURE(*status)){
  593. return 0;
  594. }
  595. fn2.normalize(s1, s2, *status);
  596. }else{
  597. s2.fastCopyFrom(s1);
  598. }
  599. if(U_FAILURE(*status)){
  600. return 0;
  601. }
  602. // Prohibit and checkBiDi in one pass
  603. const char16_t *b2 = s2.getBuffer();
  604. int32_t b2Len = s2.length();
  605. UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
  606. UBool leftToRight=false, rightToLeft=false;
  607. int32_t rtlPos =-1, ltrPos =-1;
  608. for(int32_t b2Index=0; b2Index<b2Len;){
  609. UChar32 ch = 0;
  610. U16_NEXT(b2, b2Index, b2Len, ch);
  611. uint16_t result;
  612. UTRIE_GET16(&profile->sprepTrie,ch,result);
  613. int16_t value;
  614. UBool isIndex;
  615. UStringPrepType type = getValues(result, value, isIndex);
  616. if( type == USPREP_PROHIBITED ||
  617. ((result < _SPREP_TYPE_THRESHOLD) && (result & 0x01) /* first bit says it the code point is prohibited*/)
  618. ){
  619. *status = U_STRINGPREP_PROHIBITED_ERROR;
  620. uprv_syntaxError(b2, b2Index-U16_LENGTH(ch), b2Len, parseError);
  621. return 0;
  622. }
  623. if(profile->checkBiDi) {
  624. direction = ubidi_getClass(ch);
  625. if(firstCharDir == U_CHAR_DIRECTION_COUNT){
  626. firstCharDir = direction;
  627. }
  628. if(direction == U_LEFT_TO_RIGHT){
  629. leftToRight = true;
  630. ltrPos = b2Index-1;
  631. }
  632. if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
  633. rightToLeft = true;
  634. rtlPos = b2Index-1;
  635. }
  636. }
  637. }
  638. if(profile->checkBiDi){
  639. // satisfy 2
  640. if( leftToRight && rightToLeft){
  641. *status = U_STRINGPREP_CHECK_BIDI_ERROR;
  642. uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
  643. return 0;
  644. }
  645. //satisfy 3
  646. if( rightToLeft &&
  647. !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
  648. (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
  649. ){
  650. *status = U_STRINGPREP_CHECK_BIDI_ERROR;
  651. uprv_syntaxError(b2, rtlPos, b2Len, parseError);
  652. return false;
  653. }
  654. }
  655. return s2.extract(dest, destCapacity, *status);
  656. }
  657. /* data swapping ------------------------------------------------------------ */
  658. U_CAPI int32_t U_EXPORT2
  659. usprep_swap(const UDataSwapper *ds,
  660. const void *inData, int32_t length, void *outData,
  661. UErrorCode *pErrorCode) {
  662. const UDataInfo *pInfo;
  663. int32_t headerSize;
  664. const uint8_t *inBytes;
  665. uint8_t *outBytes;
  666. const int32_t *inIndexes;
  667. int32_t indexes[16];
  668. int32_t i, offset, count, size;
  669. /* udata_swapDataHeader checks the arguments */
  670. headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
  671. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  672. return 0;
  673. }
  674. /* check data format and format version */
  675. pInfo=(const UDataInfo *)((const char *)inData+4);
  676. if(!(
  677. pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
  678. pInfo->dataFormat[1]==0x50 &&
  679. pInfo->dataFormat[2]==0x52 &&
  680. pInfo->dataFormat[3]==0x50 &&
  681. pInfo->formatVersion[0]==3
  682. )) {
  683. udata_printError(ds, "usprep_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as StringPrep .spp data\n",
  684. pInfo->dataFormat[0], pInfo->dataFormat[1],
  685. pInfo->dataFormat[2], pInfo->dataFormat[3],
  686. pInfo->formatVersion[0]);
  687. *pErrorCode=U_UNSUPPORTED_ERROR;
  688. return 0;
  689. }
  690. inBytes=(const uint8_t *)inData+headerSize;
  691. outBytes= (outData == nullptr ) ? nullptr : (uint8_t *)outData+headerSize;
  692. inIndexes=(const int32_t *)inBytes;
  693. if(length>=0) {
  694. length-=headerSize;
  695. if(length<16*4) {
  696. udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for StringPrep .spp data\n",
  697. length);
  698. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  699. return 0;
  700. }
  701. }
  702. /* read the first 16 indexes (ICU 2.8/format version 3: _SPREP_INDEX_TOP==16, might grow) */
  703. for(i=0; i<16; ++i) {
  704. indexes[i]=udata_readInt32(ds, inIndexes[i]);
  705. }
  706. /* calculate the total length of the data */
  707. size=
  708. 16*4+ /* size of indexes[] */
  709. indexes[_SPREP_INDEX_TRIE_SIZE]+
  710. indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
  711. if(length>=0) {
  712. if(length<size) {
  713. udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for all of StringPrep .spp data\n",
  714. length);
  715. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  716. return 0;
  717. }
  718. /* copy the data for inaccessible bytes */
  719. if(inBytes!=outBytes) {
  720. uprv_memcpy(outBytes, inBytes, size);
  721. }
  722. offset=0;
  723. /* swap the int32_t indexes[] */
  724. count=16*4;
  725. ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
  726. offset+=count;
  727. /* swap the UTrie */
  728. count=indexes[_SPREP_INDEX_TRIE_SIZE];
  729. utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
  730. offset+=count;
  731. /* swap the uint16_t mappingTable[] */
  732. count=indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
  733. ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
  734. //offset+=count;
  735. }
  736. return headerSize+size;
  737. }
  738. #endif /* #if !UCONFIG_NO_IDNA */