ucnvsel.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2008-2011, International Business Machines
  7. * Corporation, Google and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. */
  11. // Author : eldawy@google.com (Mohamed Eldawy)
  12. // ucnvsel.cpp
  13. //
  14. // Purpose: To generate a list of encodings capable of handling
  15. // a given Unicode text
  16. //
  17. // Started 09-April-2008
  18. /**
  19. * \file
  20. *
  21. * This is an implementation of an encoding selector.
  22. * The goal is, given a unicode string, find the encodings
  23. * this string can be mapped to. To make processing faster
  24. * a trie is built when you call ucnvsel_open() that
  25. * stores all encodings a codepoint can map to
  26. */
  27. #include "unicode/ucnvsel.h"
  28. #if !UCONFIG_NO_CONVERSION
  29. #include <string.h>
  30. #include "unicode/uchar.h"
  31. #include "unicode/uniset.h"
  32. #include "unicode/ucnv.h"
  33. #include "unicode/ustring.h"
  34. #include "unicode/uchriter.h"
  35. #include "utrie2.h"
  36. #include "propsvec.h"
  37. #include "uassert.h"
  38. #include "ucmndata.h"
  39. #include "udataswp.h"
  40. #include "uenumimp.h"
  41. #include "cmemory.h"
  42. #include "cstring.h"
  43. U_NAMESPACE_USE
  44. struct UConverterSelector {
  45. UTrie2 *trie; // 16 bit trie containing offsets into pv
  46. uint32_t* pv; // table of bits!
  47. int32_t pvCount;
  48. char** encodings; // which encodings did user ask to use?
  49. int32_t encodingsCount;
  50. int32_t encodingStrLength;
  51. uint8_t* swapped;
  52. UBool ownPv, ownEncodingStrings;
  53. };
  54. static void generateSelectorData(UConverterSelector* result,
  55. UPropsVectors *upvec,
  56. const USet* excludedCodePoints,
  57. const UConverterUnicodeSet whichSet,
  58. UErrorCode* status) {
  59. if (U_FAILURE(*status)) {
  60. return;
  61. }
  62. int32_t columns = (result->encodingsCount+31)/32;
  63. // set errorValue to all-ones
  64. for (int32_t col = 0; col < columns; col++) {
  65. upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
  66. col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status);
  67. }
  68. for (int32_t i = 0; i < result->encodingsCount; ++i) {
  69. uint32_t mask;
  70. uint32_t column;
  71. int32_t item_count;
  72. int32_t j;
  73. UConverter* test_converter = ucnv_open(result->encodings[i], status);
  74. if (U_FAILURE(*status)) {
  75. return;
  76. }
  77. USet* unicode_point_set;
  78. unicode_point_set = uset_open(1, 0); // empty set
  79. ucnv_getUnicodeSet(test_converter, unicode_point_set,
  80. whichSet, status);
  81. if (U_FAILURE(*status)) {
  82. ucnv_close(test_converter);
  83. return;
  84. }
  85. column = i / 32;
  86. mask = 1 << (i%32);
  87. // now iterate over intervals on set i!
  88. item_count = uset_getItemCount(unicode_point_set);
  89. for (j = 0; j < item_count; ++j) {
  90. UChar32 start_char;
  91. UChar32 end_char;
  92. UErrorCode smallStatus = U_ZERO_ERROR;
  93. uset_getItem(unicode_point_set, j, &start_char, &end_char, nullptr, 0,
  94. &smallStatus);
  95. if (U_FAILURE(smallStatus)) {
  96. // this will be reached for the converters that fill the set with
  97. // strings. Those should be ignored by our system
  98. } else {
  99. upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask,
  100. status);
  101. }
  102. }
  103. ucnv_close(test_converter);
  104. uset_close(unicode_point_set);
  105. if (U_FAILURE(*status)) {
  106. return;
  107. }
  108. }
  109. // handle excluded encodings! Simply set their values to all 1's in the upvec
  110. if (excludedCodePoints) {
  111. int32_t item_count = uset_getItemCount(excludedCodePoints);
  112. for (int32_t j = 0; j < item_count; ++j) {
  113. UChar32 start_char;
  114. UChar32 end_char;
  115. uset_getItem(excludedCodePoints, j, &start_char, &end_char, nullptr, 0,
  116. status);
  117. for (int32_t col = 0; col < columns; col++) {
  118. upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0),
  119. status);
  120. }
  121. }
  122. }
  123. // alright. Now, let's put things in the same exact form you'd get when you
  124. // unserialize things.
  125. result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
  126. result->pv = upvec_cloneArray(upvec, &result->pvCount, nullptr, status);
  127. result->pvCount *= columns; // number of uint32_t = rows * columns
  128. result->ownPv = true;
  129. }
  130. /* open a selector. If converterListSize is 0, build for all converters.
  131. If excludedCodePoints is nullptr, don't exclude any codepoints */
  132. U_CAPI UConverterSelector* U_EXPORT2
  133. ucnvsel_open(const char* const* converterList, int32_t converterListSize,
  134. const USet* excludedCodePoints,
  135. const UConverterUnicodeSet whichSet, UErrorCode* status) {
  136. // check if already failed
  137. if (U_FAILURE(*status)) {
  138. return nullptr;
  139. }
  140. // ensure args make sense!
  141. if (converterListSize < 0 || (converterList == nullptr && converterListSize != 0)) {
  142. *status = U_ILLEGAL_ARGUMENT_ERROR;
  143. return nullptr;
  144. }
  145. // allocate a new converter
  146. LocalUConverterSelectorPointer newSelector(
  147. (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
  148. if (newSelector.isNull()) {
  149. *status = U_MEMORY_ALLOCATION_ERROR;
  150. return nullptr;
  151. }
  152. uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));
  153. if (converterListSize == 0) {
  154. converterList = nullptr;
  155. converterListSize = ucnv_countAvailable();
  156. }
  157. newSelector->encodings =
  158. (char**)uprv_malloc(converterListSize * sizeof(char*));
  159. if (!newSelector->encodings) {
  160. *status = U_MEMORY_ALLOCATION_ERROR;
  161. return nullptr;
  162. }
  163. newSelector->encodings[0] = nullptr; // now we can call ucnvsel_close()
  164. // make a backup copy of the list of converters
  165. int32_t totalSize = 0;
  166. int32_t i;
  167. for (i = 0; i < converterListSize; i++) {
  168. totalSize +=
  169. (int32_t)uprv_strlen(converterList != nullptr ? converterList[i] : ucnv_getAvailableName(i)) + 1;
  170. }
  171. // 4-align the totalSize to 4-align the size of the serialized form
  172. int32_t encodingStrPadding = totalSize & 3;
  173. if (encodingStrPadding != 0) {
  174. encodingStrPadding = 4 - encodingStrPadding;
  175. }
  176. newSelector->encodingStrLength = totalSize += encodingStrPadding;
  177. char* allStrings = (char*) uprv_malloc(totalSize);
  178. if (!allStrings) {
  179. *status = U_MEMORY_ALLOCATION_ERROR;
  180. return nullptr;
  181. }
  182. for (i = 0; i < converterListSize; i++) {
  183. newSelector->encodings[i] = allStrings;
  184. uprv_strcpy(newSelector->encodings[i],
  185. converterList != nullptr ? converterList[i] : ucnv_getAvailableName(i));
  186. allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
  187. }
  188. while (encodingStrPadding > 0) {
  189. *allStrings++ = 0;
  190. --encodingStrPadding;
  191. }
  192. newSelector->ownEncodingStrings = true;
  193. newSelector->encodingsCount = converterListSize;
  194. UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
  195. generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
  196. upvec_close(upvec);
  197. if (U_FAILURE(*status)) {
  198. return nullptr;
  199. }
  200. return newSelector.orphan();
  201. }
  202. /* close opened selector */
  203. U_CAPI void U_EXPORT2
  204. ucnvsel_close(UConverterSelector *sel) {
  205. if (!sel) {
  206. return;
  207. }
  208. if (sel->ownEncodingStrings) {
  209. uprv_free(sel->encodings[0]);
  210. }
  211. uprv_free(sel->encodings);
  212. if (sel->ownPv) {
  213. uprv_free(sel->pv);
  214. }
  215. utrie2_close(sel->trie);
  216. uprv_free(sel->swapped);
  217. uprv_free(sel);
  218. }
  219. static const UDataInfo dataInfo = {
  220. sizeof(UDataInfo),
  221. 0,
  222. U_IS_BIG_ENDIAN,
  223. U_CHARSET_FAMILY,
  224. U_SIZEOF_UCHAR,
  225. 0,
  226. { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */
  227. { 1, 0, 0, 0 }, /* formatVersion */
  228. { 0, 0, 0, 0 } /* dataVersion */
  229. };
  // Positions in the int32_t indexes[] array of the serialized form.
  enum {
    UCNVSEL_INDEX_TRIE_SIZE = 0,     // trie size in bytes
    UCNVSEL_INDEX_PV_COUNT = 1,      // number of uint32_t in the bit vectors
    UCNVSEL_INDEX_NAMES_COUNT = 2,   // number of encoding names
    UCNVSEL_INDEX_NAMES_LENGTH = 3,  // number of encoding name bytes including padding
    // slots 4..14 are not assigned
    UCNVSEL_INDEX_SIZE = 15,         // bytes following the DataHeader
    UCNVSEL_INDEX_COUNT = 16         // number of slots in indexes[]
  };
  238. /*
  239. * Serialized form of a UConverterSelector, formatVersion 1:
  240. *
  241. * The serialized form begins with a standard ICU DataHeader with a UDataInfo
  242. * as the template above.
  243. * This is followed by:
  244. * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above
  245. * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
  246. * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors
  247. * char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding
  248. */
  249. /* serialize a selector */
  250. U_CAPI int32_t U_EXPORT2
  251. ucnvsel_serialize(const UConverterSelector* sel,
  252. void* buffer, int32_t bufferCapacity, UErrorCode* status) {
  253. // check if already failed
  254. if (U_FAILURE(*status)) {
  255. return 0;
  256. }
  257. // ensure args make sense!
  258. uint8_t *p = (uint8_t *)buffer;
  259. if (bufferCapacity < 0 ||
  260. (bufferCapacity > 0 && (p == nullptr || (U_POINTER_MASK_LSB(p, 3) != 0)))
  261. ) {
  262. *status = U_ILLEGAL_ARGUMENT_ERROR;
  263. return 0;
  264. }
  265. // add up the size of the serialized form
  266. int32_t serializedTrieSize = utrie2_serialize(sel->trie, nullptr, 0, status);
  267. if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
  268. return 0;
  269. }
  270. *status = U_ZERO_ERROR;
  271. DataHeader header;
  272. uprv_memset(&header, 0, sizeof(header));
  273. header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
  274. header.dataHeader.magic1 = 0xda;
  275. header.dataHeader.magic2 = 0x27;
  276. uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));
  277. int32_t indexes[UCNVSEL_INDEX_COUNT] = {
  278. serializedTrieSize,
  279. sel->pvCount,
  280. sel->encodingsCount,
  281. sel->encodingStrLength
  282. };
  283. int32_t totalSize =
  284. header.dataHeader.headerSize +
  285. (int32_t)sizeof(indexes) +
  286. serializedTrieSize +
  287. sel->pvCount * 4 +
  288. sel->encodingStrLength;
  289. indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
  290. if (totalSize > bufferCapacity) {
  291. *status = U_BUFFER_OVERFLOW_ERROR;
  292. return totalSize;
  293. }
  294. // ok, save!
  295. int32_t length = header.dataHeader.headerSize;
  296. uprv_memcpy(p, &header, sizeof(header));
  297. uprv_memset(p + sizeof(header), 0, length - sizeof(header));
  298. p += length;
  299. length = (int32_t)sizeof(indexes);
  300. uprv_memcpy(p, indexes, length);
  301. p += length;
  302. utrie2_serialize(sel->trie, p, serializedTrieSize, status);
  303. p += serializedTrieSize;
  304. length = sel->pvCount * 4;
  305. uprv_memcpy(p, sel->pv, length);
  306. p += length;
  307. uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
  308. p += sel->encodingStrLength;
  309. return totalSize;
  310. }
  311. /**
  312. * swap a selector into the desired Endianness and Asciiness of
  313. * the system. Just as FYI, selectors are always saved in the format
  314. * of the system that created them. They are only converted if used
  315. * on another system. In other words, selectors created on different
  316. * system can be different even if the params are identical (endianness
  317. * and Asciiness differences only)
  318. *
  319. * @param ds pointer to data swapper containing swapping info
  320. * @param inData pointer to incoming data
  321. * @param length length of inData in bytes
  322. * @param outData pointer to output data. Capacity should
  323. * be at least equal to capacity of inData
  324. * @param status an in/out ICU UErrorCode
  325. * @return 0 on failure, number of bytes swapped on success
  326. * number of bytes swapped can be smaller than length
  327. */
  328. static int32_t
  329. ucnvsel_swap(const UDataSwapper *ds,
  330. const void *inData, int32_t length,
  331. void *outData, UErrorCode *status) {
  332. /* udata_swapDataHeader checks the arguments */
  333. int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status);
  334. if(U_FAILURE(*status)) {
  335. return 0;
  336. }
  337. /* check data format and format version */
  338. const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
  339. if(!(
  340. pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */
  341. pInfo->dataFormat[1] == 0x53 &&
  342. pInfo->dataFormat[2] == 0x65 &&
  343. pInfo->dataFormat[3] == 0x6c
  344. )) {
  345. udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n",
  346. pInfo->dataFormat[0], pInfo->dataFormat[1],
  347. pInfo->dataFormat[2], pInfo->dataFormat[3]);
  348. *status = U_INVALID_FORMAT_ERROR;
  349. return 0;
  350. }
  351. if(pInfo->formatVersion[0] != 1) {
  352. udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n",
  353. pInfo->formatVersion[0]);
  354. *status = U_UNSUPPORTED_ERROR;
  355. return 0;
  356. }
  357. if(length >= 0) {
  358. length -= headerSize;
  359. if(length < 16*4) {
  360. udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n",
  361. length);
  362. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  363. return 0;
  364. }
  365. }
  366. const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
  367. uint8_t *outBytes = (uint8_t *)outData + headerSize;
  368. /* read the indexes */
  369. const int32_t *inIndexes = (const int32_t *)inBytes;
  370. int32_t indexes[16];
  371. int32_t i;
  372. for(i = 0; i < 16; ++i) {
  373. indexes[i] = udata_readInt32(ds, inIndexes[i]);
  374. }
  375. /* get the total length of the data */
  376. int32_t size = indexes[UCNVSEL_INDEX_SIZE];
  377. if(length >= 0) {
  378. if(length < size) {
  379. udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n",
  380. length);
  381. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  382. return 0;
  383. }
  384. /* copy the data for inaccessible bytes */
  385. if(inBytes != outBytes) {
  386. uprv_memcpy(outBytes, inBytes, size);
  387. }
  388. int32_t offset = 0, count;
  389. /* swap the int32_t indexes[] */
  390. count = UCNVSEL_INDEX_COUNT*4;
  391. ds->swapArray32(ds, inBytes, count, outBytes, status);
  392. offset += count;
  393. /* swap the UTrie2 */
  394. count = indexes[UCNVSEL_INDEX_TRIE_SIZE];
  395. utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status);
  396. offset += count;
  397. /* swap the uint32_t pv[] */
  398. count = indexes[UCNVSEL_INDEX_PV_COUNT]*4;
  399. ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status);
  400. offset += count;
  401. /* swap the encoding names */
  402. count = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
  403. ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status);
  404. offset += count;
  405. U_ASSERT(offset == size);
  406. }
  407. return headerSize + size;
  408. }
  409. /* unserialize a selector */
  410. U_CAPI UConverterSelector* U_EXPORT2
  411. ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
  412. // check if already failed
  413. if (U_FAILURE(*status)) {
  414. return nullptr;
  415. }
  416. // ensure args make sense!
  417. const uint8_t *p = (const uint8_t *)buffer;
  418. if (length <= 0 ||
  419. (length > 0 && (p == nullptr || (U_POINTER_MASK_LSB(p, 3) != 0)))
  420. ) {
  421. *status = U_ILLEGAL_ARGUMENT_ERROR;
  422. return nullptr;
  423. }
  424. // header
  425. if (length < 32) {
  426. // not even enough space for a minimal header
  427. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  428. return nullptr;
  429. }
  430. const DataHeader *pHeader = (const DataHeader *)p;
  431. if (!(
  432. pHeader->dataHeader.magic1==0xda &&
  433. pHeader->dataHeader.magic2==0x27 &&
  434. pHeader->info.dataFormat[0] == 0x43 &&
  435. pHeader->info.dataFormat[1] == 0x53 &&
  436. pHeader->info.dataFormat[2] == 0x65 &&
  437. pHeader->info.dataFormat[3] == 0x6c
  438. )) {
  439. /* header not valid or dataFormat not recognized */
  440. *status = U_INVALID_FORMAT_ERROR;
  441. return nullptr;
  442. }
  443. if (pHeader->info.formatVersion[0] != 1) {
  444. *status = U_UNSUPPORTED_ERROR;
  445. return nullptr;
  446. }
  447. uint8_t* swapped = nullptr;
  448. if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
  449. pHeader->info.charsetFamily != U_CHARSET_FAMILY
  450. ) {
  451. // swap the data
  452. UDataSwapper *ds =
  453. udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
  454. int32_t totalSize = ucnvsel_swap(ds, p, -1, nullptr, status);
  455. if (U_FAILURE(*status)) {
  456. udata_closeSwapper(ds);
  457. return nullptr;
  458. }
  459. if (length < totalSize) {
  460. udata_closeSwapper(ds);
  461. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  462. return nullptr;
  463. }
  464. swapped = (uint8_t*)uprv_malloc(totalSize);
  465. if (swapped == nullptr) {
  466. udata_closeSwapper(ds);
  467. *status = U_MEMORY_ALLOCATION_ERROR;
  468. return nullptr;
  469. }
  470. ucnvsel_swap(ds, p, length, swapped, status);
  471. udata_closeSwapper(ds);
  472. if (U_FAILURE(*status)) {
  473. uprv_free(swapped);
  474. return nullptr;
  475. }
  476. p = swapped;
  477. pHeader = (const DataHeader *)p;
  478. }
  479. if (length < (pHeader->dataHeader.headerSize + 16 * 4)) {
  480. // not even enough space for the header and the indexes
  481. uprv_free(swapped);
  482. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  483. return nullptr;
  484. }
  485. p += pHeader->dataHeader.headerSize;
  486. length -= pHeader->dataHeader.headerSize;
  487. // indexes
  488. const int32_t *indexes = (const int32_t *)p;
  489. if (length < indexes[UCNVSEL_INDEX_SIZE]) {
  490. uprv_free(swapped);
  491. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  492. return nullptr;
  493. }
  494. p += UCNVSEL_INDEX_COUNT * 4;
  495. // create and populate the selector object
  496. UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
  497. char **encodings =
  498. (char **)uprv_malloc(
  499. indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
  500. if (sel == nullptr || encodings == nullptr) {
  501. uprv_free(swapped);
  502. uprv_free(sel);
  503. uprv_free(encodings);
  504. *status = U_MEMORY_ALLOCATION_ERROR;
  505. return nullptr;
  506. }
  507. uprv_memset(sel, 0, sizeof(UConverterSelector));
  508. sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
  509. sel->encodings = encodings;
  510. sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
  511. sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
  512. sel->swapped = swapped;
  513. // trie
  514. sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
  515. p, indexes[UCNVSEL_INDEX_TRIE_SIZE], nullptr,
  516. status);
  517. p += indexes[UCNVSEL_INDEX_TRIE_SIZE];
  518. if (U_FAILURE(*status)) {
  519. ucnvsel_close(sel);
  520. return nullptr;
  521. }
  522. // bit vectors
  523. sel->pv = (uint32_t *)p;
  524. p += sel->pvCount * 4;
  525. // encoding names
  526. char* s = (char*)p;
  527. for (int32_t i = 0; i < sel->encodingsCount; ++i) {
  528. sel->encodings[i] = s;
  529. s += uprv_strlen(s) + 1;
  530. }
  531. p += sel->encodingStrLength;
  532. return sel;
  533. }
  534. // a bunch of functions for the enumeration thingie! Nothing fancy here. Just
  535. // iterate over the selected encodings
  536. struct Enumerator {
  537. int16_t* index;
  538. int16_t length;
  539. int16_t cur;
  540. const UConverterSelector* sel;
  541. };
  542. U_CDECL_BEGIN
  543. static void U_CALLCONV
  544. ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
  545. uprv_free(((Enumerator*)(enumerator->context))->index);
  546. uprv_free(enumerator->context);
  547. uprv_free(enumerator);
  548. }
  549. static int32_t U_CALLCONV
  550. ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
  551. // check if already failed
  552. if (U_FAILURE(*status)) {
  553. return 0;
  554. }
  555. return ((Enumerator*)(enumerator->context))->length;
  556. }
  557. static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
  558. int32_t* resultLength,
  559. UErrorCode* status) {
  560. // check if already failed
  561. if (U_FAILURE(*status)) {
  562. return nullptr;
  563. }
  564. int16_t cur = ((Enumerator*)(enumerator->context))->cur;
  565. const UConverterSelector* sel;
  566. const char* result;
  567. if (cur >= ((Enumerator*)(enumerator->context))->length) {
  568. return nullptr;
  569. }
  570. sel = ((Enumerator*)(enumerator->context))->sel;
  571. result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ];
  572. ((Enumerator*)(enumerator->context))->cur++;
  573. if (resultLength) {
  574. *resultLength = (int32_t)uprv_strlen(result);
  575. }
  576. return result;
  577. }
  578. static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
  579. UErrorCode* status) {
  580. // check if already failed
  581. if (U_FAILURE(*status)) {
  582. return ;
  583. }
  584. ((Enumerator*)(enumerator->context))->cur = 0;
  585. }
  586. U_CDECL_END
  587. static const UEnumeration defaultEncodings = {
  588. nullptr,
  589. nullptr,
  590. ucnvsel_close_selector_iterator,
  591. ucnvsel_count_encodings,
  592. uenum_unextDefault,
  593. ucnvsel_next_encoding,
  594. ucnvsel_reset_iterator
  595. };
  596. // internal fn to intersect two sets of masks
  597. // returns whether the mask has reduced to all zeros
  598. static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
  599. int32_t i;
  600. uint32_t oredDest = 0;
  601. for (i = 0 ; i < len ; ++i) {
  602. oredDest |= (dest[i] &= source1[i]);
  603. }
  604. return oredDest == 0;
  605. }
  // Internal helper: returns the number of 1 bits in the first len words of
  // mask. Each step clears the lowest set bit, so the inner loop runs once per
  // set bit (see http://graphics.stanford.edu/~seander/bithacks.html).
  static int16_t countOnes(uint32_t* mask, int32_t len) {
    int32_t bitCount = 0;
    for (int32_t word = 0; word < len; ++word) {
      uint32_t remaining = mask[word];
      while (remaining != 0) {
        remaining &= remaining - 1;  // clear the least significant bit set
        ++bitCount;
      }
    }
    return (int16_t)bitCount;
  }
  619. /* internal function! */
  620. static UEnumeration *selectForMask(const UConverterSelector* sel,
  621. uint32_t *theMask, UErrorCode *status) {
  622. LocalMemory<uint32_t> mask(theMask);
  623. // this is the context we will use. Store a table of indices to which
  624. // encodings are legit.
  625. LocalMemory<Enumerator> result(static_cast<Enumerator *>(uprv_malloc(sizeof(Enumerator))));
  626. if (result.isNull()) {
  627. *status = U_MEMORY_ALLOCATION_ERROR;
  628. return nullptr;
  629. }
  630. result->index = nullptr; // this will be allocated later!
  631. result->length = result->cur = 0;
  632. result->sel = sel;
  633. LocalMemory<UEnumeration> en(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
  634. if (en.isNull()) {
  635. // TODO(markus): Combine Enumerator and UEnumeration into one struct.
  636. *status = U_MEMORY_ALLOCATION_ERROR;
  637. return nullptr;
  638. }
  639. memcpy(en.getAlias(), &defaultEncodings, sizeof(UEnumeration));
  640. int32_t columns = (sel->encodingsCount+31)/32;
  641. int16_t numOnes = countOnes(mask.getAlias(), columns);
  642. // now, we know the exact space we need for index
  643. if (numOnes > 0) {
  644. result->index = static_cast<int16_t*>(uprv_malloc(numOnes * sizeof(int16_t)));
  645. if (result->index == nullptr) {
  646. *status = U_MEMORY_ALLOCATION_ERROR;
  647. return nullptr;
  648. }
  649. int32_t i, j;
  650. int16_t k = 0;
  651. for (j = 0 ; j < columns; j++) {
  652. uint32_t v = mask[j];
  653. for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) {
  654. if ((v & 1) != 0) {
  655. result->index[result->length++] = k;
  656. }
  657. v >>= 1;
  658. }
  659. }
  660. } //otherwise, index will remain nullptr (and will never be touched by
  661. //the enumerator code anyway)
  662. en->context = result.orphan();
  663. return en.orphan();
  664. }
  665. /* check a string against the selector - UTF16 version */
  666. U_CAPI UEnumeration * U_EXPORT2
  667. ucnvsel_selectForString(const UConverterSelector* sel,
  668. const char16_t *s, int32_t length, UErrorCode *status) {
  669. // check if already failed
  670. if (U_FAILURE(*status)) {
  671. return nullptr;
  672. }
  673. // ensure args make sense!
  674. if (sel == nullptr || (s == nullptr && length != 0)) {
  675. *status = U_ILLEGAL_ARGUMENT_ERROR;
  676. return nullptr;
  677. }
  678. int32_t columns = (sel->encodingsCount+31)/32;
  679. uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
  680. if (mask == nullptr) {
  681. *status = U_MEMORY_ALLOCATION_ERROR;
  682. return nullptr;
  683. }
  684. uprv_memset(mask, ~0, columns *4);
  685. if(s!=nullptr) {
  686. const char16_t *limit;
  687. if (length >= 0) {
  688. limit = s + length;
  689. } else {
  690. limit = nullptr;
  691. }
  692. while (limit == nullptr ? *s != 0 : s != limit) {
  693. UChar32 c;
  694. uint16_t pvIndex;
  695. UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex);
  696. if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
  697. break;
  698. }
  699. }
  700. }
  701. return selectForMask(sel, mask, status);
  702. }
  703. /* check a string against the selector - UTF8 version */
  704. U_CAPI UEnumeration * U_EXPORT2
  705. ucnvsel_selectForUTF8(const UConverterSelector* sel,
  706. const char *s, int32_t length, UErrorCode *status) {
  707. // check if already failed
  708. if (U_FAILURE(*status)) {
  709. return nullptr;
  710. }
  711. // ensure args make sense!
  712. if (sel == nullptr || (s == nullptr && length != 0)) {
  713. *status = U_ILLEGAL_ARGUMENT_ERROR;
  714. return nullptr;
  715. }
  716. int32_t columns = (sel->encodingsCount+31)/32;
  717. uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
  718. if (mask == nullptr) {
  719. *status = U_MEMORY_ALLOCATION_ERROR;
  720. return nullptr;
  721. }
  722. uprv_memset(mask, ~0, columns *4);
  723. if (length < 0) {
  724. length = (int32_t)uprv_strlen(s);
  725. }
  726. if(s!=nullptr) {
  727. const char *limit = s + length;
  728. while (s != limit) {
  729. uint16_t pvIndex;
  730. UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex);
  731. if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
  732. break;
  733. }
  734. }
  735. }
  736. return selectForMask(sel, mask, status);
  737. }
  738. #endif // !UCONFIG_NO_CONVERSION