collationdatareader.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationdatareader.cpp
  9. *
  10. * created on: 2013feb07
  11. * created by: Markus W. Scherer
  12. */
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/ucol.h"
  16. #include "unicode/udata.h"
  17. #include "unicode/uscript.h"
  18. #include "cmemory.h"
  19. #include "collation.h"
  20. #include "collationdata.h"
  21. #include "collationdatareader.h"
  22. #include "collationfastlatin.h"
  23. #include "collationkeys.h"
  24. #include "collationrootelements.h"
  25. #include "collationsettings.h"
  26. #include "collationtailoring.h"
  27. #include "collunsafe.h"
  28. #include "normalizer2impl.h"
  29. #include "uassert.h"
  30. #include "ucmndata.h"
  31. #include "utrie2.h"
  32. U_NAMESPACE_BEGIN
  namespace {

  /**
   * Reads one slot of the indexes[] array with an upper-bound check.
   *
   * @param indexes the array of int32_t index values
   * @param length  the number of slots in indexes[]
   * @param i       the slot to read
   * @return indexes[i] if i < length, otherwise -1
   */
  int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
      if(i >= length) {
          // The slot is not present in this data.
          return -1;
      }
      return indexes[i];
  }

  }  // namespace
  38. void
  39. CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
  40. CollationTailoring &tailoring, UErrorCode &errorCode) {
  41. if(U_FAILURE(errorCode)) { return; }
  42. if(base != nullptr) {
  43. if(inBytes == nullptr || (0 <= inLength && inLength < 24)) {
  44. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  45. return;
  46. }
  47. const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
  48. if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
  49. isAcceptable(tailoring.version, nullptr, nullptr, &header->info))) {
  50. errorCode = U_INVALID_FORMAT_ERROR;
  51. return;
  52. }
  53. if(base->getUCAVersion() != tailoring.getUCAVersion()) {
  54. errorCode = U_COLLATOR_VERSION_MISMATCH;
  55. return;
  56. }
  57. int32_t headerLength = header->dataHeader.headerSize;
  58. inBytes += headerLength;
  59. if(inLength >= 0) {
  60. inLength -= headerLength;
  61. }
  62. }
  63. if(inBytes == nullptr || (0 <= inLength && inLength < 8)) {
  64. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  65. return;
  66. }
  67. const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
  68. int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
  69. if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
  70. errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
  71. return;
  72. }
  73. // Assume that the tailoring data is in initial state,
  74. // with nullptr pointers and 0 lengths.
  75. // Set pointers to non-empty data parts.
  76. // Do this in order of their byte offsets. (Should help porting to Java.)
  77. int32_t index; // one of the indexes[] slots
  78. int32_t offset; // byte offset for the index part
  79. int32_t length; // number of bytes in the index part
  80. if(indexesLength > IX_TOTAL_SIZE) {
  81. length = inIndexes[IX_TOTAL_SIZE];
  82. } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
  83. length = inIndexes[indexesLength - 1];
  84. } else {
  85. length = 0; // only indexes, and inLength was already checked for them
  86. }
  87. if(0 <= inLength && inLength < length) {
  88. errorCode = U_INVALID_FORMAT_ERROR;
  89. return;
  90. }
  91. const CollationData *baseData = base == nullptr ? nullptr : base->data;
  92. const int32_t *reorderCodes = nullptr;
  93. int32_t reorderCodesLength = 0;
  94. const uint32_t *reorderRanges = nullptr;
  95. int32_t reorderRangesLength = 0;
  96. index = IX_REORDER_CODES_OFFSET;
  97. offset = getIndex(inIndexes, indexesLength, index);
  98. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  99. if(length >= 4) {
  100. if(baseData == nullptr) {
  101. // We assume for collation settings that
  102. // the base data does not have a reordering.
  103. errorCode = U_INVALID_FORMAT_ERROR;
  104. return;
  105. }
  106. reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
  107. reorderCodesLength = length / 4;
  108. // The reorderRanges (if any) are the trailing reorderCodes entries.
  109. // Split the array at the boundary.
  110. // Script or reorder codes do not exceed 16-bit values.
  111. // Range limits are stored in the upper 16 bits, and are never 0.
  112. while(reorderRangesLength < reorderCodesLength &&
  113. (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
  114. ++reorderRangesLength;
  115. }
  116. U_ASSERT(reorderRangesLength < reorderCodesLength);
  117. if(reorderRangesLength != 0) {
  118. reorderCodesLength -= reorderRangesLength;
  119. reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
  120. }
  121. }
  122. // There should be a reorder table only if there are reorder codes.
  123. // However, when there are reorder codes the reorder table may be omitted to reduce
  124. // the data size.
  125. const uint8_t *reorderTable = nullptr;
  126. index = IX_REORDER_TABLE_OFFSET;
  127. offset = getIndex(inIndexes, indexesLength, index);
  128. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  129. if(length >= 256) {
  130. if(reorderCodesLength == 0) {
  131. errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
  132. return;
  133. }
  134. reorderTable = inBytes + offset;
  135. } else {
  136. // If we have reorder codes, then build the reorderTable at the end,
  137. // when the CollationData is otherwise complete.
  138. }
  139. if(baseData != nullptr && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
  140. errorCode = U_INVALID_FORMAT_ERROR;
  141. return;
  142. }
  143. CollationData *data = nullptr; // Remains nullptr if there are no mappings.
  144. index = IX_TRIE_OFFSET;
  145. offset = getIndex(inIndexes, indexesLength, index);
  146. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  147. if(length >= 8) {
  148. if(!tailoring.ensureOwnedData(errorCode)) { return; }
  149. data = tailoring.ownedData;
  150. data->base = baseData;
  151. data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
  152. data->trie = tailoring.trie = utrie2_openFromSerialized(
  153. UTRIE2_32_VALUE_BITS, inBytes + offset, length, nullptr,
  154. &errorCode);
  155. if(U_FAILURE(errorCode)) { return; }
  156. } else if(baseData != nullptr) {
  157. // Use the base data. Only the settings are tailored.
  158. tailoring.data = baseData;
  159. } else {
  160. errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
  161. return;
  162. }
  163. index = IX_CES_OFFSET;
  164. offset = getIndex(inIndexes, indexesLength, index);
  165. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  166. if(length >= 8) {
  167. if(data == nullptr) {
  168. errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
  169. return;
  170. }
  171. data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
  172. data->cesLength = length / 8;
  173. }
  174. index = IX_CE32S_OFFSET;
  175. offset = getIndex(inIndexes, indexesLength, index);
  176. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  177. if(length >= 4) {
  178. if(data == nullptr) {
  179. errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
  180. return;
  181. }
  182. data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
  183. data->ce32sLength = length / 4;
  184. }
  185. int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
  186. if(jamoCE32sStart >= 0) {
  187. if(data == nullptr || data->ce32s == nullptr) {
  188. errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
  189. return;
  190. }
  191. data->jamoCE32s = data->ce32s + jamoCE32sStart;
  192. } else if(data == nullptr) {
  193. // Nothing to do.
  194. } else if(baseData != nullptr) {
  195. data->jamoCE32s = baseData->jamoCE32s;
  196. } else {
  197. errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
  198. return;
  199. }
  200. index = IX_ROOT_ELEMENTS_OFFSET;
  201. offset = getIndex(inIndexes, indexesLength, index);
  202. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  203. if(length >= 4) {
  204. length /= 4;
  205. if(data == nullptr || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
  206. errorCode = U_INVALID_FORMAT_ERROR;
  207. return;
  208. }
  209. data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
  210. data->rootElementsLength = length;
  211. uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
  212. if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
  213. errorCode = U_INVALID_FORMAT_ERROR;
  214. return;
  215. }
  216. uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
  217. if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
  218. // [fixed last secondary common byte] is too low,
  219. // and secondary weights would collide with compressed common secondaries.
  220. errorCode = U_INVALID_FORMAT_ERROR;
  221. return;
  222. }
  223. }
  224. index = IX_CONTEXTS_OFFSET;
  225. offset = getIndex(inIndexes, indexesLength, index);
  226. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  227. if(length >= 2) {
  228. if(data == nullptr) {
  229. errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
  230. return;
  231. }
  232. data->contexts = reinterpret_cast<const char16_t *>(inBytes + offset);
  233. data->contextsLength = length / 2;
  234. }
  235. index = IX_UNSAFE_BWD_OFFSET;
  236. offset = getIndex(inIndexes, indexesLength, index);
  237. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  238. if(length >= 2) {
  239. if(data == nullptr) {
  240. errorCode = U_INVALID_FORMAT_ERROR;
  241. return;
  242. }
  243. if(baseData == nullptr) {
  244. #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
  245. tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
  246. if(tailoring.unsafeBackwardSet == nullptr) {
  247. errorCode = U_MEMORY_ALLOCATION_ERROR;
  248. return;
  249. } else if (U_FAILURE(errorCode)) {
  250. return;
  251. }
  252. #else
  253. // Create the unsafe-backward set for the root collator.
  254. // Include all non-zero combining marks and trail surrogates.
  255. // We do this at load time, rather than at build time,
  256. // to simplify Unicode version bootstrapping:
  257. // The root data builder only needs the new FractionalUCA.txt data,
  258. // but it need not be built with a version of ICU already updated to
  259. // the corresponding new Unicode Character Database.
  260. //
  261. // The following is an optimized version of
  262. // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
  263. // It is faster and requires fewer code dependencies.
  264. tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
  265. if(tailoring.unsafeBackwardSet == nullptr) {
  266. errorCode = U_MEMORY_ALLOCATION_ERROR;
  267. return;
  268. }
  269. data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
  270. #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
  271. } else {
  272. // Clone the root collator's set contents.
  273. tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
  274. baseData->unsafeBackwardSet->cloneAsThawed());
  275. if(tailoring.unsafeBackwardSet == nullptr) {
  276. errorCode = U_MEMORY_ALLOCATION_ERROR;
  277. return;
  278. }
  279. }
  280. // Add the ranges from the data file to the unsafe-backward set.
  281. USerializedSet sset;
  282. const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
  283. if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
  284. errorCode = U_INVALID_FORMAT_ERROR;
  285. return;
  286. }
  287. int32_t count = uset_getSerializedRangeCount(&sset);
  288. for(int32_t i = 0; i < count; ++i) {
  289. UChar32 start, end;
  290. uset_getSerializedRange(&sset, i, &start, &end);
  291. tailoring.unsafeBackwardSet->add(start, end);
  292. }
  293. // Mark each lead surrogate as "unsafe"
  294. // if any of its 1024 associated supplementary code points is "unsafe".
  295. UChar32 c = 0x10000;
  296. for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
  297. if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
  298. tailoring.unsafeBackwardSet->add(lead);
  299. }
  300. }
  301. tailoring.unsafeBackwardSet->freeze();
  302. data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
  303. } else if(data == nullptr) {
  304. // Nothing to do.
  305. } else if(baseData != nullptr) {
  306. // No tailoring-specific data: Alias the root collator's set.
  307. data->unsafeBackwardSet = baseData->unsafeBackwardSet;
  308. } else {
  309. errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
  310. return;
  311. }
  312. // If the fast Latin format version is different,
  313. // or the version is set to 0 for "no fast Latin table",
  314. // then just always use the normal string comparison path.
  315. if(data != nullptr) {
  316. data->fastLatinTable = nullptr;
  317. data->fastLatinTableLength = 0;
  318. if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
  319. index = IX_FAST_LATIN_TABLE_OFFSET;
  320. offset = getIndex(inIndexes, indexesLength, index);
  321. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  322. if(length >= 2) {
  323. data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
  324. data->fastLatinTableLength = length / 2;
  325. if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
  326. errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
  327. return;
  328. }
  329. } else if(baseData != nullptr) {
  330. data->fastLatinTable = baseData->fastLatinTable;
  331. data->fastLatinTableLength = baseData->fastLatinTableLength;
  332. }
  333. }
  334. }
  335. index = IX_SCRIPTS_OFFSET;
  336. offset = getIndex(inIndexes, indexesLength, index);
  337. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  338. if(length >= 2) {
  339. if(data == nullptr) {
  340. errorCode = U_INVALID_FORMAT_ERROR;
  341. return;
  342. }
  343. const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
  344. int32_t scriptsLength = length / 2;
  345. data->numScripts = scripts[0];
  346. // There must be enough entries for both arrays, including more than two range starts.
  347. data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
  348. if(data->scriptStartsLength <= 2 ||
  349. CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
  350. errorCode = U_INVALID_FORMAT_ERROR;
  351. return;
  352. }
  353. data->scriptsIndex = scripts + 1;
  354. data->scriptStarts = scripts + 1 + data->numScripts + 16;
  355. if(!(data->scriptStarts[0] == 0 &&
  356. data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
  357. data->scriptStarts[data->scriptStartsLength - 1] ==
  358. (Collation::TRAIL_WEIGHT_BYTE << 8))) {
  359. errorCode = U_INVALID_FORMAT_ERROR;
  360. return;
  361. }
  362. } else if(data == nullptr) {
  363. // Nothing to do.
  364. } else if(baseData != nullptr) {
  365. data->numScripts = baseData->numScripts;
  366. data->scriptsIndex = baseData->scriptsIndex;
  367. data->scriptStarts = baseData->scriptStarts;
  368. data->scriptStartsLength = baseData->scriptStartsLength;
  369. }
  370. index = IX_COMPRESSIBLE_BYTES_OFFSET;
  371. offset = getIndex(inIndexes, indexesLength, index);
  372. length = getIndex(inIndexes, indexesLength, index + 1) - offset;
  373. if(length >= 256) {
  374. if(data == nullptr) {
  375. errorCode = U_INVALID_FORMAT_ERROR;
  376. return;
  377. }
  378. data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
  379. } else if(data == nullptr) {
  380. // Nothing to do.
  381. } else if(baseData != nullptr) {
  382. data->compressibleBytes = baseData->compressibleBytes;
  383. } else {
  384. errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
  385. return;
  386. }
  387. const CollationSettings &ts = *tailoring.settings;
  388. int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
  389. uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
  390. int32_t fastLatinOptions = CollationFastLatin::getOptions(
  391. tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
  392. if(options == ts.options && ts.variableTop != 0 &&
  393. reorderCodesLength == ts.reorderCodesLength &&
  394. (reorderCodesLength == 0 ||
  395. uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0) &&
  396. fastLatinOptions == ts.fastLatinOptions &&
  397. (fastLatinOptions < 0 ||
  398. uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
  399. sizeof(fastLatinPrimaries)) == 0)) {
  400. return;
  401. }
  402. CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
  403. if(settings == nullptr) {
  404. errorCode = U_MEMORY_ALLOCATION_ERROR;
  405. return;
  406. }
  407. settings->options = options;
  408. // Set variableTop from options and scripts data.
  409. settings->variableTop = tailoring.data->getLastPrimaryForGroup(
  410. UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
  411. if(settings->variableTop == 0) {
  412. errorCode = U_INVALID_FORMAT_ERROR;
  413. return;
  414. }
  415. if(reorderCodesLength != 0) {
  416. settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
  417. reorderRanges, reorderRangesLength,
  418. reorderTable, errorCode);
  419. }
  420. settings->fastLatinOptions = CollationFastLatin::getOptions(
  421. tailoring.data, *settings,
  422. settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
  423. }
  424. UBool U_CALLCONV
  425. CollationDataReader::isAcceptable(void *context,
  426. const char * /* type */, const char * /*name*/,
  427. const UDataInfo *pInfo) {
  428. if(
  429. pInfo->size >= 20 &&
  430. pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
  431. pInfo->charsetFamily == U_CHARSET_FAMILY &&
  432. pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
  433. pInfo->dataFormat[1] == 0x43 &&
  434. pInfo->dataFormat[2] == 0x6f &&
  435. pInfo->dataFormat[3] == 0x6c &&
  436. pInfo->formatVersion[0] == 5
  437. ) {
  438. UVersionInfo *version = static_cast<UVersionInfo *>(context);
  439. if(version != nullptr) {
  440. uprv_memcpy(version, pInfo->dataVersion, 4);
  441. }
  442. return true;
  443. } else {
  444. return false;
  445. }
  446. }
  447. U_NAMESPACE_END
  448. #endif // !UCONFIG_NO_COLLATION