collationdatawriter.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationdatawriter.cpp
  9. *
  10. * created on: 2013aug06
  11. * created by: Markus W. Scherer
  12. */
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/tblcoll.h"
  16. #include "unicode/udata.h"
  17. #include "unicode/uniset.h"
  18. #include "cmemory.h"
  19. #include "collationdata.h"
  20. #include "collationdatabuilder.h"
  21. #include "collationdatareader.h"
  22. #include "collationdatawriter.h"
  23. #include "collationfastlatin.h"
  24. #include "collationsettings.h"
  25. #include "collationtailoring.h"
  26. #include "uassert.h"
  27. #include "ucmndata.h"
  28. U_NAMESPACE_BEGIN
  29. uint8_t *
  30. RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
  31. if(U_FAILURE(errorCode)) { return nullptr; }
  32. LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
  33. if(buffer.isNull()) {
  34. errorCode = U_MEMORY_ALLOCATION_ERROR;
  35. return nullptr;
  36. }
  37. length = cloneBinary(buffer.getAlias(), 20000, errorCode);
  38. if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
  39. if(buffer.allocateInsteadAndCopy(length, 0) == nullptr) {
  40. errorCode = U_MEMORY_ALLOCATION_ERROR;
  41. return nullptr;
  42. }
  43. errorCode = U_ZERO_ERROR;
  44. length = cloneBinary(buffer.getAlias(), length, errorCode);
  45. }
  46. if(U_FAILURE(errorCode)) { return nullptr; }
  47. return buffer.orphan();
  48. }
  49. int32_t
  50. RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
  51. int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
  52. return CollationDataWriter::writeTailoring(
  53. *tailoring, *settings, indexes, dest, capacity,
  54. errorCode);
  55. }
  56. static const UDataInfo dataInfo = {
  57. sizeof(UDataInfo),
  58. 0,
  59. U_IS_BIG_ENDIAN,
  60. U_CHARSET_FAMILY,
  61. U_SIZEOF_UCHAR,
  62. 0,
  63. { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
  64. { 5, 0, 0, 0 }, // formatVersion
  65. { 6, 3, 0, 0 } // dataVersion
  66. };
  67. int32_t
  68. CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
  69. const void *rootElements, int32_t rootElementsLength,
  70. int32_t indexes[], uint8_t *dest, int32_t capacity,
  71. UErrorCode &errorCode) {
  72. return write(true, nullptr,
  73. data, settings,
  74. rootElements, rootElementsLength,
  75. indexes, dest, capacity, errorCode);
  76. }
  77. int32_t
  78. CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
  79. int32_t indexes[], uint8_t *dest, int32_t capacity,
  80. UErrorCode &errorCode) {
  81. return write(false, t.version,
  82. *t.data, settings,
  83. nullptr, 0,
  84. indexes, dest, capacity, errorCode);
  85. }
  86. int32_t
  87. CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
  88. const CollationData &data, const CollationSettings &settings,
  89. const void *rootElements, int32_t rootElementsLength,
  90. int32_t indexes[], uint8_t *dest, int32_t capacity,
  91. UErrorCode &errorCode) {
  92. if(U_FAILURE(errorCode)) { return 0; }
  93. if(capacity < 0 || (capacity > 0 && dest == nullptr)) {
  94. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  95. return 0;
  96. }
  97. // Figure out which data items to write before settling on
  98. // the indexes length and writing offsets.
  99. // For any data item, we need to write the start and limit offsets,
  100. // so the indexes length must be at least index-of-start-offset + 2.
  101. int32_t indexesLength;
  102. UBool hasMappings;
  103. UnicodeSet unsafeBackwardSet;
  104. const CollationData *baseData = data.base;
  105. int32_t fastLatinVersion;
  106. if(data.fastLatinTable != nullptr) {
  107. fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
  108. } else {
  109. fastLatinVersion = 0;
  110. }
  111. int32_t fastLatinTableLength = 0;
  112. if(isBase) {
  113. // For the root collator, we write an even number of indexes
  114. // so that we start with an 8-aligned offset.
  115. indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
  116. U_ASSERT(settings.reorderCodesLength == 0);
  117. hasMappings = true;
  118. unsafeBackwardSet = *data.unsafeBackwardSet;
  119. fastLatinTableLength = data.fastLatinTableLength;
  120. } else if(baseData == nullptr) {
  121. hasMappings = false;
  122. if(settings.reorderCodesLength == 0) {
  123. // only options
  124. indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here
  125. } else {
  126. // only options, reorder codes, and the reorder table
  127. indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
  128. }
  129. } else {
  130. hasMappings = true;
  131. // Tailored mappings, and what else?
  132. // Check in ascending order of optional tailoring data items.
  133. indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
  134. if(data.contextsLength != 0) {
  135. indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
  136. }
  137. unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
  138. if(!unsafeBackwardSet.isEmpty()) {
  139. indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
  140. }
  141. if(data.fastLatinTable != baseData->fastLatinTable) {
  142. fastLatinTableLength = data.fastLatinTableLength;
  143. indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
  144. }
  145. }
  146. UVector32 codesAndRanges(errorCode);
  147. const int32_t *reorderCodes = settings.reorderCodes;
  148. int32_t reorderCodesLength = settings.reorderCodesLength;
  149. if(settings.hasReordering() &&
  150. CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
  151. // Rebuild the full list of reorder ranges.
  152. // The list in the settings is truncated for efficiency.
  153. data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
  154. // Write the codes, then the ranges.
  155. for(int32_t i = 0; i < reorderCodesLength; ++i) {
  156. codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
  157. }
  158. if(U_FAILURE(errorCode)) { return 0; }
  159. reorderCodes = codesAndRanges.getBuffer();
  160. reorderCodesLength = codesAndRanges.size();
  161. }
  162. int32_t headerSize;
  163. if(isBase) {
  164. headerSize = 0; // udata_create() writes the header
  165. } else {
  166. DataHeader header;
  167. header.dataHeader.magic1 = 0xda;
  168. header.dataHeader.magic2 = 0x27;
  169. uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
  170. uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
  171. headerSize = (int32_t)sizeof(header);
  172. U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes
  173. if(hasMappings && data.cesLength != 0) {
  174. // Sum of the sizes of the data items which are
  175. // not automatically multiples of 8 bytes and which are placed before the CEs.
  176. int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
  177. if((sum & 7) != 0) {
  178. // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
  179. // We add to the header size here.
  180. // Alternatively, we could increment the indexesLength
  181. // or add a few bytes to the reorderTable.
  182. headerSize += 4;
  183. }
  184. }
  185. header.dataHeader.headerSize = (uint16_t)headerSize;
  186. if(headerSize <= capacity) {
  187. uprv_memcpy(dest, &header, sizeof(header));
  188. // Write 00 bytes so that the padding is not mistaken for a copyright string.
  189. uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
  190. dest += headerSize;
  191. capacity -= headerSize;
  192. } else {
  193. dest = nullptr;
  194. capacity = 0;
  195. }
  196. }
  197. indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
  198. U_ASSERT((settings.options & ~0xffff) == 0);
  199. indexes[CollationDataReader::IX_OPTIONS] =
  200. data.numericPrimary | fastLatinVersion | settings.options;
  201. indexes[CollationDataReader::IX_RESERVED2] = 0;
  202. indexes[CollationDataReader::IX_RESERVED3] = 0;
  203. // Byte offsets of data items all start from the start of the indexes.
  204. // We add the headerSize at the very end.
  205. int32_t totalSize = indexesLength * 4;
  206. if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
  207. indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
  208. } else {
  209. indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
  210. }
  211. indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
  212. totalSize += reorderCodesLength * 4;
  213. indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
  214. if(settings.reorderTable != nullptr) {
  215. totalSize += 256;
  216. }
  217. indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
  218. if(hasMappings) {
  219. UErrorCode errorCode2 = U_ZERO_ERROR;
  220. int32_t length;
  221. if(totalSize < capacity) {
  222. length = utrie2_serialize(data.trie, dest + totalSize,
  223. capacity - totalSize, &errorCode2);
  224. } else {
  225. length = utrie2_serialize(data.trie, nullptr, 0, &errorCode2);
  226. }
  227. if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
  228. errorCode = errorCode2;
  229. return 0;
  230. }
  231. // The trie size should be a multiple of 8 bytes due to the way
  232. // compactIndex2(UNewTrie2 *trie) currently works.
  233. U_ASSERT((length & 7) == 0);
  234. totalSize += length;
  235. }
  236. indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
  237. indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
  238. if(hasMappings && data.cesLength != 0) {
  239. U_ASSERT(((headerSize + totalSize) & 7) == 0);
  240. totalSize += data.cesLength * 8;
  241. }
  242. indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
  243. indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
  244. if(hasMappings) {
  245. totalSize += data.ce32sLength * 4;
  246. }
  247. indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
  248. totalSize += rootElementsLength * 4;
  249. indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
  250. if(hasMappings) {
  251. totalSize += data.contextsLength * 2;
  252. }
  253. indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
  254. if(hasMappings && !unsafeBackwardSet.isEmpty()) {
  255. UErrorCode errorCode2 = U_ZERO_ERROR;
  256. int32_t length;
  257. if(totalSize < capacity) {
  258. uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
  259. length = unsafeBackwardSet.serialize(
  260. p, (capacity - totalSize) / 2, errorCode2);
  261. } else {
  262. length = unsafeBackwardSet.serialize(nullptr, 0, errorCode2);
  263. }
  264. if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
  265. errorCode = errorCode2;
  266. return 0;
  267. }
  268. totalSize += length * 2;
  269. }
  270. indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
  271. totalSize += fastLatinTableLength * 2;
  272. UnicodeString scripts;
  273. indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
  274. if(isBase) {
  275. scripts.append((char16_t)data.numScripts);
  276. scripts.append(reinterpret_cast<const char16_t *>(data.scriptsIndex), data.numScripts + 16);
  277. scripts.append(reinterpret_cast<const char16_t *>(data.scriptStarts), data.scriptStartsLength);
  278. totalSize += scripts.length() * 2;
  279. }
  280. indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
  281. if(isBase) {
  282. totalSize += 256;
  283. }
  284. indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
  285. indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
  286. if(totalSize > capacity) {
  287. errorCode = U_BUFFER_OVERFLOW_ERROR;
  288. return headerSize + totalSize;
  289. }
  290. uprv_memcpy(dest, indexes, indexesLength * 4);
  291. copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
  292. copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
  293. // The trie has already been serialized into the dest buffer.
  294. copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
  295. copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
  296. copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
  297. copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
  298. // The unsafeBackwardSet has already been serialized into the dest buffer.
  299. copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
  300. copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
  301. copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
  302. return headerSize + totalSize;
  303. }
  304. void
  305. CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
  306. const void *src, uint8_t *dest) {
  307. int32_t start = indexes[startIndex];
  308. int32_t limit = indexes[startIndex + 1];
  309. if(start < limit) {
  310. uprv_memcpy(dest + start, src, limit - start);
  311. }
  312. }
  313. U_NAMESPACE_END
  314. #endif // !UCONFIG_NO_COLLATION