123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2013-2015, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * collationdatawriter.cpp
- *
- * created on: 2013aug06
- * created by: Markus W. Scherer
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_COLLATION
- #include "unicode/tblcoll.h"
- #include "unicode/udata.h"
- #include "unicode/uniset.h"
- #include "cmemory.h"
- #include "collationdata.h"
- #include "collationdatabuilder.h"
- #include "collationdatareader.h"
- #include "collationdatawriter.h"
- #include "collationfastlatin.h"
- #include "collationsettings.h"
- #include "collationtailoring.h"
- #include "uassert.h"
- #include "ucmndata.h"
- U_NAMESPACE_BEGIN
- uint8_t *
- RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return nullptr; }
- LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
- if(buffer.isNull()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- length = cloneBinary(buffer.getAlias(), 20000, errorCode);
- if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
- if(buffer.allocateInsteadAndCopy(length, 0) == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- errorCode = U_ZERO_ERROR;
- length = cloneBinary(buffer.getAlias(), length, errorCode);
- }
- if(U_FAILURE(errorCode)) { return nullptr; }
- return buffer.orphan();
- }
- int32_t
- RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
- int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
- return CollationDataWriter::writeTailoring(
- *tailoring, *settings, indexes, dest, capacity,
- errorCode);
- }
- static const UDataInfo dataInfo = {
- sizeof(UDataInfo),
- 0,
- U_IS_BIG_ENDIAN,
- U_CHARSET_FAMILY,
- U_SIZEOF_UCHAR,
- 0,
- { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
- { 5, 0, 0, 0 }, // formatVersion
- { 6, 3, 0, 0 } // dataVersion
- };
- int32_t
- CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
- const void *rootElements, int32_t rootElementsLength,
- int32_t indexes[], uint8_t *dest, int32_t capacity,
- UErrorCode &errorCode) {
- return write(true, nullptr,
- data, settings,
- rootElements, rootElementsLength,
- indexes, dest, capacity, errorCode);
- }
- int32_t
- CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
- int32_t indexes[], uint8_t *dest, int32_t capacity,
- UErrorCode &errorCode) {
- return write(false, t.version,
- *t.data, settings,
- nullptr, 0,
- indexes, dest, capacity, errorCode);
- }
- int32_t
- CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
- const CollationData &data, const CollationSettings &settings,
- const void *rootElements, int32_t rootElementsLength,
- int32_t indexes[], uint8_t *dest, int32_t capacity,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return 0; }
- if(capacity < 0 || (capacity > 0 && dest == nullptr)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- // Figure out which data items to write before settling on
- // the indexes length and writing offsets.
- // For any data item, we need to write the start and limit offsets,
- // so the indexes length must be at least index-of-start-offset + 2.
- int32_t indexesLength;
- UBool hasMappings;
- UnicodeSet unsafeBackwardSet;
- const CollationData *baseData = data.base;
- int32_t fastLatinVersion;
- if(data.fastLatinTable != nullptr) {
- fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
- } else {
- fastLatinVersion = 0;
- }
- int32_t fastLatinTableLength = 0;
- if(isBase) {
- // For the root collator, we write an even number of indexes
- // so that we start with an 8-aligned offset.
- indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
- U_ASSERT(settings.reorderCodesLength == 0);
- hasMappings = true;
- unsafeBackwardSet = *data.unsafeBackwardSet;
- fastLatinTableLength = data.fastLatinTableLength;
- } else if(baseData == nullptr) {
- hasMappings = false;
- if(settings.reorderCodesLength == 0) {
- // only options
- indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here
- } else {
- // only options, reorder codes, and the reorder table
- indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
- }
- } else {
- hasMappings = true;
- // Tailored mappings, and what else?
- // Check in ascending order of optional tailoring data items.
- indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
- if(data.contextsLength != 0) {
- indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
- }
- unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
- if(!unsafeBackwardSet.isEmpty()) {
- indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
- }
- if(data.fastLatinTable != baseData->fastLatinTable) {
- fastLatinTableLength = data.fastLatinTableLength;
- indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
- }
- }
- UVector32 codesAndRanges(errorCode);
- const int32_t *reorderCodes = settings.reorderCodes;
- int32_t reorderCodesLength = settings.reorderCodesLength;
- if(settings.hasReordering() &&
- CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
- // Rebuild the full list of reorder ranges.
- // The list in the settings is truncated for efficiency.
- data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
- // Write the codes, then the ranges.
- for(int32_t i = 0; i < reorderCodesLength; ++i) {
- codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
- }
- if(U_FAILURE(errorCode)) { return 0; }
- reorderCodes = codesAndRanges.getBuffer();
- reorderCodesLength = codesAndRanges.size();
- }
- int32_t headerSize;
- if(isBase) {
- headerSize = 0; // udata_create() writes the header
- } else {
- DataHeader header;
- header.dataHeader.magic1 = 0xda;
- header.dataHeader.magic2 = 0x27;
- uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
- uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
- headerSize = (int32_t)sizeof(header);
- U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes
- if(hasMappings && data.cesLength != 0) {
- // Sum of the sizes of the data items which are
- // not automatically multiples of 8 bytes and which are placed before the CEs.
- int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
- if((sum & 7) != 0) {
- // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
- // We add to the header size here.
- // Alternatively, we could increment the indexesLength
- // or add a few bytes to the reorderTable.
- headerSize += 4;
- }
- }
- header.dataHeader.headerSize = (uint16_t)headerSize;
- if(headerSize <= capacity) {
- uprv_memcpy(dest, &header, sizeof(header));
- // Write 00 bytes so that the padding is not mistaken for a copyright string.
- uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
- dest += headerSize;
- capacity -= headerSize;
- } else {
- dest = nullptr;
- capacity = 0;
- }
- }
- indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
- U_ASSERT((settings.options & ~0xffff) == 0);
- indexes[CollationDataReader::IX_OPTIONS] =
- data.numericPrimary | fastLatinVersion | settings.options;
- indexes[CollationDataReader::IX_RESERVED2] = 0;
- indexes[CollationDataReader::IX_RESERVED3] = 0;
- // Byte offsets of data items all start from the start of the indexes.
- // We add the headerSize at the very end.
- int32_t totalSize = indexesLength * 4;
- if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
- indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
- } else {
- indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
- }
- indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
- totalSize += reorderCodesLength * 4;
- indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
- if(settings.reorderTable != nullptr) {
- totalSize += 256;
- }
- indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
- if(hasMappings) {
- UErrorCode errorCode2 = U_ZERO_ERROR;
- int32_t length;
- if(totalSize < capacity) {
- length = utrie2_serialize(data.trie, dest + totalSize,
- capacity - totalSize, &errorCode2);
- } else {
- length = utrie2_serialize(data.trie, nullptr, 0, &errorCode2);
- }
- if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
- errorCode = errorCode2;
- return 0;
- }
- // The trie size should be a multiple of 8 bytes due to the way
- // compactIndex2(UNewTrie2 *trie) currently works.
- U_ASSERT((length & 7) == 0);
- totalSize += length;
- }
- indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
- indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
- if(hasMappings && data.cesLength != 0) {
- U_ASSERT(((headerSize + totalSize) & 7) == 0);
- totalSize += data.cesLength * 8;
- }
- indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
- indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
- if(hasMappings) {
- totalSize += data.ce32sLength * 4;
- }
- indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
- totalSize += rootElementsLength * 4;
- indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
- if(hasMappings) {
- totalSize += data.contextsLength * 2;
- }
- indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
- if(hasMappings && !unsafeBackwardSet.isEmpty()) {
- UErrorCode errorCode2 = U_ZERO_ERROR;
- int32_t length;
- if(totalSize < capacity) {
- uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
- length = unsafeBackwardSet.serialize(
- p, (capacity - totalSize) / 2, errorCode2);
- } else {
- length = unsafeBackwardSet.serialize(nullptr, 0, errorCode2);
- }
- if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
- errorCode = errorCode2;
- return 0;
- }
- totalSize += length * 2;
- }
- indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
- totalSize += fastLatinTableLength * 2;
- UnicodeString scripts;
- indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
- if(isBase) {
- scripts.append((char16_t)data.numScripts);
- scripts.append(reinterpret_cast<const char16_t *>(data.scriptsIndex), data.numScripts + 16);
- scripts.append(reinterpret_cast<const char16_t *>(data.scriptStarts), data.scriptStartsLength);
- totalSize += scripts.length() * 2;
- }
- indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
- if(isBase) {
- totalSize += 256;
- }
- indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
- indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
- if(totalSize > capacity) {
- errorCode = U_BUFFER_OVERFLOW_ERROR;
- return headerSize + totalSize;
- }
- uprv_memcpy(dest, indexes, indexesLength * 4);
- copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
- copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
- // The trie has already been serialized into the dest buffer.
- copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
- copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
- copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
- copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
- // The unsafeBackwardSet has already been serialized into the dest buffer.
- copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
- copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
- copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
- return headerSize + totalSize;
- }
- void
- CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
- const void *src, uint8_t *dest) {
- int32_t start = indexes[startIndex];
- int32_t limit = indexes[startIndex + 1];
- if(start < limit) {
- uprv_memcpy(dest + start, src, limit - start);
- }
- }
- U_NAMESPACE_END
- #endif // !UCONFIG_NO_COLLATION
|