123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2014-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * dictionarydata.cpp
- *
- * created on: 2012may31
- * created by: Markus W. Scherer & Maxime Serrano
- */
- #include "dictionarydata.h"
- #include "unicode/ucharstrie.h"
- #include "unicode/bytestrie.h"
- #include "unicode/udata.h"
- #include "cmemory.h"
- #if !UCONFIG_NO_BREAK_ITERATION
- U_NAMESPACE_BEGIN
- const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
- const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
- const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
- const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
- const int32_t DictionaryData::TRANSFORM_NONE = 0;
- const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
- const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
- const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
-
- DictionaryMatcher::~DictionaryMatcher() {
- }
- UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
- udata_close(file);
- }
- int32_t UCharsDictionaryMatcher::getType() const {
- return DictionaryData::TRIE_TYPE_UCHARS;
- }
- int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
- int32_t *lengths, int32_t *cpLengths, int32_t *values,
- int32_t *prefix) const {
- UCharsTrie uct(characters);
- int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
- int32_t wordCount = 0;
- int32_t codePointsMatched = 0;
- for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
- UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
- int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
- codePointsMatched += 1;
- if (USTRINGTRIE_HAS_VALUE(result)) {
- if (wordCount < limit) {
- if (values != nullptr) {
- values[wordCount] = uct.getValue();
- }
- if (lengths != nullptr) {
- lengths[wordCount] = lengthMatched;
- }
- if (cpLengths != nullptr) {
- cpLengths[wordCount] = codePointsMatched;
- }
- ++wordCount;
- }
- if (result == USTRINGTRIE_FINAL_VALUE) {
- break;
- }
- }
- else if (result == USTRINGTRIE_NO_MATCH) {
- break;
- }
- if (lengthMatched >= maxLength) {
- break;
- }
- }
- if (prefix != nullptr) {
- *prefix = codePointsMatched;
- }
- return wordCount;
- }
- BytesDictionaryMatcher::~BytesDictionaryMatcher() {
- udata_close(file);
- }
- UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
- if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
- if (c == 0x200D) {
- return 0xFF;
- } else if (c == 0x200C) {
- return 0xFE;
- }
- int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
- if (delta < 0 || 0xFD < delta) {
- return U_SENTINEL;
- }
- return static_cast<UChar32>(delta);
- }
- return c;
- }
- int32_t BytesDictionaryMatcher::getType() const {
- return DictionaryData::TRIE_TYPE_BYTES;
- }
- int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
- int32_t *lengths, int32_t *cpLengths, int32_t *values,
- int32_t *prefix) const {
- BytesTrie bt(characters);
- int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
- int32_t wordCount = 0;
- int32_t codePointsMatched = 0;
- for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
- UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
- int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
- codePointsMatched += 1;
- if (USTRINGTRIE_HAS_VALUE(result)) {
- if (wordCount < limit) {
- if (values != nullptr) {
- values[wordCount] = bt.getValue();
- }
- if (lengths != nullptr) {
- lengths[wordCount] = lengthMatched;
- }
- if (cpLengths != nullptr) {
- cpLengths[wordCount] = codePointsMatched;
- }
- ++wordCount;
- }
- if (result == USTRINGTRIE_FINAL_VALUE) {
- break;
- }
- }
- else if (result == USTRINGTRIE_NO_MATCH) {
- break;
- }
- if (lengthMatched >= maxLength) {
- break;
- }
- }
- if (prefix != nullptr) {
- *prefix = codePointsMatched;
- }
- return wordCount;
- }
- U_NAMESPACE_END
- U_NAMESPACE_USE
- U_CAPI int32_t U_EXPORT2
- udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
- void *outData, UErrorCode *pErrorCode) {
- const UDataInfo *pInfo;
- int32_t headerSize;
- const uint8_t *inBytes;
- uint8_t *outBytes;
- const int32_t *inIndexes;
- int32_t indexes[DictionaryData::IX_COUNT];
- int32_t i, offset, size;
- headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
- if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;
- pInfo = (const UDataInfo *)((const char *)inData + 4);
- if (!(pInfo->dataFormat[0] == 0x44 &&
- pInfo->dataFormat[1] == 0x69 &&
- pInfo->dataFormat[2] == 0x63 &&
- pInfo->dataFormat[3] == 0x74 &&
- pInfo->formatVersion[0] == 1)) {
- udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
- pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
- *pErrorCode = U_UNSUPPORTED_ERROR;
- return 0;
- }
- inBytes = (const uint8_t *)inData + headerSize;
- outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;
- inIndexes = (const int32_t *)inBytes;
- if (length >= 0) {
- length -= headerSize;
- if (length < (int32_t)(sizeof(indexes))) {
- udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
- *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- }
- for (i = 0; i < DictionaryData::IX_COUNT; i++) {
- indexes[i] = udata_readInt32(ds, inIndexes[i]);
- }
- size = indexes[DictionaryData::IX_TOTAL_SIZE];
- if (length >= 0) {
- if (length < size) {
- udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
- *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- if (inBytes != outBytes) {
- uprv_memcpy(outBytes, inBytes, size);
- }
- offset = 0;
- ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
- offset = (int32_t)sizeof(indexes);
- int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
- int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
- if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
- ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
- } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
- // nothing to do
- } else {
- udata_printError(ds, "udict_swap(): unknown trie type!\n");
- *pErrorCode = U_UNSUPPORTED_ERROR;
- return 0;
- }
- // these next two sections are empty in the current format,
- // but may be used later.
- offset = nextOffset;
- nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
- offset = nextOffset;
- nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
- offset = nextOffset;
- }
- return headerSize + size;
- }
- #endif
|