dictionarydata.cpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2014-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * dictionarydata.cpp
  9. *
  10. * created on: 2012may31
  11. * created by: Markus W. Scherer & Maxime Serrano
  12. */
  13. #include "dictionarydata.h"
  14. #include "unicode/ucharstrie.h"
  15. #include "unicode/bytestrie.h"
  16. #include "unicode/udata.h"
  17. #include "cmemory.h"
  18. #if !UCONFIG_NO_BREAK_ITERATION
  19. U_NAMESPACE_BEGIN
  20. const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
  21. const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
  22. const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
  23. const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
  24. const int32_t DictionaryData::TRANSFORM_NONE = 0;
  25. const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
  26. const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
  27. const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
  28. DictionaryMatcher::~DictionaryMatcher() {
  29. }
  30. UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
  31. udata_close(file);
  32. }
  33. int32_t UCharsDictionaryMatcher::getType() const {
  34. return DictionaryData::TRIE_TYPE_UCHARS;
  35. }
  36. int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
  37. int32_t *lengths, int32_t *cpLengths, int32_t *values,
  38. int32_t *prefix) const {
  39. UCharsTrie uct(characters);
  40. int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
  41. int32_t wordCount = 0;
  42. int32_t codePointsMatched = 0;
  43. for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
  44. UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
  45. int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
  46. codePointsMatched += 1;
  47. if (USTRINGTRIE_HAS_VALUE(result)) {
  48. if (wordCount < limit) {
  49. if (values != nullptr) {
  50. values[wordCount] = uct.getValue();
  51. }
  52. if (lengths != nullptr) {
  53. lengths[wordCount] = lengthMatched;
  54. }
  55. if (cpLengths != nullptr) {
  56. cpLengths[wordCount] = codePointsMatched;
  57. }
  58. ++wordCount;
  59. }
  60. if (result == USTRINGTRIE_FINAL_VALUE) {
  61. break;
  62. }
  63. }
  64. else if (result == USTRINGTRIE_NO_MATCH) {
  65. break;
  66. }
  67. if (lengthMatched >= maxLength) {
  68. break;
  69. }
  70. }
  71. if (prefix != nullptr) {
  72. *prefix = codePointsMatched;
  73. }
  74. return wordCount;
  75. }
  76. BytesDictionaryMatcher::~BytesDictionaryMatcher() {
  77. udata_close(file);
  78. }
  79. UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
  80. if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
  81. if (c == 0x200D) {
  82. return 0xFF;
  83. } else if (c == 0x200C) {
  84. return 0xFE;
  85. }
  86. int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
  87. if (delta < 0 || 0xFD < delta) {
  88. return U_SENTINEL;
  89. }
  90. return static_cast<UChar32>(delta);
  91. }
  92. return c;
  93. }
  94. int32_t BytesDictionaryMatcher::getType() const {
  95. return DictionaryData::TRIE_TYPE_BYTES;
  96. }
  97. int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
  98. int32_t *lengths, int32_t *cpLengths, int32_t *values,
  99. int32_t *prefix) const {
  100. BytesTrie bt(characters);
  101. int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
  102. int32_t wordCount = 0;
  103. int32_t codePointsMatched = 0;
  104. for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
  105. UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
  106. int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
  107. codePointsMatched += 1;
  108. if (USTRINGTRIE_HAS_VALUE(result)) {
  109. if (wordCount < limit) {
  110. if (values != nullptr) {
  111. values[wordCount] = bt.getValue();
  112. }
  113. if (lengths != nullptr) {
  114. lengths[wordCount] = lengthMatched;
  115. }
  116. if (cpLengths != nullptr) {
  117. cpLengths[wordCount] = codePointsMatched;
  118. }
  119. ++wordCount;
  120. }
  121. if (result == USTRINGTRIE_FINAL_VALUE) {
  122. break;
  123. }
  124. }
  125. else if (result == USTRINGTRIE_NO_MATCH) {
  126. break;
  127. }
  128. if (lengthMatched >= maxLength) {
  129. break;
  130. }
  131. }
  132. if (prefix != nullptr) {
  133. *prefix = codePointsMatched;
  134. }
  135. return wordCount;
  136. }
  137. U_NAMESPACE_END
  138. U_NAMESPACE_USE
  139. U_CAPI int32_t U_EXPORT2
  140. udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
  141. void *outData, UErrorCode *pErrorCode) {
  142. const UDataInfo *pInfo;
  143. int32_t headerSize;
  144. const uint8_t *inBytes;
  145. uint8_t *outBytes;
  146. const int32_t *inIndexes;
  147. int32_t indexes[DictionaryData::IX_COUNT];
  148. int32_t i, offset, size;
  149. headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
  150. if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;
  151. pInfo = (const UDataInfo *)((const char *)inData + 4);
  152. if (!(pInfo->dataFormat[0] == 0x44 &&
  153. pInfo->dataFormat[1] == 0x69 &&
  154. pInfo->dataFormat[2] == 0x63 &&
  155. pInfo->dataFormat[3] == 0x74 &&
  156. pInfo->formatVersion[0] == 1)) {
  157. udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
  158. pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
  159. *pErrorCode = U_UNSUPPORTED_ERROR;
  160. return 0;
  161. }
  162. inBytes = (const uint8_t *)inData + headerSize;
  163. outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;
  164. inIndexes = (const int32_t *)inBytes;
  165. if (length >= 0) {
  166. length -= headerSize;
  167. if (length < (int32_t)(sizeof(indexes))) {
  168. udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
  169. *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  170. return 0;
  171. }
  172. }
  173. for (i = 0; i < DictionaryData::IX_COUNT; i++) {
  174. indexes[i] = udata_readInt32(ds, inIndexes[i]);
  175. }
  176. size = indexes[DictionaryData::IX_TOTAL_SIZE];
  177. if (length >= 0) {
  178. if (length < size) {
  179. udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
  180. *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  181. return 0;
  182. }
  183. if (inBytes != outBytes) {
  184. uprv_memcpy(outBytes, inBytes, size);
  185. }
  186. offset = 0;
  187. ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
  188. offset = (int32_t)sizeof(indexes);
  189. int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
  190. int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
  191. if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
  192. ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
  193. } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
  194. // nothing to do
  195. } else {
  196. udata_printError(ds, "udict_swap(): unknown trie type!\n");
  197. *pErrorCode = U_UNSUPPORTED_ERROR;
  198. return 0;
  199. }
  200. // these next two sections are empty in the current format,
  201. // but may be used later.
  202. offset = nextOffset;
  203. nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
  204. offset = nextOffset;
  205. nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
  206. offset = nextOffset;
  207. }
  208. return headerSize + size;
  209. }
  210. #endif