collationsettings.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationsettings.cpp
  9. *
  10. * created on: 2013feb07
  11. * created by: Markus W. Scherer
  12. */
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/ucol.h"
  16. #include "cmemory.h"
  17. #include "collation.h"
  18. #include "collationdata.h"
  19. #include "collationsettings.h"
  20. #include "sharedobject.h"
  21. #include "uassert.h"
  22. #include "umutex.h"
  23. #include "uvectr32.h"
  24. U_NAMESPACE_BEGIN
  25. CollationSettings::CollationSettings(const CollationSettings &other)
  26. : SharedObject(other),
  27. options(other.options), variableTop(other.variableTop),
  28. reorderTable(nullptr),
  29. minHighNoReorder(other.minHighNoReorder),
  30. reorderRanges(nullptr), reorderRangesLength(0),
  31. reorderCodes(nullptr), reorderCodesLength(0), reorderCodesCapacity(0),
  32. fastLatinOptions(other.fastLatinOptions) {
  33. UErrorCode errorCode = U_ZERO_ERROR;
  34. copyReorderingFrom(other, errorCode);
  35. if(fastLatinOptions >= 0) {
  36. uprv_memcpy(fastLatinPrimaries, other.fastLatinPrimaries, sizeof(fastLatinPrimaries));
  37. }
  38. }
  39. CollationSettings::~CollationSettings() {
  40. if(reorderCodesCapacity != 0) {
  41. uprv_free(const_cast<int32_t *>(reorderCodes));
  42. }
  43. }
  44. bool
  45. CollationSettings::operator==(const CollationSettings &other) const {
  46. if(options != other.options) { return false; }
  47. if((options & ALTERNATE_MASK) != 0 && variableTop != other.variableTop) { return false; }
  48. if(reorderCodesLength != other.reorderCodesLength) { return false; }
  49. for(int32_t i = 0; i < reorderCodesLength; ++i) {
  50. if(reorderCodes[i] != other.reorderCodes[i]) { return false; }
  51. }
  52. return true;
  53. }
  54. int32_t
  55. CollationSettings::hashCode() const {
  56. int32_t h = options << 8;
  57. if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; }
  58. h ^= reorderCodesLength;
  59. for(int32_t i = 0; i < reorderCodesLength; ++i) {
  60. h ^= (reorderCodes[i] << i);
  61. }
  62. return h;
  63. }
  64. void
  65. CollationSettings::resetReordering() {
  66. // When we turn off reordering, we want to set a nullptr permutation
  67. // rather than a no-op permutation.
  68. // Keep the memory via reorderCodes and its capacity.
  69. reorderTable = nullptr;
  70. minHighNoReorder = 0;
  71. reorderRangesLength = 0;
  72. reorderCodesLength = 0;
  73. }
  74. void
  75. CollationSettings::aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
  76. const uint32_t *ranges, int32_t rangesLength,
  77. const uint8_t *table, UErrorCode &errorCode) {
  78. if(U_FAILURE(errorCode)) { return; }
  79. if(table != nullptr &&
  80. (rangesLength == 0 ?
  81. !reorderTableHasSplitBytes(table) :
  82. rangesLength >= 2 &&
  83. // The first offset must be 0. The last offset must not be 0.
  84. (ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0)) {
  85. // We need to release the memory before setting the alias pointer.
  86. if(reorderCodesCapacity != 0) {
  87. uprv_free(const_cast<int32_t *>(reorderCodes));
  88. reorderCodesCapacity = 0;
  89. }
  90. reorderTable = table;
  91. reorderCodes = codes;
  92. reorderCodesLength = length;
  93. // Drop ranges before the first split byte. They are reordered by the table.
  94. // This then speeds up reordering of the remaining ranges.
  95. int32_t firstSplitByteRangeIndex = 0;
  96. while(firstSplitByteRangeIndex < rangesLength &&
  97. (ranges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
  98. // The second byte of the primary limit is 0.
  99. ++firstSplitByteRangeIndex;
  100. }
  101. if(firstSplitByteRangeIndex == rangesLength) {
  102. U_ASSERT(!reorderTableHasSplitBytes(table));
  103. minHighNoReorder = 0;
  104. reorderRanges = nullptr;
  105. reorderRangesLength = 0;
  106. } else {
  107. U_ASSERT(table[ranges[firstSplitByteRangeIndex] >> 24] == 0);
  108. minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
  109. reorderRanges = ranges + firstSplitByteRangeIndex;
  110. reorderRangesLength = rangesLength - firstSplitByteRangeIndex;
  111. }
  112. return;
  113. }
  114. // Regenerate missing data.
  115. setReordering(data, codes, length, errorCode);
  116. }
  117. void
  118. CollationSettings::setReordering(const CollationData &data,
  119. const int32_t *codes, int32_t codesLength,
  120. UErrorCode &errorCode) {
  121. if(U_FAILURE(errorCode)) { return; }
  122. if(codesLength == 0 || (codesLength == 1 && codes[0] == UCOL_REORDER_CODE_NONE)) {
  123. resetReordering();
  124. return;
  125. }
  126. UVector32 rangesList(errorCode);
  127. data.makeReorderRanges(codes, codesLength, rangesList, errorCode);
  128. if(U_FAILURE(errorCode)) { return; }
  129. int32_t rangesLength = rangesList.size();
  130. if(rangesLength == 0) {
  131. resetReordering();
  132. return;
  133. }
  134. const uint32_t *ranges = reinterpret_cast<uint32_t *>(rangesList.getBuffer());
  135. // ranges[] contains at least two (limit, offset) pairs.
  136. // The first offset must be 0. The last offset must not be 0.
  137. // Separators (at the low end) and trailing weights (at the high end)
  138. // are never reordered.
  139. U_ASSERT(rangesLength >= 2);
  140. U_ASSERT((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
  141. minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
  142. // Write the lead byte permutation table.
  143. // Set a 0 for each lead byte that has a range boundary in the middle.
  144. uint8_t table[256];
  145. int32_t b = 0;
  146. int32_t firstSplitByteRangeIndex = -1;
  147. for(int32_t i = 0; i < rangesLength; ++i) {
  148. uint32_t pair = ranges[i];
  149. int32_t limit1 = (int32_t)(pair >> 24);
  150. while(b < limit1) {
  151. table[b] = (uint8_t)(b + pair);
  152. ++b;
  153. }
  154. // Check the second byte of the limit.
  155. if((pair & 0xff0000) != 0) {
  156. table[limit1] = 0;
  157. b = limit1 + 1;
  158. if(firstSplitByteRangeIndex < 0) {
  159. firstSplitByteRangeIndex = i;
  160. }
  161. }
  162. }
  163. while(b <= 0xff) {
  164. table[b] = (uint8_t)b;
  165. ++b;
  166. }
  167. if(firstSplitByteRangeIndex < 0) {
  168. // The lead byte permutation table alone suffices for reordering.
  169. rangesLength = 0;
  170. } else {
  171. // Remove the ranges below the first split byte.
  172. ranges += firstSplitByteRangeIndex;
  173. rangesLength -= firstSplitByteRangeIndex;
  174. }
  175. setReorderArrays(codes, codesLength, ranges, rangesLength, table, errorCode);
  176. }
  177. void
  178. CollationSettings::setReorderArrays(const int32_t *codes, int32_t codesLength,
  179. const uint32_t *ranges, int32_t rangesLength,
  180. const uint8_t *table, UErrorCode &errorCode) {
  181. if(U_FAILURE(errorCode)) { return; }
  182. int32_t *ownedCodes;
  183. int32_t totalLength = codesLength + rangesLength;
  184. U_ASSERT(totalLength > 0);
  185. if(totalLength <= reorderCodesCapacity) {
  186. ownedCodes = const_cast<int32_t *>(reorderCodes);
  187. } else {
  188. // Allocate one memory block for the codes, the ranges, and the 16-aligned table.
  189. int32_t capacity = (totalLength + 3) & ~3; // round up to a multiple of 4 ints
  190. ownedCodes = (int32_t *)uprv_malloc(capacity * 4 + 256);
  191. if(ownedCodes == nullptr) {
  192. resetReordering();
  193. errorCode = U_MEMORY_ALLOCATION_ERROR;
  194. return;
  195. }
  196. if(reorderCodesCapacity != 0) {
  197. uprv_free(const_cast<int32_t *>(reorderCodes));
  198. }
  199. reorderCodes = ownedCodes;
  200. reorderCodesCapacity = capacity;
  201. }
  202. uprv_memcpy(ownedCodes + reorderCodesCapacity, table, 256);
  203. uprv_memcpy(ownedCodes, codes, codesLength * 4);
  204. uprv_memcpy(ownedCodes + codesLength, ranges, rangesLength * 4);
  205. reorderTable = reinterpret_cast<const uint8_t *>(reorderCodes + reorderCodesCapacity);
  206. reorderCodesLength = codesLength;
  207. reorderRanges = reinterpret_cast<uint32_t *>(ownedCodes) + codesLength;
  208. reorderRangesLength = rangesLength;
  209. }
  210. void
  211. CollationSettings::copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode) {
  212. if(U_FAILURE(errorCode)) { return; }
  213. if(!other.hasReordering()) {
  214. resetReordering();
  215. return;
  216. }
  217. minHighNoReorder = other.minHighNoReorder;
  218. if(other.reorderCodesCapacity == 0) {
  219. // The reorder arrays are aliased to memory-mapped data.
  220. reorderTable = other.reorderTable;
  221. reorderRanges = other.reorderRanges;
  222. reorderRangesLength = other.reorderRangesLength;
  223. reorderCodes = other.reorderCodes;
  224. reorderCodesLength = other.reorderCodesLength;
  225. } else {
  226. setReorderArrays(other.reorderCodes, other.reorderCodesLength,
  227. other.reorderRanges, other.reorderRangesLength,
  228. other.reorderTable, errorCode);
  229. }
  230. }
  231. UBool
  232. CollationSettings::reorderTableHasSplitBytes(const uint8_t table[256]) {
  233. U_ASSERT(table[0] == 0);
  234. for(int32_t i = 1; i < 256; ++i) {
  235. if(table[i] == 0) {
  236. return true;
  237. }
  238. }
  239. return false;
  240. }
  241. uint32_t
  242. CollationSettings::reorderEx(uint32_t p) const {
  243. if(p >= minHighNoReorder) { return p; }
  244. // Round up p so that its lower 16 bits are >= any offset bits.
  245. // Then compare q directly with (limit, offset) pairs.
  246. uint32_t q = p | 0xffff;
  247. uint32_t r;
  248. const uint32_t *ranges = reorderRanges;
  249. while(q >= (r = *ranges)) { ++ranges; }
  250. return p + (r << 24);
  251. }
  252. void
  253. CollationSettings::setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode) {
  254. if(U_FAILURE(errorCode)) { return; }
  255. int32_t noStrength = options & ~STRENGTH_MASK;
  256. switch(value) {
  257. case UCOL_PRIMARY:
  258. case UCOL_SECONDARY:
  259. case UCOL_TERTIARY:
  260. case UCOL_QUATERNARY:
  261. case UCOL_IDENTICAL:
  262. options = noStrength | (value << STRENGTH_SHIFT);
  263. break;
  264. case UCOL_DEFAULT:
  265. options = noStrength | (defaultOptions & STRENGTH_MASK);
  266. break;
  267. default:
  268. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  269. break;
  270. }
  271. }
  272. void
  273. CollationSettings::setFlag(int32_t bit, UColAttributeValue value,
  274. int32_t defaultOptions, UErrorCode &errorCode) {
  275. if(U_FAILURE(errorCode)) { return; }
  276. switch(value) {
  277. case UCOL_ON:
  278. options |= bit;
  279. break;
  280. case UCOL_OFF:
  281. options &= ~bit;
  282. break;
  283. case UCOL_DEFAULT:
  284. options = (options & ~bit) | (defaultOptions & bit);
  285. break;
  286. default:
  287. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  288. break;
  289. }
  290. }
  291. void
  292. CollationSettings::setCaseFirst(UColAttributeValue value,
  293. int32_t defaultOptions, UErrorCode &errorCode) {
  294. if(U_FAILURE(errorCode)) { return; }
  295. int32_t noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
  296. switch(value) {
  297. case UCOL_OFF:
  298. options = noCaseFirst;
  299. break;
  300. case UCOL_LOWER_FIRST:
  301. options = noCaseFirst | CASE_FIRST;
  302. break;
  303. case UCOL_UPPER_FIRST:
  304. options = noCaseFirst | CASE_FIRST_AND_UPPER_MASK;
  305. break;
  306. case UCOL_DEFAULT:
  307. options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK);
  308. break;
  309. default:
  310. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  311. break;
  312. }
  313. }
  314. void
  315. CollationSettings::setAlternateHandling(UColAttributeValue value,
  316. int32_t defaultOptions, UErrorCode &errorCode) {
  317. if(U_FAILURE(errorCode)) { return; }
  318. int32_t noAlternate = options & ~ALTERNATE_MASK;
  319. switch(value) {
  320. case UCOL_NON_IGNORABLE:
  321. options = noAlternate;
  322. break;
  323. case UCOL_SHIFTED:
  324. options = noAlternate | SHIFTED;
  325. break;
  326. case UCOL_DEFAULT:
  327. options = noAlternate | (defaultOptions & ALTERNATE_MASK);
  328. break;
  329. default:
  330. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  331. break;
  332. }
  333. }
  334. void
  335. CollationSettings::setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode) {
  336. if(U_FAILURE(errorCode)) { return; }
  337. int32_t noMax = options & ~MAX_VARIABLE_MASK;
  338. switch(value) {
  339. case MAX_VAR_SPACE:
  340. case MAX_VAR_PUNCT:
  341. case MAX_VAR_SYMBOL:
  342. case MAX_VAR_CURRENCY:
  343. options = noMax | (value << MAX_VARIABLE_SHIFT);
  344. break;
  345. case UCOL_DEFAULT:
  346. options = noMax | (defaultOptions & MAX_VARIABLE_MASK);
  347. break;
  348. default:
  349. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  350. break;
  351. }
  352. }
  353. U_NAMESPACE_END
  354. #endif // !UCONFIG_NO_COLLATION