// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatabuilder.h
*
* created on: 2012apr01
* created by: Markus W. Scherer
*/
  13. #ifndef __COLLATIONDATABUILDER_H__
  14. #define __COLLATIONDATABUILDER_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/uniset.h"
  18. #include "unicode/unistr.h"
  19. #include "unicode/uversion.h"
  20. #include "collation.h"
  21. #include "collationdata.h"
  22. #include "collationsettings.h"
  23. #include "normalizer2impl.h"
  24. #include "utrie2.h"
  25. #include "uvectr32.h"
  26. #include "uvectr64.h"
  27. #include "uvector.h"
  28. U_NAMESPACE_BEGIN
  29. struct ConditionalCE32;
  30. class CollationFastLatinBuilder;
  31. class CopyHelper;
  32. class DataBuilderCollationIterator;
  33. class UCharsTrieBuilder;
  34. /**
  35. * Low-level CollationData builder.
  36. * Takes (character, CE) pairs and builds them into runtime data structures.
  37. * Supports characters with context prefixes and contraction suffixes.
  38. */
  39. class U_I18N_API CollationDataBuilder : public UObject {
  40. public:
  41. /**
  42. * Collation element modifier. Interface class for a modifier
  43. * that changes a tailoring builder's temporary CEs to final CEs.
  44. * Called for every non-special CE32 and every expansion CE.
  45. */
  46. class CEModifier : public UObject {
  47. public:
  48. virtual ~CEModifier();
  49. /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
  50. virtual int64_t modifyCE32(uint32_t ce32) const = 0;
  51. /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
  52. virtual int64_t modifyCE(int64_t ce) const = 0;
  53. };
  54. CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
  55. virtual ~CollationDataBuilder();
  56. void initForTailoring(const CollationData *b, UErrorCode &errorCode);
  57. virtual UBool isCompressibleLeadByte(uint32_t b) const;
  58. inline UBool isCompressiblePrimary(uint32_t p) const {
  59. return isCompressibleLeadByte(p >> 24);
  60. }
  61. /**
  62. * @return true if this builder has mappings (e.g., add() has been called)
  63. */
  64. UBool hasMappings() const { return modified; }
  65. /**
  66. * @return true if c has CEs in this builder
  67. */
  68. UBool isAssigned(UChar32 c) const;
  69. /**
  70. * @return the three-byte primary if c maps to a single such CE and has no context data,
  71. * otherwise returns 0.
  72. */
  73. uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
  74. /**
  75. * @return the single CE for c.
  76. * Sets an error code if c does not have a single CE.
  77. */
  78. int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
  79. void add(const UnicodeString &prefix, const UnicodeString &s,
  80. const int64_t ces[], int32_t cesLength,
  81. UErrorCode &errorCode);
  82. /**
  83. * Encodes the ces as either the returned ce32 by itself,
  84. * or by storing an expansion, with the returned ce32 referring to that.
  85. *
  86. * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
  87. */
  88. virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
  89. void addCE32(const UnicodeString &prefix, const UnicodeString &s,
  90. uint32_t ce32, UErrorCode &errorCode);
  91. /**
  92. * Sets three-byte-primary CEs for a range of code points in code point order,
  93. * if it is worth doing; otherwise no change is made.
  94. * None of the code points in the range should have complex mappings so far
  95. * (expansions/contractions/prefixes).
  96. * @param start first code point
  97. * @param end last code point (inclusive)
  98. * @param primary primary weight for 'start'
  99. * @param step per-code point primary-weight increment
  100. * @param errorCode ICU in/out error code
  101. * @return true if an OFFSET_TAG range was used for start..end
  102. */
  103. UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
  104. uint32_t primary, int32_t step,
  105. UErrorCode &errorCode);
  106. /**
  107. * Sets three-byte-primary CEs for a range of code points in code point order.
  108. * Sets range values if that is worth doing, or else individual values.
  109. * None of the code points in the range should have complex mappings so far
  110. * (expansions/contractions/prefixes).
  111. * @param start first code point
  112. * @param end last code point (inclusive)
  113. * @param primary primary weight for 'start'
  114. * @param step per-code point primary-weight increment
  115. * @param errorCode ICU in/out error code
  116. * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
  117. */
  118. uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
  119. uint32_t primary, int32_t step,
  120. UErrorCode &errorCode);
  121. /**
  122. * Copies all mappings from the src builder, with modifications.
  123. * This builder here must not be built yet, and should be empty.
  124. */
  125. void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
  126. UErrorCode &errorCode);
  127. void optimize(const UnicodeSet &set, UErrorCode &errorCode);
  128. void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
  129. void enableFastLatin() { fastLatinEnabled = true; }
  130. virtual void build(CollationData &data, UErrorCode &errorCode);
  131. /**
  132. * Looks up CEs for s and appends them to the ces array.
  133. * Does not handle normalization: s should be in FCD form.
  134. *
  135. * Does not write completely ignorable CEs.
  136. * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
  137. *
  138. * @return incremented cesLength
  139. */
  140. int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
  141. int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
  142. int64_t ces[], int32_t cesLength);
  143. protected:
  144. friend class CopyHelper;
  145. friend class DataBuilderCollationIterator;
  146. uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
  147. int32_t addCE(int64_t ce, UErrorCode &errorCode);
  148. int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
  149. int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
  150. inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
  151. return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
  152. }
  153. inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
  154. return getConditionalCE32(Collation::indexFromCE32(ce32));
  155. }
  156. static uint32_t makeBuilderContextCE32(int32_t index) {
  157. return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
  158. }
  159. static inline UBool isBuilderContextCE32(uint32_t ce32) {
  160. return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
  161. }
  162. static uint32_t encodeOneCEAsCE32(int64_t ce);
  163. uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
  164. uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
  165. uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
  166. uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
  167. /**
  168. * Copies base contractions to a list of ConditionalCE32.
  169. * Sets cond->next to the index of the first new item
  170. * and returns the index of the last new item.
  171. */
  172. int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
  173. ConditionalCE32 *cond, UErrorCode &errorCode);
  174. UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
  175. void setDigitTags(UErrorCode &errorCode);
  176. void setLeadSurrogates(UErrorCode &errorCode);
  177. void buildMappings(CollationData &data, UErrorCode &errorCode);
  178. void clearContexts();
  179. void buildContexts(UErrorCode &errorCode);
  180. uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
  181. int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
  182. UErrorCode &errorCode);
  183. void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
  184. int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
  185. static UChar32 jamoCpFromIndex(int32_t i) {
  186. // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
  187. if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
  188. i -= Hangul::JAMO_L_COUNT;
  189. if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
  190. i -= Hangul::JAMO_V_COUNT;
  191. // i < 27
  192. return Hangul::JAMO_T_BASE + 1 + i;
  193. }
  194. /** @see Collation::BUILDER_DATA_TAG */
  195. static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
  196. const Normalizer2Impl &nfcImpl;
  197. const CollationData *base;
  198. const CollationSettings *baseSettings;
  199. UTrie2 *trie;
  200. UVector32 ce32s;
  201. UVector64 ce64s;
  202. UVector conditionalCE32s; // vector of ConditionalCE32
  203. // Characters that have context (prefixes or contraction suffixes).
  204. UnicodeSet contextChars;
  205. // Serialized UCharsTrie structures for finalized contexts.
  206. UnicodeString contexts;
  207. private:
  208. /**
  209. * The "era" of building intermediate contexts.
  210. * When the array of cached, temporary contexts overflows, then clearContexts()
  211. * removes them all and invalidates the builtCE32 that used to point to built tries.
  212. * See ConditionalCE32::era.
  213. */
  214. int32_t contextsEra = 0;
  215. protected:
  216. UnicodeSet unsafeBackwardSet;
  217. UBool modified;
  218. UBool icu4xMode;
  219. UBool fastLatinEnabled;
  220. CollationFastLatinBuilder *fastLatinBuilder;
  221. DataBuilderCollationIterator *collIter;
  222. };
  223. U_NAMESPACE_END
  224. #endif // !UCONFIG_NO_COLLATION
  225. #endif // __COLLATIONDATABUILDER_H__