collationsets.h 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationsets.h
  9. *
  10. * created on: 2013feb09
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __COLLATIONSETS_H__
  14. #define __COLLATIONSETS_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/uniset.h"
  18. #include "collation.h"
  19. U_NAMESPACE_BEGIN
  20. struct CollationData;
  21. /**
  22. * Finds the set of characters and strings that sort differently in the tailoring
  23. * from the base data.
  24. *
  25. * Every mapping in the tailoring needs to be compared to the base,
  26. * because some mappings are copied for optimization, and
  27. * all contractions for a character are copied if any contractions for that character
  28. * are added, modified or removed.
  29. *
  30. * It might be simpler to re-parse the rule string, but:
  31. * - That would require duplicating some of the from-rules builder code.
  32. * - That would make the runtime code depend on the builder.
  33. * - That would only work if we have the rule string, and we allow users to
  34. * omit the rule string from data files.
  35. */
  36. class TailoredSet : public UMemory {
  37. public:
  38. TailoredSet(UnicodeSet *t)
  39. : data(nullptr), baseData(nullptr),
  40. tailored(t),
  41. suffix(nullptr),
  42. errorCode(U_ZERO_ERROR) {}
  43. void forData(const CollationData *d, UErrorCode &errorCode);
  44. /**
  45. * @return U_SUCCESS(errorCode) in C++, void in Java
  46. * @internal only public for access by callback
  47. */
  48. UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
  49. private:
  50. void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
  51. void comparePrefixes(UChar32 c, const char16_t *p, const char16_t *q);
  52. void compareContractions(UChar32 c, const char16_t *p, const char16_t *q);
  53. void addPrefixes(const CollationData *d, UChar32 c, const char16_t *p);
  54. void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
  55. void addContractions(UChar32 c, const char16_t *p);
  56. void addSuffix(UChar32 c, const UnicodeString &sfx);
  57. void add(UChar32 c);
  58. /** Prefixes are reversed in the data structure. */
  59. void setPrefix(const UnicodeString &pfx) {
  60. unreversedPrefix = pfx;
  61. unreversedPrefix.reverse();
  62. }
  63. void resetPrefix() {
  64. unreversedPrefix.remove();
  65. }
  66. const CollationData *data;
  67. const CollationData *baseData;
  68. UnicodeSet *tailored;
  69. UnicodeString unreversedPrefix;
  70. const UnicodeString *suffix;
  71. UErrorCode errorCode;
  72. };
  73. class ContractionsAndExpansions : public UMemory {
  74. public:
  75. class CESink : public UMemory {
  76. public:
  77. virtual ~CESink();
  78. virtual void handleCE(int64_t ce) = 0;
  79. virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
  80. };
  81. ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
  82. : data(nullptr),
  83. contractions(con), expansions(exp),
  84. sink(s),
  85. addPrefixes(prefixes),
  86. checkTailored(0),
  87. suffix(nullptr),
  88. errorCode(U_ZERO_ERROR) {}
  89. void forData(const CollationData *d, UErrorCode &errorCode);
  90. void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);
  91. // all following: @internal, only public for access by callback
  92. void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
  93. void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
  94. void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);
  95. void addExpansions(UChar32 start, UChar32 end);
  96. void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);
  97. /** Prefixes are reversed in the data structure. */
  98. void setPrefix(const UnicodeString &pfx) {
  99. unreversedPrefix = pfx;
  100. unreversedPrefix.reverse();
  101. }
  102. void resetPrefix() {
  103. unreversedPrefix.remove();
  104. }
  105. const CollationData *data;
  106. UnicodeSet *contractions;
  107. UnicodeSet *expansions;
  108. CESink *sink;
  109. UBool addPrefixes;
  110. int8_t checkTailored; // -1: collected tailored +1: exclude tailored
  111. UnicodeSet tailored;
  112. UnicodeSet ranges;
  113. UnicodeString unreversedPrefix;
  114. const UnicodeString *suffix;
  115. int64_t ces[Collation::MAX_EXPANSION_LENGTH];
  116. UErrorCode errorCode;
  117. };
  118. U_NAMESPACE_END
  119. #endif // !UCONFIG_NO_COLLATION
  120. #endif // __COLLATIONSETS_H__