usrchimp.h 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2001-2015 IBM and others. All rights reserved.
  6. **********************************************************************
  7. * Date Name Description
  8. * 08/13/2001 synwee Creation.
  9. **********************************************************************
  10. */
  11. #ifndef USRCHIMP_H
  12. #define USRCHIMP_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/normalizer2.h"
  16. #include "unicode/ucol.h"
  17. #include "unicode/ucoleitr.h"
  18. #include "unicode/ubrk.h"
  19. /* mask off anything but primary order */
  20. #define UCOL_PRIMARYORDERMASK 0xffff0000
  21. /* mask off anything but secondary order */
  22. #define UCOL_SECONDARYORDERMASK 0x0000ff00
  23. /* mask off anything but tertiary order */
  24. #define UCOL_TERTIARYORDERMASK 0x000000ff
  25. /* primary order shift */
  26. #define UCOL_PRIMARYORDERSHIFT 16
  27. /* secondary order shift */
  28. #define UCOL_SECONDARYORDERSHIFT 8
  29. #define UCOL_IGNORABLE 0
  30. /* get weights from a CE */
  31. #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
  32. #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
  33. #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
  34. #define UCOL_CONTINUATION_MARKER 0xC0
  35. #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
  36. /**
  37. * This indicates an error has occurred during processing or there are no more CEs
  38. * to be returned.
  39. */
  40. #define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
  41. U_NAMESPACE_BEGIN
  42. class CollationElementIterator;
  43. class Collator;
  44. struct PCEI
  45. {
  46. uint64_t ce;
  47. int32_t low;
  48. int32_t high;
  49. };
  50. struct PCEBuffer
  51. {
  52. PCEI defaultBuffer[16];
  53. PCEI *buffer;
  54. int32_t bufferIndex;
  55. int32_t bufferSize;
  56. PCEBuffer();
  57. ~PCEBuffer();
  58. void reset();
  59. UBool isEmpty() const;
  60. void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
  61. const PCEI *get();
  62. };
  63. class UCollationPCE : public UMemory {
  64. private:
  65. PCEBuffer pceBuffer;
  66. CollationElementIterator *cei;
  67. UCollationStrength strength;
  68. UBool toShift;
  69. UBool isShifted;
  70. uint32_t variableTop;
  71. public:
  72. UCollationPCE(UCollationElements *elems);
  73. UCollationPCE(CollationElementIterator *iter);
  74. ~UCollationPCE();
  75. void init(UCollationElements *elems);
  76. void init(CollationElementIterator *iter);
  77. /**
  78. * Get the processed ordering priority of the next collation element in the text.
  79. * A single character may contain more than one collation element.
  80. *
  81. * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
  82. * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
  83. * @param status A pointer to an UErrorCode to receive any errors.
  84. * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
  85. * if an error has occurred or if the end of string has been reached
  86. */
  87. int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
  88. /**
  89. * Get the processed ordering priority of the previous collation element in the text.
  90. * A single character may contain more than one collation element.
  91. *
  92. * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
  93. * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
  94. * @param status A pointer to an UErrorCode to receive any errors. Notably
  95. * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
  96. * buffer has been exhausted.
  97. * @return The previous collation elements ordering, otherwise returns
  98. * UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
  99. * string has been reached.
  100. */
  101. int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
  102. private:
  103. void init(const Collator &coll);
  104. uint64_t processCE(uint32_t ce);
  105. };
  106. U_NAMESPACE_END
  107. #define INITIAL_ARRAY_SIZE_ 256
  108. struct USearch {
  109. // required since collation element iterator does not have a getText API
  110. const UChar *text;
  111. int32_t textLength; // exact length
  112. UBool isOverlap;
  113. UBool isCanonicalMatch;
  114. int16_t elementComparisonType;
  115. UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created.
  116. UBreakIterator *breakIter; // caller provided character breakiterator
  117. // value USEARCH_DONE is the default value
  118. // if we are not at the start of the text or the end of the text,
  119. // depending on the iteration direction and matchedIndex is USEARCH_DONE
  120. // it means that we can't find any more matches in that particular direction
  121. int32_t matchedIndex;
  122. int32_t matchedLength;
  123. UBool isForwardSearching;
  124. UBool reset;
  125. };
  126. struct UPattern {
  127. const UChar *text;
  128. int32_t textLength; // exact length
  129. // length required for backwards ce comparison
  130. int32_t cesLength;
  131. int32_t *ces;
  132. int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
  133. int32_t pcesLength;
  134. int64_t *pces;
  135. int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
  136. UBool hasPrefixAccents;
  137. UBool hasSuffixAccents;
  138. };
  139. struct UStringSearch {
  140. struct USearch *search;
  141. struct UPattern pattern;
  142. const UCollator *collator;
  143. const icu::Normalizer2 *nfd;
  144. // positions within the collation element iterator is used to determine
  145. // if we are at the start of the text.
  146. UCollationElements *textIter;
  147. icu::UCollationPCE *textProcessedIter;
  148. // utility collation element, used throughout program for temporary
  149. // iteration.
  150. UCollationElements *utilIter;
  151. UBool ownCollator;
  152. UCollationStrength strength;
  153. uint32_t ceMask;
  154. uint32_t variableTop;
  155. UBool toShift;
  156. };
  157. /**
  158. * Exact matches without checking for the ends for extra accents.
  159. * The match after the position within the collation element iterator is to be
  160. * found.
  161. * After a match is found the offset in the collation element iterator will be
  162. * shifted to the start of the match.
  163. * Implementation note:
  164. * For tertiary we can't use the collator->tertiaryMask, that is a
  165. * preprocessed mask that takes into account case options. since we are only
  166. * concerned with exact matches, we don't need that.
  167. * Alternate handling - since only the 16 most significant digits is only used,
  168. * we can safely do a compare without masking if the ce is a variable, we mask
  169. * and get only the primary values no shifting to quartenary is required since
  170. * all primary values less than variabletop will need to be masked off anyway.
  171. * If the end character is composite and the pattern ce does not match the text
  172. * ce, we skip it until we find a match in the end composite character or when
  173. * it has passed the character. This is so that we can match pattern "a" with
  174. * the text "\u00e6"
  175. * @param strsrch string search data
  176. * @param status error status if any
  177. * @return true if an exact match is found, false otherwise
  178. */
  179. U_CFUNC
  180. UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
  181. /**
  182. * Canonical matches.
  183. * According to the definition, matches found here will include the whole span
  184. * of beginning and ending accents if it overlaps that region.
  185. * @param strsrch string search data
  186. * @param status error status if any
  187. * @return true if a canonical match is found, false otherwise
  188. */
  189. U_CFUNC
  190. UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
  191. /**
  192. * Gets the previous match.
  193. * Comments follows from handleNextExact
  194. * @param strsrch string search data
  195. * @param status error status if any
  196. * @return True if a exact math is found, false otherwise.
  197. */
  198. U_CFUNC
  199. UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
  200. /**
  201. * Canonical matches.
  202. * According to the definition, matches found here will include the whole span
  203. * of beginning and ending accents if it overlaps that region.
  204. * @param strsrch string search data
  205. * @param status error status if any
  206. * @return true if a canonical match is found, false otherwise
  207. */
  208. U_CFUNC
  209. UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
  210. UErrorCode *status);
  211. #endif /* #if !UCONFIG_NO_COLLATION */
  212. #endif