rbbi_cache.h 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. // Copyright (C) 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // file: rbbi_cache.h
  4. //
  5. #ifndef RBBI_CACHE_H
  6. #define RBBI_CACHE_H
  7. #include "unicode/utypes.h"
  8. #if !UCONFIG_NO_BREAK_ITERATION
  9. #include "unicode/rbbi.h"
  10. #include "unicode/uobject.h"
  11. #include "uvectr32.h"
  12. U_NAMESPACE_BEGIN
  13. /* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
  14. * Dictionary boundaries are moved first to this cache, then from here
  15. * to the main BreakCache, where they may inter-leave with non-dictionary
  16. * boundaries. The public BreakIterator API always fetches directly
  17. * from the main BreakCache, not from here.
  18. *
  19. * In common situations, the number of boundaries in a single dictionary run
  20. * should be quite small, it will be terminated by punctuation, spaces,
  21. * or any other non-dictionary characters. The main BreakCache may end
  22. * up with boundaries from multiple dictionary based runs.
  23. *
  24. * The boundaries are stored in a simple ArrayList (vector), with the
  25. * assumption that they will be accessed sequentially.
  26. */
  27. class RuleBasedBreakIterator::DictionaryCache: public UMemory {
  28. public:
  29. DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
  30. ~DictionaryCache();
  31. void reset();
  32. UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
  33. UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
  34. /**
  35. * Populate the cache with the dictionary based boundaries within a region of text.
  36. * @param startPos The start position of a range of text
  37. * @param endPos The end position of a range of text
  38. * @param firstRuleStatus The rule status index that applies to the break at startPos
  39. * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
  40. * @internal
  41. */
  42. void populateDictionary(int32_t startPos, int32_t endPos,
  43. int32_t firstRuleStatus, int32_t otherRuleStatus);
  44. RuleBasedBreakIterator *fBI;
  45. UVector32 fBreaks; // A vector containing the boundaries.
  46. int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
  47. // or preceding(). Optimizes sequential access.
  48. int32_t fStart; // Text position of first boundary in cache.
  49. int32_t fLimit; // Last boundary in cache. Which is the limit of the
  50. // text segment being handled by the dictionary.
  51. int32_t fFirstRuleStatusIndex; // Rule status info for first boundary.
  52. int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
  53. };
  54. /*
  55. * class BreakCache
  56. *
  57. * Cache of break boundary positions and rule status values.
  58. * Break iterator API functions, next(), previous(), etc., will use cached results
  59. * when possible, and otherwise cache new results as they are obtained.
  60. *
  61. * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
  62. *
  63. * The cache is implemented as a single circular buffer.
  64. */
  65. /*
  66. * size of the circular cache buffer.
  67. */
  68. class RuleBasedBreakIterator::BreakCache: public UMemory {
  69. public:
  70. BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
  71. virtual ~BreakCache();
  72. void reset(int32_t pos = 0, int32_t ruleStatus = 0);
  73. void next() { if (fBufIdx == fEndBufIdx) {
  74. nextOL();
  75. } else {
  76. fBufIdx = modChunkSize(fBufIdx + 1);
  77. fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
  78. fBI->fRuleStatusIndex = fStatuses[fBufIdx];
  79. }
  80. }
  81. void nextOL();
  82. void previous(UErrorCode &status);
  83. // Move the iteration state to the position following the startPosition.
  84. // Input position must be pinned to the input length.
  85. void following(int32_t startPosition, UErrorCode &status);
  86. void preceding(int32_t startPosition, UErrorCode &status);
  87. /*
  88. * Update the state of the public BreakIterator (fBI) to reflect the
  89. * current state of the break iterator cache (this).
  90. */
  91. int32_t current();
  92. /**
  93. * Add boundaries to the cache near the specified position.
  94. * The given position need not be a boundary itself.
  95. * The input position must be within the range of the text, and
  96. * on a code point boundary.
  97. * If the requested position is a break boundary, leave the iteration
  98. * position on it.
  99. * If the requested position is not a boundary, leave the iteration
  100. * position on the preceding boundary and include both the
  101. * preceding and following boundaries in the cache.
  102. * Additional boundaries, either preceding or following, may be added
  103. * to the cache as a side effect.
  104. *
  105. * Return false if the operation failed.
  106. */
  107. UBool populateNear(int32_t position, UErrorCode &status);
  108. /**
  109. * Add boundary(s) to the cache following the current last boundary.
  110. * Return false if at the end of the text, and no more boundaries can be added.
  111. * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
  112. */
  113. UBool populateFollowing();
  114. /**
  115. * Add one or more boundaries to the cache preceding the first currently cached boundary.
  116. * Leave the iteration position on the first added boundary.
  117. * Return false if no boundaries could be added (if at the start of the text.)
  118. */
  119. UBool populatePreceding(UErrorCode &status);
  120. enum UpdatePositionValues {
  121. RetainCachePosition = 0,
  122. UpdateCachePosition = 1
  123. };
  124. /*
  125. * Add the boundary following the current position.
  126. * The current position can be left as it was, or changed to the newly added boundary,
  127. * as specified by the update parameter.
  128. */
  129. void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
  130. /*
  131. * Add the boundary preceding the current position.
  132. * The current position can be left as it was, or changed to the newly added boundary,
  133. * as specified by the update parameter.
  134. */
  135. bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
  136. /**
  137. * Set the cache position to the specified position, or, if the position
  138. * falls between to cached boundaries, to the preceding boundary.
  139. * Fails if the requested position is outside of the range of boundaries currently held by the cache.
  140. * The startPosition must be on a code point boundary.
  141. *
  142. * Return true if successful, false if the specified position is after
  143. * the last cached boundary or before the first.
  144. */
  145. UBool seek(int32_t startPosition);
  146. void dumpCache();
  147. private:
  148. static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }
  149. static constexpr int32_t CACHE_SIZE = 128;
  150. static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
  151. RuleBasedBreakIterator *fBI;
  152. int32_t fStartBufIdx;
  153. int32_t fEndBufIdx; // inclusive
  154. int32_t fTextIdx;
  155. int32_t fBufIdx;
  156. int32_t fBoundaries[CACHE_SIZE];
  157. uint16_t fStatuses[CACHE_SIZE];
  158. UVector32 fSideBuffer;
  159. };
  160. U_NAMESPACE_END
  161. #endif // #if !UCONFIG_NO_BREAK_ITERATION
  162. #endif // RBBI_CACHE_H