rbbi_cache.cpp

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// file: rbbi_cache.cpp

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/ubrk.h"
#include "unicode/rbbi.h"

#include "rbbi_cache.h"

#include "brkeng.h"
#include "cmemory.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "uassert.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

/*
 * DictionaryCache implementation
 */
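// The DictionaryCache holds boundaries produced by dictionary-based break engines for a
// single contiguous range of text. fBreaks is the vector of cached boundary positions;
// fStart and fLimit are the first and last cached boundaries, and fPositionInCache is an
// index into fBreaks used as a hint for sequential iteration, or -1 when no hint is valid.
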
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fBreaks(status), fPositionInCache(-1),
        fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
}

RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
}

void RuleBasedBreakIterator::DictionaryCache::reset() {
    fPositionInCache = -1;
    fStart = 0;
    fLimit = 0;
    fFirstRuleStatusIndex = 0;
    fOtherRuleStatusIndex = 0;
    fBreaks.removeAllElements();
}

UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (fromPos >= fLimit || fromPos < fStart) {
        fPositionInCache = -1;
        return false;
    }

    // Sequential iteration, move from the previous boundary to the following.
    int32_t r = 0;
    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        ++fPositionInCache;
        if (fPositionInCache >= fBreaks.size()) {
            fPositionInCache = -1;
            return false;
        }
        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r > fromPos);
        *result = r;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random indexing. Linear search for the boundary following the given position.
    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
        r = fBreaks.elementAti(fPositionInCache);
        if (r > fromPos) {
            *result = r;
            *statusIndex = fOtherRuleStatusIndex;
            return true;
        }
    }
    UPRV_UNREACHABLE_EXIT;
}

UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (fromPos <= fStart || fromPos > fLimit) {
        fPositionInCache = -1;
        return false;
    }

    if (fromPos == fLimit) {
        fPositionInCache = fBreaks.size() - 1;
        if (fPositionInCache >= 0) {
            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
        }
    }

    int32_t r;
    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        --fPositionInCache;
        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r < fromPos);
        *result = r;
        *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
        return true;
    }

    if (fPositionInCache == 0) {
        fPositionInCache = -1;
        return false;
    }

    for (fPositionInCache = fBreaks.size() - 1; fPositionInCache >= 0; --fPositionInCache) {
        r = fBreaks.elementAti(fPositionInCache);
        if (r < fromPos) {
            *result = r;
            *statusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
            return true;
        }
    }
    UPRV_UNREACHABLE_EXIT;
}

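// Scan the range [startPos, endPos) for runs of dictionary characters and hand each run to
// the appropriate LanguageBreakEngine. Any boundaries the engines report are collected in
// fBreaks; if any were found, fStart and fLimit are set to the first and last cached boundary.
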
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
                                                                 int32_t firstRuleStatus, int32_t otherRuleStatus) {
    if ((endPos - startPos) <= 1) {
        return;
    }

    reset();
    fFirstRuleStatusIndex = firstRuleStatus;
    fOtherRuleStatusIndex = otherRuleStatus;

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t category;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    int32_t foundBreakCount = 0;
    UText *text = &fBI->fText;

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.

    utext_setNativeIndex(text, rangeStart);
    UChar32 c = utext_current32(text);
    category = ucptrie_get(fBI->fData->fTrie, c);
    uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;

    while (U_SUCCESS(status)) {
        while ((current = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(text))) < rangeEnd
                && (category < dictStart)) {
            utext_next32(text);           // TODO: cleaner loop structure.
            c = utext_current32(text);
            category = ucptrie_get(fBI->fData->fTrie, c);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

        // Ask the language object if there are any breaks. It will add them to the cache and
        // leave the text pointer on the other side of its range, ready to search for the next one.
        if (lbe != nullptr) {
            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
        }

        // Reload the loop variables for the next go-round.
        c = utext_current32(text);
        category = ucptrie_get(fBI->fData->fTrie, c);
    }

    // If we found breaks, ensure that the first and last entries are
    // the original starting and ending positions, and initialize the
    // cache iteration position to the first entry.

    // printf("foundBreakCount = %d\n", foundBreakCount);
    if (foundBreakCount > 0) {
        U_ASSERT(foundBreakCount == fBreaks.size());
        if (startPos < fBreaks.elementAti(0)) {
            // The dictionary did not place a boundary at the start of the segment of text.
            // Add one now. This should not commonly happen, but it would be easy for interactions
            // of the rules for dictionary segments and the break engine implementations to
            // inadvertently cause it. Cover it here, just in case.
            fBreaks.insertElementAt(startPos, 0, status);
        }
        if (endPos > fBreaks.peeki()) {
            fBreaks.push(endPos, status);
        }
        fPositionInCache = 0;
        // Note: Dictionary matching may extend beyond the original limit.
        fStart = fBreaks.elementAti(0);
        fLimit = fBreaks.peeki();
    } else {
        // There were no language-based breaks, even though the segment contained
        // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
        // for this range will fail, and the calling code will fall back to the rule-based boundaries.
    }
}

/*
 * BreakCache implementation
 */
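// The BreakCache keeps the most recently computed boundaries in a fixed-size circular
// buffer. fBoundaries[] and fStatuses[] hold the cached positions and their rule status
// indexes; fStartBufIdx and fEndBufIdx are the oldest and newest occupied slots, with
// indexes wrapping via modChunkSize(). fBufIdx is the slot holding the current iteration
// position, and fTextIdx mirrors that slot's text position.
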
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fSideBuffer(status) {
    reset();
}

RuleBasedBreakIterator::BreakCache::~BreakCache() {
}

void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    fStartBufIdx = 0;
    fEndBufIdx = 0;
    fTextIdx = pos;
    fBufIdx = 0;
    fBoundaries[0] = pos;
    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
}

int32_t RuleBasedBreakIterator::BreakCache::current() {
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fDone = false;
    return fTextIdx;
}

void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
        // startPos is in the cache. Do a next() from that position.
        // TODO: an awkward set of interactions with bi->fDone
        //       seek() does not clear it; it can't because of interactions with populateNear().
        //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
        //       So clear it here, for the case where seek() succeeded on an iterator that had
        //       previously run off the end.
        fBI->fDone = false;
        next();
    }
}

void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
        if (startPos == fTextIdx) {
            previous(status);
        } else {
            // seek() leaves the BreakCache positioned at the preceding boundary
            // if the requested position is between two boundaries.
            // current() pushes the BreakCache position out to the BreakIterator itself.
            U_ASSERT(startPos > fTextIdx);
            current();
        }
    }
}

/*
 * Out-of-line code for BreakCache::next().
 * The cache does not already contain the boundary.
 */
void RuleBasedBreakIterator::BreakCache::nextOL() {
    fBI->fDone = !populateFollowing();
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}

void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    int32_t initialBufIdx = fBufIdx;
    if (fBufIdx == fStartBufIdx) {
        // At start of cache. Prepend to it.
        populatePreceding(status);
    } else {
        // Cache already holds the preceding boundary.
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    }
    fBI->fDone = (fBufIdx == initialBufIdx);
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}

UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
        return false;
    }
    if (pos == fBoundaries[fStartBufIdx]) {
        // Common case: seek(0), from BreakIterator::first()
        fBufIdx = fStartBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
    if (pos == fBoundaries[fEndBufIdx]) {
        fBufIdx = fEndBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
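
    // General case: binary search for the boundary at or preceding pos. The buffer is
    // circular, so the logical order of entries may wrap past the end of the arrays; when
    // min > max the midpoint calculation compensates by adding CACHE_SIZE before reducing
    // the probe index modulo the buffer size.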
    int32_t min = fStartBufIdx;
    int32_t max = fEndBufIdx;
    while (min != max) {
        int32_t probe = (min + max + (min > max ? CACHE_SIZE : 0)) / 2;
        probe = modChunkSize(probe);
        if (fBoundaries[probe] > pos) {
            max = probe;
        } else {
            min = modChunkSize(probe + 1);
        }
    }
    U_ASSERT(fBoundaries[max] > pos);
    fBufIdx = modChunkSize(max - 1);
    fTextIdx = fBoundaries[fBufIdx];
    U_ASSERT(fTextIdx <= pos);
    return true;
}

UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Add boundaries to the cache near the specified position.
    // The given position need not be a boundary itself.
    // The input position must be within the range of the text, and
    // on a code point boundary.
    //
    // If the requested position is a break boundary, leave the iteration
    // position on it.
    // If the requested position is not a boundary, leave the iteration
    // position on the preceding boundary and include both the
    // preceding and following boundaries in the cache.
    // Additional boundaries, either preceding or following, may be added
    // to the cache as a side effect.
    //
    // If the requested position is not near already cached positions, clear the existing
    // cache, find a nearby boundary, and begin new cache contents there.

    // Threshold for a text position to be considered near to existing cache contents.
    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    //       This value is subject to change. See the ticket for more details.
    static constexpr int32_t CACHE_NEAR = 15;

    int32_t aBoundary = -1;
    int32_t ruleStatusIndex = 0;
    bool retainCache = false;
    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
        // Requested position is near the existing cache. Retain it.
        retainCache = true;
    } else if (position <= CACHE_NEAR) {
        // Requested position is near the start of the text. Fill the cache from the start,
        // skipping the need to find a safe point.
        retainCache = false;
        aBoundary = 0;
    } else {
        // Requested position is not near the existing cache.
        // Find a safe point to refill the cache from.
        int32_t backupPos = fBI->handleSafePrevious(position);

        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
            // The requested position is beyond the end of the existing cache, but the
            // reverse rules produced a position near or before the cached region.
            // Retain the existing cache, and fill from the end of it.
            retainCache = true;
        } else if (backupPos < CACHE_NEAR) {
            // The safe reverse rules moved us to near the start of text.
            // Take that (index 0) as the backup boundary, avoiding the complication
            // (in the following block) of moving forward from the safe point to a known boundary.
            //
            // Retain the cache if it begins not too far from the requested position.
            aBoundary = 0;
            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
        } else {
            // The safe reverse rules produced a position that is neither near the existing
            // cache, nor near the start of text.
            // Advance to the boundary following.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            retainCache = false;
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();
            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
                // +4 is a quick test for possibly having advanced only one code point.
                // Four being the length of the longest potential code point, a supplementary in UTF-8.
                utext_setNativeIndex(&fBI->fText, aBoundary);
                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    aBoundary = fBI->handleNext();    // Safe rules identify safe pairs.
                }
            }
            if (aBoundary == UBRK_DONE) {
                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
                // to prove that it can't. We ran off the end of the string looking for a boundary
                // following a safe point; choose the end of the string as that boundary.
                aBoundary = static_cast<int32_t>(utext_nativeLength(&fBI->fText));
            }
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
    }

    if (!retainCache) {
        U_ASSERT(aBoundary != -1);
        reset(aBoundary, ruleStatusIndex);    // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between the existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                UPRV_UNREACHABLE_EXIT;
            }
        }
        fBufIdx = fEndBufIdx;                 // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];      // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {         // Move backwards to a position at or preceding the requested pos.
            previous(status);
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // Back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            populatePreceding(status);
        }
        fBufIdx = fStartBufIdx;               // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];      // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {         // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving the cache position at the boundary preceding the requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}

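// Add the boundary following the last one currently in the cache, leaving the cache
// position on the new boundary. The boundary comes from the dictionary cache when one is
// available for the range, otherwise from the rules via handleNext(); any rule segment
// containing dictionary characters is subdivided, and a few extra rule-based boundaries
// are added ahead of time to speed up straightforward forward iteration.
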
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    int32_t fromPosition = fBoundaries[fEndBufIdx];
    int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
    int32_t pos = 0;
    int32_t ruleStatusIdx = 0;

    if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
        return true;
    }

    fBI->fPosition = fromPosition;
    pos = fBI->handleNext();
    if (pos == UBRK_DONE) {
        return false;
    }

    ruleStatusIdx = fBI->fRuleStatusIndex;
    if (fBI->fDictionaryCharCount > 0) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with the subdivided results going into the dictionary cache.
        fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
        if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
            return true;
            // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
            //       But be careful with interactions with populateNear().
        }
    }

    // The rule-based segment did not include dictionary characters.
    // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
    // meaning that we didn't take the return, above.
    // Add its end point to the cache.
    addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

    // Add several non-dictionary boundaries at this point, to optimize straightforward iteration.
    // (Subsequent calls to BreakIterator::next() will take the fast path, getting cached results.)
    //
    for (int count = 0; count < 6; ++count) {
        pos = fBI->handleNext();
        if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
    }

    return true;
}

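// Add one or more boundaries preceding the first one currently in the cache, leaving the
// cache position on the boundary immediately before the old start. Because boundary rules
// are evaluated moving forward, this backs up to a safe point (handleSafePrevious), iterates
// forward from there, and stages results in fSideBuffer before prepending them to the
// circular buffer.
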
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }

    int32_t fromPosition = fBoundaries[fStartBufIdx];
    if (fromPosition == 0) {
        return false;
    }

    int32_t position = 0;
    int32_t positionStatusIdx = 0;

    if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
        addPreceding(position, positionStatusIdx, UpdateCachePosition);
        return true;
    }

    int32_t backupPosition = fromPosition;

    // Find a boundary somewhere preceding the first already-cached boundary.
    do {
        backupPosition = backupPosition - 30;
        if (backupPosition <= 0) {
            backupPosition = 0;
        } else {
            backupPosition = fBI->handleSafePrevious(backupPosition);
        }
        if (backupPosition == UBRK_DONE || backupPosition == 0) {
            position = 0;
            positionStatusIdx = 0;
        } else {
            // Advance to the boundary following the backup position.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            //
            fBI->fPosition = backupPosition;
            position = fBI->handleNext();
            if (position <= backupPosition + 4) {
                // +4 is a quick test for possibly having advanced only one code point.
                // Four being the length of the longest potential code point, a supplementary in UTF-8.
                utext_setNativeIndex(&fBI->fText, position);
                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    position = fBI->handleNext();    // Safe rules identify safe pairs.
                }
            }
            positionStatusIdx = fBI->fRuleStatusIndex;
        }
    } while (position >= fromPosition);

    // Find boundaries between the one we just located and the first already-cached boundary.
    // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
    fSideBuffer.removeAllElements();
    fSideBuffer.addElement(position, status);
    fSideBuffer.addElement(positionStatusIdx, status);

    do {
        int32_t prevPosition = fBI->fPosition = position;
        int32_t prevStatusIdx = positionStatusIdx;
        position = fBI->handleNext();
        positionStatusIdx = fBI->fRuleStatusIndex;
        if (position == UBRK_DONE) {
            break;
        }

        UBool segmentHandledByDictionary = false;
        if (fBI->fDictionaryCharCount != 0) {
            // The segment from the rules includes dictionary characters.
            // Subdivide it, with the subdivided results going into the dictionary cache.
            int32_t dictSegEndPosition = position;
            fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
            while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
                segmentHandledByDictionary = true;
                U_ASSERT(position > prevPosition);
                if (position >= fromPosition) {
                    break;
                }
                U_ASSERT(position <= dictSegEndPosition);
                fSideBuffer.addElement(position, status);
                fSideBuffer.addElement(positionStatusIdx, status);
                prevPosition = position;
            }
            U_ASSERT(position == dictSegEndPosition || position >= fromPosition);
        }

        if (!segmentHandledByDictionary && position < fromPosition) {
            fSideBuffer.addElement(position, status);
            fSideBuffer.addElement(positionStatusIdx, status);
        }
    } while (position < fromPosition);

    // Move boundaries from the side buffer to the main circular buffer.
    UBool success = false;
    if (!fSideBuffer.isEmpty()) {
        positionStatusIdx = fSideBuffer.popi();
        position = fSideBuffer.popi();
        addPreceding(position, positionStatusIdx, UpdateCachePosition);
        success = true;
    }

    while (!fSideBuffer.isEmpty()) {
        positionStatusIdx = fSideBuffer.popi();
        position = fSideBuffer.popi();
        if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
            // No space in the circular buffer to hold a new preceding result while
            // also retaining the current cache (iteration) position.
            // Bailing out is safe; the cache will refill again if needed.
            break;
        }
    }

    return success;
}

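// addFollowing() and addPreceding() claim the next slot at the appropriate end of the
// circular buffer, evicting entries from the opposite end when the buffer is full.
// UpdateCachePosition also moves the current iteration position (fBufIdx / fTextIdx) onto
// the newly added boundary; RetainCachePosition leaves it where it is.
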
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position > fBoundaries[fEndBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
    if (nextIdx == fStartBufIdx) {
        fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
    }
    fBoundaries[nextIdx] = position;
    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    fEndBufIdx = nextIdx;
    if (update == UpdateCachePosition) {
        // Set current position to the newly added boundary.
        fBufIdx = nextIdx;
        fTextIdx = position;
    } else {
        // Retaining the original cache position.
        // Check if the added boundary wraps around the buffer, and would over-write the original position.
        // It's the responsibility of callers of this function to not add too many.
        U_ASSERT(nextIdx != fBufIdx);
    }
}

bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position < fBoundaries[fStartBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);
    int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
    if (nextIdx == fEndBufIdx) {
        if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
            // Failure. The insertion of the new boundary would claim the buffer position that is the
            // current iteration position. And we also want to retain the current iteration position.
            // (The buffer is already completely full of entries that precede the iteration position.)
            return false;
        }
        fEndBufIdx = modChunkSize(fEndBufIdx - 1);
    }
    fBoundaries[nextIdx] = position;
    fStatuses[nextIdx] = static_cast<uint16_t>(ruleStatusIdx);
    fStartBufIdx = nextIdx;
    if (update == UpdateCachePosition) {
        fBufIdx = nextIdx;
        fTextIdx = position;
    }
    return true;
}

void RuleBasedBreakIterator::BreakCache::dumpCache() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
    for (int32_t i = fStartBufIdx; ; i = modChunkSize(i + 1)) {
        RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]);
        if (i == fEndBufIdx) {
            break;
        }
    }
#endif
}

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_BREAK_ITERATION