rbbi.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 1999-2016 International Business Machines Corporation
  6. * and others. All rights reserved.
  7. ***************************************************************************
  8. */
  9. //
  10. // file: rbbi.cpp Contains the implementation of the rule based break iterator
  11. // runtime engine and the API implementation for
  12. // class RuleBasedBreakIterator
  13. //
  14. #include "utypeinfo.h" // for 'typeid' to work
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_BREAK_ITERATION
  17. #include <cinttypes>
  18. #include "unicode/rbbi.h"
  19. #include "unicode/schriter.h"
  20. #include "unicode/uchriter.h"
  21. #include "unicode/uclean.h"
  22. #include "unicode/udata.h"
  23. #include "brkeng.h"
  24. #include "ucln_cmn.h"
  25. #include "cmemory.h"
  26. #include "cstring.h"
  27. #include "localsvc.h"
  28. #include "rbbidata.h"
  29. #include "rbbi_cache.h"
  30. #include "rbbirb.h"
  31. #include "uassert.h"
  32. #include "umutex.h"
  33. #include "uvectr32.h"
  34. #ifdef RBBI_DEBUG
  35. static UBool gTrace = false;
  36. #endif
  37. U_NAMESPACE_BEGIN
  38. // The state number of the starting state
  39. constexpr int32_t START_STATE = 1;
  40. // The state-transition value indicating "stop"
  41. constexpr int32_t STOP_STATE = 0;
  42. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  43. //=======================================================================
  44. // constructors
  45. //=======================================================================
  46. /**
  47. * Constructs a RuleBasedBreakIterator that uses the already-created
  48. * tables object that is passed in as a parameter.
  49. */
  50. RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  51. : RuleBasedBreakIterator(&status)
  52. {
  53. fData = new RBBIDataWrapper(data, status); // status checked in constructor
  54. if (U_FAILURE(status)) {return;}
  55. if(fData == nullptr) {
  56. status = U_MEMORY_ALLOCATION_ERROR;
  57. return;
  58. }
  59. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  60. fLookAheadMatches = static_cast<int32_t *>(
  61. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  62. if (fLookAheadMatches == nullptr) {
  63. status = U_MEMORY_ALLOCATION_ERROR;
  64. return;
  65. }
  66. }
  67. }
  68. //-------------------------------------------------------------------------------
  69. //
  70. // Constructor from a UDataMemory handle to precompiled break rules
  71. // stored in an ICU data file. This construcotr is private API,
  72. // only for internal use.
  73. //
  74. //-------------------------------------------------------------------------------
  75. RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
  76. UErrorCode &status) : RuleBasedBreakIterator(udm, status)
  77. {
  78. fIsPhraseBreaking = isPhraseBreaking;
  79. }
  80. //
  81. // Construct from precompiled binary rules (tables). This constructor is public API,
  82. // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
  83. //
  84. RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  85. uint32_t ruleLength,
  86. UErrorCode &status)
  87. : RuleBasedBreakIterator(&status)
  88. {
  89. if (U_FAILURE(status)) {
  90. return;
  91. }
  92. if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) {
  93. status = U_ILLEGAL_ARGUMENT_ERROR;
  94. return;
  95. }
  96. const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
  97. if (data->fLength > ruleLength) {
  98. status = U_ILLEGAL_ARGUMENT_ERROR;
  99. return;
  100. }
  101. fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
  102. if (U_FAILURE(status)) {return;}
  103. if(fData == nullptr) {
  104. status = U_MEMORY_ALLOCATION_ERROR;
  105. return;
  106. }
  107. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  108. fLookAheadMatches = static_cast<int32_t *>(
  109. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  110. if (fLookAheadMatches == nullptr) {
  111. status = U_MEMORY_ALLOCATION_ERROR;
  112. return;
  113. }
  114. }
  115. }
  116. //-------------------------------------------------------------------------------
  117. //
  118. // Constructor from a UDataMemory handle to precompiled break rules
  119. // stored in an ICU data file.
  120. //
  121. //-------------------------------------------------------------------------------
  122. RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
  123. : RuleBasedBreakIterator(&status)
  124. {
  125. fData = new RBBIDataWrapper(udm, status); // status checked in constructor
  126. if (U_FAILURE(status)) {return;}
  127. if(fData == nullptr) {
  128. status = U_MEMORY_ALLOCATION_ERROR;
  129. return;
  130. }
  131. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  132. fLookAheadMatches = static_cast<int32_t *>(
  133. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  134. if (fLookAheadMatches == nullptr) {
  135. status = U_MEMORY_ALLOCATION_ERROR;
  136. return;
  137. }
  138. }
  139. }
  140. //-------------------------------------------------------------------------------
  141. //
  142. // Constructor from a set of rules supplied as a string.
  143. //
  144. //-------------------------------------------------------------------------------
  145. RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
  146. UParseError &parseError,
  147. UErrorCode &status)
  148. : RuleBasedBreakIterator(&status)
  149. {
  150. if (U_FAILURE(status)) {return;}
  151. RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
  152. RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
  153. // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
  154. // creates and returns a complete RBBI. From here, in a constructor, we
  155. // can't just return the object created by the builder factory, hence
  156. // the assignment of the factory created object to "this".
  157. if (U_SUCCESS(status)) {
  158. *this = *bi;
  159. delete bi;
  160. }
  161. }
  162. //-------------------------------------------------------------------------------
  163. //
  164. // Default Constructor. Create an empty shell that can be set up later.
  165. // Used when creating a RuleBasedBreakIterator from a set
  166. // of rules.
  167. //-------------------------------------------------------------------------------
  168. RuleBasedBreakIterator::RuleBasedBreakIterator()
  169. : RuleBasedBreakIterator(nullptr)
  170. {
  171. }
  172. /**
  173. * Simple Constructor with an error code.
  174. * Handles common initialization for all other constructors.
  175. */
  176. RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
  177. UErrorCode ec = U_ZERO_ERROR;
  178. if (status == nullptr) {
  179. status = &ec;
  180. }
  181. utext_openUChars(&fText, nullptr, 0, status);
  182. LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status);
  183. LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status);
  184. if (U_FAILURE(*status)) {
  185. fErrorCode = *status;
  186. return;
  187. }
  188. fDictionaryCache = lpDictionaryCache.orphan();
  189. fBreakCache = lpBreakCache.orphan();
  190. #ifdef RBBI_DEBUG
  191. static UBool debugInitDone = false;
  192. if (debugInitDone == false) {
  193. char *debugEnv = getenv("U_RBBIDEBUG");
  194. if (debugEnv && uprv_strstr(debugEnv, "trace")) {
  195. gTrace = true;
  196. }
  197. debugInitDone = true;
  198. }
  199. #endif
  200. }
  201. //-------------------------------------------------------------------------------
  202. //
  203. // Copy constructor. Will produce a break iterator with the same behavior,
  204. // and which iterates over the same text, as the one passed in.
  205. //
  206. //-------------------------------------------------------------------------------
  207. RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
  208. : RuleBasedBreakIterator()
  209. {
  210. *this = other;
  211. }
  212. /**
  213. * Destructor
  214. */
  215. RuleBasedBreakIterator::~RuleBasedBreakIterator() {
  216. if (fCharIter != &fSCharIter) {
  217. // fCharIter was adopted from the outside.
  218. delete fCharIter;
  219. }
  220. fCharIter = nullptr;
  221. utext_close(&fText);
  222. if (fData != nullptr) {
  223. fData->removeReference();
  224. fData = nullptr;
  225. }
  226. delete fBreakCache;
  227. fBreakCache = nullptr;
  228. delete fDictionaryCache;
  229. fDictionaryCache = nullptr;
  230. delete fLanguageBreakEngines;
  231. fLanguageBreakEngines = nullptr;
  232. delete fUnhandledBreakEngine;
  233. fUnhandledBreakEngine = nullptr;
  234. uprv_free(fLookAheadMatches);
  235. fLookAheadMatches = nullptr;
  236. }
  237. /**
  238. * Assignment operator. Sets this iterator to have the same behavior,
  239. * and iterate over the same text, as the one passed in.
  240. * TODO: needs better handling of memory allocation errors.
  241. */
  242. RuleBasedBreakIterator&
  243. RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
  244. if (this == &that) {
  245. return *this;
  246. }
  247. BreakIterator::operator=(that);
  248. if (fLanguageBreakEngines != nullptr) {
  249. delete fLanguageBreakEngines;
  250. fLanguageBreakEngines = nullptr; // Just rebuild for now
  251. }
  252. // TODO: clone fLanguageBreakEngines from "that"
  253. UErrorCode status = U_ZERO_ERROR;
  254. utext_clone(&fText, &that.fText, false, true, &status);
  255. if (fCharIter != &fSCharIter) {
  256. delete fCharIter;
  257. }
  258. fCharIter = &fSCharIter;
  259. if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) {
  260. // This is a little bit tricky - it will initially appear that
  261. // this->fCharIter is adopted, even if that->fCharIter was
  262. // not adopted. That's ok.
  263. fCharIter = that.fCharIter->clone();
  264. }
  265. fSCharIter = that.fSCharIter;
  266. if (fCharIter == nullptr) {
  267. fCharIter = &fSCharIter;
  268. }
  269. if (fData != nullptr) {
  270. fData->removeReference();
  271. fData = nullptr;
  272. }
  273. if (that.fData != nullptr) {
  274. fData = that.fData->addReference();
  275. }
  276. uprv_free(fLookAheadMatches);
  277. fLookAheadMatches = nullptr;
  278. if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
  279. fLookAheadMatches = static_cast<int32_t *>(
  280. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  281. }
  282. fPosition = that.fPosition;
  283. fRuleStatusIndex = that.fRuleStatusIndex;
  284. fDone = that.fDone;
  285. // TODO: both the dictionary and the main cache need to be copied.
  286. // Current position could be within a dictionary range. Trying to continue
  287. // the iteration without the caches present would go to the rules, with
  288. // the assumption that the current position is on a rule boundary.
  289. fBreakCache->reset(fPosition, fRuleStatusIndex);
  290. fDictionaryCache->reset();
  291. return *this;
  292. }
  293. //-----------------------------------------------------------------------------
  294. //
  295. // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
  296. // behavior, and iterating over the same text, as this one.
  297. // Virtual function: does the right thing with subclasses.
  298. //
  299. //-----------------------------------------------------------------------------
  300. RuleBasedBreakIterator*
  301. RuleBasedBreakIterator::clone() const {
  302. return new RuleBasedBreakIterator(*this);
  303. }
  304. /**
  305. * Equality operator. Returns true if both BreakIterators are of the
  306. * same class, have the same behavior, and iterate over the same text.
  307. */
  308. bool
  309. RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
  310. if (typeid(*this) != typeid(that)) {
  311. return false;
  312. }
  313. if (this == &that) {
  314. return true;
  315. }
  316. // The base class BreakIterator carries no state that participates in equality,
  317. // and does not implement an equality function that would otherwise be
  318. // checked at this point.
  319. const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that);
  320. if (!utext_equals(&fText, &that2.fText)) {
  321. // The two break iterators are operating on different text,
  322. // or have a different iteration position.
  323. // Note that fText's position is always the same as the break iterator's position.
  324. return false;
  325. }
  326. if (!(fPosition == that2.fPosition &&
  327. fRuleStatusIndex == that2.fRuleStatusIndex &&
  328. fDone == that2.fDone)) {
  329. return false;
  330. }
  331. if (that2.fData == fData ||
  332. (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
  333. // The two break iterators are using the same rules.
  334. return true;
  335. }
  336. return false;
  337. }
  338. /**
  339. * Compute a hash code for this BreakIterator
  340. * @return A hash code
  341. */
  342. int32_t
  343. RuleBasedBreakIterator::hashCode() const {
  344. int32_t hash = 0;
  345. if (fData != nullptr) {
  346. hash = fData->hashCode();
  347. }
  348. return hash;
  349. }
  350. void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
  351. if (U_FAILURE(status)) {
  352. return;
  353. }
  354. fBreakCache->reset();
  355. fDictionaryCache->reset();
  356. utext_clone(&fText, ut, false, true, &status);
  357. // Set up a dummy CharacterIterator to be returned if anyone
  358. // calls getText(). With input from UText, there is no reasonable
  359. // way to return a characterIterator over the actual input text.
  360. // Return one over an empty string instead - this is the closest
  361. // we can come to signaling a failure.
  362. // (GetText() is obsolete, this failure is sort of OK)
  363. fSCharIter.setText(u"", 0);
  364. if (fCharIter != &fSCharIter) {
  365. // existing fCharIter was adopted from the outside. Delete it now.
  366. delete fCharIter;
  367. }
  368. fCharIter = &fSCharIter;
  369. this->first();
  370. }
  371. UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
  372. UText *result = utext_clone(fillIn, &fText, false, true, &status);
  373. return result;
  374. }
  375. //=======================================================================
  376. // BreakIterator overrides
  377. //=======================================================================
  378. /**
  379. * Return a CharacterIterator over the text being analyzed.
  380. */
  381. CharacterIterator&
  382. RuleBasedBreakIterator::getText() const {
  383. return *fCharIter;
  384. }
  385. /**
  386. * Set the iterator to analyze a new piece of text. This function resets
  387. * the current iteration position to the beginning of the text.
  388. * @param newText An iterator over the text to analyze.
  389. */
  390. void
  391. RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
  392. // If we are holding a CharacterIterator adopted from a
  393. // previous call to this function, delete it now.
  394. if (fCharIter != &fSCharIter) {
  395. delete fCharIter;
  396. }
  397. fCharIter = newText;
  398. UErrorCode status = U_ZERO_ERROR;
  399. fBreakCache->reset();
  400. fDictionaryCache->reset();
  401. if (newText==nullptr || newText->startIndex() != 0) {
  402. // startIndex !=0 wants to be an error, but there's no way to report it.
  403. // Make the iterator text be an empty string.
  404. utext_openUChars(&fText, nullptr, 0, &status);
  405. } else {
  406. utext_openCharacterIterator(&fText, newText, &status);
  407. }
  408. this->first();
  409. }
  410. /**
  411. * Set the iterator to analyze a new piece of text. This function resets
  412. * the current iteration position to the beginning of the text.
  413. * @param newText An iterator over the text to analyze.
  414. */
  415. void
  416. RuleBasedBreakIterator::setText(const UnicodeString& newText) {
  417. UErrorCode status = U_ZERO_ERROR;
  418. fBreakCache->reset();
  419. fDictionaryCache->reset();
  420. utext_openConstUnicodeString(&fText, &newText, &status);
  421. // Set up a character iterator on the string.
  422. // Needed in case someone calls getText().
  423. // Can not, unfortunately, do this lazily on the (probably never)
  424. // call to getText(), because getText is const.
  425. fSCharIter.setText(newText.getBuffer(), newText.length());
  426. if (fCharIter != &fSCharIter) {
  427. // old fCharIter was adopted from the outside. Delete it.
  428. delete fCharIter;
  429. }
  430. fCharIter = &fSCharIter;
  431. this->first();
  432. }
  433. /**
  434. * Provide a new UText for the input text. Must reference text with contents identical
  435. * to the original.
  436. * Intended for use with text data originating in Java (garbage collected) environments
  437. * where the data may be moved in memory at arbitrary times.
  438. */
  439. RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
  440. if (U_FAILURE(status)) {
  441. return *this;
  442. }
  443. if (input == nullptr) {
  444. status = U_ILLEGAL_ARGUMENT_ERROR;
  445. return *this;
  446. }
  447. int64_t pos = utext_getNativeIndex(&fText);
  448. // Shallow read-only clone of the new UText into the existing input UText
  449. utext_clone(&fText, input, false, true, &status);
  450. if (U_FAILURE(status)) {
  451. return *this;
  452. }
  453. utext_setNativeIndex(&fText, pos);
  454. if (utext_getNativeIndex(&fText) != pos) {
  455. // Sanity check. The new input utext is supposed to have the exact same
  456. // contents as the old. If we can't set to the same position, it doesn't.
  457. // The contents underlying the old utext might be invalid at this point,
  458. // so it's not safe to check directly.
  459. status = U_ILLEGAL_ARGUMENT_ERROR;
  460. }
  461. return *this;
  462. }
  463. /**
  464. * Sets the current iteration position to the beginning of the text, position zero.
  465. * @return The new iterator position, which is zero.
  466. */
  467. int32_t RuleBasedBreakIterator::first() {
  468. UErrorCode status = U_ZERO_ERROR;
  469. if (!fBreakCache->seek(0)) {
  470. fBreakCache->populateNear(0, status);
  471. }
  472. fBreakCache->current();
  473. U_ASSERT(fPosition == 0);
  474. return 0;
  475. }
  476. /**
  477. * Sets the current iteration position to the end of the text.
  478. * @return The text's past-the-end offset.
  479. */
  480. int32_t RuleBasedBreakIterator::last() {
  481. int32_t endPos = (int32_t)utext_nativeLength(&fText);
  482. UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
  483. (void)endShouldBeBoundary;
  484. U_ASSERT(endShouldBeBoundary);
  485. U_ASSERT(fPosition == endPos);
  486. return endPos;
  487. }
  488. /**
  489. * Advances the iterator either forward or backward the specified number of steps.
  490. * Negative values move backward, and positive values move forward. This is
  491. * equivalent to repeatedly calling next() or previous().
  492. * @param n The number of steps to move. The sign indicates the direction
  493. * (negative is backwards, and positive is forwards).
  494. * @return The character offset of the boundary position n boundaries away from
  495. * the current one.
  496. */
  497. int32_t RuleBasedBreakIterator::next(int32_t n) {
  498. int32_t result = 0;
  499. if (n > 0) {
  500. for (; n > 0 && result != UBRK_DONE; --n) {
  501. result = next();
  502. }
  503. } else if (n < 0) {
  504. for (; n < 0 && result != UBRK_DONE; ++n) {
  505. result = previous();
  506. }
  507. } else {
  508. result = current();
  509. }
  510. return result;
  511. }
  512. /**
  513. * Advances the iterator to the next boundary position.
  514. * @return The position of the first boundary after this one.
  515. */
  516. int32_t RuleBasedBreakIterator::next() {
  517. fBreakCache->next();
  518. return fDone ? UBRK_DONE : fPosition;
  519. }
  520. /**
  521. * Move the iterator backwards, to the boundary preceding the current one.
  522. *
  523. * Starts from the current position within fText.
  524. * Starting position need not be on a boundary.
  525. *
  526. * @return The position of the boundary position immediately preceding the starting position.
  527. */
  528. int32_t RuleBasedBreakIterator::previous() {
  529. UErrorCode status = U_ZERO_ERROR;
  530. fBreakCache->previous(status);
  531. return fDone ? UBRK_DONE : fPosition;
  532. }
  533. /**
  534. * Sets the iterator to refer to the first boundary position following
  535. * the specified position.
  536. * @param startPos The position from which to begin searching for a break position.
  537. * @return The position of the first break after the current position.
  538. */
  539. int32_t RuleBasedBreakIterator::following(int32_t startPos) {
  540. // if the supplied position is before the beginning, return the
  541. // text's starting offset
  542. if (startPos < 0) {
  543. return first();
  544. }
  545. // Move requested offset to a code point start. It might be on a trail surrogate,
  546. // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
  547. utext_setNativeIndex(&fText, startPos);
  548. startPos = (int32_t)utext_getNativeIndex(&fText);
  549. UErrorCode status = U_ZERO_ERROR;
  550. fBreakCache->following(startPos, status);
  551. return fDone ? UBRK_DONE : fPosition;
  552. }
  553. /**
  554. * Sets the iterator to refer to the last boundary position before the
  555. * specified position.
  556. * @param offset The position to begin searching for a break from.
  557. * @return The position of the last boundary before the starting position.
  558. */
  559. int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
  560. if (offset > utext_nativeLength(&fText)) {
  561. return last();
  562. }
  563. // Move requested offset to a code point start. It might be on a trail surrogate,
  564. // or on a trail byte if the input is UTF-8.
  565. utext_setNativeIndex(&fText, offset);
  566. int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
  567. UErrorCode status = U_ZERO_ERROR;
  568. fBreakCache->preceding(adjustedOffset, status);
  569. return fDone ? UBRK_DONE : fPosition;
  570. }
  571. /**
  572. * Returns true if the specified position is a boundary position. As a side
  573. * effect, leaves the iterator pointing to the first boundary position at
  574. * or after "offset".
  575. *
  576. * @param offset the offset to check.
  577. * @return True if "offset" is a boundary position.
  578. */
  579. UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
  580. // out-of-range indexes are never boundary positions
  581. if (offset < 0) {
  582. first(); // For side effects on current position, tag values.
  583. return false;
  584. }
  585. // Adjust offset to be on a code point boundary and not beyond the end of the text.
  586. // Note that isBoundary() is always false for offsets that are not on code point boundaries.
  587. // But we still need the side effect of leaving iteration at the following boundary.
  588. utext_setNativeIndex(&fText, offset);
  589. int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
  590. bool result = false;
  591. UErrorCode status = U_ZERO_ERROR;
  592. if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
  593. result = (fBreakCache->current() == offset);
  594. }
  595. if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
  596. // Original offset is beyond the end of the text. Return false, it's not a boundary,
  597. // but the iteration position remains set to the end of the text, which is a boundary.
  598. return false;
  599. }
  600. if (!result) {
  601. // Not on a boundary. isBoundary() must leave iterator on the following boundary.
  602. // Cache->seek(), above, left us on the preceding boundary, so advance one.
  603. next();
  604. }
  605. return result;
  606. }
  607. /**
  608. * Returns the current iteration position.
  609. * @return The current iteration position.
  610. */
  611. int32_t RuleBasedBreakIterator::current() const {
  612. return fPosition;
  613. }
  614. //=======================================================================
  615. // implementation
  616. //=======================================================================
  617. //
  618. // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
  619. // of user text. A variable with this enum type keeps track of where we
  620. // are. The state machine only fetches user input while in the RUN mode.
  621. //
  622. enum RBBIRunMode {
  623. RBBI_START, // state machine processing is before first char of input
  624. RBBI_RUN, // state machine processing is in the user text
  625. RBBI_END // state machine processing is after end of user text.
  626. };
  627. // Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
  628. // instantiation, based on whether an 8 or 16 bit table is required.
  629. //
  630. // These Trie access functions will be inlined within the handleNext()/Previous() instantions.
  631. static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
  632. return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
  633. }
  634. static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
  635. return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
  636. }
  637. int32_t RuleBasedBreakIterator::handleNext() {
  638. const RBBIStateTable *statetable = fData->fForwardTable;
  639. bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
  640. if (statetable->fFlags & RBBI_8BITS_ROWS) {
  641. if (use8BitsTrie) {
  642. return handleNext<RBBIStateTableRow8, TrieFunc8>();
  643. } else {
  644. return handleNext<RBBIStateTableRow8, TrieFunc16>();
  645. }
  646. } else {
  647. if (use8BitsTrie) {
  648. return handleNext<RBBIStateTableRow16, TrieFunc8>();
  649. } else {
  650. return handleNext<RBBIStateTableRow16, TrieFunc16>();
  651. }
  652. }
  653. }
  654. int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  655. const RBBIStateTable *statetable = fData->fReverseTable;
  656. bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
  657. if (statetable->fFlags & RBBI_8BITS_ROWS) {
  658. if (use8BitsTrie) {
  659. return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
  660. } else {
  661. return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
  662. }
  663. } else {
  664. if (use8BitsTrie) {
  665. return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
  666. } else {
  667. return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
  668. }
  669. }
  670. }
  671. //-----------------------------------------------------------------------------------
  672. //
  673. // handleNext()
  674. // Run the state machine to find a boundary
  675. //
  676. //-----------------------------------------------------------------------------------
  677. template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  678. int32_t RuleBasedBreakIterator::handleNext() {
  679. int32_t state;
  680. uint16_t category = 0;
  681. RBBIRunMode mode;
  682. RowType *row;
  683. UChar32 c;
  684. int32_t result = 0;
  685. int32_t initialPosition = 0;
  686. const RBBIStateTable *statetable = fData->fForwardTable;
  687. const char *tableData = statetable->fTableData;
  688. uint32_t tableRowLen = statetable->fRowLen;
  689. uint32_t dictStart = statetable->fDictCategoriesStart;
  690. #ifdef RBBI_DEBUG
  691. if (gTrace) {
  692. RBBIDebugPuts("Handle Next pos char state category");
  693. }
  694. #endif
  695. // handleNext always sets the break tag value.
  696. // Set the default for it.
  697. fRuleStatusIndex = 0;
  698. fDictionaryCharCount = 0;
  699. // if we're already at the end of the text, return DONE.
  700. initialPosition = fPosition;
  701. UTEXT_SETNATIVEINDEX(&fText, initialPosition);
  702. result = initialPosition;
  703. c = UTEXT_NEXT32(&fText);
  704. if (c==U_SENTINEL) {
  705. fDone = true;
  706. return UBRK_DONE;
  707. }
  708. // Set the initial state for the state machine
  709. state = START_STATE;
  710. row = (RowType *)
  711. //(statetable->fTableData + (statetable->fRowLen * state));
  712. (tableData + tableRowLen * state);
  713. mode = RBBI_RUN;
  714. if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  715. category = 2;
  716. mode = RBBI_START;
  717. }
  718. // loop until we reach the end of the text or transition to state 0
  719. //
  720. for (;;) {
  721. if (c == U_SENTINEL) {
  722. // Reached end of input string.
  723. if (mode == RBBI_END) {
  724. // We have already run the loop one last time with the
  725. // character set to the psueudo {eof} value. Now it is time
  726. // to unconditionally bail out.
  727. break;
  728. }
  729. // Run the loop one last time with the fake end-of-input character category.
  730. mode = RBBI_END;
  731. category = 1;
  732. }
  733. //
  734. // Get the char category. An incoming category of 1 or 2 means that
  735. // we are preset for doing the beginning or end of input, and
  736. // that we shouldn't get a category from an actual text input character.
  737. //
  738. if (mode == RBBI_RUN) {
  739. // look up the current character's character category, which tells us
  740. // which column in the state table to look at.
  741. category = trieFunc(fData->fTrie, c);
  742. fDictionaryCharCount += (category >= dictStart);
  743. }
  744. #ifdef RBBI_DEBUG
  745. if (gTrace) {
  746. RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
  747. if (0x20<=c && c<0x7f) {
  748. RBBIDebugPrintf("\"%c\" ", c);
  749. } else {
  750. RBBIDebugPrintf("%5x ", c);
  751. }
  752. RBBIDebugPrintf("%3d %3d\n", state, category);
  753. }
  754. #endif
  755. // State Transition - move machine to its next state
  756. //
  757. // fNextState is a variable-length array.
  758. U_ASSERT(category<fData->fHeader->fCatCount);
  759. state = row->fNextState[category]; /*Not accessing beyond memory*/
  760. row = (RowType *)
  761. // (statetable->fTableData + (statetable->fRowLen * state));
  762. (tableData + tableRowLen * state);
  763. uint16_t accepting = row->fAccepting;
  764. if (accepting == ACCEPTING_UNCONDITIONAL) {
  765. // Match found, common case.
  766. if (mode != RBBI_START) {
  767. result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
  768. }
  769. fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
  770. } else if (accepting > ACCEPTING_UNCONDITIONAL) {
  771. // Lookahead match is completed.
  772. U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
  773. int32_t lookaheadResult = fLookAheadMatches[accepting];
  774. if (lookaheadResult >= 0) {
  775. fRuleStatusIndex = row->fTagsIdx;
  776. fPosition = lookaheadResult;
  777. return lookaheadResult;
  778. }
  779. }
  780. // If we are at the position of the '/' in a look-ahead (hard break) rule;
  781. // record the current position, to be returned later, if the full rule matches.
  782. // TODO: Move this check before the previous check of fAccepting.
  783. // This would enable hard-break rules with no following context.
  784. // But there are line break test failures when trying this. Investigate.
  785. // Issue ICU-20837
  786. uint16_t rule = row->fLookAhead;
  787. U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
  788. U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
  789. if (rule > ACCEPTING_UNCONDITIONAL) {
  790. int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
  791. fLookAheadMatches[rule] = pos;
  792. }
  793. if (state == STOP_STATE) {
  794. // This is the normal exit from the lookup state machine.
  795. // We have advanced through the string until it is certain that no
  796. // longer match is possible, no matter what characters follow.
  797. break;
  798. }
  799. // Advance to the next character.
  800. // If this is a beginning-of-input loop iteration, don't advance
  801. // the input position. The next iteration will be processing the
  802. // first real input character.
  803. if (mode == RBBI_RUN) {
  804. c = UTEXT_NEXT32(&fText);
  805. } else {
  806. if (mode == RBBI_START) {
  807. mode = RBBI_RUN;
  808. }
  809. }
  810. }
  811. // The state machine is done. Check whether it found a match...
  812. // If the iterator failed to advance in the match engine, force it ahead by one.
  813. // (This really indicates a defect in the break rules. They should always match
  814. // at least one character.)
  815. if (result == initialPosition) {
  816. utext_setNativeIndex(&fText, initialPosition);
  817. utext_next32(&fText);
  818. result = (int32_t)utext_getNativeIndex(&fText);
  819. fRuleStatusIndex = 0;
  820. }
  821. // Leave the iterator at our result position.
  822. fPosition = result;
  823. #ifdef RBBI_DEBUG
  824. if (gTrace) {
  825. RBBIDebugPrintf("result = %d\n\n", result);
  826. }
  827. #endif
  828. return result;
  829. }
  830. //-----------------------------------------------------------------------------------
  831. //
  832. // handleSafePrevious()
  833. //
  834. // Iterate backwards using the safe reverse rules.
  835. // The logic of this function is similar to handleNext(), but simpler
  836. // because the safe table does not require as many options.
  837. //
  838. //-----------------------------------------------------------------------------------
  839. template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  840. int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  841. int32_t state;
  842. uint16_t category = 0;
  843. RowType *row;
  844. UChar32 c;
  845. int32_t result = 0;
  846. const RBBIStateTable *stateTable = fData->fReverseTable;
  847. UTEXT_SETNATIVEINDEX(&fText, fromPosition);
  848. #ifdef RBBI_DEBUG
  849. if (gTrace) {
  850. RBBIDebugPuts("Handle Previous pos char state category");
  851. }
  852. #endif
  853. // if we're already at the start of the text, return DONE.
  854. if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) {
  855. return BreakIterator::DONE;
  856. }
  857. // Set the initial state for the state machine
  858. c = UTEXT_PREVIOUS32(&fText);
  859. state = START_STATE;
  860. row = (RowType *)
  861. (stateTable->fTableData + (stateTable->fRowLen * state));
  862. // loop until we reach the start of the text or transition to state 0
  863. //
  864. for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
  865. // look up the current character's character category, which tells us
  866. // which column in the state table to look at.
  867. //
  868. // Off the dictionary flag bit. For reverse iteration it is not used.
  869. category = trieFunc(fData->fTrie, c);
  870. #ifdef RBBI_DEBUG
  871. if (gTrace) {
  872. RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
  873. if (0x20<=c && c<0x7f) {
  874. RBBIDebugPrintf("\"%c\" ", c);
  875. } else {
  876. RBBIDebugPrintf("%5x ", c);
  877. }
  878. RBBIDebugPrintf("%3d %3d\n", state, category);
  879. }
  880. #endif
  881. // State Transition - move machine to its next state
  882. //
  883. // fNextState is a variable-length array.
  884. U_ASSERT(category<fData->fHeader->fCatCount);
  885. state = row->fNextState[category]; /*Not accessing beyond memory*/
  886. row = (RowType *)
  887. (stateTable->fTableData + (stateTable->fRowLen * state));
  888. if (state == STOP_STATE) {
  889. // This is the normal exit from the lookup state machine.
  890. // Transition to state zero means we have found a safe point.
  891. break;
  892. }
  893. }
  894. // The state machine is done. Check whether it found a match...
  895. result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
  896. #ifdef RBBI_DEBUG
  897. if (gTrace) {
  898. RBBIDebugPrintf("result = %d\n\n", result);
  899. }
  900. #endif
  901. return result;
  902. }
  903. //-------------------------------------------------------------------------------
  904. //
  905. // getRuleStatus() Return the break rule tag associated with the current
  906. // iterator position. If the iterator arrived at its current
  907. // position by iterating forwards, the value will have been
  908. // cached by the handleNext() function.
  909. //
  910. //-------------------------------------------------------------------------------
  911. int32_t RuleBasedBreakIterator::getRuleStatus() const {
  912. // fLastRuleStatusIndex indexes to the start of the appropriate status record
  913. // (the number of status values.)
  914. // This function returns the last (largest) of the array of status values.
  915. int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
  916. int32_t tagVal = fData->fRuleStatusTable[idx];
  917. return tagVal;
  918. }
  919. int32_t RuleBasedBreakIterator::getRuleStatusVec(
  920. int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
  921. if (U_FAILURE(status)) {
  922. return 0;
  923. }
  924. int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
  925. int32_t numValsToCopy = numVals;
  926. if (numVals > capacity) {
  927. status = U_BUFFER_OVERFLOW_ERROR;
  928. numValsToCopy = capacity;
  929. }
  930. int i;
  931. for (i=0; i<numValsToCopy; i++) {
  932. fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
  933. }
  934. return numVals;
  935. }
  936. //-------------------------------------------------------------------------------
  937. //
  938. // getBinaryRules Access to the compiled form of the rules,
  939. // for use by build system tools that save the data
  940. // for standard iterator types.
  941. //
  942. //-------------------------------------------------------------------------------
  943. const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
  944. const uint8_t *retPtr = nullptr;
  945. length = 0;
  946. if (fData != nullptr) {
  947. retPtr = (const uint8_t *)fData->fHeader;
  948. length = fData->fHeader->fLength;
  949. }
  950. return retPtr;
  951. }
  952. RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
  953. void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) {
  954. if (U_FAILURE(status)){
  955. return nullptr;
  956. }
  957. if (bufferSize == 0) {
  958. bufferSize = 1; // preflighting for deprecated functionality
  959. return nullptr;
  960. }
  961. BreakIterator *clonedBI = clone();
  962. if (clonedBI == nullptr) {
  963. status = U_MEMORY_ALLOCATION_ERROR;
  964. } else {
  965. status = U_SAFECLONE_ALLOCATED_WARNING;
  966. }
  967. return (RuleBasedBreakIterator *)clonedBI;
  968. }
  969. U_NAMESPACE_END
  970. static icu::UStack *gLanguageBreakFactories = nullptr;
  971. static const icu::UnicodeString *gEmptyString = nullptr;
  972. static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
  973. static icu::UInitOnce gRBBIInitOnce {};
  974. /**
  975. * Release all static memory held by breakiterator.
  976. */
  977. U_CDECL_BEGIN
  978. UBool U_CALLCONV rbbi_cleanup() {
  979. delete gLanguageBreakFactories;
  980. gLanguageBreakFactories = nullptr;
  981. delete gEmptyString;
  982. gEmptyString = nullptr;
  983. gLanguageBreakFactoriesInitOnce.reset();
  984. gRBBIInitOnce.reset();
  985. return true;
  986. }
  987. U_CDECL_END
  988. U_CDECL_BEGIN
  989. static void U_CALLCONV _deleteFactory(void *obj) {
  990. delete (icu::LanguageBreakFactory *) obj;
  991. }
  992. U_CDECL_END
  993. U_NAMESPACE_BEGIN
  994. static void U_CALLCONV rbbiInit() {
  995. gEmptyString = new UnicodeString();
  996. ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
  997. }
  998. static void U_CALLCONV initLanguageFactories() {
  999. UErrorCode status = U_ZERO_ERROR;
  1000. U_ASSERT(gLanguageBreakFactories == nullptr);
  1001. gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
  1002. if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
  1003. ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
  1004. gLanguageBreakFactories->push(builtIn, status);
  1005. #ifdef U_LOCAL_SERVICE_HOOK
  1006. LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
  1007. if (extra != nullptr) {
  1008. gLanguageBreakFactories->push(extra, status);
  1009. }
  1010. #endif
  1011. }
  1012. ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
  1013. }
  1014. static const LanguageBreakEngine*
  1015. getLanguageBreakEngineFromFactory(UChar32 c)
  1016. {
  1017. umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
  1018. if (gLanguageBreakFactories == nullptr) {
  1019. return nullptr;
  1020. }
  1021. int32_t i = gLanguageBreakFactories->size();
  1022. const LanguageBreakEngine *lbe = nullptr;
  1023. while (--i >= 0) {
  1024. LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
  1025. lbe = factory->getEngineFor(c);
  1026. if (lbe != nullptr) {
  1027. break;
  1028. }
  1029. }
  1030. return lbe;
  1031. }
  1032. //-------------------------------------------------------------------------------
  1033. //
  1034. // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
  1035. // the character c.
  1036. //
  1037. //-------------------------------------------------------------------------------
  1038. const LanguageBreakEngine *
  1039. RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
  1040. const LanguageBreakEngine *lbe = nullptr;
  1041. UErrorCode status = U_ZERO_ERROR;
  1042. if (fLanguageBreakEngines == nullptr) {
  1043. fLanguageBreakEngines = new UStack(status);
  1044. if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) {
  1045. delete fLanguageBreakEngines;
  1046. fLanguageBreakEngines = 0;
  1047. return nullptr;
  1048. }
  1049. }
  1050. int32_t i = fLanguageBreakEngines->size();
  1051. while (--i >= 0) {
  1052. lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
  1053. if (lbe->handles(c)) {
  1054. return lbe;
  1055. }
  1056. }
  1057. // No existing dictionary took the character. See if a factory wants to
  1058. // give us a new LanguageBreakEngine for this character.
  1059. lbe = getLanguageBreakEngineFromFactory(c);
  1060. // If we got one, use it and push it on our stack.
  1061. if (lbe != nullptr) {
  1062. fLanguageBreakEngines->push((void *)lbe, status);
  1063. // Even if we can't remember it, we can keep looking it up, so
  1064. // return it even if the push fails.
  1065. return lbe;
  1066. }
  1067. // No engine is forthcoming for this character. Add it to the
  1068. // reject set. Create the reject break engine if needed.
  1069. if (fUnhandledBreakEngine == nullptr) {
  1070. fUnhandledBreakEngine = new UnhandledEngine(status);
  1071. if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) {
  1072. status = U_MEMORY_ALLOCATION_ERROR;
  1073. return nullptr;
  1074. }
  1075. // Put it last so that scripts for which we have an engine get tried
  1076. // first.
  1077. fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
  1078. // If we can't insert it, or creation failed, get rid of it
  1079. U_ASSERT(!fLanguageBreakEngines->hasDeleter());
  1080. if (U_FAILURE(status)) {
  1081. delete fUnhandledBreakEngine;
  1082. fUnhandledBreakEngine = 0;
  1083. return nullptr;
  1084. }
  1085. }
  1086. // Tell the reject engine about the character; at its discretion, it may
  1087. // add more than just the one character.
  1088. fUnhandledBreakEngine->handleCharacter(c);
  1089. return fUnhandledBreakEngine;
  1090. }
  1091. void RuleBasedBreakIterator::dumpCache() {
  1092. fBreakCache->dumpCache();
  1093. }
  1094. void RuleBasedBreakIterator::dumpTables() {
  1095. fData->printData();
  1096. }
  1097. /**
  1098. * Returns the description used to create this iterator
  1099. */
  1100. const UnicodeString&
  1101. RuleBasedBreakIterator::getRules() const {
  1102. if (fData != nullptr) {
  1103. return fData->getRuleSourceString();
  1104. } else {
  1105. umtx_initOnce(gRBBIInitOnce, &rbbiInit);
  1106. return *gEmptyString;
  1107. }
  1108. }
  1109. U_NAMESPACE_END
  1110. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */