rbbi.cpp 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 1999-2016 International Business Machines Corporation
  6. * and others. All rights reserved.
  7. ***************************************************************************
  8. */
  9. //
  10. // file: rbbi.cpp Contains the implementation of the rule based break iterator
  11. // runtime engine and the API implementation for
  12. // class RuleBasedBreakIterator
  13. //
  14. #include "utypeinfo.h" // for 'typeid' to work
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_BREAK_ITERATION
  17. #include <cinttypes>
  18. #include "unicode/rbbi.h"
  19. #include "unicode/schriter.h"
  20. #include "unicode/uchriter.h"
  21. #include "unicode/uclean.h"
  22. #include "unicode/udata.h"
  23. #include "brkeng.h"
  24. #include "ucln_cmn.h"
  25. #include "cmemory.h"
  26. #include "cstring.h"
  27. #include "localsvc.h"
  28. #include "rbbidata.h"
  29. #include "rbbi_cache.h"
  30. #include "rbbirb.h"
  31. #include "uassert.h"
  32. #include "umutex.h"
  33. #include "uvectr32.h"
  34. #ifdef RBBI_DEBUG
  35. static UBool gTrace = false;
  36. #endif
  37. U_NAMESPACE_BEGIN
  38. // The state number of the starting state
  39. constexpr int32_t START_STATE = 1;
  40. // The state-transition value indicating "stop"
  41. constexpr int32_t STOP_STATE = 0;
  42. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
  43. //=======================================================================
  44. // constructors
  45. //=======================================================================
  46. /**
  47. * Constructs a RuleBasedBreakIterator that uses the already-created
  48. * tables object that is passed in as a parameter.
  49. */
  50. RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
  51. : RuleBasedBreakIterator(&status)
  52. {
  53. fData = new RBBIDataWrapper(data, status); // status checked in constructor
  54. if (U_FAILURE(status)) {return;}
  55. if(fData == nullptr) {
  56. status = U_MEMORY_ALLOCATION_ERROR;
  57. return;
  58. }
  59. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  60. fLookAheadMatches = static_cast<int32_t *>(
  61. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  62. if (fLookAheadMatches == nullptr) {
  63. status = U_MEMORY_ALLOCATION_ERROR;
  64. return;
  65. }
  66. }
  67. }
  68. //-------------------------------------------------------------------------------
  69. //
  70. // Constructor from a UDataMemory handle to precompiled break rules
  71. // stored in an ICU data file. This construcotr is private API,
  72. // only for internal use.
  73. //
  74. //-------------------------------------------------------------------------------
  75. RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
  76. UErrorCode &status) : RuleBasedBreakIterator(udm, status)
  77. {
  78. fIsPhraseBreaking = isPhraseBreaking;
  79. }
  80. //
  81. // Construct from precompiled binary rules (tables). This constructor is public API,
  82. // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
  83. //
  84. RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
  85. uint32_t ruleLength,
  86. UErrorCode &status)
  87. : RuleBasedBreakIterator(&status)
  88. {
  89. if (U_FAILURE(status)) {
  90. return;
  91. }
  92. if (compiledRules == nullptr || ruleLength < sizeof(RBBIDataHeader)) {
  93. status = U_ILLEGAL_ARGUMENT_ERROR;
  94. return;
  95. }
  96. const RBBIDataHeader* data = reinterpret_cast<const RBBIDataHeader*>(compiledRules);
  97. if (data->fLength > ruleLength) {
  98. status = U_ILLEGAL_ARGUMENT_ERROR;
  99. return;
  100. }
  101. fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
  102. if (U_FAILURE(status)) {return;}
  103. if(fData == nullptr) {
  104. status = U_MEMORY_ALLOCATION_ERROR;
  105. return;
  106. }
  107. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  108. fLookAheadMatches = static_cast<int32_t *>(
  109. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  110. if (fLookAheadMatches == nullptr) {
  111. status = U_MEMORY_ALLOCATION_ERROR;
  112. return;
  113. }
  114. }
  115. }
  116. //-------------------------------------------------------------------------------
  117. //
  118. // Constructor from a UDataMemory handle to precompiled break rules
  119. // stored in an ICU data file.
  120. //
  121. //-------------------------------------------------------------------------------
  122. RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
  123. : RuleBasedBreakIterator(&status)
  124. {
  125. fData = new RBBIDataWrapper(udm, status); // status checked in constructor
  126. if (U_FAILURE(status)) {return;}
  127. if(fData == nullptr) {
  128. status = U_MEMORY_ALLOCATION_ERROR;
  129. return;
  130. }
  131. if (fData->fForwardTable->fLookAheadResultsSize > 0) {
  132. fLookAheadMatches = static_cast<int32_t *>(
  133. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  134. if (fLookAheadMatches == nullptr) {
  135. status = U_MEMORY_ALLOCATION_ERROR;
  136. return;
  137. }
  138. }
  139. }
  140. //-------------------------------------------------------------------------------
  141. //
  142. // Constructor from a set of rules supplied as a string.
  143. //
  144. //-------------------------------------------------------------------------------
  145. RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
  146. UParseError &parseError,
  147. UErrorCode &status)
  148. : RuleBasedBreakIterator(&status)
  149. {
  150. if (U_FAILURE(status)) {return;}
  151. RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
  152. RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
  153. // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
  154. // creates and returns a complete RBBI. From here, in a constructor, we
  155. // can't just return the object created by the builder factory, hence
  156. // the assignment of the factory created object to "this".
  157. if (U_SUCCESS(status)) {
  158. *this = *bi;
  159. delete bi;
  160. }
  161. }
  162. //-------------------------------------------------------------------------------
  163. //
  164. // Default Constructor. Create an empty shell that can be set up later.
  165. // Used when creating a RuleBasedBreakIterator from a set
  166. // of rules.
  167. //-------------------------------------------------------------------------------
  168. RuleBasedBreakIterator::RuleBasedBreakIterator()
  169. : RuleBasedBreakIterator(nullptr)
  170. {
  171. }
  172. /**
  173. * Simple Constructor with an error code.
  174. * Handles common initialization for all other constructors.
  175. */
  176. RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
  177. UErrorCode ec = U_ZERO_ERROR;
  178. if (status == nullptr) {
  179. status = &ec;
  180. }
  181. utext_openUChars(&fText, nullptr, 0, status);
  182. LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, *status), *status);
  183. LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, *status), *status);
  184. if (U_FAILURE(*status)) {
  185. fErrorCode = *status;
  186. return;
  187. }
  188. fDictionaryCache = lpDictionaryCache.orphan();
  189. fBreakCache = lpBreakCache.orphan();
  190. #ifdef RBBI_DEBUG
  191. static UBool debugInitDone = false;
  192. if (debugInitDone == false) {
  193. char *debugEnv = getenv("U_RBBIDEBUG");
  194. if (debugEnv && uprv_strstr(debugEnv, "trace")) {
  195. gTrace = true;
  196. }
  197. debugInitDone = true;
  198. }
  199. #endif
  200. }
  201. //-------------------------------------------------------------------------------
  202. //
  203. // Copy constructor. Will produce a break iterator with the same behavior,
  204. // and which iterates over the same text, as the one passed in.
  205. //
  206. //-------------------------------------------------------------------------------
  207. RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
  208. : RuleBasedBreakIterator()
  209. {
  210. *this = other;
  211. }
  212. /**
  213. * Destructor
  214. */
  215. RuleBasedBreakIterator::~RuleBasedBreakIterator() {
  216. if (fCharIter != &fSCharIter) {
  217. // fCharIter was adopted from the outside.
  218. delete fCharIter;
  219. }
  220. fCharIter = nullptr;
  221. utext_close(&fText);
  222. if (fData != nullptr) {
  223. fData->removeReference();
  224. fData = nullptr;
  225. }
  226. delete fBreakCache;
  227. fBreakCache = nullptr;
  228. delete fDictionaryCache;
  229. fDictionaryCache = nullptr;
  230. delete fLanguageBreakEngines;
  231. fLanguageBreakEngines = nullptr;
  232. delete fUnhandledBreakEngine;
  233. fUnhandledBreakEngine = nullptr;
  234. uprv_free(fLookAheadMatches);
  235. fLookAheadMatches = nullptr;
  236. }
  237. /**
  238. * Assignment operator. Sets this iterator to have the same behavior,
  239. * and iterate over the same text, as the one passed in.
  240. * TODO: needs better handling of memory allocation errors.
  241. */
  242. RuleBasedBreakIterator&
  243. RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
  244. if (this == &that) {
  245. return *this;
  246. }
  247. BreakIterator::operator=(that);
  248. if (fLanguageBreakEngines != nullptr) {
  249. delete fLanguageBreakEngines;
  250. fLanguageBreakEngines = nullptr; // Just rebuild for now
  251. }
  252. // TODO: clone fLanguageBreakEngines from "that"
  253. UErrorCode status = U_ZERO_ERROR;
  254. utext_clone(&fText, &that.fText, false, true, &status);
  255. if (fCharIter != &fSCharIter) {
  256. delete fCharIter;
  257. }
  258. fCharIter = &fSCharIter;
  259. if (that.fCharIter != nullptr && that.fCharIter != &that.fSCharIter) {
  260. // This is a little bit tricky - it will initially appear that
  261. // this->fCharIter is adopted, even if that->fCharIter was
  262. // not adopted. That's ok.
  263. fCharIter = that.fCharIter->clone();
  264. }
  265. fSCharIter = that.fSCharIter;
  266. if (fCharIter == nullptr) {
  267. fCharIter = &fSCharIter;
  268. }
  269. if (fData != nullptr) {
  270. fData->removeReference();
  271. fData = nullptr;
  272. }
  273. if (that.fData != nullptr) {
  274. fData = that.fData->addReference();
  275. }
  276. uprv_free(fLookAheadMatches);
  277. fLookAheadMatches = nullptr;
  278. if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) {
  279. fLookAheadMatches = static_cast<int32_t *>(
  280. uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t)));
  281. }
  282. fPosition = that.fPosition;
  283. fRuleStatusIndex = that.fRuleStatusIndex;
  284. fDone = that.fDone;
  285. // TODO: both the dictionary and the main cache need to be copied.
  286. // Current position could be within a dictionary range. Trying to continue
  287. // the iteration without the caches present would go to the rules, with
  288. // the assumption that the current position is on a rule boundary.
  289. fBreakCache->reset(fPosition, fRuleStatusIndex);
  290. fDictionaryCache->reset();
  291. return *this;
  292. }
  293. //-----------------------------------------------------------------------------
  294. //
  295. // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
  296. // behavior, and iterating over the same text, as this one.
  297. // Virtual function: does the right thing with subclasses.
  298. //
  299. //-----------------------------------------------------------------------------
  300. RuleBasedBreakIterator*
  301. RuleBasedBreakIterator::clone() const {
  302. return new RuleBasedBreakIterator(*this);
  303. }
  304. /**
  305. * Equality operator. Returns true if both BreakIterators are of the
  306. * same class, have the same behavior, and iterate over the same text.
  307. */
  308. bool
  309. RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
  310. if (typeid(*this) != typeid(that)) {
  311. return false;
  312. }
  313. if (this == &that) {
  314. return true;
  315. }
  316. // The base class BreakIterator carries no state that participates in equality,
  317. // and does not implement an equality function that would otherwise be
  318. // checked at this point.
  319. const RuleBasedBreakIterator& that2 = static_cast<const RuleBasedBreakIterator&>(that);
  320. if (!utext_equals(&fText, &that2.fText)) {
  321. // The two break iterators are operating on different text,
  322. // or have a different iteration position.
  323. // Note that fText's position is always the same as the break iterator's position.
  324. return false;
  325. }
  326. if (!(fPosition == that2.fPosition &&
  327. fRuleStatusIndex == that2.fRuleStatusIndex &&
  328. fDone == that2.fDone)) {
  329. return false;
  330. }
  331. if (that2.fData == fData ||
  332. (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
  333. // The two break iterators are using the same rules.
  334. return true;
  335. }
  336. return false;
  337. }
  338. /**
  339. * Compute a hash code for this BreakIterator
  340. * @return A hash code
  341. */
  342. int32_t
  343. RuleBasedBreakIterator::hashCode() const {
  344. int32_t hash = 0;
  345. if (fData != nullptr) {
  346. hash = fData->hashCode();
  347. }
  348. return hash;
  349. }
  350. void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
  351. if (U_FAILURE(status)) {
  352. return;
  353. }
  354. fBreakCache->reset();
  355. fDictionaryCache->reset();
  356. utext_clone(&fText, ut, false, true, &status);
  357. // Set up a dummy CharacterIterator to be returned if anyone
  358. // calls getText(). With input from UText, there is no reasonable
  359. // way to return a characterIterator over the actual input text.
  360. // Return one over an empty string instead - this is the closest
  361. // we can come to signaling a failure.
  362. // (GetText() is obsolete, this failure is sort of OK)
  363. fSCharIter.setText(u"", 0);
  364. if (fCharIter != &fSCharIter) {
  365. // existing fCharIter was adopted from the outside. Delete it now.
  366. delete fCharIter;
  367. }
  368. fCharIter = &fSCharIter;
  369. this->first();
  370. }
  371. UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
  372. UText *result = utext_clone(fillIn, &fText, false, true, &status);
  373. return result;
  374. }
  375. //=======================================================================
  376. // BreakIterator overrides
  377. //=======================================================================
  378. /**
  379. * Return a CharacterIterator over the text being analyzed.
  380. */
  381. CharacterIterator&
  382. RuleBasedBreakIterator::getText() const {
  383. return *fCharIter;
  384. }
  385. /**
  386. * Set the iterator to analyze a new piece of text. This function resets
  387. * the current iteration position to the beginning of the text.
  388. * @param newText An iterator over the text to analyze.
  389. */
  390. void
  391. RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
  392. // If we are holding a CharacterIterator adopted from a
  393. // previous call to this function, delete it now.
  394. if (fCharIter != &fSCharIter) {
  395. delete fCharIter;
  396. }
  397. fCharIter = newText;
  398. UErrorCode status = U_ZERO_ERROR;
  399. fBreakCache->reset();
  400. fDictionaryCache->reset();
  401. if (newText==nullptr || newText->startIndex() != 0) {
  402. // startIndex !=0 wants to be an error, but there's no way to report it.
  403. // Make the iterator text be an empty string.
  404. utext_openUChars(&fText, nullptr, 0, &status);
  405. } else {
  406. utext_openCharacterIterator(&fText, newText, &status);
  407. }
  408. this->first();
  409. }
  410. /**
  411. * Set the iterator to analyze a new piece of text. This function resets
  412. * the current iteration position to the beginning of the text.
  413. * @param newText An iterator over the text to analyze.
  414. */
  415. void
  416. RuleBasedBreakIterator::setText(const UnicodeString& newText) {
  417. UErrorCode status = U_ZERO_ERROR;
  418. fBreakCache->reset();
  419. fDictionaryCache->reset();
  420. utext_openConstUnicodeString(&fText, &newText, &status);
  421. // Set up a character iterator on the string.
  422. // Needed in case someone calls getText().
  423. // Can not, unfortunately, do this lazily on the (probably never)
  424. // call to getText(), because getText is const.
  425. fSCharIter.setText(newText.getBuffer(), newText.length());
  426. if (fCharIter != &fSCharIter) {
  427. // old fCharIter was adopted from the outside. Delete it.
  428. delete fCharIter;
  429. }
  430. fCharIter = &fSCharIter;
  431. this->first();
  432. }
  433. /**
  434. * Provide a new UText for the input text. Must reference text with contents identical
  435. * to the original.
  436. * Intended for use with text data originating in Java (garbage collected) environments
  437. * where the data may be moved in memory at arbitrary times.
  438. */
  439. RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
  440. if (U_FAILURE(status)) {
  441. return *this;
  442. }
  443. if (input == nullptr) {
  444. status = U_ILLEGAL_ARGUMENT_ERROR;
  445. return *this;
  446. }
  447. int64_t pos = utext_getNativeIndex(&fText);
  448. // Shallow read-only clone of the new UText into the existing input UText
  449. utext_clone(&fText, input, false, true, &status);
  450. if (U_FAILURE(status)) {
  451. return *this;
  452. }
  453. utext_setNativeIndex(&fText, pos);
  454. if (utext_getNativeIndex(&fText) != pos) {
  455. // Sanity check. The new input utext is supposed to have the exact same
  456. // contents as the old. If we can't set to the same position, it doesn't.
  457. // The contents underlying the old utext might be invalid at this point,
  458. // so it's not safe to check directly.
  459. status = U_ILLEGAL_ARGUMENT_ERROR;
  460. }
  461. return *this;
  462. }
  463. /**
  464. * Sets the current iteration position to the beginning of the text, position zero.
  465. * @return The new iterator position, which is zero.
  466. */
  467. int32_t RuleBasedBreakIterator::first() {
  468. UErrorCode status = U_ZERO_ERROR;
  469. if (!fBreakCache->seek(0)) {
  470. fBreakCache->populateNear(0, status);
  471. }
  472. fBreakCache->current();
  473. U_ASSERT(fPosition == 0);
  474. return 0;
  475. }
  476. /**
  477. * Sets the current iteration position to the end of the text.
  478. * @return The text's past-the-end offset.
  479. */
  480. int32_t RuleBasedBreakIterator::last() {
  481. int32_t endPos = static_cast<int32_t>(utext_nativeLength(&fText));
  482. UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
  483. (void)endShouldBeBoundary;
  484. U_ASSERT(endShouldBeBoundary);
  485. U_ASSERT(fPosition == endPos);
  486. return endPos;
  487. }
  488. /**
  489. * Advances the iterator either forward or backward the specified number of steps.
  490. * Negative values move backward, and positive values move forward. This is
  491. * equivalent to repeatedly calling next() or previous().
  492. * @param n The number of steps to move. The sign indicates the direction
  493. * (negative is backwards, and positive is forwards).
  494. * @return The character offset of the boundary position n boundaries away from
  495. * the current one.
  496. */
  497. int32_t RuleBasedBreakIterator::next(int32_t n) {
  498. int32_t result = 0;
  499. if (n > 0) {
  500. for (; n > 0 && result != UBRK_DONE; --n) {
  501. result = next();
  502. }
  503. } else if (n < 0) {
  504. for (; n < 0 && result != UBRK_DONE; ++n) {
  505. result = previous();
  506. }
  507. } else {
  508. result = current();
  509. }
  510. return result;
  511. }
  512. /**
  513. * Advances the iterator to the next boundary position.
  514. * @return The position of the first boundary after this one.
  515. */
  516. int32_t RuleBasedBreakIterator::next() {
  517. fBreakCache->next();
  518. return fDone ? UBRK_DONE : fPosition;
  519. }
  520. /**
  521. * Move the iterator backwards, to the boundary preceding the current one.
  522. *
  523. * Starts from the current position within fText.
  524. * Starting position need not be on a boundary.
  525. *
  526. * @return The position of the boundary position immediately preceding the starting position.
  527. */
  528. int32_t RuleBasedBreakIterator::previous() {
  529. UErrorCode status = U_ZERO_ERROR;
  530. fBreakCache->previous(status);
  531. return fDone ? UBRK_DONE : fPosition;
  532. }
  533. /**
  534. * Sets the iterator to refer to the first boundary position following
  535. * the specified position.
  536. * @param startPos The position from which to begin searching for a break position.
  537. * @return The position of the first break after the current position.
  538. */
  539. int32_t RuleBasedBreakIterator::following(int32_t startPos) {
  540. // if the supplied position is before the beginning, return the
  541. // text's starting offset
  542. if (startPos < 0) {
  543. return first();
  544. }
  545. // Move requested offset to a code point start. It might be on a trail surrogate,
  546. // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
  547. utext_setNativeIndex(&fText, startPos);
  548. startPos = static_cast<int32_t>(utext_getNativeIndex(&fText));
  549. UErrorCode status = U_ZERO_ERROR;
  550. fBreakCache->following(startPos, status);
  551. return fDone ? UBRK_DONE : fPosition;
  552. }
  553. /**
  554. * Sets the iterator to refer to the last boundary position before the
  555. * specified position.
  556. * @param offset The position to begin searching for a break from.
  557. * @return The position of the last boundary before the starting position.
  558. */
  559. int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
  560. if (offset > utext_nativeLength(&fText)) {
  561. return last();
  562. }
  563. // Move requested offset to a code point start. It might be on a trail surrogate,
  564. // or on a trail byte if the input is UTF-8.
  565. utext_setNativeIndex(&fText, offset);
  566. int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
  567. UErrorCode status = U_ZERO_ERROR;
  568. fBreakCache->preceding(adjustedOffset, status);
  569. return fDone ? UBRK_DONE : fPosition;
  570. }
  571. /**
  572. * Returns true if the specified position is a boundary position. As a side
  573. * effect, leaves the iterator pointing to the first boundary position at
  574. * or after "offset".
  575. *
  576. * @param offset the offset to check.
  577. * @return True if "offset" is a boundary position.
  578. */
  579. UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
  580. // out-of-range indexes are never boundary positions
  581. if (offset < 0) {
  582. first(); // For side effects on current position, tag values.
  583. return false;
  584. }
  585. // Adjust offset to be on a code point boundary and not beyond the end of the text.
  586. // Note that isBoundary() is always false for offsets that are not on code point boundaries.
  587. // But we still need the side effect of leaving iteration at the following boundary.
  588. utext_setNativeIndex(&fText, offset);
  589. int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
  590. bool result = false;
  591. UErrorCode status = U_ZERO_ERROR;
  592. if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
  593. result = (fBreakCache->current() == offset);
  594. }
  595. if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
  596. // Original offset is beyond the end of the text. Return false, it's not a boundary,
  597. // but the iteration position remains set to the end of the text, which is a boundary.
  598. return false;
  599. }
  600. if (!result) {
  601. // Not on a boundary. isBoundary() must leave iterator on the following boundary.
  602. // Cache->seek(), above, left us on the preceding boundary, so advance one.
  603. next();
  604. }
  605. return result;
  606. }
  607. /**
  608. * Returns the current iteration position.
  609. * @return The current iteration position.
  610. */
  611. int32_t RuleBasedBreakIterator::current() const {
  612. return fPosition;
  613. }
  614. //=======================================================================
  615. // implementation
  616. //=======================================================================
  617. //
  618. // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
  619. // of user text. A variable with this enum type keeps track of where we
  620. // are. The state machine only fetches user input while in the RUN mode.
  621. //
  622. enum RBBIRunMode {
  623. RBBI_START, // state machine processing is before first char of input
  624. RBBI_RUN, // state machine processing is in the user text
  625. RBBI_END // state machine processing is after end of user text.
  626. };
  627. // Wrapper functions to select the appropriate handleNext() or handleSafePrevious()
  628. // instantiation, based on whether an 8 or 16 bit table is required.
  629. //
  630. // These Trie access functions will be inlined within the handleNext()/Previous() instantiations.
  631. static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) {
  632. return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c);
  633. }
  634. static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) {
  635. return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c);
  636. }
  637. int32_t RuleBasedBreakIterator::handleNext() {
  638. const RBBIStateTable *statetable = fData->fForwardTable;
  639. bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
  640. if (statetable->fFlags & RBBI_8BITS_ROWS) {
  641. if (use8BitsTrie) {
  642. return handleNext<RBBIStateTableRow8, TrieFunc8>();
  643. } else {
  644. return handleNext<RBBIStateTableRow8, TrieFunc16>();
  645. }
  646. } else {
  647. if (use8BitsTrie) {
  648. return handleNext<RBBIStateTableRow16, TrieFunc8>();
  649. } else {
  650. return handleNext<RBBIStateTableRow16, TrieFunc16>();
  651. }
  652. }
  653. }
  654. int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  655. const RBBIStateTable *statetable = fData->fReverseTable;
  656. bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
  657. if (statetable->fFlags & RBBI_8BITS_ROWS) {
  658. if (use8BitsTrie) {
  659. return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
  660. } else {
  661. return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
  662. }
  663. } else {
  664. if (use8BitsTrie) {
  665. return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
  666. } else {
  667. return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
  668. }
  669. }
  670. }
  671. //-----------------------------------------------------------------------------------
  672. //
  673. // handleNext()
  674. // Run the state machine to find a boundary
  675. //
  676. //-----------------------------------------------------------------------------------
  677. template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  678. int32_t RuleBasedBreakIterator::handleNext() {
  679. int32_t state;
  680. uint16_t category = 0;
  681. RBBIRunMode mode;
  682. RowType *row;
  683. UChar32 c;
  684. int32_t result = 0;
  685. int32_t initialPosition = 0;
  686. const RBBIStateTable *statetable = fData->fForwardTable;
  687. const char *tableData = statetable->fTableData;
  688. uint32_t tableRowLen = statetable->fRowLen;
  689. uint32_t dictStart = statetable->fDictCategoriesStart;
  690. #ifdef RBBI_DEBUG
  691. if (gTrace) {
  692. RBBIDebugPuts("Handle Next pos char state category");
  693. }
  694. #endif
  695. // handleNext always sets the break tag value.
  696. // Set the default for it.
  697. fRuleStatusIndex = 0;
  698. fDictionaryCharCount = 0;
  699. // if we're already at the end of the text, return DONE.
  700. initialPosition = fPosition;
  701. UTEXT_SETNATIVEINDEX(&fText, initialPosition);
  702. result = initialPosition;
  703. c = UTEXT_NEXT32(&fText);
  704. if (c==U_SENTINEL) {
  705. fDone = true;
  706. return UBRK_DONE;
  707. }
  708. // Set the initial state for the state machine
  709. state = START_STATE;
  710. row = (RowType *)
  711. //(statetable->fTableData + (statetable->fRowLen * state));
  712. (tableData + tableRowLen * state);
  713. mode = RBBI_RUN;
  714. if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  715. category = 2;
  716. mode = RBBI_START;
  717. }
  718. // loop until we reach the end of the text or transition to state 0
  719. //
  720. for (;;) {
  721. if (c == U_SENTINEL) {
  722. // Reached end of input string.
  723. if (mode == RBBI_END) {
  724. // We have already run the loop one last time with the
  725. // character set to the psueudo {eof} value. Now it is time
  726. // to unconditionally bail out.
  727. break;
  728. }
  729. // Run the loop one last time with the fake end-of-input character category.
  730. mode = RBBI_END;
  731. category = 1;
  732. }
  733. //
  734. // Get the char category. An incoming category of 1 or 2 means that
  735. // we are preset for doing the beginning or end of input, and
  736. // that we shouldn't get a category from an actual text input character.
  737. //
  738. if (mode == RBBI_RUN) {
  739. // look up the current character's character category, which tells us
  740. // which column in the state table to look at.
  741. category = trieFunc(fData->fTrie, c);
  742. fDictionaryCharCount += (category >= dictStart);
  743. }
  744. #ifdef RBBI_DEBUG
  745. if (gTrace) {
  746. RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
  747. if (0x20<=c && c<0x7f) {
  748. RBBIDebugPrintf("\"%c\" ", c);
  749. } else {
  750. RBBIDebugPrintf("%5x ", c);
  751. }
  752. RBBIDebugPrintf("%3d %3d\n", state, category);
  753. }
  754. #endif
  755. // State Transition - move machine to its next state
  756. //
  757. // fNextState is a variable-length array.
  758. U_ASSERT(category<fData->fHeader->fCatCount);
  759. state = row->fNextState[category]; /*Not accessing beyond memory*/
  760. row = (RowType *)
  761. // (statetable->fTableData + (statetable->fRowLen * state));
  762. (tableData + tableRowLen * state);
  763. uint16_t accepting = row->fAccepting;
  764. if (accepting == ACCEPTING_UNCONDITIONAL) {
  765. // Match found, common case.
  766. if (mode != RBBI_START) {
  767. result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
  768. }
  769. fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values.
  770. } else if (accepting > ACCEPTING_UNCONDITIONAL) {
  771. // Lookahead match is completed.
  772. U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize);
  773. int32_t lookaheadResult = fLookAheadMatches[accepting];
  774. if (lookaheadResult >= 0) {
  775. fRuleStatusIndex = row->fTagsIdx;
  776. fPosition = lookaheadResult;
  777. return lookaheadResult;
  778. }
  779. }
  780. // If we are at the position of the '/' in a look-ahead (hard break) rule;
  781. // record the current position, to be returned later, if the full rule matches.
  782. // TODO: Move this check before the previous check of fAccepting.
  783. // This would enable hard-break rules with no following context.
  784. // But there are line break test failures when trying this. Investigate.
  785. // Issue ICU-20837
  786. uint16_t rule = row->fLookAhead;
  787. U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL);
  788. U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize);
  789. if (rule > ACCEPTING_UNCONDITIONAL) {
  790. int32_t pos = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
  791. fLookAheadMatches[rule] = pos;
  792. }
  793. if (state == STOP_STATE) {
  794. // This is the normal exit from the lookup state machine.
  795. // We have advanced through the string until it is certain that no
  796. // longer match is possible, no matter what characters follow.
  797. break;
  798. }
  799. // Advance to the next character.
  800. // If this is a beginning-of-input loop iteration, don't advance
  801. // the input position. The next iteration will be processing the
  802. // first real input character.
  803. if (mode == RBBI_RUN) {
  804. c = UTEXT_NEXT32(&fText);
  805. } else {
  806. if (mode == RBBI_START) {
  807. mode = RBBI_RUN;
  808. }
  809. }
  810. }
  811. // The state machine is done. Check whether it found a match...
  812. // If the iterator failed to advance in the match engine, force it ahead by one.
  813. // (This really indicates a defect in the break rules. They should always match
  814. // at least one character.)
  815. if (result == initialPosition) {
  816. utext_setNativeIndex(&fText, initialPosition);
  817. utext_next32(&fText);
  818. result = static_cast<int32_t>(utext_getNativeIndex(&fText));
  819. fRuleStatusIndex = 0;
  820. }
  821. // Leave the iterator at our result position.
  822. fPosition = result;
  823. #ifdef RBBI_DEBUG
  824. if (gTrace) {
  825. RBBIDebugPrintf("result = %d\n\n", result);
  826. }
  827. #endif
  828. return result;
  829. }
  830. //-----------------------------------------------------------------------------------
  831. //
  832. // handleSafePrevious()
  833. //
  834. // Iterate backwards using the safe reverse rules.
  835. // The logic of this function is similar to handleNext(), but simpler
  836. // because the safe table does not require as many options.
  837. //
  838. //-----------------------------------------------------------------------------------
  839. template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
  840. int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
  841. int32_t state;
  842. uint16_t category = 0;
  843. RowType *row;
  844. UChar32 c;
  845. int32_t result = 0;
  846. const RBBIStateTable *stateTable = fData->fReverseTable;
  847. UTEXT_SETNATIVEINDEX(&fText, fromPosition);
  848. #ifdef RBBI_DEBUG
  849. if (gTrace) {
  850. RBBIDebugPuts("Handle Previous pos char state category");
  851. }
  852. #endif
  853. // if we're already at the start of the text, return DONE.
  854. if (fData == nullptr || UTEXT_GETNATIVEINDEX(&fText)==0) {
  855. return BreakIterator::DONE;
  856. }
  857. // Set the initial state for the state machine
  858. c = UTEXT_PREVIOUS32(&fText);
  859. state = START_STATE;
  860. row = (RowType *)
  861. (stateTable->fTableData + (stateTable->fRowLen * state));
  862. // loop until we reach the start of the text or transition to state 0
  863. //
  864. for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
  865. // look up the current character's character category, which tells us
  866. // which column in the state table to look at.
  867. //
  868. // Off the dictionary flag bit. For reverse iteration it is not used.
  869. category = trieFunc(fData->fTrie, c);
  870. #ifdef RBBI_DEBUG
  871. if (gTrace) {
  872. RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
  873. if (0x20<=c && c<0x7f) {
  874. RBBIDebugPrintf("\"%c\" ", c);
  875. } else {
  876. RBBIDebugPrintf("%5x ", c);
  877. }
  878. RBBIDebugPrintf("%3d %3d\n", state, category);
  879. }
  880. #endif
  881. // State Transition - move machine to its next state
  882. //
  883. // fNextState is a variable-length array.
  884. U_ASSERT(category<fData->fHeader->fCatCount);
  885. state = row->fNextState[category]; /*Not accessing beyond memory*/
  886. row = (RowType *)
  887. (stateTable->fTableData + (stateTable->fRowLen * state));
  888. if (state == STOP_STATE) {
  889. // This is the normal exit from the lookup state machine.
  890. // Transition to state zero means we have found a safe point.
  891. break;
  892. }
  893. }
  894. // The state machine is done. Check whether it found a match...
  895. result = static_cast<int32_t>(UTEXT_GETNATIVEINDEX(&fText));
  896. #ifdef RBBI_DEBUG
  897. if (gTrace) {
  898. RBBIDebugPrintf("result = %d\n\n", result);
  899. }
  900. #endif
  901. return result;
  902. }
  903. //-------------------------------------------------------------------------------
  904. //
  905. // getRuleStatus() Return the break rule tag associated with the current
  906. // iterator position. If the iterator arrived at its current
  907. // position by iterating forwards, the value will have been
  908. // cached by the handleNext() function.
  909. //
  910. //-------------------------------------------------------------------------------
  911. int32_t RuleBasedBreakIterator::getRuleStatus() const {
  912. // fLastRuleStatusIndex indexes to the start of the appropriate status record
  913. // (the number of status values.)
  914. // This function returns the last (largest) of the array of status values.
  915. int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
  916. int32_t tagVal = fData->fRuleStatusTable[idx];
  917. return tagVal;
  918. }
  919. int32_t RuleBasedBreakIterator::getRuleStatusVec(
  920. int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
  921. if (U_FAILURE(status)) {
  922. return 0;
  923. }
  924. int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
  925. int32_t numValsToCopy = numVals;
  926. if (numVals > capacity) {
  927. status = U_BUFFER_OVERFLOW_ERROR;
  928. numValsToCopy = capacity;
  929. }
  930. int i;
  931. for (i=0; i<numValsToCopy; i++) {
  932. fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
  933. }
  934. return numVals;
  935. }
  936. //-------------------------------------------------------------------------------
  937. //
  938. // getBinaryRules Access to the compiled form of the rules,
  939. // for use by build system tools that save the data
  940. // for standard iterator types.
  941. //
  942. //-------------------------------------------------------------------------------
  943. const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
  944. const uint8_t *retPtr = nullptr;
  945. length = 0;
  946. if (fData != nullptr) {
  947. retPtr = reinterpret_cast<const uint8_t*>(fData->fHeader);
  948. length = fData->fHeader->fLength;
  949. }
  950. return retPtr;
  951. }
  952. RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone(
  953. void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) {
  954. if (U_FAILURE(status)){
  955. return nullptr;
  956. }
  957. if (bufferSize == 0) {
  958. bufferSize = 1; // preflighting for deprecated functionality
  959. return nullptr;
  960. }
  961. BreakIterator *clonedBI = clone();
  962. if (clonedBI == nullptr) {
  963. status = U_MEMORY_ALLOCATION_ERROR;
  964. } else {
  965. status = U_SAFECLONE_ALLOCATED_WARNING;
  966. }
  967. return (RuleBasedBreakIterator *)clonedBI;
  968. }
  969. U_NAMESPACE_END
  970. static icu::UStack *gLanguageBreakFactories = nullptr;
  971. static const icu::UnicodeString *gEmptyString = nullptr;
  972. static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
  973. static icu::UInitOnce gRBBIInitOnce {};
  974. static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
  975. /**
  976. * Release all static memory held by breakiterator.
  977. */
  978. U_CDECL_BEGIN
  979. UBool U_CALLCONV rbbi_cleanup() {
  980. delete gLanguageBreakFactories;
  981. gLanguageBreakFactories = nullptr;
  982. delete gEmptyString;
  983. gEmptyString = nullptr;
  984. gLanguageBreakFactoriesInitOnce.reset();
  985. gRBBIInitOnce.reset();
  986. return true;
  987. }
  988. U_CDECL_END
  989. U_CDECL_BEGIN
  990. static void U_CALLCONV _deleteFactory(void *obj) {
  991. delete (icu::LanguageBreakFactory *) obj;
  992. }
  993. U_CDECL_END
  994. U_NAMESPACE_BEGIN
  995. static void U_CALLCONV rbbiInit() {
  996. gEmptyString = new UnicodeString();
  997. ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
  998. }
  999. static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
  1000. U_ASSERT(gLanguageBreakFactories == nullptr);
  1001. gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
  1002. if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
  1003. LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
  1004. if (U_SUCCESS(status)) {
  1005. gICULanguageBreakFactory = factory.orphan();
  1006. gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
  1007. #ifdef U_LOCAL_SERVICE_HOOK
  1008. LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
  1009. if (extra != nullptr) {
  1010. gLanguageBreakFactories->push(extra, status);
  1011. }
  1012. #endif
  1013. }
  1014. }
  1015. ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
  1016. }
  1017. void ensureLanguageFactories(UErrorCode& status) {
  1018. umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
  1019. }
  1020. static const LanguageBreakEngine*
  1021. getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
  1022. {
  1023. UErrorCode status = U_ZERO_ERROR;
  1024. ensureLanguageFactories(status);
  1025. if (U_FAILURE(status)) return nullptr;
  1026. int32_t i = gLanguageBreakFactories->size();
  1027. const LanguageBreakEngine *lbe = nullptr;
  1028. while (--i >= 0) {
  1029. LanguageBreakFactory* factory = static_cast<LanguageBreakFactory*>(gLanguageBreakFactories->elementAt(i));
  1030. lbe = factory->getEngineFor(c, locale);
  1031. if (lbe != nullptr) {
  1032. break;
  1033. }
  1034. }
  1035. return lbe;
  1036. }
  1037. //-------------------------------------------------------------------------------
  1038. //
  1039. // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
  1040. // the character c.
  1041. //
  1042. //-------------------------------------------------------------------------------
  1043. const LanguageBreakEngine *
  1044. RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
  1045. const LanguageBreakEngine *lbe = nullptr;
  1046. UErrorCode status = U_ZERO_ERROR;
  1047. if (fLanguageBreakEngines == nullptr) {
  1048. fLanguageBreakEngines = new UStack(status);
  1049. if (fLanguageBreakEngines == nullptr || U_FAILURE(status)) {
  1050. delete fLanguageBreakEngines;
  1051. fLanguageBreakEngines = nullptr;
  1052. return nullptr;
  1053. }
  1054. }
  1055. int32_t i = fLanguageBreakEngines->size();
  1056. while (--i >= 0) {
  1057. lbe = static_cast<const LanguageBreakEngine*>(fLanguageBreakEngines->elementAt(i));
  1058. if (lbe->handles(c, locale)) {
  1059. return lbe;
  1060. }
  1061. }
  1062. // No existing dictionary took the character. See if a factory wants to
  1063. // give us a new LanguageBreakEngine for this character.
  1064. lbe = getLanguageBreakEngineFromFactory(c, locale);
  1065. // If we got one, use it and push it on our stack.
  1066. if (lbe != nullptr) {
  1067. fLanguageBreakEngines->push((void *)lbe, status);
  1068. // Even if we can't remember it, we can keep looking it up, so
  1069. // return it even if the push fails.
  1070. return lbe;
  1071. }
  1072. // No engine is forthcoming for this character. Add it to the
  1073. // reject set. Create the reject break engine if needed.
  1074. if (fUnhandledBreakEngine == nullptr) {
  1075. fUnhandledBreakEngine = new UnhandledEngine(status);
  1076. if (U_SUCCESS(status) && fUnhandledBreakEngine == nullptr) {
  1077. status = U_MEMORY_ALLOCATION_ERROR;
  1078. return nullptr;
  1079. }
  1080. // Put it last so that scripts for which we have an engine get tried
  1081. // first.
  1082. fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
  1083. // If we can't insert it, or creation failed, get rid of it
  1084. U_ASSERT(!fLanguageBreakEngines->hasDeleter());
  1085. if (U_FAILURE(status)) {
  1086. delete fUnhandledBreakEngine;
  1087. fUnhandledBreakEngine = nullptr;
  1088. return nullptr;
  1089. }
  1090. }
  1091. // Tell the reject engine about the character; at its discretion, it may
  1092. // add more than just the one character.
  1093. fUnhandledBreakEngine->handleCharacter(c);
  1094. return fUnhandledBreakEngine;
  1095. }
  1096. #ifndef U_HIDE_DRAFT_API
  1097. void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
  1098. ExternalBreakEngine* toAdopt, UErrorCode& status) {
  1099. LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
  1100. if (U_FAILURE(status)) return;
  1101. ensureLanguageFactories(status);
  1102. if (U_FAILURE(status)) return;
  1103. gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
  1104. }
  1105. #endif /* U_HIDE_DRAFT_API */
  1106. void RuleBasedBreakIterator::dumpCache() {
  1107. fBreakCache->dumpCache();
  1108. }
  1109. void RuleBasedBreakIterator::dumpTables() {
  1110. fData->printData();
  1111. }
  1112. /**
  1113. * Returns the description used to create this iterator
  1114. */
  1115. const UnicodeString&
  1116. RuleBasedBreakIterator::getRules() const {
  1117. if (fData != nullptr) {
  1118. return fData->getRuleSourceString();
  1119. } else {
  1120. umtx_initOnce(gRBBIInitOnce, &rbbiInit);
  1121. return *gEmptyString;
  1122. }
  1123. }
  1124. U_NAMESPACE_END
  1125. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */