// rbbi.h
  // © 2016 and later: Unicode, Inc. and others.
  // License & terms of use: http://www.unicode.org/copyright.html
  /*
  ***************************************************************************
  *   Copyright (C) 1999-2016 International Business Machines Corporation   *
  *   and others. All rights reserved.                                      *
  ***************************************************************************
  **********************************************************************
  *   Date        Name        Description
  *   10/22/99    alan        Creation.
  *   11/11/99    rgillam     Complete port from Java.
  **********************************************************************
  */
  14. #ifndef RBBI_H
  15. #define RBBI_H
  16. #include "unicode/utypes.h"
  17. #if U_SHOW_CPLUSPLUS_API
  18. /**
  19. * \file
  20. * \brief C++ API: Rule Based Break Iterator
  21. */
  22. #if !UCONFIG_NO_BREAK_ITERATION
  23. #include "unicode/brkiter.h"
  24. #include "unicode/udata.h"
  25. #include "unicode/parseerr.h"
  26. #include "unicode/schriter.h"
  27. struct UCPTrie;
  28. U_NAMESPACE_BEGIN
  29. /** @internal */
  30. class LanguageBreakEngine;
  31. struct RBBIDataHeader;
  32. class RBBIDataWrapper;
  33. class UnhandledEngine;
  34. class UStack;
  35. #ifndef U_HIDE_INTERNAL_API
  36. /**
  37. * The ExternalBreakEngine class define an abstract interface for the host environment
  38. * to provide a low level facility to break text for unicode text in script that the text boundary
  39. * cannot be handled by upper level rule based logic, for example, for Chinese and Japanese
  40. * word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
  41. * The host environment implement one or more subclass of ExternalBreakEngine and
  42. * register them in the initialization time by calling
  43. * RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopt and own the engine and will
  44. * delete the registered external engine in proper time during the clean up
  45. * event.
  46. * @internal ICU 74 technology preview
  47. */
  48. class ExternalBreakEngine : public UObject {
  49. public:
  50. /**
  51. * destructor
  52. * @internal ICU 74 technology preview
  53. */
  54. virtual ~ExternalBreakEngine() {}
  55. /**
  56. * <p>Indicate whether this engine handles a particular character when
  57. * the RuleBasedBreakIterator is used for a particular locale. This method is used
  58. * by the RuleBasedBreakIterator to find a break engine.</p>
  59. * @param c A character which begins a run that the engine might handle.
  60. * @param locale The locale.
  61. * @return true if this engine handles the particular character for that locale.
  62. * @internal ICU 74 technology preview
  63. */
  64. virtual bool isFor(UChar32 c, const char* locale) const = 0;
  65. /**
  66. * <p>Indicate whether this engine handles a particular character.This method is
  67. * used by the RuleBasedBreakIterator after it already find a break engine to see which
  68. * characters after the first one can be handled by this break engine.</p>
  69. * @param c A character that the engine might handle.
  70. * @return true if this engine handles the particular character.
  71. * @internal ICU 74 technology preview
  72. */
  73. virtual bool handles(UChar32 c) const = 0;
  74. /**
  75. * <p>Divide up a range of text handled by this break engine.</p>
  76. *
  77. * @param text A UText representing the text
  78. * @param start The start of the range of known characters
  79. * @param end The end of the range of known characters
  80. * @param foundBreaks Output of C array of int32_t break positions, or
  81. * nullptr
  82. * @param foundBreaksCapacity The capacity of foundBreaks
  83. * @param status Information on any errors encountered.
  84. * @return The number of breaks found
  85. * @internal ICU 74 technology preview
  86. */
  87. virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
  88. int32_t* foundBreaks, int32_t foundBreaksCapacity,
  89. UErrorCode& status) const = 0;
  90. };
  91. #endif /* U_HIDE_INTERNAL_API */
  92. /**
  93. *
  94. * A subclass of BreakIterator whose behavior is specified using a list of rules.
  95. * <p>Instances of this class are most commonly created by the factory methods of
  96. * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
  97. * and then used via the abstract API in class BreakIterator</p>
  98. *
  99. * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
  100. *
  101. * <p>This class is not intended to be subclassed.</p>
  102. */
  103. class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
  104. private:
  105. /**
  106. * The UText through which this BreakIterator accesses the text
  107. * @internal (private)
  108. */
  109. UText fText = UTEXT_INITIALIZER;
  110. #ifndef U_HIDE_INTERNAL_API
  111. public:
  112. #endif /* U_HIDE_INTERNAL_API */
  113. /**
  114. * The rule data for this BreakIterator instance.
  115. * Not for general use; Public only for testing purposes.
  116. * @internal
  117. */
  118. RBBIDataWrapper *fData = nullptr;
  119. private:
  120. /**
  121. * The saved error code associated with this break iterator.
  122. * This is the value to be returned by copyErrorTo().
  123. */
  124. UErrorCode fErrorCode = U_ZERO_ERROR;
  125. /**
  126. * The current position of the iterator. Pinned, 0 < fPosition <= text.length.
  127. * Never has the value UBRK_DONE (-1).
  128. */
  129. int32_t fPosition = 0;
  130. /**
  131. * TODO:
  132. */
  133. int32_t fRuleStatusIndex = 0;
  134. /**
  135. * Cache of previously determined boundary positions.
  136. */
  137. class BreakCache;
  138. BreakCache *fBreakCache = nullptr;
  139. /**
  140. * Cache of boundary positions within a region of text that has been
  141. * sub-divided by dictionary based breaking.
  142. */
  143. class DictionaryCache;
  144. DictionaryCache *fDictionaryCache = nullptr;
  145. /**
  146. *
  147. * If present, UStack of LanguageBreakEngine objects that might handle
  148. * dictionary characters. Searched from top to bottom to find an object to
  149. * handle a given character.
  150. * @internal (private)
  151. */
  152. UStack *fLanguageBreakEngines = nullptr;
  153. /**
  154. *
  155. * If present, the special LanguageBreakEngine used for handling
  156. * characters that are in the dictionary set, but not handled by any
  157. * LanguageBreakEngine.
  158. * @internal (private)
  159. */
  160. UnhandledEngine *fUnhandledBreakEngine = nullptr;
  161. /**
  162. * Counter for the number of characters encountered with the "dictionary"
  163. * flag set.
  164. * @internal (private)
  165. */
  166. uint32_t fDictionaryCharCount = 0;
  167. /**
  168. * A character iterator that refers to the same text as the UText, above.
  169. * Only included for compatibility with old API, which was based on CharacterIterators.
  170. * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
  171. */
  172. CharacterIterator *fCharIter = &fSCharIter;
  173. /**
  174. * When the input text is provided by a UnicodeString, this will point to
  175. * a characterIterator that wraps that data. Needed only for the
  176. * implementation of getText(), a backwards compatibility issue.
  177. */
  178. UCharCharacterIterator fSCharIter {u"", 0};
  179. /**
  180. * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
  181. */
  182. bool fDone = false;
  183. /**
  184. * Array of look-ahead tentative results.
  185. */
  186. int32_t *fLookAheadMatches = nullptr;
  187. /**
  188. * A flag to indicate if phrase based breaking is enabled.
  189. */
  190. UBool fIsPhraseBreaking = false;
  191. //=======================================================================
  192. // constructors
  193. //=======================================================================
  194. /**
  195. * Constructor from a flattened set of RBBI data in malloced memory.
  196. * RulesBasedBreakIterators built from a custom set of rules
  197. * are created via this constructor; the rules are compiled
  198. * into memory, then the break iterator is constructed here.
  199. *
  200. * The break iterator adopts the memory, and will
  201. * free it when done.
  202. * @internal (private)
  203. */
  204. RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
  205. /**
  206. * This constructor uses the udata interface to create a BreakIterator
  207. * whose internal tables live in a memory-mapped file. "image" is an
  208. * ICU UDataMemory handle for the pre-compiled break iterator tables.
  209. * @param image handle to the memory image for the break iterator data.
  210. * Ownership of the UDataMemory handle passes to the Break Iterator,
  211. * which will be responsible for closing it when it is no longer needed.
  212. * @param status Information on any errors encountered.
  213. * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
  214. * @see udata_open
  215. * @see #getBinaryRules
  216. * @internal (private)
  217. */
  218. RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
  219. /** @internal */
  220. friend class RBBIRuleBuilder;
  221. /** @internal */
  222. friend class BreakIterator;
  223. /**
  224. * Default constructor with an error code parameter.
  225. * Aside from error handling, otherwise identical to the default constructor.
  226. * Internally, handles common initialization for other constructors.
  227. * @internal (private)
  228. */
  229. RuleBasedBreakIterator(UErrorCode *status);
  230. public:
  231. /** Default constructor. Creates an empty shell of an iterator, with no
  232. * rules or text to iterate over. Object can subsequently be assigned to,
  233. * but is otherwise unusable.
  234. * @stable ICU 2.2
  235. */
  236. RuleBasedBreakIterator();
  237. /**
  238. * Copy constructor. Will produce a break iterator with the same behavior,
  239. * and which iterates over the same text, as the one passed in.
  240. * @param that The RuleBasedBreakIterator passed to be copied
  241. * @stable ICU 2.0
  242. */
  243. RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
  244. /**
  245. * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
  246. * @param rules The break rules to be used.
  247. * @param parseError In the event of a syntax error in the rules, provides the location
  248. * within the rules of the problem.
  249. * @param status Information on any errors encountered.
  250. * @stable ICU 2.2
  251. */
  252. RuleBasedBreakIterator( const UnicodeString &rules,
  253. UParseError &parseError,
  254. UErrorCode &status);
  255. /**
  256. * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
  257. * Binary rules are obtained from RulesBasedBreakIterator::getBinaryRules().
  258. * Construction of a break iterator in this way is substantially faster than
  259. * construction from source rules.
  260. *
  261. * Ownership of the storage containing the compiled rules remains with the
  262. * caller of this function. The compiled rules must not be modified or
  263. * deleted during the life of the break iterator.
  264. *
  265. * The compiled rules are not compatible across different major versions of ICU.
  266. * The compiled rules are compatible only between machines with the same
  267. * byte ordering (little or big endian) and the same base character set family
  268. * (ASCII or EBCDIC).
  269. *
  270. * @see #getBinaryRules
  271. * @param compiledRules A pointer to the compiled break rules to be used.
  272. * @param ruleLength The length of the compiled break rules, in bytes. This
  273. * corresponds to the length value produced by getBinaryRules().
  274. * @param status Information on any errors encountered, including invalid
  275. * binary rules.
  276. * @stable ICU 4.8
  277. */
  278. RuleBasedBreakIterator(const uint8_t *compiledRules,
  279. uint32_t ruleLength,
  280. UErrorCode &status);
  281. /**
  282. * This constructor uses the udata interface to create a BreakIterator
  283. * whose internal tables live in a memory-mapped file. "image" is an
  284. * ICU UDataMemory handle for the pre-compiled break iterator tables.
  285. * @param image handle to the memory image for the break iterator data.
  286. * Ownership of the UDataMemory handle passes to the Break Iterator,
  287. * which will be responsible for closing it when it is no longer needed.
  288. * @param status Information on any errors encountered.
  289. * @see udata_open
  290. * @see #getBinaryRules
  291. * @stable ICU 2.8
  292. */
  293. RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
  294. /**
  295. * Destructor
  296. * @stable ICU 2.0
  297. */
  298. virtual ~RuleBasedBreakIterator();
  299. /**
  300. * Assignment operator. Sets this iterator to have the same behavior,
  301. * and iterate over the same text, as the one passed in.
  302. * @param that The RuleBasedBreakItertor passed in
  303. * @return the newly created RuleBasedBreakIterator
  304. * @stable ICU 2.0
  305. */
  306. RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
  307. /**
  308. * Equality operator. Returns true if both BreakIterators are of the
  309. * same class, have the same behavior, and iterate over the same text.
  310. * @param that The BreakIterator to be compared for equality
  311. * @return true if both BreakIterators are of the
  312. * same class, have the same behavior, and iterate over the same text.
  313. * @stable ICU 2.0
  314. */
  315. virtual bool operator==(const BreakIterator& that) const override;
  316. /**
  317. * Not-equal operator. If operator== returns true, this returns false,
  318. * and vice versa.
  319. * @param that The BreakIterator to be compared for inequality
  320. * @return true if both BreakIterators are not same.
  321. * @stable ICU 2.0
  322. */
  323. inline bool operator!=(const BreakIterator& that) const {
  324. return !operator==(that);
  325. }
  326. /**
  327. * Returns a newly-constructed RuleBasedBreakIterator with the same
  328. * behavior, and iterating over the same text, as this one.
  329. * Differs from the copy constructor in that it is polymorphic, and
  330. * will correctly clone (copy) a derived class.
  331. * clone() is thread safe. Multiple threads may simultaneously
  332. * clone the same source break iterator.
  333. * @return a newly-constructed RuleBasedBreakIterator
  334. * @stable ICU 2.0
  335. */
  336. virtual RuleBasedBreakIterator* clone() const override;
  337. /**
  338. * Compute a hash code for this BreakIterator
  339. * @return A hash code
  340. * @stable ICU 2.0
  341. */
  342. virtual int32_t hashCode() const;
  343. /**
  344. * Returns the description used to create this iterator
  345. * @return the description used to create this iterator
  346. * @stable ICU 2.0
  347. */
  348. virtual const UnicodeString& getRules() const;
  349. //=======================================================================
  350. // BreakIterator overrides
  351. //=======================================================================
  352. /**
  353. * <p>
  354. * Return a CharacterIterator over the text being analyzed.
  355. * The returned character iterator is owned by the break iterator, and must
  356. * not be deleted by the caller. Repeated calls to this function may
  357. * return the same CharacterIterator.
  358. * </p>
  359. * <p>
  360. * The returned character iterator must not be used concurrently with
  361. * the break iterator. If concurrent operation is needed, clone the
  362. * returned character iterator first and operate on the clone.
  363. * </p>
  364. * <p>
  365. * When the break iterator is operating on text supplied via a UText,
  366. * this function will fail, returning a CharacterIterator containing no text.
  367. * The function getUText() provides similar functionality,
  368. * is reliable, and is more efficient.
  369. * </p>
  370. *
  371. * TODO: deprecate this function?
  372. *
  373. * @return An iterator over the text being analyzed.
  374. * @stable ICU 2.0
  375. */
  376. virtual CharacterIterator& getText() const override;
  377. /**
  378. * Get a UText for the text being analyzed.
  379. * The returned UText is a shallow clone of the UText used internally
  380. * by the break iterator implementation. It can safely be used to
  381. * access the text without impacting any break iterator operations,
  382. * but the underlying text itself must not be altered.
  383. *
  384. * @param fillIn A UText to be filled in. If nullptr, a new UText will be
  385. * allocated to hold the result.
  386. * @param status receives any error codes.
  387. * @return The current UText for this break iterator. If an input
  388. * UText was provided, it will always be returned.
  389. * @stable ICU 3.4
  390. */
  391. virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
  392. /**
  393. * Set the iterator to analyze a new piece of text. This function resets
  394. * the current iteration position to the beginning of the text.
  395. * @param newText An iterator over the text to analyze. The BreakIterator
  396. * takes ownership of the character iterator. The caller MUST NOT delete it!
  397. * @stable ICU 2.0
  398. */
  399. virtual void adoptText(CharacterIterator* newText) override;
  400. /**
  401. * Set the iterator to analyze a new piece of text. This function resets
  402. * the current iteration position to the beginning of the text.
  403. *
  404. * The BreakIterator will retain a reference to the supplied string.
  405. * The caller must not modify or delete the text while the BreakIterator
  406. * retains the reference.
  407. *
  408. * @param newText The text to analyze.
  409. * @stable ICU 2.0
  410. */
  411. virtual void setText(const UnicodeString& newText) override;
  412. /**
  413. * Reset the break iterator to operate over the text represented by
  414. * the UText. The iterator position is reset to the start.
  415. *
  416. * This function makes a shallow clone of the supplied UText. This means
  417. * that the caller is free to immediately close or otherwise reuse the
  418. * Utext that was passed as a parameter, but that the underlying text itself
  419. * must not be altered while being referenced by the break iterator.
  420. *
  421. * @param text The UText used to change the text.
  422. * @param status Receives any error codes.
  423. * @stable ICU 3.4
  424. */
  425. virtual void setText(UText *text, UErrorCode &status) override;
  426. /**
  427. * Sets the current iteration position to the beginning of the text, position zero.
  428. * @return The offset of the beginning of the text, zero.
  429. * @stable ICU 2.0
  430. */
  431. virtual int32_t first() override;
  432. /**
  433. * Sets the current iteration position to the end of the text.
  434. * @return The text's past-the-end offset.
  435. * @stable ICU 2.0
  436. */
  437. virtual int32_t last() override;
  438. /**
  439. * Advances the iterator either forward or backward the specified number of steps.
  440. * Negative values move backward, and positive values move forward. This is
  441. * equivalent to repeatedly calling next() or previous().
  442. * @param n The number of steps to move. The sign indicates the direction
  443. * (negative is backwards, and positive is forwards).
  444. * @return The character offset of the boundary position n boundaries away from
  445. * the current one.
  446. * @stable ICU 2.0
  447. */
  448. virtual int32_t next(int32_t n) override;
  449. /**
  450. * Advances the iterator to the next boundary position.
  451. * @return The position of the first boundary after this one.
  452. * @stable ICU 2.0
  453. */
  454. virtual int32_t next() override;
  455. /**
  456. * Moves the iterator backwards, to the last boundary preceding this one.
  457. * @return The position of the last boundary position preceding this one.
  458. * @stable ICU 2.0
  459. */
  460. virtual int32_t previous() override;
  461. /**
  462. * Sets the iterator to refer to the first boundary position following
  463. * the specified position.
  464. * @param offset The position from which to begin searching for a break position.
  465. * @return The position of the first break after the current position.
  466. * @stable ICU 2.0
  467. */
  468. virtual int32_t following(int32_t offset) override;
  469. /**
  470. * Sets the iterator to refer to the last boundary position before the
  471. * specified position.
  472. * @param offset The position to begin searching for a break from.
  473. * @return The position of the last boundary before the starting position.
  474. * @stable ICU 2.0
  475. */
  476. virtual int32_t preceding(int32_t offset) override;
  477. /**
  478. * Returns true if the specified position is a boundary position. As a side
  479. * effect, leaves the iterator pointing to the first boundary position at
  480. * or after "offset".
  481. * @param offset the offset to check.
  482. * @return True if "offset" is a boundary position.
  483. * @stable ICU 2.0
  484. */
  485. virtual UBool isBoundary(int32_t offset) override;
  486. /**
  487. * Returns the current iteration position. Note that UBRK_DONE is never
  488. * returned from this function; if iteration has run to the end of a
  489. * string, current() will return the length of the string while
  490. * next() will return UBRK_DONE).
  491. * @return The current iteration position.
  492. * @stable ICU 2.0
  493. */
  494. virtual int32_t current() const override;
  495. /**
  496. * Return the status tag from the break rule that determined the boundary at
  497. * the current iteration position. For break rules that do not specify a
  498. * status, a default value of 0 is returned. If more than one break rule
  499. * would cause a boundary to be located at some position in the text,
  500. * the numerically largest of the applicable status values is returned.
  501. * <p>
  502. * Of the standard types of ICU break iterators, only word break and
  503. * line break provide status values. The values are defined in
  504. * the header file ubrk.h. For Word breaks, the status allows distinguishing between words
  505. * that contain alphabetic letters, "words" that appear to be numbers,
  506. * punctuation and spaces, words containing ideographic characters, and
  507. * more. For Line Break, the status distinguishes between hard (mandatory) breaks
  508. * and soft (potential) break positions.
  509. * <p>
  510. * <code>getRuleStatus()</code> can be called after obtaining a boundary
  511. * position from <code>next()</code>, <code>previous()</code>, or
  512. * any other break iterator functions that returns a boundary position.
  513. * <p>
  514. * Note that <code>getRuleStatus()</code> returns the value corresponding to
  515. * <code>current()</code> index even after <code>next()</code> has returned DONE.
  516. * <p>
  517. * When creating custom break rules, one is free to define whatever
  518. * status values may be convenient for the application.
  519. * <p>
  520. * @return the status from the break rule that determined the boundary
  521. * at the current iteration position.
  522. *
  523. * @see UWordBreak
  524. * @stable ICU 2.2
  525. */
  526. virtual int32_t getRuleStatus() const override;
  527. /**
  528. * Get the status (tag) values from the break rule(s) that determined the boundary
  529. * at the current iteration position.
  530. * <p>
  531. * The returned status value(s) are stored into an array provided by the caller.
  532. * The values are stored in sorted (ascending) order.
  533. * If the capacity of the output array is insufficient to hold the data,
  534. * the output will be truncated to the available length, and a
  535. * U_BUFFER_OVERFLOW_ERROR will be signaled.
  536. *
  537. * @param fillInVec an array to be filled in with the status values.
  538. * @param capacity the length of the supplied vector. A length of zero causes
  539. * the function to return the number of status values, in the
  540. * normal way, without attempting to store any values.
  541. * @param status receives error codes.
  542. * @return The number of rule status values from the rules that determined
  543. * the boundary at the current iteration position.
  544. * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
  545. * is the total number of status values that were available,
  546. * not the reduced number that were actually returned.
  547. * @see getRuleStatus
  548. * @stable ICU 3.0
  549. */
  550. virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
  551. /**
  552. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
  553. * This method is to implement a simple version of RTTI, since not all
  554. * C++ compilers support genuine RTTI. Polymorphic operator==() and
  555. * clone() methods call this method.
  556. *
  557. * @return The class ID for this object. All objects of a
  558. * given class have the same class ID. Objects of
  559. * other classes have different class IDs.
  560. * @stable ICU 2.0
  561. */
  562. virtual UClassID getDynamicClassID() const override;
  563. /**
  564. * Returns the class ID for this class. This is useful only for
  565. * comparing to a return value from getDynamicClassID(). For example:
  566. *
  567. * Base* polymorphic_pointer = createPolymorphicObject();
  568. * if (polymorphic_pointer->getDynamicClassID() ==
  569. * Derived::getStaticClassID()) ...
  570. *
  571. * @return The class ID for all objects of this class.
  572. * @stable ICU 2.0
  573. */
  574. static UClassID U_EXPORT2 getStaticClassID();
  575. #ifndef U_FORCE_HIDE_DEPRECATED_API
  576. /**
  577. * Deprecated functionality. Use clone() instead.
  578. *
  579. * Create a clone (copy) of this break iterator in memory provided
  580. * by the caller. The idea is to increase performance by avoiding
  581. * a storage allocation. Use of this function is NOT RECOMMENDED.
  582. * Performance gains are minimal, and correct buffer management is
  583. * tricky. Use clone() instead.
  584. *
  585. * @param stackBuffer The pointer to the memory into which the cloned object
  586. * should be placed. If nullptr, allocate heap memory
  587. * for the cloned object.
  588. * @param BufferSize The size of the buffer. If zero, return the required
  589. * buffer size, but do not clone the object. If the
  590. * size was too small (but not zero), allocate heap
  591. * storage for the cloned object.
  592. *
  593. * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
  594. * returned if the provided buffer was too small, and
  595. * the clone was therefore put on the heap.
  596. *
  597. * @return Pointer to the clone object. This may differ from the stackBuffer
  598. * address if the byte alignment of the stack buffer was not suitable
  599. * or if the stackBuffer was too small to hold the clone.
  600. * @deprecated ICU 52. Use clone() instead.
  601. */
  602. virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
  603. int32_t &BufferSize,
  604. UErrorCode &status) override;
  605. #endif // U_FORCE_HIDE_DEPRECATED_API
  606. /**
  607. * Return the binary form of compiled break rules,
  608. * which can then be used to create a new break iterator at some
  609. * time in the future. Creating a break iterator from pre-compiled rules
  610. * is much faster than building one from the source form of the
  611. * break rules.
  612. *
  613. * The binary data can only be used with the same version of ICU
  614. * and on the same platform type (processor endian-ness)
  615. *
  616. * @param length Returns the length of the binary data. (Out parameter.)
  617. *
  618. * @return A pointer to the binary (compiled) rule data. The storage
  619. * belongs to the RulesBasedBreakIterator object, not the
  620. * caller, and must not be modified or deleted.
  621. * @stable ICU 4.8
  622. */
  623. virtual const uint8_t *getBinaryRules(uint32_t &length);
  624. /**
  625. * Set the subject text string upon which the break iterator is operating
  626. * without changing any other aspect of the matching state.
  627. * The new and previous text strings must have the same content.
  628. *
  629. * This function is intended for use in environments where ICU is operating on
  630. * strings that may move around in memory. It provides a mechanism for notifying
  631. * ICU that the string has been relocated, and providing a new UText to access the
  632. * string in its new position.
  633. *
  634. * Note that the break iterator implementation never copies the underlying text
  635. * of a string being processed, but always operates directly on the original text
  636. * provided by the user. Refreshing simply drops the references to the old text
  637. * and replaces them with references to the new.
  638. *
  639. * Caution: this function is normally used only by very specialized,
  640. * system-level code. One example use case is with garbage collection that moves
  641. * the text in memory.
  642. *
  643. * @param input The new (moved) text string.
  644. * @param status Receives errors detected by this function.
  645. * @return *this
  646. *
  647. * @stable ICU 49
  648. */
  649. virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
  650. private:
  651. //=======================================================================
  652. // implementation
  653. //=======================================================================
  654. /**
  655. * Iterate backwards from an arbitrary position in the input text using the
  656. * synthesized Safe Reverse rules.
  657. * This locates a "Safe Position" from which the forward break rules
  658. * will operate correctly. A Safe Position is not necessarily a boundary itself.
  659. *
  660. * @param fromPosition the position in the input text to begin the iteration.
  661. * @internal (private)
  662. */
  663. int32_t handleSafePrevious(int32_t fromPosition);
  664. /**
  665. * Find a rule-based boundary by running the state machine.
  666. * Input
  667. * fPosition, the position in the text to begin from.
  668. * Output
  669. * fPosition: the boundary following the starting position.
  670. * fDictionaryCharCount the number of dictionary characters encountered.
  671. * If > 0, the segment will be further subdivided
  672. * fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
  673. *
  674. * @internal (private)
  675. */
  676. int32_t handleNext();
  677. /*
  678. * Templatized version of handleNext() and handleSafePrevious().
  679. *
  680. * There will be exactly four instantiations, two each for 8 and 16 bit tables,
  681. * two each for 8 and 16 bit trie.
  682. * Having separate instantiations for the table types keeps conditional tests of
  683. * the table type out of the inner loops, at the expense of replicated code.
  684. *
  685. * The template parameter for the Trie access function is a value, not a type.
  686. * Doing it this way, the compiler will inline the Trie function in the
  687. * expanded functions. (Both the 8 and 16 bit access functions have the same type
  688. * signature)
  689. */
  690. typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
  691. template<typename RowType, PTrieFunc trieFunc>
  692. int32_t handleSafePrevious(int32_t fromPosition);
  693. template<typename RowType, PTrieFunc trieFunc>
  694. int32_t handleNext();
  695. /**
  696. * This function returns the appropriate LanguageBreakEngine for a
  697. * given character c.
  698. * @param c A character in the dictionary set
  699. * @param locale The locale.
  700. * @internal (private)
  701. */
  702. const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
  703. public:
  704. #ifndef U_HIDE_INTERNAL_API
  705. /**
  706. * Debugging function only.
  707. * @internal
  708. */
  709. void dumpCache();
  710. /**
  711. * Debugging function only.
  712. * @internal
  713. */
  714. void dumpTables();
  715. #endif /* U_HIDE_INTERNAL_API */
  716. #ifndef U_HIDE_INTERNAL_API
  717. /**
  718. * Register a new external break engine. The external break engine will be adopted.
  719. * Because ICU may choose to cache break engine internally, this must
  720. * be called at application startup, prior to any calls to
  721. * object methods of RuleBasedBreakIterator to avoid undefined behavior.
  722. * @param toAdopt the ExternalBreakEngine instance to be adopted
  723. * @param status the in/out status code, no special meanings are assigned
  724. * @internal ICU 74 technology preview
  725. */
  726. static void U_EXPORT2 registerExternalBreakEngine(
  727. ExternalBreakEngine* toAdopt, UErrorCode& status);
  728. #endif /* U_HIDE_INTERNAL_API */
  729. };
  730. U_NAMESPACE_END
  731. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  732. #endif /* U_SHOW_CPLUSPLUS_API */
  733. #endif