rbbisetb.cpp 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbisetb.cpp
  5. //
  6. /*
  7. ***************************************************************************
  8. * Copyright (C) 2002-2008 International Business Machines Corporation *
  9. * and others. All rights reserved. *
  10. ***************************************************************************
  11. */
//
//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
//                   (part of the rule building process).
//
//      Starting with the rules parse tree from the scanner,
//
//                   -  Enumerate the set of UnicodeSets that are referenced
//                      by the RBBI rules.
//                   -  Compute a set of non-overlapping character ranges,
//                      with all characters within a range belonging to the same
//                      set of input Unicode sets.
//                   -  Derive a set of non-overlapping UnicodeSet-like things
//                      that will correspond to columns in the state table for
//                      the RBBI execution engine.  All characters within one
//                      of these sets belong to the same set of the original
//                      UnicodeSets from the user's rules.
//                   -  Construct the trie table that maps input characters
//                      to the index of the matching non-overlapping set from
//                      the previous step.
//
  32. #include "unicode/utypes.h"
  33. #if !UCONFIG_NO_BREAK_ITERATION
  34. #include "unicode/uniset.h"
  35. #include "uvector.h"
  36. #include "uassert.h"
  37. #include "cmemory.h"
  38. #include "cstring.h"
  39. #include "rbbisetb.h"
  40. #include "rbbinode.h"
  41. U_NAMESPACE_BEGIN
  42. const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
  43. //------------------------------------------------------------------------
  44. //
  45. // Constructor
  46. //
  47. //------------------------------------------------------------------------
  48. RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
  49. {
  50. fRB = rb;
  51. fStatus = rb->fStatus;
  52. fRangeList = nullptr;
  53. fMutableTrie = nullptr;
  54. fTrie = nullptr;
  55. fTrieSize = 0;
  56. fGroupCount = 0;
  57. fSawBOF = false;
  58. }
  59. //------------------------------------------------------------------------
  60. //
  61. // Destructor
  62. //
  63. //------------------------------------------------------------------------
  64. RBBISetBuilder::~RBBISetBuilder()
  65. {
  66. RangeDescriptor *nextRangeDesc;
  67. // Walk through & delete the linked list of RangeDescriptors
  68. for (nextRangeDesc = fRangeList; nextRangeDesc!=nullptr;) {
  69. RangeDescriptor *r = nextRangeDesc;
  70. nextRangeDesc = r->fNext;
  71. delete r;
  72. }
  73. ucptrie_close(fTrie);
  74. umutablecptrie_close(fMutableTrie);
  75. }
  76. //------------------------------------------------------------------------
  77. //
  78. // build Build the list of non-overlapping character ranges
  79. // from the Unicode Sets.
  80. //
  81. //------------------------------------------------------------------------
  82. void RBBISetBuilder::buildRanges() {
  83. RBBINode *usetNode;
  84. RangeDescriptor *rlRange;
  85. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
  86. //
  87. // Initialize the process by creating a single range encompassing all characters
  88. // that is in no sets.
  89. //
  90. fRangeList = new RangeDescriptor(*fStatus); // will check for status here
  91. if (fRangeList == nullptr) {
  92. *fStatus = U_MEMORY_ALLOCATION_ERROR;
  93. return;
  94. }
  95. fRangeList->fStartChar = 0;
  96. fRangeList->fEndChar = 0x10ffff;
  97. if (U_FAILURE(*fStatus)) {
  98. return;
  99. }
  100. //
  101. // Find the set of non-overlapping ranges of characters
  102. //
  103. int ni;
  104. for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
  105. usetNode = static_cast<RBBINode*>(this->fRB->fUSetNodes->elementAt(ni));
  106. if (usetNode==nullptr) {
  107. break;
  108. }
  109. UnicodeSet *inputSet = usetNode->fInputSet;
  110. int32_t inputSetRangeCount = inputSet->getRangeCount();
  111. int inputSetRangeIndex = 0;
  112. rlRange = fRangeList;
  113. for (;;) {
  114. if (inputSetRangeIndex >= inputSetRangeCount) {
  115. break;
  116. }
  117. UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
  118. UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
  119. // skip over ranges from the range list that are completely
  120. // below the current range from the input unicode set.
  121. while (rlRange->fEndChar < inputSetRangeBegin) {
  122. rlRange = rlRange->fNext;
  123. }
  124. // If the start of the range from the range list is before with
  125. // the start of the range from the unicode set, split the range list range
  126. // in two, with one part being before (wholly outside of) the unicode set
  127. // and the other containing the rest.
  128. // Then continue the loop; the post-split current range will then be skipped
  129. // over
  130. if (rlRange->fStartChar < inputSetRangeBegin) {
  131. rlRange->split(inputSetRangeBegin, *fStatus);
  132. if (U_FAILURE(*fStatus)) {
  133. return;
  134. }
  135. continue;
  136. }
  137. // Same thing at the end of the ranges...
  138. // If the end of the range from the range list doesn't coincide with
  139. // the end of the range from the unicode set, split the range list
  140. // range in two. The first part of the split range will be
  141. // wholly inside the Unicode set.
  142. if (rlRange->fEndChar > inputSetRangeEnd) {
  143. rlRange->split(inputSetRangeEnd+1, *fStatus);
  144. if (U_FAILURE(*fStatus)) {
  145. return;
  146. }
  147. }
  148. // The current rlRange is now entirely within the UnicodeSet range.
  149. // Add this unicode set to the list of sets for this rlRange
  150. if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
  151. rlRange->fIncludesSets->addElement(usetNode, *fStatus);
  152. if (U_FAILURE(*fStatus)) {
  153. return;
  154. }
  155. }
  156. // Advance over ranges that we are finished with.
  157. if (inputSetRangeEnd == rlRange->fEndChar) {
  158. inputSetRangeIndex++;
  159. }
  160. rlRange = rlRange->fNext;
  161. }
  162. }
  163. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
  164. //
  165. // Group the above ranges, with each group consisting of one or more
  166. // ranges that are in exactly the same set of original UnicodeSets.
  167. // The groups are numbered, and these group numbers are the set of
  168. // input symbols recognized by the run-time state machine.
  169. //
  170. // Numbering: # 0 (state table column 0) is unused.
  171. // # 1 is reserved - table column 1 is for end-of-input
  172. // # 2 is reserved - table column 2 is for beginning-of-input
  173. // # 3 is the first range list.
  174. //
  175. RangeDescriptor *rlSearchRange;
  176. int32_t dictGroupCount = 0;
  177. for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
  178. for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
  179. if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
  180. rlRange->fNum = rlSearchRange->fNum;
  181. rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
  182. break;
  183. }
  184. }
  185. if (rlRange->fNum == 0) {
  186. rlRange->fFirstInGroup = true;
  187. if (rlRange->isDictionaryRange()) {
  188. rlRange->fNum = ++dictGroupCount;
  189. rlRange->fIncludesDict = true;
  190. } else {
  191. fGroupCount++;
  192. rlRange->fNum = fGroupCount+2;
  193. addValToSets(rlRange->fIncludesSets, rlRange->fNum);
  194. }
  195. }
  196. }
  197. // Move the character category numbers for any dictionary ranges up, so that they
  198. // immediately follow the non-dictionary ranges.
  199. fDictCategoriesStart = fGroupCount + 3;
  200. for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
  201. if (rlRange->fIncludesDict) {
  202. rlRange->fNum += fDictCategoriesStart - 1;
  203. if (rlRange->fFirstInGroup) {
  204. addValToSets(rlRange->fIncludesSets, rlRange->fNum);
  205. }
  206. }
  207. }
  208. fGroupCount += dictGroupCount;
  209. // Handle input sets that contain the special string {eof}.
  210. // Column 1 of the state table is reserved for EOF on input.
  211. // Column 2 is reserved for before-the-start-input.
  212. // (This column can be optimized away later if there are no rule
  213. // references to {bof}.)
  214. // Add this column value (1 or 2) to the equivalent expression
  215. // subtree for each UnicodeSet that contains the string {eof}
  216. // Because {bof} and {eof} are not characters in the normal sense,
  217. // they don't affect the computation of the ranges or TRIE.
  218. UnicodeString eofString(u"eof");
  219. UnicodeString bofString(u"bof");
  220. for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
  221. usetNode = static_cast<RBBINode*>(this->fRB->fUSetNodes->elementAt(ni));
  222. if (usetNode==nullptr) {
  223. break;
  224. }
  225. UnicodeSet *inputSet = usetNode->fInputSet;
  226. if (inputSet->contains(eofString)) {
  227. addValToSet(usetNode, 1);
  228. }
  229. if (inputSet->contains(bofString)) {
  230. addValToSet(usetNode, 2);
  231. fSawBOF = true;
  232. }
  233. }
  234. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
  235. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
  236. }
  237. //
  238. // Build the Trie table for mapping UChar32 values to the corresponding
  239. // range group number.
  240. //
  241. void RBBISetBuilder::buildTrie() {
  242. fMutableTrie = umutablecptrie_open(
  243. 0, // Initial value for all code points.
  244. 0, // Error value for out-of-range input.
  245. fStatus);
  246. for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
  247. umutablecptrie_setRange(fMutableTrie,
  248. range->fStartChar, // Range start
  249. range->fEndChar, // Range end (inclusive)
  250. range->fNum, // value for range
  251. fStatus);
  252. }
  253. }
  254. void RBBISetBuilder::mergeCategories(IntPair categories) {
  255. U_ASSERT(categories.first >= 1);
  256. U_ASSERT(categories.second > categories.first);
  257. U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
  258. (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
  259. for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
  260. int32_t rangeNum = rd->fNum;
  261. if (rangeNum == categories.second) {
  262. rd->fNum = categories.first;
  263. } else if (rangeNum > categories.second) {
  264. rd->fNum--;
  265. }
  266. }
  267. --fGroupCount;
  268. if (categories.second <= fDictCategoriesStart) {
  269. --fDictCategoriesStart;
  270. }
  271. }
  272. //-----------------------------------------------------------------------------------
  273. //
  274. // getTrieSize() Return the size that will be required to serialize the Trie.
  275. //
  276. //-----------------------------------------------------------------------------------
  277. int32_t RBBISetBuilder::getTrieSize() {
  278. if (U_FAILURE(*fStatus)) {
  279. return 0;
  280. }
  281. if (fTrie == nullptr) {
  282. bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
  283. fTrie = umutablecptrie_buildImmutable(
  284. fMutableTrie,
  285. UCPTRIE_TYPE_FAST,
  286. use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
  287. fStatus);
  288. fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
  289. if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
  290. *fStatus = U_ZERO_ERROR;
  291. }
  292. }
  293. return fTrieSize;
  294. }
  295. //-----------------------------------------------------------------------------------
  296. //
  297. // serializeTrie() Put the serialized trie at the specified address.
  298. // Trust the caller to have given us enough memory.
  299. // getTrieSize() MUST be called first.
  300. //
  301. //-----------------------------------------------------------------------------------
  302. void RBBISetBuilder::serializeTrie(uint8_t *where) {
  303. ucptrie_toBinary(fTrie,
  304. where, // Buffer
  305. fTrieSize, // Capacity
  306. fStatus);
  307. }
  308. //------------------------------------------------------------------------
  309. //
  310. // addValToSets Add a runtime-mapped input value to each uset from a
  311. // list of uset nodes. (val corresponds to a state table column.)
  312. // For each of the original Unicode sets - which correspond
  313. // directly to uset nodes - a logically equivalent expression
  314. // is constructed in terms of the remapped runtime input
  315. // symbol set. This function adds one runtime input symbol to
  316. // a list of sets.
  317. //
  318. // The "logically equivalent expression" is the tree for an
  319. // or-ing together of all of the symbols that go into the set.
  320. //
  321. //------------------------------------------------------------------------
  322. void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
  323. int32_t ix;
  324. for (ix=0; ix<sets->size(); ix++) {
  325. RBBINode* usetNode = static_cast<RBBINode*>(sets->elementAt(ix));
  326. addValToSet(usetNode, val);
  327. }
  328. }
  329. void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
  330. RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
  331. if (leafNode == nullptr) {
  332. *fStatus = U_MEMORY_ALLOCATION_ERROR;
  333. return;
  334. }
  335. leafNode->fVal = static_cast<unsigned short>(val);
  336. if (usetNode->fLeftChild == nullptr) {
  337. usetNode->fLeftChild = leafNode;
  338. leafNode->fParent = usetNode;
  339. } else {
  340. // There are already input symbols present for this set.
  341. // Set up an OR node, with the previous stuff as the left child
  342. // and the new value as the right child.
  343. RBBINode *orNode = new RBBINode(RBBINode::opOr);
  344. if (orNode == nullptr) {
  345. *fStatus = U_MEMORY_ALLOCATION_ERROR;
  346. return;
  347. }
  348. orNode->fLeftChild = usetNode->fLeftChild;
  349. orNode->fRightChild = leafNode;
  350. orNode->fLeftChild->fParent = orNode;
  351. orNode->fRightChild->fParent = orNode;
  352. usetNode->fLeftChild = orNode;
  353. orNode->fParent = usetNode;
  354. }
  355. }
  356. //------------------------------------------------------------------------
  357. //
  358. // getNumCharCategories
  359. //
  360. //------------------------------------------------------------------------
  361. int32_t RBBISetBuilder::getNumCharCategories() const {
  362. return fGroupCount + 3;
  363. }
  364. //------------------------------------------------------------------------
  365. //
  366. // getDictCategoriesStart
  367. //
  368. //------------------------------------------------------------------------
  369. int32_t RBBISetBuilder::getDictCategoriesStart() const {
  370. return fDictCategoriesStart;
  371. }
  372. //------------------------------------------------------------------------
  373. //
  374. // sawBOF
  375. //
  376. //------------------------------------------------------------------------
  377. UBool RBBISetBuilder::sawBOF() const {
  378. return fSawBOF;
  379. }
  380. //------------------------------------------------------------------------
  381. //
  382. // getFirstChar Given a runtime RBBI character category, find
  383. // the first UChar32 that is in the set of chars
  384. // in the category.
  385. //------------------------------------------------------------------------
  386. UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
  387. RangeDescriptor *rlRange;
  388. UChar32 retVal = static_cast<UChar32>(-1);
  389. for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
  390. if (rlRange->fNum == category) {
  391. retVal = rlRange->fStartChar;
  392. break;
  393. }
  394. }
  395. return retVal;
  396. }
  397. //------------------------------------------------------------------------
  398. //
  399. // printRanges A debugging function.
  400. // dump out all of the range definitions.
  401. //
  402. //------------------------------------------------------------------------
  403. #ifdef RBBI_DEBUG
  404. void RBBISetBuilder::printRanges() {
  405. RangeDescriptor *rlRange;
  406. int i;
  407. RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
  408. for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
  409. RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);
  410. for (i=0; i<rlRange->fIncludesSets->size(); i++) {
  411. RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
  412. UnicodeString setName {u"anon"};
  413. RBBINode *setRef = usetNode->fParent;
  414. if (setRef != nullptr) {
  415. RBBINode *varRef = setRef->fParent;
  416. if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
  417. setName = varRef->fText;
  418. }
  419. }
  420. RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
  421. }
  422. RBBIDebugPrintf("\n");
  423. }
  424. }
  425. #endif
  426. //------------------------------------------------------------------------
  427. //
  428. // printRangeGroups A debugging function.
  429. // dump out all of the range groups.
  430. //
  431. //------------------------------------------------------------------------
  432. #ifdef RBBI_DEBUG
  433. void RBBISetBuilder::printRangeGroups() {
  434. int i;
  435. RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
  436. for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
  437. if (rlRange->fFirstInGroup) {
  438. int groupNum = rlRange->fNum;
  439. RBBIDebugPrintf("%2i ", groupNum);
  440. if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
  441. for (i=0; i<rlRange->fIncludesSets->size(); i++) {
  442. RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
  443. UnicodeString setName = UNICODE_STRING("anon", 4);
  444. RBBINode *setRef = usetNode->fParent;
  445. if (setRef != nullptr) {
  446. RBBINode *varRef = setRef->fParent;
  447. if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
  448. setName = varRef->fText;
  449. }
  450. }
  451. RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
  452. }
  453. i = 0;
  454. for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
  455. if (tRange->fNum == rlRange->fNum) {
  456. if (i++ % 5 == 0) {
  457. RBBIDebugPrintf("\n ");
  458. }
  459. RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
  460. }
  461. }
  462. RBBIDebugPrintf("\n");
  463. }
  464. }
  465. RBBIDebugPrintf("\n");
  466. }
  467. #endif
  468. //------------------------------------------------------------------------
  469. //
  470. // printSets A debugging function.
  471. // dump out all of the set definitions.
  472. //
  473. //------------------------------------------------------------------------
  474. #ifdef RBBI_DEBUG
  475. void RBBISetBuilder::printSets() {
  476. int i;
  477. RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
  478. for (i=0; ; i++) {
  479. RBBINode *usetNode;
  480. RBBINode *setRef;
  481. RBBINode *varRef;
  482. UnicodeString setName;
  483. usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
  484. if (usetNode == nullptr) {
  485. break;
  486. }
  487. RBBIDebugPrintf("%3d ", i);
  488. setName = UNICODE_STRING("anonymous", 9);
  489. setRef = usetNode->fParent;
  490. if (setRef != nullptr) {
  491. varRef = setRef->fParent;
  492. if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
  493. setName = varRef->fText;
  494. }
  495. }
  496. RBBI_DEBUG_printUnicodeString(setName);
  497. RBBIDebugPrintf(" ");
  498. RBBI_DEBUG_printUnicodeString(usetNode->fText);
  499. RBBIDebugPrintf("\n");
  500. if (usetNode->fLeftChild != nullptr) {
  501. RBBINode::printTree(usetNode->fLeftChild, true);
  502. }
  503. }
  504. RBBIDebugPrintf("\n");
  505. }
  506. #endif
  507. //-------------------------------------------------------------------------------------
  508. //
  509. // RangeDescriptor copy constructor
  510. //
  511. //-------------------------------------------------------------------------------------
  512. RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
  513. fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
  514. fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
  515. if (U_FAILURE(status)) {
  516. return;
  517. }
  518. fIncludesSets = new UVector(status);
  519. if (this->fIncludesSets == nullptr) {
  520. status = U_MEMORY_ALLOCATION_ERROR;
  521. }
  522. if (U_FAILURE(status)) {
  523. return;
  524. }
  525. for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
  526. this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
  527. }
  528. }
  529. //-------------------------------------------------------------------------------------
  530. //
  531. // RangeDesriptor default constructor
  532. //
  533. //-------------------------------------------------------------------------------------
  534. RangeDescriptor::RangeDescriptor(UErrorCode &status) {
  535. if (U_FAILURE(status)) {
  536. return;
  537. }
  538. fIncludesSets = new UVector(status);
  539. if (fIncludesSets == nullptr) {
  540. status = U_MEMORY_ALLOCATION_ERROR;
  541. }
  542. }
  543. //-------------------------------------------------------------------------------------
  544. //
  545. // RangeDesriptor Destructor
  546. //
  547. //-------------------------------------------------------------------------------------
  548. RangeDescriptor::~RangeDescriptor() {
  549. delete fIncludesSets;
  550. fIncludesSets = nullptr;
  551. }
  552. //-------------------------------------------------------------------------------------
  553. //
  554. // RangeDesriptor::split()
  555. //
  556. //-------------------------------------------------------------------------------------
  557. void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
  558. U_ASSERT(where>fStartChar && where<=fEndChar);
  559. RangeDescriptor *nr = new RangeDescriptor(*this, status);
  560. if(nr == nullptr) {
  561. status = U_MEMORY_ALLOCATION_ERROR;
  562. return;
  563. }
  564. if (U_FAILURE(status)) {
  565. delete nr;
  566. return;
  567. }
  568. // RangeDescriptor copy constructor copies all fields.
  569. // Only need to update those that are different after the split.
  570. nr->fStartChar = where;
  571. this->fEndChar = where-1;
  572. nr->fNext = this->fNext;
  573. this->fNext = nr;
  574. }
  575. //-------------------------------------------------------------------------------------
  576. //
  577. // RangeDescriptor::isDictionaryRange
  578. //
  579. // Test whether this range includes characters from
  580. // the original Unicode Set named "dictionary".
  581. //
  582. // This function looks through the Unicode Sets that
  583. // the range includes, checking for one named "dictionary"
  584. //
  585. // TODO: a faster way would be to find the set node for
  586. // "dictionary" just once, rather than looking it
  587. // up by name every time.
  588. //
  589. //-------------------------------------------------------------------------------------
  590. bool RangeDescriptor::isDictionaryRange() {
  591. static const char16_t *dictionary = u"dictionary";
  592. for (int32_t i=0; i<fIncludesSets->size(); i++) {
  593. RBBINode* usetNode = static_cast<RBBINode*>(fIncludesSets->elementAt(i));
  594. RBBINode *setRef = usetNode->fParent;
  595. if (setRef != nullptr) {
  596. RBBINode *varRef = setRef->fParent;
  597. if (varRef && varRef->fType == RBBINode::varRef) {
  598. const UnicodeString *setName = &varRef->fText;
  599. if (setName->compare(dictionary, -1) == 0) {
  600. return true;
  601. }
  602. }
  603. }
  604. }
  605. return false;
  606. }
  607. U_NAMESPACE_END
  608. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */