123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- //
- // rbbisetb.cpp
- //
- /*
- ***************************************************************************
- * Copyright (C) 2002-2008 International Business Machines Corporation *
- * and others. All rights reserved. *
- ***************************************************************************
- */
- //
- // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
- // (part of the rule building process.)
- //
- // Starting with the rules parse tree from the scanner,
- //
- // - Enumerate the set of UnicodeSets that are referenced
- // by the RBBI rules.
- // - compute a set of non-overlapping character ranges
- // with all characters within a range belonging to the same
- // set of input unicode sets.
- // - Derive a set of non-overlapping UnicodeSet (like things)
- // that will correspond to columns in the state table for
- // the RBBI execution engine. All characters within one
- // of these sets belong to the same set of the original
- // UnicodeSets from the user's rules.
- // - construct the trie table that maps input characters
- // to the index of the matching non-overlapping set of set from
- // the previous step.
- //
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_BREAK_ITERATION
- #include "unicode/uniset.h"
- #include "uvector.h"
- #include "uassert.h"
- #include "cmemory.h"
- #include "cstring.h"
- #include "rbbisetb.h"
- #include "rbbinode.h"
- U_NAMESPACE_BEGIN
- const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
- //------------------------------------------------------------------------
- //
- // Constructor
- //
- //------------------------------------------------------------------------
- RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
- {
- fRB = rb;
- fStatus = rb->fStatus;
- fRangeList = nullptr;
- fMutableTrie = nullptr;
- fTrie = nullptr;
- fTrieSize = 0;
- fGroupCount = 0;
- fSawBOF = false;
- }
- //------------------------------------------------------------------------
- //
- // Destructor
- //
- //------------------------------------------------------------------------
- RBBISetBuilder::~RBBISetBuilder()
- {
- RangeDescriptor *nextRangeDesc;
- // Walk through & delete the linked list of RangeDescriptors
- for (nextRangeDesc = fRangeList; nextRangeDesc!=nullptr;) {
- RangeDescriptor *r = nextRangeDesc;
- nextRangeDesc = r->fNext;
- delete r;
- }
- ucptrie_close(fTrie);
- umutablecptrie_close(fMutableTrie);
- }
- //------------------------------------------------------------------------
- //
- // build Build the list of non-overlapping character ranges
- // from the Unicode Sets.
- //
- //------------------------------------------------------------------------
- void RBBISetBuilder::buildRanges() {
- RBBINode *usetNode;
- RangeDescriptor *rlRange;
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
- //
- // Initialize the process by creating a single range encompassing all characters
- // that is in no sets.
- //
- fRangeList = new RangeDescriptor(*fStatus); // will check for status here
- if (fRangeList == nullptr) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- fRangeList->fStartChar = 0;
- fRangeList->fEndChar = 0x10ffff;
- if (U_FAILURE(*fStatus)) {
- return;
- }
- //
- // Find the set of non-overlapping ranges of characters
- //
- int ni;
- for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
- usetNode = static_cast<RBBINode*>(this->fRB->fUSetNodes->elementAt(ni));
- if (usetNode==nullptr) {
- break;
- }
- UnicodeSet *inputSet = usetNode->fInputSet;
- int32_t inputSetRangeCount = inputSet->getRangeCount();
- int inputSetRangeIndex = 0;
- rlRange = fRangeList;
- for (;;) {
- if (inputSetRangeIndex >= inputSetRangeCount) {
- break;
- }
- UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
- UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
- // skip over ranges from the range list that are completely
- // below the current range from the input unicode set.
- while (rlRange->fEndChar < inputSetRangeBegin) {
- rlRange = rlRange->fNext;
- }
- // If the start of the range from the range list is before with
- // the start of the range from the unicode set, split the range list range
- // in two, with one part being before (wholly outside of) the unicode set
- // and the other containing the rest.
- // Then continue the loop; the post-split current range will then be skipped
- // over
- if (rlRange->fStartChar < inputSetRangeBegin) {
- rlRange->split(inputSetRangeBegin, *fStatus);
- if (U_FAILURE(*fStatus)) {
- return;
- }
- continue;
- }
- // Same thing at the end of the ranges...
- // If the end of the range from the range list doesn't coincide with
- // the end of the range from the unicode set, split the range list
- // range in two. The first part of the split range will be
- // wholly inside the Unicode set.
- if (rlRange->fEndChar > inputSetRangeEnd) {
- rlRange->split(inputSetRangeEnd+1, *fStatus);
- if (U_FAILURE(*fStatus)) {
- return;
- }
- }
- // The current rlRange is now entirely within the UnicodeSet range.
- // Add this unicode set to the list of sets for this rlRange
- if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
- rlRange->fIncludesSets->addElement(usetNode, *fStatus);
- if (U_FAILURE(*fStatus)) {
- return;
- }
- }
- // Advance over ranges that we are finished with.
- if (inputSetRangeEnd == rlRange->fEndChar) {
- inputSetRangeIndex++;
- }
- rlRange = rlRange->fNext;
- }
- }
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
- //
- // Group the above ranges, with each group consisting of one or more
- // ranges that are in exactly the same set of original UnicodeSets.
- // The groups are numbered, and these group numbers are the set of
- // input symbols recognized by the run-time state machine.
- //
- // Numbering: # 0 (state table column 0) is unused.
- // # 1 is reserved - table column 1 is for end-of-input
- // # 2 is reserved - table column 2 is for beginning-of-input
- // # 3 is the first range list.
- //
- RangeDescriptor *rlSearchRange;
- int32_t dictGroupCount = 0;
- for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
- for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
- if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
- rlRange->fNum = rlSearchRange->fNum;
- rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
- break;
- }
- }
- if (rlRange->fNum == 0) {
- rlRange->fFirstInGroup = true;
- if (rlRange->isDictionaryRange()) {
- rlRange->fNum = ++dictGroupCount;
- rlRange->fIncludesDict = true;
- } else {
- fGroupCount++;
- rlRange->fNum = fGroupCount+2;
- addValToSets(rlRange->fIncludesSets, rlRange->fNum);
- }
- }
- }
- // Move the character category numbers for any dictionary ranges up, so that they
- // immediately follow the non-dictionary ranges.
- fDictCategoriesStart = fGroupCount + 3;
- for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
- if (rlRange->fIncludesDict) {
- rlRange->fNum += fDictCategoriesStart - 1;
- if (rlRange->fFirstInGroup) {
- addValToSets(rlRange->fIncludesSets, rlRange->fNum);
- }
- }
- }
- fGroupCount += dictGroupCount;
- // Handle input sets that contain the special string {eof}.
- // Column 1 of the state table is reserved for EOF on input.
- // Column 2 is reserved for before-the-start-input.
- // (This column can be optimized away later if there are no rule
- // references to {bof}.)
- // Add this column value (1 or 2) to the equivalent expression
- // subtree for each UnicodeSet that contains the string {eof}
- // Because {bof} and {eof} are not characters in the normal sense,
- // they don't affect the computation of the ranges or TRIE.
- UnicodeString eofString(u"eof");
- UnicodeString bofString(u"bof");
- for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
- usetNode = static_cast<RBBINode*>(this->fRB->fUSetNodes->elementAt(ni));
- if (usetNode==nullptr) {
- break;
- }
- UnicodeSet *inputSet = usetNode->fInputSet;
- if (inputSet->contains(eofString)) {
- addValToSet(usetNode, 1);
- }
- if (inputSet->contains(bofString)) {
- addValToSet(usetNode, 2);
- fSawBOF = true;
- }
- }
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
- }
- //
- // Build the Trie table for mapping UChar32 values to the corresponding
- // range group number.
- //
- void RBBISetBuilder::buildTrie() {
- fMutableTrie = umutablecptrie_open(
- 0, // Initial value for all code points.
- 0, // Error value for out-of-range input.
- fStatus);
- for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
- umutablecptrie_setRange(fMutableTrie,
- range->fStartChar, // Range start
- range->fEndChar, // Range end (inclusive)
- range->fNum, // value for range
- fStatus);
- }
- }
- void RBBISetBuilder::mergeCategories(IntPair categories) {
- U_ASSERT(categories.first >= 1);
- U_ASSERT(categories.second > categories.first);
- U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
- (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
- for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
- int32_t rangeNum = rd->fNum;
- if (rangeNum == categories.second) {
- rd->fNum = categories.first;
- } else if (rangeNum > categories.second) {
- rd->fNum--;
- }
- }
- --fGroupCount;
- if (categories.second <= fDictCategoriesStart) {
- --fDictCategoriesStart;
- }
- }
- //-----------------------------------------------------------------------------------
- //
- // getTrieSize() Return the size that will be required to serialize the Trie.
- //
- //-----------------------------------------------------------------------------------
- int32_t RBBISetBuilder::getTrieSize() {
- if (U_FAILURE(*fStatus)) {
- return 0;
- }
- if (fTrie == nullptr) {
- bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
- fTrie = umutablecptrie_buildImmutable(
- fMutableTrie,
- UCPTRIE_TYPE_FAST,
- use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
- fStatus);
- fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
- if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
- *fStatus = U_ZERO_ERROR;
- }
- }
- return fTrieSize;
- }
- //-----------------------------------------------------------------------------------
- //
- // serializeTrie() Put the serialized trie at the specified address.
- // Trust the caller to have given us enough memory.
- // getTrieSize() MUST be called first.
- //
- //-----------------------------------------------------------------------------------
- void RBBISetBuilder::serializeTrie(uint8_t *where) {
- ucptrie_toBinary(fTrie,
- where, // Buffer
- fTrieSize, // Capacity
- fStatus);
- }
- //------------------------------------------------------------------------
- //
- // addValToSets Add a runtime-mapped input value to each uset from a
- // list of uset nodes. (val corresponds to a state table column.)
- // For each of the original Unicode sets - which correspond
- // directly to uset nodes - a logically equivalent expression
- // is constructed in terms of the remapped runtime input
- // symbol set. This function adds one runtime input symbol to
- // a list of sets.
- //
- // The "logically equivalent expression" is the tree for an
- // or-ing together of all of the symbols that go into the set.
- //
- //------------------------------------------------------------------------
- void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
- int32_t ix;
- for (ix=0; ix<sets->size(); ix++) {
- RBBINode* usetNode = static_cast<RBBINode*>(sets->elementAt(ix));
- addValToSet(usetNode, val);
- }
- }
- void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
- RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
- if (leafNode == nullptr) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- leafNode->fVal = static_cast<unsigned short>(val);
- if (usetNode->fLeftChild == nullptr) {
- usetNode->fLeftChild = leafNode;
- leafNode->fParent = usetNode;
- } else {
- // There are already input symbols present for this set.
- // Set up an OR node, with the previous stuff as the left child
- // and the new value as the right child.
- RBBINode *orNode = new RBBINode(RBBINode::opOr);
- if (orNode == nullptr) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- orNode->fLeftChild = usetNode->fLeftChild;
- orNode->fRightChild = leafNode;
- orNode->fLeftChild->fParent = orNode;
- orNode->fRightChild->fParent = orNode;
- usetNode->fLeftChild = orNode;
- orNode->fParent = usetNode;
- }
- }
- //------------------------------------------------------------------------
- //
- // getNumCharCategories
- //
- //------------------------------------------------------------------------
- int32_t RBBISetBuilder::getNumCharCategories() const {
- return fGroupCount + 3;
- }
- //------------------------------------------------------------------------
- //
- // getDictCategoriesStart
- //
- //------------------------------------------------------------------------
- int32_t RBBISetBuilder::getDictCategoriesStart() const {
- return fDictCategoriesStart;
- }
- //------------------------------------------------------------------------
- //
- // sawBOF
- //
- //------------------------------------------------------------------------
- UBool RBBISetBuilder::sawBOF() const {
- return fSawBOF;
- }
- //------------------------------------------------------------------------
- //
- // getFirstChar Given a runtime RBBI character category, find
- // the first UChar32 that is in the set of chars
- // in the category.
- //------------------------------------------------------------------------
- UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
- RangeDescriptor *rlRange;
- UChar32 retVal = static_cast<UChar32>(-1);
- for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
- if (rlRange->fNum == category) {
- retVal = rlRange->fStartChar;
- break;
- }
- }
- return retVal;
- }
- //------------------------------------------------------------------------
- //
- // printRanges A debugging function.
- // dump out all of the range definitions.
- //
- //------------------------------------------------------------------------
- #ifdef RBBI_DEBUG
- void RBBISetBuilder::printRanges() {
- RangeDescriptor *rlRange;
- int i;
- RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
- for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
- RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);
- for (i=0; i<rlRange->fIncludesSets->size(); i++) {
- RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
- UnicodeString setName {u"anon"};
- RBBINode *setRef = usetNode->fParent;
- if (setRef != nullptr) {
- RBBINode *varRef = setRef->fParent;
- if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
- setName = varRef->fText;
- }
- }
- RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
- }
- RBBIDebugPrintf("\n");
- }
- }
- #endif
- //------------------------------------------------------------------------
- //
- // printRangeGroups A debugging function.
- // dump out all of the range groups.
- //
- //------------------------------------------------------------------------
- #ifdef RBBI_DEBUG
- void RBBISetBuilder::printRangeGroups() {
- int i;
- RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
- for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
- if (rlRange->fFirstInGroup) {
- int groupNum = rlRange->fNum;
- RBBIDebugPrintf("%2i ", groupNum);
- if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
- for (i=0; i<rlRange->fIncludesSets->size(); i++) {
- RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
- UnicodeString setName = UNICODE_STRING("anon", 4);
- RBBINode *setRef = usetNode->fParent;
- if (setRef != nullptr) {
- RBBINode *varRef = setRef->fParent;
- if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
- setName = varRef->fText;
- }
- }
- RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
- }
- i = 0;
- for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
- if (tRange->fNum == rlRange->fNum) {
- if (i++ % 5 == 0) {
- RBBIDebugPrintf("\n ");
- }
- RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
- }
- }
- RBBIDebugPrintf("\n");
- }
- }
- RBBIDebugPrintf("\n");
- }
- #endif
- //------------------------------------------------------------------------
- //
- // printSets A debugging function.
- // dump out all of the set definitions.
- //
- //------------------------------------------------------------------------
- #ifdef RBBI_DEBUG
- void RBBISetBuilder::printSets() {
- int i;
- RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
- for (i=0; ; i++) {
- RBBINode *usetNode;
- RBBINode *setRef;
- RBBINode *varRef;
- UnicodeString setName;
- usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
- if (usetNode == nullptr) {
- break;
- }
- RBBIDebugPrintf("%3d ", i);
- setName = UNICODE_STRING("anonymous", 9);
- setRef = usetNode->fParent;
- if (setRef != nullptr) {
- varRef = setRef->fParent;
- if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
- setName = varRef->fText;
- }
- }
- RBBI_DEBUG_printUnicodeString(setName);
- RBBIDebugPrintf(" ");
- RBBI_DEBUG_printUnicodeString(usetNode->fText);
- RBBIDebugPrintf("\n");
- if (usetNode->fLeftChild != nullptr) {
- RBBINode::printTree(usetNode->fLeftChild, true);
- }
- }
- RBBIDebugPrintf("\n");
- }
- #endif
- //-------------------------------------------------------------------------------------
- //
- // RangeDescriptor copy constructor
- //
- //-------------------------------------------------------------------------------------
- RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
- fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
- fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
- if (U_FAILURE(status)) {
- return;
- }
- fIncludesSets = new UVector(status);
- if (this->fIncludesSets == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- if (U_FAILURE(status)) {
- return;
- }
- for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
- this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
- }
- }
- //-------------------------------------------------------------------------------------
- //
- // RangeDesriptor default constructor
- //
- //-------------------------------------------------------------------------------------
- RangeDescriptor::RangeDescriptor(UErrorCode &status) {
- if (U_FAILURE(status)) {
- return;
- }
- fIncludesSets = new UVector(status);
- if (fIncludesSets == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- //-------------------------------------------------------------------------------------
- //
- // RangeDesriptor Destructor
- //
- //-------------------------------------------------------------------------------------
- RangeDescriptor::~RangeDescriptor() {
- delete fIncludesSets;
- fIncludesSets = nullptr;
- }
- //-------------------------------------------------------------------------------------
- //
- // RangeDesriptor::split()
- //
- //-------------------------------------------------------------------------------------
- void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
- U_ASSERT(where>fStartChar && where<=fEndChar);
- RangeDescriptor *nr = new RangeDescriptor(*this, status);
- if(nr == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- if (U_FAILURE(status)) {
- delete nr;
- return;
- }
- // RangeDescriptor copy constructor copies all fields.
- // Only need to update those that are different after the split.
- nr->fStartChar = where;
- this->fEndChar = where-1;
- nr->fNext = this->fNext;
- this->fNext = nr;
- }
- //-------------------------------------------------------------------------------------
- //
- // RangeDescriptor::isDictionaryRange
- //
- // Test whether this range includes characters from
- // the original Unicode Set named "dictionary".
- //
- // This function looks through the Unicode Sets that
- // the range includes, checking for one named "dictionary"
- //
- // TODO: a faster way would be to find the set node for
- // "dictionary" just once, rather than looking it
- // up by name every time.
- //
- //-------------------------------------------------------------------------------------
- bool RangeDescriptor::isDictionaryRange() {
- static const char16_t *dictionary = u"dictionary";
- for (int32_t i=0; i<fIncludesSets->size(); i++) {
- RBBINode* usetNode = static_cast<RBBINode*>(fIncludesSets->elementAt(i));
- RBBINode *setRef = usetNode->fParent;
- if (setRef != nullptr) {
- RBBINode *varRef = setRef->fParent;
- if (varRef && varRef->fType == RBBINode::varRef) {
- const UnicodeString *setName = &varRef->fText;
- if (setName->compare(dictionary, -1) == 0) {
- return true;
- }
- }
- }
- }
- return false;
- }
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|