123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- //
- // file: rbbirb.cpp
- //
- // Copyright (C) 2002-2011, International Business Machines Corporation and others.
- // All Rights Reserved.
- //
- // This file contains the RBBIRuleBuilder class implementation. This is the main class for
- // building (compiling) break rules into the tables required by the runtime
- // RBBI engine.
- //
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_BREAK_ITERATION
- #include "unicode/brkiter.h"
- #include "unicode/rbbi.h"
- #include "unicode/ubrk.h"
- #include "unicode/unistr.h"
- #include "unicode/uniset.h"
- #include "unicode/uchar.h"
- #include "unicode/uchriter.h"
- #include "unicode/ustring.h"
- #include "unicode/parsepos.h"
- #include "unicode/parseerr.h"
- #include "cmemory.h"
- #include "cstring.h"
- #include "rbbirb.h"
- #include "rbbinode.h"
- #include "rbbiscan.h"
- #include "rbbisetb.h"
- #include "rbbitblb.h"
- #include "rbbidata.h"
- #include "uassert.h"
- U_NAMESPACE_BEGIN
- //----------------------------------------------------------------------------------------
- //
- // Constructor.
- //
- //----------------------------------------------------------------------------------------
- RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
- UParseError *parseErr,
- UErrorCode &status)
- : fRules(rules), fStrippedRules(rules)
- {
- fStatus = &status; // status is checked below
- fParseError = parseErr;
- fDebugEnv = nullptr;
- #ifdef RBBI_DEBUG
- fDebugEnv = getenv("U_RBBIDEBUG");
- #endif
- fForwardTree = nullptr;
- fReverseTree = nullptr;
- fSafeFwdTree = nullptr;
- fSafeRevTree = nullptr;
- fDefaultTree = &fForwardTree;
- fForwardTable = nullptr;
- fRuleStatusVals = nullptr;
- fChainRules = false;
- fLBCMNoChain = false;
- fLookAheadHardBreak = false;
- fUSetNodes = nullptr;
- fRuleStatusVals = nullptr;
- fScanner = nullptr;
- fSetBuilder = nullptr;
- if (parseErr) {
- uprv_memset(parseErr, 0, sizeof(UParseError));
- }
- if (U_FAILURE(status)) {
- return;
- }
- fUSetNodes = new UVector(status); // bcos status gets overwritten here
- fRuleStatusVals = new UVector(status);
- fScanner = new RBBIRuleScanner(this);
- fSetBuilder = new RBBISetBuilder(this);
- if (U_FAILURE(status)) {
- return;
- }
- if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- //----------------------------------------------------------------------------------------
- //
- // Destructor
- //
- //----------------------------------------------------------------------------------------
- RBBIRuleBuilder::~RBBIRuleBuilder() {
- int i;
- for (i=0; ; i++) {
- RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
- if (n==nullptr) {
- break;
- }
- delete n;
- }
- delete fUSetNodes;
- delete fSetBuilder;
- delete fForwardTable;
- delete fForwardTree;
- delete fReverseTree;
- delete fSafeFwdTree;
- delete fSafeRevTree;
- delete fScanner;
- delete fRuleStatusVals;
- }
- //----------------------------------------------------------------------------------------
- //
- // flattenData() - Collect up the compiled RBBI rule data and put it into
- // the format for saving in ICU data files,
- // which is also the format needed by the RBBI runtime engine.
- //
- //----------------------------------------------------------------------------------------
// Rounds i up to the next multiple of 8 (values that are already a multiple of 8
// are returned unchanged).  The mask is written as ~7 so the whole computation
// stays in signed arithmetic instead of round-tripping through unsigned int.
static int32_t align8(int32_t i) {
    return (i + 7) & ~7;
}
- RBBIDataHeader *RBBIRuleBuilder::flattenData() {
- int32_t i;
- if (U_FAILURE(*fStatus)) {
- return nullptr;
- }
- // Remove whitespace from the rules to make it smaller.
- // The rule parser has already removed comments.
- fStrippedRules = fScanner->stripRules(fStrippedRules);
- // Calculate the size of each section in the data.
- // Sizes here are padded up to a multiple of 8 for better memory alignment.
- // Sections sizes actually stored in the header are for the actual data
- // without the padding.
- //
- int32_t headerSize = align8(sizeof(RBBIDataHeader));
- int32_t forwardTableSize = align8(fForwardTable->getTableSize());
- int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
- int32_t trieSize = align8(fSetBuilder->getTrieSize());
- int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
- int32_t rulesLengthInUTF8 = 0;
- u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
- fStrippedRules.getBuffer(), fStrippedRules.length(),
- 0xfffd, nullptr, fStatus);
- *fStatus = U_ZERO_ERROR;
- int32_t rulesSize = align8((rulesLengthInUTF8+1));
- int32_t totalSize = headerSize
- + forwardTableSize
- + reverseTableSize
- + statusTableSize + trieSize + rulesSize;
- #ifdef RBBI_DEBUG
- if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
- RBBIDebugPrintf("Header Size: %8d\n", headerSize);
- RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
- RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
- RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
- RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
- RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
- RBBIDebugPrintf("-----------------------------\n");
- RBBIDebugPrintf("Total Size: %8d\n", totalSize);
- }
- #endif
- RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
- if (data == nullptr) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- uprv_memset(data, 0, totalSize);
- data->fMagic = 0xb1a0;
- data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
- data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
- data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
- data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
- data->fLength = totalSize;
- data->fCatCount = fSetBuilder->getNumCharCategories();
- data->fFTable = headerSize;
- data->fFTableLen = forwardTableSize;
- data->fRTable = data->fFTable + data->fFTableLen;
- data->fRTableLen = reverseTableSize;
- data->fTrie = data->fRTable + data->fRTableLen;
- data->fTrieLen = trieSize;
- data->fStatusTable = data->fTrie + data->fTrieLen;
- data->fStatusTableLen= statusTableSize;
- data->fRuleSource = data->fStatusTable + statusTableSize;
- data->fRuleSourceLen = rulesLengthInUTF8;
- uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
- fForwardTable->exportTable((uint8_t *)data + data->fFTable);
- fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
- fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
- int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
- for (i=0; i<fRuleStatusVals->size(); i++) {
- ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
- }
- u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
- fStrippedRules.getBuffer(), fStrippedRules.length(),
- 0xfffd, nullptr, fStatus);
- if (U_FAILURE(*fStatus)) {
- return nullptr;
- }
- return data;
- }
- //----------------------------------------------------------------------------------------
- //
- // createRuleBasedBreakIterator construct from source rules that are passed in
- // in a UnicodeString
- //
- //----------------------------------------------------------------------------------------
- BreakIterator *
- RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
- UParseError *parseError,
- UErrorCode &status)
- {
- //
- // Read the input rules, generate a parse tree, symbol table,
- // and list of all Unicode Sets referenced by the rules.
- //
- RBBIRuleBuilder builder(rules, parseError, status);
- if (U_FAILURE(status)) { // status checked here bcos build below doesn't
- return nullptr;
- }
- RBBIDataHeader *data = builder.build(status);
- if (U_FAILURE(status)) {
- return nullptr;
- }
- //
- // Create a break iterator from the compiled rules.
- // (Identical to creation from stored pre-compiled rules)
- //
- // status is checked after init in construction.
- RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
- if (U_FAILURE(status)) {
- delete This;
- This = nullptr;
- }
- else if(This == nullptr) { // test for nullptr
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- return This;
- }
- RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
- if (U_FAILURE(status)) {
- return nullptr;
- }
- fScanner->parse();
- if (U_FAILURE(status)) {
- return nullptr;
- }
- //
- // UnicodeSet processing.
- // Munge the Unicode Sets to create an initial set of character categories.
- //
- fSetBuilder->buildRanges();
- //
- // Generate the DFA state transition table.
- //
- fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
- if (fForwardTable == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- fForwardTable->buildForwardTable();
- // State table and character category optimization.
- // Merge equivalent rows and columns.
- // Note that this process alters the initial set of character categories,
- // causing the representation of UnicodeSets in the parse tree to become invalid.
- optimizeTables();
- fForwardTable->buildSafeReverseTable(status);
- #ifdef RBBI_DEBUG
- if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
- fForwardTable->printStates();
- fForwardTable->printRuleStatusTable();
- fForwardTable->printReverseTable();
- }
- #endif
- // Generate the mapping tables (TRIE) from input code points to
- // the character categories.
- //
- fSetBuilder->buildTrie();
- //
- // Package up the compiled data into a memory image
- // in the run-time format.
- //
- RBBIDataHeader *data = flattenData(); // returns nullptr if error
- if (U_FAILURE(status)) {
- return nullptr;
- }
- return data;
- }
- void RBBIRuleBuilder::optimizeTables() {
- bool didSomething;
- do {
- didSomething = false;
- // Begin looking for duplicates with char class 3.
- // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
- // and should not have other categories merged into them.
- IntPair duplPair = {3, 0};
- while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
- fSetBuilder->mergeCategories(duplPair);
- fForwardTable->removeColumn(duplPair.second);
- didSomething = true;
- }
- while (fForwardTable->removeDuplicateStates() > 0) {
- didSomething = true;
- }
- } while (didSomething);
- }
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|