rbbirb.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // file: rbbirb.cpp
  5. //
  6. // Copyright (C) 2002-2011, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains the RBBIRuleBuilder class implementation. This is the main class for
  10. // building (compiling) break rules into the tables required by the runtime
  11. // RBBI engine.
  12. //
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_BREAK_ITERATION
  15. #include "unicode/brkiter.h"
  16. #include "unicode/rbbi.h"
  17. #include "unicode/ubrk.h"
  18. #include "unicode/unistr.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/uchar.h"
  21. #include "unicode/uchriter.h"
  22. #include "unicode/ustring.h"
  23. #include "unicode/parsepos.h"
  24. #include "unicode/parseerr.h"
  25. #include "cmemory.h"
  26. #include "cstring.h"
  27. #include "rbbirb.h"
  28. #include "rbbinode.h"
  29. #include "rbbiscan.h"
  30. #include "rbbisetb.h"
  31. #include "rbbitblb.h"
  32. #include "rbbidata.h"
  33. #include "uassert.h"
  34. U_NAMESPACE_BEGIN
  35. //----------------------------------------------------------------------------------------
  36. //
  37. // Constructor.
  38. //
  39. //----------------------------------------------------------------------------------------
  40. RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
  41. UParseError *parseErr,
  42. UErrorCode &status)
  43. : fRules(rules), fStrippedRules(rules)
  44. {
  45. fStatus = &status; // status is checked below
  46. fParseError = parseErr;
  47. fDebugEnv = nullptr;
  48. #ifdef RBBI_DEBUG
  49. fDebugEnv = getenv("U_RBBIDEBUG");
  50. #endif
  51. fForwardTree = nullptr;
  52. fReverseTree = nullptr;
  53. fSafeFwdTree = nullptr;
  54. fSafeRevTree = nullptr;
  55. fDefaultTree = &fForwardTree;
  56. fForwardTable = nullptr;
  57. fRuleStatusVals = nullptr;
  58. fChainRules = false;
  59. fLBCMNoChain = false;
  60. fLookAheadHardBreak = false;
  61. fUSetNodes = nullptr;
  62. fRuleStatusVals = nullptr;
  63. fScanner = nullptr;
  64. fSetBuilder = nullptr;
  65. if (parseErr) {
  66. uprv_memset(parseErr, 0, sizeof(UParseError));
  67. }
  68. if (U_FAILURE(status)) {
  69. return;
  70. }
  71. fUSetNodes = new UVector(status); // bcos status gets overwritten here
  72. fRuleStatusVals = new UVector(status);
  73. fScanner = new RBBIRuleScanner(this);
  74. fSetBuilder = new RBBISetBuilder(this);
  75. if (U_FAILURE(status)) {
  76. return;
  77. }
  78. if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
  79. status = U_MEMORY_ALLOCATION_ERROR;
  80. }
  81. }
  82. //----------------------------------------------------------------------------------------
  83. //
  84. // Destructor
  85. //
  86. //----------------------------------------------------------------------------------------
  87. RBBIRuleBuilder::~RBBIRuleBuilder() {
  88. int i;
  89. for (i=0; ; i++) {
  90. RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
  91. if (n==nullptr) {
  92. break;
  93. }
  94. delete n;
  95. }
  96. delete fUSetNodes;
  97. delete fSetBuilder;
  98. delete fForwardTable;
  99. delete fForwardTree;
  100. delete fReverseTree;
  101. delete fSafeFwdTree;
  102. delete fSafeRevTree;
  103. delete fScanner;
  104. delete fRuleStatusVals;
  105. }
  106. //----------------------------------------------------------------------------------------
  107. //
  108. // flattenData() - Collect up the compiled RBBI rule data and put it into
  109. // the format for saving in ICU data files,
  110. // which is also the format needed by the RBBI runtime engine.
  111. //
  112. //----------------------------------------------------------------------------------------
  113. static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
  114. RBBIDataHeader *RBBIRuleBuilder::flattenData() {
  115. int32_t i;
  116. if (U_FAILURE(*fStatus)) {
  117. return nullptr;
  118. }
  119. // Remove whitespace from the rules to make it smaller.
  120. // The rule parser has already removed comments.
  121. fStrippedRules = fScanner->stripRules(fStrippedRules);
  122. // Calculate the size of each section in the data.
  123. // Sizes here are padded up to a multiple of 8 for better memory alignment.
  124. // Sections sizes actually stored in the header are for the actual data
  125. // without the padding.
  126. //
  127. int32_t headerSize = align8(sizeof(RBBIDataHeader));
  128. int32_t forwardTableSize = align8(fForwardTable->getTableSize());
  129. int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
  130. int32_t trieSize = align8(fSetBuilder->getTrieSize());
  131. int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
  132. int32_t rulesLengthInUTF8 = 0;
  133. u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
  134. fStrippedRules.getBuffer(), fStrippedRules.length(),
  135. 0xfffd, nullptr, fStatus);
  136. *fStatus = U_ZERO_ERROR;
  137. int32_t rulesSize = align8((rulesLengthInUTF8+1));
  138. int32_t totalSize = headerSize
  139. + forwardTableSize
  140. + reverseTableSize
  141. + statusTableSize + trieSize + rulesSize;
  142. #ifdef RBBI_DEBUG
  143. if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
  144. RBBIDebugPrintf("Header Size: %8d\n", headerSize);
  145. RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
  146. RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
  147. RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
  148. RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
  149. RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
  150. RBBIDebugPrintf("-----------------------------\n");
  151. RBBIDebugPrintf("Total Size: %8d\n", totalSize);
  152. }
  153. #endif
  154. RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
  155. if (data == nullptr) {
  156. *fStatus = U_MEMORY_ALLOCATION_ERROR;
  157. return nullptr;
  158. }
  159. uprv_memset(data, 0, totalSize);
  160. data->fMagic = 0xb1a0;
  161. data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
  162. data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
  163. data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
  164. data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
  165. data->fLength = totalSize;
  166. data->fCatCount = fSetBuilder->getNumCharCategories();
  167. data->fFTable = headerSize;
  168. data->fFTableLen = forwardTableSize;
  169. data->fRTable = data->fFTable + data->fFTableLen;
  170. data->fRTableLen = reverseTableSize;
  171. data->fTrie = data->fRTable + data->fRTableLen;
  172. data->fTrieLen = trieSize;
  173. data->fStatusTable = data->fTrie + data->fTrieLen;
  174. data->fStatusTableLen= statusTableSize;
  175. data->fRuleSource = data->fStatusTable + statusTableSize;
  176. data->fRuleSourceLen = rulesLengthInUTF8;
  177. uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
  178. fForwardTable->exportTable((uint8_t *)data + data->fFTable);
  179. fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
  180. fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
  181. int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
  182. for (i=0; i<fRuleStatusVals->size(); i++) {
  183. ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
  184. }
  185. u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
  186. fStrippedRules.getBuffer(), fStrippedRules.length(),
  187. 0xfffd, nullptr, fStatus);
  188. if (U_FAILURE(*fStatus)) {
  189. return nullptr;
  190. }
  191. return data;
  192. }
  193. //----------------------------------------------------------------------------------------
  194. //
  195. // createRuleBasedBreakIterator construct from source rules that are passed in
  196. // in a UnicodeString
  197. //
  198. //----------------------------------------------------------------------------------------
  199. BreakIterator *
  200. RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
  201. UParseError *parseError,
  202. UErrorCode &status)
  203. {
  204. //
  205. // Read the input rules, generate a parse tree, symbol table,
  206. // and list of all Unicode Sets referenced by the rules.
  207. //
  208. RBBIRuleBuilder builder(rules, parseError, status);
  209. if (U_FAILURE(status)) { // status checked here bcos build below doesn't
  210. return nullptr;
  211. }
  212. RBBIDataHeader *data = builder.build(status);
  213. if (U_FAILURE(status)) {
  214. return nullptr;
  215. }
  216. //
  217. // Create a break iterator from the compiled rules.
  218. // (Identical to creation from stored pre-compiled rules)
  219. //
  220. // status is checked after init in construction.
  221. RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
  222. if (U_FAILURE(status)) {
  223. delete This;
  224. This = nullptr;
  225. }
  226. else if(This == nullptr) { // test for nullptr
  227. status = U_MEMORY_ALLOCATION_ERROR;
  228. }
  229. return This;
  230. }
  231. RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
  232. if (U_FAILURE(status)) {
  233. return nullptr;
  234. }
  235. fScanner->parse();
  236. if (U_FAILURE(status)) {
  237. return nullptr;
  238. }
  239. //
  240. // UnicodeSet processing.
  241. // Munge the Unicode Sets to create an initial set of character categories.
  242. //
  243. fSetBuilder->buildRanges();
  244. //
  245. // Generate the DFA state transition table.
  246. //
  247. fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
  248. if (fForwardTable == nullptr) {
  249. status = U_MEMORY_ALLOCATION_ERROR;
  250. return nullptr;
  251. }
  252. fForwardTable->buildForwardTable();
  253. // State table and character category optimization.
  254. // Merge equivalent rows and columns.
  255. // Note that this process alters the initial set of character categories,
  256. // causing the representation of UnicodeSets in the parse tree to become invalid.
  257. optimizeTables();
  258. fForwardTable->buildSafeReverseTable(status);
  259. #ifdef RBBI_DEBUG
  260. if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
  261. fForwardTable->printStates();
  262. fForwardTable->printRuleStatusTable();
  263. fForwardTable->printReverseTable();
  264. }
  265. #endif
  266. // Generate the mapping tables (TRIE) from input code points to
  267. // the character categories.
  268. //
  269. fSetBuilder->buildTrie();
  270. //
  271. // Package up the compiled data into a memory image
  272. // in the run-time format.
  273. //
  274. RBBIDataHeader *data = flattenData(); // returns nullptr if error
  275. if (U_FAILURE(status)) {
  276. return nullptr;
  277. }
  278. return data;
  279. }
  280. void RBBIRuleBuilder::optimizeTables() {
  281. bool didSomething;
  282. do {
  283. didSomething = false;
  284. // Begin looking for duplicates with char class 3.
  285. // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
  286. // and should not have other categories merged into them.
  287. IntPair duplPair = {3, 0};
  288. while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
  289. fSetBuilder->mergeCategories(duplPair);
  290. fForwardTable->removeColumn(duplPair.second);
  291. didSomething = true;
  292. }
  293. while (fForwardTable->removeDuplicateStates() > 0) {
  294. didSomething = true;
  295. }
  296. } while (didSomething);
  297. }
  298. U_NAMESPACE_END
  299. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */