rbbiscan.cpp 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // file: rbbiscan.cpp
  5. //
  6. // Copyright (C) 2002-2016, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains the Rule Based Break Iterator Rule Builder functions for
  10. // scanning the rules and assembling a parse tree. This is the first phase
  11. // of compiling the rules.
  12. //
// The overall compilation of the rules is managed by class RBBIRuleBuilder, which will
  14. // create and use an instance of this class as part of the process.
  15. //
  16. #include "unicode/utypes.h"
  17. #if !UCONFIG_NO_BREAK_ITERATION
  18. #include "unicode/unistr.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/uchar.h"
  21. #include "unicode/uchriter.h"
  22. #include "unicode/parsepos.h"
  23. #include "unicode/parseerr.h"
  24. #include "cmemory.h"
  25. #include "cstring.h"
  26. #include "rbbirpt.h" // Contains state table for the rbbi rules parser.
  27. // generated by a Perl script.
  28. #include "rbbirb.h"
  29. #include "rbbinode.h"
  30. #include "rbbiscan.h"
  31. #include "rbbitblb.h"
  32. #include "uassert.h"
  33. //------------------------------------------------------------------------------
  34. //
  35. // Unicode Set init strings for each of the character classes needed for parsing a rule file.
  36. // (Initialized with hex values for portability to EBCDIC based machines.
  37. // Really ugly, but there's no good way to avoid it.)
  38. //
  39. // The sets are referred to by name in the rbbirpt.txt, which is the
  40. // source form of the state transition table for the RBBI rule parser.
  41. //
  42. //------------------------------------------------------------------------------
  43. static const char16_t gRuleSet_rule_char_pattern[] = {
  44. // Characters that may appear as literals in patterns without escaping or quoting.
  45. // [ ^ [ \ p { Z } \ u 0 0 2 0
  46. 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
  47. // - \ u 0 0 7 f ] - [ \ p
  48. 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
  49. // { L } ] - [ \ p { N } ] ]
  50. 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
  51. static const char16_t gRuleSet_name_char_pattern[] = {
  52. // [ _ \ p { L } \ p { N } ]
  53. 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
  54. static const char16_t gRuleSet_digit_char_pattern[] = {
  55. // [ 0 - 9 ]
  56. 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
  57. static const char16_t gRuleSet_name_start_char_pattern[] = {
  58. // [ _ \ p { L } ]
  59. 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
  60. static const char16_t kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
  61. U_CDECL_BEGIN
  62. static void U_CALLCONV RBBISetTable_deleter(void *p) {
  63. icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
  64. delete px->key;
  65. // Note: px->val is owned by the linked list "fSetsListHead" in scanner.
  66. // Don't delete the value nodes here.
  67. uprv_free(px);
  68. }
  69. U_CDECL_END
  70. U_NAMESPACE_BEGIN
  71. //------------------------------------------------------------------------------
  72. //
  73. // Constructor.
  74. //
  75. //------------------------------------------------------------------------------
  76. RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
  77. {
  78. fRB = rb;
  79. fScanIndex = 0;
  80. fNextIndex = 0;
  81. fQuoteMode = false;
  82. fLineNum = 1;
  83. fCharNum = 0;
  84. fLastChar = 0;
  85. fStateTable = nullptr;
  86. fStack[0] = 0;
  87. fStackPtr = 0;
  88. fNodeStack[0] = nullptr;
  89. fNodeStackPtr = 0;
  90. fReverseRule = false;
  91. fLookAheadRule = false;
  92. fNoChainInRule = false;
  93. fSymbolTable = nullptr;
  94. fSetTable = nullptr;
  95. fRuleNum = 0;
  96. fOptionStart = 0;
  97. // Do not check status until after all critical fields are sufficiently initialized
  98. // that the destructor can run cleanly.
  99. if (U_FAILURE(*rb->fStatus)) {
  100. return;
  101. }
  102. //
  103. // Set up the constant Unicode Sets.
  104. // Note: These could be made static, lazily initialized, and shared among
  105. // all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
  106. // and the time to build these few sets should be small compared to a
  107. // full break iterator build.
  108. fRuleSets[kRuleSet_rule_char-128]
  109. = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
  110. // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
  111. fRuleSets[kRuleSet_white_space-128].
  112. add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
  113. fRuleSets[kRuleSet_name_char-128]
  114. = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
  115. fRuleSets[kRuleSet_name_start_char-128]
  116. = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
  117. fRuleSets[kRuleSet_digit_char-128]
  118. = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
  119. if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
  120. // This case happens if ICU's data is missing. UnicodeSet tries to look up property
  121. // names from the init string, can't find them, and claims an illegal argument.
  122. // Change the error so that the actual problem will be clearer to users.
  123. *rb->fStatus = U_BRK_INIT_ERROR;
  124. }
  125. if (U_FAILURE(*rb->fStatus)) {
  126. return;
  127. }
  128. fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
  129. if (fSymbolTable == nullptr) {
  130. *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
  131. return;
  132. }
  133. fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus);
  134. if (U_FAILURE(*rb->fStatus)) {
  135. return;
  136. }
  137. uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
  138. }
  139. //------------------------------------------------------------------------------
  140. //
  141. // Destructor
  142. //
  143. //------------------------------------------------------------------------------
  144. RBBIRuleScanner::~RBBIRuleScanner() {
  145. delete fSymbolTable;
  146. if (fSetTable != nullptr) {
  147. uhash_close(fSetTable);
  148. fSetTable = nullptr;
  149. }
  150. // Node Stack.
  151. // Normally has one entry, which is the entire parse tree for the rules.
  152. // If errors occurred, there may be additional subtrees left on the stack.
  153. while (fNodeStackPtr > 0) {
  154. delete fNodeStack[fNodeStackPtr];
  155. fNodeStackPtr--;
  156. }
  157. }
  158. //------------------------------------------------------------------------------
  159. //
  160. // doParseAction Do some action during rule parsing.
  161. // Called by the parse state machine.
  162. // Actions build the parse tree and Unicode Sets,
  163. // and maintain the parse stack for nested expressions.
  164. //
  165. // TODO: unify EParseAction and RBBI_RuleParseAction enum types.
  166. // They represent exactly the same thing. They're separate
  167. // only to work around enum forward declaration restrictions
  168. // in some compilers, while at the same time avoiding multiple
  169. // definitions problems. I'm sure that there's a better way.
  170. //
  171. //------------------------------------------------------------------------------
  172. UBool RBBIRuleScanner::doParseActions(int32_t action)
  173. {
  174. RBBINode *n = nullptr;
  175. UBool returnVal = true;
  176. switch (action) {
  177. case doExprStart:
  178. pushNewNode(RBBINode::opStart);
  179. fRuleNum++;
  180. break;
  181. case doNoChain:
  182. // Scanned a '^' while on the rule start state.
  183. fNoChainInRule = true;
  184. break;
  185. case doExprOrOperator:
  186. {
  187. fixOpStack(RBBINode::precOpCat);
  188. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  189. RBBINode *orNode = pushNewNode(RBBINode::opOr);
  190. if (U_FAILURE(*fRB->fStatus)) {
  191. break;
  192. }
  193. orNode->fLeftChild = operandNode;
  194. operandNode->fParent = orNode;
  195. }
  196. break;
  197. case doExprCatOperator:
  198. // concatenation operator.
  199. // For the implicit concatenation of adjacent terms in an expression that are
  200. // not separated by any other operator. Action is invoked between the
  201. // actions for the two terms.
  202. {
  203. fixOpStack(RBBINode::precOpCat);
  204. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  205. RBBINode *catNode = pushNewNode(RBBINode::opCat);
  206. if (U_FAILURE(*fRB->fStatus)) {
  207. break;
  208. }
  209. catNode->fLeftChild = operandNode;
  210. operandNode->fParent = catNode;
  211. }
  212. break;
  213. case doLParen:
  214. // Open Paren.
  215. // The openParen node is a dummy operation type with a low precedence,
  216. // which has the affect of ensuring that any real binary op that
  217. // follows within the parens binds more tightly to the operands than
  218. // stuff outside of the parens.
  219. pushNewNode(RBBINode::opLParen);
  220. break;
  221. case doExprRParen:
  222. fixOpStack(RBBINode::precLParen);
  223. break;
  224. case doNOP:
  225. break;
  226. case doStartAssign:
  227. // We've just scanned "$variable = "
  228. // The top of the node stack has the $variable ref node.
  229. // Save the start position of the RHS text in the StartExpression node
  230. // that precedes the $variableReference node on the stack.
  231. // This will eventually be used when saving the full $variable replacement
  232. // text as a string.
  233. n = fNodeStack[fNodeStackPtr-1];
  234. n->fFirstPos = fNextIndex; // move past the '='
  235. // Push a new start-of-expression node; needed to keep parse of the
  236. // RHS expression happy.
  237. pushNewNode(RBBINode::opStart);
  238. break;
  239. case doEndAssign:
  240. {
  241. // We have reached the end of an assignment statement.
  242. // Current scan char is the ';' that terminates the assignment.
  243. // Terminate expression, leaves expression parse tree rooted in TOS node.
  244. fixOpStack(RBBINode::precStart);
  245. RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
  246. RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
  247. RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
  248. // Save original text of right side of assignment, excluding the terminating ';'
  249. // in the root of the node for the right-hand-side expression.
  250. RHSExprNode->fFirstPos = startExprNode->fFirstPos;
  251. RHSExprNode->fLastPos = fScanIndex;
  252. fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
  253. // Expression parse tree becomes l. child of the $variable reference node.
  254. varRefNode->fLeftChild = RHSExprNode;
  255. RHSExprNode->fParent = varRefNode;
  256. // Make a symbol table entry for the $variableRef node.
  257. fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
  258. if (U_FAILURE(*fRB->fStatus)) {
  259. // This is a round-about way to get the parse position set
  260. // so that duplicate symbols error messages include a line number.
  261. UErrorCode t = *fRB->fStatus;
  262. *fRB->fStatus = U_ZERO_ERROR;
  263. error(t);
  264. }
  265. // Clean up the stack.
  266. delete startExprNode;
  267. fNodeStackPtr-=3;
  268. break;
  269. }
  270. case doEndOfRule:
  271. {
  272. fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
  273. if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
  274. break;
  275. }
  276. #ifdef RBBI_DEBUG
  277. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
  278. #endif
  279. U_ASSERT(fNodeStackPtr == 1);
  280. RBBINode *thisRule = fNodeStack[fNodeStackPtr];
  281. // If this rule includes a look-ahead '/', add a endMark node to the
  282. // expression tree.
  283. if (fLookAheadRule) {
  284. RBBINode *endNode = pushNewNode(RBBINode::endMark);
  285. RBBINode *catNode = pushNewNode(RBBINode::opCat);
  286. if (U_FAILURE(*fRB->fStatus)) {
  287. break;
  288. }
  289. fNodeStackPtr -= 2;
  290. catNode->fLeftChild = thisRule;
  291. catNode->fRightChild = endNode;
  292. fNodeStack[fNodeStackPtr] = catNode;
  293. endNode->fVal = fRuleNum;
  294. endNode->fLookAheadEnd = true;
  295. thisRule = catNode;
  296. // TODO: Disable chaining out of look-ahead (hard break) rules.
  297. // The break on rule match is forced, so there is no point in building up
  298. // the state table to chain into another rule for a longer match.
  299. }
  300. // Mark this node as being the root of a rule.
  301. thisRule->fRuleRoot = true;
  302. // Flag if chaining into this rule is wanted.
  303. //
  304. if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
  305. !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
  306. thisRule->fChainIn = true;
  307. }
  308. // All rule expressions are ORed together.
  309. // The ';' that terminates an expression really just functions as a '|' with
  310. // a low operator prededence.
  311. //
  312. // Each of the four sets of rules are collected separately.
  313. // (forward, reverse, safe_forward, safe_reverse)
  314. // OR this rule into the appropriate group of them.
  315. //
  316. RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
  317. if (*destRules != nullptr) {
  318. // This is not the first rule encountered.
  319. // OR previous stuff (from *destRules)
  320. // with the current rule expression (on the Node Stack)
  321. // with the resulting OR expression going to *destRules
  322. //
  323. thisRule = fNodeStack[fNodeStackPtr];
  324. RBBINode *prevRules = *destRules;
  325. RBBINode *orNode = pushNewNode(RBBINode::opOr);
  326. if (U_FAILURE(*fRB->fStatus)) {
  327. break;
  328. }
  329. orNode->fLeftChild = prevRules;
  330. prevRules->fParent = orNode;
  331. orNode->fRightChild = thisRule;
  332. thisRule->fParent = orNode;
  333. *destRules = orNode;
  334. }
  335. else
  336. {
  337. // This is the first rule encountered (for this direction).
  338. // Just move its parse tree from the stack to *destRules.
  339. *destRules = fNodeStack[fNodeStackPtr];
  340. }
  341. fReverseRule = false; // in preparation for the next rule.
  342. fLookAheadRule = false;
  343. fNoChainInRule = false;
  344. fNodeStackPtr = 0;
  345. }
  346. break;
  347. case doRuleError:
  348. error(U_BRK_RULE_SYNTAX);
  349. returnVal = false;
  350. break;
  351. case doVariableNameExpectedErr:
  352. error(U_BRK_RULE_SYNTAX);
  353. break;
  354. //
  355. // Unary operands + ? *
  356. // These all appear after the operand to which they apply.
  357. // When we hit one, the operand (may be a whole sub expression)
  358. // will be on the top of the stack.
  359. // Unary Operator becomes TOS, with the old TOS as its one child.
  360. case doUnaryOpPlus:
  361. {
  362. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  363. RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
  364. if (U_FAILURE(*fRB->fStatus)) {
  365. break;
  366. }
  367. plusNode->fLeftChild = operandNode;
  368. operandNode->fParent = plusNode;
  369. }
  370. break;
  371. case doUnaryOpQuestion:
  372. {
  373. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  374. RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
  375. if (U_FAILURE(*fRB->fStatus)) {
  376. break;
  377. }
  378. qNode->fLeftChild = operandNode;
  379. operandNode->fParent = qNode;
  380. }
  381. break;
  382. case doUnaryOpStar:
  383. {
  384. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  385. RBBINode *starNode = pushNewNode(RBBINode::opStar);
  386. if (U_FAILURE(*fRB->fStatus)) {
  387. break;
  388. }
  389. starNode->fLeftChild = operandNode;
  390. operandNode->fParent = starNode;
  391. }
  392. break;
  393. case doRuleChar:
  394. // A "Rule Character" is any single character that is a literal part
  395. // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
  396. // These are pretty uncommon in break rules; the terms are more commonly
  397. // sets. To keep things uniform, treat these characters like as
  398. // sets that just happen to contain only one character.
  399. {
  400. n = pushNewNode(RBBINode::setRef);
  401. if (U_FAILURE(*fRB->fStatus)) {
  402. break;
  403. }
  404. findSetFor(UnicodeString(fC.fChar), n);
  405. n->fFirstPos = fScanIndex;
  406. n->fLastPos = fNextIndex;
  407. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  408. break;
  409. }
  410. case doDotAny:
  411. // scanned a ".", meaning match any single character.
  412. {
  413. n = pushNewNode(RBBINode::setRef);
  414. if (U_FAILURE(*fRB->fStatus)) {
  415. break;
  416. }
  417. findSetFor(UnicodeString(true, kAny, 3), n);
  418. n->fFirstPos = fScanIndex;
  419. n->fLastPos = fNextIndex;
  420. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  421. break;
  422. }
  423. case doSlash:
  424. // Scanned a '/', which identifies a look-ahead break position in a rule.
  425. n = pushNewNode(RBBINode::lookAhead);
  426. if (U_FAILURE(*fRB->fStatus)) {
  427. break;
  428. }
  429. n->fVal = fRuleNum;
  430. n->fFirstPos = fScanIndex;
  431. n->fLastPos = fNextIndex;
  432. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  433. fLookAheadRule = true;
  434. break;
  435. case doStartTagValue:
  436. // Scanned a '{', the opening delimiter for a tag value within a rule.
  437. n = pushNewNode(RBBINode::tag);
  438. if (U_FAILURE(*fRB->fStatus)) {
  439. break;
  440. }
  441. n->fVal = 0;
  442. n->fFirstPos = fScanIndex;
  443. n->fLastPos = fNextIndex;
  444. break;
  445. case doTagDigit:
  446. // Just scanned a decimal digit that's part of a tag value
  447. {
  448. n = fNodeStack[fNodeStackPtr];
  449. uint32_t v = u_charDigitValue(fC.fChar);
  450. U_ASSERT(v < 10);
  451. n->fVal = n->fVal*10 + v;
  452. break;
  453. }
  454. case doTagValue:
  455. n = fNodeStack[fNodeStackPtr];
  456. n->fLastPos = fNextIndex;
  457. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  458. break;
  459. case doTagExpectedError:
  460. error(U_BRK_MALFORMED_RULE_TAG);
  461. returnVal = false;
  462. break;
  463. case doOptionStart:
  464. // Scanning a !!option. At the start of string.
  465. fOptionStart = fScanIndex;
  466. break;
  467. case doOptionEnd:
  468. {
  469. UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
  470. if (opt == UNICODE_STRING("chain", 5)) {
  471. fRB->fChainRules = true;
  472. } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
  473. fRB->fLBCMNoChain = true;
  474. } else if (opt == UNICODE_STRING("forward", 7)) {
  475. fRB->fDefaultTree = &fRB->fForwardTree;
  476. } else if (opt == UNICODE_STRING("reverse", 7)) {
  477. fRB->fDefaultTree = &fRB->fReverseTree;
  478. } else if (opt == UNICODE_STRING("safe_forward", 12)) {
  479. fRB->fDefaultTree = &fRB->fSafeFwdTree;
  480. } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
  481. fRB->fDefaultTree = &fRB->fSafeRevTree;
  482. } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
  483. fRB->fLookAheadHardBreak = true;
  484. } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
  485. fRuleSets[kRuleSet_rule_char-128].clear();
  486. } else if (opt == UNICODE_STRING("unquoted_literals", 17)) {
  487. fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
  488. } else {
  489. error(U_BRK_UNRECOGNIZED_OPTION);
  490. }
  491. }
  492. break;
  493. case doReverseDir:
  494. fReverseRule = true;
  495. break;
  496. case doStartVariableName:
  497. n = pushNewNode(RBBINode::varRef);
  498. if (U_FAILURE(*fRB->fStatus)) {
  499. break;
  500. }
  501. n->fFirstPos = fScanIndex;
  502. break;
  503. case doEndVariableName:
  504. n = fNodeStack[fNodeStackPtr];
  505. if (n==nullptr || n->fType != RBBINode::varRef) {
  506. error(U_BRK_INTERNAL_ERROR);
  507. break;
  508. }
  509. n->fLastPos = fScanIndex;
  510. fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
  511. // Look the newly scanned name up in the symbol table
  512. // If there's an entry, set the l. child of the var ref to the replacement expression.
  513. // (We also pass through here when scanning assignments, but no harm is done, other
  514. // than a slight wasted effort that seems hard to avoid. Lookup will be null)
  515. n->fLeftChild = fSymbolTable->lookupNode(n->fText);
  516. break;
  517. case doCheckVarDef:
  518. n = fNodeStack[fNodeStackPtr];
  519. if (n->fLeftChild == nullptr) {
  520. error(U_BRK_UNDEFINED_VARIABLE);
  521. returnVal = false;
  522. }
  523. break;
  524. case doExprFinished:
  525. break;
  526. case doRuleErrorAssignExpr:
  527. error(U_BRK_ASSIGN_ERROR);
  528. returnVal = false;
  529. break;
  530. case doExit:
  531. returnVal = false;
  532. break;
  533. case doScanUnicodeSet:
  534. scanSet();
  535. break;
  536. default:
  537. error(U_BRK_INTERNAL_ERROR);
  538. returnVal = false;
  539. break;
  540. }
  541. return returnVal && U_SUCCESS(*fRB->fStatus);
  542. }
  543. //------------------------------------------------------------------------------
  544. //
  545. // Error Report a rule parse error.
  546. // Only report it if no previous error has been recorded.
  547. //
  548. //------------------------------------------------------------------------------
  549. void RBBIRuleScanner::error(UErrorCode e) {
  550. if (U_SUCCESS(*fRB->fStatus)) {
  551. *fRB->fStatus = e;
  552. if (fRB->fParseError) {
  553. fRB->fParseError->line = fLineNum;
  554. fRB->fParseError->offset = fCharNum;
  555. fRB->fParseError->preContext[0] = 0;
  556. fRB->fParseError->postContext[0] = 0;
  557. }
  558. }
  559. }
  560. //------------------------------------------------------------------------------
  561. //
  562. // fixOpStack The parse stack holds partially assembled chunks of the parse tree.
  563. // An entry on the stack may be as small as a single setRef node,
  564. // or as large as the parse tree
  565. // for an entire expression (this will be the one item left on the stack
  566. // when the parsing of an RBBI rule completes.
  567. //
  568. // This function is called when a binary operator is encountered.
  569. // It looks back up the stack for operators that are not yet associated
  570. // with a right operand, and if the precedence of the stacked operator >=
  571. // the precedence of the current operator, binds the operand left,
  572. // to the previously encountered operator.
  573. //
  574. //------------------------------------------------------------------------------
  575. void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
  576. RBBINode *n;
  577. // printNodeStack("entering fixOpStack()");
  578. for (;;) {
  579. n = fNodeStack[fNodeStackPtr-1]; // an operator node
  580. if (n->fPrecedence == 0) {
  581. RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
  582. error(U_BRK_INTERNAL_ERROR);
  583. return;
  584. }
  585. if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
  586. // The most recent operand goes with the current operator,
  587. // not with the previously stacked one.
  588. break;
  589. }
  590. // Stack operator is a binary op ( '|' or concatenation)
  591. // TOS operand becomes right child of this operator.
  592. // Resulting subexpression becomes the TOS operand.
  593. n->fRightChild = fNodeStack[fNodeStackPtr];
  594. fNodeStack[fNodeStackPtr]->fParent = n;
  595. fNodeStackPtr--;
  596. // printNodeStack("looping in fixOpStack() ");
  597. }
  598. if (p <= RBBINode::precLParen) {
  599. // Scan is at a right paren or end of expression.
  600. // The scanned item must match the stack, or else there was an error.
  601. // Discard the left paren (or start expr) node from the stack,
  602. // leaving the completed (sub)expression as TOS.
  603. if (n->fPrecedence != p) {
  604. // Right paren encountered matched start of expression node, or
  605. // end of expression matched with a left paren node.
  606. error(U_BRK_MISMATCHED_PAREN);
  607. }
  608. fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
  609. fNodeStackPtr--;
  610. // Delete the now-discarded LParen or Start node.
  611. delete n;
  612. }
  613. // printNodeStack("leaving fixOpStack()");
  614. }
  615. //------------------------------------------------------------------------------
  616. //
  617. // findSetFor given a UnicodeString,
  618. // - find the corresponding Unicode Set (uset node)
  619. // (create one if necessary)
  620. // - Set fLeftChild of the caller's node (should be a setRef node)
  621. // to the uset node
  622. // Maintain a hash table of uset nodes, so the same one is always used
  623. // for the same string.
  624. // If a "to adopt" set is provided and we haven't seen this key before,
  625. // add the provided set to the hash table.
  626. // If the string is one (32 bit) char in length, the set contains
  627. // just one element which is the char in question.
  628. // If the string is "any", return a set containing all chars.
  629. //
  630. //------------------------------------------------------------------------------
  631. void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
  632. RBBISetTableEl *el;
  633. // First check whether we've already cached a set for this string.
  634. // If so, just use the cached set in the new node.
  635. // delete any set provided by the caller, since we own it.
  636. el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
  637. if (el != nullptr) {
  638. delete setToAdopt;
  639. node->fLeftChild = el->val;
  640. U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
  641. return;
  642. }
  643. // Haven't seen this set before.
  644. // If the caller didn't provide us with a prebuilt set,
  645. // create a new UnicodeSet now.
  646. if (setToAdopt == nullptr) {
  647. if (s.compare(kAny, -1) == 0) {
  648. setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
  649. } else {
  650. UChar32 c;
  651. c = s.char32At(0);
  652. setToAdopt = new UnicodeSet(c, c);
  653. }
  654. }
  655. //
  656. // Make a new uset node to refer to this UnicodeSet
  657. // This new uset node becomes the child of the caller's setReference node.
  658. //
  659. RBBINode *usetNode = new RBBINode(RBBINode::uset);
  660. if (usetNode == nullptr) {
  661. error(U_MEMORY_ALLOCATION_ERROR);
  662. return;
  663. }
  664. usetNode->fInputSet = setToAdopt;
  665. usetNode->fParent = node;
  666. node->fLeftChild = usetNode;
  667. usetNode->fText = s;
  668. //
  669. // Add the new uset node to the list of all uset nodes.
  670. //
  671. fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
  672. //
  673. // Add the new set to the set hash table.
  674. //
  675. el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
  676. UnicodeString *tkey = new UnicodeString(s);
  677. if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) {
  678. // Delete to avoid memory leak
  679. delete tkey;
  680. tkey = nullptr;
  681. uprv_free(el);
  682. el = nullptr;
  683. delete setToAdopt;
  684. setToAdopt = nullptr;
  685. error(U_MEMORY_ALLOCATION_ERROR);
  686. return;
  687. }
  688. el->key = tkey;
  689. el->val = usetNode;
  690. uhash_put(fSetTable, el->key, el, fRB->fStatus);
  691. return;
  692. }
  //
  //  Code points that the rule scanner compares against explicitly.
  //     They are written as numeric values instead of character literals
  //     because literals are not portable across source character sets
  //     (think EBCDIC).
  //
  static constexpr char16_t chCR        = 0x000d;   // Carriage return. A new-line; terminates comments.
  static constexpr char16_t chLF        = 0x000a;   // Line feed.       A new-line; terminates comments.
  static constexpr char16_t chNEL       = 0x0085;   // NEL, a new-line variant.
  static constexpr char16_t chLS        = 0x2028;   // Unicode Line Separator.
  static constexpr char16_t chApos      = 0x0027;   // Single quote, used for quoted chars.
  static constexpr char16_t chPound     = 0x0023;   // '#', introduces a comment.
  static constexpr char16_t chBackSlash = 0x005c;   // '\', introduces a char escape.
  static constexpr char16_t chLParen    = 0x0028;   // '('
  static constexpr char16_t chRParen    = 0x0029;   // ')'
  707. //------------------------------------------------------------------------------
  708. //
  709. // stripRules Return a rules string without extra spaces.
  710. // (Comments are removed separately, during rule parsing.)
  711. //
  712. //------------------------------------------------------------------------------
  713. UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
  714. UnicodeString strippedRules;
  715. int32_t rulesLength = rules.length();
  716. for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
  717. UChar32 cp = rules.char32At(idx);
  718. bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
  719. if (whiteSpace) {
  720. continue;
  721. }
  722. strippedRules.append(cp);
  723. }
  724. return strippedRules;
  725. }
  726. //------------------------------------------------------------------------------
  727. //
  728. // nextCharLL Low Level Next Char from rule input source.
  729. // Get a char from the input character iterator,
  730. // keep track of input position for error reporting.
  731. //
  732. //------------------------------------------------------------------------------
  733. UChar32 RBBIRuleScanner::nextCharLL() {
  734. UChar32 ch;
  735. if (fNextIndex >= fRB->fRules.length()) {
  736. return (UChar32)-1;
  737. }
  738. ch = fRB->fRules.char32At(fNextIndex);
  739. if (U_IS_SURROGATE(ch)) {
  740. error(U_ILLEGAL_CHAR_FOUND);
  741. return U_SENTINEL;
  742. }
  743. fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
  744. if (ch == chCR ||
  745. ch == chNEL ||
  746. ch == chLS ||
  747. (ch == chLF && fLastChar != chCR)) {
  748. // Character is starting a new line. Bump up the line number, and
  749. // reset the column to 0.
  750. fLineNum++;
  751. fCharNum=0;
  752. if (fQuoteMode) {
  753. error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
  754. fQuoteMode = false;
  755. }
  756. }
  757. else {
  758. // Character is not starting a new line. Except in the case of a
  759. // LF following a CR, increment the column position.
  760. if (ch != chLF) {
  761. fCharNum++;
  762. }
  763. }
  764. fLastChar = ch;
  765. return ch;
  766. }
  767. //------------------------------------------------------------------------------
  768. //
  769. // nextChar for rules scanning. At this level, we handle stripping
  770. // out comments and processing backslash character escapes.
  771. // The rest of the rules grammar is handled at the next level up.
  772. //
  773. //------------------------------------------------------------------------------
  774. void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
  775. // Unicode Character constants needed for the processing done by nextChar(),
  776. // in hex because literals wont work on EBCDIC machines.
  777. fScanIndex = fNextIndex;
  778. c.fChar = nextCharLL();
  779. c.fEscaped = false;
  780. //
  781. // check for '' sequence.
  782. // These are recognized in all contexts, whether in quoted text or not.
  783. //
  784. if (c.fChar == chApos) {
  785. if (fRB->fRules.char32At(fNextIndex) == chApos) {
  786. c.fChar = nextCharLL(); // get nextChar officially so character counts
  787. c.fEscaped = true; // stay correct.
  788. }
  789. else
  790. {
  791. // Single quote, by itself.
  792. // Toggle quoting mode.
  793. // Return either '(' or ')', because quotes cause a grouping of the quoted text.
  794. fQuoteMode = !fQuoteMode;
  795. if (fQuoteMode) {
  796. c.fChar = chLParen;
  797. } else {
  798. c.fChar = chRParen;
  799. }
  800. c.fEscaped = false; // The paren that we return is not escaped.
  801. return;
  802. }
  803. }
  804. if (fQuoteMode) {
  805. c.fEscaped = true;
  806. }
  807. else
  808. {
  809. // We are not in a 'quoted region' of the source.
  810. //
  811. if (c.fChar == chPound) {
  812. // Start of a comment. Consume the rest of it.
  813. // The new-line char that terminates the comment is always returned.
  814. // It will be treated as white-space, and serves to break up anything
  815. // that might otherwise incorrectly clump together with a comment in
  816. // the middle (a variable name, for example.)
  817. int32_t commentStart = fScanIndex;
  818. for (;;) {
  819. c.fChar = nextCharLL();
  820. if (c.fChar == (UChar32)-1 || // EOF
  821. c.fChar == chCR ||
  822. c.fChar == chLF ||
  823. c.fChar == chNEL ||
  824. c.fChar == chLS) {break;}
  825. }
  826. for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
  827. fRB->fStrippedRules.setCharAt(i, u' ');
  828. }
  829. }
  830. if (c.fChar == (UChar32)-1) {
  831. return;
  832. }
  833. //
  834. // check for backslash escaped characters.
  835. // Use UnicodeString::unescapeAt() to handle them.
  836. //
  837. if (c.fChar == chBackSlash) {
  838. c.fEscaped = true;
  839. int32_t startX = fNextIndex;
  840. c.fChar = fRB->fRules.unescapeAt(fNextIndex);
  841. if (fNextIndex == startX) {
  842. error(U_BRK_HEX_DIGITS_EXPECTED);
  843. }
  844. fCharNum += fNextIndex-startX;
  845. }
  846. }
  847. // putc(c.fChar, stdout);
  848. }
  849. //------------------------------------------------------------------------------
  850. //
  851. // Parse RBBI rules. The state machine for rules parsing is here.
  852. // The state tables are hand-written in the file rbbirpt.txt,
  853. // and converted to the form used here by a perl
  854. // script rbbicst.pl
  855. //
  856. //------------------------------------------------------------------------------
  857. void RBBIRuleScanner::parse() {
  858. uint16_t state;
  859. const RBBIRuleTableEl *tableEl;
  860. if (U_FAILURE(*fRB->fStatus)) {
  861. return;
  862. }
  863. state = 1;
  864. nextChar(fC);
  865. //
  866. // Main loop for the rule parsing state machine.
  867. // Runs once per state transition.
  868. // Each time through optionally performs, depending on the state table,
  869. // - an advance to the the next input char
  870. // - an action to be performed.
  871. // - pushing or popping a state to/from the local state return stack.
  872. //
  873. for (;;) {
  874. // Bail out if anything has gone wrong.
  875. // RBBI rule file parsing stops on the first error encountered.
  876. if (U_FAILURE(*fRB->fStatus)) {
  877. break;
  878. }
  879. // Quit if state == 0. This is the normal way to exit the state machine.
  880. //
  881. if (state == 0) {
  882. break;
  883. }
  884. // Find the state table element that matches the input char from the rule, or the
  885. // class of the input character. Start with the first table row for this
  886. // state, then linearly scan forward until we find a row that matches the
  887. // character. The last row for each state always matches all characters, so
  888. // the search will stop there, if not before.
  889. //
  890. tableEl = &gRuleParseStateTable[state];
  891. #ifdef RBBI_DEBUG
  892. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
  893. RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
  894. fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
  895. }
  896. #endif
  897. for (;;) {
  898. #ifdef RBBI_DEBUG
  899. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
  900. #endif
  901. if (tableEl->fCharClass < 127 && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) {
  902. // Table row specified an individual character, not a set, and
  903. // the input character is not escaped, and
  904. // the input character matched it.
  905. break;
  906. }
  907. if (tableEl->fCharClass == 255) {
  908. // Table row specified default, match anything character class.
  909. break;
  910. }
  911. if (tableEl->fCharClass == 254 && fC.fEscaped) {
  912. // Table row specified "escaped" and the char was escaped.
  913. break;
  914. }
  915. if (tableEl->fCharClass == 253 && fC.fEscaped &&
  916. (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
  917. // Table row specified "escaped P" and the char is either 'p' or 'P'.
  918. break;
  919. }
  920. if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
  921. // Table row specified eof and we hit eof on the input.
  922. break;
  923. }
  924. if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
  925. fC.fEscaped == false && // char is not escaped &&
  926. fC.fChar != (UChar32)-1) { // char is not EOF
  927. U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
  928. if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
  929. // Table row specified a character class, or set of characters,
  930. // and the current char matches it.
  931. break;
  932. }
  933. }
  934. // No match on this row, advance to the next row for this state,
  935. tableEl++;
  936. }
  937. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
  938. //
  939. // We've found the row of the state table that matches the current input
  940. // character from the rules string.
  941. // Perform any action specified by this row in the state table.
  942. if (doParseActions((int32_t)tableEl->fAction) == false) {
  943. // Break out of the state machine loop if the
  944. // the action signalled some kind of error, or
  945. // the action was to exit, occurs on normal end-of-rules-input.
  946. break;
  947. }
  948. if (tableEl->fPushState != 0) {
  949. fStackPtr++;
  950. if (fStackPtr >= kStackSize) {
  951. error(U_BRK_INTERNAL_ERROR);
  952. RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
  953. fStackPtr--;
  954. }
  955. fStack[fStackPtr] = tableEl->fPushState;
  956. }
  957. if (tableEl->fNextChar) {
  958. nextChar(fC);
  959. }
  960. // Get the next state from the table entry, or from the
  961. // state stack if the next state was specified as "pop".
  962. if (tableEl->fNextState != 255) {
  963. state = tableEl->fNextState;
  964. } else {
  965. state = fStack[fStackPtr];
  966. fStackPtr--;
  967. if (fStackPtr < 0) {
  968. error(U_BRK_INTERNAL_ERROR);
  969. RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
  970. fStackPtr++;
  971. }
  972. }
  973. }
  974. if (U_FAILURE(*fRB->fStatus)) {
  975. return;
  976. }
  977. // If there are no forward rules set an error.
  978. //
  979. if (fRB->fForwardTree == nullptr) {
  980. error(U_BRK_RULE_SYNTAX);
  981. return;
  982. }
  983. //
  984. // Parsing of the input RBBI rules is complete.
  985. // We now have a parse tree for the rule expressions
  986. // and a list of all UnicodeSets that are referenced.
  987. //
  988. #ifdef RBBI_DEBUG
  989. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
  990. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
  991. RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
  992. RBBINode::printTree(fRB->fForwardTree, true);
  993. RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
  994. RBBINode::printTree(fRB->fReverseTree, true);
  995. RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
  996. RBBINode::printTree(fRB->fSafeFwdTree, true);
  997. RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
  998. RBBINode::printTree(fRB->fSafeRevTree, true);
  999. }
  1000. #endif
  1001. }
  //------------------------------------------------------------------------------
  //
  //  printNodeStack    Debugging aid, compiled only when RBBI_DEBUG is defined.
  //                    Prints the title, then the tree rooted at each node
  //                    stack entry from fNodeStackPtr down to slot 1.
  //                    Slot 0 is not printed.
  //
  //------------------------------------------------------------------------------
  #ifdef RBBI_DEBUG
  void RBBIRuleScanner::printNodeStack(const char *title) {
      RBBIDebugPrintf("%s. Dumping node stack...\n", title);
      int slot = fNodeStackPtr;
      while (slot > 0) {
          RBBINode::printTree(fNodeStack[slot], true);
          --slot;
      }
  }
  #endif
  1014. //------------------------------------------------------------------------------
  1015. //
  1016. // pushNewNode create a new RBBINode of the specified type and push it
  1017. // onto the stack of nodes.
  1018. //
  1019. //------------------------------------------------------------------------------
  1020. RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
  1021. if (U_FAILURE(*fRB->fStatus)) {
  1022. return nullptr;
  1023. }
  1024. if (fNodeStackPtr >= kStackSize - 1) {
  1025. error(U_BRK_RULE_SYNTAX);
  1026. RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
  1027. return nullptr;
  1028. }
  1029. fNodeStackPtr++;
  1030. fNodeStack[fNodeStackPtr] = new RBBINode(t);
  1031. if (fNodeStack[fNodeStackPtr] == nullptr) {
  1032. *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
  1033. }
  1034. return fNodeStack[fNodeStackPtr];
  1035. }
  1036. //------------------------------------------------------------------------------
  1037. //
  1038. // scanSet Construct a UnicodeSet from the text at the current scan
  1039. // position. Advance the scan position to the first character
  1040. // after the set.
  1041. //
  1042. // A new RBBI setref node referring to the set is pushed onto the node
  1043. // stack.
  1044. //
  1045. // The scan position is normally under the control of the state machine
  1046. // that controls rule parsing. UnicodeSets, however, are parsed by
  1047. // the UnicodeSet constructor, not by the RBBI rule parser.
  1048. //
  1049. //------------------------------------------------------------------------------
  1050. void RBBIRuleScanner::scanSet() {
  1051. UnicodeSet *uset;
  1052. ParsePosition pos;
  1053. int startPos;
  1054. int i;
  1055. if (U_FAILURE(*fRB->fStatus)) {
  1056. return;
  1057. }
  1058. pos.setIndex(fScanIndex);
  1059. startPos = fScanIndex;
  1060. UErrorCode localStatus = U_ZERO_ERROR;
  1061. uset = new UnicodeSet();
  1062. if (uset == nullptr) {
  1063. localStatus = U_MEMORY_ALLOCATION_ERROR;
  1064. } else {
  1065. uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
  1066. }
  1067. if (U_FAILURE(localStatus)) {
  1068. // TODO: Get more accurate position of the error from UnicodeSet's return info.
  1069. // UnicodeSet appears to not be reporting correctly at this time.
  1070. #ifdef RBBI_DEBUG
  1071. RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
  1072. #endif
  1073. error(localStatus);
  1074. delete uset;
  1075. return;
  1076. }
  1077. // Verify that the set contains at least one code point.
  1078. //
  1079. U_ASSERT(uset!=nullptr);
  1080. if (uset->isEmpty()) {
  1081. // This set is empty.
  1082. // Make it an error, because it almost certainly is not what the user wanted.
  1083. // Also, avoids having to think about corner cases in the tree manipulation code
  1084. // that occurs later on.
  1085. error(U_BRK_RULE_EMPTY_SET);
  1086. delete uset;
  1087. return;
  1088. }
  1089. // Advance the RBBI parse position over the UnicodeSet pattern.
  1090. // Don't just set fScanIndex because the line/char positions maintained
  1091. // for error reporting would be thrown off.
  1092. i = pos.getIndex();
  1093. for (;;) {
  1094. if (fNextIndex >= i) {
  1095. break;
  1096. }
  1097. nextCharLL();
  1098. }
  1099. if (U_SUCCESS(*fRB->fStatus)) {
  1100. RBBINode *n;
  1101. n = pushNewNode(RBBINode::setRef);
  1102. if (U_FAILURE(*fRB->fStatus)) {
  1103. return;
  1104. }
  1105. n->fFirstPos = startPos;
  1106. n->fLastPos = fNextIndex;
  1107. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  1108. // findSetFor() serves several purposes here:
  1109. // - Adopts storage for the UnicodeSet, will be responsible for deleting.
  1110. // - Maintains collection of all sets in use, needed later for establishing
  1111. // character categories for run time engine.
  1112. // - Eliminates mulitiple instances of the same set.
  1113. // - Creates a new uset node if necessary (if this isn't a duplicate.)
  1114. findSetFor(n->fText, n, uset);
  1115. }
  1116. }
  1117. int32_t RBBIRuleScanner::numRules() {
  1118. return fRuleNum;
  1119. }
  1120. U_NAMESPACE_END
  1121. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */