rbbiscan.cpp 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // file: rbbiscan.cpp
  5. //
  6. // Copyright (C) 2002-2016, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains the Rule Based Break Iterator Rule Builder functions for
  10. // scanning the rules and assembling a parse tree. This is the first phase
  11. // of compiling the rules.
  12. //
  13. // The overall parsing of the rules is managed by class RBBIRuleBuilder, which will
  14. // create and use an instance of this class as part of the process.
  15. //
  16. #include "unicode/utypes.h"
  17. #if !UCONFIG_NO_BREAK_ITERATION
  18. #include "unicode/unistr.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/uchar.h"
  21. #include "unicode/uchriter.h"
  22. #include "unicode/parsepos.h"
  23. #include "unicode/parseerr.h"
  24. #include "cmemory.h"
  25. #include "cstring.h"
  26. #include "rbbirpt.h" // Contains state table for the rbbi rules parser.
  27. // generated by a Perl script.
  28. #include "rbbirb.h"
  29. #include "rbbinode.h"
  30. #include "rbbiscan.h"
  31. #include "rbbitblb.h"
  32. #include "uassert.h"
  33. //------------------------------------------------------------------------------
  34. //
  35. // Unicode Set init strings for each of the character classes needed for parsing a rule file.
  36. // (Initialized with hex values for portability to EBCDIC based machines.
  37. // Really ugly, but there's no good way to avoid it.)
  38. //
  39. // The sets are referred to by name in the rbbirpt.txt, which is the
  40. // source form of the state transition table for the RBBI rule parser.
  41. //
  42. //------------------------------------------------------------------------------
  43. static const char16_t gRuleSet_rule_char_pattern[] = {
  44. // Characters that may appear as literals in patterns without escaping or quoting.
  45. // [ ^ [ \ p { Z } \ u 0 0 2 0
  46. 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
  47. // - \ u 0 0 7 f ] - [ \ p
  48. 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
  49. // { L } ] - [ \ p { N } ] ]
  50. 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
  51. static const char16_t gRuleSet_name_char_pattern[] = {
  52. // [ _ \ p { L } \ p { N } ]
  53. 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
  54. static const char16_t gRuleSet_digit_char_pattern[] = {
  55. // [ 0 - 9 ]
  56. 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
  57. static const char16_t gRuleSet_name_start_char_pattern[] = {
  58. // [ _ \ p { L } ]
  59. 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
  60. static const char16_t kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
  61. U_CDECL_BEGIN
  62. static void U_CALLCONV RBBISetTable_deleter(void *p) {
  63. icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
  64. delete px->key;
  65. // Note: px->val is owned by the linked list "fSetsListHead" in scanner.
  66. // Don't delete the value nodes here.
  67. uprv_free(px);
  68. }
  69. U_CDECL_END
  70. U_NAMESPACE_BEGIN
  71. //------------------------------------------------------------------------------
  72. //
  73. // Constructor.
  74. //
  75. //------------------------------------------------------------------------------
  76. RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
  77. {
  78. fRB = rb;
  79. fScanIndex = 0;
  80. fNextIndex = 0;
  81. fQuoteMode = false;
  82. fLineNum = 1;
  83. fCharNum = 0;
  84. fLastChar = 0;
  85. fStateTable = nullptr;
  86. fStack[0] = 0;
  87. fStackPtr = 0;
  88. fNodeStack[0] = nullptr;
  89. fNodeStackPtr = 0;
  90. fReverseRule = false;
  91. fLookAheadRule = false;
  92. fNoChainInRule = false;
  93. fSymbolTable = nullptr;
  94. fSetTable = nullptr;
  95. fRuleNum = 0;
  96. fOptionStart = 0;
  97. // Do not check status until after all critical fields are sufficiently initialized
  98. // that the destructor can run cleanly.
  99. if (U_FAILURE(*rb->fStatus)) {
  100. return;
  101. }
  102. //
  103. // Set up the constant Unicode Sets.
  104. // Note: These could be made static, lazily initialized, and shared among
  105. // all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
  106. // and the time to build these few sets should be small compared to a
  107. // full break iterator build.
  108. fRuleSets[kRuleSet_rule_char-128]
  109. = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
  110. // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
  111. fRuleSets[kRuleSet_white_space-128].
  112. add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
  113. fRuleSets[kRuleSet_name_char-128]
  114. = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
  115. fRuleSets[kRuleSet_name_start_char-128]
  116. = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
  117. fRuleSets[kRuleSet_digit_char-128]
  118. = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
  119. if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
  120. // This case happens if ICU's data is missing. UnicodeSet tries to look up property
  121. // names from the init string, can't find them, and claims an illegal argument.
  122. // Change the error so that the actual problem will be clearer to users.
  123. *rb->fStatus = U_BRK_INIT_ERROR;
  124. }
  125. if (U_FAILURE(*rb->fStatus)) {
  126. return;
  127. }
  128. fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
  129. if (fSymbolTable == nullptr) {
  130. *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
  131. return;
  132. }
  133. fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, rb->fStatus);
  134. if (U_FAILURE(*rb->fStatus)) {
  135. return;
  136. }
  137. uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
  138. }
  139. //------------------------------------------------------------------------------
  140. //
  141. // Destructor
  142. //
  143. //------------------------------------------------------------------------------
  144. RBBIRuleScanner::~RBBIRuleScanner() {
  145. delete fSymbolTable;
  146. if (fSetTable != nullptr) {
  147. uhash_close(fSetTable);
  148. fSetTable = nullptr;
  149. }
  150. // Node Stack.
  151. // Normally has one entry, which is the entire parse tree for the rules.
  152. // If errors occurred, there may be additional subtrees left on the stack.
  153. while (fNodeStackPtr > 0) {
  154. delete fNodeStack[fNodeStackPtr];
  155. fNodeStackPtr--;
  156. }
  157. }
  158. //------------------------------------------------------------------------------
  159. //
  160. // doParseActions Do some action during rule parsing.
  161. // Called by the parse state machine.
  162. // Actions build the parse tree and Unicode Sets,
  163. // and maintain the parse stack for nested expressions.
  164. //
  165. // TODO: unify EParseAction and RBBI_RuleParseAction enum types.
  166. // They represent exactly the same thing. They're separate
  167. // only to work around enum forward declaration restrictions
  168. // in some compilers, while at the same time avoiding multiple
  169. // definitions problems. I'm sure that there's a better way.
  170. //
  171. //------------------------------------------------------------------------------
  172. UBool RBBIRuleScanner::doParseActions(int32_t action)
  173. {
  174. RBBINode *n = nullptr;
  175. UBool returnVal = true;
  176. switch (action) {
  177. case doExprStart:
  178. pushNewNode(RBBINode::opStart);
  179. fRuleNum++;
  180. break;
  181. case doNoChain:
  182. // Scanned a '^' while on the rule start state.
  183. fNoChainInRule = true;
  184. break;
  185. case doExprOrOperator:
  186. {
  187. fixOpStack(RBBINode::precOpCat);
  188. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  189. RBBINode *orNode = pushNewNode(RBBINode::opOr);
  190. if (U_FAILURE(*fRB->fStatus)) {
  191. break;
  192. }
  193. orNode->fLeftChild = operandNode;
  194. operandNode->fParent = orNode;
  195. }
  196. break;
  197. case doExprCatOperator:
  198. // concatenation operator.
  199. // For the implicit concatenation of adjacent terms in an expression that are
  200. // not separated by any other operator. Action is invoked between the
  201. // actions for the two terms.
  202. {
  203. fixOpStack(RBBINode::precOpCat);
  204. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  205. RBBINode *catNode = pushNewNode(RBBINode::opCat);
  206. if (U_FAILURE(*fRB->fStatus)) {
  207. break;
  208. }
  209. catNode->fLeftChild = operandNode;
  210. operandNode->fParent = catNode;
  211. }
  212. break;
  213. case doLParen:
  214. // Open Paren.
  215. // The openParen node is a dummy operation type with a low precedence,
  216. // which has the affect of ensuring that any real binary op that
  217. // follows within the parens binds more tightly to the operands than
  218. // stuff outside of the parens.
  219. pushNewNode(RBBINode::opLParen);
  220. break;
  221. case doExprRParen:
  222. fixOpStack(RBBINode::precLParen);
  223. break;
  224. case doNOP:
  225. break;
  226. case doStartAssign:
  227. // We've just scanned "$variable = "
  228. // The top of the node stack has the $variable ref node.
  229. // Save the start position of the RHS text in the StartExpression node
  230. // that precedes the $variableReference node on the stack.
  231. // This will eventually be used when saving the full $variable replacement
  232. // text as a string.
  233. n = fNodeStack[fNodeStackPtr-1];
  234. n->fFirstPos = fNextIndex; // move past the '='
  235. // Push a new start-of-expression node; needed to keep parse of the
  236. // RHS expression happy.
  237. pushNewNode(RBBINode::opStart);
  238. break;
  239. case doEndAssign:
  240. {
  241. // We have reached the end of an assignment statement.
  242. // Current scan char is the ';' that terminates the assignment.
  243. // Terminate expression, leaves expression parse tree rooted in TOS node.
  244. fixOpStack(RBBINode::precStart);
  245. if (U_FAILURE(*fRB->fStatus)) {
  246. break;
  247. }
  248. RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
  249. RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
  250. RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
  251. // Save original text of right side of assignment, excluding the terminating ';'
  252. // in the root of the node for the right-hand-side expression.
  253. RHSExprNode->fFirstPos = startExprNode->fFirstPos;
  254. RHSExprNode->fLastPos = fScanIndex;
  255. fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
  256. // Expression parse tree becomes l. child of the $variable reference node.
  257. varRefNode->fLeftChild = RHSExprNode;
  258. RHSExprNode->fParent = varRefNode;
  259. // Make a symbol table entry for the $variableRef node.
  260. fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
  261. if (U_FAILURE(*fRB->fStatus)) {
  262. // This is a round-about way to get the parse position set
  263. // so that duplicate symbols error messages include a line number.
  264. UErrorCode t = *fRB->fStatus;
  265. *fRB->fStatus = U_ZERO_ERROR;
  266. error(t);
  267. // When adding $variableRef to the symbol table fail, Delete
  268. // both nodes because deleting varRefNode will not delete
  269. // RHSExprNode internally.
  270. delete RHSExprNode;
  271. delete varRefNode;
  272. }
  273. // Clean up the stack.
  274. delete startExprNode;
  275. fNodeStackPtr-=3;
  276. break;
  277. }
  278. case doEndOfRule:
  279. {
  280. fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
  281. if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
  282. break;
  283. }
  284. #ifdef RBBI_DEBUG
  285. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
  286. #endif
  287. U_ASSERT(fNodeStackPtr == 1);
  288. RBBINode *thisRule = fNodeStack[fNodeStackPtr];
  289. // If this rule includes a look-ahead '/', add a endMark node to the
  290. // expression tree.
  291. if (fLookAheadRule) {
  292. RBBINode *endNode = pushNewNode(RBBINode::endMark);
  293. RBBINode *catNode = pushNewNode(RBBINode::opCat);
  294. if (U_FAILURE(*fRB->fStatus)) {
  295. break;
  296. }
  297. fNodeStackPtr -= 2;
  298. catNode->fLeftChild = thisRule;
  299. catNode->fRightChild = endNode;
  300. fNodeStack[fNodeStackPtr] = catNode;
  301. endNode->fVal = fRuleNum;
  302. endNode->fLookAheadEnd = true;
  303. thisRule = catNode;
  304. // TODO: Disable chaining out of look-ahead (hard break) rules.
  305. // The break on rule match is forced, so there is no point in building up
  306. // the state table to chain into another rule for a longer match.
  307. }
  308. // Mark this node as being the root of a rule.
  309. thisRule->fRuleRoot = true;
  310. // Flag if chaining into this rule is wanted.
  311. //
  312. if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
  313. !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
  314. thisRule->fChainIn = true;
  315. }
  316. // All rule expressions are ORed together.
  317. // The ';' that terminates an expression really just functions as a '|' with
  318. // a low operator prededence.
  319. //
  320. // Each of the four sets of rules are collected separately.
  321. // (forward, reverse, safe_forward, safe_reverse)
  322. // OR this rule into the appropriate group of them.
  323. //
  324. RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
  325. if (*destRules != nullptr) {
  326. // This is not the first rule encountered.
  327. // OR previous stuff (from *destRules)
  328. // with the current rule expression (on the Node Stack)
  329. // with the resulting OR expression going to *destRules
  330. //
  331. thisRule = fNodeStack[fNodeStackPtr];
  332. RBBINode *prevRules = *destRules;
  333. RBBINode *orNode = pushNewNode(RBBINode::opOr);
  334. if (U_FAILURE(*fRB->fStatus)) {
  335. break;
  336. }
  337. orNode->fLeftChild = prevRules;
  338. prevRules->fParent = orNode;
  339. orNode->fRightChild = thisRule;
  340. thisRule->fParent = orNode;
  341. *destRules = orNode;
  342. }
  343. else
  344. {
  345. // This is the first rule encountered (for this direction).
  346. // Just move its parse tree from the stack to *destRules.
  347. *destRules = fNodeStack[fNodeStackPtr];
  348. }
  349. fReverseRule = false; // in preparation for the next rule.
  350. fLookAheadRule = false;
  351. fNoChainInRule = false;
  352. fNodeStackPtr = 0;
  353. }
  354. break;
  355. case doRuleError:
  356. error(U_BRK_RULE_SYNTAX);
  357. returnVal = false;
  358. break;
  359. case doVariableNameExpectedErr:
  360. error(U_BRK_RULE_SYNTAX);
  361. break;
  362. //
  363. // Unary operands + ? *
  364. // These all appear after the operand to which they apply.
  365. // When we hit one, the operand (may be a whole sub expression)
  366. // will be on the top of the stack.
  367. // Unary Operator becomes TOS, with the old TOS as its one child.
  368. case doUnaryOpPlus:
  369. {
  370. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  371. RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
  372. if (U_FAILURE(*fRB->fStatus)) {
  373. break;
  374. }
  375. plusNode->fLeftChild = operandNode;
  376. operandNode->fParent = plusNode;
  377. }
  378. break;
  379. case doUnaryOpQuestion:
  380. {
  381. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  382. RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
  383. if (U_FAILURE(*fRB->fStatus)) {
  384. break;
  385. }
  386. qNode->fLeftChild = operandNode;
  387. operandNode->fParent = qNode;
  388. }
  389. break;
  390. case doUnaryOpStar:
  391. {
  392. RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
  393. RBBINode *starNode = pushNewNode(RBBINode::opStar);
  394. if (U_FAILURE(*fRB->fStatus)) {
  395. break;
  396. }
  397. starNode->fLeftChild = operandNode;
  398. operandNode->fParent = starNode;
  399. }
  400. break;
  401. case doRuleChar:
  402. // A "Rule Character" is any single character that is a literal part
  403. // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
  404. // These are pretty uncommon in break rules; the terms are more commonly
  405. // sets. To keep things uniform, treat these characters like as
  406. // sets that just happen to contain only one character.
  407. {
  408. n = pushNewNode(RBBINode::setRef);
  409. if (U_FAILURE(*fRB->fStatus)) {
  410. break;
  411. }
  412. findSetFor(UnicodeString(fC.fChar), n);
  413. n->fFirstPos = fScanIndex;
  414. n->fLastPos = fNextIndex;
  415. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  416. break;
  417. }
  418. case doDotAny:
  419. // scanned a ".", meaning match any single character.
  420. {
  421. n = pushNewNode(RBBINode::setRef);
  422. if (U_FAILURE(*fRB->fStatus)) {
  423. break;
  424. }
  425. findSetFor(UnicodeString(true, kAny, 3), n);
  426. n->fFirstPos = fScanIndex;
  427. n->fLastPos = fNextIndex;
  428. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  429. break;
  430. }
  431. case doSlash:
  432. // Scanned a '/', which identifies a look-ahead break position in a rule.
  433. n = pushNewNode(RBBINode::lookAhead);
  434. if (U_FAILURE(*fRB->fStatus)) {
  435. break;
  436. }
  437. n->fVal = fRuleNum;
  438. n->fFirstPos = fScanIndex;
  439. n->fLastPos = fNextIndex;
  440. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  441. fLookAheadRule = true;
  442. break;
  443. case doStartTagValue:
  444. // Scanned a '{', the opening delimiter for a tag value within a rule.
  445. n = pushNewNode(RBBINode::tag);
  446. if (U_FAILURE(*fRB->fStatus)) {
  447. break;
  448. }
  449. n->fVal = 0;
  450. n->fFirstPos = fScanIndex;
  451. n->fLastPos = fNextIndex;
  452. break;
  453. case doTagDigit:
  454. // Just scanned a decimal digit that's part of a tag value
  455. {
  456. n = fNodeStack[fNodeStackPtr];
  457. uint32_t v = u_charDigitValue(fC.fChar);
  458. U_ASSERT(v < 10);
  459. int64_t updated = static_cast<int64_t>(n->fVal)*10 + v;
  460. // Avoid overflow n->fVal
  461. if (updated > INT32_MAX) {
  462. error(U_BRK_RULE_SYNTAX);
  463. break;
  464. }
  465. n->fVal = static_cast<int32_t>(updated);
  466. break;
  467. }
  468. case doTagValue:
  469. n = fNodeStack[fNodeStackPtr];
  470. n->fLastPos = fNextIndex;
  471. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  472. break;
  473. case doTagExpectedError:
  474. error(U_BRK_MALFORMED_RULE_TAG);
  475. returnVal = false;
  476. break;
  477. case doOptionStart:
  478. // Scanning a !!option. At the start of string.
  479. fOptionStart = fScanIndex;
  480. break;
  481. case doOptionEnd:
  482. {
  483. UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
  484. if (opt == UNICODE_STRING("chain", 5)) {
  485. fRB->fChainRules = true;
  486. } else if (opt == UNICODE_STRING("forward", 7)) {
  487. fRB->fDefaultTree = &fRB->fForwardTree;
  488. } else if (opt == UNICODE_STRING("reverse", 7)) {
  489. fRB->fDefaultTree = &fRB->fReverseTree;
  490. } else if (opt == UNICODE_STRING("safe_forward", 12)) {
  491. fRB->fDefaultTree = &fRB->fSafeFwdTree;
  492. } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
  493. fRB->fDefaultTree = &fRB->fSafeRevTree;
  494. } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
  495. fRB->fLookAheadHardBreak = true;
  496. } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
  497. fRuleSets[kRuleSet_rule_char-128].clear();
  498. } else if (opt == UNICODE_STRING("unquoted_literals", 17)) {
  499. fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
  500. } else {
  501. error(U_BRK_UNRECOGNIZED_OPTION);
  502. }
  503. }
  504. break;
  505. case doReverseDir:
  506. fReverseRule = true;
  507. break;
  508. case doStartVariableName:
  509. n = pushNewNode(RBBINode::varRef);
  510. if (U_FAILURE(*fRB->fStatus)) {
  511. break;
  512. }
  513. n->fFirstPos = fScanIndex;
  514. break;
  515. case doEndVariableName:
  516. n = fNodeStack[fNodeStackPtr];
  517. if (n==nullptr || n->fType != RBBINode::varRef) {
  518. error(U_BRK_INTERNAL_ERROR);
  519. break;
  520. }
  521. n->fLastPos = fScanIndex;
  522. fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
  523. // Look the newly scanned name up in the symbol table
  524. // If there's an entry, set the l. child of the var ref to the replacement expression.
  525. // (We also pass through here when scanning assignments, but no harm is done, other
  526. // than a slight wasted effort that seems hard to avoid. Lookup will be null)
  527. n->fLeftChild = fSymbolTable->lookupNode(n->fText);
  528. break;
  529. case doCheckVarDef:
  530. n = fNodeStack[fNodeStackPtr];
  531. if (n->fLeftChild == nullptr) {
  532. error(U_BRK_UNDEFINED_VARIABLE);
  533. returnVal = false;
  534. }
  535. break;
  536. case doExprFinished:
  537. break;
  538. case doRuleErrorAssignExpr:
  539. error(U_BRK_ASSIGN_ERROR);
  540. returnVal = false;
  541. break;
  542. case doExit:
  543. returnVal = false;
  544. break;
  545. case doScanUnicodeSet:
  546. scanSet();
  547. break;
  548. default:
  549. error(U_BRK_INTERNAL_ERROR);
  550. returnVal = false;
  551. break;
  552. }
  553. return returnVal && U_SUCCESS(*fRB->fStatus);
  554. }
  555. //------------------------------------------------------------------------------
  556. //
  557. // Error Report a rule parse error.
  558. // Only report it if no previous error has been recorded.
  559. //
  560. //------------------------------------------------------------------------------
  561. void RBBIRuleScanner::error(UErrorCode e) {
  562. if (U_SUCCESS(*fRB->fStatus)) {
  563. *fRB->fStatus = e;
  564. if (fRB->fParseError) {
  565. fRB->fParseError->line = fLineNum;
  566. fRB->fParseError->offset = fCharNum;
  567. fRB->fParseError->preContext[0] = 0;
  568. fRB->fParseError->postContext[0] = 0;
  569. }
  570. }
  571. }
  572. //------------------------------------------------------------------------------
  573. //
  574. // fixOpStack The parse stack holds partially assembled chunks of the parse tree.
  575. // An entry on the stack may be as small as a single setRef node,
  576. // or as large as the parse tree
  577. // for an entire expression (this will be the one item left on the stack
  578. // when the parsing of an RBBI rule completes).
  579. //
  580. // This function is called when a binary operator is encountered.
  581. // It looks back up the stack for operators that are not yet associated
  582. // with a right operand, and if the precedence of the stacked operator >=
  583. // the precedence of the current operator, binds the operand left,
  584. // to the previously encountered operator.
  585. //
  586. //------------------------------------------------------------------------------
  587. void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
  588. RBBINode *n;
  589. // printNodeStack("entering fixOpStack()");
  590. for (;;) {
  591. n = fNodeStack[fNodeStackPtr-1]; // an operator node
  592. if (n->fPrecedence == 0) {
  593. RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
  594. error(U_BRK_INTERNAL_ERROR);
  595. return;
  596. }
  597. if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
  598. // The most recent operand goes with the current operator,
  599. // not with the previously stacked one.
  600. break;
  601. }
  602. // Stack operator is a binary op ( '|' or concatenation)
  603. // TOS operand becomes right child of this operator.
  604. // Resulting subexpression becomes the TOS operand.
  605. n->fRightChild = fNodeStack[fNodeStackPtr];
  606. fNodeStack[fNodeStackPtr]->fParent = n;
  607. fNodeStackPtr--;
  608. // printNodeStack("looping in fixOpStack() ");
  609. }
  610. if (p <= RBBINode::precLParen) {
  611. // Scan is at a right paren or end of expression.
  612. // The scanned item must match the stack, or else there was an error.
  613. // Discard the left paren (or start expr) node from the stack,
  614. // leaving the completed (sub)expression as TOS.
  615. if (n->fPrecedence != p) {
  616. // Right paren encountered matched start of expression node, or
  617. // end of expression matched with a left paren node.
  618. error(U_BRK_MISMATCHED_PAREN);
  619. }
  620. fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
  621. fNodeStackPtr--;
  622. // Delete the now-discarded LParen or Start node.
  623. delete n;
  624. }
  625. // printNodeStack("leaving fixOpStack()");
  626. }
  627. //------------------------------------------------------------------------------
  628. //
  629. // findSetFor given a UnicodeString,
  630. // - find the corresponding Unicode Set (uset node)
  631. // (create one if necessary)
  632. // - Set fLeftChild of the caller's node (should be a setRef node)
  633. // to the uset node
  634. // Maintain a hash table of uset nodes, so the same one is always used
  635. // for the same string.
  636. // If a "to adopt" set is provided and we haven't seen this key before,
  637. // add the provided set to the hash table.
  638. // If the string is one (32 bit) char in length, the set contains
  639. // just one element which is the char in question.
  640. // If the string is "any", return a set containing all chars.
  641. //
  642. //------------------------------------------------------------------------------
  643. void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
  644. RBBISetTableEl *el;
  645. // First check whether we've already cached a set for this string.
  646. // If so, just use the cached set in the new node.
  647. // delete any set provided by the caller, since we own it.
  648. el = static_cast<RBBISetTableEl*>(uhash_get(fSetTable, &s));
  649. if (el != nullptr) {
  650. delete setToAdopt;
  651. node->fLeftChild = el->val;
  652. U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
  653. return;
  654. }
  655. // Haven't seen this set before.
  656. // If the caller didn't provide us with a prebuilt set,
  657. // create a new UnicodeSet now.
  658. if (setToAdopt == nullptr) {
  659. if (s.compare(kAny, -1) == 0) {
  660. setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
  661. } else {
  662. UChar32 c;
  663. c = s.char32At(0);
  664. setToAdopt = new UnicodeSet(c, c);
  665. }
  666. }
  667. //
  668. // Make a new uset node to refer to this UnicodeSet
  669. // This new uset node becomes the child of the caller's setReference node.
  670. //
  671. RBBINode *usetNode = new RBBINode(RBBINode::uset);
  672. if (usetNode == nullptr) {
  673. error(U_MEMORY_ALLOCATION_ERROR);
  674. delete setToAdopt;
  675. return;
  676. }
  677. usetNode->fInputSet = setToAdopt;
  678. usetNode->fParent = node;
  679. node->fLeftChild = usetNode;
  680. usetNode->fText = s;
  681. //
  682. // Add the new uset node to the list of all uset nodes.
  683. //
  684. fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
  685. //
  686. // Add the new set to the set hash table.
  687. //
  688. el = static_cast<RBBISetTableEl*>(uprv_malloc(sizeof(RBBISetTableEl)));
  689. UnicodeString *tkey = new UnicodeString(s);
  690. if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) {
  691. // Delete to avoid memory leak
  692. delete tkey;
  693. tkey = nullptr;
  694. uprv_free(el);
  695. el = nullptr;
  696. delete setToAdopt;
  697. setToAdopt = nullptr;
  698. error(U_MEMORY_ALLOCATION_ERROR);
  699. return;
  700. }
  701. el->key = tkey;
  702. el->val = usetNode;
  703. uhash_put(fSetTable, el->key, el, fRB->fStatus);
  704. }
  //
  //  Unicode character constants used by the scanner.
  //     Spelled as numeric code points because there is no portable way to
  //     enter them as character literals (think EBCDIC).
  //
  static constexpr char16_t chCR        = 0x000D;   // Carriage return; a new line, terminates comments.
  static constexpr char16_t chLF        = 0x000A;   // Line feed; a new line, terminates comments.
  static constexpr char16_t chNEL       = 0x0085;   // NEL newline variant.
  static constexpr char16_t chLS        = 0x2028;   // Unicode Line Separator.
  static constexpr char16_t chApos      = 0x0027;   // Single quote, for quoted chars.
  static constexpr char16_t chPound     = 0x0023;   // '#', introduces a comment.
  static constexpr char16_t chBackSlash = 0x005C;   // '\', introduces a char escape.
  static constexpr char16_t chLParen    = 0x0028;   // '('
  static constexpr char16_t chRParen    = 0x0029;   // ')'
  719. //------------------------------------------------------------------------------
  720. //
  721. // stripRules Return a rules string without extra spaces.
  722. // (Comments are removed separately, during rule parsing.)
  723. //
  724. //------------------------------------------------------------------------------
  725. UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
  726. UnicodeString strippedRules;
  727. int32_t rulesLength = rules.length();
  728. for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
  729. UChar32 cp = rules.char32At(idx);
  730. bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
  731. if (whiteSpace) {
  732. continue;
  733. }
  734. strippedRules.append(cp);
  735. }
  736. return strippedRules;
  737. }
  738. //------------------------------------------------------------------------------
  739. //
  740. // nextCharLL Low Level Next Char from rule input source.
  741. // Get a char from the input character iterator,
  742. // keep track of input position for error reporting.
  743. //
  744. //------------------------------------------------------------------------------
  745. UChar32 RBBIRuleScanner::nextCharLL() {
  746. UChar32 ch;
  747. if (fNextIndex >= fRB->fRules.length()) {
  748. return static_cast<UChar32>(-1);
  749. }
  750. ch = fRB->fRules.char32At(fNextIndex);
  751. if (U_IS_SURROGATE(ch)) {
  752. error(U_ILLEGAL_CHAR_FOUND);
  753. return U_SENTINEL;
  754. }
  755. fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
  756. if (ch == chCR ||
  757. ch == chNEL ||
  758. ch == chLS ||
  759. (ch == chLF && fLastChar != chCR)) {
  760. // Character is starting a new line. Bump up the line number, and
  761. // reset the column to 0.
  762. fLineNum++;
  763. fCharNum=0;
  764. if (fQuoteMode) {
  765. error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
  766. fQuoteMode = false;
  767. }
  768. }
  769. else {
  770. // Character is not starting a new line. Except in the case of a
  771. // LF following a CR, increment the column position.
  772. if (ch != chLF) {
  773. fCharNum++;
  774. }
  775. }
  776. fLastChar = ch;
  777. return ch;
  778. }
  779. //------------------------------------------------------------------------------
  780. //
  781. // nextChar for rules scanning. At this level, we handle stripping
  782. // out comments and processing backslash character escapes.
  783. // The rest of the rules grammar is handled at the next level up.
  784. //
  785. //------------------------------------------------------------------------------
  786. void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
  787. // Unicode Character constants needed for the processing done by nextChar(),
  788. // in hex because literals wont work on EBCDIC machines.
  789. fScanIndex = fNextIndex;
  790. c.fChar = nextCharLL();
  791. c.fEscaped = false;
  792. //
  793. // check for '' sequence.
  794. // These are recognized in all contexts, whether in quoted text or not.
  795. //
  796. if (c.fChar == chApos) {
  797. if (fRB->fRules.char32At(fNextIndex) == chApos) {
  798. c.fChar = nextCharLL(); // get nextChar officially so character counts
  799. c.fEscaped = true; // stay correct.
  800. }
  801. else
  802. {
  803. // Single quote, by itself.
  804. // Toggle quoting mode.
  805. // Return either '(' or ')', because quotes cause a grouping of the quoted text.
  806. fQuoteMode = !fQuoteMode;
  807. if (fQuoteMode) {
  808. c.fChar = chLParen;
  809. } else {
  810. c.fChar = chRParen;
  811. }
  812. c.fEscaped = false; // The paren that we return is not escaped.
  813. return;
  814. }
  815. }
  816. if (c.fChar == static_cast<UChar32>(-1)) {
  817. return;
  818. }
  819. if (fQuoteMode) {
  820. c.fEscaped = true;
  821. }
  822. else
  823. {
  824. // We are not in a 'quoted region' of the source.
  825. //
  826. if (c.fChar == chPound) {
  827. // Start of a comment. Consume the rest of it.
  828. // The new-line char that terminates the comment is always returned.
  829. // It will be treated as white-space, and serves to break up anything
  830. // that might otherwise incorrectly clump together with a comment in
  831. // the middle (a variable name, for example.)
  832. int32_t commentStart = fScanIndex;
  833. for (;;) {
  834. c.fChar = nextCharLL();
  835. if (c.fChar == static_cast<UChar32>(-1) || // EOF
  836. c.fChar == chCR ||
  837. c.fChar == chLF ||
  838. c.fChar == chNEL ||
  839. c.fChar == chLS) {break;}
  840. }
  841. for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
  842. fRB->fStrippedRules.setCharAt(i, u' ');
  843. }
  844. }
  845. if (c.fChar == static_cast<UChar32>(-1)) {
  846. return;
  847. }
  848. //
  849. // check for backslash escaped characters.
  850. // Use UnicodeString::unescapeAt() to handle them.
  851. //
  852. if (c.fChar == chBackSlash) {
  853. c.fEscaped = true;
  854. int32_t startX = fNextIndex;
  855. c.fChar = fRB->fRules.unescapeAt(fNextIndex);
  856. if (fNextIndex == startX) {
  857. error(U_BRK_HEX_DIGITS_EXPECTED);
  858. }
  859. fCharNum += fNextIndex-startX;
  860. }
  861. }
  862. // putc(c.fChar, stdout);
  863. }
  864. //------------------------------------------------------------------------------
  865. //
  866. // Parse RBBI rules. The state machine for rules parsing is here.
  867. // The state tables are hand-written in the file rbbirpt.txt,
  868. // and converted to the form used here by a perl
  869. // script rbbicst.pl
  870. //
  871. //------------------------------------------------------------------------------
  872. void RBBIRuleScanner::parse() {
  873. uint16_t state;
  874. const RBBIRuleTableEl *tableEl;
  875. if (U_FAILURE(*fRB->fStatus)) {
  876. return;
  877. }
  878. state = 1;
  879. nextChar(fC);
  880. //
  881. // Main loop for the rule parsing state machine.
  882. // Runs once per state transition.
  883. // Each time through optionally performs, depending on the state table,
  884. // - an advance to the the next input char
  885. // - an action to be performed.
  886. // - pushing or popping a state to/from the local state return stack.
  887. //
  888. for (;;) {
  889. // Bail out if anything has gone wrong.
  890. // RBBI rule file parsing stops on the first error encountered.
  891. if (U_FAILURE(*fRB->fStatus)) {
  892. break;
  893. }
  894. // Quit if state == 0. This is the normal way to exit the state machine.
  895. //
  896. if (state == 0) {
  897. break;
  898. }
  899. // Find the state table element that matches the input char from the rule, or the
  900. // class of the input character. Start with the first table row for this
  901. // state, then linearly scan forward until we find a row that matches the
  902. // character. The last row for each state always matches all characters, so
  903. // the search will stop there, if not before.
  904. //
  905. tableEl = &gRuleParseStateTable[state];
  906. #ifdef RBBI_DEBUG
  907. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
  908. RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
  909. fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
  910. }
  911. #endif
  912. for (;;) {
  913. #ifdef RBBI_DEBUG
  914. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
  915. #endif
  916. if (tableEl->fCharClass < 127 && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) {
  917. // Table row specified an individual character, not a set, and
  918. // the input character is not escaped, and
  919. // the input character matched it.
  920. break;
  921. }
  922. if (tableEl->fCharClass == 255) {
  923. // Table row specified default, match anything character class.
  924. break;
  925. }
  926. if (tableEl->fCharClass == 254 && fC.fEscaped) {
  927. // Table row specified "escaped" and the char was escaped.
  928. break;
  929. }
  930. if (tableEl->fCharClass == 253 && fC.fEscaped &&
  931. (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
  932. // Table row specified "escaped P" and the char is either 'p' or 'P'.
  933. break;
  934. }
  935. if (tableEl->fCharClass == 252 && fC.fChar == static_cast<UChar32>(-1)) {
  936. // Table row specified eof and we hit eof on the input.
  937. break;
  938. }
  939. if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
  940. fC.fEscaped == false && // char is not escaped &&
  941. fC.fChar != static_cast<UChar32>(-1)) { // char is not EOF
  942. U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
  943. if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
  944. // Table row specified a character class, or set of characters,
  945. // and the current char matches it.
  946. break;
  947. }
  948. }
  949. // No match on this row, advance to the next row for this state,
  950. tableEl++;
  951. }
  952. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
  953. //
  954. // We've found the row of the state table that matches the current input
  955. // character from the rules string.
  956. // Perform any action specified by this row in the state table.
  957. if (doParseActions(static_cast<int32_t>(tableEl->fAction)) == false) {
  958. // Break out of the state machine loop if the
  959. // the action signalled some kind of error, or
  960. // the action was to exit, occurs on normal end-of-rules-input.
  961. break;
  962. }
  963. if (tableEl->fPushState != 0) {
  964. fStackPtr++;
  965. if (fStackPtr >= kStackSize) {
  966. error(U_BRK_INTERNAL_ERROR);
  967. RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
  968. fStackPtr--;
  969. }
  970. fStack[fStackPtr] = tableEl->fPushState;
  971. }
  972. if (tableEl->fNextChar) {
  973. nextChar(fC);
  974. }
  975. // Get the next state from the table entry, or from the
  976. // state stack if the next state was specified as "pop".
  977. if (tableEl->fNextState != 255) {
  978. state = tableEl->fNextState;
  979. } else {
  980. state = fStack[fStackPtr];
  981. fStackPtr--;
  982. if (fStackPtr < 0) {
  983. error(U_BRK_INTERNAL_ERROR);
  984. RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
  985. fStackPtr++;
  986. }
  987. }
  988. }
  989. if (U_FAILURE(*fRB->fStatus)) {
  990. return;
  991. }
  992. // If there are no forward rules set an error.
  993. //
  994. if (fRB->fForwardTree == nullptr) {
  995. error(U_BRK_RULE_SYNTAX);
  996. return;
  997. }
  998. //
  999. // Parsing of the input RBBI rules is complete.
  1000. // We now have a parse tree for the rule expressions
  1001. // and a list of all UnicodeSets that are referenced.
  1002. //
  1003. #ifdef RBBI_DEBUG
  1004. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
  1005. if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
  1006. RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
  1007. RBBINode::printTree(fRB->fForwardTree, true);
  1008. RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
  1009. RBBINode::printTree(fRB->fReverseTree, true);
  1010. RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
  1011. RBBINode::printTree(fRB->fSafeFwdTree, true);
  1012. RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
  1013. RBBINode::printTree(fRB->fSafeRevTree, true);
  1014. }
  1015. #endif
  1016. }
  //------------------------------------------------------------------------------
  //
  //  printNodeStack    for debugging...
  //                    Prints the title, then the tree of every node on the node
  //                    stack, from the top entry down to slot 1.
  //
  //------------------------------------------------------------------------------
  #ifdef RBBI_DEBUG
  void RBBIRuleScanner::printNodeStack(const char *title) {
      RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
      for (int slot = fNodeStackPtr; slot > 0; --slot) {
          RBBINode::printTree(fNodeStack[slot], true);
      }
  }
  #endif
  1029. //------------------------------------------------------------------------------
  1030. //
  1031. // pushNewNode create a new RBBINode of the specified type and push it
  1032. // onto the stack of nodes.
  1033. //
  1034. //------------------------------------------------------------------------------
  1035. RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
  1036. if (U_FAILURE(*fRB->fStatus)) {
  1037. return nullptr;
  1038. }
  1039. if (fNodeStackPtr >= kStackSize - 1) {
  1040. error(U_BRK_RULE_SYNTAX);
  1041. RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
  1042. return nullptr;
  1043. }
  1044. fNodeStackPtr++;
  1045. fNodeStack[fNodeStackPtr] = new RBBINode(t);
  1046. if (fNodeStack[fNodeStackPtr] == nullptr) {
  1047. *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
  1048. }
  1049. return fNodeStack[fNodeStackPtr];
  1050. }
  1051. //------------------------------------------------------------------------------
  1052. //
  1053. // scanSet Construct a UnicodeSet from the text at the current scan
  1054. // position. Advance the scan position to the first character
  1055. // after the set.
  1056. //
  1057. // A new RBBI setref node referring to the set is pushed onto the node
  1058. // stack.
  1059. //
  1060. // The scan position is normally under the control of the state machine
  1061. // that controls rule parsing. UnicodeSets, however, are parsed by
  1062. // the UnicodeSet constructor, not by the RBBI rule parser.
  1063. //
  1064. //------------------------------------------------------------------------------
  1065. void RBBIRuleScanner::scanSet() {
  1066. ParsePosition pos;
  1067. int startPos;
  1068. int i;
  1069. if (U_FAILURE(*fRB->fStatus)) {
  1070. return;
  1071. }
  1072. pos.setIndex(fScanIndex);
  1073. startPos = fScanIndex;
  1074. UErrorCode localStatus = U_ZERO_ERROR;
  1075. LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
  1076. if (U_FAILURE(localStatus)) {
  1077. error(localStatus);
  1078. return;
  1079. }
  1080. uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
  1081. if (U_FAILURE(localStatus)) {
  1082. // TODO: Get more accurate position of the error from UnicodeSet's return info.
  1083. // UnicodeSet appears to not be reporting correctly at this time.
  1084. #ifdef RBBI_DEBUG
  1085. RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
  1086. #endif
  1087. error(localStatus);
  1088. return;
  1089. }
  1090. // Verify that the set contains at least one code point.
  1091. //
  1092. U_ASSERT(uset.isValid());
  1093. UnicodeSet tempSet(*uset);
  1094. // Use tempSet to handle the case that the UnicodeSet contains
  1095. // only string element, such as [{ab}] and treat it as empty set.
  1096. tempSet.removeAllStrings();
  1097. if (tempSet.isEmpty()) {
  1098. // This set is empty.
  1099. // Make it an error, because it almost certainly is not what the user wanted.
  1100. // Also, avoids having to think about corner cases in the tree manipulation code
  1101. // that occurs later on.
  1102. error(U_BRK_RULE_EMPTY_SET);
  1103. return;
  1104. }
  1105. // Advance the RBBI parse position over the UnicodeSet pattern.
  1106. // Don't just set fScanIndex because the line/char positions maintained
  1107. // for error reporting would be thrown off.
  1108. i = pos.getIndex();
  1109. for (;U_SUCCESS(*fRB->fStatus);) {
  1110. if (fNextIndex >= i) {
  1111. break;
  1112. }
  1113. nextCharLL();
  1114. }
  1115. if (U_SUCCESS(*fRB->fStatus)) {
  1116. RBBINode *n;
  1117. n = pushNewNode(RBBINode::setRef);
  1118. if (U_FAILURE(*fRB->fStatus)) {
  1119. return;
  1120. }
  1121. n->fFirstPos = startPos;
  1122. n->fLastPos = fNextIndex;
  1123. fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  1124. // findSetFor() serves several purposes here:
  1125. // - Adopts storage for the UnicodeSet, will be responsible for deleting.
  1126. // - Maintains collection of all sets in use, needed later for establishing
  1127. // character categories for run time engine.
  1128. // - Eliminates mulitiple instances of the same set.
  1129. // - Creates a new uset node if necessary (if this isn't a duplicate.)
  1130. findSetFor(n->fText, n, uset.orphan());
  1131. }
  1132. }
  1133. int32_t RBBIRuleScanner::numRules() {
  1134. return fRuleNum;
  1135. }
  1136. U_NAMESPACE_END
  1137. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */