rbbinode.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 2002-2016 International Business Machines Corporation *
  6. * and others. All rights reserved. *
  7. ***************************************************************************
  8. */
  9. //
  10. // File: rbbinode.cpp
  11. //
  12. // Implementation of class RBBINode, which represents a node in the
  13. // tree generated when parsing the Rules Based Break Iterator rules.
  14. //
  15. // This "Class" is actually closer to a struct.
  16. // Code using it is expected to directly access fields much of the time.
  17. //
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_BREAK_ITERATION
  20. #include "unicode/unistr.h"
  21. #include "unicode/uniset.h"
  22. #include "unicode/uchar.h"
  23. #include "unicode/parsepos.h"
  24. #include "cstr.h"
  25. #include "uvector.h"
  26. #include "rbbirb.h"
  27. #include "rbbinode.h"
  28. #include "uassert.h"
  29. U_NAMESPACE_BEGIN
  30. #ifdef RBBI_DEBUG
  31. static int gLastSerial = 0;
  32. #endif
  33. //-------------------------------------------------------------------------
  34. //
  35. // Constructor. Just set the fields to reasonable default values.
  36. //
  37. //-------------------------------------------------------------------------
  38. RBBINode::RBBINode(NodeType t) : UMemory() {
  39. #ifdef RBBI_DEBUG
  40. fSerialNum = ++gLastSerial;
  41. #endif
  42. fType = t;
  43. fParent = nullptr;
  44. fLeftChild = nullptr;
  45. fRightChild = nullptr;
  46. fInputSet = nullptr;
  47. fFirstPos = 0;
  48. fLastPos = 0;
  49. fNullable = false;
  50. fLookAheadEnd = false;
  51. fRuleRoot = false;
  52. fChainIn = false;
  53. fVal = 0;
  54. fPrecedence = precZero;
  55. UErrorCode status = U_ZERO_ERROR;
  56. fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
  57. fLastPosSet = new UVector(status);
  58. fFollowPos = new UVector(status);
  59. if (t==opCat) {fPrecedence = precOpCat;}
  60. else if (t==opOr) {fPrecedence = precOpOr;}
  61. else if (t==opStart) {fPrecedence = precStart;}
  62. else if (t==opLParen) {fPrecedence = precLParen;}
  63. }
  64. RBBINode::RBBINode(const RBBINode &other) : UMemory(other) {
  65. #ifdef RBBI_DEBUG
  66. fSerialNum = ++gLastSerial;
  67. #endif
  68. fType = other.fType;
  69. fParent = nullptr;
  70. fLeftChild = nullptr;
  71. fRightChild = nullptr;
  72. fInputSet = other.fInputSet;
  73. fPrecedence = other.fPrecedence;
  74. fText = other.fText;
  75. fFirstPos = other.fFirstPos;
  76. fLastPos = other.fLastPos;
  77. fNullable = other.fNullable;
  78. fVal = other.fVal;
  79. fRuleRoot = false;
  80. fChainIn = other.fChainIn;
  81. UErrorCode status = U_ZERO_ERROR;
  82. fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
  83. fLastPosSet = new UVector(status);
  84. fFollowPos = new UVector(status);
  85. }
  86. //-------------------------------------------------------------------------
  87. //
  88. // Destructor. Deletes both this node AND any child nodes,
  89. // except in the case of variable reference nodes. For
  90. // these, the l. child points back to the definition, which
  91. // is common for all references to the variable, meaning
  92. // it can't be deleted here.
  93. //
  94. //-------------------------------------------------------------------------
  95. RBBINode::~RBBINode() {
  96. // printf("deleting node %8x serial %4d\n", this, this->fSerialNum);
  97. delete fInputSet;
  98. fInputSet = nullptr;
  99. switch (this->fType) {
  100. case varRef:
  101. case setRef:
  102. // for these node types, multiple instances point to the same "children"
  103. // Storage ownership of children handled elsewhere. Don't delete here.
  104. break;
  105. default:
  106. // Avoid using a recursive implementation because of stack overflow problems.
  107. // See bug ICU-22584.
  108. // delete fLeftChild;
  109. NRDeleteNode(fLeftChild);
  110. fLeftChild = nullptr;
  111. // delete fRightChild;
  112. NRDeleteNode(fRightChild);
  113. fRightChild = nullptr;
  114. }
  115. delete fFirstPosSet;
  116. delete fLastPosSet;
  117. delete fFollowPos;
  118. }
  119. /**
  120. * Non-recursive delete of a node + its children. Used from the node destructor
  121. * instead of the more obvious recursive implementation to avoid problems with
  122. * stack overflow with some perverse test rule data (from fuzzing).
  123. */
  124. void RBBINode::NRDeleteNode(RBBINode *node) {
  125. if (node == nullptr) {
  126. return;
  127. }
  128. RBBINode *stopNode = node->fParent;
  129. RBBINode *nextNode = node;
  130. while (nextNode != stopNode && nextNode != nullptr) {
  131. RBBINode *currentNode = nextNode;
  132. if ((currentNode->fLeftChild == nullptr && currentNode->fRightChild == nullptr) ||
  133. currentNode->fType == varRef || // varRef and setRef nodes do not
  134. currentNode->fType == setRef) { // own their children nodes.
  135. // CurrentNode is effectively a leaf node; it's safe to go ahead and delete it.
  136. nextNode = currentNode->fParent;
  137. if (nextNode) {
  138. if (nextNode->fLeftChild == currentNode) {
  139. nextNode->fLeftChild = nullptr;
  140. } else if (nextNode->fRightChild == currentNode) {
  141. nextNode->fRightChild = nullptr;
  142. }
  143. }
  144. delete currentNode;
  145. } else if (currentNode->fLeftChild) {
  146. nextNode = currentNode->fLeftChild;
  147. if (nextNode->fParent == nullptr) {
  148. nextNode->fParent = currentNode;
  149. // fParent isn't always set; do it now if not.
  150. }
  151. U_ASSERT(nextNode->fParent == currentNode);
  152. } else if (currentNode->fRightChild) {
  153. nextNode = currentNode->fRightChild;
  154. if (nextNode->fParent == nullptr) {
  155. nextNode->fParent = currentNode;
  156. // fParent isn't always set; do it now if not.
  157. }
  158. U_ASSERT(nextNode->fParent == currentNode);
  159. }
  160. }
  161. }
  162. //-------------------------------------------------------------------------
  163. //
  164. // cloneTree Make a copy of the subtree rooted at this node.
  165. // Discard any variable references encountered along the way,
  166. // and replace with copies of the variable's definitions.
  167. // Used to replicate the expression underneath variable
  168. // references in preparation for generating the DFA tables.
  169. //
  170. //-------------------------------------------------------------------------
  171. RBBINode *RBBINode::cloneTree() {
  172. RBBINode *n;
  173. if (fType == RBBINode::varRef) {
  174. // If the current node is a variable reference, skip over it
  175. // and clone the definition of the variable instead.
  176. n = fLeftChild->cloneTree();
  177. } else if (fType == RBBINode::uset) {
  178. n = this;
  179. } else {
  180. n = new RBBINode(*this);
  181. // Check for null pointer.
  182. if (n != nullptr) {
  183. if (fLeftChild != nullptr) {
  184. n->fLeftChild = fLeftChild->cloneTree();
  185. n->fLeftChild->fParent = n;
  186. }
  187. if (fRightChild != nullptr) {
  188. n->fRightChild = fRightChild->cloneTree();
  189. n->fRightChild->fParent = n;
  190. }
  191. }
  192. }
  193. return n;
  194. }
  195. //-------------------------------------------------------------------------
  196. //
  197. // flattenVariables Walk a parse tree, replacing any variable
  198. // references with a copy of the variable's definition.
  199. // Aside from variables, the tree is not changed.
  200. //
  201. // Return the root of the tree. If the root was not a variable
  202. // reference, it remains unchanged - the root we started with
  203. // is the root we return. If, however, the root was a variable
  204. // reference, the root of the newly cloned replacement tree will
  205. // be returned, and the original tree deleted.
  206. //
  207. // This function works by recursively walking the tree
  208. // without doing anything until a variable reference is
  209. // found, then calling cloneTree() at that point. Any
  210. // nested references are handled by cloneTree(), not here.
  211. //
  212. //-------------------------------------------------------------------------
  213. constexpr int kRecursiveDepthLimit = 3500;
  214. RBBINode *RBBINode::flattenVariables(UErrorCode& status, int depth) {
  215. if (U_FAILURE(status)) {
  216. return this;
  217. }
  218. // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR
  219. // to avoid stack overflow crash.
  220. if (depth > kRecursiveDepthLimit) {
  221. status = U_INPUT_TOO_LONG_ERROR;
  222. return this;
  223. }
  224. if (fType == varRef) {
  225. RBBINode *retNode = fLeftChild->cloneTree();
  226. if (retNode != nullptr) {
  227. retNode->fRuleRoot = this->fRuleRoot;
  228. retNode->fChainIn = this->fChainIn;
  229. }
  230. delete this; // TODO: undefined behavior. Fix.
  231. return retNode;
  232. }
  233. if (fLeftChild != nullptr) {
  234. fLeftChild = fLeftChild->flattenVariables(status, depth+1);
  235. fLeftChild->fParent = this;
  236. }
  237. if (fRightChild != nullptr) {
  238. fRightChild = fRightChild->flattenVariables(status, depth+1);
  239. fRightChild->fParent = this;
  240. }
  241. return this;
  242. }
  243. //-------------------------------------------------------------------------
  244. //
  245. // flattenSets Walk the parse tree, replacing any nodes of type setRef
  246. // with a copy of the expression tree for the set. A set's
  247. // equivalent expression tree is precomputed and saved as
  248. // the left child of the uset node.
  249. //
  250. //-------------------------------------------------------------------------
  251. void RBBINode::flattenSets() {
  252. U_ASSERT(fType != setRef);
  253. if (fLeftChild != nullptr) {
  254. if (fLeftChild->fType==setRef) {
  255. RBBINode *setRefNode = fLeftChild;
  256. RBBINode *usetNode = setRefNode->fLeftChild;
  257. RBBINode *replTree = usetNode->fLeftChild;
  258. fLeftChild = replTree->cloneTree();
  259. fLeftChild->fParent = this;
  260. delete setRefNode;
  261. } else {
  262. fLeftChild->flattenSets();
  263. }
  264. }
  265. if (fRightChild != nullptr) {
  266. if (fRightChild->fType==setRef) {
  267. RBBINode *setRefNode = fRightChild;
  268. RBBINode *usetNode = setRefNode->fLeftChild;
  269. RBBINode *replTree = usetNode->fLeftChild;
  270. fRightChild = replTree->cloneTree();
  271. fRightChild->fParent = this;
  272. delete setRefNode;
  273. } else {
  274. fRightChild->flattenSets();
  275. }
  276. }
  277. }
  278. //-------------------------------------------------------------------------
  279. //
  280. // findNodes() Locate all the nodes of the specified type, starting
  281. // at the specified root.
  282. //
  283. //-------------------------------------------------------------------------
  284. void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) {
  285. /* test for buffer overflows */
  286. if (U_FAILURE(status)) {
  287. return;
  288. }
  289. U_ASSERT(!dest->hasDeleter());
  290. if (fType == kind) {
  291. dest->addElement(this, status);
  292. }
  293. if (fLeftChild != nullptr) {
  294. fLeftChild->findNodes(dest, kind, status);
  295. }
  296. if (fRightChild != nullptr) {
  297. fRightChild->findNodes(dest, kind, status);
  298. }
  299. }
  300. //-------------------------------------------------------------------------
  301. //
  // printNode. Print out a single node, for debugging.
  303. //
  304. //-------------------------------------------------------------------------
  305. #ifdef RBBI_DEBUG
  306. static int32_t serial(const RBBINode *node) {
  307. return (node == nullptr? -1 : node->fSerialNum);
  308. }
  309. void RBBINode::printNode(const RBBINode *node) {
  310. static const char * const nodeTypeNames[] = {
  311. "setRef",
  312. "uset",
  313. "varRef",
  314. "leafChar",
  315. "lookAhead",
  316. "tag",
  317. "endMark",
  318. "opStart",
  319. "opCat",
  320. "opOr",
  321. "opStar",
  322. "opPlus",
  323. "opQuestion",
  324. "opBreak",
  325. "opReverse",
  326. "opLParen"
  327. };
  328. if (node==nullptr) {
  329. RBBIDebugPrintf("%10p", (void *)node);
  330. } else {
  331. RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
  332. (void *)node, node->fSerialNum, nodeTypeNames[node->fType],
  333. node->fRuleRoot?'R':' ', node->fChainIn?'C':' ',
  334. serial(node->fLeftChild), serial(node->fRightChild), serial(node->fParent),
  335. node->fFirstPos, node->fVal);
  336. if (node->fType == varRef) {
  337. RBBI_DEBUG_printUnicodeString(node->fText);
  338. }
  339. }
  340. RBBIDebugPrintf("\n");
  341. }
  342. #endif
  343. #ifdef RBBI_DEBUG
  344. U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) {
  345. RBBIDebugPrintf("%*s", minWidth, CStr(s)());
  346. }
  347. #endif
  348. //-------------------------------------------------------------------------
  349. //
  // printTree. Print out the tree of nodes rooted at "node"
  351. //
  352. //-------------------------------------------------------------------------
  353. #ifdef RBBI_DEBUG
  354. void RBBINode::printNodeHeader() {
  355. RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
  356. }
  357. void RBBINode::printTree(const RBBINode *node, UBool printHeading) {
  358. if (printHeading) {
  359. printNodeHeader();
  360. }
  361. printNode(node);
  362. if (node != nullptr) {
  363. // Only dump the definition under a variable reference if asked to.
  364. // Unconditionally dump children of all other node types.
  365. if (node->fType != varRef) {
  366. if (node->fLeftChild != nullptr) {
  367. printTree(node->fLeftChild, false);
  368. }
  369. if (node->fRightChild != nullptr) {
  370. printTree(node->fRightChild, false);
  371. }
  372. }
  373. }
  374. }
  375. #endif
  376. U_NAMESPACE_END
  377. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */