2
0

rbbinode.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 2002-2016 International Business Machines Corporation *
  6. * and others. All rights reserved. *
  7. ***************************************************************************
  8. */
  9. //
  10. // File: rbbinode.cpp
  11. //
  12. // Implementation of class RBBINode, which represents a node in the
  13. // tree generated when parsing the Rules Based Break Iterator rules.
  14. //
  15. // This "Class" is actually closer to a struct.
  16. // Code using it is expected to directly access fields much of the time.
  17. //
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_BREAK_ITERATION
  20. #include "unicode/unistr.h"
  21. #include "unicode/uniset.h"
  22. #include "unicode/uchar.h"
  23. #include "unicode/parsepos.h"
  24. #include "cstr.h"
  25. #include "uvector.h"
  26. #include "rbbirb.h"
  27. #include "rbbinode.h"
  28. #include "uassert.h"
  29. U_NAMESPACE_BEGIN
  #ifdef RBBI_DEBUG
  // Debug builds only: running count of the nodes constructed so far.
  // Each RBBINode constructor pre-increments it to obtain the new node's
  // fSerialNum.
  static int gLastSerial{0};
  #endif
  33. //-------------------------------------------------------------------------
  34. //
  35. // Constructor. Just set the fields to reasonable default values.
  36. //
  37. //-------------------------------------------------------------------------
  38. RBBINode::RBBINode(NodeType t, UErrorCode& status) : UMemory() {
  39. if (U_FAILURE(status)) {
  40. return;
  41. }
  42. #ifdef RBBI_DEBUG
  43. fSerialNum = ++gLastSerial;
  44. #endif
  45. fType = t;
  46. fParent = nullptr;
  47. fLeftChild = nullptr;
  48. fRightChild = nullptr;
  49. fInputSet = nullptr;
  50. fFirstPos = 0;
  51. fLastPos = 0;
  52. fNullable = false;
  53. fLookAheadEnd = false;
  54. fRuleRoot = false;
  55. fChainIn = false;
  56. fVal = 0;
  57. fPrecedence = precZero;
  58. fFirstPosSet = new UVector(status);
  59. fLastPosSet = new UVector(status);
  60. fFollowPos = new UVector(status);
  61. if (U_SUCCESS(status) &&
  62. (fFirstPosSet == nullptr || fLastPosSet == nullptr || fFollowPos == nullptr)) {
  63. status = U_MEMORY_ALLOCATION_ERROR;
  64. }
  65. if (t==opCat) {fPrecedence = precOpCat;}
  66. else if (t==opOr) {fPrecedence = precOpOr;}
  67. else if (t==opStart) {fPrecedence = precStart;}
  68. else if (t==opLParen) {fPrecedence = precLParen;}
  69. }
  70. RBBINode::RBBINode(const RBBINode &other, UErrorCode& status) : UMemory(other) {
  71. if (U_FAILURE(status)) {
  72. return;
  73. }
  74. #ifdef RBBI_DEBUG
  75. fSerialNum = ++gLastSerial;
  76. #endif
  77. fType = other.fType;
  78. fParent = nullptr;
  79. fLeftChild = nullptr;
  80. fRightChild = nullptr;
  81. fInputSet = other.fInputSet;
  82. fPrecedence = other.fPrecedence;
  83. fText = other.fText;
  84. fFirstPos = other.fFirstPos;
  85. fLastPos = other.fLastPos;
  86. fNullable = other.fNullable;
  87. fVal = other.fVal;
  88. fRuleRoot = false;
  89. fChainIn = other.fChainIn;
  90. fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
  91. fLastPosSet = new UVector(status);
  92. fFollowPos = new UVector(status);
  93. if (U_SUCCESS(status) &&
  94. (fFirstPosSet == nullptr || fLastPosSet == nullptr || fFollowPos == nullptr)) {
  95. status = U_MEMORY_ALLOCATION_ERROR;
  96. }
  97. }
  98. //-------------------------------------------------------------------------
  99. //
  100. // Destructor. Deletes both this node AND any child nodes,
  101. // except in the case of variable reference (varRef) and set
  102. // reference (setRef) nodes. For these, the left child points
  103. // back to a definition that is shared by all references to it,
  104. // meaning it can't be deleted here.
  105. //
  106. //-------------------------------------------------------------------------
  107. RBBINode::~RBBINode() {
  108. // printf("deleting node %8x serial %4d\n", this, this->fSerialNum);
  109. delete fInputSet;
  110. fInputSet = nullptr;
  111. switch (this->fType) {
  112. case varRef:
  113. case setRef:
  114. // for these node types, multiple instances point to the same "children"
  115. // Storage ownership of children handled elsewhere. Don't delete here.
  116. break;
  117. default:
  118. // Avoid using a recursive implementation because of stack overflow problems.
  119. // See bug ICU-22584.
  120. // delete fLeftChild;
  121. NRDeleteNode(fLeftChild);
  122. fLeftChild = nullptr;
  123. // delete fRightChild;
  124. NRDeleteNode(fRightChild);
  125. fRightChild = nullptr;
  126. }
  127. delete fFirstPosSet;
  128. delete fLastPosSet;
  129. delete fFollowPos;
  130. }
  131. /**
  132. * Non-recursive delete of a node + its children. Used from the node destructor
  133. * instead of the more obvious recursive implementation to avoid problems with
  134. * stack overflow with some perverse test rule data (from fuzzing).
  135. */
  136. void RBBINode::NRDeleteNode(RBBINode *node) {
  137. if (node == nullptr) {
  138. return;
  139. }
  140. RBBINode *stopNode = node->fParent;
  141. RBBINode *nextNode = node;
  142. while (nextNode != stopNode && nextNode != nullptr) {
  143. RBBINode *currentNode = nextNode;
  144. if ((currentNode->fLeftChild == nullptr && currentNode->fRightChild == nullptr) ||
  145. currentNode->fType == varRef || // varRef and setRef nodes do not
  146. currentNode->fType == setRef) { // own their children nodes.
  147. // CurrentNode is effectively a leaf node; it's safe to go ahead and delete it.
  148. nextNode = currentNode->fParent;
  149. if (nextNode) {
  150. if (nextNode->fLeftChild == currentNode) {
  151. nextNode->fLeftChild = nullptr;
  152. } else if (nextNode->fRightChild == currentNode) {
  153. nextNode->fRightChild = nullptr;
  154. }
  155. }
  156. delete currentNode;
  157. } else if (currentNode->fLeftChild) {
  158. nextNode = currentNode->fLeftChild;
  159. if (nextNode->fParent == nullptr) {
  160. nextNode->fParent = currentNode;
  161. // fParent isn't always set; do it now if not.
  162. }
  163. U_ASSERT(nextNode->fParent == currentNode);
  164. } else if (currentNode->fRightChild) {
  165. nextNode = currentNode->fRightChild;
  166. if (nextNode->fParent == nullptr) {
  167. nextNode->fParent = currentNode;
  168. // fParent isn't always set; do it now if not.
  169. }
  170. U_ASSERT(nextNode->fParent == currentNode);
  171. }
  172. }
  173. }
  174. //-------------------------------------------------------------------------
  175. //
  176. // cloneTree Make a copy of the subtree rooted at this node.
  177. // Discard any variable references encountered along the way,
  178. // and replace with copies of the variable's definitions.
  179. // Used to replicate the expression underneath variable
  180. // references in preparation for generating the DFA tables.
  181. //
  182. //-------------------------------------------------------------------------
  183. constexpr int kRecursiveDepthLimit = 3500;
  184. RBBINode *RBBINode::cloneTree(UErrorCode &status, int depth) {
  185. if (U_FAILURE(status)) {
  186. return nullptr;
  187. }
  188. // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR
  189. // to avoid stack overflow crash.
  190. if (depth > kRecursiveDepthLimit) {
  191. status = U_INPUT_TOO_LONG_ERROR;
  192. return nullptr;
  193. }
  194. RBBINode *n;
  195. if (fType == RBBINode::varRef) {
  196. // If the current node is a variable reference, skip over it
  197. // and clone the definition of the variable instead.
  198. n = fLeftChild->cloneTree(status, depth+1);
  199. if (U_FAILURE(status)) {
  200. return nullptr;
  201. }
  202. } else if (fType == RBBINode::uset) {
  203. n = this;
  204. } else {
  205. n = new RBBINode(*this, status);
  206. if (U_FAILURE(status)) {
  207. delete n;
  208. return nullptr;
  209. }
  210. // Check for null pointer.
  211. if (n == nullptr) {
  212. status = U_MEMORY_ALLOCATION_ERROR;
  213. return nullptr;
  214. }
  215. if (fLeftChild != nullptr) {
  216. n->fLeftChild = fLeftChild->cloneTree(status, depth+1);
  217. if (U_FAILURE(status)) {
  218. delete n;
  219. return nullptr;
  220. }
  221. n->fLeftChild->fParent = n;
  222. }
  223. if (fRightChild != nullptr) {
  224. n->fRightChild = fRightChild->cloneTree(status, depth+1);
  225. if (U_FAILURE(status)) {
  226. delete n;
  227. return nullptr;
  228. }
  229. n->fRightChild->fParent = n;
  230. }
  231. }
  232. return n;
  233. }
  234. //-------------------------------------------------------------------------
  235. //
  236. // flattenVariables Walk a parse tree, replacing any variable
  237. // references with a copy of the variable's definition.
  238. // Aside from variables, the tree is not changed.
  239. //
  240. // Return the root of the tree. If the root was not a variable
  241. // reference, it remains unchanged - the root we started with
  242. // is the root we return. If, however, the root was a variable
  243. // reference, the root of the newly cloned replacement tree will
  244. // be returned, and the original tree deleted.
  245. //
  246. // This function works by recursively walking the tree
  247. // without doing anything until a variable reference is
  248. // found, then calling cloneTree() at that point. Any
  249. // nested references are handled by cloneTree(), not here.
  250. //
  251. //-------------------------------------------------------------------------
  252. RBBINode *RBBINode::flattenVariables(UErrorCode& status, int depth) {
  253. if (U_FAILURE(status)) {
  254. return this;
  255. }
  256. // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR
  257. // to avoid stack overflow crash.
  258. if (depth > kRecursiveDepthLimit) {
  259. status = U_INPUT_TOO_LONG_ERROR;
  260. return this;
  261. }
  262. if (fType == varRef) {
  263. RBBINode *retNode = fLeftChild->cloneTree(status, depth+1);
  264. if (U_FAILURE(status)) {
  265. return this;
  266. }
  267. retNode->fRuleRoot = this->fRuleRoot;
  268. retNode->fChainIn = this->fChainIn;
  269. delete this; // TODO: undefined behavior. Fix.
  270. return retNode;
  271. }
  272. if (fLeftChild != nullptr) {
  273. fLeftChild = fLeftChild->flattenVariables(status, depth+1);
  274. if (fLeftChild == nullptr) {
  275. status = U_MEMORY_ALLOCATION_ERROR;
  276. }
  277. if (U_FAILURE(status)) {
  278. return this;
  279. }
  280. fLeftChild->fParent = this;
  281. }
  282. if (fRightChild != nullptr) {
  283. fRightChild = fRightChild->flattenVariables(status, depth+1);
  284. if (fRightChild == nullptr) {
  285. status = U_MEMORY_ALLOCATION_ERROR;
  286. }
  287. if (U_FAILURE(status)) {
  288. return this;
  289. }
  290. fRightChild->fParent = this;
  291. }
  292. return this;
  293. }
  294. //-------------------------------------------------------------------------
  295. //
  296. // flattenSets Walk the parse tree, replacing any nodes of type setRef
  297. // with a copy of the expression tree for the set. A set's
  298. // equivalent expression tree is precomputed and saved as
  299. // the left child of the uset node.
  300. //
  301. //-------------------------------------------------------------------------
  302. void RBBINode::flattenSets(UErrorCode &status, int depth) {
  303. if (U_FAILURE(status)) {
  304. return;
  305. }
  306. // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR
  307. // to avoid stack overflow crash.
  308. if (depth > kRecursiveDepthLimit) {
  309. status = U_INPUT_TOO_LONG_ERROR;
  310. return;
  311. }
  312. U_ASSERT(fType != setRef);
  313. if (fLeftChild != nullptr) {
  314. if (fLeftChild->fType==setRef) {
  315. RBBINode *setRefNode = fLeftChild;
  316. RBBINode *usetNode = setRefNode->fLeftChild;
  317. RBBINode *replTree = usetNode->fLeftChild;
  318. fLeftChild = replTree->cloneTree(status, depth+1);
  319. if (U_FAILURE(status)) {
  320. delete setRefNode;
  321. return;
  322. }
  323. fLeftChild->fParent = this;
  324. delete setRefNode;
  325. } else {
  326. fLeftChild->flattenSets(status, depth+1);
  327. }
  328. }
  329. if (fRightChild != nullptr) {
  330. if (fRightChild->fType==setRef) {
  331. RBBINode *setRefNode = fRightChild;
  332. RBBINode *usetNode = setRefNode->fLeftChild;
  333. RBBINode *replTree = usetNode->fLeftChild;
  334. fRightChild = replTree->cloneTree(status, depth+1);
  335. if (U_FAILURE(status)) {
  336. delete setRefNode;
  337. return;
  338. }
  339. fRightChild->fParent = this;
  340. delete setRefNode;
  341. } else {
  342. fRightChild->flattenSets(status, depth+1);
  343. }
  344. }
  345. }
  346. //-------------------------------------------------------------------------
  347. //
  348. // findNodes() Locate all the nodes of the specified type, starting
  349. // at the specified root.
  350. //
  351. //-------------------------------------------------------------------------
  352. void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) {
  353. /* test for buffer overflows */
  354. if (U_FAILURE(status)) {
  355. return;
  356. }
  357. U_ASSERT(!dest->hasDeleter());
  358. if (fType == kind) {
  359. dest->addElement(this, status);
  360. }
  361. if (fLeftChild != nullptr) {
  362. fLeftChild->findNodes(dest, kind, status);
  363. }
  364. if (fRightChild != nullptr) {
  365. fRightChild->findNodes(dest, kind, status);
  366. }
  367. }
  368. //-------------------------------------------------------------------------
  369. //
  370. // print. Print out a single node, for debugging.
  371. //
  372. //-------------------------------------------------------------------------
  373. #ifdef RBBI_DEBUG
  374. static int32_t serial(const RBBINode *node) {
  375. return (node == nullptr? -1 : node->fSerialNum);
  376. }
  377. void RBBINode::printNode(const RBBINode *node) {
  378. static const char * const nodeTypeNames[] = {
  379. "setRef",
  380. "uset",
  381. "varRef",
  382. "leafChar",
  383. "lookAhead",
  384. "tag",
  385. "endMark",
  386. "opStart",
  387. "opCat",
  388. "opOr",
  389. "opStar",
  390. "opPlus",
  391. "opQuestion",
  392. "opBreak",
  393. "opReverse",
  394. "opLParen"
  395. };
  396. if (node==nullptr) {
  397. RBBIDebugPrintf("%10p", (void *)node);
  398. } else {
  399. RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
  400. (void *)node, node->fSerialNum, nodeTypeNames[node->fType],
  401. node->fRuleRoot?'R':' ', node->fChainIn?'C':' ',
  402. serial(node->fLeftChild), serial(node->fRightChild), serial(node->fParent),
  403. node->fFirstPos, node->fVal);
  404. if (node->fType == varRef) {
  405. RBBI_DEBUG_printUnicodeString(node->fText);
  406. }
  407. }
  408. RBBIDebugPrintf("\n");
  409. }
  410. #endif
  411. #ifdef RBBI_DEBUG
  412. U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) {
  413. RBBIDebugPrintf("%*s", minWidth, CStr(s)());
  414. }
  415. #endif
  416. //-------------------------------------------------------------------------
  417. //
  418. // print. Print out the tree of nodes rooted at "this"
  419. //
  420. //-------------------------------------------------------------------------
  421. #ifdef RBBI_DEBUG
  422. void RBBINode::printNodeHeader() {
  423. RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
  424. }
  425. void RBBINode::printTree(const RBBINode *node, UBool printHeading) {
  426. if (printHeading) {
  427. printNodeHeader();
  428. }
  429. printNode(node);
  430. if (node != nullptr) {
  431. // Only dump the definition under a variable reference if asked to.
  432. // Unconditionally dump children of all other node types.
  433. if (node->fType != varRef) {
  434. if (node->fLeftChild != nullptr) {
  435. printTree(node->fLeftChild, false);
  436. }
  437. if (node->fRightChild != nullptr) {
  438. printTree(node->fRightChild, false);
  439. }
  440. }
  441. }
  442. }
  443. #endif
  444. U_NAMESPACE_END
  445. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */