stringtriebuilder.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2010-2012,2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: stringtriebuilder.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2010dec24
  14. * created by: Markus W. Scherer
  15. */
  16. #ifndef __STRINGTRIEBUILDER_H__
  17. #define __STRINGTRIEBUILDER_H__
  18. #include "unicode/utypes.h"
  19. #if U_SHOW_CPLUSPLUS_API
  20. #include "unicode/uobject.h"
  21. /**
  22. * \file
  23. * \brief C++ API: Builder API for trie builders
  24. */
  25. // Forward declaration.
  26. /// \cond
  27. struct UHashtable;
  28. typedef struct UHashtable UHashtable;
  29. /// \endcond
  30. /**
  31. * Build options for BytesTrieBuilder and CharsTrieBuilder.
  32. * @stable ICU 4.8
  33. */
  34. enum UStringTrieBuildOption {
  35. /**
  36. * Builds a trie quickly.
  37. * @stable ICU 4.8
  38. */
  39. USTRINGTRIE_BUILD_FAST,
  40. /**
  41. * Builds a trie more slowly, attempting to generate
  42. * a shorter but equivalent serialization.
  43. * This build option also uses more memory.
  44. *
  45. * This option can be effective when many integer values are the same
  46. * and string/byte sequence suffixes can be shared.
  47. * Runtime speed is not expected to improve.
  48. * @stable ICU 4.8
  49. */
  50. USTRINGTRIE_BUILD_SMALL
  51. };
  52. U_NAMESPACE_BEGIN
  53. /**
  54. * Base class for string trie builder classes.
  55. *
  56. * This class is not intended for public subclassing.
  57. * @stable ICU 4.8
  58. */
  59. class U_COMMON_API StringTrieBuilder : public UObject {
  60. public:
  61. #ifndef U_HIDE_INTERNAL_API
  62. /** @internal */
  63. static int32_t hashNode(const void *node);
  64. /** @internal */
  65. static UBool equalNodes(const void *left, const void *right);
  66. #endif /* U_HIDE_INTERNAL_API */
  67. protected:
  68. // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
  69. // or else the compiler will create a public default constructor.
  70. /** @internal */
  71. StringTrieBuilder();
  72. /** @internal */
  73. virtual ~StringTrieBuilder();
  74. #ifndef U_HIDE_INTERNAL_API
  75. /** @internal */
  76. void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
  77. /** @internal */
  78. void deleteCompactBuilder();
  79. /** @internal */
  80. void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
  81. /** @internal */
  82. int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
  83. /** @internal */
  84. int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
  85. #endif /* U_HIDE_INTERNAL_API */
  86. class Node;
  87. #ifndef U_HIDE_INTERNAL_API
  88. /** @internal */
  89. Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
  90. /** @internal */
  91. Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
  92. int32_t length, UErrorCode &errorCode);
  93. #endif /* U_HIDE_INTERNAL_API */
  94. /** @internal */
  95. virtual int32_t getElementStringLength(int32_t i) const = 0;
  96. /** @internal */
  97. virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
  98. /** @internal */
  99. virtual int32_t getElementValue(int32_t i) const = 0;
  100. // Finds the first unit index after this one where
  101. // the first and last element have different units again.
  102. /** @internal */
  103. virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
  104. // Number of different units at unitIndex.
  105. /** @internal */
  106. virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
  107. /** @internal */
  108. virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
  109. /** @internal */
  110. virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
  111. /** @internal */
  112. virtual UBool matchNodesCanHaveValues() const = 0;
  113. /** @internal */
  114. virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
  115. /** @internal */
  116. virtual int32_t getMinLinearMatch() const = 0;
  117. /** @internal */
  118. virtual int32_t getMaxLinearMatchLength() const = 0;
  119. #ifndef U_HIDE_INTERNAL_API
  120. // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
  121. /** @internal */
  122. static const int32_t kMaxBranchLinearSubNodeLength=5;
  123. // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
  124. // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
  125. /** @internal */
  126. static const int32_t kMaxSplitBranchLevels=14;
  127. /**
  128. * Makes sure that there is only one unique node registered that is
  129. * equivalent to newNode.
  130. * @param newNode Input node. The builder takes ownership.
  131. * @param errorCode ICU in/out UErrorCode.
  132. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
  133. * @return newNode if it is the first of its kind, or
  134. * an equivalent node if newNode is a duplicate.
  135. * @internal
  136. */
  137. Node *registerNode(Node *newNode, UErrorCode &errorCode);
  138. /**
  139. * Makes sure that there is only one unique FinalValueNode registered
  140. * with this value.
  141. * Avoids creating a node if the value is a duplicate.
  142. * @param value A final value.
  143. * @param errorCode ICU in/out UErrorCode.
  144. Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
  145. * @return A FinalValueNode with the given value.
  146. * @internal
  147. */
  148. Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
  149. #endif /* U_HIDE_INTERNAL_API */
  150. /*
  151. * C++ note:
  152. * registerNode() and registerFinalValue() take ownership of their input nodes,
  153. * and only return owned nodes.
  154. * If they see a failure UErrorCode, they will delete the input node.
  155. * If they get a nullptr pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
  156. * If there is a failure, they return nullptr.
  157. *
  158. * nullptr Node pointers can be safely passed into other Nodes because
  159. * they call the static Node::hashCode() which checks for a nullptr pointer first.
  160. *
  161. * Therefore, as long as builder functions register a new node,
  162. * they need to check for failures only before explicitly dereferencing
  163. * a Node pointer, or before setting a new UErrorCode.
  164. */
  165. // Hash set of nodes, maps from nodes to integer 1.
  166. /** @internal */
  167. UHashtable *nodes;
  168. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  169. // it is needed for layout of other objects.
  170. /**
  171. * @internal
  172. * \cond
  173. */
  174. class Node : public UObject {
  175. public:
  176. Node(int32_t initialHash) : hash(initialHash), offset(0) {}
  177. inline int32_t hashCode() const { return hash; }
  178. // Handles node==nullptr.
  179. static inline int32_t hashCode(const Node *node) { return node==nullptr ? 0 : node->hashCode(); }
  180. // Base class operator==() compares the actual class types.
  181. virtual bool operator==(const Node &other) const;
  182. inline bool operator!=(const Node &other) const { return !operator==(other); }
  183. /**
  184. * Traverses the Node graph and numbers branch edges, with rightmost edges first.
  185. * This is to avoid writing a duplicate node twice.
  186. *
  187. * Branch nodes in this trie data structure are not symmetric.
  188. * Most branch edges "jump" to other nodes but the rightmost branch edges
  189. * just continue without a jump.
  190. * Therefore, write() must write the rightmost branch edge last
  191. * (trie units are written backwards), and must write it at that point even if
  192. * it is a duplicate of a node previously written elsewhere.
  193. *
  194. * This function visits and marks right branch edges first.
  195. * Edges are numbered with increasingly negative values because we share the
  196. * offset field which gets positive values when nodes are written.
  197. * A branch edge also remembers the first number for any of its edges.
  198. *
  199. * When a further-left branch edge has a number in the range of the rightmost
  200. * edge's numbers, then it will be written as part of the required right edge
  201. * and we can avoid writing it first.
  202. *
  203. * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
  204. * edge numbers.
  205. *
  206. * @param edgeNumber The first edge number for this node and its sub-nodes.
  207. * @return An edge number that is at least the maximum-negative
  208. * of the input edge number and the numbers of this node and all of its sub-nodes.
  209. */
  210. virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
  211. // write() must set the offset to a positive value.
  212. virtual void write(StringTrieBuilder &builder) = 0;
  213. // See markRightEdgesFirst.
  214. inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
  215. StringTrieBuilder &builder) {
  216. // Note: Edge numbers are negative, lastRight<=firstRight.
  217. // If offset>0 then this node and its sub-nodes have been written already
  218. // and we need not write them again.
  219. // If this node is part of the unwritten right branch edge,
  220. // then we wait until that is written.
  221. if(offset<0 && (offset<lastRight || firstRight<offset)) {
  222. write(builder);
  223. }
  224. }
  225. inline int32_t getOffset() const { return offset; }
  226. protected:
  227. int32_t hash;
  228. int32_t offset;
  229. };
  230. #ifndef U_HIDE_INTERNAL_API
  231. // This class should not be overridden because
  232. // registerFinalValue() compares a stack-allocated FinalValueNode
  233. // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
  234. // with the input node, and the
  235. // !Node::operator==(other) used inside FinalValueNode::operator==(other)
  236. // will be false if the typeid's are different.
  237. /** @internal */
  238. class FinalValueNode : public Node {
  239. public:
  240. FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
  241. virtual bool operator==(const Node &other) const override;
  242. virtual void write(StringTrieBuilder &builder) override;
  243. protected:
  244. int32_t value;
  245. };
  246. #endif /* U_HIDE_INTERNAL_API */
  247. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  248. // it is needed for layout of other objects.
  249. /**
  250. * @internal
  251. */
  252. class ValueNode : public Node {
  253. public:
  254. ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {}
  255. virtual bool operator==(const Node &other) const override;
  256. void setValue(int32_t v) {
  257. hasValue=true;
  258. value=v;
  259. hash=hash*37u+v;
  260. }
  261. protected:
  262. UBool hasValue;
  263. int32_t value;
  264. };
  265. #ifndef U_HIDE_INTERNAL_API
  266. /**
  267. * @internal
  268. */
  269. class IntermediateValueNode : public ValueNode {
  270. public:
  271. IntermediateValueNode(int32_t v, Node *nextNode)
  272. : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
  273. virtual bool operator==(const Node &other) const override;
  274. virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
  275. virtual void write(StringTrieBuilder &builder) override;
  276. protected:
  277. Node *next;
  278. };
  279. #endif /* U_HIDE_INTERNAL_API */
  280. // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
  281. // it is needed for layout of other objects.
  282. /**
  283. * @internal
  284. */
  285. class LinearMatchNode : public ValueNode {
  286. public:
  287. LinearMatchNode(int32_t len, Node *nextNode)
  288. : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
  289. length(len), next(nextNode) {}
  290. virtual bool operator==(const Node &other) const override;
  291. virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
  292. protected:
  293. int32_t length;
  294. Node *next;
  295. };
  296. #ifndef U_HIDE_INTERNAL_API
  297. /**
  298. * @internal
  299. */
  300. class BranchNode : public Node {
  301. public:
  302. BranchNode(int32_t initialHash) : Node(initialHash) {}
  303. protected:
  304. int32_t firstEdgeNumber;
  305. };
  306. /**
  307. * @internal
  308. */
  309. class ListBranchNode : public BranchNode {
  310. public:
  311. ListBranchNode() : BranchNode(0x444444), length(0) {}
  312. virtual bool operator==(const Node &other) const override;
  313. virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
  314. virtual void write(StringTrieBuilder &builder) override;
  315. // Adds a unit with a final value.
  316. void add(int32_t c, int32_t value) {
  317. units[length]=(char16_t)c;
  318. equal[length]=nullptr;
  319. values[length]=value;
  320. ++length;
  321. hash=(hash*37u+c)*37u+value;
  322. }
  323. // Adds a unit which leads to another match node.
  324. void add(int32_t c, Node *node) {
  325. units[length]=(char16_t)c;
  326. equal[length]=node;
  327. values[length]=0;
  328. ++length;
  329. hash=(hash*37u+c)*37u+hashCode(node);
  330. }
  331. protected:
  332. Node *equal[kMaxBranchLinearSubNodeLength]; // nullptr means "has final value".
  333. int32_t length;
  334. int32_t values[kMaxBranchLinearSubNodeLength];
  335. char16_t units[kMaxBranchLinearSubNodeLength];
  336. };
  337. /**
  338. * @internal
  339. */
  340. class SplitBranchNode : public BranchNode {
  341. public:
  342. SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
  343. : BranchNode(((0x555555u*37u+middleUnit)*37u+
  344. hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
  345. unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
  346. virtual bool operator==(const Node &other) const override;
  347. virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
  348. virtual void write(StringTrieBuilder &builder) override;
  349. protected:
  350. char16_t unit;
  351. Node *lessThan;
  352. Node *greaterOrEqual;
  353. };
  354. // Branch head node, for writing the actual node lead unit.
  355. /** @internal */
  356. class BranchHeadNode : public ValueNode {
  357. public:
  358. BranchHeadNode(int32_t len, Node *subNode)
  359. : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
  360. length(len), next(subNode) {}
  361. virtual bool operator==(const Node &other) const override;
  362. virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
  363. virtual void write(StringTrieBuilder &builder) override;
  364. protected:
  365. int32_t length;
  366. Node *next; // A branch sub-node.
  367. };
  368. #endif /* U_HIDE_INTERNAL_API */
  369. /// \endcond
  370. /** @internal */
  371. virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
  372. Node *nextNode) const = 0;
  373. /** @internal */
  374. virtual int32_t write(int32_t unit) = 0;
  375. /** @internal */
  376. virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
  377. /** @internal */
  378. virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
  379. /** @internal */
  380. virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
  381. /** @internal */
  382. virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
  383. };
  384. U_NAMESPACE_END
  385. #endif /* U_SHOW_CPLUSPLUS_API */
  386. #endif // __STRINGTRIEBUILDER_H__