rbbitblb.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbitblb.h
  5. //
  6. /*
  7. **********************************************************************
  8. * Copyright (c) 2002-2016, International Business Machines
  9. * Corporation and others. All Rights Reserved.
  10. **********************************************************************
  11. */
  12. #ifndef RBBITBLB_H
  13. #define RBBITBLB_H
  14. #include "unicode/utypes.h"
  15. #if !UCONFIG_NO_BREAK_ITERATION
  16. #include "unicode/uobject.h"
  17. #include "unicode/rbbi.h"
  18. #include "rbbidata.h"
  19. #include "rbbirb.h"
  20. #include "rbbinode.h"
  21. U_NAMESPACE_BEGIN
  22. class RBBIRuleScanner;
  23. class RBBIRuleBuilder;
  24. class UVector32;
  25. //
  26. // class RBBITableBuilder is part of the RBBI rule compiler.
  27. // It builds the state transition table used by the RBBI runtime
  28. // from the expression syntax tree generated by the rule scanner.
  29. //
  30. // This class is part of the RBBI implementation only.
  31. // There is no user-visible public API here.
  32. //
  33. class RBBITableBuilder : public UMemory {
  34. public:
  35. RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
  36. ~RBBITableBuilder();
  37. void buildForwardTable();
  38. /** Return the runtime size in bytes of the built state table. */
  39. int32_t getTableSize() const;
  40. /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
  41. */
  42. void exportTable(void *where);
  43. /** Use 8 bits to encode the forward table */
  44. bool use8BitsForTable() const;
  45. /**
  46. * Find duplicate (redundant) character classes. Begin looking with categories.first.
  47. * Duplicate, if found are returned in the categories parameter.
  48. * This is an iterator-like function, used to identify character classes
  49. * (state table columns) that can be eliminated.
  50. * @param categories in/out parameter, specifies where to start looking for duplicates,
  51. * and returns the first pair of duplicates found, if any.
  52. * @return true if duplicate char classes were found, false otherwise.
  53. */
  54. bool findDuplCharClassFrom(IntPair *categories);
  55. /** Remove a column from the state table. Used when two character categories
  56. * have been found equivalent, and merged together, to eliminate the unneeded table column.
  57. */
  58. void removeColumn(int32_t column);
  59. /**
  60. * Check for, and remove duplicate states (table rows).
  61. * @return the number of states removed.
  62. */
  63. int32_t removeDuplicateStates();
  64. /** Build the safe reverse table from the already-constructed forward table. */
  65. void buildSafeReverseTable(UErrorCode &status);
  66. /** Return the runtime size in bytes of the built safe reverse state table. */
  67. int32_t getSafeTableSize() const;
  68. /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
  69. */
  70. void exportSafeTable(void *where);
  71. /** Use 8 bits to encode the safe reverse table */
  72. bool use8BitsForSafeTable() const;
  73. private:
  74. void calcNullable(RBBINode *n);
  75. void calcFirstPos(RBBINode *n);
  76. void calcLastPos(RBBINode *n);
  77. void calcFollowPos(RBBINode *n);
  78. void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
  79. void bofFixup();
  80. void buildStateTable();
  81. void mapLookAheadRules();
  82. void flagAcceptingStates();
  83. void flagLookAheadStates();
  84. void flagTaggedStates();
  85. void mergeRuleStatusVals();
  86. /**
  87. * Merge redundant state table columns, eliminating character classes with identical behavior.
  88. * Done after the state tables are generated, just before converting to their run-time format.
  89. */
  90. int32_t mergeColumns();
  91. void addRuleRootNodes(UVector *dest, RBBINode *node);
  92. /**
  93. * Find duplicate (redundant) states, beginning at the specified pair,
  94. * within this state table. This is an iterator-like function, used to
  95. * identify states (state table rows) that can be eliminated.
  96. * @param states in/out parameter, specifies where to start looking for duplicates,
  97. * and returns the first pair of duplicates found, if any.
  98. * @return true if duplicate states were found, false otherwise.
  99. */
  100. bool findDuplicateState(IntPair *states);
  101. /** Remove a duplicate state.
  102. * @param duplStates The duplicate states. The first is kept, the second is removed.
  103. * All references to the second in the state table are retargeted
  104. * to the first.
  105. */
  106. void removeState(IntPair duplStates);
  107. /** Find the next duplicate state in the safe reverse table. An iterator function.
  108. * @param states in/out parameter, specifies where to start looking for duplicates,
  109. * and returns the first pair of duplicates found, if any.
  110. * @return true if a duplicate pair of states was found.
  111. */
  112. bool findDuplicateSafeState(IntPair *states);
  113. /** Remove a duplicate state from the safe table.
  114. * @param duplStates The duplicate states. The first is kept, the second is removed.
  115. * All references to the second in the state table are retargeted
  116. * to the first.
  117. */
  118. void removeSafeState(IntPair duplStates);
  119. // Set functions for UVector.
  120. // TODO: make a USet subclass of UVector
  121. void setAdd(UVector *dest, UVector *source);
  122. UBool setEquals(UVector *a, UVector *b);
  123. void sortedAdd(UVector **dest, int32_t val);
  124. public:
  125. #ifdef RBBI_DEBUG
  126. void printSet(UVector *s);
  127. void printPosSets(RBBINode *n /* = nullptr */);
  128. void printStates();
  129. void printRuleStatusTable();
  130. void printReverseTable();
  131. #else
  132. #define printSet(s)
  133. #define printPosSets(n)
  134. #define printStates()
  135. #define printRuleStatusTable()
  136. #define printReverseTable()
  137. #endif
  138. private:
  139. RBBIRuleBuilder *fRB;
  140. RBBINode *&fTree; // The root node of the parse tree to build a
  141. // table for.
  142. UErrorCode *fStatus;
  143. /** State Descriptors, UVector<RBBIStateDescriptor> */
  144. UVector *fDStates; // D states (Aho's terminology)
  145. // Index is state number
  146. // Contents are RBBIStateDescriptor pointers.
  147. /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
  148. UVector *fSafeTable;
  149. /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
  150. UVector32 *fLookAheadRuleMap = nullptr;
  151. /* Counter used when assigning lookahead rule numbers.
  152. * Contains the last look-ahead number already in use.
  153. * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
  154. * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */
  155. int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL;
  156. RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class
  157. RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class
  158. };
  159. //
  160. // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
  161. // one for each state.
  162. class RBBIStateDescriptor : public UMemory {
  163. public:
  164. UBool fMarked;
  165. uint32_t fAccepting;
  166. uint32_t fLookAhead;
  167. UVector *fTagVals;
  168. int32_t fTagsIdx;
  169. UVector *fPositions; // Set of parse tree positions associated
  170. // with this state. Unordered (it's a set).
  171. // UVector contents are RBBINode *
  172. UVector32 *fDtran; // Transitions out of this state.
  173. // indexed by input character
  174. // contents is int index of dest state
  175. // in RBBITableBuilder.fDStates
  176. RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
  177. ~RBBIStateDescriptor();
  178. private:
  179. RBBIStateDescriptor(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
  180. RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other) = delete; // forbid copying of this class
  181. };
  182. U_NAMESPACE_END
  183. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  184. #endif