rbbisetb.h 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbisetb.h
  5. /*
  6. **********************************************************************
  7. * Copyright (c) 2001-2005, International Business Machines
  8. * Corporation and others. All Rights Reserved.
  9. **********************************************************************
  10. */
  11. #ifndef RBBISETB_H
  12. #define RBBISETB_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_BREAK_ITERATION
  15. #include "unicode/ucptrie.h"
  16. #include "unicode/umutablecptrie.h"
  17. #include "unicode/uobject.h"
  18. #include "rbbirb.h"
  19. #include "uvector.h"
  20. U_NAMESPACE_BEGIN
  21. //
  22. // RBBISetBuilder Derives the character categories used by the runtime RBBI engine
  23. // from the Unicode Sets appearing in the source RBBI rules, and
  24. // creates the TRIE table used to map from Unicode to the
  25. // character categories.
  26. //
  27. //
  28. // RangeDescriptor
  29. //
  30. // Each of the non-overlapping character ranges gets one of these descriptors.
  31. // All of them are strung together in a linked list, which is kept in order
  32. // (by character)
  33. //
  34. class RangeDescriptor : public UMemory { // One node in the ordered linked list of non-overlapping character ranges.
  35. public:
  36. UChar32 fStartChar {}; // Start of range, Unicode 32-bit value.
  37. UChar32 fEndChar {}; // End of range, Unicode 32-bit value.
  38. int32_t fNum {0}; // Runtime-mapped input value for this range.
  39. bool fIncludesDict {false}; // True if the range includes $dictionary.
  40. bool fFirstInGroup {false}; // True if this is the first range in a group with the same fNum.
  41. UVector *fIncludesSets {nullptr}; // Vector of the original
  42. // Unicode sets that include this range.
  43. // (Contains ptrs to uset nodes.)
  44. RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list; nullptr by default.
  45. RangeDescriptor(UErrorCode &status); // Construct a descriptor. NOTE(review): status is presumably set on failure - confirm in the .cpp.
  46. RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); // Construct from another descriptor; takes a status argument, unlike the deleted plain copy constructor below.
  47. ~RangeDescriptor(); // NOTE(review): ownership of fIncludesSets is not visible in this header - confirm in the .cpp.
  48. void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
  49. // "where" appearing in the second (higher) part.
  50. bool isDictionaryRange(); // Check whether this range appears as part of
  51. // the Unicode set named "dictionary".
  52. RangeDescriptor(const RangeDescriptor &other) = delete; // Forbid default copying of this class; only the (other, status) constructor is declared.
  53. RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // Forbid assigning of this class.
  54. };
  55. //
  56. // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
  57. //
  58. // Starting with the rules parse tree from the scanner,
  59. //
  60. // - Enumerate the set of UnicodeSets that are referenced
  61. // by the RBBI rules.
  62. // - Compute a derived set of non-overlapping UnicodeSets
  63. // that will correspond to columns in the state table for
  64. // the RBBI execution engine.
  65. // - Construct the trie table that maps input characters
  66. // to set numbers in the non-overlapping set of sets.
  67. //
  68. class RBBISetBuilder : public UMemory { // Processes the Unicode sets from RBBI rules into ranges, categories and a trie.
  69. public:
  70. RBBISetBuilder(RBBIRuleBuilder *rb); // rb: the RBBI rule compiler that owns this builder (see fRB).
  71. ~RBBISetBuilder();
  72. void buildRanges(); // NOTE(review): presumably builds the RangeDescriptor list headed by fRangeList - confirm in the .cpp.
  73. void buildTrie(); // NOTE(review): presumably builds the character-to-category trie (fMutableTrie / fTrie) - confirm in the .cpp.
  74. void addValToSets(UVector *sets, uint32_t val); // NOTE(review): presumably applies val to every set node in "sets" - confirm.
  75. void addValToSet (RBBINode *usetNode, uint32_t val); // NOTE(review): presumably records val on the single uset node - confirm.
  76. int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
  77. // runtime state machine, which are the same as
  78. // columns in the DFA state table.
  79. int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
  80. // last category + 1 if there are no dictionary categories.
  81. int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. Declared non-const (the qualifier is commented out).
  82. void serializeTrie(uint8_t *where); // Write out the serialized Trie to "where".
  83. UChar32 getFirstChar(int32_t val) const; // NOTE(review): presumably the first character mapped to category "val" - confirm.
  84. UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
  85. // character were encountered.
  86. /**
  87. * Merge two character categories that have been identified as having equivalent behavior.
  88. * The ranges belonging to the second category (table column) will be added to the first.
  89. * @param categories the pair of categories to be merged.
  90. */
  91. void mergeCategories(IntPair categories);
  92. #ifdef RBBI_DEBUG // Debug builds declare the print functions as real member functions.
  93. void printSets();
  94. void printRanges();
  95. void printRangeGroups();
  96. #else // Otherwise the same names are defined as empty function-like macros, so calls expand to nothing.
  97. #define printSets()
  98. #define printRanges()
  99. #define printRangeGroups()
  100. #endif
  101. private:
  102. RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
  103. UErrorCode *fStatus; // NOTE(review): presumably points at the owning rule builder's error status - confirm.
  104. RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors.
  105. UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing
  106. UCPTrie *fTrie; // the Unicode Sets.
  107. uint32_t fTrieSize; // NOTE(review): presumably the serialized trie size reported by getTrieSize() - confirm.
  108. // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
  109. int32_t fGroupCount;
  110. // The number of the first dictionary char category.
  111. // If there are no Dictionary categories, set to the last category + 1.
  112. int32_t fDictCategoriesStart;
  113. UBool fSawBOF; // NOTE(review): presumably the flag reported by sawBOF() - confirm.
  114. RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class
  115. RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid assigning of this class
  116. };
  117. U_NAMESPACE_END
  118. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  119. #endif