rbbidata.h 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2014 International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: rbbidata.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * RBBI data formats Includes
  16. *
  17. * Structs that describe the format of the Binary RBBI data,
  18. * as it is stored in ICU's data file.
  19. *
  20. * RBBIDataWrapper - Instances of this class sit between the
  21. * raw data structs and the RulesBasedBreakIterator objects
  22. * that are created by applications. The wrapper class
  23. * provides reference counting for the underlying data,
  24. * and direct pointers to data that would not otherwise
  25. * be accessible without ugly pointer arithmetic. The
  26. * wrapper does not attempt to provide any higher level
  27. * abstractions for the data itself.
  28. *
  29. * There will be only one instance of RBBIDataWrapper for any
  30. * set of RBBI run time data being shared by instances
  31. * (clones) of RulesBasedBreakIterator.
  32. */
  33. #ifndef __RBBIDATA_H__
  34. #define __RBBIDATA_H__
  35. #include "unicode/utypes.h"
  36. #include "unicode/udata.h"
  37. #include "udataswp.h"
  38. /**
  39. * Swap RBBI data. See udataswp.h.
  40. * @internal
  41. */
  42. U_CAPI int32_t U_EXPORT2
  43. ubrk_swap(const UDataSwapper *ds,
  44. const void *inData, int32_t length, void *outData,
  45. UErrorCode *pErrorCode);
  46. #ifdef __cplusplus
  47. #include "unicode/ucptrie.h"
  48. #include "unicode/uobject.h"
  49. #include "unicode/unistr.h"
  50. #include "unicode/uversion.h"
  51. #include "umutex.h"
  52. U_NAMESPACE_BEGIN
  // The current RBBI data format version, stored as four bytes.
  // NOTE(review): presumably matched against RBBIDataHeader::fFormatVersion
  // by RBBIDataWrapper::isDataVersionAcceptable() -- confirm in rbbidata.cpp.
  static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
  55. /*
  56. * The following structs map exactly onto the raw data from ICU common data file.
  57. */
  58. struct RBBIDataHeader {
  59. uint32_t fMagic; /* == 0xbla0 */
  60. UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */
  61. /* if there is one associated with this data. */
  62. /* (version originates in rbbi, is copied to UDataInfo) */
  63. uint32_t fLength; /* Total length in bytes of this RBBI Data, */
  64. /* including all sections, not just the header. */
  65. uint32_t fCatCount; /* Number of character categories. */
  66. /* */
  67. /* Offsets and sizes of each of the subsections within the RBBI data. */
  68. /* All offsets are bytes from the start of the RBBIDataHeader. */
  69. /* All sizes are in bytes. */
  70. /* */
  71. uint32_t fFTable; /* forward state transition table. */
  72. uint32_t fFTableLen;
  73. uint32_t fRTable; /* Offset to the reverse state transition table. */
  74. uint32_t fRTableLen;
  75. uint32_t fTrie; /* Offset to Trie data for character categories */
  76. uint32_t fTrieLen;
  77. uint32_t fRuleSource; /* Offset to the source for for the break */
  78. uint32_t fRuleSourceLen; /* rules. Stored char16_t *. */
  79. uint32_t fStatusTable; /* Offset to the table of rule status values */
  80. uint32_t fStatusTableLen;
  81. uint32_t fReserved[6]; /* Reserved for expansion */
  82. };
  // One row of a state transition table, parameterized on the integer type T
  // used for every field of the row (see the RBBIStateTableRow8 / 16 typedefs).
  // The struct overlays stored table data, so field order and types must be
  // left exactly as they are.
  template <typename T>
  struct RBBIStateTableRowT {
      T               fAccepting;     //  Non-zero if this row is for an accepting state.
                                      //  Value 0: not an accepting state.
                                      //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
                                      //       >1: Look-ahead match has completed.
                                      //           Actual boundary position happened earlier.
                                      //           Value here == fLookAhead in earlier
                                      //           state, at actual boundary pos.
      T               fLookAhead;     //  Non-zero if this row is for a state that
                                      //    corresponds to a '/' in the rule source.
                                      //    Value is the same as the fAccepting
                                      //    value for the rule (which will appear
                                      //    in a different state).
      T               fTagsIdx;       //  Non-zero if this row covers a {tagged} position
                                      //    from a rule.  Value is the index in the
                                      //    StatusTable of the set of matching
                                      //    tags (rule status values)
      T               fNextState[1];  //  Next State, indexed by char category.
                                      //    Variable-length array declared with length 1
                                      //    to disable bounds checkers.
                                      //    Array Size is actually fData->fHeader->fCatCount
                                      //    CAUTION:  see RBBITableBuilder::getTableSize()
                                      //              before changing anything here.
  };
  108. typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
  109. typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
  110. constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting
  111. union RBBIStateTableRow {
  112. RBBIStateTableRow16 r16;
  113. RBBIStateTableRow8 r8;
  114. };
  // Header of a state transition table, immediately followed by the rows.
  // Rows are variable length (fRowLen bytes each), which is why the row data
  // is exposed as a char array rather than as an array of row structs.
  struct RBBIStateTable {
      uint32_t         fNumStates;            // Number of states.
      uint32_t         fRowLen;               // Length of a state table row, in bytes.
      uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary
                                              //   char class, or the largest category number + 1
                                              //   if there are no dictionary categories.
      uint32_t         fLookAheadResultsSize; // Size of run-time array required for holding
                                              //   look-ahead results. Indexed by row.fLookAhead.
      uint32_t         fFlags;                // Option Flags for this state table.
      char             fTableData[1];         // First RBBIStateTableRow begins here.
                                              //   Variable-length array declared with length 1
                                              //   to disable bounds checkers.
                                              //   (making it char[] simplifies ugly address
                                              //   arithmetic for indexing variable length rows.)
  };
  // Distinct single-bit values.
  // NOTE(review): presumably the option bits stored in RBBIStateTable::fFlags
  // -- confirm against the code that reads fFlags.
  constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
  constexpr uint32_t RBBI_BOF_REQUIRED = 2;
  constexpr uint32_t RBBI_8BITS_ROWS = 4;
  133. /* */
  134. /* The reference counting wrapper class */
  135. /* */
  136. class RBBIDataWrapper : public UMemory {
  137. public:
  138. enum EDontAdopt {
  139. kDontAdopt
  140. };
  141. RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
  142. RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
  143. RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
  144. ~RBBIDataWrapper();
  145. static UBool isDataVersionAcceptable(const UVersionInfo version);
  146. void init0();
  147. void init(const RBBIDataHeader *data, UErrorCode &status);
  148. RBBIDataWrapper *addReference();
  149. void removeReference();
  150. bool operator ==(const RBBIDataWrapper &other) const;
  151. int32_t hashCode();
  152. const UnicodeString &getRuleSourceString() const;
  153. void printData();
  154. void printTable(const char *heading, const RBBIStateTable *table);
  155. /* */
  156. /* Pointers to items within the data */
  157. /* */
  158. const RBBIDataHeader *fHeader;
  159. const RBBIStateTable *fForwardTable;
  160. const RBBIStateTable *fReverseTable;
  161. const char *fRuleSource;
  162. const int32_t *fRuleStatusTable;
  163. /* number of int32_t values in the rule status table. Used to sanity check indexing */
  164. int32_t fStatusMaxIdx;
  165. UCPTrie *fTrie;
  166. private:
  167. u_atomic_int32_t fRefCount;
  168. UDataMemory *fUDataMem;
  169. UnicodeString fRuleString;
  170. UBool fDontFreeData;
  171. RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
  172. RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
  173. };
  174. U_NAMESPACE_END
  175. U_CFUNC UBool rbbi_cleanup();
  176. #endif /* C++ */
  177. #endif