ucptrie_impl.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. // © 2017 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // ucptrie_impl.h (modified from utrie2_impl.h)
  4. // created: 2017dec29 Markus W. Scherer
  5. #ifndef __UCPTRIE_IMPL_H__
  6. #define __UCPTRIE_IMPL_H__
  7. #include "unicode/ucptrie.h"
  8. #ifdef UCPTRIE_DEBUG
  9. #include "unicode/umutablecptrie.h"
  10. #endif
  11. // UCPTrie signature values, in platform endianness and opposite endianness.
  12. // The UCPTrie signature ASCII byte values spell "Tri3".
  13. #define UCPTRIE_SIG 0x54726933
  14. #define UCPTRIE_OE_SIG 0x33697254
  15. /**
  16. * Header data for the binary, memory-mappable representation of a UCPTrie/CodePointTrie.
  17. * @internal
  18. */
  19. struct UCPTrieHeader {
  20. /** "Tri3" in big-endian US-ASCII (0x54726933) */
  21. uint32_t signature;
  22. /**
  23. * Options bit field:
  24. * Bits 15..12: Data length bits 19..16.
  25. * Bits 11..8: Data null block offset bits 19..16.
  26. * Bits 7..6: UCPTrieType
  27. * Bits 5..3: Reserved (0).
  28. * Bits 2..0: UCPTrieValueWidth
  29. */
  30. uint16_t options;
  31. /** Total length of the index tables. */
  32. uint16_t indexLength;
  33. /** Data length bits 15..0. */
  34. uint16_t dataLength;
  35. /** Index-3 null block offset, 0x7fff or 0xffff if none. */
  36. uint16_t index3NullOffset;
  37. /** Data null block offset bits 15..0, 0xfffff if none. */
  38. uint16_t dataNullOffset;
  39. /**
  40. * First code point of the single-value range ending with U+10ffff,
  41. * rounded up and then shifted right by UCPTRIE_SHIFT_2.
  42. */
  43. uint16_t shiftedHighStart;
  44. };
  45. // Constants for use with UCPTrieHeader.options.
  46. constexpr uint16_t UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000;
  47. constexpr uint16_t UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00;
  48. constexpr uint16_t UCPTRIE_OPTIONS_RESERVED_MASK = 0x38;
  49. constexpr uint16_t UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7;
  50. /**
  51. * Value for index3NullOffset which indicates that there is no index-3 null block.
  52. * Bit 15 is unused for this value because this bit is used if the index-3 contains
  53. * 18-bit indexes.
  54. */
  55. constexpr int32_t UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff;
  56. constexpr int32_t UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff;
  57. // Internal constants.
  58. /** The length of the BMP index table. 1024=0x400 */
  59. constexpr int32_t UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT;
  60. constexpr int32_t UCPTRIE_SMALL_LIMIT = 0x1000;
  61. constexpr int32_t UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT;
  62. /** Shift size for getting the index-3 table offset. */
  63. constexpr int32_t UCPTRIE_SHIFT_3 = 4;
  64. /** Shift size for getting the index-2 table offset. */
  65. constexpr int32_t UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3;
  66. /** Shift size for getting the index-1 table offset. */
  67. constexpr int32_t UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2;
  68. /**
  69. * Difference between two shift sizes,
  70. * for getting an index-2 offset from an index-3 offset. 5=9-4
  71. */
  72. constexpr int32_t UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3;
  73. /**
  74. * Difference between two shift sizes,
  75. * for getting an index-1 offset from an index-2 offset. 5=14-9
  76. */
  77. constexpr int32_t UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2;
  78. /**
  79. * Number of index-1 entries for the BMP. (4)
  80. * This part of the index-1 table is omitted from the serialized form.
  81. */
  82. constexpr int32_t UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1;
  83. /** Number of entries in an index-2 block. 32=0x20 */
  84. constexpr int32_t UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2;
  85. /** Mask for getting the lower bits for the in-index-2-block offset. */
  86. constexpr int32_t UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1;
  87. /** Number of code points per index-2 table entry. 512=0x200 */
  88. constexpr int32_t UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2;
  89. /** Number of entries in an index-3 block. 32=0x20 */
  90. constexpr int32_t UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3;
  91. /** Mask for getting the lower bits for the in-index-3-block offset. */
  92. constexpr int32_t UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1;
  93. /** Number of entries in a small data block. 16=0x10 */
  94. constexpr int32_t UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3;
  95. /** Mask for getting the lower bits for the in-small-data-block offset. */
  96. constexpr int32_t UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1;
  97. typedef UChar32
  98. UCPTrieGetRange(const void *trie, UChar32 start,
  99. UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  100. U_CFUNC UChar32
  101. ucptrie_internalGetRange(UCPTrieGetRange *getRange,
  102. const void *trie, UChar32 start,
  103. UCPMapRangeOption option, uint32_t surrogateValue,
  104. UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  105. #ifdef UCPTRIE_DEBUG
  106. U_CFUNC void
  107. ucptrie_printLengths(const UCPTrie *trie, const char *which);
  108. U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name);
  109. #endif
  110. /*
  111. * Format of the binary, memory-mappable representation of a UCPTrie/CodePointTrie.
  112. * For overview information see https://icu.unicode.org/design/struct/utrie
  113. *
  114. * The binary trie data should be 32-bit-aligned.
  115. * The overall layout is:
  116. *
  117. * UCPTrieHeader header; -- 16 bytes, see struct definition above
  118. * uint16_t index[header.indexLength];
  119. * uintXY_t data[header.dataLength];
  120. *
  121. * The trie data array is an array of uint16_t, uint32_t, or uint8_t,
  122. * specified via the UCPTrieValueWidth when building the trie.
  123. * The data array is 32-bit-aligned for uint32_t, otherwise 16-bit-aligned.
  124. * The overall length of the trie data is a multiple of 4 bytes.
  125. * (Padding is added at the end of the index array and/or near the end of the data array as needed.)
  126. *
  127. * The length of the data array (dataLength) is stored as an integer split across two fields
  128. * of the header struct (high bits in header.options).
  129. *
  130. * The trie type can be "fast" or "small" which determines the index structure,
  131. * specified via the UCPTrieType when building the trie.
  132. *
  133. * The type and valueWidth are stored in the header.options.
  134. * There are reserved type and valueWidth values, and reserved header.options bits.
  135. * They could be used in future format extensions.
  136. * Code reading the trie structure must fail with an error when unknown values or options are set.
  137. *
  138. * Values for ASCII character (U+0000..U+007F) can always be found at the start of the data array.
  139. *
  140. * Values for code points below a type-specific fast-indexing limit are found via two-stage lookup.
  141. * For a "fast" trie, the limit is the BMP/supplementary boundary at U+10000.
  142. * For a "small" trie, the limit is UCPTRIE_SMALL_MAX+1=U+1000.
  143. *
  144. * All code points in the range highStart..U+10FFFF map to a single highValue
  145. * which is stored at the second-to-last position of the data array.
  146. * (See UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET.)
  147. * The highStart value is header.shiftedHighStart<<UCPTRIE_SHIFT_2.
  148. * (UCPTRIE_SHIFT_2=9)
  149. *
  150. * Values for code points fast_limit..highStart-1 are found via four-stage lookup.
  151. * The data block size is smaller for this range than for the fast range.
  152. * This together with more index stages with small blocks makes this range
  153. * more easily compactable.
  154. *
  155. * There is also a trie error value stored at the last position of the data array.
  156. * (See UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.)
  157. * It is intended to be returned for inputs that are not Unicode code points
  158. * (outside U+0000..U+10FFFF), or in string processing for ill-formed input
  159. * (unpaired surrogate in UTF-16, ill-formed UTF-8 subsequence).
  160. *
  161. * For a "fast" trie:
  162. *
  163. * The index array starts with the BMP index table for BMP code point lookup.
  164. * Its length is 1024=0x400.
  165. *
  166. * The supplementary index-1 table follows the BMP index table.
  167. * Variable length, for code points up to highStart-1.
  168. * Maximum length 64=0x40=0x100000>>UCPTRIE_SHIFT_1.
  169. * (For 0x100000 supplementary code points U+10000..U+10ffff.)
  170. *
  171. * After this index-1 table follow the variable-length index-3 and index-2 tables.
  172. *
  173. * The supplementary index tables are omitted completely
  174. * if there is only BMP data (highStart<=U+10000).
  175. *
  176. * For a "small" trie:
  177. *
  178. * The index array starts with a fast-index table for lookup of code points U+0000..U+0FFF.
  179. *
  180. * The "supplementary" index tables are always stored.
  181. * The index-1 table starts from U+0000, its maximum length is 68=0x44=0x110000>>UCPTRIE_SHIFT_1.
  182. *
  183. * For both trie types:
  184. *
  185. * The last index-2 block may be a partial block, storing indexes only for code points
  186. * below highStart.
  187. *
  188. * Lookup for ASCII code point c:
  189. *
  190. * Linear access from the start of the data array.
  191. *
  192. * value = data[c];
  193. *
  194. * Lookup for fast-range code point c:
  195. *
  196. * Shift the code point right by UCPTRIE_FAST_SHIFT=6 bits,
  197. * fetch the index array value at that offset,
  198. * add the lower code point bits, index into the data array.
  199. *
  200. * value = data[index[c>>6] + (c&0x3f)];
  201. *
  202. * (This works for ASCII as well.)
  203. *
  204. * Lookup for small-range code point c below highStart:
  205. *
  206. * Split the code point into four bit fields using several sets of shifts & masks
  207. * to read consecutive values from the index-1, index-2, index-3 and data tables.
  208. *
  209. * If all of the data block offsets in an index-3 block fit within 16 bits (up to 0xffff),
  210. * then the data block offsets are stored directly as uint16_t.
  211. *
  212. * Otherwise (this is very unusual but possible), the index-2 entry for the index-3 block
  213. * has bit 15 (0x8000) set, and each set of 8 index-3 entries is preceded by
  214. * an additional uint16_t word. Data block offsets are 18 bits wide, with the top 2 bits stored
  215. * in the additional word.
  216. *
  217. * See ucptrie_internalSmallIndex() for details.
  218. *
  219. * (In a "small" trie, this works for ASCII and below-fast_limit code points as well.)
  220. *
  221. * Compaction:
  222. *
  223. * Multiple code point ranges ("blocks") that are aligned on certain boundaries
  224. * (determined by the shifting/bit fields of code points) and
  225. * map to the same data values normally share a single subsequence of the data array.
  226. * Data blocks can also overlap partially.
  227. * (Depending on the builder code finding duplicate and overlapping blocks.)
  228. *
  229. * Iteration over same-value ranges:
  230. *
  231. * Range iteration (ucptrie_getRange()) walks the structure from a start code point
  232. * until some code point is found that maps to a different value;
  233. * the end of the returned range is just before that.
  234. *
  235. * The header.dataNullOffset (split across two header fields, high bits in header.options)
  236. * is the offset of a widely shared data block filled with one single value.
  237. * It helps quickly skip over large ranges of data with that value.
  238. * The builder must ensure that if the start of any data block (fast or small)
  239. * matches the dataNullOffset, then the whole block must be filled with the null value.
  240. * Special care must be taken if there is no fast null data block
  241. * but a small one, which is shorter, and it matches the *start* of some fast data block.
  242. *
  243. * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block
  244. * where all index entries point to the dataNullOffset.
  245. * If there is no such data or index-3 block, then these offsets are set to
  246. * values that cannot be reached (data offset out of range/reserved index offset),
  247. * normally UCPTRIE_NO_DATA_NULL_OFFSET or UCPTRIE_NO_INDEX3_NULL_OFFSET respectively.
  248. */
  249. #endif