ucnv_ext.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2003-2013, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnv_ext.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2003jun13
  16. * created by: Markus W. Scherer
  17. *
  18. * Conversion extensions
  19. */
  20. #ifndef __UCNV_EXT_H__
  21. #define __UCNV_EXT_H__
  22. #include "unicode/utypes.h"
  23. #if !UCONFIG_NO_CONVERSION
  24. #include "unicode/ucnv.h"
  25. #include "ucnv_cnv.h"
  26. /*
  27. * See icuhtml/design/conversion/conversion_extensions.html
  28. *
  29. * Conversion extensions serve three purposes:
  30. * 1. They support m:n mappings.
  31. * 2. They support extension-only conversion files that are used together
  32. * with the regular conversion data in base files.
  33. * 3. They support mappings with more complicated meta data,
  34. * for example "good one-way" mappings (|4).
  35. *
  36. * A base file may contain an extension table (explicitly requested or
  37. * implicitly generated for m:n mappings), but its extension table is not
  38. * used when an extension-only file is used.
  39. *
  40. * It is an error if a base file contains any regular (not extension) mapping
  41. * from the same sequence as a mapping in the extension file
  42. * because the base mapping would hide the extension mapping.
  43. *
  44. *
  45. * Data for conversion extensions:
  46. *
  47. * One set of data structures per conversion direction (to/from Unicode).
  48. * The data structures are sorted by input units to allow for binary search.
  49. * Input sequences of more than one unit are handled like contraction tables
  50. * in collation:
  51. * The lookup value of a unit points to another table that is to be searched
  52. * for the next unit, recursively.
  53. *
  54. * For conversion from Unicode, the initial code point is looked up in
  55. * a 3-stage trie for speed,
  56. * with an additional table of unique results to save space.
  57. *
  58. * Long output strings are stored in separate arrays, with length and index
  59. * in the lookup tables.
  60. * Output results also include a flag distinguishing roundtrip from
  61. * (reverse) fallback mappings.
  62. *
  63. * Input Unicode strings must not begin or end with unpaired surrogates
  64. * to avoid problems with matches on parts of surrogate pairs.
  65. *
  66. * Mappings from multiple characters (code points or codepage state
  67. * table sequences) must be searched preferring the longest match.
  68. * For this to work and be efficient, the variable-width table must contain
  69. * all mappings that contain prefixes of the multiple characters.
  70. * If an extension table is built on top of a base table in another file
  71. * and a base table entry is a prefix of a multi-character mapping, then
  72. * this is an error.
  73. *
  74. *
  75. * Implementation note:
  76. *
  77. * Currently, the parser and several checks in the code limit the number
  78. * of UChars or bytes in a mapping to
  79. * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
  80. * which are output value limits in the data structure.
  81. *
  82. * For input, this is not strictly necessary - it is a hard limit only for the
  83. * buffers in UConverter that are used to store partial matches.
  84. *
  85. * Input sequences could otherwise be arbitrarily long if partial matches
  86. * need not be stored (i.e., if a sequence does not span several buffers with too
  87. * many units before the last buffer), although then results would differ
  88. * depending on whether partial matches exceed the limits or not,
  89. * which depends on the pattern of buffer sizes.
  90. *
  91. *
  92. * Data structure:
  93. *
  94. * int32_t indexes[>=32];
  95. *
  96. * Array of indexes and lengths etc. The length of the array is at least 32.
  97. * The actual length is stored in indexes[0] to be forward compatible.
  98. *
  99. * Each index to another array is the number of bytes from indexes[].
  100. * Each length of an array is the number of array base units in that array.
  101. *
  102. * Some of the structures may not be present, in which case their indexes
  103. * and lengths are 0.
  104. *
  105. * Usage of indexes[i]:
  106. * [0] length of indexes[]
  107. *
  108. * // to Unicode table
  109. * [1] index of toUTable[] (array of uint32_t)
  110. * [2] length of toUTable[]
  111. * [3] index of toUUChars[] (array of UChar)
  112. * [4] length of toUUChars[]
  113. *
  114. * // from Unicode table, not for the initial code point
  115. * [5] index of fromUTableUChars[] (array of UChar)
  116. * [6] index of fromUTableValues[] (array of uint32_t)
  117. * [7] length of fromUTableUChars[] and fromUTableValues[]
  118. * [8] index of fromUBytes[] (array of char)
  119. * [9] length of fromUBytes[]
  120. *
  121. * // from Unicode trie for initial-code point lookup
  122. * [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
  123. * [11] length of stage 1 portion of fromUStage12[]
  124. * [12] length of fromUStage12[]
  125. * [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
  126. * [14] length of fromUStage3[]
  127. * [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
  128. * [16] length of fromUStage3b[]
  129. *
  130. * [17] Bit field containing numbers of bytes:
  131. * 31..24 reserved, 0
  132. * 23..16 maximum input bytes
  133. * 15.. 8 maximum output bytes
  134. * 7.. 0 maximum bytes per UChar
  135. *
  136. * [18] Bit field containing numbers of UChars:
  137. * 31..24 reserved, 0
  138. * 23..16 maximum input UChars
  139. * 15.. 8 maximum output UChars
  140. * 7.. 0 maximum UChars per byte
  141. *
  142. * [19] Bit field containing flags:
  143. * (extension table unicodeMask)
  144. * 1 UCNV_HAS_SURROGATES flag for the extension table
  145. * 0 UCNV_HAS_SUPPLEMENTARY flag for the extension table
  146. *
  147. * [20]..[30] reserved, 0
  148. * [31] number of bytes for the entire extension structure
  149. * [>31] reserved; there are indexes[0] indexes
  150. *
  151. *
  152. * uint32_t toUTable[];
  153. *
  154. * Array of byte/value pairs for lookups for toUnicode conversion.
  155. * The array is partitioned into sections like collation contraction tables.
  156. * Each section contains one word with the number of following words and
  157. * a default value for when the lookup in this section yields no match.
  158. *
  159. * A section is sorted in ascending order of input bytes,
  160. * allowing for fast linear or binary searches.
  161. * The builder may store entries for a contiguous range of byte values
  162. * (compare difference between the first and last one with count),
  163. * which then allows for direct array access.
  164. * The builder should always do this for the initial table section.
  165. *
  166. * Entries may have 0 values, see below.
  167. * No two entries in a section have the same byte values.
  168. *
  169. * Each uint32_t contains an input byte value in bits 31..24 and the
  170. * corresponding lookup value in bits 23..0.
  171. * Interpret the value as follows:
  172. * if(value==0) {
  173. * no match, see below
  174. * } else if(value<0x1f0000) {
  175. * partial match - use value as index to the next toUTable section
  176. * and match the next unit; (value indexes toUTable[value])
  177. * } else {
  178. * if(bit 23 set) {
  179. * roundtrip;
  180. * } else {
  181. * fallback;
  182. * }
  183. * unset value bit 23;
  184. * if(value<=0x2fffff) {
  185. * (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
  186. * } else {
  187. * bits 17..0 (value&0x3ffff) is an index to
  188. * the result UChars in toUUChars[]; (0 indexes toUUChars[0])
  189. * length of the result=((value>>18)-12); (length=0..19)
  190. * }
  191. * }
  192. *
  193. * The first word in a section contains the number of following words in the
  194. * input byte position (bits 31..24, number=1..0xff).
  195. * The value of the initial word is used when the current byte is not found
  196. * in this section.
  197. * If the value is not 0, then it represents a result as above.
  198. * If the value is 0, then the search has to return a shorter match with an
  199. * earlier default value as the result, or result in "unmappable" even for the
  200. * initial bytes.
  201. * If the value is 0 for the initial toUTable entry, then the initial byte
  202. * does not start any mapping input.
  203. *
  204. *
  205. * UChar toUUChars[];
  206. *
  207. * Contains toUnicode mapping results, stored as sequences of UChars.
  208. * Indexes and lengths stored in the toUTable[].
  209. *
  210. *
  211. * UChar fromUTableUChars[];
  212. * uint32_t fromUTableValues[];
  213. *
  214. * The fromUTable is split into two arrays, but works otherwise much like
  215. * the toUTable. The array is partitioned into sections like collation
  216. * contraction tables and toUTable.
  217. * A row in the table consists of same-index entries in fromUTableUChars[]
  218. * and fromUTableValues[].
  219. *
  220. * Interpret a value as follows:
  221. * if(value==0) {
  222. * no match, see below
  223. * } else if(value<=0xffffff) { (bits 31..24 are 0)
  224. * partial match - use value as index to the next fromUTable section
  225. * and match the next unit; (value indexes fromUTable[value])
  226. * } else {
  227. * if(value==0x80000001) {
  228. * return no mapping, but request for <subchar1>;
  229. * }
  230. * if(bit 31 set) {
  231. * roundtrip (|0);
  232. * } else if(bit 30 set) {
  233. * "good one-way" mapping (|4); -- new in ICU4C 51, _MBCSHeader.version 5.4/4.4
  234. * } else {
  235. * normal fallback (|1);
  236. * }
  237. * // bit 29 reserved, 0
  238. * length=(value>>24)&0x1f; (bits 28..24)
  239. * if(length==1..3) {
  240. * bits 23..0 contain 1..3 bytes, padded with 00s on the left;
  241. * } else {
  242. * bits 23..0 (value&0xffffff) is an index to
  243. * the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
  244. * }
  245. * }
  246. *
  247. * The first pair in a section contains the number of following pairs in the
  248. * UChar position (16 bits, number=1..0xffff).
  249. * The value of the initial pair is used when the current UChar is not found
  250. * in this section.
  251. * If the value is not 0, then it represents a result as above.
  252. * If the value is 0, then the search has to return a shorter match with an
  253. * earlier default value as the result, or result in "unmappable" even for the
  254. * initial UChars.
  255. *
  256. * If the from Unicode trie is present, then the from Unicode search tables
  257. * are not used for initial code points.
  258. * In this case, the first entries (index 0) in the tables are not used
  259. * (reserved, set to 0) because a value of 0 is used in trie results
  260. * to indicate no mapping.
  261. *
  262. *
  263. * uint16_t fromUStage12[];
  264. *
  265. * Stages 1 & 2 of a trie that maps an initial code point.
  266. * Indexes in stage 1 are all offset by the length of stage 1 so that the
  267. * same array pointer can be used for both stages.
  268. * If (c>>10)>=(length of stage 1) then c does not start any mapping.
  269. * Same bit distribution as for regular conversion tries.
  270. *
  271. *
  272. * uint16_t fromUStage3[];
  273. * uint32_t fromUStage3b[];
  274. *
  275. * Stage 3 of the trie. The first array simply contains indexes to the second,
  276. * which contains words in the same format as fromUTableValues[].
  277. * Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
  278. * and 16-bit entries in stage 3 allow for 64k stage 3b entries.
  279. * The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
  280. *
  281. * Two arrays are used because it is expected that more than half of the stage 3
  282. * entries will be zero. The 16-bit index stage 3 array saves space even
  283. * considering storing a total of 6 bytes per non-zero entry in both arrays
  284. * together.
  285. * Using a stage 3 granularity of >1 diminishes the compactability in that stage
  286. * but provides a larger effective addressing space in stage 2.
  287. * All but the final result stage use 16-bit entries to save space.
  288. *
  289. * fromUStage3b[] contains a zero for "no mapping" at its index 0,
  290. * and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
  291. * (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
  292. * and all other items are unique non-zero results.
  293. *
  294. * The default value of a fromUTableValues[] section that is referenced
  295. * _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1,
  296. * but this value must not occur anywhere else in fromUTableValues[]
  297. * because "no mapping" is always a property of a single code point,
  298. * never of multiple.
  299. *
  300. *
  301. * char fromUBytes[];
  302. *
  303. * Contains fromUnicode mapping results, stored as sequences of chars.
  304. * Indexes and lengths stored in the fromUTableValues[].
  305. */
  306. enum {
  307. UCNV_EXT_INDEXES_LENGTH, /* 0 */
  308. UCNV_EXT_TO_U_INDEX, /* 1 */
  309. UCNV_EXT_TO_U_LENGTH,
  310. UCNV_EXT_TO_U_UCHARS_INDEX,
  311. UCNV_EXT_TO_U_UCHARS_LENGTH,
  312. UCNV_EXT_FROM_U_UCHARS_INDEX, /* 5 */
  313. UCNV_EXT_FROM_U_VALUES_INDEX,
  314. UCNV_EXT_FROM_U_LENGTH,
  315. UCNV_EXT_FROM_U_BYTES_INDEX,
  316. UCNV_EXT_FROM_U_BYTES_LENGTH,
  317. UCNV_EXT_FROM_U_STAGE_12_INDEX, /* 10 */
  318. UCNV_EXT_FROM_U_STAGE_1_LENGTH,
  319. UCNV_EXT_FROM_U_STAGE_12_LENGTH,
  320. UCNV_EXT_FROM_U_STAGE_3_INDEX,
  321. UCNV_EXT_FROM_U_STAGE_3_LENGTH,
  322. UCNV_EXT_FROM_U_STAGE_3B_INDEX,
  323. UCNV_EXT_FROM_U_STAGE_3B_LENGTH,
  324. UCNV_EXT_COUNT_BYTES, /* 17 */
  325. UCNV_EXT_COUNT_UCHARS,
  326. UCNV_EXT_FLAGS,
  327. UCNV_EXT_RESERVED_INDEX, /* 20, moves with additional indexes */
  328. UCNV_EXT_SIZE=31,
  329. UCNV_EXT_INDEXES_MIN_LENGTH=32
  330. };
  331. /* get the pointer to an extension array from indexes[index] */
  332. #define UCNV_EXT_ARRAY(indexes, index, itemType) \
  333. ((const itemType *)((const char *)(indexes)+(indexes)[index]))
  334. #define UCNV_GET_MAX_BYTES_PER_UCHAR(indexes) \
  335. ((indexes)[UCNV_EXT_COUNT_BYTES]&0xff)
  336. /* internal API ------------------------------------------------------------- */
  337. U_CFUNC UBool
  338. ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
  339. int32_t firstLength,
  340. const char **src, const char *srcLimit,
  341. UChar **target, const UChar *targetLimit,
  342. int32_t **offsets, int32_t srcIndex,
  343. UBool flush,
  344. UErrorCode *pErrorCode);
  345. U_CFUNC UChar32
  346. ucnv_extSimpleMatchToU(const int32_t *cx,
  347. const char *source, int32_t length,
  348. UBool useFallback);
  349. U_CFUNC void
  350. ucnv_extContinueMatchToU(UConverter *cnv,
  351. UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
  352. UErrorCode *pErrorCode);
  353. U_CFUNC UBool
  354. ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
  355. UChar32 cp,
  356. const UChar **src, const UChar *srcLimit,
  357. char **target, const char *targetLimit,
  358. int32_t **offsets, int32_t srcIndex,
  359. UBool flush,
  360. UErrorCode *pErrorCode);
  361. U_CFUNC int32_t
  362. ucnv_extSimpleMatchFromU(const int32_t *cx,
  363. UChar32 cp, uint32_t *pValue,
  364. UBool useFallback);
  365. U_CFUNC void
  366. ucnv_extContinueMatchFromU(UConverter *cnv,
  367. UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
  368. UErrorCode *pErrorCode);
  369. /*
  370. * Add code points and strings to the set according to the extension mappings.
  371. * Limitation on the UConverterSetFilter:
  372. * The filters currently assume that they are used with 1:1 mappings.
  373. * They only apply to single input code points, and then they pass through
  374. * only mappings with single-charset-code results.
  375. * For example, the Shift-JIS filter only works for 2-byte results and tests
  376. * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
  377. */
  378. U_CFUNC void
  379. ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
  380. const USetAdder *sa,
  381. UConverterUnicodeSet which,
  382. UConverterSetFilter filter,
  383. UErrorCode *pErrorCode);
  384. /* toUnicode helpers -------------------------------------------------------- */
  385. #define UCNV_EXT_TO_U_BYTE_SHIFT 24
  386. #define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
  387. #define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
  388. #define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
  389. #define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
  390. #define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
  391. #define UCNV_EXT_TO_U_LENGTH_SHIFT 18
  392. #define UCNV_EXT_TO_U_LENGTH_OFFSET 12
  393. /* maximum number of indexed UChars */
  394. #define UCNV_EXT_MAX_UCHARS 19
  395. #define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))
  396. #define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
  397. #define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)
  398. #define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
  399. #define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)
  400. #define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
  401. #define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)
  402. /* use after masking off the roundtrip flag */
  403. #define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
  404. #define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)
  405. #define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
  406. #define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)
  407. /* fromUnicode helpers ------------------------------------------------------ */
  408. /* most trie constants are shared with ucnvmbcs.h */
  409. /* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
  410. #define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
  411. #define UCNV_EXT_STAGE_3_GRANULARITY 4
  412. /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
  413. #define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
  414. (stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]
  415. #define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
  416. #define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
  417. #define UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG 0x40000000
  418. #define UCNV_EXT_FROM_U_STATUS_MASK 0xc0000000
  419. #define UCNV_EXT_FROM_U_RESERVED_MASK 0x20000000
  420. #define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
  421. /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
  422. #define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001
  423. /* at most 3 bytes in the lower part of the value */
  424. #define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3
  425. /* maximum number of indexed bytes */
  426. #define UCNV_EXT_MAX_BYTES 0x1f
  427. #define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
  428. #define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)
  429. #define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
  430. #define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
  431. /* get length; masks away all other bits */
  432. #define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
  433. /* get bytes or bytes index */
  434. #define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)
  435. #endif
  436. #endif