ucnv_ext.cpp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2003-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnv_ext.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2003jun13
  16. * created by: Markus W. Scherer
  17. *
  18. * Conversion extensions
  19. */
  20. #include "unicode/utypes.h"
  21. #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  22. #include "unicode/uset.h"
  23. #include "unicode/ustring.h"
  24. #include "ucnv_bld.h"
  25. #include "ucnv_cnv.h"
  26. #include "ucnv_ext.h"
  27. #include "cmemory.h"
  28. #include "uassert.h"
  29. /* to Unicode --------------------------------------------------------------- */
  30. /*
  31. * @return lookup value for the byte, if found; else 0
  32. */
  33. static inline uint32_t
  34. ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
  35. uint32_t word0, word;
  36. int32_t i, start, limit;
  37. /* check the input byte against the lowest and highest section bytes */
  38. start = static_cast<int32_t>(UCNV_EXT_TO_U_GET_BYTE(toUSection[0]));
  39. limit = static_cast<int32_t>(UCNV_EXT_TO_U_GET_BYTE(toUSection[length - 1]));
  40. if(byte<start || limit<byte) {
  41. return 0; /* the byte is out of range */
  42. }
  43. if(length==((limit-start)+1)) {
  44. /* direct access on a linear array */
  45. return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
  46. }
  47. /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
  48. word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
  49. /*
  50. * Shift byte once instead of each section word and add 0xffffff.
  51. * We will compare the shifted/added byte (bbffffff) against
  52. * section words which have byte values in the same bit position.
  53. * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
  54. * for all v=0..f
  55. * so we need not mask off the lower 24 bits of each section word.
  56. */
  57. word=word0|UCNV_EXT_TO_U_VALUE_MASK;
  58. /* binary search */
  59. start=0;
  60. limit=length;
  61. for(;;) {
  62. i=limit-start;
  63. if(i<=1) {
  64. break; /* done */
  65. }
  66. /* start<limit-1 */
  67. if(i<=4) {
  68. /* linear search for the last part */
  69. if(word0<=toUSection[start]) {
  70. break;
  71. }
  72. if(++start<limit && word0<=toUSection[start]) {
  73. break;
  74. }
  75. if(++start<limit && word0<=toUSection[start]) {
  76. break;
  77. }
  78. /* always break at start==limit-1 */
  79. ++start;
  80. break;
  81. }
  82. i=(start+limit)/2;
  83. if(word<toUSection[i]) {
  84. limit=i;
  85. } else {
  86. start=i;
  87. }
  88. }
  89. /* did we really find it? */
  90. if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
  91. return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
  92. } else {
  93. return 0; /* not found */
  94. }
  95. }
  96. /*
  97. * true if not an SI/SO stateful converter,
  98. * or if the match length fits with the current converter state
  99. */
  100. #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
  101. ((sisoState)<0 || ((sisoState)==0) == (match==1))
  102. /*
  103. * this works like ucnv_extMatchFromU() except
  104. * - the first character is in pre
  105. * - no trie is used
  106. * - the returned matchLength is not offset by 2
  107. */
  108. static int32_t
  109. ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
  110. const char *pre, int32_t preLength,
  111. const char *src, int32_t srcLength,
  112. uint32_t *pMatchValue,
  113. UBool /*useFallback*/, UBool flush) {
  114. const uint32_t *toUTable, *toUSection;
  115. uint32_t value, matchValue;
  116. int32_t i, j, idx, length, matchLength;
  117. uint8_t b;
  118. if(cx==nullptr || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
  119. return 0; /* no extension data, no match */
  120. }
  121. /* initialize */
  122. toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
  123. idx=0;
  124. matchValue=0;
  125. i=j=matchLength=0;
  126. if(sisoState==0) {
  127. /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
  128. if(preLength>1) {
  129. return 0; /* no match of a DBCS sequence in SBCS mode */
  130. } else if(preLength==1) {
  131. srcLength=0;
  132. } else /* preLength==0 */ {
  133. if(srcLength>1) {
  134. srcLength=1;
  135. }
  136. }
  137. flush=true;
  138. }
  139. /* we must not remember fallback matches when not using fallbacks */
  140. /* match input units until there is a full match or the input is consumed */
  141. for(;;) {
  142. /* go to the next section */
  143. toUSection=toUTable+idx;
  144. /* read first pair of the section */
  145. value=*toUSection++;
  146. length=UCNV_EXT_TO_U_GET_BYTE(value);
  147. value=UCNV_EXT_TO_U_GET_VALUE(value);
  148. if( value!=0 &&
  149. (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
  150. TO_U_USE_FALLBACK(useFallback)) &&
  151. UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
  152. ) {
  153. /* remember longest match so far */
  154. matchValue=value;
  155. matchLength=i+j;
  156. }
  157. /* match pre[] then src[] */
  158. if(i<preLength) {
  159. b = static_cast<uint8_t>(pre[i++]);
  160. } else if(j<srcLength) {
  161. b = static_cast<uint8_t>(src[j++]);
  162. } else {
  163. /* all input consumed, partial match */
  164. if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
  165. /*
  166. * end of the entire input stream, stop with the longest match so far
  167. * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
  168. * because it must fit into state buffers
  169. */
  170. break;
  171. } else {
  172. /* continue with more input next time */
  173. return -length;
  174. }
  175. }
  176. /* search for the current char16_t */
  177. value=ucnv_extFindToU(toUSection, length, b);
  178. if(value==0) {
  179. /* no match here, stop with the longest match so far */
  180. break;
  181. } else {
  182. if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
  183. /* partial match, continue */
  184. idx = static_cast<int32_t>(UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value));
  185. } else {
  186. if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
  187. TO_U_USE_FALLBACK(useFallback)) &&
  188. UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
  189. ) {
  190. /* full match, stop with result */
  191. matchValue=value;
  192. matchLength=i+j;
  193. } else {
  194. /* full match on fallback not taken, stop with the longest match so far */
  195. }
  196. break;
  197. }
  198. }
  199. }
  200. if(matchLength==0) {
  201. /* no match at all */
  202. return 0;
  203. }
  204. /* return result */
  205. *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
  206. return matchLength;
  207. }
  208. static inline void
  209. ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
  210. uint32_t value,
  211. char16_t **target, const char16_t *targetLimit,
  212. int32_t **offsets, int32_t srcIndex,
  213. UErrorCode *pErrorCode) {
  214. /* output the result */
  215. if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
  216. /* output a single code point */
  217. ucnv_toUWriteCodePoint(
  218. cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
  219. target, targetLimit,
  220. offsets, srcIndex,
  221. pErrorCode);
  222. } else {
  223. /* output a string - with correct data we have resultLength>0 */
  224. ucnv_toUWriteUChars(
  225. cnv,
  226. UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, char16_t)+
  227. UCNV_EXT_TO_U_GET_INDEX(value),
  228. UCNV_EXT_TO_U_GET_LENGTH(value),
  229. target, targetLimit,
  230. offsets, srcIndex,
  231. pErrorCode);
  232. }
  233. }
  234. /*
  235. * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
  236. * or 1 for DBCS-only,
  237. * or -1 if the converter is not SI/SO stateful
  238. *
  239. * Note: For SI/SO stateful converters getting here,
  240. * cnv->mode==0 is equivalent to firstLength==1.
  241. */
  242. #define UCNV_SISO_STATE(cnv) \
  243. ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
  244. (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
  245. /*
  246. * target<targetLimit; set error code for overflow
  247. */
  248. U_CFUNC UBool
  249. ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
  250. int32_t firstLength,
  251. const char **src, const char *srcLimit,
  252. char16_t **target, const char16_t *targetLimit,
  253. int32_t **offsets, int32_t srcIndex,
  254. UBool flush,
  255. UErrorCode *pErrorCode) {
  256. uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
  257. int32_t match;
  258. /* try to match */
  259. match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
  260. (const char *)cnv->toUBytes, firstLength,
  261. *src, (int32_t)(srcLimit-*src),
  262. &value,
  263. cnv->useFallback, flush);
  264. if(match>0) {
  265. /* advance src pointer for the consumed input */
  266. *src+=match-firstLength;
  267. /* write result to target */
  268. ucnv_extWriteToU(cnv, cx,
  269. value,
  270. target, targetLimit,
  271. offsets, srcIndex,
  272. pErrorCode);
  273. return true;
  274. } else if(match<0) {
  275. /* save state for partial match */
  276. const char *s;
  277. int32_t j;
  278. /* copy the first code point */
  279. s=(const char *)cnv->toUBytes;
  280. cnv->preToUFirstLength=(int8_t)firstLength;
  281. for(j=0; j<firstLength; ++j) {
  282. cnv->preToU[j]=*s++;
  283. }
  284. /* now copy the newly consumed input */
  285. s=*src;
  286. match=-match;
  287. for(; j<match; ++j) {
  288. cnv->preToU[j]=*s++;
  289. }
  290. *src=s; /* same as *src=srcLimit; because we reached the end of input */
  291. cnv->preToULength=(int8_t)match;
  292. return true;
  293. } else /* match==0 no match */ {
  294. return false;
  295. }
  296. }
  297. U_CFUNC UChar32
  298. ucnv_extSimpleMatchToU(const int32_t *cx,
  299. const char *source, int32_t length,
  300. UBool useFallback) {
  301. uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
  302. int32_t match;
  303. if(length<=0) {
  304. return 0xffff;
  305. }
  306. /* try to match */
  307. match=ucnv_extMatchToU(cx, -1,
  308. source, length,
  309. nullptr, 0,
  310. &value,
  311. useFallback, true);
  312. if(match==length) {
  313. /* write result for simple, single-character conversion */
  314. if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
  315. return UCNV_EXT_TO_U_GET_CODE_POINT(value);
  316. }
  317. }
  318. /*
  319. * return no match because
  320. * - match>0 && value points to string: simple conversion cannot handle multiple code points
  321. * - match>0 && match!=length: not all input consumed, forbidden for this function
  322. * - match==0: no match found in the first place
  323. * - match<0: partial match, not supported for simple conversion (and flush==true)
  324. */
  325. return 0xfffe;
  326. }
  327. /*
  328. * continue partial match with new input
  329. * never called for simple, single-character conversion
  330. */
  331. U_CFUNC void
  332. ucnv_extContinueMatchToU(UConverter *cnv,
  333. UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
  334. UErrorCode *pErrorCode) {
  335. uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
  336. int32_t match, length;
  337. match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
  338. cnv->preToU, cnv->preToULength,
  339. pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
  340. &value,
  341. cnv->useFallback, pArgs->flush);
  342. if(match>0) {
  343. if(match>=cnv->preToULength) {
  344. /* advance src pointer for the consumed input */
  345. pArgs->source+=match-cnv->preToULength;
  346. cnv->preToULength=0;
  347. } else {
  348. /* the match did not use all of preToU[] - keep the rest for replay */
  349. length=cnv->preToULength-match;
  350. uprv_memmove(cnv->preToU, cnv->preToU+match, length);
  351. cnv->preToULength=(int8_t)-length;
  352. }
  353. /* write result */
  354. ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
  355. value,
  356. &pArgs->target, pArgs->targetLimit,
  357. &pArgs->offsets, srcIndex,
  358. pErrorCode);
  359. } else if(match<0) {
  360. /* save state for partial match */
  361. const char *s;
  362. int32_t j;
  363. /* just _append_ the newly consumed input to preToU[] */
  364. s=pArgs->source;
  365. match=-match;
  366. for(j=cnv->preToULength; j<match; ++j) {
  367. cnv->preToU[j]=*s++;
  368. }
  369. pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
  370. cnv->preToULength=(int8_t)match;
  371. } else /* match==0 */ {
  372. /*
  373. * no match
  374. *
  375. * We need to split the previous input into two parts:
  376. *
  377. * 1. The first codepage character is unmappable - that's how we got into
  378. * trying the extension data in the first place.
  379. * We need to move it from the preToU buffer
  380. * to the error buffer, set an error code,
  381. * and prepare the rest of the previous input for 2.
  382. *
  383. * 2. The rest of the previous input must be converted once we
  384. * come back from the callback for the first character.
  385. * At that time, we have to try again from scratch to convert
  386. * these input characters.
  387. * The replay will be handled by the ucnv.c conversion code.
  388. */
  389. /* move the first codepage character to the error field */
  390. uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
  391. cnv->toULength=cnv->preToUFirstLength;
  392. /* move the rest up inside the buffer */
  393. length=cnv->preToULength-cnv->preToUFirstLength;
  394. if(length>0) {
  395. uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
  396. }
  397. /* mark preToU for replay */
  398. cnv->preToULength=(int8_t)-length;
  399. /* set the error code for unassigned */
  400. *pErrorCode=U_INVALID_CHAR_FOUND;
  401. }
  402. }
  403. /* from Unicode ------------------------------------------------------------- */
  404. // Use roundtrips, "good one-way" mappings, and some normal fallbacks.
  405. static inline UBool
  406. extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
  407. return
  408. ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
  409. FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
  410. (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
  411. }
  412. /*
  413. * @return index of the char16_t, if found; else <0
  414. */
  415. static inline int32_t
  416. ucnv_extFindFromU(const char16_t *fromUSection, int32_t length, char16_t u) {
  417. int32_t i, start, limit;
  418. /* binary search */
  419. start=0;
  420. limit=length;
  421. for(;;) {
  422. i=limit-start;
  423. if(i<=1) {
  424. break; /* done */
  425. }
  426. /* start<limit-1 */
  427. if(i<=4) {
  428. /* linear search for the last part */
  429. if(u<=fromUSection[start]) {
  430. break;
  431. }
  432. if(++start<limit && u<=fromUSection[start]) {
  433. break;
  434. }
  435. if(++start<limit && u<=fromUSection[start]) {
  436. break;
  437. }
  438. /* always break at start==limit-1 */
  439. ++start;
  440. break;
  441. }
  442. i=(start+limit)/2;
  443. if(u<fromUSection[i]) {
  444. limit=i;
  445. } else {
  446. start=i;
  447. }
  448. }
  449. /* did we really find it? */
  450. if(start<limit && u==fromUSection[start]) {
  451. return start;
  452. } else {
  453. return -1; /* not found */
  454. }
  455. }
  456. /*
  457. * @param cx pointer to extension data; if nullptr, returns 0
  458. * @param firstCP the first code point before all the other UChars
  459. * @param pre UChars that must match; !initialMatch: partial match with them
  460. * @param preLength length of pre, >=0
  461. * @param src UChars that can be used to complete a match
  462. * @param srcLength length of src, >=0
  463. * @param pMatchValue [out] output result value for the match from the data structure
  464. * @param useFallback "use fallback" flag, usually from cnv->useFallback
  465. * @param flush true if the end of the input stream is reached
  466. * @return >1: matched, return value=total match length (number of input units matched)
  467. * 1: matched, no mapping but request for <subchar1>
  468. * (only for the first code point)
  469. * 0: no match
  470. * <0: partial match, return value=negative total match length
  471. * (partial matches are never returned for flush==true)
  472. * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
  473. * the matchLength is 2 if only firstCP matched, and >2 if firstCP and
  474. * further code units matched
  475. */
  476. static int32_t
  477. ucnv_extMatchFromU(const int32_t *cx,
  478. UChar32 firstCP,
  479. const char16_t *pre, int32_t preLength,
  480. const char16_t *src, int32_t srcLength,
  481. uint32_t *pMatchValue,
  482. UBool useFallback, UBool flush) {
  483. const uint16_t *stage12, *stage3;
  484. const uint32_t *stage3b;
  485. const char16_t *fromUTableUChars, *fromUSectionUChars;
  486. const uint32_t *fromUTableValues, *fromUSectionValues;
  487. uint32_t value, matchValue;
  488. int32_t i, j, idx, length, matchLength;
  489. char16_t c;
  490. if(cx==nullptr) {
  491. return 0; /* no extension data, no match */
  492. }
  493. /* trie lookup of firstCP */
  494. idx=firstCP>>10; /* stage 1 index */
  495. if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
  496. return 0; /* the first code point is outside the trie */
  497. }
  498. stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
  499. stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
  500. idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
  501. stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
  502. value=stage3b[idx];
  503. if(value==0) {
  504. return 0;
  505. }
  506. /*
  507. * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
  508. * Do not interpret values with reserved bits used, for forward compatibility,
  509. * and do not even remember intermediate results with reserved bits used.
  510. */
  511. if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
  512. /* partial match, enter the loop below */
  513. idx = static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value));
  514. /* initialize */
  515. fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t);
  516. fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
  517. matchValue=0;
  518. i=j=matchLength=0;
  519. /* we must not remember fallback matches when not using fallbacks */
  520. /* match input units until there is a full match or the input is consumed */
  521. for(;;) {
  522. /* go to the next section */
  523. fromUSectionUChars=fromUTableUChars+idx;
  524. fromUSectionValues=fromUTableValues+idx;
  525. /* read first pair of the section */
  526. length=*fromUSectionUChars++;
  527. value=*fromUSectionValues++;
  528. if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
  529. /* remember longest match so far */
  530. matchValue=value;
  531. matchLength=2+i+j;
  532. }
  533. /* match pre[] then src[] */
  534. if(i<preLength) {
  535. c=pre[i++];
  536. } else if(j<srcLength) {
  537. c=src[j++];
  538. } else {
  539. /* all input consumed, partial match */
  540. if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
  541. /*
  542. * end of the entire input stream, stop with the longest match so far
  543. * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
  544. * because it must fit into state buffers
  545. */
  546. break;
  547. } else {
  548. /* continue with more input next time */
  549. return -(2+length);
  550. }
  551. }
  552. /* search for the current char16_t */
  553. idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
  554. if(idx<0) {
  555. /* no match here, stop with the longest match so far */
  556. break;
  557. } else {
  558. value=fromUSectionValues[idx];
  559. if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  560. /* partial match, continue */
  561. idx = static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value));
  562. } else {
  563. if(extFromUUseMapping(useFallback, value, firstCP)) {
  564. /* full match, stop with result */
  565. matchValue=value;
  566. matchLength=2+i+j;
  567. } else {
  568. /* full match on fallback not taken, stop with the longest match so far */
  569. }
  570. break;
  571. }
  572. }
  573. }
  574. if(matchLength==0) {
  575. /* no match at all */
  576. return 0;
  577. }
  578. } else /* result from firstCP trie lookup */ {
  579. if(extFromUUseMapping(useFallback, value, firstCP)) {
  580. /* full match, stop with result */
  581. matchValue=value;
  582. matchLength=2;
  583. } else {
  584. /* fallback not taken */
  585. return 0;
  586. }
  587. }
  588. /* return result */
  589. if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
  590. return 1; /* assert matchLength==2 */
  591. }
  592. *pMatchValue=matchValue;
  593. return matchLength;
  594. }
  595. /*
  596. * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
  597. */
  598. static inline void
  599. ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
  600. uint32_t value,
  601. char **target, const char *targetLimit,
  602. int32_t **offsets, int32_t srcIndex,
  603. UErrorCode *pErrorCode) {
  604. uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
  605. const uint8_t *result;
  606. int32_t length, prevLength;
  607. length=UCNV_EXT_FROM_U_GET_LENGTH(value);
  608. value = UCNV_EXT_FROM_U_GET_DATA(value);
  609. /* output the result */
  610. if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
  611. /*
  612. * Generate a byte array and then write it below.
  613. * This is not the fastest possible way, but it should be ok for
  614. * extension mappings, and it is much simpler.
  615. * Offset and overflow handling are only done once this way.
  616. */
  617. uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
  618. switch(length) {
  619. case 3:
  620. *p++ = static_cast<uint8_t>(value >> 16);
  621. U_FALLTHROUGH;
  622. case 2:
  623. *p++ = static_cast<uint8_t>(value >> 8);
  624. U_FALLTHROUGH;
  625. case 1:
  626. *p++ = static_cast<uint8_t>(value);
  627. U_FALLTHROUGH;
  628. default:
  629. break; /* will never occur */
  630. }
  631. result=buffer+1;
  632. } else {
  633. result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
  634. }
  635. /* with correct data we have length>0 */
  636. if((prevLength=cnv->fromUnicodeStatus)!=0) {
  637. /* handle SI/SO stateful output */
  638. uint8_t shiftByte;
  639. if(prevLength>1 && length==1) {
  640. /* change from double-byte mode to single-byte */
  641. shiftByte = static_cast<uint8_t>(UCNV_SI);
  642. cnv->fromUnicodeStatus=1;
  643. } else if(prevLength==1 && length>1) {
  644. /* change from single-byte mode to double-byte */
  645. shiftByte = static_cast<uint8_t>(UCNV_SO);
  646. cnv->fromUnicodeStatus=2;
  647. } else {
  648. shiftByte=0;
  649. }
  650. if(shiftByte!=0) {
  651. /* prepend the shift byte to the result bytes */
  652. buffer[0]=shiftByte;
  653. if(result!=buffer+1) {
  654. uprv_memcpy(buffer+1, result, length);
  655. }
  656. result=buffer;
  657. ++length;
  658. }
  659. }
  660. ucnv_fromUWriteBytes(cnv, reinterpret_cast<const char*>(result), length,
  661. target, targetLimit,
  662. offsets, srcIndex,
  663. pErrorCode);
  664. }
  665. /*
  666. * target<targetLimit; set error code for overflow
  667. */
  668. U_CFUNC UBool
  669. ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
  670. UChar32 cp,
  671. const char16_t **src, const char16_t *srcLimit,
  672. char **target, const char *targetLimit,
  673. int32_t **offsets, int32_t srcIndex,
  674. UBool flush,
  675. UErrorCode *pErrorCode) {
  676. uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
  677. int32_t match;
  678. /* try to match */
  679. match=ucnv_extMatchFromU(cx, cp,
  680. nullptr, 0,
  681. *src, (int32_t)(srcLimit-*src),
  682. &value,
  683. cnv->useFallback, flush);
  684. /* reject a match if the result is a single byte for DBCS-only */
  685. if( match>=2 &&
  686. !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
  687. cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
  688. ) {
  689. /* advance src pointer for the consumed input */
  690. *src+=match-2; /* remove 2 for the initial code point */
  691. /* write result to target */
  692. ucnv_extWriteFromU(cnv, cx,
  693. value,
  694. target, targetLimit,
  695. offsets, srcIndex,
  696. pErrorCode);
  697. return true;
  698. } else if(match<0) {
  699. /* save state for partial match */
  700. const char16_t *s;
  701. int32_t j;
  702. /* copy the first code point */
  703. cnv->preFromUFirstCP=cp;
  704. /* now copy the newly consumed input */
  705. s=*src;
  706. match=-match-2; /* remove 2 for the initial code point */
  707. for(j=0; j<match; ++j) {
  708. cnv->preFromU[j]=*s++;
  709. }
  710. *src=s; /* same as *src=srcLimit; because we reached the end of input */
  711. cnv->preFromULength=(int8_t)match;
  712. return true;
  713. } else if(match==1) {
  714. /* matched, no mapping but request for <subchar1> */
  715. cnv->useSubChar1=true;
  716. return false;
  717. } else /* match==0 no match */ {
  718. return false;
  719. }
  720. }
  721. /*
  722. * Used by ISO 2022 implementation.
  723. * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
  724. */
  725. U_CFUNC int32_t
  726. ucnv_extSimpleMatchFromU(const int32_t *cx,
  727. UChar32 cp, uint32_t *pValue,
  728. UBool useFallback) {
  729. uint32_t value;
  730. int32_t match;
  731. /* try to match */
  732. match=ucnv_extMatchFromU(cx,
  733. cp,
  734. nullptr, 0,
  735. nullptr, 0,
  736. &value,
  737. useFallback, true);
  738. if(match>=2) {
  739. /* write result for simple, single-character conversion */
  740. int32_t length;
  741. int isRoundtrip;
  742. isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
  743. length=UCNV_EXT_FROM_U_GET_LENGTH(value);
  744. value = UCNV_EXT_FROM_U_GET_DATA(value);
  745. if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
  746. *pValue=value;
  747. return isRoundtrip ? length : -length;
  748. #if 0 /* not currently used */
  749. } else if(length==4) {
  750. /* de-serialize a 4-byte result */
  751. const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
  752. *pValue=
  753. ((uint32_t)result[0]<<24)|
  754. ((uint32_t)result[1]<<16)|
  755. ((uint32_t)result[2]<<8)|
  756. result[3];
  757. return isRoundtrip ? 4 : -4;
  758. #endif
  759. }
  760. }
  761. /*
  762. * return no match because
  763. * - match>1 && resultLength>4: result too long for simple conversion
  764. * - match==1: no match found, <subchar1> preferred
  765. * - match==0: no match found in the first place
  766. * - match<0: partial match, not supported for simple conversion (and flush==true)
  767. */
  768. return 0;
  769. }
  770. /*
  771. * continue partial match with new input, requires cnv->preFromUFirstCP>=0
  772. * never called for simple, single-character conversion
  773. */
  774. U_CFUNC void
  775. ucnv_extContinueMatchFromU(UConverter *cnv,
  776. UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
  777. UErrorCode *pErrorCode) {
  778. uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */
  779. int32_t match;
  780. match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
  781. cnv->preFromUFirstCP,
  782. cnv->preFromU, cnv->preFromULength,
  783. pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
  784. &value,
  785. cnv->useFallback, pArgs->flush);
  786. if(match>=2) {
  787. match-=2; /* remove 2 for the initial code point */
  788. if(match>=cnv->preFromULength) {
  789. /* advance src pointer for the consumed input */
  790. pArgs->source+=match-cnv->preFromULength;
  791. cnv->preFromULength=0;
  792. } else {
  793. /* the match did not use all of preFromU[] - keep the rest for replay */
  794. int32_t length=cnv->preFromULength-match;
  795. u_memmove(cnv->preFromU, cnv->preFromU+match, length);
  796. cnv->preFromULength=(int8_t)-length;
  797. }
  798. /* finish the partial match */
  799. cnv->preFromUFirstCP=U_SENTINEL;
  800. /* write result */
  801. ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
  802. value,
  803. &pArgs->target, pArgs->targetLimit,
  804. &pArgs->offsets, srcIndex,
  805. pErrorCode);
  806. } else if(match<0) {
  807. /* save state for partial match */
  808. const char16_t *s;
  809. int32_t j;
  810. /* just _append_ the newly consumed input to preFromU[] */
  811. s=pArgs->source;
  812. match=-match-2; /* remove 2 for the initial code point */
  813. for(j=cnv->preFromULength; j<match; ++j) {
  814. U_ASSERT(j>=0);
  815. cnv->preFromU[j]=*s++;
  816. }
  817. pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
  818. cnv->preFromULength=(int8_t)match;
  819. } else /* match==0 or 1 */ {
  820. /*
  821. * no match
  822. *
  823. * We need to split the previous input into two parts:
  824. *
  825. * 1. The first code point is unmappable - that's how we got into
  826. * trying the extension data in the first place.
  827. * We need to move it from the preFromU buffer
  828. * to the error buffer, set an error code,
  829. * and prepare the rest of the previous input for 2.
  830. *
  831. * 2. The rest of the previous input must be converted once we
  832. * come back from the callback for the first code point.
  833. * At that time, we have to try again from scratch to convert
  834. * these input characters.
  835. * The replay will be handled by the ucnv.c conversion code.
  836. */
  837. if(match==1) {
  838. /* matched, no mapping but request for <subchar1> */
  839. cnv->useSubChar1=true;
  840. }
  841. /* move the first code point to the error field */
  842. cnv->fromUChar32=cnv->preFromUFirstCP;
  843. cnv->preFromUFirstCP=U_SENTINEL;
  844. /* mark preFromU for replay */
  845. cnv->preFromULength=-cnv->preFromULength;
  846. /* set the error code for unassigned */
  847. *pErrorCode=U_INVALID_CHAR_FOUND;
  848. }
  849. }
  850. static UBool
  851. extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
  852. if(which==UCNV_ROUNDTRIP_SET) {
  853. // Add only code points for which the roundtrip flag is set.
  854. // Do not add any fallbacks, even if ucnv_fromUnicode() would use them
  855. // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet().
  856. //
  857. // By analogy, also do not add "good one-way" mappings.
  858. //
  859. // Do not add entries with reserved bits set.
  860. if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!=
  861. UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) {
  862. return false;
  863. }
  864. } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
  865. // Do not add entries with reserved bits set.
  866. if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) {
  867. return false;
  868. }
  869. }
  870. // Do not add <subchar1> entries or other (future?) pseudo-entries
  871. // with an output length of 0.
  872. return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
  873. }
  874. static void
  875. ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
  876. const int32_t *cx,
  877. const USetAdder *sa,
  878. UConverterUnicodeSet which,
  879. int32_t minLength,
  880. UChar32 firstCP,
  881. char16_t s[UCNV_EXT_MAX_UCHARS], int32_t length,
  882. int32_t sectionIndex,
  883. UErrorCode *pErrorCode) {
  884. const char16_t *fromUSectionUChars;
  885. const uint32_t *fromUSectionValues;
  886. uint32_t value;
  887. int32_t i, count;
  888. fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t)+sectionIndex;
  889. fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
  890. /* read first pair of the section */
  891. count=*fromUSectionUChars++;
  892. value=*fromUSectionValues++;
  893. if(extSetUseMapping(which, minLength, value)) {
  894. if(length==U16_LENGTH(firstCP)) {
  895. /* add the initial code point */
  896. sa->add(sa->set, firstCP);
  897. } else {
  898. /* add the string so far */
  899. sa->addString(sa->set, s, length);
  900. }
  901. }
  902. for(i=0; i<count; ++i) {
  903. /* append this code unit and recurse or add the string */
  904. s[length]=fromUSectionUChars[i];
  905. value=fromUSectionValues[i];
  906. if(value==0) {
  907. /* no mapping, do nothing */
  908. } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  909. ucnv_extGetUnicodeSetString(
  910. sharedData, cx, sa, which, minLength,
  911. firstCP, s, length+1,
  912. static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value)),
  913. pErrorCode);
  914. } else if(extSetUseMapping(which, minLength, value)) {
  915. sa->addString(sa->set, s, length+1);
  916. }
  917. }
  918. }
  919. U_CFUNC void
  920. ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
  921. const USetAdder *sa,
  922. UConverterUnicodeSet which,
  923. UConverterSetFilter filter,
  924. UErrorCode *pErrorCode) {
  925. const int32_t *cx;
  926. const uint16_t *stage12, *stage3, *ps2, *ps3;
  927. const uint32_t *stage3b;
  928. uint32_t value;
  929. int32_t st1, stage1Length, st2, st3, minLength;
  930. char16_t s[UCNV_EXT_MAX_UCHARS];
  931. UChar32 c;
  932. int32_t length;
  933. cx=sharedData->mbcs.extIndexes;
  934. if(cx==nullptr) {
  935. return;
  936. }
  937. stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
  938. stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
  939. stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
  940. stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
  941. /* enumerate the from-Unicode trie table */
  942. c=0; /* keep track of the current code point while enumerating */
  943. if(filter==UCNV_SET_FILTER_2022_CN) {
  944. minLength=3;
  945. } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
  946. filter!=UCNV_SET_FILTER_NONE
  947. ) {
  948. /* DBCS-only, ignore single-byte results */
  949. minLength=2;
  950. } else {
  951. minLength=1;
  952. }
  953. /*
  954. * the trie enumeration is almost the same as
  955. * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
  956. */
  957. for(st1=0; st1<stage1Length; ++st1) {
  958. st2=stage12[st1];
  959. if(st2>stage1Length) {
  960. ps2=stage12+st2;
  961. for(st2=0; st2<64; ++st2) {
  962. if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
  963. /* read the stage 3 block */
  964. ps3=stage3+st3;
  965. do {
  966. value=stage3b[*ps3++];
  967. if(value==0) {
  968. /* no mapping, do nothing */
  969. } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  970. // Recurse for partial results.
  971. length=0;
  972. U16_APPEND_UNSAFE(s, length, c);
  973. ucnv_extGetUnicodeSetString(
  974. sharedData, cx, sa, which, minLength,
  975. c, s, length,
  976. (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
  977. pErrorCode);
  978. } else if(extSetUseMapping(which, minLength, value)) {
  979. switch(filter) {
  980. case UCNV_SET_FILTER_2022_CN:
  981. if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
  982. continue;
  983. }
  984. break;
  985. case UCNV_SET_FILTER_SJIS:
  986. if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
  987. continue;
  988. }
  989. break;
  990. case UCNV_SET_FILTER_GR94DBCS:
  991. if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  992. (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
  993. (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  994. continue;
  995. }
  996. break;
  997. case UCNV_SET_FILTER_HZ:
  998. if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  999. (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
  1000. (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  1001. continue;
  1002. }
  1003. break;
  1004. default:
  1005. /*
  1006. * UCNV_SET_FILTER_NONE,
  1007. * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
  1008. */
  1009. break;
  1010. }
  1011. sa->add(sa->set, c);
  1012. }
  1013. } while((++c&0xf)!=0);
  1014. } else {
  1015. c+=16; /* empty stage 3 block */
  1016. }
  1017. }
  1018. } else {
  1019. c+=1024; /* empty stage 2 block */
  1020. }
  1021. }
  1022. }
  1023. #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */