ucnv_err.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *****************************************************************************
  5. *
  6. * Copyright (C) 1998-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *****************************************************************************
  10. *
  11. * ucnv_err.c
  12. * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
  13. *
  14. *
  15. * Change history:
  16. *
  17. * 06/29/2000 helena Major rewrite of the callback APIs.
  18. */
  19. #include "unicode/utypes.h"
  20. #if !UCONFIG_NO_CONVERSION
  21. #include "unicode/ucnv_err.h"
  22. #include "unicode/ucnv_cb.h"
  23. #include "ucnv_cnv.h"
  24. #include "cmemory.h"
  25. #include "unicode/ucnv.h"
  26. #include "ustrfmt.h"
  /*
   * Capacity, in UTF-16 code units, of the scratch buffers in which the escape
   * callbacks build their replacement text.
   * Presumably 48 = 6 (longest per-unit escape produced in this file, e.g. the
   * six units of an XML hex escape of one byte) * 8 (max number of bytes per
   * char for any converter) -- TODO confirm the intended derivation.
   */
  #define VALUE_STRING_LENGTH             48

  /* Code points of the ASCII characters used to assemble escape sequences. */
  #define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
  #define UNICODE_U_CODEPOINT             0x0055
  #define UNICODE_X_CODEPOINT             0x0058
  #define UNICODE_RS_CODEPOINT            0x005C
  #define UNICODE_U_LOW_CODEPOINT         0x0075
  #define UNICODE_X_LOW_CODEPOINT         0x0078
  #define UNICODE_AMP_CODEPOINT           0x0026
  #define UNICODE_HASH_CODEPOINT          0x0023
  #define UNICODE_SEMICOLON_CODEPOINT     0x003B
  #define UNICODE_PLUS_CODEPOINT          0x002B
  #define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
  #define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D
  #define UNICODE_SPACE_CODEPOINT         0x0020

  /*
   * Values of the first character of a callback's context string.
   * The escape callbacks switch on these to choose an escape syntax.
   */
  #define UCNV_PRV_ESCAPE_ICU             0
  #define UCNV_PRV_ESCAPE_C               'C'
  #define UCNV_PRV_ESCAPE_XML_DEC         'D'
  #define UCNV_PRV_ESCAPE_XML_HEX         'X'
  #define UCNV_PRV_ESCAPE_JAVA            'J'
  #define UCNV_PRV_ESCAPE_UNICODE         'U'
  #define UCNV_PRV_ESCAPE_CSS2            'S'

  /*
   * Context value that makes the skip/substitute callbacks act only on
   * unassigned code points, leaving the caller's error in place otherwise.
   */
  #define UCNV_PRV_STOP_ON_ILLEGAL        'i'
  /*
   * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
   * Evaluates to non-zero when code point c is in the hard-coded list of
   * default-ignorable code points below, and to zero otherwise.
   * The list is hard coded here to avoid a dependency on other code, so it
   * must be updated whenever the set of default-ignorable code points changes.
   * When an ignorable code point is found and is unmappable, the default
   * callbacks ignore it.
   * For a list of the default ignorable code points, use this link:
   * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
   *
   * This list should be kept in sync with the one in CharsetCallback.java.
   *
   * Every use of the argument is parenthesized so that an expression argument
   * (for example "base + 1") is compared as a whole. The argument is still
   * evaluated more than once, so it must not have side effects.
   */
  #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
      ((c) == 0x00AD) || \
      ((c) == 0x034F) || \
      ((c) == 0x061C) || \
      ((c) == 0x115F) || \
      ((c) == 0x1160) || \
      (0x17B4 <= (c) && (c) <= 0x17B5) || \
      (0x180B <= (c) && (c) <= 0x180F) || \
      (0x200B <= (c) && (c) <= 0x200F) || \
      (0x202A <= (c) && (c) <= 0x202E) || \
      (0x2060 <= (c) && (c) <= 0x206F) || \
      ((c) == 0x3164) || \
      (0xFE00 <= (c) && (c) <= 0xFE0F) || \
      ((c) == 0xFEFF) || \
      ((c) == 0xFFA0) || \
      (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
      (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
      (0x1D173 <= (c) && (c) <= 0x1D17A) || \
      (0xE0000 <= (c) && (c) <= 0xE0FFF))
  82. /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
  83. U_CAPI void U_EXPORT2
  84. UCNV_FROM_U_CALLBACK_STOP (
  85. const void *context,
  86. UConverterFromUnicodeArgs *fromUArgs,
  87. const char16_t* codeUnits,
  88. int32_t length,
  89. UChar32 codePoint,
  90. UConverterCallbackReason reason,
  91. UErrorCode * err)
  92. {
  93. (void)context;
  94. (void)fromUArgs;
  95. (void)codeUnits;
  96. (void)length;
  97. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  98. {
  99. /*
  100. * Skip if the codepoint has unicode property of default ignorable.
  101. */
  102. *err = U_ZERO_ERROR;
  103. }
  104. /* the caller must have set the error code accordingly */
  105. }
  106. /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
  107. U_CAPI void U_EXPORT2
  108. UCNV_TO_U_CALLBACK_STOP (
  109. const void *context,
  110. UConverterToUnicodeArgs *toUArgs,
  111. const char* codePoints,
  112. int32_t length,
  113. UConverterCallbackReason reason,
  114. UErrorCode * err)
  115. {
  116. /* the caller must have set the error code accordingly */
  117. (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
  118. }
  119. U_CAPI void U_EXPORT2
  120. UCNV_FROM_U_CALLBACK_SKIP (
  121. const void *context,
  122. UConverterFromUnicodeArgs *fromUArgs,
  123. const char16_t* codeUnits,
  124. int32_t length,
  125. UChar32 codePoint,
  126. UConverterCallbackReason reason,
  127. UErrorCode * err)
  128. {
  129. (void)fromUArgs;
  130. (void)codeUnits;
  131. (void)length;
  132. if (reason <= UCNV_IRREGULAR)
  133. {
  134. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  135. {
  136. /*
  137. * Skip if the codepoint has unicode property of default ignorable.
  138. */
  139. *err = U_ZERO_ERROR;
  140. }
  141. else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  142. {
  143. *err = U_ZERO_ERROR;
  144. }
  145. /* else the caller must have set the error code accordingly. */
  146. }
  147. /* else ignore the reset, close and clone calls. */
  148. }
  149. U_CAPI void U_EXPORT2
  150. UCNV_FROM_U_CALLBACK_SUBSTITUTE (
  151. const void *context,
  152. UConverterFromUnicodeArgs *fromArgs,
  153. const char16_t* codeUnits,
  154. int32_t length,
  155. UChar32 codePoint,
  156. UConverterCallbackReason reason,
  157. UErrorCode * err)
  158. {
  159. (void)codeUnits;
  160. (void)length;
  161. if (reason <= UCNV_IRREGULAR)
  162. {
  163. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  164. {
  165. /*
  166. * Skip if the codepoint has unicode property of default ignorable.
  167. */
  168. *err = U_ZERO_ERROR;
  169. }
  170. else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  171. {
  172. *err = U_ZERO_ERROR;
  173. ucnv_cbFromUWriteSub(fromArgs, 0, err);
  174. }
  175. /* else the caller must have set the error code accordingly. */
  176. }
  177. /* else ignore the reset, close and clone calls. */
  178. }
  179. /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
  180. *uses a clean copy (resetted) of the converter, to convert that unicode
  181. *escape sequence to the target codepage (if conversion failure happens then
  182. *we revert to substituting with subchar)
  183. */
  184. U_CAPI void U_EXPORT2
  185. UCNV_FROM_U_CALLBACK_ESCAPE (
  186. const void *context,
  187. UConverterFromUnicodeArgs *fromArgs,
  188. const char16_t *codeUnits,
  189. int32_t length,
  190. UChar32 codePoint,
  191. UConverterCallbackReason reason,
  192. UErrorCode * err)
  193. {
  194. char16_t valueString[VALUE_STRING_LENGTH];
  195. int32_t valueStringLength = 0;
  196. int32_t i = 0;
  197. const char16_t *myValueSource = nullptr;
  198. UErrorCode err2 = U_ZERO_ERROR;
  199. UConverterFromUCallback original = nullptr;
  200. const void *originalContext;
  201. UConverterFromUCallback ignoredCallback = nullptr;
  202. const void *ignoredContext;
  203. if (reason > UCNV_IRREGULAR)
  204. {
  205. return;
  206. }
  207. else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  208. {
  209. /*
  210. * Skip if the codepoint has unicode property of default ignorable.
  211. */
  212. *err = U_ZERO_ERROR;
  213. return;
  214. }
  215. ucnv_setFromUCallBack (fromArgs->converter,
  216. (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
  217. nullptr,
  218. &original,
  219. &originalContext,
  220. &err2);
  221. if (U_FAILURE (err2))
  222. {
  223. *err = err2;
  224. return;
  225. }
  226. if(context==nullptr)
  227. {
  228. while (i < length)
  229. {
  230. valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  231. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  232. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  233. }
  234. }
  235. else
  236. {
  237. switch(*((char*)context))
  238. {
  239. case UCNV_PRV_ESCAPE_JAVA:
  240. while (i < length)
  241. {
  242. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  243. valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
  244. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  245. }
  246. break;
  247. case UCNV_PRV_ESCAPE_C:
  248. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  249. if(length==2){
  250. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  251. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
  252. }
  253. else{
  254. valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
  255. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
  256. }
  257. break;
  258. case UCNV_PRV_ESCAPE_XML_DEC:
  259. valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  260. valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  261. if(length==2){
  262. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
  263. }
  264. else{
  265. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
  266. }
  267. valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  268. break;
  269. case UCNV_PRV_ESCAPE_XML_HEX:
  270. valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  271. valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  272. valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  273. if(length==2){
  274. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
  275. }
  276. else{
  277. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
  278. }
  279. valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  280. break;
  281. case UCNV_PRV_ESCAPE_UNICODE:
  282. valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
  283. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  284. valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
  285. if (length == 2) {
  286. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
  287. } else {
  288. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
  289. }
  290. valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
  291. break;
  292. case UCNV_PRV_ESCAPE_CSS2:
  293. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  294. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
  295. /* Always add space character, because the next character might be whitespace,
  296. which would erroneously be considered the termination of the escape sequence. */
  297. valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
  298. break;
  299. default:
  300. while (i < length)
  301. {
  302. valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  303. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  304. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  305. }
  306. }
  307. }
  308. myValueSource = valueString;
  309. /* reset the error */
  310. *err = U_ZERO_ERROR;
  311. ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
  312. ucnv_setFromUCallBack (fromArgs->converter,
  313. original,
  314. originalContext,
  315. &ignoredCallback,
  316. &ignoredContext,
  317. &err2);
  318. if (U_FAILURE (err2))
  319. {
  320. *err = err2;
  321. return;
  322. }
  323. }
  324. U_CAPI void U_EXPORT2
  325. UCNV_TO_U_CALLBACK_SKIP (
  326. const void *context,
  327. UConverterToUnicodeArgs *toArgs,
  328. const char* codeUnits,
  329. int32_t length,
  330. UConverterCallbackReason reason,
  331. UErrorCode * err)
  332. {
  333. (void)toArgs;
  334. (void)codeUnits;
  335. (void)length;
  336. if (reason <= UCNV_IRREGULAR)
  337. {
  338. if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  339. {
  340. *err = U_ZERO_ERROR;
  341. }
  342. /* else the caller must have set the error code accordingly. */
  343. }
  344. /* else ignore the reset, close and clone calls. */
  345. }
  346. U_CAPI void U_EXPORT2
  347. UCNV_TO_U_CALLBACK_SUBSTITUTE (
  348. const void *context,
  349. UConverterToUnicodeArgs *toArgs,
  350. const char* codeUnits,
  351. int32_t length,
  352. UConverterCallbackReason reason,
  353. UErrorCode * err)
  354. {
  355. (void)codeUnits;
  356. (void)length;
  357. if (reason <= UCNV_IRREGULAR)
  358. {
  359. if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  360. {
  361. *err = U_ZERO_ERROR;
  362. ucnv_cbToUWriteSub(toArgs,0,err);
  363. }
  364. /* else the caller must have set the error code accordingly. */
  365. }
  366. /* else ignore the reset, close and clone calls. */
  367. }
  368. /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
  369. *and uses that as the substitution sequence
  370. */
  371. U_CAPI void U_EXPORT2
  372. UCNV_TO_U_CALLBACK_ESCAPE (
  373. const void *context,
  374. UConverterToUnicodeArgs *toArgs,
  375. const char* codeUnits,
  376. int32_t length,
  377. UConverterCallbackReason reason,
  378. UErrorCode * err)
  379. {
  380. char16_t uniValueString[VALUE_STRING_LENGTH];
  381. int32_t valueStringLength = 0;
  382. int32_t i = 0;
  383. if (reason > UCNV_IRREGULAR)
  384. {
  385. return;
  386. }
  387. if(context==nullptr)
  388. {
  389. while (i < length)
  390. {
  391. uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  392. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
  393. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
  394. }
  395. }
  396. else
  397. {
  398. switch(*((char*)context))
  399. {
  400. case UCNV_PRV_ESCAPE_XML_DEC:
  401. while (i < length)
  402. {
  403. uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  404. uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  405. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
  406. uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  407. }
  408. break;
  409. case UCNV_PRV_ESCAPE_XML_HEX:
  410. while (i < length)
  411. {
  412. uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  413. uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  414. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  415. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
  416. uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  417. }
  418. break;
  419. case UCNV_PRV_ESCAPE_C:
  420. while (i < length)
  421. {
  422. uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  423. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  424. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
  425. }
  426. break;
  427. default:
  428. while (i < length)
  429. {
  430. uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  431. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
  432. uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
  433. valueStringLength += 2;
  434. }
  435. }
  436. }
  437. /* reset the error */
  438. *err = U_ZERO_ERROR;
  439. ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
  440. }
  441. #endif