ucnv_err.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *****************************************************************************
  5. *
  6. * Copyright (C) 1998-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *****************************************************************************
  10. *
  11. * ucnv_err.c
  12. * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
  13. *
  14. *
  15. * Change history:
  16. *
  17. * 06/29/2000 helena Major rewrite of the callback APIs.
  18. */
  19. #include "unicode/utypes.h"
  20. #if !UCONFIG_NO_CONVERSION
  21. #include "unicode/ucnv_err.h"
  22. #include "unicode/ucnv_cb.h"
  23. #include "ucnv_cnv.h"
  24. #include "cmemory.h"
  25. #include "unicode/ucnv.h"
  26. #include "ustrfmt.h"
  /*
   * Capacity, in UTF-16 code units, of the scratch buffers in which the escape
   * callbacks in this file assemble their output.
   * NOTE(review): an earlier note here read "Magic # 32 = 4 (number of chars in
   * value string) * 8 (max number of bytes per char for any converter)", which
   * does not match the current value of 48 -- confirm the intended derivation.
   */
  #define VALUE_STRING_LENGTH 48

  /* Characters used to build escape sequences, as UTF-16 code unit values. */
  #define UNICODE_SPACE_CODEPOINT        0x0020  /* ' ' */
  #define UNICODE_HASH_CODEPOINT         0x0023  /* '#' */
  #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025  /* '%' */
  #define UNICODE_AMP_CODEPOINT          0x0026  /* '&' */
  #define UNICODE_PLUS_CODEPOINT         0x002B  /* '+' */
  #define UNICODE_SEMICOLON_CODEPOINT    0x003B  /* ';' */
  #define UNICODE_U_CODEPOINT            0x0055  /* 'U' */
  #define UNICODE_X_CODEPOINT            0x0058  /* 'X' */
  #define UNICODE_RS_CODEPOINT           0x005C  /* reverse solidus (backslash) */
  #define UNICODE_U_LOW_CODEPOINT        0x0075  /* 'u' */
  #define UNICODE_X_LOW_CODEPOINT        0x0078  /* 'x' */
  #define UNICODE_LEFT_CURLY_CODEPOINT   0x007B  /* '{' */
  #define UNICODE_RIGHT_CURLY_CODEPOINT  0x007D  /* '}' */

  /*
   * Values compared against the first char of a callback's context pointer
   * to select the escape format used by the escape callbacks.
   */
  #define UCNV_PRV_ESCAPE_ICU      0
  #define UCNV_PRV_ESCAPE_C        'C'
  #define UCNV_PRV_ESCAPE_XML_DEC  'D'
  #define UCNV_PRV_ESCAPE_XML_HEX  'X'
  #define UCNV_PRV_ESCAPE_JAVA     'J'
  #define UCNV_PRV_ESCAPE_UNICODE  'U'
  #define UCNV_PRV_ESCAPE_CSS2     'S'

  /*
   * Context value checked by the skip/substitute callbacks: with it, only
   * UCNV_UNASSIGNED errors are cleared and other reasons keep their error.
   */
  #define UCNV_PRV_STOP_ON_ILLEGAL 'i'
  /*
   * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
   *
   * Evaluates to non-zero if the code point c is in the hard-coded list of
   * default ignorable code points below, and to zero otherwise.
   *
   * The list is hard-coded here to avoid a dependency on other code, so it
   * must be updated whenever the set of default ignorable code points changes.
   * When an ignorable code point is found and is unmappable, the default
   * callbacks ignore it.
   *
   * For the list of default ignorable code points, see:
   * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
   *
   * This list should be kept in sync with the one in CharsetCallback.java.
   *
   * Every use of the argument is parenthesized so that expressions containing
   * low-precedence operators (for example a conditional expression) expand
   * correctly. The argument is still evaluated more than once, so it must not
   * have side effects.
   */
  #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
      ((c) == 0x00AD) || \
      ((c) == 0x034F) || \
      ((c) == 0x061C) || \
      ((c) == 0x115F) || \
      ((c) == 0x1160) || \
      (0x17B4 <= (c) && (c) <= 0x17B5) || \
      (0x180B <= (c) && (c) <= 0x180F) || \
      (0x200B <= (c) && (c) <= 0x200F) || \
      (0x202A <= (c) && (c) <= 0x202E) || \
      (0x2060 <= (c) && (c) <= 0x206F) || \
      ((c) == 0x3164) || \
      (0xFE00 <= (c) && (c) <= 0xFE0F) || \
      ((c) == 0xFEFF) || \
      ((c) == 0xFFA0) || \
      (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
      (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
      (0x1D173 <= (c) && (c) <= 0x1D17A) || \
      (0xE0000 <= (c) && (c) <= 0xE0FFF))
  82. /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
  83. U_CAPI void U_EXPORT2
  84. UCNV_FROM_U_CALLBACK_STOP (
  85. const void *context,
  86. UConverterFromUnicodeArgs *fromUArgs,
  87. const char16_t* codeUnits,
  88. int32_t length,
  89. UChar32 codePoint,
  90. UConverterCallbackReason reason,
  91. UErrorCode * err)
  92. {
  93. (void)context;
  94. (void)fromUArgs;
  95. (void)codeUnits;
  96. (void)length;
  97. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  98. {
  99. /*
  100. * Skip if the codepoint has unicode property of default ignorable.
  101. */
  102. *err = U_ZERO_ERROR;
  103. }
  104. /* the caller must have set the error code accordingly */
  105. return;
  106. }
  107. /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
  108. U_CAPI void U_EXPORT2
  109. UCNV_TO_U_CALLBACK_STOP (
  110. const void *context,
  111. UConverterToUnicodeArgs *toUArgs,
  112. const char* codePoints,
  113. int32_t length,
  114. UConverterCallbackReason reason,
  115. UErrorCode * err)
  116. {
  117. /* the caller must have set the error code accordingly */
  118. (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
  119. return;
  120. }
  121. U_CAPI void U_EXPORT2
  122. UCNV_FROM_U_CALLBACK_SKIP (
  123. const void *context,
  124. UConverterFromUnicodeArgs *fromUArgs,
  125. const char16_t* codeUnits,
  126. int32_t length,
  127. UChar32 codePoint,
  128. UConverterCallbackReason reason,
  129. UErrorCode * err)
  130. {
  131. (void)fromUArgs;
  132. (void)codeUnits;
  133. (void)length;
  134. if (reason <= UCNV_IRREGULAR)
  135. {
  136. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  137. {
  138. /*
  139. * Skip if the codepoint has unicode property of default ignorable.
  140. */
  141. *err = U_ZERO_ERROR;
  142. }
  143. else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  144. {
  145. *err = U_ZERO_ERROR;
  146. }
  147. /* else the caller must have set the error code accordingly. */
  148. }
  149. /* else ignore the reset, close and clone calls. */
  150. }
  151. U_CAPI void U_EXPORT2
  152. UCNV_FROM_U_CALLBACK_SUBSTITUTE (
  153. const void *context,
  154. UConverterFromUnicodeArgs *fromArgs,
  155. const char16_t* codeUnits,
  156. int32_t length,
  157. UChar32 codePoint,
  158. UConverterCallbackReason reason,
  159. UErrorCode * err)
  160. {
  161. (void)codeUnits;
  162. (void)length;
  163. if (reason <= UCNV_IRREGULAR)
  164. {
  165. if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  166. {
  167. /*
  168. * Skip if the codepoint has unicode property of default ignorable.
  169. */
  170. *err = U_ZERO_ERROR;
  171. }
  172. else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  173. {
  174. *err = U_ZERO_ERROR;
  175. ucnv_cbFromUWriteSub(fromArgs, 0, err);
  176. }
  177. /* else the caller must have set the error code accordingly. */
  178. }
  179. /* else ignore the reset, close and clone calls. */
  180. }
  181. /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
  182. *uses a clean copy (resetted) of the converter, to convert that unicode
  183. *escape sequence to the target codepage (if conversion failure happens then
  184. *we revert to substituting with subchar)
  185. */
  186. U_CAPI void U_EXPORT2
  187. UCNV_FROM_U_CALLBACK_ESCAPE (
  188. const void *context,
  189. UConverterFromUnicodeArgs *fromArgs,
  190. const char16_t *codeUnits,
  191. int32_t length,
  192. UChar32 codePoint,
  193. UConverterCallbackReason reason,
  194. UErrorCode * err)
  195. {
  196. char16_t valueString[VALUE_STRING_LENGTH];
  197. int32_t valueStringLength = 0;
  198. int32_t i = 0;
  199. const char16_t *myValueSource = nullptr;
  200. UErrorCode err2 = U_ZERO_ERROR;
  201. UConverterFromUCallback original = nullptr;
  202. const void *originalContext;
  203. UConverterFromUCallback ignoredCallback = nullptr;
  204. const void *ignoredContext;
  205. if (reason > UCNV_IRREGULAR)
  206. {
  207. return;
  208. }
  209. else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
  210. {
  211. /*
  212. * Skip if the codepoint has unicode property of default ignorable.
  213. */
  214. *err = U_ZERO_ERROR;
  215. return;
  216. }
  217. ucnv_setFromUCallBack (fromArgs->converter,
  218. (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
  219. nullptr,
  220. &original,
  221. &originalContext,
  222. &err2);
  223. if (U_FAILURE (err2))
  224. {
  225. *err = err2;
  226. return;
  227. }
  228. if(context==nullptr)
  229. {
  230. while (i < length)
  231. {
  232. valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  233. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  234. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  235. }
  236. }
  237. else
  238. {
  239. switch(*((char*)context))
  240. {
  241. case UCNV_PRV_ESCAPE_JAVA:
  242. while (i < length)
  243. {
  244. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  245. valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
  246. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  247. }
  248. break;
  249. case UCNV_PRV_ESCAPE_C:
  250. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  251. if(length==2){
  252. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  253. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
  254. }
  255. else{
  256. valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
  257. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
  258. }
  259. break;
  260. case UCNV_PRV_ESCAPE_XML_DEC:
  261. valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  262. valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  263. if(length==2){
  264. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
  265. }
  266. else{
  267. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
  268. }
  269. valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  270. break;
  271. case UCNV_PRV_ESCAPE_XML_HEX:
  272. valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  273. valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  274. valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  275. if(length==2){
  276. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
  277. }
  278. else{
  279. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
  280. }
  281. valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  282. break;
  283. case UCNV_PRV_ESCAPE_UNICODE:
  284. valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
  285. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  286. valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
  287. if (length == 2) {
  288. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
  289. } else {
  290. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
  291. }
  292. valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
  293. break;
  294. case UCNV_PRV_ESCAPE_CSS2:
  295. valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  296. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
  297. /* Always add space character, because the next character might be whitespace,
  298. which would erroneously be considered the termination of the escape sequence. */
  299. valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
  300. break;
  301. default:
  302. while (i < length)
  303. {
  304. valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  305. valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
  306. valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
  307. }
  308. }
  309. }
  310. myValueSource = valueString;
  311. /* reset the error */
  312. *err = U_ZERO_ERROR;
  313. ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
  314. ucnv_setFromUCallBack (fromArgs->converter,
  315. original,
  316. originalContext,
  317. &ignoredCallback,
  318. &ignoredContext,
  319. &err2);
  320. if (U_FAILURE (err2))
  321. {
  322. *err = err2;
  323. return;
  324. }
  325. return;
  326. }
  327. U_CAPI void U_EXPORT2
  328. UCNV_TO_U_CALLBACK_SKIP (
  329. const void *context,
  330. UConverterToUnicodeArgs *toArgs,
  331. const char* codeUnits,
  332. int32_t length,
  333. UConverterCallbackReason reason,
  334. UErrorCode * err)
  335. {
  336. (void)toArgs;
  337. (void)codeUnits;
  338. (void)length;
  339. if (reason <= UCNV_IRREGULAR)
  340. {
  341. if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  342. {
  343. *err = U_ZERO_ERROR;
  344. }
  345. /* else the caller must have set the error code accordingly. */
  346. }
  347. /* else ignore the reset, close and clone calls. */
  348. }
  349. U_CAPI void U_EXPORT2
  350. UCNV_TO_U_CALLBACK_SUBSTITUTE (
  351. const void *context,
  352. UConverterToUnicodeArgs *toArgs,
  353. const char* codeUnits,
  354. int32_t length,
  355. UConverterCallbackReason reason,
  356. UErrorCode * err)
  357. {
  358. (void)codeUnits;
  359. (void)length;
  360. if (reason <= UCNV_IRREGULAR)
  361. {
  362. if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
  363. {
  364. *err = U_ZERO_ERROR;
  365. ucnv_cbToUWriteSub(toArgs,0,err);
  366. }
  367. /* else the caller must have set the error code accordingly. */
  368. }
  369. /* else ignore the reset, close and clone calls. */
  370. }
  371. /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
  372. *and uses that as the substitution sequence
  373. */
  374. U_CAPI void U_EXPORT2
  375. UCNV_TO_U_CALLBACK_ESCAPE (
  376. const void *context,
  377. UConverterToUnicodeArgs *toArgs,
  378. const char* codeUnits,
  379. int32_t length,
  380. UConverterCallbackReason reason,
  381. UErrorCode * err)
  382. {
  383. char16_t uniValueString[VALUE_STRING_LENGTH];
  384. int32_t valueStringLength = 0;
  385. int32_t i = 0;
  386. if (reason > UCNV_IRREGULAR)
  387. {
  388. return;
  389. }
  390. if(context==nullptr)
  391. {
  392. while (i < length)
  393. {
  394. uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  395. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
  396. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
  397. }
  398. }
  399. else
  400. {
  401. switch(*((char*)context))
  402. {
  403. case UCNV_PRV_ESCAPE_XML_DEC:
  404. while (i < length)
  405. {
  406. uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  407. uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  408. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
  409. uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  410. }
  411. break;
  412. case UCNV_PRV_ESCAPE_XML_HEX:
  413. while (i < length)
  414. {
  415. uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
  416. uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
  417. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  418. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
  419. uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
  420. }
  421. break;
  422. case UCNV_PRV_ESCAPE_C:
  423. while (i < length)
  424. {
  425. uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
  426. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
  427. valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
  428. }
  429. break;
  430. default:
  431. while (i < length)
  432. {
  433. uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
  434. uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
  435. uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
  436. valueStringLength += 2;
  437. }
  438. }
  439. }
  440. /* reset the error */
  441. *err = U_ZERO_ERROR;
  442. ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
  443. }
  444. #endif