utf_impl.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 1999-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: utf_impl.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 1999sep13
  16. * created by: Markus W. Scherer
  17. *
  18. * This file provides implementation functions for macros in the utfXX.h
  19. * that would otherwise be too long as macros.
  20. */
  21. /* set import/export definitions */
  22. #ifndef U_UTF8_IMPL
  23. # define U_UTF8_IMPL
  24. #endif
  25. #include "unicode/utypes.h"
  26. #include "unicode/utf.h"
  27. #include "unicode/utf8.h"
  28. #include "uassert.h"
  29. /*
  30. * Table of the number of utf8 trail bytes, indexed by the lead byte.
  31. * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
  32. *
  33. * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
  34. *
  35. * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
  36. * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
  37. * may exist in old client code that must continue to run with newer icu library versions.
  38. *
  39. * This table could be replaced on many machines by
  40. * a few lines of assembler code using an
  41. * "index of first 0-bit from msb" instruction and
  42. * one or two more integer instructions.
  43. *
  44. * For example, on an i386, do something like
  45. * - MOV AL, leadByte
  46. * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)
  47. * - MOV AH, 0
  48. * - BSR BX, AX (16-bit)
  49. * - MOV AX, 6 (result)
  50. * - JZ finish (ZF==1 if leadByte==0xff)
  51. * - SUB AX, BX (result)
  52. * -finish:
  53. * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
  54. */
  55. U_CAPI const uint8_t
  56. utf8_countTrailBytes[256]={
  57. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  58. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  59. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  60. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  61. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  62. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  63. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  64. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  65. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  66. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  67. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  68. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  69. // illegal C0 & C1
  70. // 2-byte lead bytes C2..DF
  71. 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  72. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  73. // 3-byte lead bytes E0..EF
  74. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  75. // 4-byte lead bytes F0..F4
  76. // illegal F5..FF
  77. 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  78. };
  79. static const UChar32
  80. utf8_errorValue[6]={
  81. // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
  82. // but without relying on the obsolete unicode/utf_old.h.
  83. 0x15, 0x9f, 0xffff,
  84. 0x10ffff
  85. };
  86. static UChar32
  87. errorValue(int32_t count, int8_t strict) {
  88. if(strict>=0) {
  89. return utf8_errorValue[count];
  90. } else if(strict==-3) {
  91. return 0xfffd;
  92. } else {
  93. return U_SENTINEL;
  94. }
  95. }
  96. /*
  97. * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
  98. * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
  99. *
  100. * U8_NEXT() supports NUL-terminated strings indicated via length<0.
  101. *
  102. * The "strict" parameter controls the error behavior:
  103. * <0 "Safe" behavior of U8_NEXT():
  104. * -1: All illegal byte sequences yield U_SENTINEL=-1.
  105. * -2: Same as -1, except for lenient treatment of surrogate code points as legal.
  106. * Some implementations use this for roundtripping of
  107. * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
  108. * contain unpaired surrogates.
  109. * -3: All illegal byte sequences yield U+FFFD.
  110. * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., false):
  111. * All illegal byte sequences yield a positive code point such that this
  112. * result code point would be encoded with the same number of bytes as
  113. * the illegal sequence.
  114. * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., true):
  115. * Same as the obsolete "safe" behavior, but non-characters are also treated
  116. * like illegal sequences.
  117. *
  118. * Note that a UBool is the same as an int8_t.
  119. */
  120. U_CAPI UChar32 U_EXPORT2
  121. utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
  122. // *pi is one after byte c.
  123. int32_t i=*pi;
  124. // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
  125. if(i==length || c>0xf4) {
  126. // end of string, or not a lead byte
  127. } else if(c>=0xf0) {
  128. // Test for 4-byte sequences first because
  129. // U8_NEXT() handles shorter valid sequences inline.
  130. uint8_t t1=s[i], t2, t3;
  131. c&=7;
  132. if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
  133. ++i!=length && (t2=s[i]-0x80)<=0x3f &&
  134. ++i!=length && (t3=s[i]-0x80)<=0x3f) {
  135. ++i;
  136. c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
  137. // strict: forbid non-characters like U+fffe
  138. if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
  139. *pi=i;
  140. return c;
  141. }
  142. }
  143. } else if(c>=0xe0) {
  144. c&=0xf;
  145. if(strict!=-2) {
  146. uint8_t t1=s[i], t2;
  147. if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
  148. ++i!=length && (t2=s[i]-0x80)<=0x3f) {
  149. ++i;
  150. c=(c<<12)|((t1&0x3f)<<6)|t2;
  151. // strict: forbid non-characters like U+fffe
  152. if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
  153. *pi=i;
  154. return c;
  155. }
  156. }
  157. } else {
  158. // strict=-2 -> lenient: allow surrogates
  159. uint8_t t1=s[i]-0x80, t2;
  160. if(t1<=0x3f && (c>0 || t1>=0x20) &&
  161. ++i!=length && (t2=s[i]-0x80)<=0x3f) {
  162. *pi=i+1;
  163. return (c<<12)|(t1<<6)|t2;
  164. }
  165. }
  166. } else if(c>=0xc2) {
  167. uint8_t t1=s[i]-0x80;
  168. if(t1<=0x3f) {
  169. *pi=i+1;
  170. return ((c-0xc0)<<6)|t1;
  171. }
  172. } // else 0x80<=c<0xc2 is not a lead byte
  173. /* error handling */
  174. c=errorValue(i-*pi, strict);
  175. *pi=i;
  176. return c;
  177. }
  178. U_CAPI int32_t U_EXPORT2
  179. utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
  180. if((uint32_t)(c)<=0x7ff) {
  181. if((i)+1<(length)) {
  182. (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
  183. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
  184. return i;
  185. }
  186. } else if((uint32_t)(c)<=0xffff) {
  187. /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
  188. if((i)+2<(length) && !U_IS_SURROGATE(c)) {
  189. (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
  190. (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
  191. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
  192. return i;
  193. }
  194. } else if((uint32_t)(c)<=0x10ffff) {
  195. if((i)+3<(length)) {
  196. (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
  197. (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
  198. (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
  199. (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
  200. return i;
  201. }
  202. }
  203. /* c>0x10ffff or not enough space, write an error value */
  204. if(pIsError!=nullptr) {
  205. *pIsError=true;
  206. } else {
  207. length-=i;
  208. if(length>0) {
  209. int32_t offset;
  210. if(length>3) {
  211. length=3;
  212. }
  213. s+=i;
  214. offset=0;
  215. c=utf8_errorValue[length-1];
  216. U8_APPEND_UNSAFE(s, offset, c);
  217. i=i+offset;
  218. }
  219. }
  220. return i;
  221. }
  222. U_CAPI UChar32 U_EXPORT2
  223. utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
  224. // *pi is the index of byte c.
  225. int32_t i=*pi;
  226. if(U8_IS_TRAIL(c) && i>start) {
  227. uint8_t b1=s[--i];
  228. if(U8_IS_LEAD(b1)) {
  229. if(b1<0xe0) {
  230. *pi=i;
  231. return ((b1-0xc0)<<6)|(c&0x3f);
  232. } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
  233. // Truncated 3- or 4-byte sequence.
  234. *pi=i;
  235. return errorValue(1, strict);
  236. }
  237. } else if(U8_IS_TRAIL(b1) && i>start) {
  238. // Extract the value bits from the last trail byte.
  239. c&=0x3f;
  240. uint8_t b2=s[--i];
  241. if(0xe0<=b2 && b2<=0xf4) {
  242. if(b2<0xf0) {
  243. b2&=0xf;
  244. if(strict!=-2) {
  245. if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
  246. *pi=i;
  247. c=(b2<<12)|((b1&0x3f)<<6)|c;
  248. if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
  249. return c;
  250. } else {
  251. // strict: forbid non-characters like U+fffe
  252. return errorValue(2, strict);
  253. }
  254. }
  255. } else {
  256. // strict=-2 -> lenient: allow surrogates
  257. b1-=0x80;
  258. if((b2>0 || b1>=0x20)) {
  259. *pi=i;
  260. return (b2<<12)|(b1<<6)|c;
  261. }
  262. }
  263. } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
  264. // Truncated 4-byte sequence.
  265. *pi=i;
  266. return errorValue(2, strict);
  267. }
  268. } else if(U8_IS_TRAIL(b2) && i>start) {
  269. uint8_t b3=s[--i];
  270. if(0xf0<=b3 && b3<=0xf4) {
  271. b3&=7;
  272. if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
  273. *pi=i;
  274. c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
  275. if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
  276. return c;
  277. } else {
  278. // strict: forbid non-characters like U+fffe
  279. return errorValue(3, strict);
  280. }
  281. }
  282. }
  283. }
  284. }
  285. }
  286. return errorValue(0, strict);
  287. }
  288. U_CAPI int32_t U_EXPORT2
  289. utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
  290. // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
  291. int32_t orig_i=i;
  292. uint8_t c=s[i];
  293. if(U8_IS_TRAIL(c) && i>start) {
  294. uint8_t b1=s[--i];
  295. if(U8_IS_LEAD(b1)) {
  296. if(b1<0xe0 ||
  297. (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
  298. return i;
  299. }
  300. } else if(U8_IS_TRAIL(b1) && i>start) {
  301. uint8_t b2=s[--i];
  302. if(0xe0<=b2 && b2<=0xf4) {
  303. if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
  304. return i;
  305. }
  306. } else if(U8_IS_TRAIL(b2) && i>start) {
  307. uint8_t b3=s[--i];
  308. if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
  309. return i;
  310. }
  311. }
  312. }
  313. }
  314. return orig_i;
  315. }