encoding.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. /**
  2. * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  3. * SPDX-License-Identifier: Apache-2.0.
  4. */
#include <aws/common/encoding.h>

#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
  8. #ifdef USE_SIMD_ENCODING
  9. size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len);
  10. void aws_common_private_base64_encode_sse41(const unsigned char *in, unsigned char *out, size_t len);
  11. bool aws_common_private_has_avx2(void);
  12. #else
  13. /*
  14. * When AVX2 compilation is unavailable, we use these stubs to fall back to the pure-C decoder.
  15. * Since we force aws_common_private_has_avx2 to return false, the encode and decode functions should
  16. * not be called - but we must provide them anyway to avoid link errors.
  17. */
  18. static inline size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
  19. (void)in;
  20. (void)out;
  21. (void)len;
  22. AWS_ASSERT(false);
  23. return (size_t)-1; /* unreachable */
  24. }
  25. static inline void aws_common_private_base64_encode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
  26. (void)in;
  27. (void)out;
  28. (void)len;
  29. AWS_ASSERT(false);
  30. }
  31. static inline bool aws_common_private_has_avx2(void) {
  32. return false;
  33. }
  34. #endif
/* Lowercase hexadecimal digits, indexed by nibble value (0-15). */
static const uint8_t *HEX_CHARS = (const uint8_t *)"0123456789abcdef";
/* Value stored in BASE64_DECODING_TABLE for the '=' padding character. */
static const uint8_t BASE64_SENTINEL_VALUE = 0xff;
/* Base64 alphabet, indexed by 6-bit value (0-63). */
static const uint8_t BASE64_ENCODING_TABLE[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/*
 * Maps an input byte (used as the index) to its 6-bit base64 value.
 *   - 0xDD marks a byte that is not part of the base64 alphabet.
 *   - 255 (BASE64_SENTINEL_VALUE) marks the '=' padding character.
 * There are 16 entries per row, so row r covers input bytes 16*r .. 16*r + 15.
 * Reformatting is turned off to make sure this stays at 16 entries per line.
 *
 * NOTE(review): entry 0 (the NUL byte) is 64, which is neither a 6-bit value nor
 * 0xDD, so the scalar decoder does not reject NUL bytes - confirm this is intended.
 */
/* clang-format off */
static const uint8_t BASE64_DECODING_TABLE[256] = {
    64, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x00-0x0F */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x10-0x1F */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 62, 0xDD, 0xDD, 0xDD, 63, /* 0x20-0x2F: '+' and '/' */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0xDD, 0xDD, 0xDD, 255, 0xDD, 0xDD, /* 0x30-0x3F: '0'-'9' and '=' */
    0xDD, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 0x40-0x4F: 'A'-'O' */
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x50-0x5F: 'P'-'Z' */
    0xDD, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* 0x60-0x6F: 'a'-'o' */
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x70-0x7F: 'p'-'z' */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x80-0x8F */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0x90-0x9F */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0xA0-0xAF */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0xB0-0xBF */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0xC0-0xCF */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0xD0-0xDF */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, /* 0xE0-0xEF */
    0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD, 0xDD}; /* 0xF0-0xFF */
/* clang-format on */
  59. int aws_hex_compute_encoded_len(size_t to_encode_len, size_t *encoded_length) {
  60. AWS_ASSERT(encoded_length);
  61. size_t temp = (to_encode_len << 1) + 1;
  62. if (AWS_UNLIKELY(temp < to_encode_len)) {
  63. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  64. }
  65. *encoded_length = temp;
  66. return AWS_OP_SUCCESS;
  67. }
  68. int aws_hex_encode(const struct aws_byte_cursor *AWS_RESTRICT to_encode, struct aws_byte_buf *AWS_RESTRICT output) {
  69. AWS_PRECONDITION(aws_byte_cursor_is_valid(to_encode));
  70. AWS_PRECONDITION(aws_byte_buf_is_valid(output));
  71. size_t encoded_len = 0;
  72. if (AWS_UNLIKELY(aws_hex_compute_encoded_len(to_encode->len, &encoded_len))) {
  73. return AWS_OP_ERR;
  74. }
  75. if (AWS_UNLIKELY(output->capacity < encoded_len)) {
  76. return aws_raise_error(AWS_ERROR_SHORT_BUFFER);
  77. }
  78. size_t written = 0;
  79. for (size_t i = 0; i < to_encode->len; ++i) {
  80. output->buffer[written++] = HEX_CHARS[to_encode->ptr[i] >> 4 & 0x0f];
  81. output->buffer[written++] = HEX_CHARS[to_encode->ptr[i] & 0x0f];
  82. }
  83. output->buffer[written] = '\0';
  84. output->len = encoded_len;
  85. return AWS_OP_SUCCESS;
  86. }
  87. int aws_hex_encode_append_dynamic(
  88. const struct aws_byte_cursor *AWS_RESTRICT to_encode,
  89. struct aws_byte_buf *AWS_RESTRICT output) {
  90. AWS_ASSERT(to_encode->ptr);
  91. AWS_ASSERT(aws_byte_buf_is_valid(output));
  92. size_t encoded_len = 0;
  93. if (AWS_UNLIKELY(aws_add_size_checked(to_encode->len, to_encode->len, &encoded_len))) {
  94. return AWS_OP_ERR;
  95. }
  96. if (AWS_UNLIKELY(aws_byte_buf_reserve_relative(output, encoded_len))) {
  97. return AWS_OP_ERR;
  98. }
  99. size_t written = output->len;
  100. for (size_t i = 0; i < to_encode->len; ++i) {
  101. output->buffer[written++] = HEX_CHARS[to_encode->ptr[i] >> 4 & 0x0f];
  102. output->buffer[written++] = HEX_CHARS[to_encode->ptr[i] & 0x0f];
  103. }
  104. output->len += encoded_len;
  105. return AWS_OP_SUCCESS;
  106. }
  107. static int s_hex_decode_char_to_int(char character, uint8_t *int_val) {
  108. if (character >= 'a' && character <= 'f') {
  109. *int_val = (uint8_t)(10 + (character - 'a'));
  110. return 0;
  111. }
  112. if (character >= 'A' && character <= 'F') {
  113. *int_val = (uint8_t)(10 + (character - 'A'));
  114. return 0;
  115. }
  116. if (character >= '0' && character <= '9') {
  117. *int_val = (uint8_t)(character - '0');
  118. return 0;
  119. }
  120. return AWS_OP_ERR;
  121. }
  122. int aws_hex_compute_decoded_len(size_t to_decode_len, size_t *decoded_len) {
  123. AWS_ASSERT(decoded_len);
  124. size_t temp = (to_decode_len + 1);
  125. if (AWS_UNLIKELY(temp < to_decode_len)) {
  126. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  127. }
  128. *decoded_len = temp >> 1;
  129. return AWS_OP_SUCCESS;
  130. }
  131. int aws_hex_decode(const struct aws_byte_cursor *AWS_RESTRICT to_decode, struct aws_byte_buf *AWS_RESTRICT output) {
  132. AWS_PRECONDITION(aws_byte_cursor_is_valid(to_decode));
  133. AWS_PRECONDITION(aws_byte_buf_is_valid(output));
  134. size_t decoded_length = 0;
  135. if (AWS_UNLIKELY(aws_hex_compute_decoded_len(to_decode->len, &decoded_length))) {
  136. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  137. }
  138. if (AWS_UNLIKELY(output->capacity < decoded_length)) {
  139. return aws_raise_error(AWS_ERROR_SHORT_BUFFER);
  140. }
  141. size_t written = 0;
  142. size_t i = 0;
  143. uint8_t high_value = 0;
  144. uint8_t low_value = 0;
  145. /* if the buffer isn't even, prepend a 0 to the buffer. */
  146. if (AWS_UNLIKELY(to_decode->len & 0x01)) {
  147. i = 1;
  148. if (s_hex_decode_char_to_int(to_decode->ptr[0], &low_value)) {
  149. return aws_raise_error(AWS_ERROR_INVALID_HEX_STR);
  150. }
  151. output->buffer[written++] = low_value;
  152. }
  153. for (; i < to_decode->len; i += 2) {
  154. if (AWS_UNLIKELY(
  155. s_hex_decode_char_to_int(to_decode->ptr[i], &high_value) ||
  156. s_hex_decode_char_to_int(to_decode->ptr[i + 1], &low_value))) {
  157. return aws_raise_error(AWS_ERROR_INVALID_HEX_STR);
  158. }
  159. uint8_t value = (uint8_t)(high_value << 4);
  160. value |= low_value;
  161. output->buffer[written++] = value;
  162. }
  163. output->len = decoded_length;
  164. return AWS_OP_SUCCESS;
  165. }
  166. int aws_base64_compute_encoded_len(size_t to_encode_len, size_t *encoded_len) {
  167. AWS_ASSERT(encoded_len);
  168. size_t tmp = to_encode_len + 2;
  169. if (AWS_UNLIKELY(tmp < to_encode_len)) {
  170. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  171. }
  172. tmp /= 3;
  173. size_t overflow_check = tmp;
  174. tmp = 4 * tmp + 1; /* plus one for the NULL terminator */
  175. if (AWS_UNLIKELY(tmp < overflow_check)) {
  176. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  177. }
  178. *encoded_len = tmp;
  179. return AWS_OP_SUCCESS;
  180. }
  181. int aws_base64_compute_decoded_len(const struct aws_byte_cursor *AWS_RESTRICT to_decode, size_t *decoded_len) {
  182. AWS_ASSERT(to_decode);
  183. AWS_ASSERT(decoded_len);
  184. const size_t len = to_decode->len;
  185. const uint8_t *input = to_decode->ptr;
  186. if (len == 0) {
  187. *decoded_len = 0;
  188. return AWS_OP_SUCCESS;
  189. }
  190. if (AWS_UNLIKELY(len & 0x03)) {
  191. return aws_raise_error(AWS_ERROR_INVALID_BASE64_STR);
  192. }
  193. size_t tmp = len * 3;
  194. if (AWS_UNLIKELY(tmp < len)) {
  195. return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
  196. }
  197. size_t padding = 0;
  198. if (len >= 2 && input[len - 1] == '=' && input[len - 2] == '=') { /*last two chars are = */
  199. padding = 2;
  200. } else if (input[len - 1] == '=') { /*last char is = */
  201. padding = 1;
  202. }
  203. *decoded_len = (tmp / 4 - padding);
  204. return AWS_OP_SUCCESS;
  205. }
  206. int aws_base64_encode(const struct aws_byte_cursor *AWS_RESTRICT to_encode, struct aws_byte_buf *AWS_RESTRICT output) {
  207. AWS_ASSERT(to_encode->ptr);
  208. AWS_ASSERT(output->buffer);
  209. size_t terminated_length = 0;
  210. size_t encoded_length = 0;
  211. if (AWS_UNLIKELY(aws_base64_compute_encoded_len(to_encode->len, &terminated_length))) {
  212. return AWS_OP_ERR;
  213. }
  214. size_t needed_capacity = 0;
  215. if (AWS_UNLIKELY(aws_add_size_checked(output->len, terminated_length, &needed_capacity))) {
  216. return AWS_OP_ERR;
  217. }
  218. if (AWS_UNLIKELY(output->capacity < needed_capacity)) {
  219. return aws_raise_error(AWS_ERROR_SHORT_BUFFER);
  220. }
  221. /*
  222. * For convenience to standard C functions expecting a null-terminated
  223. * string, the output is terminated. As the encoding itself can be used in
  224. * various ways, however, its length should never account for that byte.
  225. */
  226. encoded_length = (terminated_length - 1);
  227. if (aws_common_private_has_avx2()) {
  228. aws_common_private_base64_encode_sse41(to_encode->ptr, output->buffer + output->len, to_encode->len);
  229. output->buffer[output->len + encoded_length] = 0;
  230. output->len += encoded_length;
  231. return AWS_OP_SUCCESS;
  232. }
  233. size_t buffer_length = to_encode->len;
  234. size_t block_count = (buffer_length + 2) / 3;
  235. size_t remainder_count = (buffer_length % 3);
  236. size_t str_index = output->len;
  237. for (size_t i = 0; i < to_encode->len; i += 3) {
  238. uint32_t block = to_encode->ptr[i];
  239. block <<= 8;
  240. if (AWS_LIKELY(i + 1 < buffer_length)) {
  241. block = block | to_encode->ptr[i + 1];
  242. }
  243. block <<= 8;
  244. if (AWS_LIKELY(i + 2 < to_encode->len)) {
  245. block = block | to_encode->ptr[i + 2];
  246. }
  247. output->buffer[str_index++] = BASE64_ENCODING_TABLE[(block >> 18) & 0x3F];
  248. output->buffer[str_index++] = BASE64_ENCODING_TABLE[(block >> 12) & 0x3F];
  249. output->buffer[str_index++] = BASE64_ENCODING_TABLE[(block >> 6) & 0x3F];
  250. output->buffer[str_index++] = BASE64_ENCODING_TABLE[block & 0x3F];
  251. }
  252. if (remainder_count > 0) {
  253. output->buffer[output->len + block_count * 4 - 1] = '=';
  254. if (remainder_count == 1) {
  255. output->buffer[output->len + block_count * 4 - 2] = '=';
  256. }
  257. }
  258. /* it's a string add the null terminator. */
  259. output->buffer[output->len + encoded_length] = 0;
  260. output->len += encoded_length;
  261. return AWS_OP_SUCCESS;
  262. }
  263. static inline int s_base64_get_decoded_value(unsigned char to_decode, uint8_t *value, int8_t allow_sentinel) {
  264. uint8_t decode_value = BASE64_DECODING_TABLE[(size_t)to_decode];
  265. if (decode_value != 0xDD && (decode_value != BASE64_SENTINEL_VALUE || allow_sentinel)) {
  266. *value = decode_value;
  267. return AWS_OP_SUCCESS;
  268. }
  269. return AWS_OP_ERR;
  270. }
  271. int aws_base64_decode(const struct aws_byte_cursor *AWS_RESTRICT to_decode, struct aws_byte_buf *AWS_RESTRICT output) {
  272. size_t decoded_length = 0;
  273. if (AWS_UNLIKELY(aws_base64_compute_decoded_len(to_decode, &decoded_length))) {
  274. return AWS_OP_ERR;
  275. }
  276. if (output->capacity < decoded_length) {
  277. return aws_raise_error(AWS_ERROR_SHORT_BUFFER);
  278. }
  279. if (aws_common_private_has_avx2()) {
  280. size_t result = aws_common_private_base64_decode_sse41(to_decode->ptr, output->buffer, to_decode->len);
  281. if (result == -1) {
  282. return aws_raise_error(AWS_ERROR_INVALID_BASE64_STR);
  283. }
  284. output->len = result;
  285. return AWS_OP_SUCCESS;
  286. }
  287. int64_t block_count = (int64_t)to_decode->len / 4;
  288. size_t string_index = 0;
  289. uint8_t value1 = 0, value2 = 0, value3 = 0, value4 = 0;
  290. int64_t buffer_index = 0;
  291. for (int64_t i = 0; i < block_count - 1; ++i) {
  292. if (AWS_UNLIKELY(
  293. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value1, 0) ||
  294. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value2, 0) ||
  295. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value3, 0) ||
  296. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value4, 0))) {
  297. return aws_raise_error(AWS_ERROR_INVALID_BASE64_STR);
  298. }
  299. buffer_index = i * 3;
  300. output->buffer[buffer_index++] = (uint8_t)((value1 << 2) | ((value2 >> 4) & 0x03));
  301. output->buffer[buffer_index++] = (uint8_t)(((value2 << 4) & 0xF0) | ((value3 >> 2) & 0x0F));
  302. output->buffer[buffer_index] = (uint8_t)((value3 & 0x03) << 6 | value4);
  303. }
  304. buffer_index = (block_count - 1) * 3;
  305. if (buffer_index >= 0) {
  306. if (s_base64_get_decoded_value(to_decode->ptr[string_index++], &value1, 0) ||
  307. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value2, 0) ||
  308. s_base64_get_decoded_value(to_decode->ptr[string_index++], &value3, 1) ||
  309. s_base64_get_decoded_value(to_decode->ptr[string_index], &value4, 1)) {
  310. return aws_raise_error(AWS_ERROR_INVALID_BASE64_STR);
  311. }
  312. output->buffer[buffer_index++] = (uint8_t)((value1 << 2) | ((value2 >> 4) & 0x03));
  313. if (value3 != BASE64_SENTINEL_VALUE) {
  314. output->buffer[buffer_index++] = (uint8_t)(((value2 << 4) & 0xF0) | ((value3 >> 2) & 0x0F));
  315. if (value4 != BASE64_SENTINEL_VALUE) {
  316. output->buffer[buffer_index] = (uint8_t)((value3 & 0x03) << 6 | value4);
  317. }
  318. }
  319. }
  320. output->len = decoded_length;
  321. return AWS_OP_SUCCESS;
  322. }
  323. struct aws_utf8_decoder {
  324. struct aws_allocator *alloc;
  325. /* Value of current codepoint, updated as we read each byte */
  326. uint32_t codepoint;
  327. /* Minimum value that current codepoint is allowed to end up with
  328. * (i.e. text cannot use 2 bytes to encode what would have fit in 1 byte) */
  329. uint32_t min;
  330. /* Number of bytes remaining the current codepoint */
  331. uint8_t remaining;
  332. /* Custom callback */
  333. int (*on_codepoint)(uint32_t codepoint, void *user_data);
  334. /* user_data for on_codepoint */
  335. void *user_data;
  336. };
  337. struct aws_utf8_decoder *aws_utf8_decoder_new(
  338. struct aws_allocator *allocator,
  339. const struct aws_utf8_decoder_options *options) {
  340. struct aws_utf8_decoder *decoder = aws_mem_calloc(allocator, 1, sizeof(struct aws_utf8_decoder));
  341. decoder->alloc = allocator;
  342. if (options) {
  343. decoder->on_codepoint = options->on_codepoint;
  344. decoder->user_data = options->user_data;
  345. }
  346. return decoder;
  347. }
  348. void aws_utf8_decoder_destroy(struct aws_utf8_decoder *decoder) {
  349. if (decoder) {
  350. aws_mem_release(decoder->alloc, decoder);
  351. }
  352. }
  353. void aws_utf8_decoder_reset(struct aws_utf8_decoder *decoder) {
  354. decoder->codepoint = 0;
  355. decoder->min = 0;
  356. decoder->remaining = 0;
  357. }
  358. /* Why yes, this could be optimized. */
  359. int aws_utf8_decoder_update(struct aws_utf8_decoder *decoder, struct aws_byte_cursor bytes) {
  360. /* We're respecting RFC-3629, which uses 1 to 4 byte sequences (never 5 or 6) */
  361. for (size_t i = 0; i < bytes.len; ++i) {
  362. uint8_t byte = bytes.ptr[i];
  363. if (decoder->remaining == 0) {
  364. /* Check first byte of the codepoint to determine how many more bytes remain */
  365. if ((byte & 0x80) == 0x00) {
  366. /* 1 byte codepoints start with 0xxxxxxx */
  367. decoder->remaining = 0;
  368. decoder->codepoint = byte;
  369. decoder->min = 0;
  370. } else if ((byte & 0xE0) == 0xC0) {
  371. /* 2 byte codepoints start with 110xxxxx */
  372. decoder->remaining = 1;
  373. decoder->codepoint = byte & 0x1F;
  374. decoder->min = 0x80;
  375. } else if ((byte & 0xF0) == 0xE0) {
  376. /* 3 byte codepoints start with 1110xxxx */
  377. decoder->remaining = 2;
  378. decoder->codepoint = byte & 0x0F;
  379. decoder->min = 0x800;
  380. } else if ((byte & 0xF8) == 0xF0) {
  381. /* 4 byte codepoints start with 11110xxx */
  382. decoder->remaining = 3;
  383. decoder->codepoint = byte & 0x07;
  384. decoder->min = 0x10000;
  385. } else {
  386. return aws_raise_error(AWS_ERROR_INVALID_UTF8);
  387. }
  388. } else {
  389. /* This is not the first byte of a codepoint.
  390. * Ensure it starts with 10xxxxxx*/
  391. if ((byte & 0xC0) != 0x80) {
  392. return aws_raise_error(AWS_ERROR_INVALID_UTF8);
  393. }
  394. /* Insert the 6 newly decoded bits:
  395. * shifting left anything we've already decoded, and insert the new bits to the right */
  396. decoder->codepoint = (decoder->codepoint << 6) | (byte & 0x3F);
  397. /* If we've decoded the whole codepoint, check it for validity
  398. * (don't need to do these particular checks on 1 byte codepoints) */
  399. if (--decoder->remaining == 0) {
  400. /* Check that it's not "overlong" (encoded using more bytes than necessary) */
  401. if (decoder->codepoint < decoder->min) {
  402. return aws_raise_error(AWS_ERROR_INVALID_UTF8);
  403. }
  404. /* UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF,
  405. * which are reserved for use with the UTF-16 encoding form (as
  406. * surrogate pairs) and do not directly represent characters */
  407. if (decoder->codepoint >= 0xD800 && decoder->codepoint <= 0xDFFF) {
  408. return aws_raise_error(AWS_ERROR_INVALID_UTF8);
  409. }
  410. }
  411. }
  412. /* Invoke user's on_codepoint callback */
  413. if (decoder->on_codepoint && decoder->remaining == 0) {
  414. if (decoder->on_codepoint(decoder->codepoint, decoder->user_data)) {
  415. return AWS_OP_ERR;
  416. }
  417. }
  418. }
  419. return AWS_OP_SUCCESS;
  420. }
  421. int aws_utf8_decoder_finalize(struct aws_utf8_decoder *decoder) {
  422. bool valid = decoder->remaining == 0;
  423. aws_utf8_decoder_reset(decoder);
  424. if (AWS_LIKELY(valid)) {
  425. return AWS_OP_SUCCESS;
  426. }
  427. return aws_raise_error(AWS_ERROR_INVALID_UTF8);
  428. }
  429. int aws_decode_utf8(struct aws_byte_cursor bytes, const struct aws_utf8_decoder_options *options) {
  430. struct aws_utf8_decoder decoder = {
  431. .on_codepoint = options ? options->on_codepoint : NULL,
  432. .user_data = options ? options->user_data : NULL,
  433. };
  434. if (aws_utf8_decoder_update(&decoder, bytes)) {
  435. return AWS_OP_ERR;
  436. }
  437. if (aws_utf8_decoder_finalize(&decoder)) {
  438. return AWS_OP_ERR;
  439. }
  440. return AWS_OP_SUCCESS;
  441. }