wchar_helper.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. /*
  2. * wchar_t helpers
  3. */
/* Fixed-width character types used by the helpers in this file:
   a 16-bit unit and a 32-bit unit, both unsigned. */
typedef uint16_t cffi_char16_t;
typedef uint32_t cffi_char32_t;
  6. #if Py_UNICODE_SIZE == 2
  7. /* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
  8. wchar_t values greater than 65535 into two-unicode-characters surrogates.
  9. But even the Python 2.7 version doesn't detect wchar_t values that are
  10. out of range(1114112), and just returns nonsense.
  11. From cffi 1.11 we can't use it anyway, because we need a version
  12. with char32_t input types.
  13. */
  14. static PyObject *
  15. _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
  16. {
  17. PyObject *unicode;
  18. Py_ssize_t i;
  19. Py_ssize_t alloc;
  20. const cffi_char32_t *orig_w;
  21. alloc = size;
  22. orig_w = w;
  23. for (i = size; i > 0; i--) {
  24. if (*w > 0xFFFF)
  25. alloc++;
  26. w++;
  27. }
  28. w = orig_w;
  29. unicode = PyUnicode_FromUnicode(NULL, alloc);
  30. if (!unicode)
  31. return NULL;
  32. /* Copy the wchar_t data into the new object */
  33. {
  34. Py_UNICODE *u;
  35. u = PyUnicode_AS_UNICODE(unicode);
  36. for (i = size; i > 0; i--) {
  37. if (*w > 0xFFFF) {
  38. cffi_char32_t ordinal;
  39. if (*w > 0x10FFFF) {
  40. PyErr_Format(PyExc_ValueError,
  41. "char32_t out of range for "
  42. "conversion to unicode: 0x%x", (int)*w);
  43. Py_DECREF(unicode);
  44. return NULL;
  45. }
  46. ordinal = *w++;
  47. ordinal -= 0x10000;
  48. *u++ = 0xD800 | (ordinal >> 10);
  49. *u++ = 0xDC00 | (ordinal & 0x3FF);
  50. }
  51. else
  52. *u++ = *w++;
  53. }
  54. }
  55. return unicode;
  56. }
  57. static PyObject *
  58. _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
  59. {
  60. return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
  61. }
  62. #else /* Py_UNICODE_SIZE == 4 */
  63. static PyObject *
  64. _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
  65. {
  66. return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
  67. }
  68. static PyObject *
  69. _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
  70. {
  71. /* 'size' is the length of the 'w' array */
  72. PyObject *result = PyUnicode_FromUnicode(NULL, size);
  73. if (result != NULL) {
  74. Py_UNICODE *u_base = PyUnicode_AS_UNICODE(result);
  75. Py_UNICODE *u = u_base;
  76. if (size == 1) { /* performance only */
  77. *u = (cffi_char32_t)*w;
  78. }
  79. else {
  80. while (size > 0) {
  81. cffi_char32_t ch = *w++;
  82. size--;
  83. if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
  84. cffi_char32_t ch2 = *w;
  85. if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  86. ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
  87. w++;
  88. size--;
  89. }
  90. }
  91. *u++ = ch;
  92. }
  93. if (PyUnicode_Resize(&result, u - u_base) < 0) {
  94. Py_DECREF(result);
  95. return NULL;
  96. }
  97. }
  98. }
  99. return result;
  100. }
  101. #endif
  102. #define IS_SURROGATE(u) (0xD800 <= (u)[0] && (u)[0] <= 0xDBFF && \
  103. 0xDC00 <= (u)[1] && (u)[1] <= 0xDFFF)
  104. #define AS_SURROGATE(u) (0x10000 + (((u)[0] - 0xD800) << 10) + \
  105. ((u)[1] - 0xDC00))
  106. static int
  107. _my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
  108. char *err_got)
  109. {
  110. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  111. if (PyUnicode_GET_SIZE(unicode) != 1) {
  112. sprintf(err_got, "unicode string of length %zd",
  113. PyUnicode_GET_SIZE(unicode));
  114. return -1;
  115. }
  116. #if Py_UNICODE_SIZE == 4
  117. if (((unsigned int)u[0]) > 0xFFFF)
  118. {
  119. sprintf(err_got, "larger-than-0xFFFF character");
  120. return -1;
  121. }
  122. #endif
  123. *result = (cffi_char16_t)u[0];
  124. return 0;
  125. }
  126. static int
  127. _my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
  128. char *err_got)
  129. {
  130. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  131. if (PyUnicode_GET_SIZE(unicode) == 1) {
  132. *result = (cffi_char32_t)u[0];
  133. return 0;
  134. }
  135. #if Py_UNICODE_SIZE == 2
  136. if (PyUnicode_GET_SIZE(unicode) == 2 && IS_SURROGATE(u)) {
  137. *result = AS_SURROGATE(u);
  138. return 0;
  139. }
  140. #endif
  141. sprintf(err_got, "unicode string of length %zd",
  142. PyUnicode_GET_SIZE(unicode));
  143. return -1;
  144. }
  145. static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
  146. {
  147. Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
  148. Py_ssize_t result = length;
  149. #if Py_UNICODE_SIZE == 4
  150. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  151. Py_ssize_t i;
  152. for (i=0; i<length; i++) {
  153. if (u[i] > 0xFFFF)
  154. result++;
  155. }
  156. #endif
  157. return result;
  158. }
  159. static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
  160. {
  161. Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
  162. Py_ssize_t result = length;
  163. #if Py_UNICODE_SIZE == 2
  164. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  165. Py_ssize_t i;
  166. for (i=0; i<length-1; i++) {
  167. if (IS_SURROGATE(u+i))
  168. result--;
  169. }
  170. #endif
  171. return result;
  172. }
  173. static int _my_PyUnicode_AsChar16(PyObject *unicode,
  174. cffi_char16_t *result,
  175. Py_ssize_t resultlen)
  176. {
  177. Py_ssize_t len = PyUnicode_GET_SIZE(unicode);
  178. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  179. Py_ssize_t i;
  180. for (i=0; i<len; i++) {
  181. #if Py_UNICODE_SIZE == 2
  182. cffi_char16_t ordinal = u[i];
  183. #else
  184. cffi_char32_t ordinal = u[i];
  185. if (ordinal > 0xFFFF) {
  186. if (ordinal > 0x10FFFF) {
  187. PyErr_Format(PyExc_ValueError,
  188. "unicode character out of range for "
  189. "conversion to char16_t: 0x%x", (int)ordinal);
  190. return -1;
  191. }
  192. ordinal -= 0x10000;
  193. *result++ = 0xD800 | (ordinal >> 10);
  194. *result++ = 0xDC00 | (ordinal & 0x3FF);
  195. continue;
  196. }
  197. #endif
  198. *result++ = ordinal;
  199. }
  200. return 0;
  201. }
  202. static int _my_PyUnicode_AsChar32(PyObject *unicode,
  203. cffi_char32_t *result,
  204. Py_ssize_t resultlen)
  205. {
  206. Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
  207. Py_ssize_t i;
  208. for (i=0; i<resultlen; i++) {
  209. cffi_char32_t ordinal = *u;
  210. #if Py_UNICODE_SIZE == 2
  211. if (IS_SURROGATE(u)) {
  212. ordinal = AS_SURROGATE(u);
  213. u++;
  214. }
  215. #endif
  216. result[i] = ordinal;
  217. u++;
  218. }
  219. return 0;
  220. }