wchar_helper_3.h 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. /*
  2. * wchar_t helpers, version CPython >= 3.3.
  3. *
  4. * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
  5. * platforms, even ones with wchar_t limited to 2 bytes. As such,
  6. * this code here works from the outside like wchar_helper.h in the
  7. * case Py_UNICODE_SIZE == 4, but the implementation is very different.
  8. */
  /* 16-bit unit type for char16_t data; the Char16 helpers in this file
     treat arrays of it as UTF-16 and combine/split surrogate pairs. */
  9. typedef uint16_t cffi_char16_t;
  /* 32-bit unit type for char32_t data; the Char32 helpers in this file
     pass arrays of it to CPython as 4-byte-kind (Py_UCS4) data. */
  10. typedef uint32_t cffi_char32_t;
  11. static PyObject *
  12. _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
  13. {
  14. return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
  15. }
  16. static PyObject *
  17. _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
  18. {
  19. /* are there any surrogate pairs, and if so, how many? */
  20. Py_ssize_t i, count_surrogates = 0;
  21. for (i = 0; i < size - 1; i++) {
  22. if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
  23. 0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
  24. count_surrogates++;
  25. }
  26. if (count_surrogates == 0) {
  27. /* no, fast path */
  28. return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
  29. }
  30. else
  31. {
  32. PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
  33. Py_UCS4 *data;
  34. assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
  35. data = PyUnicode_4BYTE_DATA(result);
  36. for (i = 0; i < size; i++)
  37. {
  38. cffi_char32_t ch = w[i];
  39. if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
  40. cffi_char32_t ch2 = w[i + 1];
  41. if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  42. ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
  43. i++;
  44. }
  45. }
  46. *data++ = ch;
  47. }
  48. return result;
  49. }
  50. }
  51. static int
  52. _my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
  53. char *err_got)
  54. {
  55. cffi_char32_t ch;
  56. if (PyUnicode_GET_LENGTH(unicode) != 1) {
  57. sprintf(err_got, "unicode string of length %zd",
  58. PyUnicode_GET_LENGTH(unicode));
  59. return -1;
  60. }
  61. ch = PyUnicode_READ_CHAR(unicode, 0);
  62. if (ch > 0xFFFF)
  63. {
  64. sprintf(err_got, "larger-than-0xFFFF character");
  65. return -1;
  66. }
  67. *result = (cffi_char16_t)ch;
  68. return 0;
  69. }
  70. static int
  71. _my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
  72. char *err_got)
  73. {
  74. if (PyUnicode_GET_LENGTH(unicode) != 1) {
  75. sprintf(err_got, "unicode string of length %zd",
  76. PyUnicode_GET_LENGTH(unicode));
  77. return -1;
  78. }
  79. *result = PyUnicode_READ_CHAR(unicode, 0);
  80. return 0;
  81. }
  82. static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
  83. {
  84. Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
  85. Py_ssize_t result = length;
  86. unsigned int kind = PyUnicode_KIND(unicode);
  87. if (kind == PyUnicode_4BYTE_KIND)
  88. {
  89. Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
  90. Py_ssize_t i;
  91. for (i = 0; i < length; i++) {
  92. if (data[i] > 0xFFFF)
  93. result++;
  94. }
  95. }
  96. return result;
  97. }
  98. static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
  99. {
  100. return PyUnicode_GET_LENGTH(unicode);
  101. }
  102. static int _my_PyUnicode_AsChar16(PyObject *unicode,
  103. cffi_char16_t *result,
  104. Py_ssize_t resultlen)
  105. {
  106. Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
  107. unsigned int kind = PyUnicode_KIND(unicode);
  108. void *data = PyUnicode_DATA(unicode);
  109. Py_ssize_t i;
  110. for (i = 0; i < len; i++) {
  111. cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
  112. if (ordinal > 0xFFFF) {
  113. if (ordinal > 0x10FFFF) {
  114. PyErr_Format(PyExc_ValueError,
  115. "unicode character out of range for "
  116. "conversion to char16_t: 0x%x", (int)ordinal);
  117. return -1;
  118. }
  119. ordinal -= 0x10000;
  120. *result++ = 0xD800 | (ordinal >> 10);
  121. *result++ = 0xDC00 | (ordinal & 0x3FF);
  122. }
  123. else
  124. *result++ = ordinal;
  125. }
  126. return 0;
  127. }
  128. static int _my_PyUnicode_AsChar32(PyObject *unicode,
  129. cffi_char32_t *result,
  130. Py_ssize_t resultlen)
  131. {
  132. if (PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result, resultlen, 0) == NULL)
  133. return -1;
  134. return 0;
  135. }