find_max_char.h 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. /* Finding the optimal width of unicode characters in a buffer */
  2. #if !STRINGLIB_IS_UNICODE
  3. # error "find_max_char.h is specific to Unicode"
  4. #endif
  5. /* Mask to quickly check whether a C 'size_t' contains a
  6. non-ASCII, UTF8-encoded char. */
  7. #if (SIZEOF_SIZE_T == 8)
  8. # define UCS1_ASCII_CHAR_MASK 0x8080808080808080ULL
  9. #elif (SIZEOF_SIZE_T == 4)
  10. # define UCS1_ASCII_CHAR_MASK 0x80808080U
  11. #else
  12. # error C 'size_t' size should be either 4 or 8!
  13. #endif
  14. #if STRINGLIB_SIZEOF_CHAR == 1
  15. Py_LOCAL_INLINE(Py_UCS4)
  16. STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
  17. {
  18. const unsigned char *p = (const unsigned char *) begin;
  19. while (p < end) {
  20. if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
  21. /* Help register allocation */
  22. const unsigned char *_p = p;
  23. while (_p + SIZEOF_SIZE_T <= end) {
  24. size_t value = *(const size_t *) _p;
  25. if (value & UCS1_ASCII_CHAR_MASK)
  26. return 255;
  27. _p += SIZEOF_SIZE_T;
  28. }
  29. p = _p;
  30. if (p == end)
  31. break;
  32. }
  33. if (*p++ & 0x80)
  34. return 255;
  35. }
  36. return 127;
  37. }
  38. #undef ASCII_CHAR_MASK
  39. #else /* STRINGLIB_SIZEOF_CHAR == 1 */
  40. #define MASK_ASCII 0xFFFFFF80
  41. #define MASK_UCS1 0xFFFFFF00
  42. #define MASK_UCS2 0xFFFF0000
  43. #define MAX_CHAR_ASCII 0x7f
  44. #define MAX_CHAR_UCS1 0xff
  45. #define MAX_CHAR_UCS2 0xffff
  46. #define MAX_CHAR_UCS4 0x10ffff
  47. Py_LOCAL_INLINE(Py_UCS4)
  48. STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
  49. {
  50. #if STRINGLIB_SIZEOF_CHAR == 2
  51. const Py_UCS4 mask_limit = MASK_UCS1;
  52. const Py_UCS4 max_char_limit = MAX_CHAR_UCS2;
  53. #elif STRINGLIB_SIZEOF_CHAR == 4
  54. const Py_UCS4 mask_limit = MASK_UCS2;
  55. const Py_UCS4 max_char_limit = MAX_CHAR_UCS4;
  56. #else
  57. #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4)
  58. #endif
  59. Py_UCS4 mask;
  60. Py_ssize_t n = end - begin;
  61. const STRINGLIB_CHAR *p = begin;
  62. const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
  63. Py_UCS4 max_char;
  64. max_char = MAX_CHAR_ASCII;
  65. mask = MASK_ASCII;
  66. while (p < unrolled_end) {
  67. STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
  68. if (bits & mask) {
  69. if (mask == mask_limit) {
  70. /* Limit reached */
  71. return max_char_limit;
  72. }
  73. if (mask == MASK_ASCII) {
  74. max_char = MAX_CHAR_UCS1;
  75. mask = MASK_UCS1;
  76. }
  77. else {
  78. /* mask can't be MASK_UCS2 because of mask_limit above */
  79. assert(mask == MASK_UCS1);
  80. max_char = MAX_CHAR_UCS2;
  81. mask = MASK_UCS2;
  82. }
  83. /* We check the new mask on the same chars in the next iteration */
  84. continue;
  85. }
  86. p += 4;
  87. }
  88. while (p < end) {
  89. if (p[0] & mask) {
  90. if (mask == mask_limit) {
  91. /* Limit reached */
  92. return max_char_limit;
  93. }
  94. if (mask == MASK_ASCII) {
  95. max_char = MAX_CHAR_UCS1;
  96. mask = MASK_UCS1;
  97. }
  98. else {
  99. /* mask can't be MASK_UCS2 because of mask_limit above */
  100. assert(mask == MASK_UCS1);
  101. max_char = MAX_CHAR_UCS2;
  102. mask = MASK_UCS2;
  103. }
  104. /* We check the new mask on the same chars in the next iteration */
  105. continue;
  106. }
  107. p++;
  108. }
  109. return max_char;
  110. }
  111. #undef MASK_ASCII
  112. #undef MASK_UCS1
  113. #undef MASK_UCS2
  114. #undef MAX_CHAR_ASCII
  115. #undef MAX_CHAR_UCS1
  116. #undef MAX_CHAR_UCS2
  117. #undef MAX_CHAR_UCS4
  118. #endif /* STRINGLIB_SIZEOF_CHAR == 1 */