u8-mbtoucr.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
/* Look at first character in UTF-8 string, returning an error code.
   Copyright (C) 1999-2002, 2006-2007, 2009-2020 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. int
  18. u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n)
  19. {
  20. uint8_t c = *s;
  21. if (c < 0x80)
  22. {
  23. *puc = c;
  24. return 1;
  25. }
  26. else if (c >= 0xc2)
  27. {
  28. if (c < 0xe0)
  29. {
  30. if (n >= 2)
  31. {
  32. if ((s[1] ^ 0x80) < 0x40)
  33. {
  34. *puc = ((unsigned int) (c & 0x1f) << 6)
  35. | (unsigned int) (s[1] ^ 0x80);
  36. return 2;
  37. }
  38. /* invalid multibyte character */
  39. }
  40. else
  41. {
  42. /* incomplete multibyte character */
  43. *puc = 0xfffd;
  44. return -2;
  45. }
  46. }
  47. else if (c < 0xf0)
  48. {
  49. if (n >= 2)
  50. {
  51. if ((s[1] ^ 0x80) < 0x40
  52. && (c >= 0xe1 || s[1] >= 0xa0)
  53. && (c != 0xed || s[1] < 0xa0))
  54. {
  55. if (n >= 3)
  56. {
  57. if ((s[2] ^ 0x80) < 0x40)
  58. {
  59. *puc = ((unsigned int) (c & 0x0f) << 12)
  60. | ((unsigned int) (s[1] ^ 0x80) << 6)
  61. | (unsigned int) (s[2] ^ 0x80);
  62. return 3;
  63. }
  64. /* invalid multibyte character */
  65. }
  66. else
  67. {
  68. /* incomplete multibyte character */
  69. *puc = 0xfffd;
  70. return -2;
  71. }
  72. }
  73. /* invalid multibyte character */
  74. }
  75. else
  76. {
  77. /* incomplete multibyte character */
  78. *puc = 0xfffd;
  79. return -2;
  80. }
  81. }
  82. else if (c < 0xf8)
  83. {
  84. if (n >= 2)
  85. {
  86. if ((s[1] ^ 0x80) < 0x40
  87. && (c >= 0xf1 || s[1] >= 0x90)
  88. && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
  89. {
  90. if (n >= 3)
  91. {
  92. if ((s[2] ^ 0x80) < 0x40)
  93. {
  94. if (n >= 4)
  95. {
  96. if ((s[3] ^ 0x80) < 0x40)
  97. {
  98. *puc = ((unsigned int) (c & 0x07) << 18)
  99. | ((unsigned int) (s[1] ^ 0x80) << 12)
  100. | ((unsigned int) (s[2] ^ 0x80) << 6)
  101. | (unsigned int) (s[3] ^ 0x80);
  102. return 4;
  103. }
  104. /* invalid multibyte character */
  105. }
  106. else
  107. {
  108. /* incomplete multibyte character */
  109. *puc = 0xfffd;
  110. return -2;
  111. }
  112. }
  113. /* invalid multibyte character */
  114. }
  115. else
  116. {
  117. /* incomplete multibyte character */
  118. *puc = 0xfffd;
  119. return -2;
  120. }
  121. }
  122. /* invalid multibyte character */
  123. }
  124. else
  125. {
  126. /* incomplete multibyte character */
  127. *puc = 0xfffd;
  128. return -2;
  129. }
  130. }
  131. }
  132. /* invalid multibyte character */
  133. *puc = 0xfffd;
  134. return -1;
  135. }