u8-mbtoucr.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
/* Look at first character in UTF-8 string, returning an error code.
   Copyright (C) 1999-2002, 2006-2007, 2009-2024 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. int
  18. u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n)
  19. {
  20. uint8_t c = *s;
  21. if (c < 0x80)
  22. {
  23. *puc = c;
  24. return 1;
  25. }
  26. else if (c >= 0xc2)
  27. {
  28. if (c < 0xe0)
  29. {
  30. if (n >= 2)
  31. {
  32. if ((s[1] ^ 0x80) < 0x40)
  33. {
  34. *puc = ((unsigned int) (c & 0x1f) << 6)
  35. | (unsigned int) (s[1] ^ 0x80);
  36. return 2;
  37. }
  38. /* invalid multibyte character */
  39. }
  40. else
  41. {
  42. /* incomplete multibyte character */
  43. *puc = 0xfffd;
  44. return -2;
  45. }
  46. }
  47. else if (c < 0xf0)
  48. {
  49. if (n >= 2)
  50. {
  51. if ((s[1] ^ 0x80) < 0x40
  52. && (c >= 0xe1 || s[1] >= 0xa0)
  53. && (c != 0xed || s[1] < 0xa0))
  54. {
  55. if (n >= 3)
  56. {
  57. if ((s[2] ^ 0x80) < 0x40)
  58. {
  59. *puc = ((unsigned int) (c & 0x0f) << 12)
  60. | ((unsigned int) (s[1] ^ 0x80) << 6)
  61. | (unsigned int) (s[2] ^ 0x80);
  62. return 3;
  63. }
  64. /* invalid multibyte character */
  65. }
  66. else
  67. {
  68. /* incomplete multibyte character */
  69. *puc = 0xfffd;
  70. return -2;
  71. }
  72. }
  73. /* invalid multibyte character */
  74. }
  75. else
  76. {
  77. /* incomplete multibyte character */
  78. *puc = 0xfffd;
  79. return -2;
  80. }
  81. }
  82. else if (c <= 0xf4)
  83. {
  84. if (n >= 2)
  85. {
  86. if ((s[1] ^ 0x80) < 0x40
  87. && (c >= 0xf1 || s[1] >= 0x90)
  88. && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
  89. {
  90. if (n >= 3)
  91. {
  92. if ((s[2] ^ 0x80) < 0x40)
  93. {
  94. if (n >= 4)
  95. {
  96. if ((s[3] ^ 0x80) < 0x40)
  97. {
  98. *puc = ((unsigned int) (c & 0x07) << 18)
  99. | ((unsigned int) (s[1] ^ 0x80) << 12)
  100. | ((unsigned int) (s[2] ^ 0x80) << 6)
  101. | (unsigned int) (s[3] ^ 0x80);
  102. return 4;
  103. }
  104. /* invalid multibyte character */
  105. }
  106. else
  107. {
  108. /* incomplete multibyte character */
  109. *puc = 0xfffd;
  110. return -2;
  111. }
  112. }
  113. /* invalid multibyte character */
  114. }
  115. else
  116. {
  117. /* incomplete multibyte character */
  118. *puc = 0xfffd;
  119. return -2;
  120. }
  121. }
  122. /* invalid multibyte character */
  123. }
  124. else
  125. {
  126. /* incomplete multibyte character */
  127. *puc = 0xfffd;
  128. return -2;
  129. }
  130. }
  131. }
  132. /* invalid multibyte character */
  133. *puc = 0xfffd;
  134. return -1;
  135. }