u8-check.c 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. /* Check UTF-8 string.
  2. Copyright (C) 2002, 2006-2007, 2009-2024 Free Software Foundation, Inc.
  3. Written by Bruno Haible <bruno@clisp.org>, 2002.
  4. This file is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU Lesser General Public License as
  6. published by the Free Software Foundation; either version 2.1 of the
  7. License, or (at your option) any later version.
  8. This file is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <https://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. const uint8_t *
  18. u8_check (const uint8_t *s, size_t n)
  19. {
  20. const uint8_t *s_end = s + n;
  21. while (s < s_end)
  22. {
  23. /* Keep in sync with unistr.h and u8-mbtouc-aux.c. */
  24. uint8_t c = *s;
  25. if (c < 0x80)
  26. {
  27. s++;
  28. continue;
  29. }
  30. if (c >= 0xc2)
  31. {
  32. if (c < 0xe0)
  33. {
  34. if (s + 2 <= s_end
  35. && (s[1] ^ 0x80) < 0x40)
  36. {
  37. s += 2;
  38. continue;
  39. }
  40. }
  41. else if (c < 0xf0)
  42. {
  43. if (s + 3 <= s_end
  44. && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
  45. && (c >= 0xe1 || s[1] >= 0xa0)
  46. && (c != 0xed || s[1] < 0xa0))
  47. {
  48. s += 3;
  49. continue;
  50. }
  51. }
  52. else if (c <= 0xf4)
  53. {
  54. if (s + 4 <= s_end
  55. && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
  56. && (s[3] ^ 0x80) < 0x40
  57. && (c >= 0xf1 || s[1] >= 0x90)
  58. && (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
  59. {
  60. s += 4;
  61. continue;
  62. }
  63. }
  64. }
  65. /* invalid or incomplete multibyte character */
  66. return s;
  67. }
  68. return NULL;
  69. }