encoding-check.cpp 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. #include <vector>
  2. #include <iostream>
  3. #include <fstream>
  4. #include <cstdlib>
  5. /*
  6. * The utf8_check() function scans the '\0'-terminated string starting
  7. * at s. It returns a pointer to the first byte of the first malformed
  8. * or overlong UTF-8 sequence found, or NULL if the string contains
  9. * only correct UTF-8. It also spots UTF-8 sequences that could cause
  10. * trouble if converted to UTF-16, namely surrogate characters
  11. * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
  12. * routine is very likely to find a malformed sequence if the input
  13. * uses any other encoding than UTF-8. It therefore can be used as a
  14. * very effective heuristic for distinguishing between UTF-8 and other
  15. * encodings.
  16. *
  17. * I wrote this code mainly as a specification of functionality; there
  18. * are no doubt performance optimizations possible for certain CPUs.
  19. *
  20. * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
  21. * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
  22. */
  23. unsigned char *utf8_check(unsigned char *s)
  24. {
  25. while (*s) {
  26. if (*s < 0x80) {
  27. // 0xxxxxxx
  28. s++;
  29. } else if ((s[0] & 0xe0) == 0xc0) {
  30. // 110xxxxx 10xxxxxx
  31. if ((s[1] & 0xc0) != 0x80 ||
  32. (s[0] & 0xfe) == 0xc0) { // overlong?
  33. return s;
  34. } else {
  35. s += 2;
  36. }
  37. } else if ((s[0] & 0xf0) == 0xe0) {
  38. // 1110xxxx 10xxxxxx 10xxxxxx
  39. if ((s[1] & 0xc0) != 0x80 ||
  40. (s[2] & 0xc0) != 0x80 ||
  41. (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || // overlong?
  42. (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || // surrogate?
  43. (s[0] == 0xef && s[1] == 0xbf &&
  44. (s[2] & 0xfe) == 0xbe)) { // U+FFFE or U+FFFF?
  45. return s;
  46. } else {
  47. s += 3;
  48. }
  49. } else if ((s[0] & 0xf8) == 0xf0) {
  50. // 11110xxX 10xxxxxx 10xxxxxx 10xxxxxx
  51. if ((s[1] & 0xc0) != 0x80 ||
  52. (s[2] & 0xc0) != 0x80 ||
  53. (s[3] & 0xc0) != 0x80 ||
  54. (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || // overlong?
  55. (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { // > U+10FFFF?
  56. return s;
  57. } else {
  58. s += 4;
  59. }
  60. } else {
  61. return s;
  62. }
  63. }
  64. return NULL;
  65. }
  66. int main(int argc, char const *argv[])
  67. {
  68. if (argc != 3) {
  69. std::cerr << "Usage: " << argv[0] << " <program/library> <file>" << std::endl;
  70. return -1;
  71. }
  72. const char* target = argv[1];
  73. const char* filename = argv[2];
  74. const auto error_exit = [=](const char* error) {
  75. std::cerr << "\n\tError: " << error << ": " << filename << "\n"
  76. << "\tTarget: " << target << "\n"
  77. << std::endl;
  78. std::exit(-2);
  79. };
  80. std::ifstream file(filename, std::ios::binary | std::ios::ate);
  81. const auto size = file.tellg();
  82. if (size == 0) {
  83. return 0;
  84. }
  85. file.seekg(0, std::ios::beg);
  86. std::vector<char> buffer(size);
  87. if (file.read(buffer.data(), size)) {
  88. buffer.push_back('\0');
  89. // Check UTF-8 validity
  90. if (utf8_check(reinterpret_cast<unsigned char*>(buffer.data())) != nullptr) {
  91. error_exit("Source file does not contain (valid) UTF-8");
  92. }
  93. // Check against a BOM mark
  94. if (buffer.size() >= 3
  95. && buffer[0] == '\xef'
  96. && buffer[1] == '\xbb'
  97. && buffer[2] == '\xbf') {
  98. error_exit("Source file is valid UTF-8 but contains a BOM mark");
  99. }
  100. } else {
  101. error_exit("Could not read source file");
  102. }
  103. return 0;
  104. }