cescape_encode.h 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. #pragma once
  2. #include <util/system/types.h>
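  // Define CESCAPE_STRICT_ASCII to pass through only printable 7-bit ASCII
  // (bytes 32..126) and escape every other byte. It is left undefined by
  // default because it turns UTF-8 strings into unreadable escape sequences.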
  5. //#define CESCAPE_STRICT_ASCII
  6. namespace NYsonPull {
  7. namespace NDetail {
  8. namespace NCEscape {
  9. namespace NImpl {
  10. inline ui8 hex_digit(ui8 value) {
  11. constexpr ui8 hex_digits[] = "0123456789ABCDEF";
  12. return hex_digits[value];
  13. }
  14. inline ui8 oct_digit(ui8 value) {
  15. return '0' + value;
  16. }
  17. inline bool is_printable(ui8 c) {
  18. #ifdef CESCAPE_STRICT_ASCII
  19. return c >= 32 && c <= 126;
  20. #else
  21. return c >= 32;
  22. #endif
  23. }
  24. inline bool is_hex_digit(ui8 c) {
  25. return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
  26. }
  27. inline bool is_oct_digit(ui8 c) {
  28. return c >= '0' && c <= '7';
  29. }
  // Size in bytes of the scratch buffer escape_char writes into. The longest
  // sequence it emits is four bytes: a backslash followed by 'x' and two hex
  // digits, or a backslash followed by three octal digits.
  constexpr size_t ESCAPE_C_BUFFER_SIZE{4};
  31. inline size_t escape_char(
  32. ui8 c,
  33. ui8 next,
  34. ui8 r[ESCAPE_C_BUFFER_SIZE]) {
  35. // (1) Printable characters go as-is, except backslash and double quote.
  36. // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible).
  37. // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal.
  38. if (c == '\"') {
  39. r[0] = '\\';
  40. r[1] = '\"';
  41. return 2;
  42. } else if (c == '\\') {
  43. r[0] = '\\';
  44. r[1] = '\\';
  45. return 2;
  46. } else if (is_printable(c)) {
  47. r[0] = c;
  48. return 1;
  49. } else if (c == '\r') {
  50. r[0] = '\\';
  51. r[1] = 'r';
  52. return 2;
  53. } else if (c == '\n') {
  54. r[0] = '\\';
  55. r[1] = 'n';
  56. return 2;
  57. } else if (c == '\t') {
  58. r[0] = '\\';
  59. r[1] = 't';
  60. return 2;
  61. } else if (c < 8 && !is_oct_digit(next)) {
  62. r[0] = '\\';
  63. r[1] = oct_digit(c);
  64. return 2;
  65. } else if (!is_hex_digit(next)) {
  66. r[0] = '\\';
  67. r[1] = 'x';
  68. r[2] = hex_digit((c & 0xF0) >> 4);
  69. r[3] = hex_digit((c & 0x0F) >> 0);
  70. return 4;
  71. } else {
  72. r[0] = '\\';
  73. r[1] = oct_digit((c & 0700) >> 6);
  74. r[2] = oct_digit((c & 0070) >> 3);
  75. r[3] = oct_digit((c & 0007) >> 0);
  76. return 4;
  77. }
  78. }
  79. template <typename T>
  80. inline void escape_impl(const ui8* str, size_t len, T&& consume) {
  81. ui8 buffer[ESCAPE_C_BUFFER_SIZE];
  82. size_t i, j;
  83. for (i = 0, j = 0; i < len; ++i) {
  84. auto next_char = i + 1 < len ? str[i + 1] : 0;
  85. size_t rlen = escape_char(str[i], next_char, buffer);
  86. if (rlen > 1) {
  87. consume(str + j, i - j);
  88. j = i + 1;
  89. consume(buffer, rlen);
  90. }
  91. }
  92. if (j > 0) {
  93. consume(str + j, len - j);
  94. } else {
  95. consume(str, len);
  96. }
  97. }
  98. }
  99. } // namespace NCEscape
  100. } // namespace NDetail
  101. }