relaxed_escaper.h 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. #pragma once
  2. #include <util/stream/output.h>
  3. #include <util/string/escape.h>
  4. #include <util/memory/tempbuf.h>
  5. #include <util/generic/strbuf.h>
  6. namespace NEscJ {
  // Almost a copy-paste of util/string/escape.h.
  // TODO: move this there (note the differences in IsPrintable and in the handling of strings).
  // Converts a nibble value to its uppercase hexadecimal digit character.
  // Values below 10 map onto '0'..'9', larger values onto 'A' and upwards.
  // The argument is not range-checked.
  inline char HexDigit(char value) {
      const bool isDecimal = value < 10;
      return isDecimal ? '0' + value : 'A' + value - 10;
  }
  // Converts a value to the digit character located `value` positions after '0'.
  // Intended for 0..7; the argument is not range-checked.
  inline char OctDigit(char value) {
      const int code = '0' + value;
      return static_cast<char>(code);
  }
  18. inline bool IsUTF8(ui8 c) {
  19. return c < 0xf5 && c != 0xC0 && c != 0xC1;
  20. }
  21. inline bool IsControl(ui8 c) {
  22. return c < 0x20 || c == 0x7f;
  23. }
  24. inline bool IsPrintable(ui8 c) {
  25. return IsUTF8(c) && !IsControl(c);
  26. }
  27. inline bool IsHexDigit(ui8 c) {
  28. return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
  29. }
  30. inline bool IsOctDigit(ui8 c) {
  31. return c >= '0' && c <= '7';
  32. }
  33. struct TEscapeUtil {
  34. static constexpr size_t ESCAPE_C_BUFFER_SIZE = 6;
  35. template <bool asunicode, bool hasCustomSafeUnsafe>
  36. static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) {
  37. // (1) Printable characters go as-is, except backslash and double quote.
  38. // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible).
  39. // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal.
  40. if (hasCustomSafeUnsafe && safe.find(c) != TStringBuf::npos) {
  41. r[0] = c;
  42. return 1;
  43. }
  44. if (c == '\"') {
  45. r[0] = '\\';
  46. r[1] = '\"';
  47. return 2;
  48. } else if (c == '\\') {
  49. r[0] = '\\';
  50. r[1] = '\\';
  51. return 2;
  52. } else if (IsPrintable(c) && (!hasCustomSafeUnsafe || unsafe.find(c) == TStringBuf::npos)) {
  53. r[0] = c;
  54. return 1;
  55. } else if (c == '\b') {
  56. r[0] = '\\';
  57. r[1] = 'b';
  58. return 2;
  59. } else if (c == '\f') {
  60. r[0] = '\\';
  61. r[1] = 'f';
  62. return 2;
  63. } else if (c == '\r') {
  64. r[0] = '\\';
  65. r[1] = 'r';
  66. return 2;
  67. } else if (c == '\n') {
  68. r[0] = '\\';
  69. r[1] = 'n';
  70. return 2;
  71. } else if (c == '\t') {
  72. r[0] = '\\';
  73. r[1] = 't';
  74. return 2;
  75. } else if (asunicode && IsUTF8(c)) { // utf8 controls escape for json
  76. r[0] = '\\';
  77. r[1] = 'u';
  78. r[2] = '0';
  79. r[3] = '0';
  80. r[4] = HexDigit((c & 0xF0) >> 4);
  81. r[5] = HexDigit((c & 0x0F) >> 0);
  82. return 6;
  83. } else if (c < 8 && !IsOctDigit(next)) {
  84. r[0] = '\\';
  85. r[1] = OctDigit(c);
  86. return 2;
  87. } else if (!IsHexDigit(next)) {
  88. r[0] = '\\';
  89. r[1] = 'x';
  90. r[2] = HexDigit((c & 0xF0) >> 4);
  91. r[3] = HexDigit((c & 0x0F) >> 0);
  92. return 4;
  93. } else {
  94. r[0] = '\\';
  95. r[1] = OctDigit((c & 0700) >> 6);
  96. r[2] = OctDigit((c & 0070) >> 3);
  97. r[3] = OctDigit((c & 0007) >> 0);
  98. return 4;
  99. }
  100. }
  101. };
  102. inline size_t SuggestBuffer(size_t len) {
  103. return len * TEscapeUtil::ESCAPE_C_BUFFER_SIZE;
  104. }
  105. template <bool tounicode, bool hasCustomSafeUnsafe>
  106. inline size_t EscapeJImpl(const char* str, size_t len, char* out, TStringBuf safe, TStringBuf unsafe) {
  107. char* out0 = out;
  108. char buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE];
  109. size_t i, j;
  110. for (i = 0, j = 0; i < len; ++i) {
  111. size_t rlen = TEscapeUtil::EscapeJ<tounicode, hasCustomSafeUnsafe>(str[i], (i + 1 < len ? str[i + 1] : 0), buffer, safe, unsafe);
  112. if (rlen > 1) {
  113. memcpy(out, str + j, i - j);
  114. out += i - j;
  115. j = i + 1;
  116. memcpy(out, buffer, rlen);
  117. out += rlen;
  118. }
  119. }
  120. if (j > 0) {
  121. memcpy(out, str + j, len - j);
  122. out += len - j;
  123. } else {
  124. memcpy(out, str, len);
  125. out += len;
  126. }
  127. return out - out0;
  128. }
  129. template <bool tounicode>
  130. inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  131. if (Y_LIKELY(safe.empty() && unsafe.empty())) {
  132. return EscapeJImpl<tounicode, false>(str, len, out, safe, unsafe);
  133. }
  134. return EscapeJImpl<tounicode, true>(str, len, out, safe, unsafe);
  135. }
  136. template <bool quote, bool tounicode>
  137. inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  138. TTempBuf b(SuggestBuffer(in.size()) + 2);
  139. if (quote)
  140. b.Append("\"", 1);
  141. b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe));
  142. if (quote)
  143. b.Append("\"", 1);
  144. out.Write(b.Data(), b.Filled());
  145. }
  146. template <bool quote, bool tounicode>
  147. inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  148. TTempBuf b(SuggestBuffer(in.size()) + 2);
  149. if (quote)
  150. b.Append("\"", 1);
  151. b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe));
  152. if (quote)
  153. b.Append("\"", 1);
  154. out.append(b.Data(), b.Filled());
  155. }
  156. template <bool quote, bool tounicode>
  157. inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  158. TString s;
  159. EscapeJ<quote, tounicode>(in, s, safe, unsafe);
  160. return s;
  161. }
  162. // If the template parameter "tounicode" is ommited, then use the default value false
  163. inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  164. return EscapeJ<false>(str, len, out, safe, unsafe);
  165. }
  166. template <bool quote>
  167. inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  168. EscapeJ<quote, false>(in, out, safe, unsafe);
  169. }
  170. template <bool quote>
  171. inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  172. EscapeJ<quote, false>(in, out, safe, unsafe);
  173. }
  174. template <bool quote>
  175. inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
  176. return EscapeJ<quote, false>(in, safe, unsafe);
  177. }
  178. }