yql_ast_escaping.cpp 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. #include "yql_ast_escaping.h"
  2. #include <util/charset/wide.h>
  3. #include <util/stream/output.h>
  4. #include <util/string/hex.h>
  5. namespace NYql {
  // Converts a nibble value to its upper-case hexadecimal digit:
  // values below 10 map onto '0'..'9', the rest onto 'A' and upwards.
  // Every caller in this file passes a value already masked down to 4 bits.
  static char HexDigit(char c)
  {
      if (c < 10) {
          return '0' + c;
      }
      return 'A' + (c - 10);
  }
  10. static void EscapedPrintChar(ui8 c, IOutputStream* out)
  11. {
  12. switch (c) {
  13. case '\\': out->Write("\\\\", 2); break;
  14. case '"' : out->Write("\\\"", 2); break;
  15. case '\t': out->Write("\\t", 2); break;
  16. case '\n': out->Write("\\n", 2); break;
  17. case '\r': out->Write("\\r", 2); break;
  18. case '\b': out->Write("\\b", 2); break;
  19. case '\f': out->Write("\\f", 2); break;
  20. case '\a': out->Write("\\a", 2); break;
  21. case '\v': out->Write("\\v", 2); break;
  22. default: {
  23. if (isprint(c)) out->Write(static_cast<char>(c));
  24. else {
  25. char buf[4] = { "\\x" };
  26. buf[2] = HexDigit((c & 0xf0) >> 4);
  27. buf[3] = HexDigit((c & 0x0f));
  28. out->Write(buf, 4);
  29. }
  30. }
  31. }
  32. }
  33. static void EscapedPrintUnicode(wchar32 rune, IOutputStream* out)
  34. {
  35. static const int MAX_ESCAPE_LEN = 10;
  36. if (rune < 0x80) {
  37. EscapedPrintChar(static_cast<ui8>(rune & 0xff), out);
  38. } else {
  39. int i = 0;
  40. char buf[MAX_ESCAPE_LEN];
  41. if (rune < 0x10000) {
  42. buf[i++] = '\\';
  43. buf[i++] = 'u';
  44. } else {
  45. buf[i++] = '\\';
  46. buf[i++] = 'U';
  47. buf[i++] = HexDigit((rune & 0xf0000000) >> 28);
  48. buf[i++] = HexDigit((rune & 0x0f000000) >> 24);
  49. buf[i++] = HexDigit((rune & 0x00f00000) >> 20);
  50. buf[i++] = HexDigit((rune & 0x000f0000) >> 16);
  51. }
  52. buf[i++] = HexDigit((rune & 0xf000) >> 12);
  53. buf[i++] = HexDigit((rune & 0x0f00) >> 8);
  54. buf[i++] = HexDigit((rune & 0x00f0) >> 4);
  55. buf[i++] = HexDigit((rune & 0x000f));
  56. out->Write(buf, i);
  57. }
  58. }
  59. static bool TryParseOctal(const char*& p, const char* e, int maxlen, wchar32* value)
  60. {
  61. while (maxlen-- && p != e) {
  62. if (*value > 255) return false;
  63. char ch = *p++;
  64. if (ch >= '0' && ch <= '7') {
  65. *value = *value * 8 + (ch - '0');
  66. continue;
  67. }
  68. break;
  69. }
  70. return (maxlen == -1);
  71. }
  72. static bool TryParseHex(const char*& p, const char* e, int maxlen, wchar32* value)
  73. {
  74. while (maxlen-- > 0 && p != e) {
  75. char ch = *p++;
  76. if (ch >= '0' && ch <= '9') {
  77. *value = *value * 16 + (ch - '0');
  78. continue;
  79. }
  80. // to lower case
  81. ch |= 0x20;
  82. if (ch >= 'a' && ch <= 'f') {
  83. *value = *value * 16 + (ch - 'a') + 10;
  84. continue;
  85. }
  86. break;
  87. }
  88. return (maxlen == -1);
  89. }
  90. static bool IsValidUtf8Rune(wchar32 value) {
  91. return value <= 0x10ffff && (value < 0xd800 || value > 0xdfff);
  92. }
  93. TStringBuf UnescapeResultToString(EUnescapeResult result)
  94. {
  95. switch (result) {
  96. case EUnescapeResult::OK:
  97. return "OK";
  98. case EUnescapeResult::INVALID_ESCAPE_SEQUENCE:
  99. return "Expected escape sequence";
  100. case EUnescapeResult::INVALID_BINARY:
  101. return "Invalid binary value";
  102. case EUnescapeResult::INVALID_OCTAL:
  103. return "Invalid octal value";
  104. case EUnescapeResult::INVALID_HEXADECIMAL:
  105. return "Invalid hexadecimal value";
  106. case EUnescapeResult::INVALID_UNICODE:
  107. return "Invalid unicode value";
  108. case EUnescapeResult::INVALID_END:
  109. return "Unexpected end of atom";
  110. }
  111. return "Unknown unescape error";
  112. }
  113. void EscapeArbitraryAtom(TStringBuf atom, char quoteChar, IOutputStream* out)
  114. {
  115. out->Write(quoteChar);
  116. const ui8 *p = reinterpret_cast<const ui8*>(atom.begin()),
  117. *e = reinterpret_cast<const ui8*>(atom.end());
  118. while (p != e) {
  119. wchar32 rune = 0;
  120. size_t rune_len = 0;
  121. if (SafeReadUTF8Char(rune, rune_len, p, e) == RECODE_RESULT::RECODE_OK && IsValidUtf8Rune(rune)) {
  122. EscapedPrintUnicode(rune, out);
  123. p += rune_len;
  124. } else {
  125. EscapedPrintChar(*p++, out);
  126. }
  127. }
  128. out->Write(quoteChar);
  129. }
  130. EUnescapeResult UnescapeArbitraryAtom(
  131. TStringBuf atom, char endChar, IOutputStream* out, size_t* readBytes)
  132. {
  133. const char *p = atom.begin(),
  134. *e = atom.end();
  135. while (p != e) {
  136. char current = *p++;
  137. // C-style escape sequences
  138. if (current == '\\') {
  139. if (p == e) {
  140. *readBytes = p - atom.begin();
  141. return EUnescapeResult::INVALID_ESCAPE_SEQUENCE;
  142. }
  143. char next = *p++;
  144. switch (next) {
  145. case 't': current = '\t'; break;
  146. case 'n': current = '\n'; break;
  147. case 'r': current = '\r'; break;
  148. case 'b': current = '\b'; break;
  149. case 'f': current = '\f'; break;
  150. case 'a': current = '\a'; break;
  151. case 'v': current = '\v'; break;
  152. case '0': case '1': case '2': case '3': {
  153. wchar32 value = (next - '0');
  154. if (!TryParseOctal(p, e, 2, &value)) {
  155. *readBytes = p - atom.begin();
  156. return EUnescapeResult::INVALID_OCTAL;
  157. }
  158. current = value & 0xff;
  159. break;
  160. }
  161. case 'x': {
  162. wchar32 value = 0;
  163. if (!TryParseHex(p, e, 2, &value)) {
  164. *readBytes = p - atom.begin();
  165. return EUnescapeResult::INVALID_HEXADECIMAL;
  166. }
  167. current = value & 0xff;
  168. break;
  169. }
  170. case 'u':
  171. case 'U': {
  172. wchar32 value = 0;
  173. int len = (next == 'u' ? 4 : 8);
  174. if (!TryParseHex(p, e, len, &value) || !IsValidUtf8Rune(value)) {
  175. *readBytes = p - atom.begin();
  176. return EUnescapeResult::INVALID_UNICODE;
  177. }
  178. size_t written = 0;
  179. char buf[4];
  180. WideToUTF8(&value, 1, buf, written);
  181. out->Write(buf, written);
  182. continue;
  183. }
  184. default: {
  185. current = next;
  186. }
  187. }
  188. } else if (endChar == '`') {
  189. if (current == '`') {
  190. if (p == e) {
  191. *readBytes = p - atom.begin();
  192. return EUnescapeResult::OK;
  193. } else {
  194. if (*p != '`') {
  195. *readBytes = p - atom.begin();
  196. return EUnescapeResult::INVALID_ESCAPE_SEQUENCE;
  197. } else {
  198. p++;
  199. }
  200. }
  201. }
  202. } else if (current == endChar) {
  203. *readBytes = p - atom.begin();
  204. return EUnescapeResult::OK;
  205. }
  206. out->Write(current);
  207. }
  208. *readBytes = p - atom.begin();
  209. return EUnescapeResult::INVALID_END;
  210. }
  211. void EscapeBinaryAtom(TStringBuf atom, char quoteChar, IOutputStream* out)
  212. {
  213. char prefix[] = { 'x', quoteChar };
  214. out->Write(prefix, 2);
  215. out->Write(HexEncode(atom.data(), atom.size()));
  216. out->Write(quoteChar);
  217. }
  218. EUnescapeResult UnescapeBinaryAtom(
  219. TStringBuf atom, char endChar, IOutputStream* out, size_t* readBytes)
  220. {
  221. const char *p = atom.begin(),
  222. *e = atom.end();
  223. while (p != e) {
  224. char current = *p;
  225. if (current == endChar) {
  226. *readBytes = p - atom.begin();
  227. return EUnescapeResult::OK;
  228. }
  229. wchar32 byte = 0;
  230. if (!TryParseHex(p, e, 2, &byte)) {
  231. *readBytes = p - atom.begin();
  232. return EUnescapeResult::INVALID_BINARY;
  233. }
  234. out->Write(byte & 0xff);
  235. }
  236. *readBytes = p - atom.begin();
  237. return EUnescapeResult::INVALID_END;
  238. }
  239. } // namespace NYql