escape.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. #include "escape.h"
  2. #include "cast.h"
  3. #include <util/system/defaults.h>
  4. #include <util/charset/utf8.h>
  5. #include <util/charset/wide.h>
  6. /// @todo: escape trigraphs (eg "??/" is "\")
  7. /* REFERENCES FOR ESCAPE SEQUENCE INTERPRETATION:
  8. * C99 p. 6.4.3 Universal character names.
  9. * C99 p. 6.4.4.4 Character constants.
  10. *
  11. * <simple-escape-sequence> ::= {
  12. * \' , \" , \? , \\ ,
  13. * \a , \b , \f , \n , \r , \t , \v
  14. * }
  15. *
  16. * <octal-escape-sequence> ::= \ <octal-digit> {1, 3}
  17. * <hexadecimal-escape-sequence> ::= \x <hexadecimal-digit> +
  18. * <universal-character-name> ::= \u <hexadecimal-digit> {4}
  19. * || \U <hexadecimal-digit> {8}
  20. *
  21. * NOTE (6.4.4.4.7):
  22. * Each octal or hexadecimal escape sequence is the longest sequence of characters that can
  23. * constitute the escape sequence.
  24. *
  25. * THEREFORE:
  26. * - Octal escape sequence spans until the first non-octal-digit character.
  27. * - Octal escape sequence always terminates after three octal digits.
  28. * - Hexadecimal escape sequence spans until the first non-hexadecimal-digit character.
  29. * - Universal character name consists of exactly 4 or 8 hexadecimal digits.
  30. *
  31. * by kerzum@
  32. * It is also required to escape trigraphs that are enabled in compilers by default and
  33. * are also processed inside string literals
  34. * The nine trigraphs and their replacements are
  35. *
  36. * Trigraph: ??( ??) ??< ??> ??= ??/ ??' ??! ??-
  37. * Replacement: [ ] { } # \ ^ | ~
  38. *
  39. */
  40. namespace {
  /// Converts a nibble value to its upper-case hexadecimal digit character.
  /// The precondition value < 16 is checked with Y_ASSERT.
  template <typename TChar>
  static inline char HexDigit(TChar value) {
      Y_ASSERT(value < 16);
      // 0..9 map onto '0'..'9', 10..15 map onto 'A'..'F'.
      return value < 10 ? '0' + value : 'A' + value - 10;
  }
  /// Converts a value to its octal digit character.
  /// The precondition value < 8 is checked with Y_ASSERT.
  template <typename TChar>
  static inline char OctDigit(TChar value) {
      Y_ASSERT(value < 8);
      const auto digit = '0' + value;
      return digit;
  }
  /// Returns true when c lies in the range 32..126 inclusive,
  /// i.e. the characters that EscapeC may emit unchanged.
  template <typename TChar>
  static inline bool IsPrintable(TChar c) {
      const bool outsideRange = c < 32 || c > 126;
      return !outsideRange;
  }
  /// Returns true when c is one of 0-9, A-F or a-f.
  template <typename TChar>
  static inline bool IsHexDigit(TChar c) {
      const bool decimal = c >= '0' && c <= '9';
      const bool upperCase = c >= 'A' && c <= 'F';
      const bool lowerCase = c >= 'a' && c <= 'f';
      return decimal || upperCase || lowerCase;
  }
  /// Returns true when c is one of the characters '0'..'7'.
  template <typename TChar>
  static inline bool IsOctDigit(TChar c) {
      const bool outsideRange = c < '0' || c > '7';
      return !outsideRange;
  }
  67. template <typename TChar>
  68. struct TEscapeUtil;
  69. template <>
  70. struct TEscapeUtil<char> {
  71. static const size_t ESCAPE_C_BUFFER_SIZE = 4;
  72. template <typename TNextChar, typename TBufferChar>
  73. static inline size_t EscapeC(unsigned char c, TNextChar next, TBufferChar r[ESCAPE_C_BUFFER_SIZE]) {
  74. // (1) Printable characters go as-is, except backslash and double quote.
  75. // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible).
  76. // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal.
  77. if (c == '\"') {
  78. r[0] = '\\';
  79. r[1] = '\"';
  80. return 2;
  81. } else if (c == '\\') {
  82. r[0] = '\\';
  83. r[1] = '\\';
  84. return 2;
  85. } else if (IsPrintable(c) && (!(c == '?' && next == '?'))) {
  86. r[0] = c;
  87. return 1;
  88. } else if (c == '\r') {
  89. r[0] = '\\';
  90. r[1] = 'r';
  91. return 2;
  92. } else if (c == '\n') {
  93. r[0] = '\\';
  94. r[1] = 'n';
  95. return 2;
  96. } else if (c == '\t') {
  97. r[0] = '\\';
  98. r[1] = 't';
  99. return 2;
  100. } else if (c < 8 && !IsOctDigit(next)) {
  101. r[0] = '\\';
  102. r[1] = OctDigit(c);
  103. return 2;
  104. } else if (!IsHexDigit(next)) {
  105. r[0] = '\\';
  106. r[1] = 'x';
  107. r[2] = HexDigit((c & 0xF0) >> 4);
  108. r[3] = HexDigit((c & 0x0F) >> 0);
  109. return 4;
  110. } else {
  111. r[0] = '\\';
  112. r[1] = OctDigit((c & 0700) >> 6);
  113. r[2] = OctDigit((c & 0070) >> 3);
  114. r[3] = OctDigit((c & 0007) >> 0);
  115. return 4;
  116. }
  117. }
  118. };
  119. template <>
  120. struct TEscapeUtil<wchar16> {
  121. static const size_t ESCAPE_C_BUFFER_SIZE = 6;
  122. template <typename TNextChar, typename TBufferChar>
  123. static inline size_t EscapeC(wchar16 c, TNextChar next, TBufferChar r[ESCAPE_C_BUFFER_SIZE]) {
  124. if (c < 0x100) {
  125. return TEscapeUtil<char>::EscapeC(char(c), next, r);
  126. } else {
  127. r[0] = '\\';
  128. r[1] = 'u';
  129. r[2] = HexDigit((c & 0xF000) >> 12);
  130. r[3] = HexDigit((c & 0x0F00) >> 8);
  131. r[4] = HexDigit((c & 0x00F0) >> 4);
  132. r[5] = HexDigit((c & 0x000F) >> 0);
  133. return 6;
  134. }
  135. }
  136. };
  137. }
  138. template <class TChar>
  139. TBasicString<TChar>& EscapeCImpl(const TChar* str, size_t len, TBasicString<TChar>& r) {
  140. using TEscapeUtil = ::TEscapeUtil<TChar>;
  141. TChar buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE];
  142. size_t i, j;
  143. for (i = 0, j = 0; i < len; ++i) {
  144. size_t rlen = TEscapeUtil::EscapeC(str[i], (i + 1 < len ? str[i + 1] : 0), buffer);
  145. if (rlen > 1) {
  146. r.append(str + j, i - j);
  147. j = i + 1;
  148. r.append(buffer, rlen);
  149. }
  150. }
  151. if (j > 0) {
  152. r.append(str + j, len - j);
  153. } else {
  154. r.append(str, len);
  155. }
  156. return r;
  157. }
  158. template TString& EscapeCImpl<TString::TChar>(const TString::TChar* str, size_t len, TString& r);
  159. template TUtf16String& EscapeCImpl<TUtf16String::TChar>(const TUtf16String::TChar* str, size_t len, TUtf16String& r);
  160. namespace {
  161. template <class TStr>
  162. inline void AppendUnicode(TStr& s, wchar32 v) {
  163. char buf[10];
  164. size_t sz = 0;
  165. WriteUTF8Char(v, sz, (ui8*)buf);
  166. s.AppendNoAlias(buf, sz);
  167. }
  168. inline void AppendUnicode(TUtf16String& s, wchar32 v) {
  169. WriteSymbol(v, s);
  170. }
  171. template <ui32 sz, typename TChar>
  172. inline size_t CountHex(const TChar* p, const TChar* pe) {
  173. auto b = p;
  174. auto e = Min(p + sz, pe);
  175. while (b < e && IsHexDigit(*b)) {
  176. ++b;
  177. }
  178. return b - p;
  179. }
  180. template <size_t sz, typename TChar, typename T>
  181. inline bool ParseHex(const TChar* p, const TChar* pe, T& t) noexcept {
  182. return (p + sz <= pe) && TryIntFromString<16>(p, sz, t);
  183. }
  184. template <ui32 sz, typename TChar>
  185. inline size_t CountOct(const TChar* p, const TChar* pe) {
  186. ui32 maxsz = Min<size_t>(sz, pe - p);
  187. if (3 == sz && 3 == maxsz && !(*p >= '0' && *p <= '3')) {
  188. maxsz = 2;
  189. }
  190. for (ui32 i = 0; i < maxsz; ++i, ++p) {
  191. if (!IsOctDigit(*p)) {
  192. return i;
  193. }
  194. }
  195. return maxsz;
  196. }
  197. }
  198. template <class TChar, class TStr>
  199. static TStr& DoUnescapeC(const TChar* p, size_t sz, TStr& res) {
  200. const TChar* pe = p + sz;
  201. while (p != pe) {
  202. if ('\\' == *p) {
  203. ++p;
  204. if (p == pe) {
  205. return res;
  206. }
  207. switch (*p) {
  208. default:
  209. res.append(*p);
  210. break;
  211. case 'a':
  212. res.append('\a');
  213. break;
  214. case 'b':
  215. res.append('\b');
  216. break;
  217. case 'f':
  218. res.append('\f');
  219. break;
  220. case 'n':
  221. res.append('\n');
  222. break;
  223. case 'r':
  224. res.append('\r');
  225. break;
  226. case 't':
  227. res.append('\t');
  228. break;
  229. case 'v':
  230. res.append('\v');
  231. break;
  232. case 'u': {
  233. ui16 cp[2];
  234. if (ParseHex<4>(p + 1, pe, cp[0])) {
  235. if (Y_UNLIKELY(cp[0] >= 0xD800 && cp[0] <= 0xDBFF && ParseHex<4>(p + 7, pe, cp[1]) && p[5] == '\\' && p[6] == 'u')) {
  236. const wchar16 wbuf[] = {wchar16(cp[0]), wchar16(cp[1])};
  237. AppendUnicode(res, ReadSymbol(wbuf, wbuf + 2));
  238. p += 10;
  239. } else {
  240. AppendUnicode(res, (wchar32)cp[0]);
  241. p += 4;
  242. }
  243. } else {
  244. res.append(*p);
  245. }
  246. break;
  247. }
  248. case 'U':
  249. if (CountHex<8>(p + 1, pe) != 8) {
  250. res.append(*p);
  251. } else {
  252. AppendUnicode(res, IntFromString<ui32, 16>(p + 1, 8));
  253. p += 8;
  254. }
  255. break;
  256. case 'x':
  257. if (ui32 v = CountHex<2>(p + 1, pe)) {
  258. res.append((TChar)IntFromString<ui32, 16>(p + 1, v));
  259. p += v;
  260. } else {
  261. res.append(*p);
  262. }
  263. break;
  264. case '0':
  265. case '1':
  266. case '2':
  267. case '3': {
  268. ui32 v = CountOct<3>(p, pe); // v is always positive
  269. res.append((TChar)IntFromString<ui32, 8>(p, v));
  270. p += v - 1;
  271. } break;
  272. case '4':
  273. case '5':
  274. case '6':
  275. case '7': {
  276. ui32 v = CountOct<2>(p, pe); // v is always positive
  277. res.append((TChar)IntFromString<ui32, 8>(p, v));
  278. p += v - 1;
  279. } break;
  280. }
  281. ++p;
  282. } else {
  283. const auto r = std::basic_string_view<TChar>(p, pe - p).find('\\');
  284. const auto n = r != std::string::npos ? p + r : pe;
  285. res.append(p, n);
  286. p = n;
  287. }
  288. }
  289. return res;
  290. }
  291. template <class TChar>
  292. TBasicString<TChar>& UnescapeCImpl(const TChar* p, size_t sz, TBasicString<TChar>& res) {
  293. return DoUnescapeC(p, sz, res);
  294. }
  295. template <class TChar>
  296. TChar* UnescapeC(const TChar* str, size_t len, TChar* buf) {
  297. struct TUnboundedString {
  298. void append(TChar ch) noexcept {
  299. *P++ = ch;
  300. }
  301. void append(const TChar* b, const TChar* e) noexcept {
  302. while (b != e) {
  303. append(*b++);
  304. }
  305. }
  306. void AppendNoAlias(const TChar* s, size_t l) noexcept {
  307. append(s, s + l);
  308. }
  309. TChar* P;
  310. } bufbuf = {buf};
  311. return DoUnescapeC(str, len, bufbuf).P;
  312. }
  313. template TString& UnescapeCImpl<TString::TChar>(const TString::TChar* str, size_t len, TString& r);
  314. template TUtf16String& UnescapeCImpl<TUtf16String::TChar>(const TUtf16String::TChar* str, size_t len, TUtf16String& r);
  315. template char* UnescapeC<char>(const char* str, size_t len, char* buf);
  316. template <class TChar>
  317. size_t UnescapeCCharLen(const TChar* begin, const TChar* end) {
  318. if (begin >= end) {
  319. return 0;
  320. }
  321. if (*begin != '\\') {
  322. return 1;
  323. }
  324. if (++begin == end) {
  325. return 1;
  326. }
  327. switch (*begin) {
  328. default:
  329. return 2;
  330. case 'u':
  331. return CountHex<4>(begin + 1, end) == 4 ? 6 : 2;
  332. case 'U':
  333. return CountHex<8>(begin + 1, end) == 8 ? 10 : 2;
  334. case 'x':
  335. return 2 + CountHex<2>(begin + 1, end);
  336. case '0':
  337. case '1':
  338. case '2':
  339. case '3':
  340. return 1 + CountOct<3>(begin, end); // >= 2
  341. case '4':
  342. case '5':
  343. case '6':
  344. case '7':
  345. return 1 + CountOct<2>(begin, end); // >= 2
  346. }
  347. }
  348. template size_t UnescapeCCharLen<char>(const char* begin, const char* end);
  349. template size_t UnescapeCCharLen<TUtf16String::TChar>(const TUtf16String::TChar* begin, const TUtf16String::TChar* end);
  350. TString& EscapeC(const TStringBuf str, TString& s) {
  351. return EscapeC(str.data(), str.size(), s);
  352. }
  353. TUtf16String& EscapeC(const TWtringBuf str, TUtf16String& w) {
  354. return EscapeC(str.data(), str.size(), w);
  355. }
  356. TString EscapeC(const TString& str) {
  357. return EscapeC(str.data(), str.size());
  358. }
  359. TUtf16String EscapeC(const TUtf16String& str) {
  360. return EscapeC(str.data(), str.size());
  361. }
  362. TString& UnescapeC(const TStringBuf str, TString& s) {
  363. return UnescapeC(str.data(), str.size(), s);
  364. }
  365. TUtf16String& UnescapeC(const TWtringBuf str, TUtf16String& w) {
  366. return UnescapeC(str.data(), str.size(), w);
  367. }
  368. TString UnescapeC(const TStringBuf str) {
  369. return UnescapeC(str.data(), str.size());
  370. }
  371. TUtf16String UnescapeC(const TWtringBuf str) {
  372. return UnescapeC(str.data(), str.size());
  373. }