string_parser.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. #include <stdbool.h>
  2. #include <Python.h>
  3. #include "tokenizer.h"
  4. #include "pegen.h"
  5. #include "string_parser.h"
  6. //// STRING HANDLING FUNCTIONS ////
  7. static int
  8. warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
  9. {
  10. if (p->call_invalid_rules) {
  11. // Do not report warnings if we are in the second pass of the parser
  12. // to avoid showing the warning twice.
  13. return 0;
  14. }
  15. unsigned char c = *first_invalid_escape;
  16. if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning,
  17. // see tokenizer.c:warn_invalid_escape_sequence
  18. return 0;
  19. }
  20. int octal = ('4' <= c && c <= '7');
  21. PyObject *msg =
  22. octal
  23. ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
  24. first_invalid_escape)
  25. : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
  26. if (msg == NULL) {
  27. return -1;
  28. }
  29. PyObject *category;
  30. if (p->feature_version >= 12) {
  31. category = PyExc_SyntaxWarning;
  32. }
  33. else {
  34. category = PyExc_DeprecationWarning;
  35. }
  36. if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
  37. t->lineno, NULL, NULL) < 0) {
  38. if (PyErr_ExceptionMatches(category)) {
  39. /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
  40. to get a more accurate error report */
  41. PyErr_Clear();
  42. /* This is needed, in order for the SyntaxError to point to the token t,
  43. since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
  44. error location, if p->known_err_token is not set. */
  45. p->known_err_token = t;
  46. if (octal) {
  47. RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
  48. first_invalid_escape);
  49. }
  50. else {
  51. RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
  52. }
  53. }
  54. Py_DECREF(msg);
  55. return -1;
  56. }
  57. Py_DECREF(msg);
  58. return 0;
  59. }
  60. static PyObject *
  61. decode_utf8(const char **sPtr, const char *end)
  62. {
  63. const char *s;
  64. const char *t;
  65. t = s = *sPtr;
  66. while (s < end && (*s & 0x80)) {
  67. s++;
  68. }
  69. *sPtr = s;
  70. return PyUnicode_DecodeUTF8(t, s - t, NULL);
  71. }
  72. static PyObject *
  73. decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
  74. {
  75. PyObject *v;
  76. PyObject *u;
  77. char *buf;
  78. char *p;
  79. const char *end;
  80. /* check for integer overflow */
  81. if (len > SIZE_MAX / 6) {
  82. return NULL;
  83. }
  84. /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
  85. "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
  86. u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
  87. if (u == NULL) {
  88. return NULL;
  89. }
  90. p = buf = PyBytes_AsString(u);
  91. if (p == NULL) {
  92. return NULL;
  93. }
  94. end = s + len;
  95. while (s < end) {
  96. if (*s == '\\') {
  97. *p++ = *s++;
  98. if (s >= end || *s & 0x80) {
  99. strcpy(p, "u005c");
  100. p += 5;
  101. if (s >= end) {
  102. break;
  103. }
  104. }
  105. }
  106. if (*s & 0x80) {
  107. PyObject *w;
  108. int kind;
  109. const void *data;
  110. Py_ssize_t w_len;
  111. Py_ssize_t i;
  112. w = decode_utf8(&s, end);
  113. if (w == NULL) {
  114. Py_DECREF(u);
  115. return NULL;
  116. }
  117. kind = PyUnicode_KIND(w);
  118. data = PyUnicode_DATA(w);
  119. w_len = PyUnicode_GET_LENGTH(w);
  120. for (i = 0; i < w_len; i++) {
  121. Py_UCS4 chr = PyUnicode_READ(kind, data, i);
  122. sprintf(p, "\\U%08x", chr);
  123. p += 10;
  124. }
  125. /* Should be impossible to overflow */
  126. assert(p - buf <= PyBytes_GET_SIZE(u));
  127. Py_DECREF(w);
  128. }
  129. else {
  130. *p++ = *s++;
  131. }
  132. }
  133. len = p - buf;
  134. s = buf;
  135. const char *first_invalid_escape;
  136. v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
  137. // HACK: later we can simply pass the line no, since we don't preserve the tokens
  138. // when we are decoding the string but we preserve the line numbers.
  139. if (v != NULL && first_invalid_escape != NULL && t != NULL) {
  140. if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
  141. /* We have not decref u before because first_invalid_escape points
  142. inside u. */
  143. Py_XDECREF(u);
  144. Py_DECREF(v);
  145. return NULL;
  146. }
  147. }
  148. Py_XDECREF(u);
  149. return v;
  150. }
  151. static PyObject *
  152. decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
  153. {
  154. const char *first_invalid_escape;
  155. PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
  156. if (result == NULL) {
  157. return NULL;
  158. }
  159. if (first_invalid_escape != NULL) {
  160. if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
  161. Py_DECREF(result);
  162. return NULL;
  163. }
  164. }
  165. return result;
  166. }
  167. PyObject *
  168. _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
  169. {
  170. if (raw) {
  171. return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
  172. }
  173. return decode_unicode_with_escapes(p, s, len, t);
  174. }
  175. /* s must include the bracketing quote characters, and r, b &/or f prefixes
  176. (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
  177. _PyPegen_parse_string parses it, and returns the decoded Python string object. */
  178. PyObject *
  179. _PyPegen_parse_string(Parser *p, Token *t)
  180. {
  181. const char *s = PyBytes_AsString(t->bytes);
  182. if (s == NULL) {
  183. return NULL;
  184. }
  185. size_t len;
  186. int quote = Py_CHARMASK(*s);
  187. int bytesmode = 0;
  188. int rawmode = 0;
  189. if (Py_ISALPHA(quote)) {
  190. while (!bytesmode || !rawmode) {
  191. if (quote == 'b' || quote == 'B') {
  192. quote =(unsigned char)*++s;
  193. bytesmode = 1;
  194. }
  195. else if (quote == 'u' || quote == 'U') {
  196. quote = (unsigned char)*++s;
  197. }
  198. else if (quote == 'r' || quote == 'R') {
  199. quote = (unsigned char)*++s;
  200. rawmode = 1;
  201. }
  202. else {
  203. break;
  204. }
  205. }
  206. }
  207. if (quote != '\'' && quote != '\"') {
  208. PyErr_BadInternalCall();
  209. return NULL;
  210. }
  211. /* Skip the leading quote char. */
  212. s++;
  213. len = strlen(s);
  214. // gh-120155: 's' contains at least the trailing quote,
  215. // so the code '--len' below is safe.
  216. assert(len >= 1);
  217. if (len > INT_MAX) {
  218. PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
  219. return NULL;
  220. }
  221. if (s[--len] != quote) {
  222. /* Last quote char must match the first. */
  223. PyErr_BadInternalCall();
  224. return NULL;
  225. }
  226. if (len >= 4 && s[0] == quote && s[1] == quote) {
  227. /* A triple quoted string. We've already skipped one quote at
  228. the start and one at the end of the string. Now skip the
  229. two at the start. */
  230. s += 2;
  231. len -= 2;
  232. /* And check that the last two match. */
  233. if (s[--len] != quote || s[--len] != quote) {
  234. PyErr_BadInternalCall();
  235. return NULL;
  236. }
  237. }
  238. /* Avoid invoking escape decoding routines if possible. */
  239. rawmode = rawmode || strchr(s, '\\') == NULL;
  240. if (bytesmode) {
  241. /* Disallow non-ASCII characters. */
  242. const char *ch;
  243. for (ch = s; *ch; ch++) {
  244. if (Py_CHARMASK(*ch) >= 0x80) {
  245. RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
  246. t,
  247. "bytes can only contain ASCII "
  248. "literal characters");
  249. return NULL;
  250. }
  251. }
  252. if (rawmode) {
  253. return PyBytes_FromStringAndSize(s, len);
  254. }
  255. return decode_bytes_with_escapes(p, s, len, t);
  256. }
  257. return _PyPegen_decode_string(p, rawmode, s, len, t);
  258. }