123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279 |
- #include <stdbool.h>
- #include <Python.h>
- #include "tokenizer.h"
- #include "pegen.h"
- #include "string_parser.h"
- //// STRING HANDLING FUNCTIONS ////
- static int
- warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
- {
- if (p->call_invalid_rules) {
- // Do not report warnings if we are in the second pass of the parser
- // to avoid showing the warning twice.
- return 0;
- }
- unsigned char c = *first_invalid_escape;
- if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning,
- // see tokenizer.c:warn_invalid_escape_sequence
- return 0;
- }
- int octal = ('4' <= c && c <= '7');
- PyObject *msg =
- octal
- ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
- first_invalid_escape)
- : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
- if (msg == NULL) {
- return -1;
- }
- PyObject *category;
- if (p->feature_version >= 12) {
- category = PyExc_SyntaxWarning;
- }
- else {
- category = PyExc_DeprecationWarning;
- }
- if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
- t->lineno, NULL, NULL) < 0) {
- if (PyErr_ExceptionMatches(category)) {
- /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
- to get a more accurate error report */
- PyErr_Clear();
- /* This is needed, in order for the SyntaxError to point to the token t,
- since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
- error location, if p->known_err_token is not set. */
- p->known_err_token = t;
- if (octal) {
- RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
- first_invalid_escape);
- }
- else {
- RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
- }
- }
- Py_DECREF(msg);
- return -1;
- }
- Py_DECREF(msg);
- return 0;
- }
- static PyObject *
- decode_utf8(const char **sPtr, const char *end)
- {
- const char *s;
- const char *t;
- t = s = *sPtr;
- while (s < end && (*s & 0x80)) {
- s++;
- }
- *sPtr = s;
- return PyUnicode_DecodeUTF8(t, s - t, NULL);
- }
- static PyObject *
- decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
- {
- PyObject *v;
- PyObject *u;
- char *buf;
- char *p;
- const char *end;
- /* check for integer overflow */
- if (len > SIZE_MAX / 6) {
- return NULL;
- }
- /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
- "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
- u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
- if (u == NULL) {
- return NULL;
- }
- p = buf = PyBytes_AsString(u);
- if (p == NULL) {
- return NULL;
- }
- end = s + len;
- while (s < end) {
- if (*s == '\\') {
- *p++ = *s++;
- if (s >= end || *s & 0x80) {
- strcpy(p, "u005c");
- p += 5;
- if (s >= end) {
- break;
- }
- }
- }
- if (*s & 0x80) {
- PyObject *w;
- int kind;
- const void *data;
- Py_ssize_t w_len;
- Py_ssize_t i;
- w = decode_utf8(&s, end);
- if (w == NULL) {
- Py_DECREF(u);
- return NULL;
- }
- kind = PyUnicode_KIND(w);
- data = PyUnicode_DATA(w);
- w_len = PyUnicode_GET_LENGTH(w);
- for (i = 0; i < w_len; i++) {
- Py_UCS4 chr = PyUnicode_READ(kind, data, i);
- sprintf(p, "\\U%08x", chr);
- p += 10;
- }
- /* Should be impossible to overflow */
- assert(p - buf <= PyBytes_GET_SIZE(u));
- Py_DECREF(w);
- }
- else {
- *p++ = *s++;
- }
- }
- len = p - buf;
- s = buf;
- const char *first_invalid_escape;
- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
- // HACK: later we can simply pass the line no, since we don't preserve the tokens
- // when we are decoding the string but we preserve the line numbers.
- if (v != NULL && first_invalid_escape != NULL && t != NULL) {
- if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
- /* We have not decref u before because first_invalid_escape points
- inside u. */
- Py_XDECREF(u);
- Py_DECREF(v);
- return NULL;
- }
- }
- Py_XDECREF(u);
- return v;
- }
- static PyObject *
- decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
- {
- const char *first_invalid_escape;
- PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
- if (result == NULL) {
- return NULL;
- }
- if (first_invalid_escape != NULL) {
- if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
- Py_DECREF(result);
- return NULL;
- }
- }
- return result;
- }
- PyObject *
- _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
- {
- if (raw) {
- return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
- }
- return decode_unicode_with_escapes(p, s, len, t);
- }
- /* s must include the bracketing quote characters, and r, b &/or f prefixes
- (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
- _PyPegen_parse_string parses it, and returns the decoded Python string object. */
- PyObject *
- _PyPegen_parse_string(Parser *p, Token *t)
- {
- const char *s = PyBytes_AsString(t->bytes);
- if (s == NULL) {
- return NULL;
- }
- size_t len;
- int quote = Py_CHARMASK(*s);
- int bytesmode = 0;
- int rawmode = 0;
- if (Py_ISALPHA(quote)) {
- while (!bytesmode || !rawmode) {
- if (quote == 'b' || quote == 'B') {
- quote =(unsigned char)*++s;
- bytesmode = 1;
- }
- else if (quote == 'u' || quote == 'U') {
- quote = (unsigned char)*++s;
- }
- else if (quote == 'r' || quote == 'R') {
- quote = (unsigned char)*++s;
- rawmode = 1;
- }
- else {
- break;
- }
- }
- }
- if (quote != '\'' && quote != '\"') {
- PyErr_BadInternalCall();
- return NULL;
- }
- /* Skip the leading quote char. */
- s++;
- len = strlen(s);
- // gh-120155: 's' contains at least the trailing quote,
- // so the code '--len' below is safe.
- assert(len >= 1);
- if (len > INT_MAX) {
- PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
- return NULL;
- }
- if (s[--len] != quote) {
- /* Last quote char must match the first. */
- PyErr_BadInternalCall();
- return NULL;
- }
- if (len >= 4 && s[0] == quote && s[1] == quote) {
- /* A triple quoted string. We've already skipped one quote at
- the start and one at the end of the string. Now skip the
- two at the start. */
- s += 2;
- len -= 2;
- /* And check that the last two match. */
- if (s[--len] != quote || s[--len] != quote) {
- PyErr_BadInternalCall();
- return NULL;
- }
- }
- /* Avoid invoking escape decoding routines if possible. */
- rawmode = rawmode || strchr(s, '\\') == NULL;
- if (bytesmode) {
- /* Disallow non-ASCII characters. */
- const char *ch;
- for (ch = s; *ch; ch++) {
- if (Py_CHARMASK(*ch) >= 0x80) {
- RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
- t,
- "bytes can only contain ASCII "
- "literal characters");
- return NULL;
- }
- }
- if (rawmode) {
- return PyBytes_FromStringAndSize(s, len);
- }
- return decode_bytes_with_escapes(p, s, len, t);
- }
- return _PyPegen_decode_string(p, rawmode, s, len, t);
- }
|