- #include <Python.h>
- #include <errcode.h>
- #include "tokenizer.h"
- #include "pegen.h"
- // TOKENIZER ERRORS
- void
- _PyPegen_raise_tokenizer_init_error(PyObject *filename)
- {
- if (!(PyErr_ExceptionMatches(PyExc_LookupError)
- || PyErr_ExceptionMatches(PyExc_SyntaxError)
- || PyErr_ExceptionMatches(PyExc_ValueError)
- || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
- return;
- }
- PyObject *errstr = NULL;
- PyObject *tuple = NULL;
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (!errstr) {
- goto error;
- }
- PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
- if (!tmp) {
- goto error;
- }
- tuple = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(PyExc_SyntaxError, tuple);
- error:
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- Py_XDECREF(errstr);
- Py_XDECREF(tuple);
- }
- static inline void
- raise_unclosed_parentheses_error(Parser *p) {
- int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
- int error_col = p->tok->parencolstack[p->tok->level-1];
- RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
- error_lineno, error_col, error_lineno, -1,
- "'%c' was never closed",
- p->tok->parenstack[p->tok->level-1]);
- }
- int
- _Pypegen_tokenizer_error(Parser *p)
- {
- if (PyErr_Occurred()) {
- return -1;
- }
- const char *msg = NULL;
- PyObject* errtype = PyExc_SyntaxError;
- Py_ssize_t col_offset = -1;
- p->error_indicator = 1;
- switch (p->tok->done) {
- case E_TOKEN:
- msg = "invalid token";
- break;
- case E_EOF:
- if (p->tok->level) {
- raise_unclosed_parentheses_error(p);
- } else {
- RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
- }
- return -1;
- case E_DEDENT:
- RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
- return -1;
- case E_INTR:
- if (!PyErr_Occurred()) {
- PyErr_SetNone(PyExc_KeyboardInterrupt);
- }
- return -1;
- case E_NOMEM:
- PyErr_NoMemory();
- return -1;
- case E_TABSPACE:
- errtype = PyExc_TabError;
- msg = "inconsistent use of tabs and spaces in indentation";
- break;
- case E_TOODEEP:
- errtype = PyExc_IndentationError;
- msg = "too many levels of indentation";
- break;
- case E_LINECONT: {
- col_offset = p->tok->cur - p->tok->buf - 1;
- msg = "unexpected character after line continuation character";
- break;
- }
- case E_COLUMNOVERFLOW:
- PyErr_SetString(PyExc_OverflowError,
- "Parser column offset overflow - source line is too big");
- return -1;
- default:
- msg = "unknown parsing error";
- }
- RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
- col_offset >= 0 ? col_offset : 0,
- p->tok->lineno, -1, msg);
- return -1;
- }
- int
- _Pypegen_raise_decode_error(Parser *p)
- {
- assert(PyErr_Occurred());
- const char *errtype = NULL;
- if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
- errtype = "unicode error";
- }
- else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
- errtype = "value error";
- }
- if (errtype) {
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyObject *errstr;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (errstr) {
- RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
- Py_DECREF(errstr);
- }
- else {
- PyErr_Clear();
- RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
- }
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- }
- return -1;
- }
// Tokenize the remainder of the input to check whether a tokenizer-level
// error (e.g. an unclosed bracket) should replace the generic syntax error
// the parser already produced. Returns 0 when nothing better was found
// (the stashed exception, if any, is restored) and -1 when a new, more
// specific exception has been set.
static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input
    if (p->tok->prompt != NULL) {
        return 0;
    }

    // Stash the currently-set exception (if any) so the scan below can
    // run with a clean error state.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    // Line of the error the parser already knows about; a tokenizer error
    // only wins if it occurs on an earlier line.
    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;
    struct token new_token;
    _PyToken_Init(&new_token);

    for (;;) {
        switch (_PyTokenizer_Get(p->tok, &new_token)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }

exit:
    _PyToken_Free(&new_token);
    // If we're in an f-string, we want the syntax error in the expression part
    // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
    // do not swallow it.
    if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
        // A better error was raised: drop the stashed one.
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        // Keep (or re-establish) the original exception state.
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
- // PARSER ERRORS
// Raise `errtype` with a printf-style message, locating the error at the
// current mark (when use_mark is nonzero) or at the last token consumed.
// Always returns NULL so callers can `return _PyPegen_raise_error(...);`.
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    // No tokens were read at all: report at the very start of the input.
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }
    // When reporting at the mark, make sure the token at the mark has
    // actually been fetched; fetching can itself fail.
    if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
        p->error_indicator = 1;
        return NULL;
    }
    Token *t = p->known_err_token != NULL
                   ? p->known_err_token
                   : p->tokens[use_mark ? p->mark : p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        // The token carries no column info: derive one from the
        // tokenizer's current buffer position instead.
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            // NOTE(review): the condition tests `buf` but selects
            // `line_start`; presumably it should be guarding that
            // `line_start` is usable — confirm against upstream intent.
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        // Internal columns are 0-based; SyntaxError columns are 1-based.
        col_offset = t->col_offset + 1;
    }
    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }
    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);
    return NULL;
}
// Return source line `lineno` (1-based) as a new str object, read from the
// tokenizer's in-memory buffers (interactive buffer or the source string).
// Returns a new reference, or NULL with an exception set on failure.
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    // The buffer may only hold the current statement, so translate the
    // absolute line number into one relative to the buffer's first line.
    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
    if (buf_end < cur_line) {
        // Fall back to the NUL terminator when the recorded end is stale.
        buf_end = cur_line + strlen(cur_line);
    }

    // Walk forward to the requested line, one '\n' at a time.
    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds but the conditional that
        // follows is there so in release builds we do not crash at the cost
        // to report a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    // Decode with "replace" so undecodable bytes cannot make error
    // reporting itself fail.
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
// Raise `errtype` at an explicit (lineno, col_offset)-(end_lineno,
// end_col_offset) range, formatting `errmsg` with `va`. The exception value
// is the (msg, (filename, lineno, col, text, end_lineno, end_col)) tuple
// that SyntaxError expects. Always returns NULL.
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    // Bail out if we already have an error set.
    if (p->error_indicator && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    // CURRENT_POS means "wherever the tokenizer currently is".
    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    // Locate the offending source line, trying the cheapest source first:
    // interactive buffers, then re-reading the file.
    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            // The requested line is still in the tokenizer's current buffer.
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    // Convert byte offsets (tokenizer) into character offsets (SyntaxError).
    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
    if (col_number < 0) {
        goto error;
    }

    if (end_col_offset > 0) {
        end_col_number = _PyPegen_byte_offset_to_character_offset(error_line, end_col_offset);
        if (end_col_number < 0) {
            goto error;
        }
    }

    // "N" steals the reference to error_line (per the Py_BuildValue docs),
    // so it must not be decref'd again on the success path.
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    return NULL;
}
// Choose and set the SyntaxError (or subclass) that best describes the
// parser failure. `last_token` is the last token consumed on the first
// parsing pass.
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors to custom syntax errors raised
        // on the second phase only if the errors come from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)
    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}
// Report that the parser exhausted its recursion/stack budget: mark the
// parser as failed and set a MemoryError describing the condition.
void
_Pypegen_stack_overflow(Parser *p)
{
    p->error_indicator = 1;
    PyErr_SetString(PyExc_MemoryError,
        "Parser stack overflowed - Python source too complex to parse");
}
|