pegen.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044
  1. #include <Python.h>
  2. #include "pycore_ast.h" // _PyAST_Validate(),
  3. #include "pycore_pystate.h" // _PyThreadState_GET()
  4. #include <errcode.h>
  5. #include "tokenizer.h"
  6. #include "pegen.h"
  7. // Internal parser functions
  8. asdl_stmt_seq*
  9. _PyPegen_interactive_exit(Parser *p)
  10. {
  11. if (p->errcode) {
  12. *(p->errcode) = E_EOF;
  13. }
  14. return NULL;
  15. }
  16. Py_ssize_t
  17. _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
  18. {
  19. const char *data = PyUnicode_AsUTF8(line);
  20. Py_ssize_t len = 0;
  21. while (col_offset < end_col_offset) {
  22. Py_UCS4 ch = data[col_offset];
  23. if (ch < 0x80) {
  24. col_offset += 1;
  25. } else if ((ch & 0xe0) == 0xc0) {
  26. col_offset += 2;
  27. } else if ((ch & 0xf0) == 0xe0) {
  28. col_offset += 3;
  29. } else if ((ch & 0xf8) == 0xf0) {
  30. col_offset += 4;
  31. } else {
  32. PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
  33. return -1;
  34. }
  35. len++;
  36. }
  37. return len;
  38. }
// Convert a byte offset into the UTF-8 string `str` to a character
// offset, by decoding the byte prefix and measuring its length.
// Returns -1 with an exception set on (memory) error.
Py_ssize_t
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
{
    Py_ssize_t len = strlen(str);
    if (col_offset > len + 1) {
        // Clamp: offsets may legitimately point just past the end.
        col_offset = len + 1;
    }
    assert(col_offset >= 0);
    // "replace" substitutes U+FFFD for malformed bytes, so decoding
    // cannot fail on bad UTF-8 — only on memory exhaustion.
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return -1;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    return size;
}
  55. // Calculate the extra amount of width space the given source
  56. // code segment might take if it were to be displayed on a fixed
  57. // width output device. Supports wide unicode characters and emojis.
  58. Py_ssize_t
  59. _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
  60. {
  61. PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
  62. if (!segment) {
  63. return -1;
  64. }
  65. // Fast track for ascii strings
  66. if (PyUnicode_IS_ASCII(segment)) {
  67. Py_DECREF(segment);
  68. return character_offset;
  69. }
  70. PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
  71. if (!width_fn) {
  72. return -1;
  73. }
  74. Py_ssize_t width = 0;
  75. Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
  76. for (Py_ssize_t i = 0; i < len; i++) {
  77. PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
  78. if (!chr) {
  79. Py_DECREF(segment);
  80. Py_DECREF(width_fn);
  81. return -1;
  82. }
  83. PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
  84. Py_DECREF(chr);
  85. if (!width_specifier) {
  86. Py_DECREF(segment);
  87. Py_DECREF(width_fn);
  88. return -1;
  89. }
  90. if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
  91. _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
  92. width += 2;
  93. }
  94. else {
  95. width += 1;
  96. }
  97. Py_DECREF(width_specifier);
  98. }
  99. Py_DECREF(segment);
  100. Py_DECREF(width_fn);
  101. return width;
  102. }
  103. Py_ssize_t
  104. _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
  105. {
  106. const char *str = PyUnicode_AsUTF8(line);
  107. if (!str) {
  108. return -1;
  109. }
  110. return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
  111. }
// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
// Returns 0 on success, -1 on memory error.
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    m->mark = p->mark;  // position after the memoized parse
    m->next = p->tokens[mark]->memo;  // prepend to this token's memo list
    p->tokens[mark]->memo = m;
    return 0;
}
  129. // Like _PyPegen_insert_memo(), but updates an existing node if found.
  130. int
  131. _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
  132. {
  133. for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
  134. if (m->type == type) {
  135. // Update existing node.
  136. m->node = node;
  137. m->mark = p->mark;
  138. return 0;
  139. }
  140. }
  141. // Insert new node.
  142. return _PyPegen_insert_memo(p, mark, type, node);
  143. }
  144. static int
  145. init_normalization(Parser *p)
  146. {
  147. if (p->normalize) {
  148. return 1;
  149. }
  150. p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
  151. if (!p->normalize)
  152. {
  153. return 0;
  154. }
  155. return 1;
  156. }
  157. static int
  158. growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
  159. assert(initial_size > 0);
  160. arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
  161. arr->size = initial_size;
  162. arr->num_items = 0;
  163. return arr->items != NULL;
  164. }
  165. static int
  166. growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
  167. if (arr->num_items >= arr->size) {
  168. size_t new_size = arr->size * 2;
  169. void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
  170. if (!new_items_array) {
  171. return 0;
  172. }
  173. arr->items = new_items_array;
  174. arr->size = new_size;
  175. }
  176. arr->items[arr->num_items].lineno = lineno;
  177. arr->items[arr->num_items].comment = comment; // Take ownership
  178. arr->num_items++;
  179. return 1;
  180. }
  181. static void
  182. growable_comment_array_deallocate(growable_comment_array *arr) {
  183. for (unsigned i = 0; i < arr->num_items; i++) {
  184. PyMem_Free(arr->items[i].comment);
  185. }
  186. PyMem_Free(arr->items);
  187. }
// Decide whether a NAME token is actually a reserved keyword.
// Keywords are bucketed by length in p->keywords; returns the keyword's
// token type, or NAME when the text matches no keyword.
static int
_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
    int name_len = new_token->end_col_offset - new_token->col_offset;
    assert(name_len > 0);
    // No keyword bucket for this length -> plain NAME.
    if (name_len >= p->n_keyword_lists ||
        p->keywords[name_len] == NULL ||
        p->keywords[name_len]->type == -1) {
        return NAME;
    }
    // Each bucket is terminated by a sentinel entry with type == -1.
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
        // Same length guaranteed by the bucket, so strncmp suffices.
        if (strncmp(k->str, new_token->start, name_len) == 0) {
            return k->type;
        }
    }
    return NAME;
}
  205. static int
  206. initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
  207. assert(parser_token != NULL);
  208. parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
  209. parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
  210. if (parser_token->bytes == NULL) {
  211. return -1;
  212. }
  213. if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
  214. Py_DECREF(parser_token->bytes);
  215. return -1;
  216. }
  217. parser_token->metadata = NULL;
  218. if (new_token->metadata != NULL) {
  219. if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
  220. Py_DECREF(parser_token->metadata);
  221. return -1;
  222. }
  223. parser_token->metadata = new_token->metadata;
  224. new_token->metadata = NULL;
  225. }
  226. parser_token->level = new_token->level;
  227. parser_token->lineno = new_token->lineno;
  228. parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
  229. : new_token->col_offset;
  230. parser_token->end_lineno = new_token->end_lineno;
  231. parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
  232. : new_token->end_col_offset;
  233. p->fill += 1;
  234. if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
  235. return _Pypegen_raise_decode_error(p);
  236. }
  237. return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
  238. }
  239. static int
  240. _resize_tokens_array(Parser *p) {
  241. int newsize = p->size * 2;
  242. Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
  243. if (new_tokens == NULL) {
  244. PyErr_NoMemory();
  245. return -1;
  246. }
  247. p->tokens = new_tokens;
  248. for (int i = p->size; i < newsize; i++) {
  249. p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
  250. if (p->tokens[i] == NULL) {
  251. p->size = i; // Needed, in order to cleanup correctly after parser fails
  252. PyErr_NoMemory();
  253. return -1;
  254. }
  255. }
  256. p->size = newsize;
  257. return 0;
  258. }
// Pull the next token from the tokenizer into p->tokens[p->fill].
// Returns 0 on success, -1 on error with an exception set.
int
_PyPegen_fill_token(Parser *p)
{
    struct token new_token;
    _PyToken_Init(&new_token);
    int type = _PyTokenizer_Get(p->tok, &new_token);
    // Record and skip '# type: ignore' comments
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            PyErr_NoMemory();
            goto error;
        }
        strncpy(tag, new_token.start, len);
        tag[len] = '\0';  // strncpy does not terminate; do it explicitly
        // Ownership of tag passes to the growable array
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
            PyErr_NoMemory();
            goto error;
        }
        type = _PyTokenizer_Get(p->tok, &new_token);
    }
    // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;
        // Schedule the pending DEDENT tokens so the tokenizer's indent
        // stack unwinds cleanly, unless the caller opted out.
        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
            p->tok->pendin = -p->tok->indent;
            p->tok->indent = 0;
        }
    }
    else {
        p->parsing_started = 1;
    }
    // Check if we are at the limit of the token array capacity and resize if needed
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
        goto error;
    }
    Token *t = p->tokens[p->fill];
    // initialize_token advances p->fill and reports tokenizer errors.
    return initialize_token(p, t, &new_token, type);
error:
    _PyToken_Free(&new_token);
    return -1;
}
  304. #if defined(Py_DEBUG)
  305. // Instrumentation to count the effectiveness of memoization.
  306. // The array counts the number of tokens skipped by memoization,
  307. // indexed by type.
  308. #define NSTATISTICS _PYPEGEN_NSTATISTICS
  309. #define memo_statistics _PyRuntime.parser.memo_statistics
  310. void
  311. _PyPegen_clear_memo_statistics(void)
  312. {
  313. for (int i = 0; i < NSTATISTICS; i++) {
  314. memo_statistics[i] = 0;
  315. }
  316. }
  317. PyObject *
  318. _PyPegen_get_memo_statistics(void)
  319. {
  320. PyObject *ret = PyList_New(NSTATISTICS);
  321. if (ret == NULL) {
  322. return NULL;
  323. }
  324. for (int i = 0; i < NSTATISTICS; i++) {
  325. PyObject *value = PyLong_FromLong(memo_statistics[i]);
  326. if (value == NULL) {
  327. Py_DECREF(ret);
  328. return NULL;
  329. }
  330. // PyList_SetItem borrows a reference to value.
  331. if (PyList_SetItem(ret, i, value) < 0) {
  332. Py_DECREF(ret);
  333. return NULL;
  334. }
  335. }
  336. return ret;
  337. }
  338. #endif
// Check whether rule `type` was already tried at the current position.
// On a hit, restores the memoized end position and result into *pres and
// returns 1. Returns 0 on a miss, -1 on error (error_indicator set).
int // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    // Make sure the token at the current mark has been fetched.
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }
    Token *t = p->tokens[p->mark];
    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                memo_statistics[type] += count;
            }
#endif
            // Jump to the memoized end position and hand back the node.
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}
  368. int
  369. _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
  370. {
  371. int mark = p->mark;
  372. void *res = func(p);
  373. p->mark = mark;
  374. return (res != NULL) == positive;
  375. }
  376. int
  377. _PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
  378. {
  379. int mark = p->mark;
  380. void *res = func(p, arg);
  381. p->mark = mark;
  382. return (res != NULL) == positive;
  383. }
  384. int
  385. _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
  386. {
  387. int mark = p->mark;
  388. void *res = func(p, arg);
  389. p->mark = mark;
  390. return (res != NULL) == positive;
  391. }
  392. int
  393. _PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
  394. {
  395. int mark = p->mark;
  396. void *res = (void*)func(p);
  397. p->mark = mark;
  398. return (res != NULL) == positive;
  399. }
  400. Token *
  401. _PyPegen_expect_token(Parser *p, int type)
  402. {
  403. if (p->mark == p->fill) {
  404. if (_PyPegen_fill_token(p) < 0) {
  405. p->error_indicator = 1;
  406. return NULL;
  407. }
  408. }
  409. Token *t = p->tokens[p->mark];
  410. if (t->type != type) {
  411. return NULL;
  412. }
  413. p->mark += 1;
  414. return t;
  415. }
  416. void*
  417. _PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
  418. if (p->error_indicator == 1) {
  419. return NULL;
  420. }
  421. if (result == NULL) {
  422. RAISE_SYNTAX_ERROR("expected (%s)", expected);
  423. return NULL;
  424. }
  425. return result;
  426. }
// Like _PyPegen_expect_token(), but a mismatch is a hard syntax error
// ("expected '<expected>'") rather than a silent alternative failure.
Token *
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
    if (p->error_indicator == 1) {
        return NULL;
    }
    // Make sure the token at the current mark has been fetched.
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        // Report the error at the offending token's location.
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
        return NULL;
    }
    p->mark += 1;
    return t;
}
  446. expr_ty
  447. _PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
  448. {
  449. if (p->mark == p->fill) {
  450. if (_PyPegen_fill_token(p) < 0) {
  451. p->error_indicator = 1;
  452. return NULL;
  453. }
  454. }
  455. Token *t = p->tokens[p->mark];
  456. if (t->type != NAME) {
  457. return NULL;
  458. }
  459. const char *s = PyBytes_AsString(t->bytes);
  460. if (!s) {
  461. p->error_indicator = 1;
  462. return NULL;
  463. }
  464. if (strcmp(s, keyword) != 0) {
  465. return NULL;
  466. }
  467. return _PyPegen_name_token(p);
  468. }
// Scan backwards from the current position for the most recent token
// that is not whitespace-like (NEWLINE..DEDENT) and not ENDMARKER.
// Returns NULL only if no token before p->mark qualifies.
Token *
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
{
    assert(p->mark >= 0);
    Token *token = NULL;
    for (int m = p->mark - 1; m >= 0; m--) {
        token = p->tokens[m];
        // NEWLINE..DEDENT is a contiguous range of whitespace-ish types.
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
            break;
        }
    }
    return token;
}
// Create an interned, arena-tracked identifier string from UTF-8 text,
// normalizing non-ASCII identifiers to NFKC as required by PEP 3131.
// Returns NULL and sets p->error_indicator on failure.
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        // Call unicodedata.normalize("NFKC", id).
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        // Guard against a broken/hooked normalize() returning a non-str.
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyUnicode_InternInPlace(&id);
    // The arena takes ownership; the node keeps a borrowed reference.
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;
error:
    p->error_indicator = 1;
    return NULL;
}
  536. static expr_ty
  537. _PyPegen_name_from_token(Parser *p, Token* t)
  538. {
  539. if (t == NULL) {
  540. return NULL;
  541. }
  542. const char *s = PyBytes_AsString(t->bytes);
  543. if (!s) {
  544. p->error_indicator = 1;
  545. return NULL;
  546. }
  547. PyObject *id = _PyPegen_new_identifier(p, s);
  548. if (id == NULL) {
  549. p->error_indicator = 1;
  550. return NULL;
  551. }
  552. return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
  553. t->end_col_offset, p->arena);
  554. }
  555. expr_ty
  556. _PyPegen_name_token(Parser *p)
  557. {
  558. Token *t = _PyPegen_expect_token(p, NAME);
  559. return _PyPegen_name_from_token(p, t);
  560. }
  561. void *
  562. _PyPegen_string_token(Parser *p)
  563. {
  564. return _PyPegen_expect_token(p, STRING);
  565. }
  566. expr_ty _PyPegen_soft_keyword_token(Parser *p) {
  567. Token *t = _PyPegen_expect_token(p, NAME);
  568. if (t == NULL) {
  569. return NULL;
  570. }
  571. char *the_token;
  572. Py_ssize_t size;
  573. PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
  574. for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
  575. if (strncmp(*keyword, the_token, size) == 0) {
  576. return _PyPegen_name_from_token(p, t);
  577. }
  578. }
  579. return NULL;
  580. }
// Parse a numeric literal (no underscores) into an int, float or complex
// object. Returns NULL with an exception set on failure.
static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;
    assert(s != NULL);
    errno = 0;
    // An imaginary literal ends in 'j' or 'J'.
    end = s + strlen(s) - 1;
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        // Leading zero: binary/octal/hex (or plain 0); parse unsigned.
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            // Value overflowed into the sign bit: fall back to big int.
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        // Whole string consumed as an integer.
        if (errno != 0) {
            // Out of range for long: fall back to big int.
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
  623. static PyObject *
  624. parsenumber(const char *s)
  625. {
  626. char *dup;
  627. char *end;
  628. PyObject *res = NULL;
  629. assert(s != NULL);
  630. if (strchr(s, '_') == NULL) {
  631. return parsenumber_raw(s);
  632. }
  633. /* Create a duplicate without underscores. */
  634. dup = PyMem_Malloc(strlen(s) + 1);
  635. if (dup == NULL) {
  636. return PyErr_NoMemory();
  637. }
  638. end = dup;
  639. for (; *s; s++) {
  640. if (*s != '_') {
  641. *end++ = *s;
  642. }
  643. }
  644. *end = '\0';
  645. res = parsenumber_raw(dup);
  646. PyMem_Free(dup);
  647. return res;
  648. }
// Consume a NUMBER token and build a Constant node holding its value.
// Returns NULL (with error_indicator set) on failure.
expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }
    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    // Underscore separators require feature_version >= 3.6.
    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }
    PyObject *c = parsenumber(num_raw);
    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            // Re-raise the int-conversion limit error as a SyntaxError at
            // the literal's location.
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }
    // The arena takes ownership of the constant's value.
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}
  698. /* Check that the source for a single input statement really is a single
  699. statement by looking at what is left in the buffer after parsing.
  700. Trailing whitespace and comments are OK. */
  701. static int // bool
  702. bad_single_statement(Parser *p)
  703. {
  704. char *cur = p->tok->cur;
  705. char c = *cur;
  706. for (;;) {
  707. while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
  708. c = *++cur;
  709. }
  710. if (!c) {
  711. return 0;
  712. }
  713. if (c != '#') {
  714. return 1;
  715. }
  716. /* Suck up comment. */
  717. while (c && c != '\n') {
  718. c = *++cur;
  719. }
  720. }
  721. }
  722. static int
  723. compute_parser_flags(PyCompilerFlags *flags)
  724. {
  725. int parser_flags = 0;
  726. if (!flags) {
  727. return 0;
  728. }
  729. if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
  730. parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
  731. }
  732. if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
  733. parser_flags |= PyPARSE_IGNORE_COOKIE;
  734. }
  735. if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
  736. parser_flags |= PyPARSE_BARRY_AS_BDFL;
  737. }
  738. if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
  739. parser_flags |= PyPARSE_TYPE_COMMENTS;
  740. }
  741. if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
  742. parser_flags |= PyPARSE_ASYNC_HACKS;
  743. }
  744. if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
  745. parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
  746. }
  747. return parser_flags;
  748. }
// Parser API

// Allocate and initialize a Parser bound to an existing tokenizer state.
// The caller retains ownership of `tok` and `arena`; free the result with
// _PyPegen_Parser_Free(). Returns NULL with MemoryError set on failure.
Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    // Propagate the relevant parse flags into the tokenizer.
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    // Token array starts with capacity 1 and doubles on demand.
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->mark = 0;
    p->fill = 0;
    p->size = 1;
    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;
    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
#ifdef Py_DEBUG
    p->debug = _Py_GetConfig()->parser_debug;
#endif
    return p;
}
  803. void
  804. _PyPegen_Parser_Free(Parser *p)
  805. {
  806. Py_XDECREF(p->normalize);
  807. for (int i = 0; i < p->size; i++) {
  808. PyMem_Free(p->tokens[i]);
  809. }
  810. PyMem_Free(p->tokens);
  811. growable_comment_array_deallocate(&p->type_ignore_comments);
  812. PyMem_Free(p);
  813. }
// Rewind the parser for the second (diagnostic) pass: clear all memo
// caches, go back to the first token and enable the invalid_* rules.
static void
reset_parser_state_for_error_pass(Parser *p)
{
    // Memos are arena-allocated; dropping the list heads is enough.
    for (int i = 0; i < p->fill; i++) {
        p->tokens[i]->memo = NULL;
    }
    p->mark = 0;
    p->call_invalid_rules = 1;
    // Don't try to get extra tokens in interactive mode when trying to
    // raise specialized errors in the second pass.
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
}
  826. static inline int
  827. _is_end_of_source(Parser *p) {
  828. int err = p->tok->done;
  829. return err == E_EOF || err == E_EOFS || err == E_EOLS;
  830. }
// Drive the parser to completion. On failure, runs a second pass with the
// invalid_* rules enabled to produce a precise SyntaxError. Returns the
// parse result (arena-owned) or NULL with an exception set.
void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // Incomplete input (e.g. REPL continuation) is reported specially.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        // Non-syntax errors (e.g. MemoryError) propagate unchanged.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);
        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }
    // 'single' input must not contain more than one statement.
    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }
    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
// Parse source read from a FILE*. Creates a tokenizer for the stream,
// runs the parser and returns the module AST (arena-owned), or NULL with
// an exception set. `ps1`/`ps2` non-NULL marks interactive input.
mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode, PyArena *arena)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
            return NULL;
        }
        return NULL;
    }
    // Treat prompts or stdin as interactive so error handling matches.
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);
    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;
    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }
    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);
error:
    _PyTokenizer_Free(tok);
    return result;
}
// Parse source held in a C string. Creates a tokenizer for the buffer,
// runs the parser and returns the module AST (arena-owned), or NULL with
// an exception set.
mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    // file input ("exec" mode) enables statement-level tokenization rules.
    int exec_input = start_rule == Py_file_input;
    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        // Caller guarantees UTF-8; skip encoding-cookie detection.
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input, 0);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);
    // We need to clear up from here on
    mod_ty result = NULL;
    int parser_flags = compute_parser_flags(flags);
    // AST-only compilation may target an older feature version.
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }
    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);
error:
    _PyTokenizer_Free(tok);
    return result;
}