/* pegen.c — PEG parser support routines for CPython. */
  1. #include <Python.h>
  2. #include "pycore_ast.h" // _PyAST_Validate(),
  3. #include "pycore_pystate.h" // _PyThreadState_GET()
  4. #include <errcode.h>
  5. #include "tokenizer.h"
  6. #include "pegen.h"
  7. // Internal parser functions
  8. asdl_stmt_seq*
  9. _PyPegen_interactive_exit(Parser *p)
  10. {
  11. if (p->errcode) {
  12. *(p->errcode) = E_EOF;
  13. }
  14. return NULL;
  15. }
  16. Py_ssize_t
  17. _PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
  18. {
  19. Py_ssize_t len = strlen(str);
  20. if (col_offset > len + 1) {
  21. col_offset = len + 1;
  22. }
  23. assert(col_offset >= 0);
  24. PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
  25. if (!text) {
  26. return -1;
  27. }
  28. Py_ssize_t size = PyUnicode_GET_LENGTH(text);
  29. Py_DECREF(text);
  30. return size;
  31. }
  32. // Calculate the extra amount of width space the given source
  33. // code segment might take if it were to be displayed on a fixed
  34. // width output device. Supports wide unicode characters and emojis.
  35. Py_ssize_t
  36. _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
  37. {
  38. PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
  39. if (!segment) {
  40. return -1;
  41. }
  42. // Fast track for ascii strings
  43. if (PyUnicode_IS_ASCII(segment)) {
  44. Py_DECREF(segment);
  45. return character_offset;
  46. }
  47. PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
  48. if (!width_fn) {
  49. return -1;
  50. }
  51. Py_ssize_t width = 0;
  52. Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
  53. for (Py_ssize_t i = 0; i < len; i++) {
  54. PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
  55. if (!chr) {
  56. Py_DECREF(segment);
  57. Py_DECREF(width_fn);
  58. return -1;
  59. }
  60. PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
  61. Py_DECREF(chr);
  62. if (!width_specifier) {
  63. Py_DECREF(segment);
  64. Py_DECREF(width_fn);
  65. return -1;
  66. }
  67. if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
  68. _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
  69. width += 2;
  70. }
  71. else {
  72. width += 1;
  73. }
  74. Py_DECREF(width_specifier);
  75. }
  76. Py_DECREF(segment);
  77. Py_DECREF(width_fn);
  78. return width;
  79. }
  80. Py_ssize_t
  81. _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
  82. {
  83. const char *str = PyUnicode_AsUTF8(line);
  84. if (!str) {
  85. return -1;
  86. }
  87. return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
  88. }
  89. // Here, mark is the start of the node, while p->mark is the end.
  90. // If node==NULL, they should be the same.
  91. int
  92. _PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
  93. {
  94. // Insert in front
  95. Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
  96. if (m == NULL) {
  97. return -1;
  98. }
  99. m->type = type;
  100. m->node = node;
  101. m->mark = p->mark;
  102. m->next = p->tokens[mark]->memo;
  103. p->tokens[mark]->memo = m;
  104. return 0;
  105. }
  106. // Like _PyPegen_insert_memo(), but updates an existing node if found.
  107. int
  108. _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
  109. {
  110. for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
  111. if (m->type == type) {
  112. // Update existing node.
  113. m->node = node;
  114. m->mark = p->mark;
  115. return 0;
  116. }
  117. }
  118. // Insert new node.
  119. return _PyPegen_insert_memo(p, mark, type, node);
  120. }
  121. static int
  122. init_normalization(Parser *p)
  123. {
  124. if (p->normalize) {
  125. return 1;
  126. }
  127. p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
  128. if (!p->normalize)
  129. {
  130. return 0;
  131. }
  132. return 1;
  133. }
  134. static int
  135. growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
  136. assert(initial_size > 0);
  137. arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
  138. arr->size = initial_size;
  139. arr->num_items = 0;
  140. return arr->items != NULL;
  141. }
  142. static int
  143. growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
  144. if (arr->num_items >= arr->size) {
  145. size_t new_size = arr->size * 2;
  146. void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
  147. if (!new_items_array) {
  148. return 0;
  149. }
  150. arr->items = new_items_array;
  151. arr->size = new_size;
  152. }
  153. arr->items[arr->num_items].lineno = lineno;
  154. arr->items[arr->num_items].comment = comment; // Take ownership
  155. arr->num_items++;
  156. return 1;
  157. }
  158. static void
  159. growable_comment_array_deallocate(growable_comment_array *arr) {
  160. for (unsigned i = 0; i < arr->num_items; i++) {
  161. PyMem_Free(arr->items[i].comment);
  162. }
  163. PyMem_Free(arr->items);
  164. }
  165. static int
  166. _get_keyword_or_name_type(Parser *p, struct token *new_token)
  167. {
  168. int name_len = new_token->end_col_offset - new_token->col_offset;
  169. assert(name_len > 0);
  170. if (name_len >= p->n_keyword_lists ||
  171. p->keywords[name_len] == NULL ||
  172. p->keywords[name_len]->type == -1) {
  173. return NAME;
  174. }
  175. for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
  176. if (strncmp(k->str, new_token->start, name_len) == 0) {
  177. return k->type;
  178. }
  179. }
  180. return NAME;
  181. }
  182. static int
  183. initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
  184. assert(parser_token != NULL);
  185. parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
  186. parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
  187. if (parser_token->bytes == NULL) {
  188. return -1;
  189. }
  190. if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
  191. Py_DECREF(parser_token->bytes);
  192. return -1;
  193. }
  194. parser_token->metadata = NULL;
  195. if (new_token->metadata != NULL) {
  196. if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
  197. Py_DECREF(parser_token->metadata);
  198. return -1;
  199. }
  200. parser_token->metadata = new_token->metadata;
  201. new_token->metadata = NULL;
  202. }
  203. parser_token->level = new_token->level;
  204. parser_token->lineno = new_token->lineno;
  205. parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
  206. : new_token->col_offset;
  207. parser_token->end_lineno = new_token->end_lineno;
  208. parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
  209. : new_token->end_col_offset;
  210. p->fill += 1;
  211. if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
  212. return _Pypegen_raise_decode_error(p);
  213. }
  214. return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
  215. }
  216. static int
  217. _resize_tokens_array(Parser *p) {
  218. int newsize = p->size * 2;
  219. Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
  220. if (new_tokens == NULL) {
  221. PyErr_NoMemory();
  222. return -1;
  223. }
  224. p->tokens = new_tokens;
  225. for (int i = p->size; i < newsize; i++) {
  226. p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
  227. if (p->tokens[i] == NULL) {
  228. p->size = i; // Needed, in order to cleanup correctly after parser fails
  229. PyErr_NoMemory();
  230. return -1;
  231. }
  232. }
  233. p->size = newsize;
  234. return 0;
  235. }
  236. int
  237. _PyPegen_fill_token(Parser *p)
  238. {
  239. struct token new_token;
  240. _PyToken_Init(&new_token);
  241. int type = _PyTokenizer_Get(p->tok, &new_token);
  242. // Record and skip '# type: ignore' comments
  243. while (type == TYPE_IGNORE) {
  244. Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
  245. char *tag = PyMem_Malloc(len + 1);
  246. if (tag == NULL) {
  247. PyErr_NoMemory();
  248. goto error;
  249. }
  250. strncpy(tag, new_token.start, len);
  251. tag[len] = '\0';
  252. // Ownership of tag passes to the growable array
  253. if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
  254. PyErr_NoMemory();
  255. goto error;
  256. }
  257. type = _PyTokenizer_Get(p->tok, &new_token);
  258. }
  259. // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
  260. if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
  261. type = NEWLINE; /* Add an extra newline */
  262. p->parsing_started = 0;
  263. if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
  264. p->tok->pendin = -p->tok->indent;
  265. p->tok->indent = 0;
  266. }
  267. }
  268. else {
  269. p->parsing_started = 1;
  270. }
  271. // Check if we are at the limit of the token array capacity and resize if needed
  272. if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
  273. goto error;
  274. }
  275. Token *t = p->tokens[p->fill];
  276. return initialize_token(p, t, &new_token, type);
  277. error:
  278. _PyToken_Free(&new_token);
  279. return -1;
  280. }
  281. #if defined(Py_DEBUG)
  282. // Instrumentation to count the effectiveness of memoization.
  283. // The array counts the number of tokens skipped by memoization,
  284. // indexed by type.
  285. #define NSTATISTICS _PYPEGEN_NSTATISTICS
  286. #define memo_statistics _PyRuntime.parser.memo_statistics
  287. void
  288. _PyPegen_clear_memo_statistics(void)
  289. {
  290. for (int i = 0; i < NSTATISTICS; i++) {
  291. memo_statistics[i] = 0;
  292. }
  293. }
  294. PyObject *
  295. _PyPegen_get_memo_statistics(void)
  296. {
  297. PyObject *ret = PyList_New(NSTATISTICS);
  298. if (ret == NULL) {
  299. return NULL;
  300. }
  301. for (int i = 0; i < NSTATISTICS; i++) {
  302. PyObject *value = PyLong_FromLong(memo_statistics[i]);
  303. if (value == NULL) {
  304. Py_DECREF(ret);
  305. return NULL;
  306. }
  307. // PyList_SetItem borrows a reference to value.
  308. if (PyList_SetItem(ret, i, value) < 0) {
  309. Py_DECREF(ret);
  310. return NULL;
  311. }
  312. }
  313. return ret;
  314. }
  315. #endif
  316. int // bool
  317. _PyPegen_is_memoized(Parser *p, int type, void *pres)
  318. {
  319. if (p->mark == p->fill) {
  320. if (_PyPegen_fill_token(p) < 0) {
  321. p->error_indicator = 1;
  322. return -1;
  323. }
  324. }
  325. Token *t = p->tokens[p->mark];
  326. for (Memo *m = t->memo; m != NULL; m = m->next) {
  327. if (m->type == type) {
  328. #if defined(PY_DEBUG)
  329. if (0 <= type && type < NSTATISTICS) {
  330. long count = m->mark - p->mark;
  331. // A memoized negative result counts for one.
  332. if (count <= 0) {
  333. count = 1;
  334. }
  335. memo_statistics[type] += count;
  336. }
  337. #endif
  338. p->mark = m->mark;
  339. *(void **)(pres) = m->node;
  340. return 1;
  341. }
  342. }
  343. return 0;
  344. }
  345. int
  346. _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
  347. {
  348. int mark = p->mark;
  349. void *res = func(p);
  350. p->mark = mark;
  351. return (res != NULL) == positive;
  352. }
  353. int
  354. _PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
  355. {
  356. int mark = p->mark;
  357. void *res = func(p, arg);
  358. p->mark = mark;
  359. return (res != NULL) == positive;
  360. }
  361. int
  362. _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
  363. {
  364. int mark = p->mark;
  365. void *res = func(p, arg);
  366. p->mark = mark;
  367. return (res != NULL) == positive;
  368. }
  369. int
  370. _PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
  371. {
  372. int mark = p->mark;
  373. void *res = (void*)func(p);
  374. p->mark = mark;
  375. return (res != NULL) == positive;
  376. }
  377. Token *
  378. _PyPegen_expect_token(Parser *p, int type)
  379. {
  380. if (p->mark == p->fill) {
  381. if (_PyPegen_fill_token(p) < 0) {
  382. p->error_indicator = 1;
  383. return NULL;
  384. }
  385. }
  386. Token *t = p->tokens[p->mark];
  387. if (t->type != type) {
  388. return NULL;
  389. }
  390. p->mark += 1;
  391. return t;
  392. }
  393. void*
  394. _PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
  395. if (p->error_indicator == 1) {
  396. return NULL;
  397. }
  398. if (result == NULL) {
  399. RAISE_SYNTAX_ERROR("expected (%s)", expected);
  400. return NULL;
  401. }
  402. return result;
  403. }
  404. Token *
  405. _PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
  406. if (p->error_indicator == 1) {
  407. return NULL;
  408. }
  409. if (p->mark == p->fill) {
  410. if (_PyPegen_fill_token(p) < 0) {
  411. p->error_indicator = 1;
  412. return NULL;
  413. }
  414. }
  415. Token *t = p->tokens[p->mark];
  416. if (t->type != type) {
  417. RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
  418. return NULL;
  419. }
  420. p->mark += 1;
  421. return t;
  422. }
  423. expr_ty
  424. _PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
  425. {
  426. if (p->mark == p->fill) {
  427. if (_PyPegen_fill_token(p) < 0) {
  428. p->error_indicator = 1;
  429. return NULL;
  430. }
  431. }
  432. Token *t = p->tokens[p->mark];
  433. if (t->type != NAME) {
  434. return NULL;
  435. }
  436. const char *s = PyBytes_AsString(t->bytes);
  437. if (!s) {
  438. p->error_indicator = 1;
  439. return NULL;
  440. }
  441. if (strcmp(s, keyword) != 0) {
  442. return NULL;
  443. }
  444. return _PyPegen_name_token(p);
  445. }
  446. Token *
  447. _PyPegen_get_last_nonnwhitespace_token(Parser *p)
  448. {
  449. assert(p->mark >= 0);
  450. Token *token = NULL;
  451. for (int m = p->mark - 1; m >= 0; m--) {
  452. token = p->tokens[m];
  453. if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
  454. break;
  455. }
  456. }
  457. return token;
  458. }
// Create an interned identifier object from the UTF-8 text `n`, applying
// NFKC normalization when it contains non-ASCII characters (as the
// language requires for identifiers), and register the result with the
// parser's arena (which then owns it for the lifetime of the parse).
// On any failure sets p->error_indicator and returns NULL.
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        PyObject *id2;
        // Lazily import unicodedata.normalize into p->normalize.
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        id2 = _PyObject_FastCall(p->normalize, args, 2);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }
        // unicodedata.normalize() is Python code; guard against a
        // non-str result before adopting it as the identifier.
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    // Intern and hand our (sole) reference over to the arena.
    PyUnicode_InternInPlace(&id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
  513. static expr_ty
  514. _PyPegen_name_from_token(Parser *p, Token* t)
  515. {
  516. if (t == NULL) {
  517. return NULL;
  518. }
  519. const char *s = PyBytes_AsString(t->bytes);
  520. if (!s) {
  521. p->error_indicator = 1;
  522. return NULL;
  523. }
  524. PyObject *id = _PyPegen_new_identifier(p, s);
  525. if (id == NULL) {
  526. p->error_indicator = 1;
  527. return NULL;
  528. }
  529. return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
  530. t->end_col_offset, p->arena);
  531. }
  532. expr_ty
  533. _PyPegen_name_token(Parser *p)
  534. {
  535. Token *t = _PyPegen_expect_token(p, NAME);
  536. return _PyPegen_name_from_token(p, t);
  537. }
  538. void *
  539. _PyPegen_string_token(Parser *p)
  540. {
  541. return _PyPegen_expect_token(p, STRING);
  542. }
  543. expr_ty _PyPegen_soft_keyword_token(Parser *p) {
  544. Token *t = _PyPegen_expect_token(p, NAME);
  545. if (t == NULL) {
  546. return NULL;
  547. }
  548. char *the_token;
  549. Py_ssize_t size;
  550. PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
  551. for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
  552. if (strncmp(*keyword, the_token, size) == 0) {
  553. return _PyPegen_name_from_token(p, t);
  554. }
  555. }
  556. return NULL;
  557. }
// Parse the text of a NUMBER token (already free of underscores) into a
// Python int, float, or complex object.  Returns a new reference, or NULL
// with an exception set.
static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    // A trailing 'j'/'J' marks an imaginary literal.
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        // Leading zero: binary/octal/hex prefix (or plain zero); parse as
        // unsigned so the full range of long's bits is usable.
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            // Wrapped into the sign bit without overflow being reported:
            // fall back to arbitrary-precision int.
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        // Entire token consumed as an integer.
        if (errno != 0) {
            // Out of range for long: use arbitrary precision.
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
  600. static PyObject *
  601. parsenumber(const char *s)
  602. {
  603. char *dup;
  604. char *end;
  605. PyObject *res = NULL;
  606. assert(s != NULL);
  607. if (strchr(s, '_') == NULL) {
  608. return parsenumber_raw(s);
  609. }
  610. /* Create a duplicate without underscores. */
  611. dup = PyMem_Malloc(strlen(s) + 1);
  612. if (dup == NULL) {
  613. return PyErr_NoMemory();
  614. }
  615. end = dup;
  616. for (; *s; s++) {
  617. if (*s != '_') {
  618. *end++ = *s;
  619. }
  620. }
  621. *end = '\0';
  622. res = parsenumber_raw(dup);
  623. PyMem_Free(dup);
  624. return res;
  625. }
// Parse the next NUMBER token into a Constant AST node.  Rejects
// underscore separators for feature_version < 6 (PEP 515 is 3.6+) and
// rewrites the int-conversion length-limit ValueError into a SyntaxError
// carrying a hint, since the literal itself is at fault.
expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }
    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }
    PyObject *c = parsenumber(num_raw);
    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }
    // The arena takes ownership of the constant for the AST's lifetime.
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}
  675. /* Check that the source for a single input statement really is a single
  676. statement by looking at what is left in the buffer after parsing.
  677. Trailing whitespace and comments are OK. */
  678. static int // bool
  679. bad_single_statement(Parser *p)
  680. {
  681. char *cur = p->tok->cur;
  682. char c = *cur;
  683. for (;;) {
  684. while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
  685. c = *++cur;
  686. }
  687. if (!c) {
  688. return 0;
  689. }
  690. if (c != '#') {
  691. return 1;
  692. }
  693. /* Suck up comment. */
  694. while (c && c != '\n') {
  695. c = *++cur;
  696. }
  697. }
  698. }
  699. static int
  700. compute_parser_flags(PyCompilerFlags *flags)
  701. {
  702. int parser_flags = 0;
  703. if (!flags) {
  704. return 0;
  705. }
  706. if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
  707. parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
  708. }
  709. if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
  710. parser_flags |= PyPARSE_IGNORE_COOKIE;
  711. }
  712. if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
  713. parser_flags |= PyPARSE_BARRY_AS_BDFL;
  714. }
  715. if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
  716. parser_flags |= PyPARSE_TYPE_COMMENTS;
  717. }
  718. if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
  719. parser_flags |= PyPARSE_ASYNC_HACKS;
  720. }
  721. if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
  722. parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
  723. }
  724. return parser_flags;
  725. }
  726. // Parser API
  727. Parser *
  728. _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
  729. int feature_version, int *errcode, PyArena *arena)
  730. {
  731. Parser *p = PyMem_Malloc(sizeof(Parser));
  732. if (p == NULL) {
  733. return (Parser *) PyErr_NoMemory();
  734. }
  735. assert(tok != NULL);
  736. tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
  737. tok->async_hacks = (flags & PyPARSE_ASYNC_HACKS) > 0;
  738. p->tok = tok;
  739. p->keywords = NULL;
  740. p->n_keyword_lists = -1;
  741. p->soft_keywords = NULL;
  742. p->tokens = PyMem_Malloc(sizeof(Token *));
  743. if (!p->tokens) {
  744. PyMem_Free(p);
  745. return (Parser *) PyErr_NoMemory();
  746. }
  747. p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
  748. if (!p->tokens[0]) {
  749. PyMem_Free(p->tokens);
  750. PyMem_Free(p);
  751. return (Parser *) PyErr_NoMemory();
  752. }
  753. if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
  754. PyMem_Free(p->tokens[0]);
  755. PyMem_Free(p->tokens);
  756. PyMem_Free(p);
  757. return (Parser *) PyErr_NoMemory();
  758. }
  759. p->mark = 0;
  760. p->fill = 0;
  761. p->size = 1;
  762. p->errcode = errcode;
  763. p->arena = arena;
  764. p->start_rule = start_rule;
  765. p->parsing_started = 0;
  766. p->normalize = NULL;
  767. p->error_indicator = 0;
  768. p->starting_lineno = 0;
  769. p->starting_col_offset = 0;
  770. p->flags = flags;
  771. p->feature_version = feature_version;
  772. p->known_err_token = NULL;
  773. p->level = 0;
  774. p->call_invalid_rules = 0;
  775. #ifdef Py_DEBUG
  776. p->debug = _Py_GetConfig()->parser_debug;
  777. #endif
  778. return p;
  779. }
  780. void
  781. _PyPegen_Parser_Free(Parser *p)
  782. {
  783. Py_XDECREF(p->normalize);
  784. for (int i = 0; i < p->size; i++) {
  785. PyMem_Free(p->tokens[i]);
  786. }
  787. PyMem_Free(p->tokens);
  788. growable_comment_array_deallocate(&p->type_ignore_comments);
  789. PyMem_Free(p);
  790. }
  791. static void
  792. reset_parser_state_for_error_pass(Parser *p)
  793. {
  794. for (int i = 0; i < p->fill; i++) {
  795. p->tokens[i]->memo = NULL;
  796. }
  797. p->mark = 0;
  798. p->call_invalid_rules = 1;
  799. // Don't try to get extra tokens in interactive mode when trying to
  800. // raise specialized errors in the second pass.
  801. p->tok->interactive_underflow = IUNDERFLOW_STOP;
  802. }
  803. static inline int
  804. _is_end_of_source(Parser *p) {
  805. int err = p->tok->done;
  806. return err == E_EOF || err == E_EOFS || err == E_EOLS;
  807. }
// Top-level parse driver.  Runs the fast first pass; when it fails
// without a non-SyntaxError exception pending, re-runs the parser with
// the invalid_* rules enabled to produce a precise SyntaxError.  Returns
// the AST (owned by the arena) or NULL with an exception set.
void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // Incomplete input (e.g. codeop/REPL continuation) is reported as
        // a plain "incomplete input" SyntaxError.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) &&  _is_end_of_source(p)) {
            PyErr_Clear();
            return RAISE_SYNTAX_ERROR("incomplete input");
        }
        // A non-SyntaxError (e.g. MemoryError, KeyboardInterrupt) must
        // propagate unchanged; only syntax failures get the second pass.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
       // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
       return NULL;
    }
    // Single-input mode must contain exactly one statement.
    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
  849. mod_ty
  850. _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
  851. const char *enc, const char *ps1, const char *ps2,
  852. PyCompilerFlags *flags, int *errcode, PyArena *arena)
  853. {
  854. struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
  855. if (tok == NULL) {
  856. if (PyErr_Occurred()) {
  857. _PyPegen_raise_tokenizer_init_error(filename_ob);
  858. return NULL;
  859. }
  860. return NULL;
  861. }
  862. if (!tok->fp || ps1 != NULL || ps2 != NULL ||
  863. PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
  864. tok->fp_interactive = 1;
  865. }
  866. // This transfers the ownership to the tokenizer
  867. tok->filename = Py_NewRef(filename_ob);
  868. // From here on we need to clean up even if there's an error
  869. mod_ty result = NULL;
  870. int parser_flags = compute_parser_flags(flags);
  871. Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
  872. errcode, arena);
  873. if (p == NULL) {
  874. goto error;
  875. }
  876. result = _PyPegen_run_parser(p);
  877. _PyPegen_Parser_Free(p);
  878. error:
  879. _PyTokenizer_Free(tok);
  880. return result;
  881. }
  882. mod_ty
  883. _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
  884. PyCompilerFlags *flags, PyArena *arena)
  885. {
  886. int exec_input = start_rule == Py_file_input;
  887. struct tok_state *tok;
  888. if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
  889. tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
  890. } else {
  891. tok = _PyTokenizer_FromString(str, exec_input, 0);
  892. }
  893. if (tok == NULL) {
  894. if (PyErr_Occurred()) {
  895. _PyPegen_raise_tokenizer_init_error(filename_ob);
  896. }
  897. return NULL;
  898. }
  899. // This transfers the ownership to the tokenizer
  900. tok->filename = Py_NewRef(filename_ob);
  901. // We need to clear up from here on
  902. mod_ty result = NULL;
  903. int parser_flags = compute_parser_flags(flags);
  904. int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
  905. flags->cf_feature_version : PY_MINOR_VERSION;
  906. Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
  907. NULL, arena);
  908. if (p == NULL) {
  909. goto error;
  910. }
  911. result = _PyPegen_run_parser(p);
  912. _PyPegen_Parser_Free(p);
  913. error:
  914. _PyTokenizer_Free(tok);
  915. return result;
  916. }