Python-tokenize.c

#include "Python.h"
#include "errcode.h"
#include "../Parser/tokenizer.h"
#include "../Parser/pegen.h"  // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;

static tokenize_state *
get_tokenize_state(PyObject *module) {
    return (tokenize_state *)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))

#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
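
/* Argument Clinic processes the [clinic input] blocks in this file and emits
   the argument-parsing boilerplate into clinic/Python-tokenize.c.h, which is
   included above. */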

typedef struct
{
    PyObject_HEAD struct tok_state *tok;
    int done;

    /* Needed to cache line for performance */
    PyObject *last_line;
    Py_ssize_t last_lineno;
    Py_ssize_t last_end_lineno;
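
    /* Cached difference between byte offsets and character offsets on the
       cached line, so later tokens on the same line can convert byte columns
       to character columns without re-decoding the line. */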
    Py_ssize_t byte_col_offset_diff;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    readline: object
    /
    *
    extra_tokens: bool
    encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/
static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
                       int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        Py_DECREF(self);
        return NULL;
    }
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);
        return NULL;
    }
    self->tok->filename = filename;
    if (extra_tokens) {
        self->tok->tok_extra_tokens = 1;
    }
    self->done = 0;

    self->last_line = NULL;
    self->byte_col_offset_diff = 0;
    self->last_lineno = 0;
    self->last_end_lineno = 0;

    return (PyObject *)self;
}
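
/* A minimal usage sketch from the Python side. _tokenize is a private module;
   this mirrors how Lib/tokenize.py drives the iterator, and the readline
   callable and input below are illustrative only:

       import io, _tokenize
       readline = io.BytesIO(b"x = 1\n").readline
       for tok in _tokenize.TokenizerIter(readline, extra_tokens=True, encoding="utf-8"):
           print(tok)  # (type, string, (lineno, col), (end_lineno, end_col), line)
*/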

static int
_tokenizer_error(struct tok_state *tok)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }

    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1;  // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
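    /* Assemble SyntaxError-style args: (msg, (filename, lineno, offset, text,
       end_lineno, end_offset)). The end positions are not known here, so they
       are passed as None. */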
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }
    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }
    PyErr_SetObject(errtype, value);

exit:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}

static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
    PyObject* result = NULL;

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if (!PyErr_Occurred()) {
            _tokenizer_error(it->tok);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    if (it->done || type == ERRORTOKEN) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        it->done = 1;
        goto exit;
    }

    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = PyUnicode_FromString("");
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }
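
    /* A string literal can span several lines, so its position is reported
       relative to the line where the literal started (multi_line_start and
       first_lineno) rather than the current line. */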
    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    int line_changed = 1;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = PyUnicode_FromString("");
    }
    else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            size -= 1;
        }

        if (it->tok->lineno != it->last_lineno) {
            // Line has changed since last token, so we fetch the new line and cache it
            // in the iter object.
            Py_XDECREF(it->last_line);
            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
            it->last_line = line;
            it->byte_col_offset_diff = 0;
        }
        else {
            // Line hasn't changed so we reuse the cached one.
            line = it->last_line;
            line_changed = 0;
        }
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    it->last_lineno = lineno;
    it->last_end_lineno = end_lineno;

    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    Py_ssize_t byte_offset = -1;
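    /* token.start/token.end are byte pointers into the UTF-8 buffer, but the
       reported columns must be character offsets. Only the first token on a
       line pays for the conversion; subsequent tokens on an unchanged line
       reuse the cached byte_col_offset_diff. */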
    if (token.start != NULL && token.start >= line_start) {
        byte_offset = token.start - line_start;
        if (line_changed) {
            col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
            it->byte_col_offset_diff = byte_offset - col_offset;
        }
        else {
            col_offset = byte_offset - it->byte_col_offset_diff;
        }
    }
    if (token.end != NULL && token.end >= it->tok->line_start) {
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
        if (lineno == end_lineno) {
            // If the whole token is on the same line, we can just use the token.start
            // buffer for figuring out the new column offset, since using line is not
            // performant for very long lines.
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
            end_col_offset = col_offset + token_col_offset;
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
        }
        else {
            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
        }
    }

    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == ASYNC || type == AWAIT) {
            type = NAME;
        }
        else if (type == NEWLINE) {
            Py_DECREF(str);
            if (!it->tok->implicit_newline) {
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                }
                else {
                    str = PyUnicode_FromString("\n");
                }
            }
            else {
                // An implicit trailing newline is reported with an empty
                // string; without this branch, str would be left dangling
                // after the Py_DECREF above.
                str = PyUnicode_FromString("");
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = PyUnicode_FromString("");
            }
        }

        if (str == NULL) {
            Py_DECREF(line);
            goto exit;
        }
    }
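    /* Build (type, string, (lineno, col_offset), (end_lineno, end_col_offset),
       line). The "N" format code steals the reference to str; "O" adds a new
       reference to line. */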
    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        it->done = 1;
    }
    return result;
}
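
/* Instances of a heap type hold a strong reference to their type object, so
   the deallocator must release it after freeing the instance. */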
static void
tokenizeriter_dealloc(tokenizeriterobject *it)
{
    PyTypeObject *tp = Py_TYPE(it);
    Py_XDECREF(it->last_line);
    _PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_spec_is_not_a_field,
};
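
/* TokenizerIter is created from the spec above via PyType_FromModuleAndSpec
   in tokenizemodule_exec below, which associates the type with the module and
   lets _tokenize_get_state_by_type recover the module state from the type. */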
static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL}  /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}