Python-tokenize.c

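/*
 * Implementation of the internal "_tokenize" extension module, which exposes
 * CPython's C tokenizer to Python code as an iterator of token tuples.  The
 * stdlib tokenize module drives it roughly like this (an illustrative sketch,
 * not part of this file):
 *
 *     import _tokenize, io
 *     readline = io.BytesIO(b"x = 1\n").readline
 *     for tok in _tokenize.TokenizerIter(readline, extra_tokens=True,
 *                                        encoding="utf-8"):
 *         tok_type, tok_string, start, end, line = tok
 */
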
#include "Python.h"
#include "errcode.h"
#include "../Parser/tokenizer.h"
#include "../Parser/pegen.h"  // _PyPegen_byte_offset_to_character_offset()

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;

static tokenize_state *
get_tokenize_state(PyObject *module) {
    return (tokenize_state *)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef(type, &_tokenizemodule))

#include "pycore_runtime.h"
#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

typedef struct
{
    PyObject_HEAD struct tok_state *tok;
    int done;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    readline: object
    /
    *
    extra_tokens: bool
    encoding: str(c_default="NULL") = 'utf-8'
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
                       int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
{
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        return NULL;
    }
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);
        return NULL;
    }
    self->tok->filename = filename;
    if (extra_tokens) {
        self->tok->tok_extra_tokens = 1;
    }
    self->done = 0;
    return (PyObject *)self;
}
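
/* Map the tokenizer's error state (tok->done) onto a Python exception:
   SyntaxError by default, IndentationError or TabError for indentation
   problems, KeyboardInterrupt/MemoryError for E_INTR/E_NOMEM.  Returns 0
   once the exception is set, or -1 if building it failed. */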
static int
_tokenizer_error(struct tok_state *tok)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    switch (tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
            return -1;
        case E_DEDENT:
            msg = "unindent does not match any outer indentation level";
            errtype = PyExc_IndentationError;
            break;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown tokenization error";
    }
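
    /* Build the (msg, (filename, lineno, offset, text, end_lineno,
       end_offset)) value expected by SyntaxError-style exceptions. */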
    PyObject* errstr = NULL;
    PyObject* error_line = NULL;
    PyObject* tmp = NULL;
    PyObject* value = NULL;
    int result = 0;

    Py_ssize_t size = tok->inp - tok->buf;
    assert(tok->buf[size-1] == '\n');
    size -= 1;  // Remove the newline character from the end of the line
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
    if (!error_line) {
        result = -1;
        goto exit;
    }

    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
    if (offset == -1) {
        result = -1;
        goto exit;
    }
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
    if (!tmp) {
        result = -1;
        goto exit;
    }

    errstr = PyUnicode_FromString(msg);
    if (!errstr) {
        result = -1;
        goto exit;
    }

    value = PyTuple_Pack(2, errstr, tmp);
    if (!value) {
        result = -1;
        goto exit;
    }
    PyErr_SetObject(errtype, value);

exit:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    Py_XDECREF(tmp);
    Py_XDECREF(value);
    return result;
}
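
/* tp_iternext: fetch one token from the C tokenizer and convert it into a
   5-tuple (type, string, (start_row, start_col), (end_row, end_col), line).
   Raises StopIteration once the stream is exhausted. */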
static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
    PyObject* result = NULL;

    struct token token;
    _PyToken_Init(&token);

    int type = _PyTokenizer_Get(it->tok, &token);
    if (type == ERRORTOKEN) {
        if (!PyErr_Occurred()) {
            _tokenizer_error(it->tok);
            assert(PyErr_Occurred());
        }
        goto exit;
    }
    if (it->done || type == ERRORTOKEN) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        it->done = 1;
        goto exit;
    }
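
    /* Extract the token's text.  Synthetic tokens such as ENDMARKER carry
       no start/end pointers and map to the empty string. */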
    PyObject *str = NULL;
    if (token.start == NULL || token.end == NULL) {
        str = PyUnicode_FromString("");
    }
    else {
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
    }
    if (str == NULL) {
        goto exit;
    }

    int is_trailing_token = 0;
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
        is_trailing_token = 1;
    }
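
    /* Multi-line string literals report their position from the first line
       of the literal, so use multi_line_start as the line origin for them. */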
    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
    PyObject* line = NULL;
    if (it->tok->tok_extra_tokens && is_trailing_token) {
        line = PyUnicode_FromString("");
    }
    else {
        Py_ssize_t size = it->tok->inp - line_start;
        if (size >= 1 && it->tok->implicit_newline) {
            size -= 1;
        }
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
    }
    if (line == NULL) {
        Py_DECREF(str);
        goto exit;
    }

    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
    Py_ssize_t end_lineno = it->tok->lineno;
    Py_ssize_t col_offset = -1;
    Py_ssize_t end_col_offset = -1;
    if (token.start != NULL && token.start >= line_start) {
        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
    }
    if (token.end != NULL && token.end >= it->tok->line_start) {
        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
    }
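
    /* extra_tokens mode mimics the pure-Python tokenize module: trailing
       tokens (ENDMARKER, final DEDENTs) are reported on the line after the
       input, and token types and strings are normalized below. */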
    if (it->tok->tok_extra_tokens) {
        if (is_trailing_token) {
            lineno = end_lineno = lineno + 1;
            col_offset = end_col_offset = 0;
        }
        // Necessary adjustments to match the original Python tokenize
        // implementation
        if (type > DEDENT && type < OP) {
            type = OP;
        }
        else if (type == ASYNC || type == AWAIT) {
            type = NAME;
        }
        else if (type == NEWLINE) {
            Py_DECREF(str);
            if (!it->tok->implicit_newline) {
                if (it->tok->start[0] == '\r') {
                    str = PyUnicode_FromString("\r\n");
                }
                else {
                    str = PyUnicode_FromString("\n");
                }
            }
            end_col_offset++;
        }
        else if (type == NL) {
            if (it->tok->implicit_newline) {
                Py_DECREF(str);
                str = PyUnicode_FromString("");
            }
        }

        if (str == NULL) {
            Py_DECREF(line);
            goto exit;
        }
    }
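
    /* "N" in the format transfers ownership of str and line to the tuple. */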
    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
    _PyToken_Free(&token);
    if (type == ENDMARKER) {
        it->done = 1;
    }
    return result;
}
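
/* TokenizerIter is a heap type, so dealloc must release the type object
   itself after freeing the instance. */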
static void
tokenizeriter_dealloc(tokenizeriterobject *it)
{
    PyTypeObject *tp = Py_TYPE(it);
    _PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}
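
/* Slot table and spec used to create the TokenizerIter heap type. */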
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
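
/* Py_mod_exec: create the TokenizerIter type, stash it in the per-module
   state, and expose it as an attribute of the module. */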
static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL}  /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};
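
/* GC support: the module state owns a strong reference to the TokenizerIter
   type, which must be visited, cleared, and freed with the module. */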
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}
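
/* Multi-phase module definition: m_size reserves room for tokenize_state in
   each module instance, enabling per-interpreter isolation. */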
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}