# -*- coding: utf-8 -*-
# This file contains Unicode characters.

from textwrap import dedent

import pytest

from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    # Load the current version.
    version_info = parse_version_string(version)
    return list(tokenize.tokenize(string, version_info=version_info))
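

# Each PythonToken behaves like a namedtuple of (type, string, start_pos,
# prefix) and also has an `end_pos` property, which is why the tests below
# can unpack a token directly into four values.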


def test_end_pos_one_line():
    parsed = parse(dedent('''
    def testit():
        a = "huhu"
    '''))
    simple_stmt = next(parsed.iter_funcdefs()).get_suite().children[-1]
    string = simple_stmt.children[0].get_rhs()
    assert string.end_pos == (3, 14)


def test_end_pos_multi_line():
    parsed = parse(dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    '''))
    expr_stmt = next(parsed.iter_funcdefs()).get_suite().children[1].children[0]
    string_leaf = expr_stmt.get_rhs().children[0]
    assert string_leaf.end_pos == (4, 11)


def test_simple_no_whitespace():
    # Test a simple one line string, no preceding whitespace.
    simple_docstring = '"""simple one line docstring"""'
    token_list = _get_token_list(simple_docstring)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    # Test a simple one line string with preceding whitespace and newline.
    simple_docstring = ' """simple one line docstring""" \r\n'
    token_list = _get_token_list(simple_docstring)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == ' '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE
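

# In parso, a token's `prefix` is all the whitespace (and comments) between
# the previous token and the token itself, so the original source can be
# reproduced exactly; the prefix assertions here and below rely on that.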


def test_function_whitespace():
    # Test function definition whitespace identification.
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    token_list = _get_token_list(fundef)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            assert prefix == '        '
        if value == 'if':
            assert prefix == '    '


def test_tokenize_multiline_I():
    # Make sure a multiline string with a trailing newline has the end
    # marker on the next line.
    fundef = '''""""\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (2, 0), '')]


def test_tokenize_multiline_II():
    # Make sure a multiline string without newlines has the end marker on
    # the same line.
    fundef = '''""""'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (1, 4), '')]


def test_tokenize_multiline_III():
    # Make sure a multiline string with newlines has the end marker on the
    # next line, even if there are several newlines.
    fundef = '''""""\n\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (3, 0), '')]
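

# In the three tests above, the unterminated string consumes its trailing
# newlines as part of the ERRORTOKEN, which is what pushes the ENDMARKER's
# start position onto a later line.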


def test_identifier_contains_unicode():
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    token_list = _get_token_list(fundef)
    unicode_token = token_list[1]
    assert unicode_token[0] == NAME


def test_quoted_strings():
    string_tokens = [
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    ]
    for s in string_tokens:
        module = parse('''a = %s\n''' % s)
        simple_stmt = module.children[0]
        expr_stmt = simple_stmt.children[0]
        assert len(expr_stmt.children) == 3
        string_tok = expr_stmt.children[2]
        assert string_tok.type == 'string'
        assert string_tok.value == s


def test_ur_literals():
    """
    Decided to parse `u''` literals regardless of the Python version. This
    probably makes sense:

    - Python 3+ doesn't support it, but it doesn't hurt to parse it anyway.
      While this is incorrect, it's only incorrect for one "old" and, in the
      future, not very important version.
    - All the other Python versions work very well with it.
    """
    def check(literal, is_literal=True):
        token_list = _get_token_list(literal)
        typ, result_literal, _, _ = token_list[0]
        if is_literal:
            if typ != FSTRING_START:
                assert typ == STRING
                assert result_literal == literal
        else:
            assert typ == NAME

    check('u""')
    check('ur""', is_literal=False)
    check('Ur""', is_literal=False)
    check('UR""', is_literal=False)
    check('bR""')
    check('Rb""')

    check('fr""')
    check('rF""')
    check('f""')
    check('F""')


def test_error_literal():
    error_token, newline, endmarker = _get_token_list('"\n')
    assert error_token.type == ERRORTOKEN
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
    def check(code):
        tokens = _get_token_list(code)
        lines = split_lines(code)
        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))

    check('#c')
    check('#c\n')

    check('a\n')
    check('a')
    check(r'a\\n')
    check('a\\')
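

# In the indentation cases below, DEDENT is emitted when the code returns to
# an indentation level that is still on the indentation stack, while
# ERROR_DEDENT marks a dedent to a column that was never used (e.g. from two
# spaces down to one).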


@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Name stuff
        ('1foo1', [NUMBER, NAME]),
        ('மெல்லினம்', [NAME]),
        ('²', [ERRORTOKEN]),
        ('ä²ö', [NAME, ERRORTOKEN, NAME]),
        ('ää²¹öö', [NAME, ERRORTOKEN, NAME]),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
        (dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''), [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                   ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                   NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT]),
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('a\n b\n  )\n c', [NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                            NEWLINE, DEDENT, NAME, DEDENT]),
        (' 1 \\\ndef', [INDENT, NUMBER, NAME, DEDENT]),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]


def test_error_string():
    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
    assert t1.type == ERRORTOKEN
    assert t1.prefix == ' '
    assert t1.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''


def test_indent_error_recovery():
    code = dedent("""\
                        str(
        from x import a
        def
        """)
    lst = _get_token_list(code)
    expected = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a` on the same line as the previous `from x`
        NAME, NAME, NEWLINE,
        # A dedent happens, because there's an import now and the import
        # statement "breaks" out of the opening paren on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER]
    assert [t.type for t in lst] == expected


def test_error_token_after_dedent():
    code = dedent("""\
        class C:
            pass
        $foo
        """)
    lst = _get_token_list(code)
    expected = [
        NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
        # $foo\n
        ERRORTOKEN, NAME, NEWLINE, ENDMARKER
    ]
    assert [t.type for t in lst] == expected


def test_brackets_no_indentation():
    """
    There used to be an issue that the parentheses counting would go below
    zero. This should not happen.
    """
    code = dedent("""\
        }
        {
          }
        """)
    lst = _get_token_list(code)
    assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]


def test_form_feed():
    indent, error_token, dedent_, endmarker = _get_token_list(dedent('''\
        \f"""'''))
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert endmarker.prefix == ''
    assert indent.type == INDENT
    assert dedent_.type == DEDENT


def test_carriage_return():
    lst = _get_token_list(' =\\\rclass')
    assert [t.type for t in lst] == [INDENT, OP, NAME, DEDENT, ENDMARKER]


def test_backslash():
    # A backslash continuation followed only by a comment produces no real
    # tokens; everything ends up in the ENDMARKER's prefix.
    code = '\\\n# 1 \n'
    endmarker, = _get_token_list(code)
    assert endmarker.prefix == code
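

# parso tokenizes f-strings itself (unlike CPython's tokenize module before
# Python 3.12, which yields a whole f-string as one STRING token): the quotes
# become FSTRING_START/FSTRING_END, literal parts become FSTRING_STRING, and
# the code inside the braces is tokenized normally.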


@pytest.mark.parametrize(
    ('code', 'types'), [
        # f-strings
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # format spec
        (r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
                                 FSTRING_STRING, OP, OP, NAME, OP,
                                 FSTRING_END]),

        # multiline f-string
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        ('f"""abc{\n123}def"""', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside an fstring_string
        ('f"abc\\\ndef"', [
            FSTRING_START, FSTRING_STRING, FSTRING_END
        ]),
        ('f"\\\n{123}\\\n"', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside an fstring_expr
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # a line continuation inside a format spec
        ('f"{123:.2\\\nf}"', [
            FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
        ]),

        # a newline without a line continuation inside a single-line string
        # is wrong, and will generate an ERRORTOKEN
        ('f"abc\ndef"', [
            FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
        ]),

        # a more complex example
        (r'print(f"Some {x:.2f}a{y}")', [
            NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
            FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
        ]),

        # issue #86, a string-like in an f-string expression
        ('f"{ ""}"', [
            FSTRING_START, OP, FSTRING_END, STRING
        ]),
        ('f"{ f""}"', [
            FSTRING_START, OP, NAME, FSTRING_END, STRING
        ]),
    ]
)
def test_fstring_token_types(code, types, each_version):
    actual_types = [t.type for t in _get_token_list(code, each_version)]
    assert types + [ENDMARKER] == actual_types
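

# Assignment expressions (`:=`) only exist on Python 3.8+, hence the
# `version_ge_py38` fixture below instead of `each_version`.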


@pytest.mark.parametrize(
    ('code', 'types'), [
        # issue #87, `:=` in the outermost parentheses should be tokenized
        # as a format spec marker and part of the format
        ('f"{x:=10}"', [
            FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END
        ]),
        ('f"{(x:=10)}"', [
            FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END
        ]),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py38)]
    assert types + [ENDMARKER] == actual_types


def test_fstring_end_error_pos(version_ge_py38):
    f_start, f_string, bracket, f_end, endmarker = \
        _get_token_list('f" { "', version_ge_py38)
    assert f_start.start_pos == (1, 0)
    assert f_string.start_pos == (1, 2)
    assert bracket.start_pos == (1, 3)
    assert f_end.start_pos == (1, 5)
    assert endmarker.start_pos == (1, 6)