# -*- coding: utf-8 -*-    # This file contains Unicode characters.
import sys
from textwrap import dedent

import pytest

from parso.utils import split_lines, parse_version_string
from parso.python.token import PythonTokenTypes
from parso.python import tokenize
from parso import parse
from parso.python.tokenize import PythonToken


# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    # Load the current version.
    version_info = parse_version_string(version)
    return list(tokenize.tokenize(string, version_info))
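

# A minimal usage sketch (illustrative, not part of the original test suite):
# each element yielded by _get_token_list is a PythonToken that unpacks as
# (type, string, start_pos, prefix), where `prefix` carries the whitespace
# and comments preceding the token, as the tests below rely on:
#
#     typ, string, start_pos, prefix = _get_token_list('x = 1\n')[0]
#     # typ == NAME, string == 'x', start_pos == (1, 0), prefix == ''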


def test_end_pos_one_line():
    parsed = parse(dedent('''
    def testit():
        a = "huhu"
    '''))
    simple_stmt = next(parsed.iter_funcdefs()).get_suite().children[-1]
    string = simple_stmt.children[0].get_rhs()
    assert string.end_pos == (3, 14)


def test_end_pos_multi_line():
    parsed = parse(dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    '''))
    expr_stmt = next(parsed.iter_funcdefs()).get_suite().children[1].children[0]
    string_leaf = expr_stmt.get_rhs().children[0]
    assert string_leaf.end_pos == (4, 11)


def test_simple_no_whitespace():
    # Test a simple one line string, no preceding whitespace
    simple_docstring = '"""simple one line docstring"""'
    token_list = _get_token_list(simple_docstring)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'


def test_simple_with_whitespace():
    # Test a simple one line string with preceding whitespace and newline
    simple_docstring = ' """simple one line docstring""" \r\n'
    token_list = _get_token_list(simple_docstring)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == ' '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE


def test_function_whitespace():
    # Test function definition whitespace identification
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    token_list = _get_token_list(fundef)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            # The prefix of a token at the start of a line is its indentation.
            assert prefix == '        '
        if value == 'if':
            assert prefix == '    '


def test_tokenize_multiline_I():
    # Make sure a multiline string containing newlines has the end marker on
    # the next line.
    fundef = '''""""\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (2, 0), '')]


def test_tokenize_multiline_II():
    # Make sure a multiline string without newlines has the end marker on the
    # same line.
    fundef = '''""""'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (1, 4), '')]


def test_tokenize_multiline_III():
    # Make sure a multiline string containing newlines has the end marker on
    # the next line, even when there are several newlines.
    fundef = '''""""\n\n'''
    token_list = _get_token_list(fundef)
    assert token_list == [PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
                          PythonToken(ENDMARKER, '', (3, 0), '')]


def test_identifier_contains_unicode():
    fundef = dedent('''
    def 我あφ():
        pass
    ''')
    token_list = _get_token_list(fundef)
    unicode_token = token_list[1]
    if sys.version_info.major >= 3:
        assert unicode_token[0] == NAME
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, that's ok.
        assert unicode_token[0] == ERRORTOKEN


def test_quoted_strings():
    string_tokens = [
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    ]
    for s in string_tokens:
        module = parse('''a = %s\n''' % s)
        simple_stmt = module.children[0]
        expr_stmt = simple_stmt.children[0]
        assert len(expr_stmt.children) == 3
        string_tok = expr_stmt.children[2]
        assert string_tok.type == 'string'
        assert string_tok.value == s
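

# A quick illustration (editorial, not part of the original suite) of the
# tree shape the loop above traverses: for 'a = u"test"\n' the module holds
# one simple_stmt whose expr_stmt has exactly three children, a name, the
# '=' operator, and the string leaf:
#
#     expr_stmt = parse('a = u"test"\n').children[0].children[0]
#     name, equals, string_tok = expr_stmt.children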


def test_ur_literals():
    """
    Decided to parse `u''` literals regardless of the Python version. This
    probably makes sense:

    - Early Python 3 versions don't support it, but accepting it anyway
      doesn't hurt. While this is incorrect, it's only incorrect for one
      "old" and, in the future, not very important version.
    - All the other Python versions work very well with it.
    """
    def check(literal, is_literal=True):
        token_list = _get_token_list(literal)
        typ, result_literal, _, _ = token_list[0]
        if is_literal:
            if typ != FSTRING_START:
                assert typ == STRING
                assert result_literal == literal
        else:
            assert typ == NAME

    check('u""')
    check('ur""', is_literal=not sys.version_info.major >= 3)
    check('Ur""', is_literal=not sys.version_info.major >= 3)
    check('UR""', is_literal=not sys.version_info.major >= 3)
    check('bR""')
    # Starting with Python 3.3 this ordering is also possible.
    if sys.version_info.major >= 3:
        check('Rb""')
    # Starting with Python 3.6 format strings were introduced.
    check('fr""', is_literal=sys.version_info >= (3, 6))
    check('rF""', is_literal=sys.version_info >= (3, 6))
    check('f""', is_literal=sys.version_info >= (3, 6))
    check('F""', is_literal=sys.version_info >= (3, 6))


def test_error_literal():
    error_token, newline, endmarker = _get_token_list('"\n')
    assert error_token.type == ERRORTOKEN
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''

    bracket, error_token, endmarker = _get_token_list('( """')
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"""'
    assert endmarker.type == ENDMARKER
    assert endmarker.prefix == ''


def test_endmarker_end_pos():
    def check(code):
        tokens = _get_token_list(code)
        lines = split_lines(code)
        assert tokens[-1].end_pos == (len(lines), len(lines[-1]))

    check('#c')
    check('#c\n')
    check('a\n')
    check('a')
    check(r'a\\n')
    check('a\\')
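

# For orientation (a sketch, not part of the original suite): split_lines
# keeps a trailing empty string when the input ends in a newline, which is
# why the ENDMARKER of 'a\n' above ends up at (2, 0):
#
#     split_lines('a\n')  # -> ['a', '']
#     split_lines('a')    # -> ['a']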


xfail_py2 = dict(marks=[pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2')])


@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Name stuff
        ('1foo1', [NUMBER, NAME]),
        pytest.param(
            u'மெல்லினம்', [NAME],
            **xfail_py2),
        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
        (dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''), [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                   ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                   NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT]),
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('a\n b\n  )\n c', [NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                            NEWLINE, DEDENT, NAME, DEDENT]),
        (' 1 \\\ndef', [INDENT, NUMBER, NAME, DEDENT]),
    ]
)
def test_token_types(code, types):
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]


def test_error_string():
    indent, t1, newline, token, endmarker = _get_token_list(' "\n')
    assert t1.type == ERRORTOKEN
    assert t1.prefix == ' '
    assert t1.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''


def test_indent_error_recovery():
    code = dedent("""\
                        str(
                from x import a
        def
        """)
    lst = _get_token_list(code)
    expected = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a` on the same line as the previous `from x`
        NAME, NAME, NEWLINE,
        # A dedent happens, because there's an import now and the import
        # statement "breaks" out of the opening paren on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER]
    assert [t.type for t in lst] == expected


def test_error_token_after_dedent():
    code = dedent("""\
        class C:
            pass
        $foo
        """)
    lst = _get_token_list(code)
    expected = [
        NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
        # `$foo\n`
        ERRORTOKEN, NAME, NEWLINE, ENDMARKER
    ]
    assert [t.type for t in lst] == expected


def test_brackets_no_indentation():
    """
    There used to be an issue where the parenthesis counting would go below
    zero. This should not happen.
    """
    code = dedent("""\
        }
        {
          }
        """)
    lst = _get_token_list(code)
    assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]
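

# An editorial aside (not from the original file): the '{' opens a bracket
# context, so the newline after it goes into the next token's prefix instead
# of producing a NEWLINE token; that is why the expected list above has two
# OPs back to back. Unmatched closing brackets leave the level at zero:
#
#     [t.type for t in _get_token_list('}\n{\n}\n')]
#     # -> [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]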


def test_form_feed():
    indent, error_token, dedent_, endmarker = _get_token_list(dedent('''\
    \f"""'''))
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert endmarker.prefix == ''
    assert indent.type == INDENT
    assert dedent_.type == DEDENT


def test_carriage_return():
    lst = _get_token_list(' =\\\rclass')
    assert [t.type for t in lst] == [INDENT, OP, NAME, DEDENT, ENDMARKER]


def test_backslash():
    code = '\\\n# 1 \n'
    endmarker, = _get_token_list(code)
    assert endmarker.prefix == code
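

# A small aside (not from the original file): because every token's prefix
# captures the whitespace, comments, and line continuations before it, and
# ENDMARKER carries whatever trails at the end, the source can be reassembled
# from the token stream, e.g. for the test above:
#
#     ''.join(t.prefix + t.string for t in _get_token_list('\\\n# 1 \n'))
#     # == '\\\n# 1 \n'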


@pytest.mark.parametrize(
    ('code', 'types'), [
        # f-strings
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # format spec
        (r'f"Some {x:.2f}{y}"', [FSTRING_START, FSTRING_STRING, OP, NAME, OP,
                                 FSTRING_STRING, OP, OP, NAME, OP,
                                 FSTRING_END]),

        # multiline f-string
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        ('f"""abc{\n123}def"""', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_string
        ('f"abc\\\ndef"', [
            FSTRING_START, FSTRING_STRING, FSTRING_END
        ]),
        ('f"\\\n{123}\\\n"', [
            FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
            FSTRING_END
        ]),

        # a line continuation inside of an fstring_expr
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # a line continuation inside of a format spec
        ('f"{123:.2\\\nf}"', [
            FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END
        ]),

        # a newline without a line continuation inside a single-line string
        # is wrong, and will generate an ERRORTOKEN
        ('f"abc\ndef"', [
            FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN
        ]),

        # a more complex example
        (r'print(f"Some {x:.2f}a{y}")', [
            NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
            FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END, OP
        ]),
        # issue #86, a string-like in an f-string expression
        ('f"{ ""}"', [
            FSTRING_START, OP, FSTRING_END, STRING
        ]),
        ('f"{ f""}"', [
            FSTRING_START, OP, NAME, FSTRING_END, STRING
        ]),
    ]
)
def test_fstring_token_types(code, types, version_ge_py36):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py36)]
    assert types + [ENDMARKER] == actual_types


@pytest.mark.parametrize(
    ('code', 'types'), [
        # issue #87, `:=` in the outermost braces should be tokenized as a
        # format spec marker and become part of the format spec
        ('f"{x:=10}"', [
            FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END
        ]),
        ('f"{(x:=10)}"', [
            FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END
        ]),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    actual_types = [t.type for t in _get_token_list(code, version_ge_py38)]
    assert types + [ENDMARKER] == actual_types


def test_fstring_end_error_pos(version_ge_py38):
    f_start, f_string, bracket, f_end, endmarker = \
        _get_token_list('f" { "', version_ge_py38)
    assert f_start.start_pos == (1, 0)
    assert f_string.start_pos == (1, 2)
    assert bracket.start_pos == (1, 3)
    assert f_end.start_pos == (1, 5)
    assert endmarker.start_pos == (1, 6)