123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443 |
- # -*- coding: utf-8 # This file contains Unicode characters.
- import sys
- from textwrap import dedent
- import pytest
- from parso.utils import split_lines, parse_version_string
- from parso.python.token import PythonTokenTypes
- from parso.python import tokenize
- from parso import parse
- from parso.python.tokenize import PythonToken
# Shorthand aliases for the token types referenced throughout these tests.
NAME, STRING, NUMBER, OP = (
    PythonTokenTypes.NAME,
    PythonTokenTypes.STRING,
    PythonTokenTypes.NUMBER,
    PythonTokenTypes.OP,
)
NEWLINE, INDENT, DEDENT, ENDMARKER = (
    PythonTokenTypes.NEWLINE,
    PythonTokenTypes.INDENT,
    PythonTokenTypes.DEDENT,
    PythonTokenTypes.ENDMARKER,
)
ERRORTOKEN, ERROR_DEDENT = (
    PythonTokenTypes.ERRORTOKEN,
    PythonTokenTypes.ERROR_DEDENT,
)
FSTRING_START, FSTRING_STRING, FSTRING_END = (
    PythonTokenTypes.FSTRING_START,
    PythonTokenTypes.FSTRING_STRING,
    PythonTokenTypes.FSTRING_END,
)
def _get_token_list(string, version=None):
    """Tokenize ``string`` and return every produced token in a list.

    ``version`` is forwarded to ``parse_version_string``.
    NOTE(review): ``None`` presumably selects the running interpreter's
    version -- confirm against ``parse_version_string``.
    """
    return list(tokenize.tokenize(string, parse_version_string(version)))
def test_end_pos_one_line():
    """A single-line string literal reports the expected end position."""
    source = dedent('''
    def testit():
        a = "huhu"
    ''')
    funcdef = next(parse(source).iter_funcdefs())
    last_stmt = funcdef.get_suite().children[-1]
    literal = last_stmt.children[0].get_rhs()
    # Line 3 is the assignment; 4 columns of indent + len('a = "huhu"').
    assert literal.end_pos == (3, 14)
def test_end_pos_multi_line():
    """A triple-quoted literal spanning two lines ends on its last line."""
    # The closing line of the embedded literal sits at column 0 after
    # dedent(), so the leaf ends right after `asdfasdf"""` (11 characters).
    source = dedent('''
    def testit():
        a = """huhu
    asdfasdf""" + "h"
    ''')
    suite = next(parse(source).iter_funcdefs()).get_suite()
    expr_stmt = suite.children[1].children[0]
    multi_line_leaf = expr_stmt.get_rhs().children[0]
    assert multi_line_leaf.end_pos == (4, 11)
def test_simple_no_whitespace():
    """A one-line string with nothing in front of it has an empty prefix."""
    source = '"""simple one line docstring"""'
    first_token = _get_token_list(source)[0]
    _, value, _, prefix = first_token
    assert prefix == ''
    assert value == '"""simple one line docstring"""'
def test_simple_with_whitespace():
    """Whitespace around a one-line string ends up in the token prefixes."""
    source = ' """simple one line docstring""" \r\n'
    tokens = _get_token_list(source)

    # The leading whitespace opens an indentation level first.
    assert tokens[0][0] == INDENT

    string_type, string_value, _, string_prefix = tokens[1]
    assert string_prefix == ' '
    assert string_value == '"""simple one line docstring"""'
    assert string_type == STRING

    newline_type, _, _, newline_prefix = tokens[2]
    assert newline_prefix == ' '
    assert newline_type == NEWLINE
def test_function_whitespace():
    """Check the whitespace prefix recorded for tokens of a small function.

    The embedded function uses 4-space indentation, so ``if`` (first nesting
    level) is expected to carry a 4-space prefix and ``print`` (second level)
    an 8-space prefix. Tokens inside the signature carry only the whitespace
    that directly precedes them.
    """
    fundef = dedent('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    ''')
    # Token string -> whitespace expected directly in front of it.
    expected_prefixes = {
        'test_whitespace': ' ',
        '(': '',
        '*': '',
        '**': ' ',
        'print': '        ',
        'if': '    ',
    }
    for _, value, _, prefix in _get_token_list(fundef):
        if value in expected_prefixes:
            assert prefix == expected_prefixes[value]
def test_tokenize_multiline_I():
    """An unterminated multiline string ending in a newline puts the end
    marker on the following line."""
    source = '''""""\n'''
    expected = [
        PythonToken(ERRORTOKEN, '""""\n', (1, 0), ''),
        PythonToken(ENDMARKER, '', (2, 0), ''),
    ]
    assert _get_token_list(source) == expected
def test_tokenize_multiline_II():
    """An unterminated multiline string without a newline keeps the end
    marker on the same line."""
    source = '''""""'''
    expected = [
        PythonToken(ERRORTOKEN, '""""', (1, 0), ''),
        PythonToken(ENDMARKER, '', (1, 4), ''),
    ]
    assert _get_token_list(source) == expected
def test_tokenize_multiline_III():
    """Several trailing newlines inside an unterminated multiline string
    still move the end marker to the line after the last newline."""
    source = '''""""\n\n'''
    expected = [
        PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ''),
        PythonToken(ENDMARKER, '', (3, 0), ''),
    ]
    assert _get_token_list(source) == expected
def test_identifier_contains_unicode():
    """A function name made of non-ASCII characters is tokenized as a name
    on Python 3 and as an error token on Python 2."""
    source = dedent('''
    def 我あφ():
        pass
    ''')
    # Token 0 is `def`; token 1 is the identifier under test.
    unicode_token = _get_token_list(source)[1]
    if sys.version_info.major < 3:
        # On Python 2 the identifier is not recognised as a name; the parser
        # ignores such tokens, which is acceptable.
        expected_type = ERRORTOKEN
    else:
        expected_type = NAME
    assert unicode_token[0] == expected_type
def test_quoted_strings():
    """``u``/``U``-prefixed literals parse to a single string leaf whose
    value is the literal exactly as written."""
    literals = (
        'u"test"',
        'u"""test"""',
        'U"""test"""',
        "u'''test'''",
        "U'''test'''",
    )
    for literal in literals:
        module = parse('''a = %s\n''' % literal)
        expr_stmt = module.children[0].children[0]
        # `a`, `=`, and the string leaf.
        assert len(expr_stmt.children) == 3
        string_leaf = expr_stmt.children[2]
        assert string_leaf.type == 'string'
        assert string_leaf.value == literal
def test_ur_literals():
    """
    Check which string prefixes are tokenized as string literals.

    - ``u""`` and ``bR""`` are always expected to be literals.
    - ``ur""`` in any casing is expected to be a literal only when the
      running interpreter is Python 2; otherwise the first token is a name.
    - ``Rb""`` is checked only on Python 3.
    - f-string prefixes are expected to be literals only on Python 3.6+.
    """
    is_py3 = sys.version_info.major >= 3
    has_fstrings = sys.version_info >= (3, 6)

    def check(literal, is_literal=True):
        typ, result_literal, _, _ = _get_token_list(literal)[0]
        if not is_literal:
            assert typ == NAME
        elif typ != FSTRING_START:
            # f-strings are split into several tokens, so only plain strings
            # can be compared against the full literal.
            assert typ == STRING
            assert result_literal == literal

    check('u""')
    for ur_literal in ('ur""', 'Ur""', 'UR""'):
        check(ur_literal, is_literal=not is_py3)
    check('bR""')
    # Starting with Python 3.3 this ordering is also possible.
    if is_py3:
        check('Rb""')
    # Format strings were introduced with Python 3.6.
    for f_literal in ('fr""', 'rF""', 'f""', 'F""'):
        check(f_literal, is_literal=has_fstrings)
def test_error_literal():
    """Unterminated quotes are reported as error tokens."""
    # A lone double quote followed by a newline.
    bad_quote, newline, end = _get_token_list('"\n')
    assert bad_quote.type == ERRORTOKEN
    assert bad_quote.string == '"'
    assert newline.type == NEWLINE
    assert end.type == ENDMARKER
    assert end.prefix == ''

    # An unterminated triple quote inside an open parenthesis.
    _paren, bad_triple, end = _get_token_list('( """')
    assert bad_triple.type == ERRORTOKEN
    assert bad_triple.prefix == ' '
    assert bad_triple.string == '"""'
    assert end.type == ENDMARKER
    assert end.prefix == ''
def test_endmarker_end_pos():
    """The last token always ends at the end of the last source line."""
    def check(code):
        last_token = _get_token_list(code)[-1]
        lines = split_lines(code)
        expected_end = (len(lines), len(lines[-1]))
        assert last_token.end_pos == expected_end

    samples = ('#c', '#c\n', 'a\n', 'a', r'a\\n', 'a\\')
    for sample in samples:
        check(sample)
# Keyword arguments for pytest.param() marking a case as an expected failure
# when the tests run under Python 2.
xfail_py2 = {
    'marks': [
        pytest.mark.xfail(sys.version_info[0] == 2, reason='Python 2'),
    ],
}
@pytest.mark.parametrize(
    ('code', 'types'), [
        # Indentation. ERROR_DEDENT is expected where a line dedents to a
        # column strictly between two known indentation levels, so the first
        # line of those samples is indented deeper than the second one.
        (' foo', [INDENT, NAME, DEDENT]),
        ('  foo\n bar', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        ('  foo\n bar \n baz', [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME,
                                NEWLINE, NAME, DEDENT]),
        (' foo\nbar', [INDENT, NAME, NEWLINE, DEDENT, NAME]),

        # Name stuff
        ('1foo1', [NUMBER, NAME]),
        pytest.param(
            u'மெல்லினம்', [NAME],
            **xfail_py2),
        pytest.param(u'²', [ERRORTOKEN], **xfail_py2),
        pytest.param(u'ä²ö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        pytest.param(u'ää²¹öö', [NAME, ERRORTOKEN, NAME], **xfail_py2),
        (' \x00a', [INDENT, ERRORTOKEN, NAME, DEDENT]),
        # `a` is over-indented, so the first `def` dedents to a level that
        # was never opened (ERROR_DEDENT); the later lines then alternate
        # between that level and the deeper one.
        (dedent('''\
            class BaseCache:
                    a
                def
                    b
                def
                    c
            '''), [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE,
                   ERROR_DEDENT, NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT,
                   NAME, NEWLINE, INDENT, NAME, NEWLINE, DEDENT, DEDENT]),
        ('  )\n foo', [INDENT, OP, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
        # The stray `)` opens a second indentation level below `b`.
        ('a\n b\n  )\n c', [NAME, NEWLINE, INDENT, NAME, NEWLINE, INDENT, OP,
                           NEWLINE, DEDENT, NAME, DEDENT]),
        (' 1 \\\ndef', [INDENT, NUMBER, NAME, DEDENT]),
    ]
)
def test_token_types(code, types):
    """Tokenizing ``code`` yields exactly ``types`` followed by ENDMARKER."""
    actual_types = [t.type for t in _get_token_list(code)]
    assert actual_types == types + [ENDMARKER]
def test_error_string():
    """An indented, unterminated quote yields an error token that keeps its
    whitespace prefix; the end marker is empty."""
    tokens = _get_token_list(' "\n')
    # Exactly five tokens are expected; the first and fourth are not
    # inspected further.
    _first, error_token, newline, _fourth, endmarker = tokens
    assert error_token.type == ERRORTOKEN
    assert error_token.prefix == ' '
    assert error_token.string == '"'
    assert newline.type == NEWLINE
    assert endmarker.prefix == ''
    assert endmarker.string == ''
def test_indent_error_recovery():
    """An unclosed parenthesis on an over-indented line is recovered from.

    ``str(`` is indented deeper than the lines that follow, so the token
    stream starts with INDENT. The expected DEDENT appears only before the
    ``def`` that starts the last line.
    """
    code = dedent("""\
                        str(
        from x import a
        def
        """)
    lst = _get_token_list(code)
    expected = [
        # `str(`
        INDENT, NAME, OP,
        # `from x`
        NAME, NAME,
        # `import a`, on the same line as the preceding `from x`
        NAME, NAME, NEWLINE,
        # Dedent happens, because there's an import now and the import
        # statement "breaks" out of the opening paren on the first line.
        DEDENT,
        # `def`
        NAME, NEWLINE, ENDMARKER]
    assert [t.type for t in lst] == expected
def test_error_token_after_dedent():
    """An invalid character directly after a dedent becomes an error token
    that follows the DEDENT."""
    source = dedent("""\
        class C:
            pass
        $foo
        """)
    class_tokens = [NAME, NAME, OP, NEWLINE, INDENT, NAME, NEWLINE, DEDENT]
    # `$foo\n`
    error_line_tokens = [ERRORTOKEN, NAME, NEWLINE, ENDMARKER]
    actual = [token.type for token in _get_token_list(source)]
    assert actual == class_tokens + error_line_tokens
def test_brackets_no_indentation():
    """
    Regression test: a closing bracket without an opening one must not push
    the bracket counter below zero.
    """
    source = dedent("""\
        }
        {
        }
        """)
    actual = [token.type for token in _get_token_list(source)]
    # No NEWLINE is expected between the `{` and the `}` that follows it.
    assert actual == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]
def test_form_feed():
    """A form feed in front of an unterminated string is kept as the error
    token's prefix, and the token is wrapped in INDENT/DEDENT."""
    source = dedent('''\
        \f"""''')
    tokens = _get_token_list(source)
    indent, error_token, dedent_, endmarker = tokens
    assert error_token.prefix == '\f'
    assert error_token.string == '"""'
    assert endmarker.prefix == ''
    assert indent.type == INDENT
    assert dedent_.type == DEDENT
def test_carriage_return():
    """A backslash followed by a bare carriage return does not produce a
    NEWLINE token between the operator and the name."""
    actual = [token.type for token in _get_token_list(' =\\\rclass')]
    assert actual == [INDENT, OP, NAME, DEDENT, ENDMARKER]
def test_backslash():
    """A line continuation followed only by a comment yields a single end
    marker whose prefix is the whole source."""
    source = '\\\n# 1 \n'
    [endmarker] = _get_token_list(source)
    assert endmarker.prefix == source
@pytest.mark.parametrize(
    ('code', 'types'), [
        # Basic f-strings, terminated and unterminated.
        ('f"', [FSTRING_START]),
        ('f""', [FSTRING_START, FSTRING_END]),
        ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END]),
        ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP]),
        # An escaped quote inside the f-string (this case is listed twice).
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),

        # Format spec.
        (
            r'f"Some {x:.2f}{y}"',
            [FSTRING_START, FSTRING_STRING, OP, NAME, OP, FSTRING_STRING, OP,
             OP, NAME, OP, FSTRING_END],
        ),

        # Multiline f-strings.
        ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        (
            'f"""abc{\n123}def"""',
            [FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
             FSTRING_END],
        ),

        # Line continuation inside the literal part of an f-string.
        ('f"abc\\\ndef"', [FSTRING_START, FSTRING_STRING, FSTRING_END]),
        (
            'f"\\\n{123}\\\n"',
            [FSTRING_START, FSTRING_STRING, OP, NUMBER, OP, FSTRING_STRING,
             FSTRING_END],
        ),

        # Line continuation inside an f-string expression.
        ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END]),

        # Line continuation inside a format spec.
        (
            'f"{123:.2\\\nf}"',
            [FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END],
        ),

        # A newline without a line continuation inside a single-line string
        # is wrong and generates an ERRORTOKEN.
        (
            'f"abc\ndef"',
            [FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN],
        ),

        # A more complex example.
        (
            r'print(f"Some {x:.2f}a{y}")',
            [NAME, OP, FSTRING_START, FSTRING_STRING, OP, NAME, OP,
             FSTRING_STRING, OP, FSTRING_STRING, OP, NAME, OP, FSTRING_END,
             OP],
        ),

        # Issue #86: something that looks like a string inside an f-string
        # expression.
        ('f"{ ""}"', [FSTRING_START, OP, FSTRING_END, STRING]),
        ('f"{ f""}"', [FSTRING_START, OP, NAME, FSTRING_END, STRING]),
    ]
)
def test_fstring_token_types(code, types, version_ge_py36):
    """Tokenizing ``code`` for the version supplied by the
    ``version_ge_py36`` fixture yields ``types`` followed by ENDMARKER."""
    tokens = _get_token_list(code, version_ge_py36)
    assert types + [ENDMARKER] == [token.type for token in tokens]
@pytest.mark.parametrize(
    ('code', 'types'), [
        # Issue #87: at the outermost level of the expression, `:=` must be
        # tokenized as the format spec marker followed by format text.
        (
            'f"{x:=10}"',
            [FSTRING_START, OP, NAME, OP, FSTRING_STRING, OP, FSTRING_END],
        ),
        # Inside parentheses it is a regular operator in the expression.
        (
            'f"{(x:=10)}"',
            [FSTRING_START, OP, OP, NAME, OP, NUMBER, OP, OP, FSTRING_END],
        ),
    ]
)
def test_fstring_assignment_expression(code, types, version_ge_py38):
    """Tokenizing ``code`` for the version supplied by the
    ``version_ge_py38`` fixture yields ``types`` followed by ENDMARKER."""
    tokens = _get_token_list(code, version_ge_py38)
    assert types + [ENDMARKER] == [token.type for token in tokens]
def test_fstring_end_error_pos(version_ge_py38):
    """Start positions stay correct when an f-string closes while a
    replacement field is still open."""
    tokens = _get_token_list('f" { "', version_ge_py38)
    f_start, f_string, bracket, f_end, endmarker = tokens
    expected_starts = [
        (f_start, (1, 0)),
        (f_string, (1, 2)),
        (bracket, (1, 3)),
        (f_end, (1, 5)),
        (endmarker, (1, 6)),
    ]
    for token, start_pos in expected_starts:
        assert token.start_pos == start_pos
|