# Copyright 2016 Grist Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import ast
import bisect
import sys
import token
from ast import Module
from typing import Iterable, Iterator, List, Optional, Tuple, Any, cast, TYPE_CHECKING

import six
from six.moves import xrange  # pylint: disable=redefined-builtin

from .line_numbers import LineNumbers
from .util import (
  Token, match_token, is_non_coding_token, patched_generate_tokens, last_stmt,
  annotate_fstring_nodes, generate_tokens, is_module, is_stmt
)

if TYPE_CHECKING:  # pragma: no cover
  from .util import AstNode, TokenInfo


class ASTTextBase(six.with_metaclass(abc.ABCMeta, object)):
  def __init__(self, source_text, filename):
    # type: (Any, str) -> None
    # FIXME: Strictly, the type of source_text is one of the six string types, but hard to specify with mypy given
    # https://mypy.readthedocs.io/en/stable/common_issues.html#variables-vs-type-aliases

    self._filename = filename

    # Decode source after parsing to let Python 2 handle coding declarations.
    # (If the encoding was not utf-8 compatible, then even if it parses correctly,
    # we'll fail with a unicode error here.)
    source_text = six.ensure_text(source_text)

    self._text = source_text
    self._line_numbers = LineNumbers(source_text)

  @abc.abstractmethod
  def get_text_positions(self, node, padded):
    # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
    """
    Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
    If the positions can't be determined, or the nodes don't correspond to any particular text,
    returns ``(1, 0)`` for both.

    ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
    This means that if ``padded`` is True, the start position will be adjusted to include
    leading whitespace if ``node`` is a multiline statement.
    """
    raise NotImplementedError  # pragma: no cover

  def get_text_range(self, node, padded=True):
    # type: (AstNode, bool) -> Tuple[int, int]
    """
    Returns the (startpos, endpos) positions in source text corresponding to the given node.
    Returns (0, 0) for nodes (like `Load`) that don't correspond to any particular text.

    See ``get_text_positions()`` for details on the ``padded`` argument.
    """
    start, end = self.get_text_positions(node, padded)
    return (
      self._line_numbers.line_to_offset(*start),
      self._line_numbers.line_to_offset(*end),
    )

  def get_text(self, node, padded=True):
    # type: (AstNode, bool) -> str
    """
    Returns the text corresponding to the given node.
    Returns '' for nodes (like `Load`) that don't correspond to any particular text.

    See ``get_text_positions()`` for details on the ``padded`` argument.
    """
    start, end = self.get_text_range(node, padded)
    return self._text[start: end]


class ASTTokens(ASTTextBase, object):
  """
  ASTTokens maintains the text of Python code in several forms: as a string, as line numbers, and
  as tokens, and is used to mark and access token and position information.

  ``source_text`` must be a unicode or UTF8-encoded string. If you pass in UTF8 bytes, remember
  that all offsets you'll get are to the unicode text, which is available as the ``.text``
  property.

  If ``parse`` is set, the ``source_text`` will be parsed with ``ast.parse()``, and the resulting
  tree marked with token info and made available as the ``.tree`` property.

  If ``tree`` is given, it will be marked and made available as the ``.tree`` property. In
  addition to the trees produced by the ``ast`` module, ASTTokens will also mark trees produced
  using the ``astroid`` library <https://www.astroid.org>.

  If only ``source_text`` is given, you may use ``.mark_tokens(tree)`` to mark the nodes of an AST
  tree created separately.
  """

  def __init__(self, source_text, parse=False, tree=None, filename='<unknown>', tokens=None):
    # type: (Any, bool, Optional[Module], str, Iterable[TokenInfo]) -> None
    # FIXME: Strictly, the type of source_text is one of the six string types, but hard to specify with mypy given
    # https://mypy.readthedocs.io/en/stable/common_issues.html#variables-vs-type-aliases
    super(ASTTokens, self).__init__(source_text, filename)

    self._tree = ast.parse(source_text, filename) if parse else tree

    # Tokenize the code.
    if tokens is None:
      tokens = generate_tokens(self._text)
    self._tokens = list(self._translate_tokens(tokens))

    # Extract the start positions of all tokens, so that we can quickly map positions to tokens.
    self._token_offsets = [tok.startpos for tok in self._tokens]

    if self._tree:
      self.mark_tokens(self._tree)

  def mark_tokens(self, root_node):
    # type: (Module) -> None
    """
    Given the root of the AST or Astroid tree produced from source_text, visits all nodes marking
    them with token and position information by adding ``.first_token`` and
    ``.last_token`` attributes. This is done automatically in the constructor when ``parse`` or
    ``tree`` arguments are set, but may be used manually with a separate AST or Astroid tree.
    """
    # The hard work of this class is done by MarkTokens
    from .mark_tokens import MarkTokens  # to avoid import loops
    MarkTokens(self).visit_tree(root_node)
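
  # Example (a minimal sketch): marking a tree created separately, assuming
  # ``source`` holds the same code the tree was parsed from:
  #
  #   import ast
  #   tree = ast.parse(source)
  #   atok = ASTTokens(source)   # neither parse=True nor tree given
  #   atok.mark_tokens(tree)     # nodes now carry .first_token / .last_token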

  def _translate_tokens(self, original_tokens):
    # type: (Iterable[TokenInfo]) -> Iterator[Token]
    """
    Translates the given standard library tokens into our own representation.
    """
    for index, tok in enumerate(patched_generate_tokens(original_tokens)):
      tok_type, tok_str, start, end, line = tok
      yield Token(tok_type, tok_str, start, end, line, index,
                  self._line_numbers.line_to_offset(start[0], start[1]),
                  self._line_numbers.line_to_offset(end[0], end[1]))

  @property
  def text(self):
    # type: () -> str
    """The source code passed into the constructor."""
    return self._text

  @property
  def tokens(self):
    # type: () -> List[Token]
    """The list of tokens corresponding to the source code from the constructor."""
    return self._tokens

  @property
  def tree(self):
    # type: () -> Optional[Module]
    """The root of the AST tree passed into the constructor or parsed from the source code."""
    return self._tree

  @property
  def filename(self):
    # type: () -> str
    """The filename that was parsed"""
    return self._filename

  def get_token_from_offset(self, offset):
    # type: (int) -> Token
    """
    Returns the token containing the given character offset (0-based position in source text),
    or the preceding token if the position is between tokens.
    """
    return self._tokens[bisect.bisect(self._token_offsets, offset) - 1]

  def get_token(self, lineno, col_offset):
    # type: (int, int) -> Token
    """
    Returns the token containing the given (lineno, col_offset) position, or the preceding token
    if the position is between tokens.
    """
    # TODO: add test for multibyte unicode. We need to translate offsets from ast module (which
    # are in utf8) to offsets into the unicode text. tokenize module seems to use unicode offsets
    # but isn't explicit.
    return self.get_token_from_offset(self._line_numbers.line_to_offset(lineno, col_offset))

  def get_token_from_utf8(self, lineno, col_offset):
    # type: (int, int) -> Token
    """
    Same as get_token(), but interprets col_offset as a UTF8 offset, which is what `ast` uses.
    """
    return self.get_token(lineno, self._line_numbers.from_utf8_col(lineno, col_offset))
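
  # Example (a sketch): translating an ast node's own position into a token,
  # reusing ``atok`` and ``assign`` from the class-level example above:
  #
  #   tok = atok.get_token_from_utf8(assign.lineno, assign.col_offset)
  #   tok.string   # -> 'x'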

  def next_token(self, tok, include_extra=False):
    # type: (Token, bool) -> Token
    """
    Returns the next token after the given one. If include_extra is True, includes non-coding
    tokens from the tokenize module, such as NL and COMMENT.
    """
    i = tok.index + 1
    if not include_extra:
      while is_non_coding_token(self._tokens[i].type):
        i += 1
    return self._tokens[i]

  def prev_token(self, tok, include_extra=False):
    # type: (Token, bool) -> Token
    """
    Returns the previous token before the given one. If include_extra is True, includes non-coding
    tokens from the tokenize module, such as NL and COMMENT.
    """
    i = tok.index - 1
    if not include_extra:
      while is_non_coding_token(self._tokens[i].type):
        i -= 1
    return self._tokens[i]

  def find_token(self, start_token, tok_type, tok_str=None, reverse=False):
    # type: (Token, int, Optional[str], bool) -> Token
    """
    Looks for the first token, starting at start_token, that matches tok_type and, if given, the
    token string. Searches backwards if reverse is True. Returns ENDMARKER token if not found (you
    can check it with `token.ISEOF(t.type)`).
    """
    t = start_token
    advance = self.prev_token if reverse else self.next_token
    while not match_token(t, tok_type, tok_str) and not token.ISEOF(t.type):
      t = advance(t, include_extra=True)
    return t
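
  # Example (a sketch): finding the first '[' at or after a node's first token,
  # reusing ``atok`` and ``assign`` from the class-level example above:
  #
  #   bracket = atok.find_token(assign.first_token, token.OP, '[')
  #   assert not token.ISEOF(bracket.type)   # found, rather than hitting ENDMARKER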

  def token_range(self,
                  first_token,  # type: Token
                  last_token,  # type: Token
                  include_extra=False,  # type: bool
                  ):
    # type: (...) -> Iterator[Token]
    """
    Yields all tokens in order from first_token through and including last_token. If
    include_extra is True, includes non-coding tokens such as tokenize.NL and .COMMENT.
    """
    for i in xrange(first_token.index, last_token.index + 1):
      if include_extra or not is_non_coding_token(self._tokens[i].type):
        yield self._tokens[i]

  def get_tokens(self, node, include_extra=False):
    # type: (AstNode, bool) -> Iterator[Token]
    """
    Yields all tokens making up the given node. If include_extra is True, includes non-coding
    tokens such as tokenize.NL and .COMMENT.
    """
    return self.token_range(node.first_token, node.last_token, include_extra=include_extra)
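
  # Example (a sketch): listing the coding tokens of a node, reusing ``atok``
  # and ``assign`` from the class-level example above:
  #
  #   [t.string for t in atok.get_tokens(assign)]
  #   # -> ['x', '=', '[', '1', ',', '2', ']']  (the NL inside the brackets is skipped)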

  def get_text_positions(self, node, padded):
    # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
    """
    Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
    If the positions can't be determined, or the nodes don't correspond to any particular text,
    returns ``(1, 0)`` for both.

    ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
    This means that if ``padded`` is True, the start position will be adjusted to include
    leading whitespace if ``node`` is a multiline statement.
    """
    if not hasattr(node, 'first_token'):
      return (1, 0), (1, 0)

    start = node.first_token.start
    end = node.last_token.end
    if padded and any(match_token(t, token.NEWLINE) for t in self.get_tokens(node)):
      # Set col_offset to 0 to include leading indentation for multiline statements.
      start = (start[0], 0)

    return start, end


class ASTText(ASTTextBase, object):
  """
  Supports the same ``get_text*`` methods as ``ASTTokens``,
  but uses the AST to determine the text positions instead of tokens.
  This is faster than ``ASTTokens`` as it requires less setup work.

  It also (sometimes) supports nodes inside f-strings, which ``ASTTokens`` doesn't.

  Some node types and/or Python versions are not supported.
  In these cases the ``get_text*`` methods will fall back to using ``ASTTokens``,
  which incurs the usual setup cost the first time.
  If you want to avoid this, check ``supports_tokenless(node)`` before calling ``get_text*`` methods.
  """

  def __init__(self, source_text, tree=None, filename='<unknown>'):
    # type: (Any, Optional[Module], str) -> None
    # FIXME: Strictly, the type of source_text is one of the six string types, but hard to specify with mypy given
    # https://mypy.readthedocs.io/en/stable/common_issues.html#variables-vs-type-aliases
    super(ASTText, self).__init__(source_text, filename)

    self._tree = tree
    if self._tree is not None:
      annotate_fstring_nodes(self._tree)

    self._asttokens = None  # type: Optional[ASTTokens]

  @property
  def tree(self):
    # type: () -> Module
    if self._tree is None:
      self._tree = ast.parse(self._text, self._filename)
      annotate_fstring_nodes(self._tree)
    return self._tree

  @property
  def asttokens(self):
    # type: () -> ASTTokens
    if self._asttokens is None:
      self._asttokens = ASTTokens(
        self._text,
        tree=self.tree,
        filename=self._filename,
      )
    return self._asttokens

  def _get_text_positions_tokenless(self, node, padded):
    # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
    """
    Version of ``get_text_positions()`` that doesn't use tokens.
    """
    if sys.version_info[:2] < (3, 8):  # pragma: no cover
      # This is just for mypy
      raise AssertionError("This method should only be called internally after checking supports_tokenless()")

    if is_module(node):
      # Modules don't have position info, so just return the range of the whole text.
      # The token-using method does something different, but its behavior seems weird and inconsistent.
      # For example, in a file with only comments, it only returns the first line.
      # It's hard to imagine a case when this matters.
      return (1, 0), self._line_numbers.offset_to_line(len(self._text))

    if getattr(node, 'lineno', None) is None:
      return (1, 0), (1, 0)

    assert node  # tell mypy that node is not None, which we allowed up to here for compatibility

    decorators = getattr(node, 'decorator_list', [])
    if not decorators:
      # Astroid uses node.decorators.nodes instead of node.decorator_list.
      decorators_node = getattr(node, 'decorators', None)
      decorators = getattr(decorators_node, 'nodes', [])
    if decorators:
      # Function/Class definition nodes are marked by AST as starting at def/class,
      # not the first decorator. This doesn't match the token-using behavior,
      # or inspect.getsource(), and just seems weird.
      start_node = decorators[0]
    else:
      start_node = node

    start_lineno = start_node.lineno
    end_node = last_stmt(node)

    # Include leading indentation for multiline statements.
    # This doesn't mean simple statements that happen to be on multiple lines,
    # but compound statements where inner indentation matters.
    # So we don't just compare node.lineno and node.end_lineno,
    # we check for a contained statement starting on a different line.
    if padded and (
        start_lineno != end_node.lineno
        or (
            # Astroid docstrings aren't treated as separate statements.
            # So to handle function/class definitions with a docstring but no other body,
            # we just check that the node is a statement with a docstring
            # and spanning multiple lines in the simple, literal sense.
            start_lineno != node.end_lineno
            and getattr(node, "doc_node", None)
            and is_stmt(node)
        )
    ):
      start_col_offset = 0
    else:
      start_col_offset = self._line_numbers.from_utf8_col(start_lineno, start_node.col_offset)

    start = (start_lineno, start_col_offset)

    # To match the token-using behaviour, we exclude trailing semicolons and comments.
    # This means that for blocks containing multiple statements, we have to use the last one
    # instead of the actual node for end_lineno and end_col_offset.
    end_lineno = cast(int, end_node.end_lineno)
    end_col_offset = cast(int, end_node.end_col_offset)
    end_col_offset = self._line_numbers.from_utf8_col(end_lineno, end_col_offset)
    end = (end_lineno, end_col_offset)

    return start, end

  def get_text_positions(self, node, padded):
    # type: (AstNode, bool) -> Tuple[Tuple[int, int], Tuple[int, int]]
    """
    Returns two ``(lineno, col_offset)`` tuples for the start and end of the given node.
    If the positions can't be determined, or the nodes don't correspond to any particular text,
    returns ``(1, 0)`` for both.

    ``padded`` corresponds to the ``padded`` argument to ``ast.get_source_segment()``.
    This means that if ``padded`` is True, the start position will be adjusted to include
    leading whitespace if ``node`` is a multiline statement.
    """
    if getattr(node, "_broken_positions", None):
      # This node was marked in util.annotate_fstring_nodes as having untrustworthy lineno/col_offset.
      return (1, 0), (1, 0)

    if supports_tokenless(node):
      return self._get_text_positions_tokenless(node, padded)

    return self.asttokens.get_text_positions(node, padded)


# Node types that _get_text_positions_tokenless doesn't support. Only relevant for Python 3.8+.
_unsupported_tokenless_types = ()  # type: Tuple[str, ...]
if sys.version_info[:2] >= (3, 8):
  # no lineno
  _unsupported_tokenless_types += ("arguments", "Arguments", "withitem")
  if sys.version_info[:2] == (3, 8):
    # _get_text_positions_tokenless works incorrectly for these types due to bugs in Python 3.8.
    _unsupported_tokenless_types += ("arg", "Starred")
    # no lineno in 3.8
    _unsupported_tokenless_types += ("Slice", "ExtSlice", "Index", "keyword")


def supports_tokenless(node=None):
  # type: (Any) -> bool
  """
  Returns True if the Python version and the node (if given) are supported by
  the ``get_text*`` methods of ``ASTText`` without falling back to ``ASTTokens``.
  See ``ASTText`` for why this matters.

  The following cases are not supported:

  - Python 3.7 and earlier
  - PyPy
  - ``ast.arguments`` / ``astroid.Arguments``
  - ``ast.withitem``
  - ``astroid.Comprehension``
  - ``astroid.AssignName`` inside ``astroid.Arguments`` or ``astroid.ExceptHandler``
  - The following nodes in Python 3.8 only:
    - ``ast.arg``
    - ``ast.Starred``
    - ``ast.Slice``
    - ``ast.ExtSlice``
    - ``ast.Index``
    - ``ast.keyword``
  """
  return (
    type(node).__name__ not in _unsupported_tokenless_types
    and not (
      # astroid nodes
      not isinstance(node, ast.AST) and node is not None and (
        (
          type(node).__name__ == "AssignName"
          and type(node.parent).__name__ in ("Arguments", "ExceptHandler")
        )
      )
    )
    and sys.version_info[:2] >= (3, 8)
    and 'pypy' not in sys.version.lower()
  )
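

# Example (a minimal sketch): checking supports_tokenless() before calling the
# ``get_text*`` methods, to avoid the one-time ASTTokens fallback cost.
# ``source`` is illustrative, not part of this module.
#
#   atext = ASTText(source)
#   for node in ast.walk(atext.tree):
#     if supports_tokenless(node):
#       text = atext.get_text(node)   # stays on the tokenless path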