# _tokenizer.py

from __future__ import annotations

import contextlib
import re
from dataclasses import dataclass
from typing import Iterator, NoReturn

from .specifiers import Specifier


@dataclass
class Token:
    name: str
    text: str
    position: int


class ParserSyntaxError(Exception):
    """The provided source text could not be parsed correctly."""

    def __init__(
        self,
        message: str,
        *,
        source: str,
        span: tuple[int, int],
    ) -> None:
        self.span = span
        self.message = message
        self.source = source

        super().__init__()

    def __str__(self) -> str:
        marker = " " * self.span[0] + "~" * (self.span[1] - self.span[0]) + "^"
        return "\n    ".join([self.message, self.source, marker])
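

# Illustrative rendering (hypothetical values, not part of the upstream
# module): with message="Expected a quoted string", source='extra == ???'
# and span=(9, 12), str() of the exception produces roughly:
#
#     Expected a quoted string
#         extra == ???
#                  ~~~^
#
# i.e. the message, the source line, and a marker underlining the span.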

DEFAULT_RULES: dict[str, str | re.Pattern[str]] = {
    "LEFT_PARENTHESIS": r"\(",
    "RIGHT_PARENTHESIS": r"\)",
    "LEFT_BRACKET": r"\[",
    "RIGHT_BRACKET": r"\]",
    "SEMICOLON": r";",
    "COMMA": r",",
    "QUOTED_STRING": re.compile(
        r"""
            (
                ('[^']*')
                |
                ("[^"]*")
            )
        """,
        re.VERBOSE,
    ),
    "OP": r"(===|==|~=|!=|<=|>=|<|>)",
    "BOOLOP": r"\b(or|and)\b",
    "IN": r"\bin\b",
    "NOT": r"\bnot\b",
    "VARIABLE": re.compile(
        r"""
            \b(
                python_version
                |python_full_version
                |os[._]name
                |sys[._]platform
                |platform_(release|system)
                |platform[._](version|machine|python_implementation)
                |python_implementation
                |implementation_(name|version)
                |extra
            )\b
        """,
        re.VERBOSE,
    ),
    "SPECIFIER": re.compile(
        Specifier._operator_regex_str + Specifier._version_regex_str,
        re.VERBOSE | re.IGNORECASE,
    ),
    "AT": r"\@",
    "URL": r"[^ \t]+",
    "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b",
    "VERSION_PREFIX_TRAIL": r"\.\*",
    "VERSION_LOCAL_LABEL_TRAIL": r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*",
    "WS": r"[ \t]+",
    "END": r"$",
}
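
# Illustrative sketch (not part of the upstream module): these rules do not
# tokenize a string on their own; the companion parser (packaging's
# _parser.py) decides which rule to try at each position. A dependency
# specifier such as
#
#     name[extra]>=1.0; python_version < "3.11"
#
# is consumed roughly as IDENTIFIER, LEFT_BRACKET, IDENTIFIER, RIGHT_BRACKET,
# SPECIFIER, SEMICOLON, VARIABLE, OP, QUOTED_STRING, END, with WS tokens
# consumed in between as needed.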


class Tokenizer:
    """Context-sensitive token parsing.

    Provides methods to examine the input stream to check whether the next token
    matches.
    """

    def __init__(
        self,
        source: str,
        *,
        rules: dict[str, str | re.Pattern[str]],
    ) -> None:
        self.source = source
        self.rules: dict[str, re.Pattern[str]] = {
            name: re.compile(pattern) for name, pattern in rules.items()
        }
        self.next_token: Token | None = None
        self.position = 0

    def consume(self, name: str) -> None:
        """Move beyond provided token name, if at current position."""
        if self.check(name):
            self.read()

    def check(self, name: str, *, peek: bool = False) -> bool:
        """Check whether the next token has the provided name.

        By default, if the check succeeds, the token *must* be read before
        another check. If `peek` is set to `True`, the token is not loaded and
        would need to be checked again.
        """
        assert (
            self.next_token is None
        ), f"Cannot check for {name!r}, already have {self.next_token!r}"
        assert name in self.rules, f"Unknown token name: {name!r}"

        expression = self.rules[name]

        match = expression.match(self.source, self.position)
        if match is None:
            return False
        if not peek:
            self.next_token = Token(name, match[0], self.position)
        return True
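
    # Matching is attempted one named rule at a time at the current position,
    # so the same text can satisfy different rules depending on what the
    # caller asks for; e.g. the text "in" matches both the IN and IDENTIFIER
    # rules. This is what makes the tokenizer "context-sensitive": the parser
    # chooses which rule to check next.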

    def expect(self, name: str, *, expected: str) -> Token:
        """Expect a certain token name next, failing with a syntax error otherwise.

        The token is read and returned.
        """
        if not self.check(name):
            raise self.raise_syntax_error(f"Expected {expected}")
        return self.read()
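
    # Hypothetical call, for illustration: at the end of the input a caller
    # might write `tokenizer.expect("END", expected="end of dependency
    # specifier")`, which either returns the END token or raises
    # ParserSyntaxError pointing at the current position.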

    def read(self) -> Token:
        """Consume the next token and return it."""
        token = self.next_token
        assert token is not None

        self.position += len(token.text)
        self.next_token = None

        return token

    def raise_syntax_error(
        self,
        message: str,
        *,
        span_start: int | None = None,
        span_end: int | None = None,
    ) -> NoReturn:
        """Raise ParserSyntaxError at the given position."""
        span = (
            self.position if span_start is None else span_start,
            self.position if span_end is None else span_end,
        )
        raise ParserSyntaxError(
            message,
            source=self.source,
            span=span,
        )

    @contextlib.contextmanager
    def enclosing_tokens(
        self, open_token: str, close_token: str, *, around: str
    ) -> Iterator[None]:
        """Wrap a region optionally enclosed by `open_token` ... `close_token`.

        If the opening token is present it is read before yielding; after the
        wrapped block runs, the matching closing token must come next,
        otherwise a syntax error is raised whose span reaches back to the
        opening token.
        """
        if self.check(open_token):
            open_position = self.position
            self.read()
        else:
            open_position = None

        yield

        if open_position is None:
            return

        if not self.check(close_token):
            self.raise_syntax_error(
                f"Expected matching {close_token} for {open_token}, after {around}",
                span_start=open_position,
            )

        self.read()
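

# Usage sketch (illustrative, not part of the upstream module): a caller such
# as packaging's _parser.py wraps a bracketed region with enclosing_tokens so
# that a missing closing bracket is reported at the opening one:
#
#     tokenizer = Tokenizer("[extra1, extra2", rules=DEFAULT_RULES)
#     with tokenizer.enclosing_tokens(
#         "LEFT_BRACKET", "RIGHT_BRACKET", around="extras"
#     ):
#         ...  # read the comma-separated IDENTIFIER tokens here
#
# Because RIGHT_BRACKET never appears in this source, raise_syntax_error is
# called with span_start set to the position of the opening LEFT_BRACKET.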