  1. """
  2. Adaptor classes for using Pygments lexers within prompt_toolkit.
  3. This includes syntax synchronization code, so that we don't have to start
  4. lexing at the beginning of a document, when displaying a very large text.
  5. """
  6. from __future__ import annotations
  7. import re
  8. from abc import ABCMeta, abstractmethod
  9. from typing import TYPE_CHECKING, Callable, Dict, Generator, Iterable, Tuple
  10. from prompt_toolkit.document import Document
  11. from prompt_toolkit.filters import FilterOrBool, to_filter
  12. from prompt_toolkit.formatted_text.base import StyleAndTextTuples
  13. from prompt_toolkit.formatted_text.utils import split_lines
  14. from prompt_toolkit.styles.pygments import pygments_token_to_classname
  15. from .base import Lexer, SimpleLexer
  16. if TYPE_CHECKING:
  17. from pygments.lexer import Lexer as PygmentsLexerCls
  18. __all__ = [
  19. "PygmentsLexer",
  20. "SyntaxSync",
  21. "SyncFromStart",
  22. "RegexSync",
  23. ]


class SyntaxSync(metaclass=ABCMeta):
    """
    Syntax synchronizer. This is a tool that finds a start position for the
    lexer. This is especially important when editing big documents; we don't
    want to start the highlighting by running the lexer from the beginning of
    the file. That is very slow when editing.
    """

    @abstractmethod
    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> tuple[int, int]:
        """
        Return the position from where we can start lexing as a (row, column)
        tuple.

        :param document: `Document` instance that contains all the lines.
        :param lineno: The line that we want to highlight. (We need to return
            this line, or an earlier position.)
        """


class SyncFromStart(SyntaxSync):
    """
    Always start the syntax highlighting from the beginning.
    """

    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> tuple[int, int]:
        return 0, 0


class RegexSync(SyntaxSync):
    """
    Synchronize by starting at a line that matches the given regex pattern.
    """

    # Never go more than this number of lines backwards for synchronization.
    # That would be too CPU intensive.
    MAX_BACKWARDS = 500

    # Start lexing at the start, if we are in the first 'n' lines and no
    # synchronization position was found.
    FROM_START_IF_NO_SYNC_POS_FOUND = 100

    def __init__(self, pattern: str) -> None:
        self._compiled_pattern = re.compile(pattern)

    def get_sync_start_position(
        self, document: Document, lineno: int
    ) -> tuple[int, int]:
        """
        Scan backwards, and find a possible position to start.
        """
        pattern = self._compiled_pattern
        lines = document.lines

        # Scan upwards, until we find a point where we can start the syntax
        # synchronization.
        for i in range(lineno, max(-1, lineno - self.MAX_BACKWARDS), -1):
            match = pattern.match(lines[i])
            if match:
                return i, match.start()

        # No synchronization point found. If we aren't that far from the
        # beginning, start at the very beginning; otherwise, just try to start
        # at the current line.
        if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND:
            return 0, 0
        else:
            return lineno, 0

    @classmethod
    def from_pygments_lexer_cls(cls, lexer_cls: type[PygmentsLexerCls]) -> RegexSync:
        """
        Create a :class:`.RegexSync` instance for this Pygments lexer class.
        """
        patterns = {
            # For Python, start highlighting at any class/def block.
            "Python": r"^\s*(class|def)\s+",
            "Python 3": r"^\s*(class|def)\s+",
            # For HTML, start at any open/close tag definition.
            "HTML": r"<[/a-zA-Z]",
            # For JavaScript, start at a function definition.
            "JavaScript": r"\bfunction\b",
            # TODO: Add definitions for other languages.
            # By default, we start at every possible line.
        }
        p = patterns.get(lexer_cls.name, "^")
        return cls(p)
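

# Illustrative sketch (not part of the module's API): with the Python pattern
# above, synchronization scans upwards from the requested line until a line
# matches the regex. For example::
#
#     doc = Document("import os\n\ndef main():\n    print(os.name)\n")
#     RegexSync(r"^\s*(class|def)\s+").get_sync_start_position(doc, 3)
#
# returns ``(2, 0)``: lexing can safely restart at the ``def main():`` line.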


class _TokenCache(Dict[Tuple[str, ...], str]):
    """
    Cache that converts Pygments tokens into `prompt_toolkit` style objects.

    ``Token.A.B.C`` will be converted into:
    ``class:pygments,pygments.A,pygments.A.B,pygments.A.B.C``
    """

    def __missing__(self, key: tuple[str, ...]) -> str:
        result = "class:" + pygments_token_to_classname(key)
        self[key] = result
        return result


_token_cache = _TokenCache()
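
# Illustration (hedged, based on the docstring above): Pygments' token
# ``Token.Name.Function`` is tuple-like, equal to ``("Name", "Function")``,
# so looking it up here yields the memoized style string
# ``"class:pygments.name.function"``.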


class PygmentsLexer(Lexer):
    """
    Lexer that calls a Pygments lexer.

    Example::

        from pygments.lexers.html import HtmlLexer
        lexer = PygmentsLexer(HtmlLexer)

    Note: Don't forget to also load a Pygments compatible style. E.g.::

        from prompt_toolkit.styles import style_from_pygments_cls
        from pygments.styles import get_style_by_name
        style = style_from_pygments_cls(get_style_by_name('monokai'))

    :param pygments_lexer_cls: A `Lexer` class from Pygments.
    :param sync_from_start: Start lexing at the start of the document. This
        will always give the best results, but it will be slow for bigger
        documents. (When the last part of the document is displayed, the
        whole document will be lexed by Pygments on every keystroke.) It is
        recommended to disable this for inputs that are expected to be more
        than 1,000 lines.
    :param syntax_sync: `SyntaxSync` object.
    """
    # Minimum number of lines to go backwards when starting the parser.
    # This is important when the lines are retrieved in reverse order, or when
    # scrolling upwards. (Due to the complexity of calculating the vertical
    # scroll offset in the `Window` class, lines are not always retrieved in
    # order.)
    MIN_LINES_BACKWARDS = 50

    # When a parser was started this number of lines back, read the parser
    # until we get the current line. Otherwise, start a new parser.
    # (This should probably be bigger than MIN_LINES_BACKWARDS.)
    REUSE_GENERATOR_MAX_DISTANCE = 100

    def __init__(
        self,
        pygments_lexer_cls: type[PygmentsLexerCls],
        sync_from_start: FilterOrBool = True,
        syntax_sync: SyntaxSync | None = None,
    ) -> None:
        self.pygments_lexer_cls = pygments_lexer_cls
        self.sync_from_start = to_filter(sync_from_start)

        # Instantiate the Pygments lexer.
        self.pygments_lexer = pygments_lexer_cls(
            stripnl=False, stripall=False, ensurenl=False
        )

        # Create syntax sync instance.
        self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(
            pygments_lexer_cls
        )

    @classmethod
    def from_filename(
        cls, filename: str, sync_from_start: FilterOrBool = True
    ) -> Lexer:
        """
        Create a `Lexer` from a filename.
        """
        # Inline imports: the Pygments dependency is optional!
        from pygments.lexers import get_lexer_for_filename
        from pygments.util import ClassNotFound

        try:
            pygments_lexer = get_lexer_for_filename(filename)
        except ClassNotFound:
            return SimpleLexer()
        else:
            return cls(pygments_lexer.__class__, sync_from_start=sync_from_start)
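
    # Illustrative note (hedged): ``PygmentsLexer.from_filename("example.py")``
    # returns a ``PygmentsLexer`` wrapping Pygments' Python lexer, while an
    # unrecognized extension falls back to a plain ``SimpleLexer``.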

    def lex_document(self, document: Document) -> Callable[[int], StyleAndTextTuples]:
        """
        Create a lexer function that takes a line number and returns the list
        of (style_str, text) tuples, as the Pygments lexer returns them for
        that line.
        """
        LineGenerator = Generator[Tuple[int, StyleAndTextTuples], None, None]

        # Cache of already lexed lines.
        cache: dict[int, StyleAndTextTuples] = {}

        # Pygments generators that are currently lexing.
        # Map lexer generator to the line number.
        line_generators: dict[LineGenerator, int] = {}

        def get_syntax_sync() -> SyntaxSync:
            "The syntax synchronization object that we currently use."
            if self.sync_from_start():
                return SyncFromStart()
            else:
                return self.syntax_sync

        def find_closest_generator(i: int) -> LineGenerator | None:
            "Return a generator close to line 'i', or None if none was found."
            for generator, lineno in line_generators.items():
                if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE:
                    return generator
            return None

        def create_line_generator(start_lineno: int, column: int = 0) -> LineGenerator:
            """
            Create a generator that yields the lexed lines.
            Each iteration it yields a (line_number, [(style_str, text), ...]) tuple.
            """

            def get_text_fragments() -> Iterable[tuple[str, str]]:
                text = "\n".join(document.lines[start_lineno:])[column:]

                # We call `get_tokens_unprocessed`, because `get_tokens` will
                # still replace \r\n and \r by \n. (We don't want that;
                # Pygments should return exactly the same amount of text as
                # we have given as input.)
                for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text):
                    # Turn Pygments `Token` objects into prompt_toolkit style
                    # strings.
                    yield _token_cache[t], v

            yield from enumerate(split_lines(list(get_text_fragments())), start_lineno)

        def get_generator(i: int) -> LineGenerator:
            """
            Find an already started generator that is close, or create a new one.
            """
            # Find the closest line generator.
            generator = find_closest_generator(i)
            if generator:
                return generator

            # No generator found. Determine the starting point for the syntax
            # synchronization first.

            # Go at least x lines back. (Make scrolling upwards more
            # efficient.)
            i = max(0, i - self.MIN_LINES_BACKWARDS)

            if i == 0:
                row = 0
                column = 0
            else:
                row, column = get_syntax_sync().get_sync_start_position(document, i)

            # Find a generator close to this point, or otherwise create a new one.
            generator = find_closest_generator(i)
            if generator:
                return generator
            else:
                generator = create_line_generator(row, column)

                # If the column is not 0, ignore the first line. (Which is
                # incomplete. This happens when the synchronization algorithm
                # tells us to start parsing in the middle of a line.)
                if column:
                    next(generator)
                    row += 1

                line_generators[generator] = row
                return generator

        def get_line(i: int) -> StyleAndTextTuples:
            "Return the tokens for a given line number."
            try:
                return cache[i]
            except KeyError:
                generator = get_generator(i)

                # Exhaust the generator, until we find the requested line.
                for num, line in generator:
                    cache[num] = line
                    if num == i:
                        line_generators[generator] = i

                        # Remove the next item from the cache.
                        # (It could happen that it's already there, because of
                        # another generator that started filling these lines,
                        # but we want to synchronize these lines with the
                        # current lexer's state.)
                        if num + 1 in cache:
                            del cache[num + 1]

                        return cache[num]

            return []

        return get_line
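

if __name__ == "__main__":
    # Illustrative demo, not part of the library: lex a small HTML document
    # and print the styled fragments of its first line. (Assumes the optional
    # ``pygments`` dependency is installed; the document text is made up.)
    from pygments.lexers.html import HtmlLexer

    demo_lexer = PygmentsLexer(HtmlLexer)
    demo_document = Document("<html>\n  <body>Hello</body>\n</html>\n")
    get_line = demo_lexer.lex_document(demo_document)

    # Each fragment is a ("class:pygments...", text) tuple, styled per token.
    print(get_line(0))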