"""
Lexer interface and implementation.
Used for syntax highlighting.
"""
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
from six import with_metaclass
from six.moves import range

from prompt_toolkit.token import Token
from prompt_toolkit.filters import to_cli_filter

from .utils import split_lines

import re
import six

__all__ = (
    'Lexer',
    'SimpleLexer',
    'PygmentsLexer',
    'SyntaxSync',
    'SyncFromStart',
    'RegexSync',
)


class Lexer(with_metaclass(ABCMeta, object)):
    """
    Base class for all lexers.
    """
    @abstractmethod
    def lex_document(self, cli, document):
        """
        Takes a :class:`~prompt_toolkit.document.Document` and returns a
        callable that takes a line number and returns the tokens for that line.
        """
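

# A minimal sketch of a custom lexer, to illustrate the contract above: the
# callable returned by `lex_document` maps a line number to a list of
# (Token, text) tuples. (Illustrative only, not part of the public API; it
# highlights everything from a '#' onwards with `Token.Comment`.)
class _ExampleCommentLexer(Lexer):
    def lex_document(self, cli, document):
        lines = document.lines

        def get_line(lineno):
            try:
                line = lines[lineno]
            except IndexError:
                return []
            before, sep, after = line.partition('#')
            return [(Token, before), (Token.Comment, sep + after)]
        return get_line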


class SimpleLexer(Lexer):
    """
    Lexer that doesn't do any tokenizing and returns the whole input as one token.

    :param token: The `Token` for this lexer.
    """
    # `default_token` parameter is deprecated!
    def __init__(self, token=Token, default_token=None):
        self.token = token

        if default_token is not None:
            self.token = default_token

    def lex_document(self, cli, document):
        lines = document.lines

        def get_line(lineno):
            " Return the tokens for the given line. "
            try:
                return [(self.token, lines[lineno])]
            except IndexError:
                return []
        return get_line
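

# Usage sketch for `SimpleLexer` (illustrative only, never called): `cli` is
# unused by this lexer, so `None` suffices here.
def _example_simple_lexer_usage():
    from prompt_toolkit.document import Document

    lexer = SimpleLexer(token=Token.Text)
    get_line = lexer.lex_document(None, Document('hello\nworld'))

    assert get_line(0) == [(Token.Text, 'hello')]
    assert get_line(5) == []  # Out-of-range lines yield no tokens.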


class SyntaxSync(with_metaclass(ABCMeta, object)):
    """
    Syntax synchroniser. This is a tool that finds a start position for the
    lexer. This is especially important when editing big documents; we don't
    want to start the highlighting by running the lexer from the beginning of
    the file. That is very slow when editing.
    """
    @abstractmethod
    def get_sync_start_position(self, document, lineno):
        """
        Return the position from where we can start lexing as a (row, column)
        tuple.

        :param document: `Document` instance that contains all the lines.
        :param lineno: The line that we want to highlight. (We need to return
            this line, or an earlier position.)
        """


class SyncFromStart(SyntaxSync):
    """
    Always start the syntax highlighting from the beginning.
    """
    def get_sync_start_position(self, document, lineno):
        return 0, 0
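

# A minimal sketch of a custom synchroniser (illustrative only): start lexing
# at the closest preceding blank line, on the assumption that the language
# resets its lexer state there.
class _ExampleBlankLineSync(SyntaxSync):
    def get_sync_start_position(self, document, lineno):
        lines = document.lines
        for i in range(lineno, -1, -1):
            if not lines[i].strip():
                return i, 0
        return 0, 0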


class RegexSync(SyntaxSync):
    """
    Synchronize by starting at a line that matches the given regex pattern.
    """
    # Never go more than this number of lines backwards for synchronisation.
    # That would be too CPU intensive.
    MAX_BACKWARDS = 500

    # Start lexing at the start, if we are in the first 'n' lines and no
    # synchronisation position was found.
    FROM_START_IF_NO_SYNC_POS_FOUND = 100

    def __init__(self, pattern):
        assert isinstance(pattern, six.text_type)
        self._compiled_pattern = re.compile(pattern)

    def get_sync_start_position(self, document, lineno):
        " Scan backwards, and find a possible position to start. "
        pattern = self._compiled_pattern
        lines = document.lines

        # Scan upwards, until we find a point where we can start the syntax
        # synchronisation.
        for i in range(lineno, max(-1, lineno - self.MAX_BACKWARDS), -1):
            match = pattern.match(lines[i])
            if match:
                return i, match.start()

        # No synchronisation point found. If we aren't that far from the
        # beginning, start at the very beginning, otherwise, just try to start
        # at the current line.
        if lineno < self.FROM_START_IF_NO_SYNC_POS_FOUND:
            return 0, 0
        else:
            return lineno, 0

    @classmethod
    def from_pygments_lexer_cls(cls, lexer_cls):
        """
        Create a :class:`.RegexSync` instance for this Pygments lexer class.
        """
        patterns = {
            # For Python, start highlighting at any class/def block.
            'Python': r'^\s*(class|def)\s+',
            'Python 3': r'^\s*(class|def)\s+',

            # For HTML, start at any open/close tag definition.
            'HTML': r'<[/a-zA-Z]',

            # For JavaScript, start at a function.
            'JavaScript': r'\bfunction\b'

            # TODO: Add definitions for other languages.
            #       By default, we start at every possible line.
        }
        p = patterns.get(lexer_cls.name, '^')
        return cls(p)
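

# Usage sketch (illustrative only): pair a hand-written sync pattern with the
# `PygmentsLexer` defined below. The pattern here is an assumption for
# demonstration, not one shipped with this module.
def _example_regex_sync_usage():
    from pygments.lexers import CLexer

    sync = RegexSync(r'^\w+')  # E.g. resynchronise at unindented lines.
    return PygmentsLexer(CLexer, sync_from_start=False, syntax_sync=sync)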


class PygmentsLexer(Lexer):
    """
    Lexer that calls a Pygments lexer.

    Example::

        from pygments.lexers import HtmlLexer
        lexer = PygmentsLexer(HtmlLexer)

    Note: Don't forget to also load a Pygments compatible style. E.g.::

        from prompt_toolkit.styles.from_pygments import style_from_pygments
        from pygments.styles import get_style_by_name
        style = style_from_pygments(get_style_by_name('monokai'))

    :param pygments_lexer_cls: A `Lexer` from Pygments.
    :param sync_from_start: Start lexing at the start of the document. This
        will always give the best results, but it will be slow for bigger
        documents. (When the last part of the document is displayed, the
        whole document will be lexed by Pygments on every keystroke.) It is
        recommended to disable this for inputs that are expected to be more
        than 1,000 lines.
    :param syntax_sync: `SyntaxSync` object.
    """
    # Minimum number of lines to go backwards when starting the parser.
    # This is important when the lines are retrieved in reverse order, or when
    # scrolling upwards. (Due to the complexity of calculating the vertical
    # scroll offset in the `Window` class, lines are not always retrieved in
    # order.)
    MIN_LINES_BACKWARDS = 50

    # When a parser was started this number of lines back, read the parser
    # until we get the current line. Otherwise, start a new parser.
    # (This should probably be bigger than MIN_LINES_BACKWARDS.)
    REUSE_GENERATOR_MAX_DISTANCE = 100

    def __init__(self, pygments_lexer_cls, sync_from_start=True, syntax_sync=None):
        assert syntax_sync is None or isinstance(syntax_sync, SyntaxSync)

        self.pygments_lexer_cls = pygments_lexer_cls
        self.sync_from_start = to_cli_filter(sync_from_start)

        # Instantiate the Pygments lexer.
        self.pygments_lexer = pygments_lexer_cls(
            stripnl=False,
            stripall=False,
            ensurenl=False)

        # Create syntax sync instance.
        self.syntax_sync = syntax_sync or RegexSync.from_pygments_lexer_cls(pygments_lexer_cls)

    @classmethod
    def from_filename(cls, filename, sync_from_start=True):
        """
        Create a `Lexer` from a filename.
        """
        # Inline imports: the Pygments dependency is optional!
        from pygments.util import ClassNotFound
        from pygments.lexers import get_lexer_for_filename

        try:
            pygments_lexer = get_lexer_for_filename(filename)
        except ClassNotFound:
            return SimpleLexer()
        else:
            return cls(pygments_lexer.__class__, sync_from_start=sync_from_start)
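
    # Usage sketch (illustrative only): `from_filename` falls back to a plain
    # `SimpleLexer` when Pygments has no lexer registered for the extension,
    # so the result is always a usable `Lexer`. E.g.:
    #
    #     lexer = PygmentsLexer.from_filename('script.py', sync_from_start=False)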

    def lex_document(self, cli, document):
        """
        Create a lexer function that takes a line number and returns the list
        of (Token, text) tuples as the Pygments lexer returns for that line.
        """
        # Cache of already lexed lines.
        cache = {}

        # Pygments generators that are currently lexing.
        line_generators = {}  # Map lexer generator to the line number.

        def get_syntax_sync():
            " The syntax synchronisation object that we currently use. "
            if self.sync_from_start(cli):
                return SyncFromStart()
            else:
                return self.syntax_sync

        def find_closest_generator(i):
            " Return a generator close to line 'i', or None if none was found. "
            for generator, lineno in line_generators.items():
                if lineno < i and i - lineno < self.REUSE_GENERATOR_MAX_DISTANCE:
                    return generator

        def create_line_generator(start_lineno, column=0):
            """
            Create a generator that yields the lexed lines.
            Each iteration, it yields a (line_number, [(token, text), ...]) tuple.
            """
            def get_tokens():
                text = '\n'.join(document.lines[start_lineno:])[column:]

                # We call `get_tokens_unprocessed`, because `get_tokens` will
                # still replace \r\n and \r by \n. (We don't want that;
                # Pygments should return exactly the same amount of text as we
                # have given as input.)
                for _, t, v in self.pygments_lexer.get_tokens_unprocessed(text):
                    yield t, v

            return enumerate(split_lines(get_tokens()), start_lineno)

        def get_generator(i):
            """
            Find an already started generator that is close, or create a new one.
            """
            # Find closest line generator.
            generator = find_closest_generator(i)
            if generator:
                return generator

            # No generator found. Determine starting point for the syntax
            # synchronisation first.

            # Go at least x lines back. (Make scrolling upwards more
            # efficient.)
            i = max(0, i - self.MIN_LINES_BACKWARDS)

            if i == 0:
                row = 0
                column = 0
            else:
                row, column = get_syntax_sync().get_sync_start_position(document, i)

            # Find generator close to this point, or otherwise create a new one.
            generator = find_closest_generator(i)
            if generator:
                return generator
            else:
                generator = create_line_generator(row, column)

                # If the column is not 0, ignore the first line. (Which is
                # incomplete. This happens when the synchronisation algorithm
                # tells us to start parsing in the middle of a line.)
                if column:
                    next(generator)
                    row += 1

                line_generators[generator] = row
                return generator

        def get_line(i):
            " Return the tokens for a given line number. "
            try:
                return cache[i]
            except KeyError:
                generator = get_generator(i)

                # Exhaust the generator, until we find the requested line.
                for num, line in generator:
                    cache[num] = line
                    if num == i:
                        line_generators[generator] = i

                        # Remove the next item from the cache.
                        # (It could happen that it's already there, because of
                        # another generator that started filling these lines,
                        # but we want to synchronise these lines with the
                        # current lexer's state.)
                        if num + 1 in cache:
                            del cache[num + 1]

                        return cache[num]
            return []

        return get_line
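

# Usage sketch for this module as a whole, assuming Pygments is installed.
# (Illustrative only; `cli` may be None here, because the default
# `sync_from_start=True` becomes a constant filter that ignores it.)
def _example_pygments_lexer_usage():
    from prompt_toolkit.document import Document
    from pygments.lexers import PythonLexer

    lexer = PygmentsLexer(PythonLexer)
    document = Document('def add(a, b):\n    return a + b\n')
    get_line = lexer.lex_document(None, document)

    # Tokens for the first line, e.g. [(Token.Keyword, u'def'), ...].
    return get_line(0)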