lexer.py

  1. """
  2. pygments.lexer
  3. ~~~~~~~~~~~~~~
  4. Base lexer classes.
  5. :copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. import sys
  10. import time
  11. from pygments.filter import apply_filters, Filter
  12. from pygments.filters import get_filter_by_name
  13. from pygments.token import Error, Text, Other, Whitespace, _TokenType
  14. from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
  15. make_analysator, Future, guess_decode
  16. from pygments.regexopt import regex_opt
  17. __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
  18. 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
  19. 'default', 'words', 'line_re']
  20. line_re = re.compile('.*?\n')
  21. _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
  22. (b'\xff\xfe\0\0', 'utf-32'),
  23. (b'\0\0\xfe\xff', 'utf-32be'),
  24. (b'\xff\xfe', 'utf-16'),
  25. (b'\xfe\xff', 'utf-16be')]
  26. _default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    See also :doc:`lexerdevelopment`, a high-level guide to writing
    lexers.

    Lexer classes have attributes used for choosing the most appropriate
    lexer based on various criteria.

    .. autoattribute:: name
       :no-value:
    .. autoattribute:: aliases
       :no-value:
    .. autoattribute:: filenames
       :no-value:
    .. autoattribute:: alias_filenames
    .. autoattribute:: mimetypes
       :no-value:
    .. autoattribute:: priority

    Lexers included in Pygments should have two additional attributes:

    .. autoattribute:: url
       :no-value:
    .. autoattribute:: version_added
       :no-value:

    Lexers included in Pygments may have additional attributes:

    .. autoattribute:: _example
       :no-value:

    You can pass options to the constructor. The basic options recognized
    by all lexers and processed by the base `Lexer` class are:

    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Full name of the lexer, in human-readable form
    name = None

    #: A list of short, unique identifiers that can be used to look
    #: up the lexer from a list, e.g., using `get_lexer_by_name()`.
    aliases = []

    #: A list of `fnmatch` patterns that match filenames which contain
    #: content for this lexer. The patterns in this list should be unique among
    #: all lexers.
    filenames = []

    #: A list of `fnmatch` patterns that match filenames which may or may not
    #: contain content for this lexer. This list is used by the
    #: :func:`.guess_lexer_for_filename()` function, to determine which lexers
    #: are then included in guessing the correct one. That means that
    #: e.g. every lexer for HTML and a template language should include
    #: ``\*.html`` in this list.
    alias_filenames = []

    #: A list of MIME types for content that can be lexed with this lexer.
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    #: URL of the language specification/definition. Used in the Pygments
    #: documentation. Set to an empty string to disable.
    url = None

    #: Version of Pygments in which the lexer was added.
    version_added = None

    #: Example file name. Relative to the ``tests/examplefiles`` directory.
    #: This is used by the documentation generator to show an example.
    _example = None

    def __init__(self, **options):
        """
        This constructor takes arbitrary options as keyword arguments.
        Every subclass must first process its own options and then call
        the `Lexer` constructor, since it processes the basic
        options like `stripnl`.

        An example looks like this:

        .. sourcecode:: python

           def __init__(self, **options):
               self.compress = options.get('compress', '')
               Lexer.__init__(self, **options)

        As these options must all be specifiable as strings (due to the
        command line usage), there are various utility functions
        available to help with that, see `Utilities`_.
        """
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return f'<pygments.lexers.{self.__class__.__name__} with {self.options!r}>'
        else:
            return f'<pygments.lexers.{self.__class__.__name__}>'

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        A static method which is called for lexer guessing.

        It should analyse the text and return a float in the range
        from ``0.0`` to ``1.0``. If it returns ``0.0``, the lexer
        will not be selected as the most probable one, if it returns
        ``1.0``, it will be selected immediately. This is used by
        `guess_lexer`.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """

    def _preprocess_lexer_input(self, text):
        """Apply preprocessing such as decoding the input, removing BOM and normalizing newlines."""
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')

        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        return text

    def get_tokens(self, text, unfiltered=False):
        """
        This method is the basic interface of a lexer. It is called by
        the `highlight()` function. It must process the text and return an
        iterable of ``(tokentype, value)`` pairs from `text`.

        Normally, you don't need to override this method. The default
        implementation processes the options recognized by all lexers
        (`stripnl`, `stripall` and so on), and then yields all tokens
        from `get_tokens_unprocessed()`, with the ``index`` dropped.

        If `unfiltered` is set to `True`, the filtering mechanism is
        bypassed even if filters are defined.
        """
        text = self._preprocess_lexer_input(text)

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        This method should process the text and return an iterable of
        ``(index, tokentype, value)`` tuples where ``index`` is the starting
        position of the token within the input text.

        It must be overridden by subclasses. It is recommended to
        implement it as a generator to maximize effectiveness.
        """
        raise NotImplementedError
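

# Illustrative sketch (not part of the original module): a minimal ``Lexer``
# subclass that implements ``get_tokens_unprocessed`` directly, as described
# in the docstrings above.  The class name ``WordLexer`` is hypothetical.
#
#     import re
#     from pygments.lexer import Lexer
#     from pygments.token import Name, Whitespace
#
#     class WordLexer(Lexer):
#         name = 'Words'
#         aliases = ['words']
#
#         def get_tokens_unprocessed(self, text):
#             # yield (index, tokentype, value) for every run of the input
#             for match in re.finditer(r'\S+|\s+', text):
#                 ttype = Whitespace if match.group().isspace() else Name
#                 yield match.start(), ttype, match.group()
#
#     # list(WordLexer().get_tokens('hello world')) then yields
#     # (tokentype, value) pairs with the index dropped.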


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
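

# Illustrative sketch (not part of the original module): composing a template
# lexer as described in the ``DelegatingLexer`` docstring.  ``MyTemplateLexer``
# and ``MyTagLexer`` are hypothetical; ``HtmlLexer`` is used only as an
# example root lexer.
#
#     from pygments.lexers.html import HtmlLexer
#
#     class MyTemplateLexer(DelegatingLexer):
#         def __init__(self, **options):
#             # ``MyTagLexer`` (hypothetical) must emit ``Other`` tokens for
#             # every region that is not template syntax; those regions are
#             # then re-lexed by the root ``HtmlLexer``.
#             super().__init__(HtmlLexer, MyTagLexer, **options)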


# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
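

# Illustrative sketch (not part of the original module): a typical ``bygroups``
# rule inside a ``RegexLexer`` token definition.  The rule itself is
# hypothetical; it only shows how each regex group maps to one action
# (token types come from ``pygments.token``).
#
#     tokens = {
#         'root': [
#             # group 1 -> Name.Function, group 2 -> Whitespace,
#             # group 3 -> Punctuation
#             (r'(\w+)(\s*)(\()',
#              bygroups(Name.Function, Whitespace, Punctuation)),
#         ],
#     }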


class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
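

# Illustrative sketch (not part of the original module): two common ways of
# calling ``using`` in a rule.  The surrounding lexer, the delimiters and the
# ``PythonLexer`` reference are hypothetical examples only.
#
#     from pygments.lexers.python import PythonLexer
#
#     tokens = {
#         'root': [
#             # delegate the matched text to another lexer ...
#             (r'<%.*?%>', using(PythonLexer)),
#             # ... or to this lexer itself, starting in a given state
#             (r'\[\[.*?\]\]', using(this, state='inline')),
#         ],
#     }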


class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example default('#pop') is equivalent to ('', Token, '#pop')
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
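

# Illustrative sketch (not part of the original module): ``words`` builds an
# optimized alternation when the token definitions are processed.  The keyword
# list below is hypothetical; ``Keyword`` comes from ``pygments.token``.
#
#     tokens = {
#         'root': [
#             (words(('if', 'elif', 'else', 'while'),
#                    prefix=r'\b', suffix=r'\b'),
#              Keyword),
#         ],
#     }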


class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            f'token type must be simple type or callable, not {token!r}'
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, f'unknown new state {new_state!r}'
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, f'circular state ref {istate!r}'
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, f'unknown new state def {new_state!r}'

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert isinstance(state, str), f"wrong state name {state!r}"
        assert state[0] != '#', f"invalid state name {state!r}"
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, f"circular state reference {state!r}"
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, f"wrong rule def {tdef!r}"

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError(f"uncompilable regex {tdef[0]!r} in state {state!r} of {cls!r}: {err}") from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, f"wrong state def: {new_state!r}"
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
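

# Illustrative sketch (not part of the original module): a minimal
# ``RegexLexer`` subclass showing the ``tokens`` dictionary, ``include``,
# a state push and ``#pop``.  The language and the class name
# ``IniLikeLexer`` are hypothetical; token types come from ``pygments.token``.
#
#     from pygments.token import Comment, Name, Operator, String, Whitespace
#
#     class IniLikeLexer(RegexLexer):
#         name = 'IniLike'
#         aliases = ['inilike']
#
#         tokens = {
#             'whitespace': [
#                 (r'\s+', Whitespace),
#             ],
#             'root': [
#                 include('whitespace'),
#                 (r';.*?$', Comment.Single),
#                 (r'\[[^\]]+\]', Name.Namespace),
#                 (r'[^=\s]+', Name.Attribute),
#                 (r'=', Operator, 'value'),     # push the 'value' state
#             ],
#             'value': [
#                 (r'[^\n]+', String),
#                 (r'\n', Whitespace, '#pop'),   # back to 'root' at EOL
#             ],
#         }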


class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return f'LexerContext({self.text!r}, {self.pos!r}, {self.stack!r})'


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, f"wrong state def: {new_state!r}"
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
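

# Illustrative sketch (not part of the original module): driving an
# ``ExtendedRegexLexer`` through an explicit ``LexerContext``.  ``MyExtLexer``
# is a hypothetical subclass; the point is only that the position and state
# stack live on the context object rather than in local variables.
#
#     lexer = MyExtLexer()
#     ctx = LexerContext('some input text\n', 0)
#     tokens = list(lexer.get_tokens_unprocessed(context=ctx))
#     # After lexing, ctx.pos and ctx.stack describe where the lexer stopped.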


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
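

# Illustrative sketch (not part of the original module): the usual
# ``do_insertions`` pattern inside a console-session lexer's
# ``get_tokens_unprocessed``.  ``session_text``, ``PythonLexer``, ``Generic``
# and ``options`` are assumed names used only for illustration.
#
#     insertions = []
#     code = ''
#     for match in line_re.finditer(session_text):
#         line = match.group()
#         if line.startswith('>>> '):
#             # prompt tokens are inserted at the current code offset
#             insertions.append((len(code), [(0, Generic.Prompt, line[:4])]))
#             code += line[4:]
#     yield from do_insertions(
#         insertions, PythonLexer(**options).get_tokens_unprocessed(code))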


class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
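

# Illustrative sketch (not part of the original module): profiling the regexes
# of an existing lexer by mixing ``ProfilingRegexLexer`` into it.  The class
# name ``ProfilingPythonLexer`` is hypothetical; ``PythonLexer`` is used only
# as an example target.
#
#     from pygments.lexers.python import PythonLexer
#
#     class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
#         pass
#
#     with open('some_file.py') as f:
#         list(ProfilingPythonLexer().get_tokens(f.read()))
#     # Once the token stream is exhausted, a table of per-regex call counts
#     # and cumulative times is printed.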