lexer.py

"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import sys
import time

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, Whitespace, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
    make_analysator, Future, guess_decode
from pygments.regexopt import regex_opt

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words', 'line_re']

line_re = re.compile('.*?\n')

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    See also :doc:`lexerdevelopment`, a high-level guide to writing
    lexers.

    Lexer classes have attributes used for choosing the most appropriate
    lexer based on various criteria.

    .. autoattribute:: name
       :no-value:
    .. autoattribute:: aliases
       :no-value:
    .. autoattribute:: filenames
       :no-value:
    .. autoattribute:: alias_filenames
    .. autoattribute:: mimetypes
       :no-value:
    .. autoattribute:: priority

    Lexers included in Pygments should have an additional attribute:

    .. autoattribute:: url
       :no-value:

    Lexers included in Pygments may have additional attributes:

    .. autoattribute:: _example
       :no-value:

    You can pass options to the constructor. The basic options recognized
    by all lexers and processed by the base `Lexer` class are:

    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Full name of the lexer, in human-readable form
    name = None

    #: A list of short, unique identifiers that can be used to look
    #: up the lexer from a list, e.g., using `get_lexer_by_name()`.
    aliases = []

    #: A list of `fnmatch` patterns that match filenames which contain
    #: content for this lexer. The patterns in this list should be unique among
    #: all lexers.
    filenames = []

    #: A list of `fnmatch` patterns that match filenames which may or may not
    #: contain content for this lexer. This list is used by the
    #: :func:`.guess_lexer_for_filename()` function, to determine which lexers
    #: are then included in guessing the correct one. That means that
    #: e.g. every lexer for HTML and a template language should include
    #: ``\*.html`` in this list.
    alias_filenames = []

    #: A list of MIME types for content that can be lexed with this lexer.
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    #: URL of the language specification/definition. Used in the Pygments
    #: documentation.
    url = None

    #: Example file name. Relative to the ``tests/examplefiles`` directory.
    #: This is used by the documentation generator to show an example.
    _example = None

    def __init__(self, **options):
        """
        This constructor takes arbitrary options as keyword arguments.
        Every subclass must first process its own options and then call
        the `Lexer` constructor, since it processes the basic
        options like `stripnl`.

        An example looks like this:

        .. sourcecode:: python

           def __init__(self, **options):
               self.compress = options.get('compress', '')
               Lexer.__init__(self, **options)

        As these options must all be specifiable as strings (due to the
        command line usage), there are various utility functions
        available to help with that, see `Utilities`_.
        """
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        A static method which is called for lexer guessing.

        It should analyse the text and return a float in the range
        from ``0.0`` to ``1.0``. If it returns ``0.0``, the lexer
        will not be selected as the most probable one, if it returns
        ``1.0``, it will be selected immediately. This is used by
        `guess_lexer`.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """
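
    # A hedged illustration (not part of the original file): a subclass might
    # implement ``analyse_text`` roughly like the sketch below; ``LexerMeta``
    # then wraps it so the result is clamped to a float. The shebang check is
    # an assumed example, not taken from any particular lexer.
    #
    #     def analyse_text(text):
    #         if text.startswith('#!/usr/bin/env python'):
    #             return 1.0
    #         return 0.0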

    def get_tokens(self, text, unfiltered=False):
        """
        This method is the basic interface of a lexer. It is called by
        the `highlight()` function. It must process the text and return an
        iterable of ``(tokentype, value)`` pairs from `text`.

        Normally, you don't need to override this method. The default
        implementation processes the options recognized by all lexers
        (`stripnl`, `stripall` and so on), and then yields all tokens
        from `get_tokens_unprocessed()`, with the ``index`` dropped.

        If `unfiltered` is set to `True`, the filtering mechanism is
        bypassed even if filters are defined.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream
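
    # A hedged usage sketch (not part of the original file): ``get_tokens`` is
    # what consumers normally iterate over, e.g.
    #
    #     from pygments.lexers import PythonLexer   # assumed available
    #     for ttype, value in PythonLexer().get_tokens("print(1)\n"):
    #         print(ttype, repr(value))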

    def get_tokens_unprocessed(self, text):
        """
        This method should process the text and return an iterable of
        ``(index, tokentype, value)`` tuples where ``index`` is the starting
        position of the token within the input text.

        It must be overridden by subclasses. It is recommended to
        implement it as a generator to maximize effectiveness.
        """
        raise NotImplementedError


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
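

# A hedged illustration (not part of the original file): a template lexer
# built on DelegatingLexer usually just wires two existing lexers together.
# ``HtmlLexer`` and ``PhpLexer`` are real Pygments lexers, used here only as
# an example pairing; the class name is made up.
#
#     from pygments.lexers import HtmlLexer, PhpLexer
#
#     class HtmlPhpExampleLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(HtmlLexer, PhpLexer, **options)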


# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass
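

# A hedged illustration (not part of the original file): ``combined`` is used
# as the new-state part of a rule to push an anonymous state built from
# several named ones. The state names below are made up for the sketch.
#
#     (r"'", String, combined('stringescape', 'single-quoted')),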


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
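

# A hedged illustration (not part of the original file): in a ``tokens``
# table, ``bygroups`` maps each regex group to its own action, e.g.
#
#     (r'(def)(\s+)(\w+)', bygroups(Keyword, Whitespace, Name.Function)),
#
# ``Keyword`` and ``Name`` would come from ``pygments.token``; the rule
# itself is made up for this sketch, not taken from any particular lexer.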


class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
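

# A hedged illustration (not part of the original file): ``using`` lets a rule
# hand part of the matched text to another lexer, commonly in combination with
# ``bygroups``. ``PythonLexer`` and ``Comment`` are assumed imports from
# ``pygments.lexers`` / ``pygments.token``; the regex is made up for the sketch.
#
#     (r'(<%)(.*?)(%>)',
#      bygroups(Comment.Preproc, using(PythonLexer), Comment.Preproc)),
#
# Passing ``using(this, state='somestate')`` instead re-lexes the text with the
# current lexer, starting in the given state on top of 'root'.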


class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state
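

# A hedged illustration (not part of the original file): ``default`` is the
# usual way to leave a state when nothing else matched. The state name and
# rule below are made up; ``Operator`` comes from ``pygments.token``.
#
#     'optional-sign': [
#         (r'[+-]', Operator),
#         default('#pop'),     # no sign present: just pop back
#     ],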


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
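

# A hedged illustration (not part of the original file): ``words`` builds an
# optimized alternation, typically with a word-boundary suffix, e.g.
#
#     (words(('if', 'elif', 'else', 'while'), suffix=r'\b'), Keyword),
#
# The keyword list is only an example; real lexers supply their own.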


class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}
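
    # A hedged illustration (not part of the original file): a subclass's
    # ``tokens`` table might look roughly like this. State names, regexes and
    # the particular token types are made up for the sketch; ``Comment``,
    # ``String`` etc. would come from ``pygments.token``.
    #
    #     tokens = {
    #         'root': [
    #             (r'#.*?$', Comment.Single),
    #             (r'"', String, 'string'),        # push the 'string' state
    #             (r'\s+', Whitespace),
    #             (r'.', Text),
    #         ],
    #         'string': [
    #             (r'[^"\\]+', String),
    #             (r'\\.', String.Escape),
    #             (r'"', String, '#pop'),          # pop back to 'root'
    #         ],
    #     }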

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break


class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
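

# A hedged illustration (not part of the original file): callbacks used with
# ExtendedRegexLexer receive the LexerContext and must advance ``ctx.pos``
# themselves, roughly like the sketch below. The name and token choice are
# arbitrary; ``String.Heredoc`` would come from ``pygments.token``.
#
#     def heredoc_callback(lexer, match, ctx):
#         yield match.start(), String.Heredoc, match.group()
#         ctx.pos = match.end()   # the lexer does not do this for callbacks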


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
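

# A hedged illustration (not part of the original file): DelegatingLexer above
# shows the intended use inside a lexer, but in isolation the call shape is
# roughly
#
#     insertions = [(0, [(0, Generic.Prompt, '>>> ')])]   # insert a prompt at offset 0
#     merged = do_insertions(insertions,
#                            PythonLexer().get_tokens_unprocessed('1 + 1\n'))
#
# ``Generic.Prompt`` comes from ``pygments.token`` and ``PythonLexer`` is
# assumed to be imported; console-style lexers use a pattern like this.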


class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)