# (page-extraction artifacts removed: file-size note and line-number gutters)
# -*- coding: utf-8 -*-
"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

from __future__ import print_function

import re
import sys
import time

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
from pygments.regexopt import regex_opt

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words']

# BOM byte sequences mapped to the encoding they imply; checked in order in
# Lexer.get_tokens, so the longer/more specific BOMs come first.
_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

# Fallback ``analyse_text`` implementation: never claims a match.
_default_analyse = staticmethod(lambda x: 0.0)
  28. class LexerMeta(type):
  29. """
  30. This metaclass automagically converts ``analyse_text`` methods into
  31. static methods which always return float values.
  32. """
  33. def __new__(mcs, name, bases, d):
  34. if 'analyse_text' in d:
  35. d['analyse_text'] = make_analysator(d['analyse_text'])
  36. return type.__new__(mcs, name, bases, d)
@add_metaclass(LexerMeta)
class Lexer(object):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3
    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection. Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        # 'inencoding' (if given and truthy) overrides 'encoding'.
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.

        ``filter_`` may be a `Filter` instance or a filter name; in the
        latter case it is looked up and instantiated with ``options``.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return values was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted and applies registered filters.
        """
        if not isinstance(text, text_type):
            # Input is bytes: decode it according to self.encoding.
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                # strip a leading BOM character left over after decoding
                if text.startswith(u'\ufeff'):
                    text = text[len(u'\ufeff'):]
        else:
            if text.startswith(u'\ufeff'):
                text = text[len(u'\ufeff'):]

        # text now *is* a unicode string: normalize line endings,
        # then apply the strip/tab/newline options.
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            # Drop the index component of the unprocessed stream.
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError
  173. class DelegatingLexer(Lexer):
  174. """
  175. This lexer takes two lexer as arguments. A root lexer and
  176. a language lexer. First everything is scanned using the language
  177. lexer, afterwards all ``Other`` tokens are lexed using the root
  178. lexer.
  179. The lexers from the ``template`` lexer package use this base lexer.
  180. """
  181. def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
  182. self.root_lexer = _root_lexer(**options)
  183. self.language_lexer = _language_lexer(**options)
  184. self.needle = _needle
  185. Lexer.__init__(self, **options)
  186. def get_tokens_unprocessed(self, text):
  187. buffered = ''
  188. insertions = []
  189. lng_buffer = []
  190. for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
  191. if t is self.needle:
  192. if lng_buffer:
  193. insertions.append((len(buffered), lng_buffer))
  194. lng_buffer = []
  195. buffered += v
  196. else:
  197. lng_buffer.append((i, t, v))
  198. if lng_buffer:
  199. insertions.append((len(buffered), lng_buffer))
  200. return do_insertions(insertions,
  201. self.root_lexer.get_tokens_unprocessed(buffered))
  202. # ------------------------------------------------------------------------------
  203. # RegexLexer and ExtendedRegexLexer
  204. #
  205. class include(str): # pylint: disable=invalid-name
  206. """
  207. Indicates that a state should include rules from another state.
  208. """
  209. pass
  210. class _inherit(object):
  211. """
  212. Indicates the a state should inherit from its superclass.
  213. """
  214. def __repr__(self):
  215. return 'inherit'
  216. inherit = _inherit() # pylint: disable=invalid-name
  217. class combined(tuple): # pylint: disable=invalid-name
  218. """
  219. Indicates a state combined from multiple states.
  220. """
  221. def __new__(cls, *args):
  222. return tuple.__new__(cls, args)
  223. def __init__(self, *args):
  224. # tuple.__init__ doesn't do anything
  225. pass
  226. class _PseudoMatch(object):
  227. """
  228. A pseudo match object constructed from a string.
  229. """
  230. def __init__(self, start, text):
  231. self._text = text
  232. self._start = start
  233. def start(self, arg=None):
  234. return self._start
  235. def end(self, arg=None):
  236. return self._start + len(self._text)
  237. def group(self, arg=None):
  238. if arg:
  239. raise IndexError('No such group')
  240. return self._text
  241. def groups(self):
  242. return (self._text,)
  243. def groupdict(self):
  244. return {}
  245. def bygroups(*args):
  246. """
  247. Callback that yields multiple actions for each group in the match.
  248. """
  249. def callback(lexer, match, ctx=None):
  250. for i, action in enumerate(args):
  251. if action is None:
  252. continue
  253. elif type(action) is _TokenType:
  254. data = match.group(i + 1)
  255. if data:
  256. yield match.start(i + 1), action, data
  257. else:
  258. data = match.group(i + 1)
  259. if data is not None:
  260. if ctx:
  261. ctx.pos = match.start(i + 1)
  262. for item in action(lexer,
  263. _PseudoMatch(match.start(i + 1), data), ctx):
  264. if item:
  265. yield item
  266. if ctx:
  267. ctx.pos = match.end()
  268. return callback
  269. class _This(object):
  270. """
  271. Special singleton used for indicating the caller class.
  272. Used by ``using``.
  273. """
  274. this = _This()
def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    # Keyword args forwarded to get_tokens_unprocessed (only 'stack').
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            # Re-lex the matched text, shifting indices to absolute positions.
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            # NOTE(review): this mutates the shared ``kwargs`` closure dict on
            # every call, so options accumulate across calls — presumably
            # harmless since lexer.options is stable, but worth confirming.
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
  318. class default:
  319. """
  320. Indicates a state or state action (e.g. #pop) to apply.
  321. For example default('#pop') is equivalent to ('', Token, '#pop')
  322. Note that state tuples may be used as well.
  323. .. versionadded:: 2.0
  324. """
  325. def __init__(self, state):
  326. self.state = state
  327. class words(Future):
  328. """
  329. Indicates a list of literal words that is transformed into an optimized
  330. regex that matches any of the words.
  331. .. versionadded:: 2.0
  332. """
  333. def __init__(self, words, prefix='', suffix=''):
  334. self.words = words
  335. self.prefix = prefix
  336. self.suffix = suffix
  337. def get(self):
  338. return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            # e.g. a ``words`` object: resolve it to a concrete pattern.
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                # '#pop:n' pops n states at once; encoded as a negative int
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                # a default rule: matches the empty string, emits no token
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err))

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                # no state transition given
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in iteritems(toks):
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)
@add_metaclass(RegexLexerMeta)
class RegexLexer(Lexer):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the inital stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            # Try each rule of the current state in order; first match wins.
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            # action is a callback (e.g. bygroups/using)
                            for item in action(self, m):
                                yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    # end of text reached
                    break
  575. class LexerContext(object):
  576. """
  577. A helper object that holds lexer position data.
  578. """
  579. def __init__(self, text, pos, stack=None, end=None):
  580. self.text = text
  581. self.pos = pos
  582. self.end = end or len(text) # end=0 not supported ;-)
  583. self.stack = stack or ['root']
  584. def __repr__(self):
  585. return 'LexerContext(%r, %r, %r)' % (
  586. self.text, self.pos, self.stack)
  587. class ExtendedRegexLexer(RegexLexer):
  588. """
  589. A RegexLexer that uses a context object to store its state.
  590. """
  591. def get_tokens_unprocessed(self, text=None, context=None):
  592. """
  593. Split ``text`` into (tokentype, text) pairs.
  594. If ``context`` is given, use this lexer context instead.
  595. """
  596. tokendefs = self._tokens
  597. if not context:
  598. ctx = LexerContext(text, 0)
  599. statetokens = tokendefs['root']
  600. else:
  601. ctx = context
  602. statetokens = tokendefs[ctx.stack[-1]]
  603. text = ctx.text
  604. while 1:
  605. for rexmatch, action, new_state in statetokens:
  606. m = rexmatch(text, ctx.pos, ctx.end)
  607. if m:
  608. if action is not None:
  609. if type(action) is _TokenType:
  610. yield ctx.pos, action, m.group()
  611. ctx.pos = m.end()
  612. else:
  613. for item in action(self, m, ctx):
  614. yield item
  615. if not new_state:
  616. # altered the state stack?
  617. statetokens = tokendefs[ctx.stack[-1]]
  618. # CAUTION: callback must set ctx.pos!
  619. if new_state is not None:
  620. # state transition
  621. if isinstance(new_state, tuple):
  622. for state in new_state:
  623. if state == '#pop':
  624. if len(ctx.stack) > 1:
  625. ctx.stack.pop()
  626. elif state == '#push':
  627. ctx.stack.append(ctx.stack[-1])
  628. else:
  629. ctx.stack.append(state)
  630. elif isinstance(new_state, int):
  631. # see RegexLexer for why this check is made
  632. if abs(new_state) >= len(ctx.stack):
  633. del ctx.state[1:]
  634. else:
  635. del ctx.stack[new_state:]
  636. elif new_state == '#push':
  637. ctx.stack.append(ctx.stack[-1])
  638. else:
  639. assert False, "wrong state def: %r" % new_state
  640. statetokens = tokendefs[ctx.stack[-1]]
  641. break
  642. else:
  643. try:
  644. if ctx.pos >= ctx.end:
  645. break
  646. if text[ctx.pos] == '\n':
  647. # at EOL, reset state to "root"
  648. ctx.stack = ['root']
  649. statetokens = tokendefs['root']
  650. yield ctx.pos, Text, u'\n'
  651. ctx.pos += 1
  652. continue
  653. yield ctx.pos, Error, text[ctx.pos]
  654. ctx.pos += 1
  655. except IndexError:
  656. break
  657. def do_insertions(insertions, tokens):
  658. """
  659. Helper for lexers which must combine the results of several
  660. sublexers.
  661. ``insertions`` is a list of ``(index, itokens)`` pairs.
  662. Each ``itokens`` iterable should be inserted at position
  663. ``index`` into the token stream given by the ``tokens``
  664. argument.
  665. The result is a combined token stream.
  666. TODO: clean up the code here.
  667. """
  668. insertions = iter(insertions)
  669. try:
  670. index, itokens = next(insertions)
  671. except StopIteration:
  672. # no insertions
  673. for item in tokens:
  674. yield item
  675. return
  676. realpos = None
  677. insleft = True
  678. # iterate over the token stream where we want to insert
  679. # the tokens from the insertion list.
  680. for i, t, v in tokens:
  681. # first iteration. store the postition of first item
  682. if realpos is None:
  683. realpos = i
  684. oldi = 0
  685. while insleft and i + len(v) >= index:
  686. tmpval = v[oldi:index - i]
  687. yield realpos, t, tmpval
  688. realpos += len(tmpval)
  689. for it_index, it_token, it_value in itokens:
  690. yield realpos, it_token, it_value
  691. realpos += len(it_value)
  692. oldi = index - i
  693. try:
  694. index, itokens = next(insertions)
  695. except StopIteration:
  696. insleft = False
  697. break # not strictly necessary
  698. yield realpos, t, v[oldi:]
  699. realpos += len(v) - oldi
  700. # leftover tokens
  701. while insleft:
  702. # no normal tokens, set realpos to zero
  703. realpos = realpos or 0
  704. for p, t, v in itokens:
  705. yield realpos, t, v
  706. realpos += len(v)
  707. try:
  708. index, itokens = next(insertions)
  709. except StopIteration:
  710. insleft = False
  711. break # not strictly necessary
  712. class ProfilingRegexLexerMeta(RegexLexerMeta):
  713. """Metaclass for ProfilingRegexLexer, collects regex timing info."""
  714. def _process_regex(cls, regex, rflags, state):
  715. if isinstance(regex, words):
  716. rex = regex_opt(regex.words, prefix=regex.prefix,
  717. suffix=regex.suffix)
  718. else:
  719. rex = regex
  720. compiled = re.compile(rex, rflags)
  721. def match_func(text, pos, endpos=sys.maxsize):
  722. info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
  723. t0 = time.time()
  724. res = compiled.match(text, pos, endpos)
  725. t1 = time.time()
  726. info[0] += 1
  727. info[1] += t1 - t0
  728. return res
  729. return match_func
@add_metaclass(ProfilingRegexLexerMeta)
class ProfilingRegexLexer(RegexLexer):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    #: Stack of per-run profiling dicts, filled by the metaclass' match_func.
    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
            yield tok
        rawdata = self.__class__._prof_data.pop()
        # Build (state, regex, ncalls, tottime_ms, percall_ms) rows, sorted
        # by the configured column, descending.
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)