special.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. # -*- coding: utf-8 -*-
  2. """
  3. pygments.lexers.special
  4. ~~~~~~~~~~~~~~~~~~~~~~~
  5. Special lexers.
  6. :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
  7. :license: BSD, see LICENSE for details.
  8. """
  9. import re
  10. from pygments.lexer import Lexer
  11. from pygments.token import Token, Error, Text
  12. from pygments.util import get_choice_opt, text_type, BytesIO
  13. __all__ = ['TextLexer', 'RawTokenLexer']
  14. class TextLexer(Lexer):
  15. """
  16. "Null" lexer, doesn't highlight anything.
  17. """
  18. name = 'Text only'
  19. aliases = ['text']
  20. filenames = ['*.txt']
  21. mimetypes = ['text/plain']
  22. priority = 0.01
  23. def get_tokens_unprocessed(self, text):
  24. yield 0, Text, text
  25. def analyse_text(text):
  26. return TextLexer.priority
  27. _ttype_cache = {}
  28. line_re = re.compile(b'.*?\n')
  29. class RawTokenLexer(Lexer):
  30. """
  31. Recreate a token stream formatted with the `RawTokenFormatter`. This
  32. lexer raises exceptions during parsing if the token stream in the
  33. file is malformed.
  34. Additional options accepted:
  35. `compress`
  36. If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
  37. the given compression algorithm before lexing (default: ``""``).
  38. """
  39. name = 'Raw token data'
  40. aliases = ['raw']
  41. filenames = []
  42. mimetypes = ['application/x-pygments-tokens']
  43. def __init__(self, **options):
  44. self.compress = get_choice_opt(options, 'compress',
  45. ['', 'none', 'gz', 'bz2'], '')
  46. Lexer.__init__(self, **options)
  47. def get_tokens(self, text):
  48. if isinstance(text, text_type):
  49. # raw token stream never has any non-ASCII characters
  50. text = text.encode('ascii')
  51. if self.compress == 'gz':
  52. import gzip
  53. gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
  54. text = gzipfile.read()
  55. elif self.compress == 'bz2':
  56. import bz2
  57. text = bz2.decompress(text)
  58. # do not call Lexer.get_tokens() because we do not want Unicode
  59. # decoding to occur, and stripping is not optional.
  60. text = text.strip(b'\n') + b'\n'
  61. for i, t, v in self.get_tokens_unprocessed(text):
  62. yield t, v
  63. def get_tokens_unprocessed(self, text):
  64. length = 0
  65. for match in line_re.finditer(text):
  66. try:
  67. ttypestr, val = match.group().split(b'\t', 1)
  68. except ValueError:
  69. val = match.group().decode('ascii', 'replace')
  70. ttype = Error
  71. else:
  72. ttype = _ttype_cache.get(ttypestr)
  73. if not ttype:
  74. ttype = Token
  75. ttypes = ttypestr.split('.')[1:]
  76. for ttype_ in ttypes:
  77. if not ttype_ or not ttype_[0].isupper():
  78. raise ValueError('malformed token name')
  79. ttype = getattr(ttype, ttype_)
  80. _ttype_cache[ttypestr] = ttype
  81. val = val[2:-2].decode('unicode-escape')
  82. yield length, ttype, val
  83. length += len(val)