123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382 |
- # -*- coding: utf-8 -*-
- """
- pygments.lexers.textfmts
- ~~~~~~~~~~~~~~~~~~~~~~~~
- Lexers for various text formats.
- :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
- :license: BSD, see LICENSE for details.
- """
- import re
- from pygments.lexers import guess_lexer, get_lexer_by_name
- from pygments.lexer import RegexLexer, bygroups, default, do_insertions
- from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
- Number, Generic, Literal, Punctuation
- from pygments.util import ClassNotFound
- __all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
- 'NotmuchLexer']
class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.

    Every pattern of this lexer is compiled with ``re.VERBOSE``, so literal
    whitespace and ``#`` comments inside the regex strings are ignored by
    the regex engine (except inside character classes).
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE

    # Optional timestamp prefix shared by all line-oriented rules below.
    # The whole thing is one capturing group, so it always occupies the
    # first slot of the ``bygroups`` calls that follow.
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """

    tokens = {
        'root': [
            # log start/end
            (
                r'^\*\*\*\*(.*)\*\*\*\*$',
                Comment,
            ),
            # hack: a line holding nothing but "<nick>" is consumed here,
            # without entering the 'msg' state
            (
                "^" + timestamp + r'(\s*<[^>]*>\s*)$',
                bygroups(Comment.Preproc, Name.Tag),
            ),
            # normal msgs: the text after the nick is lexed in 'msg'
            (
                "^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
                bygroups(Comment.Preproc, Name.Tag),
                'msg',
            ),
            # /me msgs
            (
                "^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
                bygroups(Comment.Preproc, Keyword, Generic.Inserted),
            ),
            # join/part msgs
            (
                "^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                     # Nick + Space
                (.*?\n)                         # Rest of message """,
                bygroups(Comment.Preproc, Keyword, String, Comment),
            ),
            # anything else: take the whole line as plain text
            (
                r"^.*?\n",
                Text,
            ),
        ],
        'msg': [
            # Prefix: a word ending in ":" that is not followed by "//"
            # (presumably so that URL schemes are not highlighted)
            (
                r"\S+:(?!//)",
                Name.Attribute,
            ),
            # The remainder of the line ends the message
            (
                r".*\n",
                Text,
                '#pop',
            ),
        ],
    }
class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    All rules are anchored to whole lines, so each catalog line is
    classified by its leading marker.

    .. versionadded:: 0.9
    """

    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            # "#," comment lines
            (
                r'^#,\s.*?$',
                Keyword.Type,
            ),
            # "#:" comment lines
            (
                r'^#:\s.*?$',
                Keyword.Declaration,
            ),
            # every other comment flavour, including a bare "#"
            (
                r'^(#|#\.\s|#\|\s|#~\s|#\s).*$',
                Comment.Single,
            ),
            # quoted line that starts with a "Name:" style field
            (
                r'^(")([A-Za-z-]+:)(.*")$',
                bygroups(String, Name.Property, String),
            ),
            # any other quoted line
            (
                r'^".*"$',
                String,
            ),
            # keyword followed by a quoted string
            (
                r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
                bygroups(Name.Variable, Text, String),
            ),
            # indexed msgstr entry, e.g. msgstr[0] "..."
            (
                r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
                bygroups(Name.Variable, Number.Integer, Name.Variable,
                         Text, String),
            ),
        ]
    }
class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    The request or status line and the headers are tokenized by this lexer.
    The message body is handed to the lexer registered for the MIME type
    announced in the ``Content-Type`` header, when one exists.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    # ``.`` must match newlines so the 'content' rule swallows the whole body.
    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        """
        Tokenize one ``Name: value`` header line.

        As a side effect, the media type of a ``Content-Type`` header
        (without any ``;`` parameters) is stored in ``self.content_type``
        for later use by `content_callback`.
        """
        if match.group(1).lower() == 'content-type':
            # Keep only the media type, dropping parameters such as charset.
            self.content_type = match.group(5).split(';', 1)[0].strip()
        group_tokens = (Name.Attribute, Text, Operator, Text, Literal, Text)
        for group, token in enumerate(group_tokens, 1):
            yield match.start(group), token, match.group(group)

    def continuous_header_callback(self, match):
        """Tokenize a header continuation line (one starting with blanks)."""
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        """
        Tokenize the message body.

        Tries the lexer registered for the recorded content type, then, for
        ``type/foo+suffix`` types, the one for ``type/suffix``.  If neither
        exists (or no content type was seen) the body is emitted as a single
        `Text` token.
        """
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for mimetype in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(mimetype)
                except ClassNotFound:
                    continue
                # Sub-lexer indices are relative to the body; shift them so
                # they are positions in the whole input.
                for idx, token, value in lexer.get_tokens_unprocessed(content):
                    yield offset + idx, token, value
                return
        yield offset, Text, content

    tokens = {
        'root': [
            # Request line.  Besides 1.0/1.1 the version may be "2", "2.0"
            # or "3".
            (r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            # Status line.  The reason phrase is optional and may be empty:
            # "HTTP/1.1 200" and "HTTP/2 200" are both accepted.  bygroups
            # skips groups that did not participate in the match.
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})'
             r'(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number,
                      Text, Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            # The value uses ``*`` rather than ``+`` because a header may
            # legitimately have an empty value ("X-Foo:").
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            # Blank line: end of headers, the rest is the body.
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        """Report a match when the text starts like an HTTP request line."""
        return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /', 'HEAD /',
                                'OPTIONS /', 'TRACE /', 'PATCH /'))
class TodotxtLexer(RegexLexer):
    """
    Lexer for `Todo.txt <http://todotxt.com/>`_ todo list format.

    Each task is lexed from the 'root' state; the way the line starts picks
    either the 'complete' or the 'incomplete' state, which then consumes the
    remainder of the task up to the newline.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # --- Token aliases for Todo.txt concepts -------------------------------
    # Complete tasks are de-emphasized.
    CompleteTaskText = Operator
    # Incomplete tasks should look like plain text.
    IncompleteTaskText = Text
    # Priority gets the most emphasis, to indicate importance of tasks.
    Priority = Generic.Heading
    # Dates get the next most emphasis, because time is important.
    Date = Generic.Subheading
    # Project and context carry equal weight, in different colors.
    Project = Generic.Error
    Context = String
    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # --- Atomic regex building blocks --------------------------------------
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # --- Compound regexes, assembled from the atoms above ------------------
    complete_one_date_regex = r'(x )(%s)' % date_regex
    complete_two_date_regex = r'%s( )(%s)' % (complete_one_date_regex,
                                              date_regex)
    priority_date_regex = r'(%s)( )(%s)' % (priority_regex, date_regex)

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Entry points for complete tasks (two):
            #   1. "x" followed by two dates
            (
                complete_two_date_regex,
                bygroups(CompleteTaskText, Date, CompleteTaskText, Date),
                'complete',
            ),
            #   2. "x" followed by one date
            (
                complete_one_date_regex,
                bygroups(CompleteTaskText, Date),
                'complete',
            ),

            # Entry points for incomplete tasks (six):
            #   1. Priority plus date
            (
                priority_date_regex,
                bygroups(Priority, IncompleteTaskText, Date),
                'incomplete',
            ),
            #   2. Priority only
            (priority_regex, Priority, 'incomplete'),
            #   3. Leading date
            (date_regex, Date, 'incomplete'),
            #   4. Leading context
            (context_regex, Context, 'incomplete'),
            #   5. Leading project
            (project_regex, Project, 'incomplete'),
            #   6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Remainder of a complete task
        'complete': [
            # A newline ends the task and returns to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Any other non-whitespace text
            (r'\S+', CompleteTaskText),
            # Whitespace not already consumed by the newline rule above
            (r'\s+', CompleteTaskText),
        ],

        # Remainder of an incomplete task
        'incomplete': [
            # A newline ends the task and returns to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Any other non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Whitespace not already consumed by the newline rule above
            (r'\s+', IncompleteTaskText),
        ],
    }
class NotmuchLexer(RegexLexer):
    """
    For `Notmuch <https://notmuchmail.org/>`_ email text format.

    .. versionadded:: 2.5

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """

    name = 'Notmuch'
    aliases = ['notmuch']

    def _highlight_code(self, match):
        """
        Tokenize the text of a message part with a delegated lexer.

        The lexer named by the `body_lexer` option is used when set;
        otherwise one is guessed from the content.  If no lexer class can be
        found, the plain ``text`` lexer is used.

        The delegated lexer reports positions relative to the part's text,
        so each index is shifted by the start of the match to obtain a
        position in the complete input.
        """
        code = match.group(1)
        offset = match.start(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        for index, token, value in lexer.get_tokens_unprocessed(code):
            yield offset + index, token, value

    tokens = {
        'root': [
            # Opening marker of a message; its attributes follow on the line.
            (r'\fmessage{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)([^\s]+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            # Nothing more to read: fall back to the 'message' state.
            default('#pop'),
        ],
        'message': [
            (r'\fmessage}\n', Keyword, '#pop'),
            (r'\fheader{\n', Keyword, 'header'),
            (r'\fbody{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            # Free text followed by two parenthesized groups; presumably the
            # one-line message summary -- confirm against notmuch output.
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart{\n', Keyword, 'part'),
            (r'\f(part|attachment){\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            # Nothing more to read: fall back to the 'part' state.
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)}\n', Keyword, '#pop'),
            # Nested part: push another 'part' level, then read attributes.
            (r'\f(?:part|attachment){\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            # Everything up to (not including) the next closing marker is
            # handed to the body lexer.
            (r'(?s)(.*?(?=\f(?:part|attachment)}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        """Return 1.0 when the text starts with a message marker, else 0.0."""
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        """Store the `body_lexer` option, then run the base initialisation."""
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)
|