
# -*- coding: utf-8 -*-
"""
    pygments.lexers.textfmts
    ~~~~~~~~~~~~~~~~~~~~~~~~

    Lexers for various text formats.

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexers import guess_lexer, get_lexer_by_name
from pygments.lexer import RegexLexer, bygroups, default, do_insertions
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
    Number, Generic, Literal, Punctuation
from pygments.util import ClassNotFound

__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
           'NotmuchLexer']


class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$',
             bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
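
# A minimal usage sketch (comment only, not executed on import): feeding an
# irssi-style log line through IrcLogsLexer via the public Pygments API.
# The sample log text is made up for illustration.
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#
#   log = '[12:34] <alice> check out https://example.org\n'
#   print(highlight(log, IrcLogsLexer(), TerminalFormatter()))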


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text,
                      String)),
        ]
    }
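
# A minimal usage sketch (comment only): highlighting a tiny PO catalog.
# The catalog text is made up for illustration.
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#
#   catalog = 'msgid "Hello"\nmsgstr "Bonjour"\n'
#   print(highlight(catalog, GettextLexer(), TerminalFormatter()))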


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        # Remember the Content-Type value (without parameters such as
        # charset) so that content_callback can pick a matching body lexer.
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        # Continuation line of a folded header: indent, value, line ending.
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return

        yield offset, Text, content

    tokens = {
        'root': [
            (r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01])(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01])( +)(\d{3})( +)([^\r\n]+)(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number,
                      Text, Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]+)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /',
                                'HEAD /', 'OPTIONS /', 'TRACE /', 'PATCH /'))
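
# A minimal usage sketch (comment only): because content_callback looks up a
# lexer for the recorded Content-Type, the JSON response body below is
# re-highlighted with the JSON lexer. The session text is made up.
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#
#   session = ('HTTP/1.1 200 OK\r\n'
#              'Content-Type: application/json\r\n'
#              '\r\n'
#              '{"ok": true}')
#   print(highlight(session, HttpLexer(), TerminalFormatter()))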


class TodotxtLexer(RegexLexer):
    """
    Lexer for `Todo.txt <http://todotxt.com/>`_ todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types to Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
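
# A minimal usage sketch (comment only): one completed and one prioritized
# task, with +project and @context tags. The task text is made up.
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#
#   tasks = ('x 2019-01-02 2019-01-01 Ship release +pygments @home\n'
#            '(A) 2019-01-03 Review patch +pygments @work\n')
#   print(highlight(tasks, TodotxtLexer(), TerminalFormatter()))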


class NotmuchLexer(RegexLexer):
    """
    For `Notmuch <https://notmuchmail.org/>`_ email text format.

    .. versionadded:: 2.5

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the
        specified lexer, else guess it according to the body content
        (default: ``None``).
    """

    name = 'Notmuch'
    aliases = ['notmuch']

    def _highlight_code(self, match):
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        for item in lexer.get_tokens_unprocessed(code):
            yield item

    tokens = {
        'root': [
            (r'\fmessage{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)([^\s]+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage}\n', Keyword, '#pop'),
            (r'\fheader{\n', Keyword, 'header'),
            (r'\fbody{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart{\n', Keyword, 'part'),
            (r'\f(part|attachment){\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment){\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)
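
# A minimal usage sketch (comment only): forcing message bodies to be
# highlighted as diffs via the body_lexer option. Here notmuch_output is a
# placeholder for text produced by `notmuch show`.
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#
#   lexer = NotmuchLexer(body_lexer='diff')
#   print(highlight(notmuch_output, lexer, TerminalFormatter()))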