  1. """
  2. pygments.lexers.textfmts
  3. ~~~~~~~~~~~~~~~~~~~~~~~~
  4. Lexers for various text formats.
  5. :copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. from pygments.lexers import guess_lexer, get_lexer_by_name
  10. from pygments.lexer import RegexLexer, bygroups, default, include
  11. from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
  12. Number, Generic, Literal, Punctuation
  13. from pygments.util import ClassNotFound
  14. __all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
  15. 'NotmuchLexer', 'KernelLogLexer']
  16. class IrcLogsLexer(RegexLexer):
  17. """
  18. Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
  19. """
  20. name = 'IRC logs'
  21. aliases = ['irc']
  22. filenames = ['*.weechatlog']
  23. mimetypes = ['text/x-irclog']
  24. url = 'https://en.wikipedia.org/wiki/Internet_Relay_Chat'
  25. version_added = ''
  26. flags = re.VERBOSE | re.MULTILINE
  27. timestamp = r"""
  28. (
  29. # irssi / xchat and others
  30. (?: \[|\()? # Opening bracket or paren for the timestamp
  31. (?: # Timestamp
  32. (?: (?:\d{1,4} [-/])* # Date as - or /-separated groups of digits
  33. (?:\d{1,4})
  34. [T ])? # Date/time separator: T or space
  35. (?: \d?\d [:.])* # Time as :/.-separated groups of 1 or 2 digits
  36. (?: \d?\d)
  37. )
  38. (?: \]|\))?\s+ # Closing bracket or paren for the timestamp
  39. |
  40. # weechat
  41. \d{4}\s\w{3}\s\d{2}\s # Date
  42. \d{2}:\d{2}:\d{2}\s+ # Time + Whitespace
  43. |
  44. # xchat
  45. \w{3}\s\d{2}\s # Date
  46. \d{2}:\d{2}:\d{2}\s+ # Time + Whitespace
  47. )?
  48. """
  49. tokens = {
  50. 'root': [
  51. # log start/end
  52. (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
  53. # hack
  54. ("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
  55. # normal msgs
  56. ("^" + timestamp + r"""
  57. (\s*<.*?>\s*) # Nick """,
  58. bygroups(Comment.Preproc, Name.Tag), 'msg'),
  59. # /me msgs
  60. ("^" + timestamp + r"""
  61. (\s*[*]\s+) # Star
  62. (\S+\s+.*?\n) # Nick + rest of message """,
  63. bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
  64. # join/part msgs
  65. ("^" + timestamp + r"""
  66. (\s*(?:\*{3}|<?-[!@=P]?->?)\s*) # Star(s) or symbols
  67. (\S+\s+) # Nick + Space
  68. (.*?\n) # Rest of message """,
  69. bygroups(Comment.Preproc, Keyword, String, Comment)),
  70. (r"^.*?\n", Text),
  71. ],
  72. 'msg': [
  73. (r"\S+:(?!//)", Name.Attribute), # Prefix
  74. (r".*\n", Text, '#pop'),
  75. ],
  76. }
  77. class GettextLexer(RegexLexer):
  78. """
  79. Lexer for Gettext catalog files.
  80. """
  81. name = 'Gettext Catalog'
  82. aliases = ['pot', 'po']
  83. filenames = ['*.pot', '*.po']
  84. mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']
  85. url = 'https://www.gnu.org/software/gettext'
  86. version_added = '0.9'
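
    # '#,' lines carry flags and '#:' lines carry source references; the other
    # '#'-prefixed forms (translator comments, extracted comments, previous
    # strings, obsolete entries) are all treated as plain comments.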
    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.
    """

    name = 'HTTP'
    aliases = ['http']
    url = 'https://httpwg.org/specs'
    version_added = '1.5'

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)
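
    # Tokenize a "Name: value" header line; for Content-Type, remember the MIME
    # type (minus any parameters) so content_callback can pick a body lexer.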
    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)
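
    # Tokenize folded header lines, i.e. continuation lines that start with
    # whitespace.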
    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)
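
    # Highlight the message body with a lexer registered for the remembered
    # Content-Type (or its base type for "+xml"-style suffixes); fall back to
    # plain text when no suitable lexer is found.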
    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return

        yield offset, Text, content

    tokens = {
        'root': [
            (r'([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }
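
    # Score the text as HTTP if it starts with a request line such as
    # "GET /path HTTP/1.1" or a status line such as "HTTP/1.1 200 OK".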
    def analyse_text(text):
        return any(
            re.search(pattern, text) is not None
            for pattern in (
                r'^([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
                r'^(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
            )
        )


class TodotxtLexer(RegexLexer):
    """
    Lexer for Todo.txt todo list format.
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    version_added = '2.0'
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types of Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }


class NotmuchLexer(RegexLexer):
    """
    For Notmuch email text format.

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """
    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']
    version_added = '2.5'
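
    # Highlight a message body: use ``body_lexer`` when the option is set,
    # otherwise guess a lexer from the content, falling back to plain text.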
    def _highlight_code(self, match):
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        yield from lexer.get_tokens_unprocessed(code)

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)


class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.
    """

    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']
    url = 'https://fr.wikipedia.org/wiki/Dmesg'
    version_added = '2.6'

    tokens = {
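        # Lines are expected to carry a "<source>:<level> : " prefix ahead of
        # the "[timestamp]"; the level selects the state used for the rest of
        # the line.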
        'root': [
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            (r'^(?=\[)', Text, 'unknown'),
        ],
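        # Lines reaching 'unknown' carry no explicit level, so classify them by
        # keywords found in the message text.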
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            (r'\[[0-9. ]+\] ', Number),
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }