  1. """
  2. pygments.lexers.markup
  3. ~~~~~~~~~~~~~~~~~~~~~~
  4. Lexers for non-HTML markup languages.
  5. :copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. from pygments.lexers.html import XmlLexer
  10. from pygments.lexers.javascript import JavascriptLexer
  11. from pygments.lexers.css import CssLexer
  12. from pygments.lexers.lilypond import LilyPondLexer
  13. from pygments.lexers.data import JsonLexer
  14. from pygments.lexer import RegexLexer, DelegatingLexer, include, bygroups, \
  15. using, this, do_insertions, default, words
  16. from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
  17. Number, Punctuation, Generic, Other, Whitespace, Literal
  18. from pygments.util import get_bool_opt, ClassNotFound
  19. __all__ = ['BBCodeLexer', 'MoinWikiLexer', 'RstLexer', 'TexLexer', 'GroffLexer',
  20. 'MozPreprocHashLexer', 'MozPreprocPercentLexer',
  21. 'MozPreprocXulLexer', 'MozPreprocJavascriptLexer',
  22. 'MozPreprocCssLexer', 'MarkdownLexer', 'OrgLexer', 'TiddlyWiki5Lexer',
  23. 'WikitextLexer']
  24. class BBCodeLexer(RegexLexer):
  25. """
  26. A lexer that highlights BBCode(-like) syntax.
  27. """
  28. name = 'BBCode'
  29. aliases = ['bbcode']
  30. mimetypes = ['text/x-bbcode']
  31. url = 'https://www.bbcode.org/'
  32. version_added = '0.6'
  33. tokens = {
  34. 'root': [
  35. (r'[^[]+', Text),
  36. # tag/end tag begin
  37. (r'\[/?\w+', Keyword, 'tag'),
  38. # stray bracket
  39. (r'\[', Text),
  40. ],
  41. 'tag': [
  42. (r'\s+', Text),
  43. # attribute with value
  44. (r'(\w+)(=)("?[^\s"\]]+"?)',
  45. bygroups(Name.Attribute, Operator, String)),
  46. # tag argument (a la [color=green])
  47. (r'(=)("?[^\s"\]]+"?)',
  48. bygroups(Operator, String)),
  49. # tag end
  50. (r'\]', Keyword, '#pop'),
  51. ],
  52. }
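    # A minimal usage sketch (assumption: called from client code, not from
    # within this module):
    #
    #   from pygments import highlight
    #   from pygments.formatters import TerminalFormatter
    #   print(highlight('[b]bold[/b]', BBCodeLexer(), TerminalFormatter()))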

class MoinWikiLexer(RegexLexer):
    """
    For MoinMoin (and Trac) Wiki markup.
    """

    name = 'MoinMoin/Trac Wiki markup'
    aliases = ['trac-wiki', 'moin']
    filenames = []
    mimetypes = ['text/x-trac-wiki']
    url = 'https://moinmo.in'
    version_added = '0.7'

    flags = re.MULTILINE | re.IGNORECASE

    tokens = {
        'root': [
            (r'^#.*$', Comment),
            (r'(!)(\S+)', bygroups(Keyword, Text)),  # Ignore-next
            # Titles
            (r'^(=+)([^=]+)(=+)(\s*#.+)?$',
             bygroups(Generic.Heading, using(this), Generic.Heading, String)),
            # Literal code blocks, with optional shebang
            (r'(\{\{\{)(\n#!.+)?', bygroups(Name.Builtin, Name.Namespace), 'codeblock'),
            (r'(\'\'\'?|\|\||`|__|~~|\^|,,|::)', Comment),  # Formatting
            # Lists
            (r'^( +)([.*-])( )', bygroups(Text, Name.Builtin, Text)),
            (r'^( +)([a-z]{1,5}\.)( )', bygroups(Text, Name.Builtin, Text)),
            # Other Formatting
            (r'\[\[\w+.*?\]\]', Keyword),  # Macro
            (r'(\[[^\s\]]+)(\s+[^\]]+?)?(\])',
             bygroups(Keyword, String, Keyword)),  # Link
            (r'^----+$', Keyword),  # Horizontal rules
            (r'[^\n\'\[{!_~^,|]+', Text),
            (r'\n', Text),
            (r'.', Text),
        ],
        'codeblock': [
            (r'\}\}\}', Name.Builtin, '#pop'),
            # these blocks are allowed to be nested in Trac, but not MoinMoin
            (r'\{\{\{', Text, '#push'),
            (r'[^{}]+', Comment.Preproc),  # slurp boring text
            (r'.', Comment.Preproc),  # allow loose { or }
        ],
    }


class RstLexer(RegexLexer):
    """
    For reStructuredText markup.

    Additional options accepted:

    `handlecodeblocks`
        Highlight the contents of ``.. sourcecode:: language``,
        ``.. code:: language`` and ``.. code-block:: language``
        directives with a lexer for the given language (default:
        ``True``).

        .. versionadded:: 0.8
    """
    name = 'reStructuredText'
    url = 'https://docutils.sourceforge.io/rst.html'
    aliases = ['restructuredtext', 'rst', 'rest']
    filenames = ['*.rst', '*.rest']
    mimetypes = ["text/x-rst", "text/prs.fallenstein.rst"]
    version_added = '0.7'
    flags = re.MULTILINE

    def _handle_sourcecode(self, match):
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), Punctuation, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator.Word, match.group(3)
        yield match.start(4), Punctuation, match.group(4)
        yield match.start(5), Text, match.group(5)
        yield match.start(6), Keyword, match.group(6)
        yield match.start(7), Text, match.group(7)

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group(6).strip())
            except ClassNotFound:
                pass
        indention = match.group(8)
        indention_size = len(indention)
        code = (indention + match.group(9) + match.group(10) + match.group(11))

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(8), String, code
            return

        # highlight the lines with the lexer.
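        # each `ins` entry pairs an offset into the dedented code with the
        # Text tokens for the stripped indentation; do_insertions() splices
        # them back into the delegated lexer's token stream at those offsets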
        ins = []
        codelines = code.splitlines(True)
        code = ''
        for line in codelines:
            if len(line) > indention_size:
                ins.append((len(code), [(0, Text, line[:indention_size])]))
                code += line[indention_size:]
            else:
                code += line
        yield from do_insertions(ins, lexer.get_tokens_unprocessed(code))

    # from docutils.parsers.rst.states
    closers = '\'")]}>\u2019\u201d\xbb!?'
    unicode_delimiters = '\u2010\u2011\u2012\u2013\u2014\u00a0'
    end_string_suffix = (rf'((?=$)|(?=[-/:.,; \n\x00{re.escape(unicode_delimiters)}{re.escape(closers)}]))')

    tokens = {
        'root': [
            # Heading with overline
            (r'^(=+|-+|`+|:+|\.+|\'+|"+|~+|\^+|_+|\*+|\++|#+)([ \t]*\n)'
             r'(.+)(\n)(\1)(\n)',
             bygroups(Generic.Heading, Text, Generic.Heading,
                      Text, Generic.Heading, Text)),
            # Plain heading
            (r'^(\S.*)(\n)(={3,}|-{3,}|`{3,}|:{3,}|\.{3,}|\'{3,}|"{3,}|'
             r'~{3,}|\^{3,}|_{3,}|\*{3,}|\+{3,}|#{3,})(\n)',
             bygroups(Generic.Heading, Text, Generic.Heading, Text)),
            # Bulleted lists
            (r'^(\s*)([-*+])( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Numbered lists
            (r'^(\s*)([0-9#ivxlcmIVXLCM]+\.)( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            (r'^(\s*)(\(?[0-9#ivxlcmIVXLCM]+\))( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Numbered, but keep words at BOL from becoming lists
            (r'^(\s*)([A-Z]+\.)( .+\n(?:\1 .+\n)+)',
             bygroups(Text, Number, using(this, state='inline'))),
            (r'^(\s*)(\(?[A-Za-z]+\))( .+\n(?:\1 .+\n)+)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Line blocks
            (r'^(\s*)(\|)( .+\n(?:\| .+\n)*)',
             bygroups(Text, Operator, using(this, state='inline'))),
            # Sourcecode directives
            (r'^( *\.\.)(\s*)((?:source)?code(?:-block)?)(::)([ \t]*)([^\n]+)'
             r'(\n[ \t]*\n)([ \t]+)(.*)(\n)((?:(?:\8.*)?\n)+)',
             _handle_sourcecode),
            # A directive
            (r'^( *\.\.)(\s*)([\w:-]+?)(::)(?:([ \t]*)(.*))',
             bygroups(Punctuation, Text, Operator.Word, Punctuation, Text,
                      using(this, state='inline'))),
            # A reference target
            (r'^( *\.\.)(\s*)(_(?:[^:\\]|\\.)+:)(.*?)$',
             bygroups(Punctuation, Text, Name.Tag, using(this, state='inline'))),
            # A footnote/citation target
            (r'^( *\.\.)(\s*)(\[.+\])(.*?)$',
             bygroups(Punctuation, Text, Name.Tag, using(this, state='inline'))),
            # A substitution def
            (r'^( *\.\.)(\s*)(\|.+\|)(\s*)([\w:-]+?)(::)(?:([ \t]*)(.*))',
             bygroups(Punctuation, Text, Name.Tag, Text, Operator.Word,
                      Punctuation, Text, using(this, state='inline'))),
            # Comments
            (r'^ *\.\..*(\n( +.*\n|\n)+)?', Comment),
            # Field list marker
            (r'^( *)(:(?:\\\\|\\:|[^:\n])+:(?=\s))([ \t]*)',
             bygroups(Text, Name.Class, Text)),
            # Definition list
            (r'^(\S.*(?<!::)\n)((?:(?: +.*)\n)+)',
             bygroups(using(this, state='inline'), using(this, state='inline'))),
            # Code blocks
            (r'(::)(\n[ \t]*\n)([ \t]+)(.*)(\n)((?:(?:\3.*)?\n)+)',
             bygroups(String.Escape, Text, String, String, Text, String)),
            include('inline'),
        ],
        'inline': [
            (r'\\.', Text),  # escape
            (r'``', String, 'literal'),  # code
            (r'(`.+?)(<.+?>)(`__?)',  # reference with inline target
             bygroups(String, String.Interpol, String)),
            (r'`.+?`__?', String),  # reference
            (r'(`.+?`)(:[a-zA-Z0-9:-]+?:)?',
             bygroups(Name.Variable, Name.Attribute)),  # role
            (r'(:[a-zA-Z0-9:-]+?:)(`.+?`)',
             bygroups(Name.Attribute, Name.Variable)),  # role (content first)
            (r'\*\*.+?\*\*', Generic.Strong),  # Strong emphasis
            (r'\*.+?\*', Generic.Emph),  # Emphasis
            (r'\[.*?\]_', String),  # Footnote or citation
            (r'<.+?>', Name.Tag),  # Hyperlink
            (r'[^\\\n\[*`:]+', Text),
            (r'.', Text),
        ],
        'literal': [
            (r'[^`]+', String),
            (r'``' + end_string_suffix, String, '#pop'),
            (r'`', String),
        ]
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)

    def analyse_text(text):
        if text[:2] == '..' and text[2:3] != '.':
            return 0.3
        p1 = text.find("\n")
        p2 = text.find("\n", p1 + 1)
        if (p2 > -1 and                     # has two lines
                p1 * 2 + 1 == p2 and        # they are the same length
                text[p1+1] in '-=' and      # the next line both starts and ends with
                text[p1+1] == text[p2-1]):  # ...a sufficiently high header
            return 0.5


class TexLexer(RegexLexer):
    """
    Lexer for the TeX and LaTeX typesetting languages.
    """

    name = 'TeX'
    aliases = ['tex', 'latex']
    filenames = ['*.tex', '*.aux', '*.toc']
    mimetypes = ['text/x-tex', 'text/x-latex']
    url = 'https://tug.org'
    version_added = ''

    tokens = {
        'general': [
            (r'%.*?\n', Comment),
            (r'[{}]', Name.Builtin),
            (r'[&_^]', Name.Builtin),
        ],
        'root': [
            (r'\\\[', String.Backtick, 'displaymath'),
            (r'\\\(', String, 'inlinemath'),
            (r'\$\$', String.Backtick, 'displaymath'),
            (r'\$', String, 'inlinemath'),
            (r'\\([a-zA-Z@_:]+|\S?)', Keyword, 'command'),
            (r'\\$', Keyword),
            include('general'),
            (r'[^\\$%&_^{}]+', Text),
        ],
        'math': [
            (r'\\([a-zA-Z]+|\S?)', Name.Variable),
            include('general'),
            (r'[0-9]+', Number),
            (r'[-=!+*/()\[\]]', Operator),
            (r'[^=!+*/()\[\]\\$%&_^{}0-9-]+', Name.Builtin),
        ],
        'inlinemath': [
            (r'\\\)', String, '#pop'),
            (r'\$', String, '#pop'),
            include('math'),
        ],
        'displaymath': [
            (r'\\\]', String, '#pop'),
            (r'\$\$', String, '#pop'),
            (r'\$', Name.Builtin),
            include('math'),
        ],
        'command': [
            (r'\[.*?\]', Name.Attribute),
            (r'\*', Keyword),
            default('#pop'),
        ],
    }

    def analyse_text(text):
        for start in ("\\documentclass", "\\input", "\\documentstyle",
                      "\\relax"):
            if text[:len(start)] == start:
                return True


class GroffLexer(RegexLexer):
    """
    Lexer for the (g)roff typesetting language, supporting groff
    extensions. Mainly useful for highlighting manpage sources.
    """

    name = 'Groff'
    aliases = ['groff', 'nroff', 'man']
    filenames = ['*.[1-9]', '*.man', '*.1p', '*.3pm']
    mimetypes = ['application/x-troff', 'text/troff']
    url = 'https://www.gnu.org/software/groff'
    version_added = '0.6'

    tokens = {
        'root': [
            (r'(\.)(\w+)', bygroups(Text, Keyword), 'request'),
            (r'\.', Punctuation, 'request'),
            # Regular characters, slurp till we find a backslash or newline
            (r'[^\\\n]+', Text, 'textline'),
            default('textline'),
        ],
        'textline': [
            include('escapes'),
            (r'[^\\\n]+', Text),
            (r'\n', Text, '#pop'),
        ],
        'escapes': [
            # groff has many ways to write escapes.
            (r'\\"[^\n]*', Comment),
            (r'\\[fn]\w', String.Escape),
            (r'\\\(.{2}', String.Escape),
            (r'\\.\[.*\]', String.Escape),
            (r'\\.', String.Escape),
            (r'\\\n', Text, 'request'),
        ],
        'request': [
            (r'\n', Text, '#pop'),
            include('escapes'),
            (r'"[^\n"]+"', String.Double),
            (r'\d+', Number),
            (r'\S+', String),
            (r'\s+', Text),
        ],
    }

    def analyse_text(text):
        if text[:1] != '.':
            return False
        if text[:3] == '.\\"':
            return True
        if text[:4] == '.TH ':
            return True
        if text[1:3].isalnum() and text[3].isspace():
            return 0.9


class MozPreprocHashLexer(RegexLexer):
    """
    Lexer for Mozilla Preprocessor files (with '#' as the marker).

    Other data is left untouched.
    """
    name = 'mozhashpreproc'
    aliases = [name]
    filenames = []
    mimetypes = []
    url = 'https://firefox-source-docs.mozilla.org/build/buildsystem/preprocessor.html'
    version_added = '2.0'

    tokens = {
        'root': [
            (r'^#', Comment.Preproc, ('expr', 'exprstart')),
            (r'.+', Other),
        ],
        'exprstart': [
            (r'(literal)(.*)', bygroups(Comment.Preproc, Text), '#pop:2'),
            (words((
                'define', 'undef', 'if', 'ifdef', 'ifndef', 'else', 'elif',
                'elifdef', 'elifndef', 'endif', 'expand', 'filter', 'unfilter',
                'include', 'includesubst', 'error')),
             Comment.Preproc, '#pop'),
        ],
        'expr': [
            (words(('!', '!=', '==', '&&', '||')), Operator),
            (r'(defined)(\()', bygroups(Keyword, Punctuation)),
            (r'\)', Punctuation),
            (r'[0-9]+', Number.Decimal),
            (r'__\w+?__', Name.Variable),
            (r'@\w+?@', Name.Class),
            (r'\w+', Name),
            (r'\n', Text, '#pop'),
            (r'\s+', Text),
            (r'\S', Punctuation),
        ],
    }


class MozPreprocPercentLexer(MozPreprocHashLexer):
    """
    Lexer for Mozilla Preprocessor files (with '%' as the marker).

    Other data is left untouched.
    """

    name = 'mozpercentpreproc'
    aliases = [name]
    filenames = []
    mimetypes = []
    url = 'https://firefox-source-docs.mozilla.org/build/buildsystem/preprocessor.html'
    version_added = '2.0'

    tokens = {
        'root': [
            (r'^%', Comment.Preproc, ('expr', 'exprstart')),
            (r'.+', Other),
        ],
    }


class MozPreprocXulLexer(DelegatingLexer):
    """
    Subclass of the `MozPreprocHashLexer` that highlights unlexed data with the
    `XmlLexer`.
    """

    name = "XUL+mozpreproc"
    aliases = ['xul+mozpreproc']
    filenames = ['*.xul.in']
    mimetypes = []
    url = 'https://firefox-source-docs.mozilla.org/build/buildsystem/preprocessor.html'
    version_added = '2.0'

    def __init__(self, **options):
        super().__init__(XmlLexer, MozPreprocHashLexer, **options)
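    # DelegatingLexer scans the input with the second lexer first
    # (MozPreprocHashLexer); everything it emits as Other, i.e. the
    # non-preprocessor lines, is then re-lexed with the first (XmlLexer).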

class MozPreprocJavascriptLexer(DelegatingLexer):
    """
    Subclass of the `MozPreprocHashLexer` that highlights unlexed data with the
    `JavascriptLexer`.
    """

    name = "Javascript+mozpreproc"
    aliases = ['javascript+mozpreproc']
    filenames = ['*.js.in']
    mimetypes = []
    url = 'https://firefox-source-docs.mozilla.org/build/buildsystem/preprocessor.html'
    version_added = '2.0'

    def __init__(self, **options):
        super().__init__(JavascriptLexer, MozPreprocHashLexer, **options)


class MozPreprocCssLexer(DelegatingLexer):
    """
    Subclass of the `MozPreprocHashLexer` that highlights unlexed data with the
    `CssLexer`.
    """

    name = "CSS+mozpreproc"
    aliases = ['css+mozpreproc']
    filenames = ['*.css.in']
    mimetypes = []
    url = 'https://firefox-source-docs.mozilla.org/build/buildsystem/preprocessor.html'
    version_added = '2.0'

    def __init__(self, **options):
        super().__init__(CssLexer, MozPreprocPercentLexer, **options)


class MarkdownLexer(RegexLexer):
    """
    For Markdown markup.
    """

    name = 'Markdown'
    url = 'https://daringfireball.net/projects/markdown/'
    aliases = ['markdown', 'md']
    filenames = ['*.md', '*.markdown']
    mimetypes = ["text/x-markdown"]
    version_added = '2.2'
    flags = re.MULTILINE

    def _handle_codeblock(self, match):
        from pygments.lexers import get_lexer_by_name

        yield match.start('initial'), String.Backtick, match.group('initial')
        yield match.start('lang'), String.Backtick, match.group('lang')
        if match.group('afterlang') is not None:
            yield match.start('whitespace'), Whitespace, match.group('whitespace')
            yield match.start('extra'), Text, match.group('extra')
        yield match.start('newline'), Whitespace, match.group('newline')

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group('lang').strip())
            except ClassNotFound:
                pass
        code = match.group('code')
        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start('code'), String, code
        else:
            # FIXME: aren't the offsets wrong?
            yield from do_insertions([], lexer.get_tokens_unprocessed(code))
        yield match.start('terminator'), String.Backtick, match.group('terminator')

    tokens = {
        'root': [
            # heading with '#' prefix (atx-style)
            (r'(^#[^#].+)(\n)', bygroups(Generic.Heading, Text)),
            # subheading with '#' prefix (atx-style)
            (r'(^#{2,6}[^#].+)(\n)', bygroups(Generic.Subheading, Text)),
            # heading with '=' underlines (Setext-style)
            (r'^(.+)(\n)(=+)(\n)', bygroups(Generic.Heading, Text, Generic.Heading, Text)),
            # subheading with '-' underlines (Setext-style)
            (r'^(.+)(\n)(-+)(\n)', bygroups(Generic.Subheading, Text, Generic.Subheading, Text)),
            # task list
            (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
             bygroups(Whitespace, Keyword, Keyword, using(this, state='inline'))),
            # bulleted list
            (r'^(\s*)([*-])(\s)(.+\n)',
             bygroups(Whitespace, Keyword, Whitespace, using(this, state='inline'))),
            # numbered list
            (r'^(\s*)([0-9]+\.)( .+\n)',
             bygroups(Whitespace, Keyword, using(this, state='inline'))),
            # quote
            (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
            # code block fenced by 3 backticks
            (r'^(\s*```\n[\w\W]*?^\s*```$\n)', String.Backtick),
            # code block with language
            # Some tools include extra stuff after the language name, just
            # highlight that as text. For example: https://docs.enola.dev/use/execmd
            (r'''(?x)
              ^(?P<initial>\s*```)
              (?P<lang>[\w\-]+)
              (?P<afterlang>
              (?P<whitespace>[^\S\n]+)
              (?P<extra>.*))?
              (?P<newline>\n)
              (?P<code>(.|\n)*?)
              (?P<terminator>^\s*```$\n)
              ''',
             _handle_codeblock),
            include('inline'),
        ],
        'inline': [
            # escape
            (r'\\.', Text),
            # inline code
            (r'([^`]?)(`[^`\n]+`)', bygroups(Text, String.Backtick)),
            # warning: the following rules eat outer tags.
            # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
            # bold fenced by '**'
            (r'([^\*]?)(\*\*[^* \n][^*\n]*\*\*)', bygroups(Text, Generic.Strong)),
            # bold fenced by '__'
            (r'([^_]?)(__[^_ \n][^_\n]*__)', bygroups(Text, Generic.Strong)),
            # italics fenced by '*'
            (r'([^\*]?)(\*[^* \n][^*\n]*\*)', bygroups(Text, Generic.Emph)),
            # italics fenced by '_'
            (r'([^_]?)(_[^_ \n][^_\n]*_)', bygroups(Text, Generic.Emph)),
            # strikethrough
            (r'([^~]?)(~~[^~ \n][^~\n]*~~)', bygroups(Text, Generic.Deleted)),
            # mentions and topics (twitter and github stuff)
            (r'[@#][\w/:]+', Name.Entity),
            # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
            (r'(!?\[)([^]]+)(\])(\()([^)]+)(\))',
             bygroups(Text, Name.Tag, Text, Text, Name.Attribute, Text)),
            # reference-style links, e.g.:
            # [an example][id]
            # [id]: http://example.com/
            (r'(\[)([^]]+)(\])(\[)([^]]*)(\])',
             bygroups(Text, Name.Tag, Text, Text, Name.Label, Text)),
            (r'^(\s*\[)([^]]*)(\]:\s*)(.+)',
             bygroups(Text, Name.Label, Text, Name.Attribute)),
            # general text, must come last!
            (r'[^\\\s]+', Text),
            (r'.', Text),
        ],
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)
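    # Sketch of the fenced-code delegation above (assumption: client-side call):
    #
    #   from pygments import highlight
    #   from pygments.formatters import HtmlFormatter
    #   md = "```python\nprint('hi')\n```\n"
    #   highlight(md, MarkdownLexer(), HtmlFormatter())  # fence body lexed as Python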

class OrgLexer(RegexLexer):
    """
    For Org Mode markup.
    """

    name = 'Org Mode'
    url = 'https://orgmode.org'
    aliases = ['org', 'orgmode', 'org-mode']
    filenames = ['*.org']
    mimetypes = ["text/org"]
    version_added = '2.18'

    def _inline(start, end):
        return rf'(?<!\w){start}(.|\n(?!\n))+?{end}(?!\w)'
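    # e.g. _inline(r'\*', r'\*+') builds the pattern for *bold* spans below:
    # the body may cross single newlines but not a blank line, and the
    # delimiters must not butt up against word characters.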
    tokens = {
        'root': [
            (r'^# .*', Comment.Single),
            # Headings
            (r'^(\* )(COMMENT)( .*)',
             bygroups(Generic.Heading, Comment.Preproc, Generic.Heading)),
            (r'^(\*\*+ )(COMMENT)( .*)',
             bygroups(Generic.Subheading, Comment.Preproc, Generic.Subheading)),
            (r'^(\* )(DONE)( .*)',
             bygroups(Generic.Heading, Generic.Deleted, Generic.Heading)),
            (r'^(\*\*+ )(DONE)( .*)',
             bygroups(Generic.Subheading, Generic.Deleted, Generic.Subheading)),
            (r'^(\* )(TODO)( .*)',
             bygroups(Generic.Heading, Generic.Error, Generic.Heading)),
            (r'^(\*\*+ )(TODO)( .*)',
             bygroups(Generic.Subheading, Generic.Error, Generic.Subheading)),
            (r'^(\* .+?)( :[a-zA-Z0-9_@:]+:)?$', bygroups(Generic.Heading, Generic.Emph)),
            (r'^(\*\*+ .+?)( :[a-zA-Z0-9_@:]+:)?$', bygroups(Generic.Subheading, Generic.Emph)),
            # Unordered lists items, including TODO items and description items
            (r'^(?:( *)([+-] )|( +)(\* ))(\[[ X-]\])?(.+ ::)?',
             bygroups(Whitespace, Keyword, Whitespace, Keyword, Generic.Prompt, Name.Label)),
            # Ordered list items
            (r'^( *)([0-9]+[.)])( \[@[0-9]+\])?', bygroups(Whitespace, Keyword, Generic.Emph)),
            # Dynamic blocks
            (r'(?i)^( *#\+begin: *)((?:.|\n)*?)(^ *#\+end: *$)',
             bygroups(Operator.Word, using(this), Operator.Word)),
            # Comment blocks
            (r'(?i)^( *#\+begin_comment *\n)((?:.|\n)*?)(^ *#\+end_comment *$)',
             bygroups(Operator.Word, Comment.Multiline, Operator.Word)),
            # Source code blocks
            # TODO: language-dependent syntax highlighting (see Markdown lexer)
            (r'(?i)^( *#\+begin_src .*)((?:.|\n)*?)(^ *#\+end_src *$)',
             bygroups(Operator.Word, Text, Operator.Word)),
            # Other blocks
            (r'(?i)^( *#\+begin_\w+)( *\n)((?:.|\n)*?)(^ *#\+end_\w+)( *$)',
             bygroups(Operator.Word, Whitespace, Text, Operator.Word, Whitespace)),
            # Keywords
            (r'^(#\+\w+:)(.*)$', bygroups(Name.Namespace, Text)),
            # Properties and drawers
            (r'(?i)^( *:\w+: *\n)((?:.|\n)*?)(^ *:end: *$)',
             bygroups(Name.Decorator, Comment.Special, Name.Decorator)),
            # Line break operator
            (r'\\\\$', Operator),
            # Deadline, Scheduled, CLOSED
            (r'(?i)^( *(?:DEADLINE|SCHEDULED): )(<.+?> *)$',
             bygroups(Generic.Error, Literal.Date)),
            (r'(?i)^( *CLOSED: )(\[.+?\] *)$',
             bygroups(Generic.Deleted, Literal.Date)),
            # Bold
            (_inline(r'\*', r'\*+'), Generic.Strong),
            # Italic
            (_inline(r'/', r'/'), Generic.Emph),
            # Verbatim
            (_inline(r'=', r'='), String),  # TODO token
            # Code
            (_inline(r'~', r'~'), String),
            # Strikethrough
            (_inline(r'\+', r'\+'), Generic.Deleted),
            # Underline
            (_inline(r'_', r'_+'), Generic.EmphStrong),
            # Dates
            (r'<.+?>', Literal.Date),
            # Macros
            (r'\{\{\{.+?\}\}\}', Comment.Preproc),
            # Footnotes
            (r'(?<!\[)\[fn:.+?\]', Name.Tag),
            # Links
            (r'(?s)(\[\[)(.*?)(\]\[)(.*?)(\]\])',
             bygroups(Punctuation, Name.Attribute, Punctuation, Name.Tag, Punctuation)),
            (r'(?s)(\[\[)(.+?)(\]\])', bygroups(Punctuation, Name.Attribute, Punctuation)),
            (r'(<<)(.+?)(>>)', bygroups(Punctuation, Name.Attribute, Punctuation)),
            # Tables
            (r'^( *)(\|[ -].*?[ -]\|)$', bygroups(Whitespace, String)),
            # Any other text
            (r'[^#*+\-0-9:\\/=~_<{\[|\n]+', Text),
            (r'[#*+\-0-9:\\/=~_<{\[|\n]', Text),
        ],
    }


class TiddlyWiki5Lexer(RegexLexer):
    """
    For TiddlyWiki5 markup.
    """

    name = 'tiddler'
    url = 'https://tiddlywiki.com/#TiddlerFiles'
    aliases = ['tid']
    filenames = ['*.tid']
    mimetypes = ["text/vnd.tiddlywiki"]
    version_added = '2.7'
    flags = re.MULTILINE

    def _handle_codeblock(self, match):
        """
        match args: 1:backticks, 2:lang_name, 3:newline, 4:code, 5:backticks
        """
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), String, match.group(1)
        yield match.start(2), String, match.group(2)
        yield match.start(3), Text, match.group(3)

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group(2).strip())
            except ClassNotFound:
                pass
        code = match.group(4)

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(4), String, code
            return

        yield from do_insertions([], lexer.get_tokens_unprocessed(code))

        yield match.start(5), String, match.group(5)

    def _handle_cssblock(self, match):
        """
        match args: 1:style tag 2:newline, 3:code, 4:closing style tag
        """
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), String, match.group(1)
        yield match.start(2), String, match.group(2)

        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name('css')
            except ClassNotFound:
                pass
        code = match.group(3)

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(3), String, code
            return

        yield from do_insertions([], lexer.get_tokens_unprocessed(code))

        yield match.start(4), String, match.group(4)

    tokens = {
        'root': [
            # title in metadata section
            (r'^(title)(:\s)(.+\n)', bygroups(Keyword, Text, Generic.Heading)),
            # headings
            (r'^(!)([^!].+\n)', bygroups(Generic.Heading, Text)),
            (r'^(!{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
            # bulleted or numbered lists or single-line block quotes
            # (can be mixed)
            (r'^(\s*)([*#>]+)(\s*)(.+\n)',
             bygroups(Text, Keyword, Text, using(this, state='inline'))),
            # multi-line block quotes
            (r'^(<<<.*\n)([\w\W]*?)(^<<<.*$)', bygroups(String, Text, String)),
            # table header
            (r'^(\|.*?\|h)$', bygroups(Generic.Strong)),
            # table footer or caption
            (r'^(\|.*?\|[cf])$', bygroups(Generic.Emph)),
            # table class
            (r'^(\|.*?\|k)$', bygroups(Name.Tag)),
            # definitions
            (r'^(;.*)$', bygroups(Generic.Strong)),
            # text block
            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
            # code block with language
            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
            # CSS style block
            (r'^(<style>)(\n)([\w\W]*?)(^</style>$)', _handle_cssblock),
            include('keywords'),
            include('inline'),
        ],
        'keywords': [
            (words((
                '\\define', '\\end', 'caption', 'created', 'modified', 'tags',
                'title', 'type'), prefix=r'^', suffix=r'\b'),
             Keyword),
        ],
        'inline': [
            # escape
            (r'\\.', Text),
            # created or modified date
            (r'\d{17}', Number.Integer),
            # italics
            (r'(\s)(//[^/]+//)((?=\W|\n))',
             bygroups(Text, Generic.Emph, Text)),
            # superscript
            (r'(\s)(\^\^[^\^]+\^\^)', bygroups(Text, Generic.Emph)),
            # subscript
            (r'(\s)(,,[^,]+,,)', bygroups(Text, Generic.Emph)),
            # underscore
            (r'(\s)(__[^_]+__)', bygroups(Text, Generic.Strong)),
            # bold
            (r"(\s)(''[^']+'')((?=\W|\n))",
             bygroups(Text, Generic.Strong, Text)),
            # strikethrough
            (r'(\s)(~~[^~]+~~)((?=\W|\n))',
             bygroups(Text, Generic.Deleted, Text)),
            # TiddlyWiki variables
            (r'<<[^>]+>>', Name.Tag),
            (r'\$\$[^$]+\$\$', Name.Tag),
            (r'\$\([^)]+\)\$', Name.Tag),
            # TiddlyWiki style or class
            (r'^@@.*$', Name.Tag),
            # HTML tags
            (r'</?[^>]+>', Name.Tag),
            # inline code
            (r'`[^`]+`', String.Backtick),
            # HTML escaped symbols
            (r'&\S*?;', String.Regex),
            # Wiki links
            (r'(\[{2})([^]\|]+)(\]{2})', bygroups(Text, Name.Tag, Text)),
            # External links
            (r'(\[{2})([^]\|]+)(\|)([^]\|]+)(\]{2})',
             bygroups(Text, Name.Tag, Text, Name.Attribute, Text)),
            # Transclusion
            (r'(\{{2})([^}]+)(\}{2})', bygroups(Text, Name.Tag, Text)),
            # URLs
            (r'(\b.?.?tps?://[^\s"]+)', bygroups(Name.Attribute)),
            # general text, must come last!
            (r'[\w]+', Text),
            (r'.', Text)
        ],
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)


class WikitextLexer(RegexLexer):
    """
    For MediaWiki Wikitext.

    Parsing Wikitext is tricky, and results vary between different MediaWiki
    installations, so we only highlight common syntaxes (built-in or from
    popular extensions), and also assume templates produce no unbalanced
    syntaxes.
    """

    name = 'Wikitext'
    url = 'https://www.mediawiki.org/wiki/Wikitext'
    aliases = ['wikitext', 'mediawiki']
    filenames = []
    mimetypes = ['text/x-wiki']
    version_added = '2.15'
    flags = re.MULTILINE

    def nowiki_tag_rules(tag_name):
        return [
            (rf'(?i)(</)({tag_name})(\s*)(>)', bygroups(Punctuation,
             Name.Tag, Whitespace, Punctuation), '#pop'),
            include('entity'),
            include('text'),
        ]

    def plaintext_tag_rules(tag_name):
        return [
            (rf'(?si)(.*?)(</)({tag_name})(\s*)(>)', bygroups(Text,
             Punctuation, Name.Tag, Whitespace, Punctuation), '#pop'),
        ]

    def delegate_tag_rules(tag_name, lexer, **lexer_kwargs):
        return [
            (rf'(?i)(</)({tag_name})(\s*)(>)', bygroups(Punctuation,
             Name.Tag, Whitespace, Punctuation), '#pop'),
            (rf'(?si).+?(?=</{tag_name}\s*>)', using(lexer, **lexer_kwargs)),
        ]

    def text_rules(token):
        return [
            (r'\w+', token),
            (r'[^\S\n]+', token),
            (r'(?s).', token),
        ]
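    # The four helpers above are rule factories: they run at class-definition
    # time and return rule lists that are spliced into the `tokens` table, so
    # per-tag states share one template instead of hand-copied rules.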
    def handle_syntaxhighlight(self, match, ctx):
        from pygments.lexers import get_lexer_by_name

        attr_content = match.group()
        start = 0
        index = 0
        while True:
            index = attr_content.find('>', start)
            # Exclude comment end (-->)
            if attr_content[index-2:index] != '--':
                break
            start = index + 1

        if index == -1:
            # No tag end
            yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
            return
        attr = attr_content[:index]
        yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
        yield match.start(3) + index, Punctuation, '>'

        lexer = None
        content = attr_content[index+1:]
        lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)

        if len(lang_match) >= 1:
            # Pick the last match in case of multiple matches
            lang = lang_match[-1][1]
            try:
                lexer = get_lexer_by_name(lang)
            except ClassNotFound:
                pass

        if lexer is None:
            yield match.start() + index + 1, Text, content
        else:
            yield from lexer.get_tokens_unprocessed(content)

    def handle_score(self, match, ctx):
        attr_content = match.group()
        start = 0
        index = 0
        while True:
            index = attr_content.find('>', start)
            # Exclude comment end (-->)
            if attr_content[index-2:index] != '--':
                break
            start = index + 1

        if index == -1:
            # No tag end
            yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
            return
        attr = attr_content[:index]
        content = attr_content[index+1:]
        yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
        yield match.start(3) + index, Punctuation, '>'

        lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)
        # Pick the last match in case of multiple matches
        lang = lang_match[-1][1] if len(lang_match) >= 1 else 'lilypond'

        if lang == 'lilypond':  # Case sensitive
            yield from LilyPondLexer().get_tokens_unprocessed(content)
        else:  # ABC
            # FIXME: Use ABC lexer in the future
            yield match.start() + index + 1, Text, content

    # a-z removed to prevent linter from complaining, REMEMBER to use (?i)
    title_char = r' %!"$&\'()*,\-./0-9:;=?@A-Z\\\^_`~+\u0080-\uFFFF'
    nbsp_char = r'(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|[ \xA0\u1680\u2000-\u200A\u202F\u205F\u3000])'
    link_address = r'(?:[0-9.]+|\[[0-9a-f:.]+\]|[^\x00-\x20"<>\[\]\x7F\xA0\u1680\u2000-\u200A\u202F\u205F\u3000\uFFFD])'
    link_char_class = r'[^\x00-\x20"<>\[\]\x7F\xA0\u1680\u2000-\u200A\u202F\u205F\u3000\uFFFD]'
    double_slashes_i = {
        '__FORCETOC__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOEDITSECTION__', '__NOGALLERY__',
        '__NOTITLECONVERT__', '__NOTC__', '__NOTOC__', '__TOC__',
    }
    double_slashes = {
        '__EXPECTUNUSEDCATEGORY__', '__HIDDENCAT__', '__INDEX__', '__NEWSECTIONLINK__',
        '__NOINDEX__', '__NONEWSECTIONLINK__', '__STATICREDIRECT__', '__NOGLOBAL__',
        '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__',
    }
    protocols = {
        'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://', 'https://',
        'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:', 'nntp://', 'redis://',
        'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://', 'svn://', 'tel:', 'telnet://', 'urn:',
        'worldwind://', 'xmpp:', '//',
    }
    non_relative_protocols = protocols - {'//'}
    html_tags = {
        'abbr', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center', 'cite', 'code',
        'data', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'h5',
        'h6', 'hr', 'i', 'ins', 'kbd', 'li', 'link', 'mark', 'meta', 'ol', 'p', 'q', 'rb', 'rp',
        'rt', 'rtc', 'ruby', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
        'table', 'td', 'th', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr',
    }
    parser_tags = {
        'graph', 'charinsert', 'rss', 'chem', 'categorytree', 'nowiki', 'inputbox', 'math',
        'hiero', 'score', 'pre', 'ref', 'translate', 'imagemap', 'templatestyles', 'languages',
        'noinclude', 'mapframe', 'section', 'poem', 'syntaxhighlight', 'includeonly', 'tvar',
        'onlyinclude', 'templatedata', 'langconvert', 'timeline', 'dynamicpagelist', 'gallery',
        'maplink', 'ce', 'references',
    }
    variant_langs = {
        # ZhConverter.php
        'zh', 'zh-hans', 'zh-hant', 'zh-cn', 'zh-hk', 'zh-mo', 'zh-my', 'zh-sg', 'zh-tw',
        # WuuConverter.php
        'wuu', 'wuu-hans', 'wuu-hant',
        # UzConverter.php
        'uz', 'uz-latn', 'uz-cyrl',
        # TlyConverter.php
        'tly', 'tly-cyrl',
        # TgConverter.php
        'tg', 'tg-latn',
        # SrConverter.php
        'sr', 'sr-ec', 'sr-el',
        # ShiConverter.php
        'shi', 'shi-tfng', 'shi-latn',
        # ShConverter.php
        'sh-latn', 'sh-cyrl',
        # KuConverter.php
        'ku', 'ku-arab', 'ku-latn',
        # IuConverter.php
        'iu', 'ike-cans', 'ike-latn',
        # GanConverter.php
        'gan', 'gan-hans', 'gan-hant',
        # EnConverter.php
        'en', 'en-x-piglatin',
        # CrhConverter.php
        'crh', 'crh-cyrl', 'crh-latn',
        # BanConverter.php
        'ban', 'ban-bali', 'ban-x-dharma', 'ban-x-palmleaf', 'ban-x-pku',
    }
    magic_vars_i = {
        'ARTICLEPATH', 'INT', 'PAGEID', 'SCRIPTPATH', 'SERVER', 'SERVERNAME', 'STYLEPATH',
    }
    magic_vars = {
        '!', '=', 'BASEPAGENAME', 'BASEPAGENAMEE', 'CASCADINGSOURCES', 'CONTENTLANGUAGE',
        'CONTENTLANG', 'CURRENTDAY', 'CURRENTDAY2', 'CURRENTDAYNAME', 'CURRENTDOW', 'CURRENTHOUR',
        'CURRENTMONTH', 'CURRENTMONTH2', 'CURRENTMONTH1', 'CURRENTMONTHABBREV', 'CURRENTMONTHNAME',
        'CURRENTMONTHNAMEGEN', 'CURRENTTIME', 'CURRENTTIMESTAMP', 'CURRENTVERSION', 'CURRENTWEEK',
        'CURRENTYEAR', 'DIRECTIONMARK', 'DIRMARK', 'FULLPAGENAME', 'FULLPAGENAMEE', 'LOCALDAY',
        'LOCALDAY2', 'LOCALDAYNAME', 'LOCALDOW', 'LOCALHOUR', 'LOCALMONTH', 'LOCALMONTH2',
        'LOCALMONTH1', 'LOCALMONTHABBREV', 'LOCALMONTHNAME', 'LOCALMONTHNAMEGEN', 'LOCALTIME',
        'LOCALTIMESTAMP', 'LOCALWEEK', 'LOCALYEAR', 'NAMESPACE', 'NAMESPACEE', 'NAMESPACENUMBER',
        'NUMBEROFACTIVEUSERS', 'NUMBEROFADMINS', 'NUMBEROFARTICLES', 'NUMBEROFEDITS',
        'NUMBEROFFILES', 'NUMBEROFPAGES', 'NUMBEROFUSERS', 'PAGELANGUAGE', 'PAGENAME', 'PAGENAMEE',
        'REVISIONDAY', 'REVISIONDAY2', 'REVISIONID', 'REVISIONMONTH', 'REVISIONMONTH1',
        'REVISIONSIZE', 'REVISIONTIMESTAMP', 'REVISIONUSER', 'REVISIONYEAR', 'ROOTPAGENAME',
        'ROOTPAGENAMEE', 'SITENAME', 'SUBJECTPAGENAME', 'ARTICLEPAGENAME', 'SUBJECTPAGENAMEE',
        'ARTICLEPAGENAMEE', 'SUBJECTSPACE', 'ARTICLESPACE', 'SUBJECTSPACEE', 'ARTICLESPACEE',
        'SUBPAGENAME', 'SUBPAGENAMEE', 'TALKPAGENAME', 'TALKPAGENAMEE', 'TALKSPACE', 'TALKSPACEE',
    }
    parser_functions_i = {
        'ANCHORENCODE', 'BIDI', 'CANONICALURL', 'CANONICALURLE', 'FILEPATH', 'FORMATNUM',
        'FULLURL', 'FULLURLE', 'GENDER', 'GRAMMAR', 'INT', r'\#LANGUAGE', 'LC', 'LCFIRST', 'LOCALURL',
        'LOCALURLE', 'NS', 'NSE', 'PADLEFT', 'PADRIGHT', 'PAGEID', 'PLURAL', 'UC', 'UCFIRST',
        'URLENCODE',
    }
    parser_functions = {
        'BASEPAGENAME', 'BASEPAGENAMEE', 'CASCADINGSOURCES', 'DEFAULTSORT', 'DEFAULTSORTKEY',
        'DEFAULTCATEGORYSORT', 'FULLPAGENAME', 'FULLPAGENAMEE', 'NAMESPACE', 'NAMESPACEE',
        'NAMESPACENUMBER', 'NUMBERINGROUP', 'NUMINGROUP', 'NUMBEROFACTIVEUSERS', 'NUMBEROFADMINS',
        'NUMBEROFARTICLES', 'NUMBEROFEDITS', 'NUMBEROFFILES', 'NUMBEROFPAGES', 'NUMBEROFUSERS',
        'PAGENAME', 'PAGENAMEE', 'PAGESINCATEGORY', 'PAGESINCAT', 'PAGESIZE', 'PROTECTIONEXPIRY',
        'PROTECTIONLEVEL', 'REVISIONDAY', 'REVISIONDAY2', 'REVISIONID', 'REVISIONMONTH',
        'REVISIONMONTH1', 'REVISIONTIMESTAMP', 'REVISIONUSER', 'REVISIONYEAR', 'ROOTPAGENAME',
        'ROOTPAGENAMEE', 'SUBJECTPAGENAME', 'ARTICLEPAGENAME', 'SUBJECTPAGENAMEE',
        'ARTICLEPAGENAMEE', 'SUBJECTSPACE', 'ARTICLESPACE', 'SUBJECTSPACEE', 'ARTICLESPACEE',
        'SUBPAGENAME', 'SUBPAGENAMEE', 'TALKPAGENAME', 'TALKPAGENAMEE', 'TALKSPACE', 'TALKSPACEE',
        'INT', 'DISPLAYTITLE', 'PAGESINNAMESPACE', 'PAGESINNS',
    }

    tokens = {
        'root': [
            # Redirects
            (r"""(?xi)
                (\A\s*?)(\#REDIRECT:?) # may contain a colon
                (\s+)(\[\[) (?=[^\]\n]* \]\]$)
                """,
             bygroups(Whitespace, Keyword, Whitespace, Punctuation), 'redirect-inner'),
            # Subheadings
            (r'^(={2,6})(.+?)(\1)(\s*$\n)',
             bygroups(Generic.Subheading, Generic.Subheading, Generic.Subheading, Whitespace)),
            # Headings
            (r'^(=.+?=)(\s*$\n)',
             bygroups(Generic.Heading, Whitespace)),
            # Double-slashed magic words
            (words(double_slashes_i, prefix=r'(?i)'), Name.Function.Magic),
            (words(double_slashes), Name.Function.Magic),
            # Raw URLs
            (r'(?i)\b(?:{}){}{}*'.format('|'.join(protocols),
             link_address, link_char_class), Name.Label),
            # Magic links
            (rf'\b(?:RFC|PMID){nbsp_char}+[0-9]+\b',
             Name.Function.Magic),
            (r"""(?x)
                \bISBN {nbsp_char}
                (?: 97[89] {nbsp_dash}? )?
                (?: [0-9] {nbsp_dash}? ){{9}} # escape format()
                [0-9Xx]\b
            """.format(nbsp_char=nbsp_char, nbsp_dash=f'(?:-|{nbsp_char})'), Name.Function.Magic),
            include('list'),
            include('inline'),
            include('text'),
        ],
        'redirect-inner': [
            (r'(\]\])(\s*?\n)', bygroups(Punctuation, Whitespace), '#pop'),
            (r'(\#)([^#]*?)', bygroups(Punctuation, Name.Label)),
            (rf'(?i)[{title_char}]+', Name.Tag),
        ],
        'list': [
            # Description lists
            (r'^;', Keyword, 'dt'),
            # Ordered lists, unordered lists and indents
            (r'^[#:*]+', Keyword),
            # Horizontal rules
            (r'^-{4,}', Keyword),
        ],
        'inline': [
            # Signatures
            (r'~{3,5}', Keyword),
            # Entities
            include('entity'),
            # Bold & italic
            (r"('')(''')(?!')", bygroups(Generic.Emph,
             Generic.EmphStrong), 'inline-italic-bold'),
            (r"'''(?!')", Generic.Strong, 'inline-bold'),
            (r"''(?!')", Generic.Emph, 'inline-italic'),
            # Comments & parameters & templates
            include('replaceable'),
            # Media links
            (
                r"""(?xi)
                (\[\[)
                    (File|Image) (:)
                    ((?: [{}] | \{{{{2,3}}[^{{}}]*?\}}{{2,3}} | <!--[\s\S]*?--> )*)
                    (?: (\#) ([{}]*?) )?
                """.format(title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label),
                'medialink-inner'
            ),
            # Wikilinks
            (
                r"""(?xi)
                (\[\[)(?!{}) # Should not contain URLs
                    (?: ([{}]*) (:))?
                    ((?: [{}] | \{{{{2,3}}[^{{}}]*?\}}{{2,3}} | <!--[\s\S]*?--> )*?)
                    (?: (\#) ([{}]*?) )?
                (\]\])
                """.format('|'.join(protocols), title_char.replace('/', ''),
                           title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label, Punctuation)
            ),
            (
                r"""(?xi)
                (\[\[)(?!{})
                    (?: ([{}]*) (:))?
                    ((?: [{}] | \{{{{2,3}}[^{{}}]*?\}}{{2,3}} | <!--[\s\S]*?--> )*?)
                    (?: (\#) ([{}]*?) )?
                (\|)
                """.format('|'.join(protocols), title_char.replace('/', ''),
                           title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label, Punctuation),
                'wikilink-inner'
            ),
            # External links
            (
                r"""(?xi)
                (\[)
                ((?:{}) {} {}*)
                (\s*)
                """.format('|'.join(protocols), link_address, link_char_class),
                bygroups(Punctuation, Name.Label, Whitespace),
                'extlink-inner'
            ),
            # Tables
            (r'^(:*)(\s*?)(\{\|)([^\n]*)$', bygroups(Keyword,
             Whitespace, Punctuation, using(this, state=['root', 'attr'])), 'table'),
            # HTML tags
            (r'(?i)(<)({})\b'.format('|'.join(html_tags)),
             bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
            (r'(?i)(</)({})\b(\s*)(>)'.format('|'.join(html_tags)),
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            # <nowiki>
            (r'(?i)(<)(nowiki)\b', bygroups(Punctuation,
             Name.Tag), ('tag-nowiki', 'tag-inner')),
            # <pre>
            (r'(?i)(<)(pre)\b', bygroups(Punctuation,
             Name.Tag), ('tag-pre', 'tag-inner')),
            # <categorytree>
            (r'(?i)(<)(categorytree)\b', bygroups(
                Punctuation, Name.Tag), ('tag-categorytree', 'tag-inner')),
            # <hiero>
            (r'(?i)(<)(hiero)\b', bygroups(Punctuation,
             Name.Tag), ('tag-hiero', 'tag-inner')),
            # <math>
            (r'(?i)(<)(math)\b', bygroups(Punctuation,
             Name.Tag), ('tag-math', 'tag-inner')),
            # <chem>
            (r'(?i)(<)(chem)\b', bygroups(Punctuation,
             Name.Tag), ('tag-chem', 'tag-inner')),
            # <ce>
            (r'(?i)(<)(ce)\b', bygroups(Punctuation,
             Name.Tag), ('tag-ce', 'tag-inner')),
            # <charinsert>
            (r'(?i)(<)(charinsert)\b', bygroups(
                Punctuation, Name.Tag), ('tag-charinsert', 'tag-inner')),
            # <templatedata>
            (r'(?i)(<)(templatedata)\b', bygroups(
                Punctuation, Name.Tag), ('tag-templatedata', 'tag-inner')),
            # <gallery>
            (r'(?i)(<)(gallery)\b', bygroups(
                Punctuation, Name.Tag), ('tag-gallery', 'tag-inner')),
            # <graph>
            (r'(?i)(<)(graph)\b', bygroups(
                Punctuation, Name.Tag), ('tag-graph', 'tag-inner')),
  1132. # <dynamicpagelist>
  1133. (r'(?i)(<)(dynamicpagelist)\b', bygroups(
  1134. Punctuation, Name.Tag), ('tag-dynamicpagelist', 'tag-inner')),
  1135. # <inputbox>
  1136. (r'(?i)(<)(inputbox)\b', bygroups(
  1137. Punctuation, Name.Tag), ('tag-inputbox', 'tag-inner')),
  1138. # <rss>
  1139. (r'(?i)(<)(rss)\b', bygroups(
  1140. Punctuation, Name.Tag), ('tag-rss', 'tag-inner')),
  1141. # <imagemap>
  1142. (r'(?i)(<)(imagemap)\b', bygroups(
  1143. Punctuation, Name.Tag), ('tag-imagemap', 'tag-inner')),
  1144. # <syntaxhighlight>
  1145. (r'(?i)(</)(syntaxhighlight)\b(\s*)(>)',
  1146. bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
  1147. (r'(?si)(<)(syntaxhighlight)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
  1148. bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
  1149. # <syntaxhighlight>: Fallback case for self-closing tags
  1150. (r'(?i)(<)(syntaxhighlight)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
  1151. Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
  1152. # <source>
  1153. (r'(?i)(</)(source)\b(\s*)(>)',
  1154. bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
  1155. (r'(?si)(<)(source)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
  1156. bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
  1157. # <source>: Fallback case for self-closing tags
  1158. (r'(?i)(<)(source)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
  1159. Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
  1160. # <score>
  1161. (r'(?i)(</)(score)\b(\s*)(>)',
  1162. bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
  1163. (r'(?si)(<)(score)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
  1164. bygroups(Punctuation, Name.Tag, handle_score)),
  1165. # <score>: Fallback case for self-closing tags
  1166. (r'(?i)(<)(score)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
  1167. Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
  1168. # Other parser tags
  1169. (r'(?i)(<)({})\b'.format('|'.join(parser_tags)),
  1170. bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
  1171. (r'(?i)(</)({})\b(\s*)(>)'.format('|'.join(parser_tags)),
  1172. bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
  1173. # LanguageConverter markups
  1174. (
  1175. r"""(?xi)
  1176. (-\{{) # Use {{ to escape format()
  1177. ([^|]) (\|)
  1178. (?:
  1179. (?: ([^;]*?) (=>))?
  1180. (\s* (?:{variants}) \s*) (:)
  1181. )?
  1182. """.format(variants='|'.join(variant_langs)),
  1183. bygroups(Punctuation, Keyword, Punctuation,
  1184. using(this, state=['root', 'lc-raw']),
  1185. Operator, Name.Label, Punctuation),
  1186. 'lc-inner'
  1187. ),
  1188. # LanguageConverter markups: composite conversion grammar
  1189. (
  1190. r"""(?xi)
  1191. (-\{)
  1192. ([a-z\s;-]*?) (\|)
  1193. """,
  1194. bygroups(Punctuation,
  1195. using(this, state=['root', 'lc-flag']),
  1196. Punctuation),
  1197. 'lc-raw'
  1198. ),
  1199. # LanguageConverter markups: fallbacks
  1200. (
  1201. r"""(?xi)
  1202. (-\{{) (?!\{{) # Use {{ to escape format()
  1203. (?: (\s* (?:{variants}) \s*) (:))?
  1204. """.format(variants='|'.join(variant_langs)),
  1205. bygroups(Punctuation, Name.Label, Punctuation),
  1206. 'lc-inner'
  1207. ),
  1208. ],
  1209. 'wikilink-name': [
  1210. include('replaceable'),
  1211. (r'[^{<]+', Name.Tag),
  1212. (r'(?s).', Name.Tag),
  1213. ],
  1214. 'wikilink-inner': [
  1215. # Quit in case of another wikilink
  1216. (r'(?=\[\[)', Punctuation, '#pop'),
  1217. (r'\]\]', Punctuation, '#pop'),
  1218. include('inline'),
  1219. include('text'),
  1220. ],
  1221. 'medialink-inner': [
  1222. (r'\]\]', Punctuation, '#pop'),
  1223. (r'(\|)([^\n=|]*)(=)',
  1224. bygroups(Punctuation, Name.Attribute, Operator)),
  1225. (r'\|', Punctuation),
  1226. include('inline'),
  1227. include('text'),
  1228. ],
  1229. 'quote-common': [
  1230. # Quit in case of link/template endings
  1231. (r'(?=\]\]|\{\{|\}\})', Punctuation, '#pop'),
  1232. (r'\n', Text, '#pop'),
  1233. ],
  1234. 'inline-italic': [
  1235. include('quote-common'),
  1236. (r"('')(''')(?!')", bygroups(Generic.Emph,
  1237. Generic.Strong), ('#pop', 'inline-bold')),
  1238. (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic-bold')),
  1239. (r"''(?!')", Generic.Emph, '#pop'),
  1240. include('inline'),
  1241. include('text-italic'),
  1242. ],
  1243. 'inline-bold': [
  1244. include('quote-common'),
  1245. (r"(''')('')(?!')", bygroups(
  1246. Generic.Strong, Generic.Emph), ('#pop', 'inline-italic')),
  1247. (r"'''(?!')", Generic.Strong, '#pop'),
  1248. (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold-italic')),
  1249. include('inline'),
  1250. include('text-bold'),
  1251. ],
  1252. 'inline-bold-italic': [
  1253. include('quote-common'),
  1254. (r"('')(''')(?!')", bygroups(Generic.EmphStrong,
  1255. Generic.Strong), '#pop'),
  1256. (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic')),
  1257. (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold')),
  1258. include('inline'),
  1259. include('text-bold-italic'),
  1260. ],
  1261. 'inline-italic-bold': [
  1262. include('quote-common'),
  1263. (r"(''')('')(?!')", bygroups(
  1264. Generic.EmphStrong, Generic.Emph), '#pop'),
  1265. (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic')),
  1266. (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold')),
  1267. include('inline'),
  1268. include('text-bold-italic'),
  1269. ],
        'lc-flag': [
            (r'\s+', Whitespace),
            (r';', Punctuation),
            *text_rules(Keyword),
        ],
        'lc-inner': [
            (
                r"""(?xi)
                (;)
                (?: ([^;]*?) (=>))?
                (\s* (?:{variants}) \s*) (:)
                """.format(variants='|'.join(variant_langs)),
                bygroups(Punctuation, using(this, state=['root', 'lc-raw']),
                         Operator, Name.Label, Punctuation)
            ),
            (r';?\s*?\}-', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'lc-raw': [
            (r'\}-', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
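        # Constructs the preprocessor expands before parsing: comments,
        # {{{parameters}}}, magic words such as {{PAGENAME}}, templates and
        # parser functions, plus the Translate extension's <tvar> markers.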
        'replaceable': [
            # Comments
            (r'<!--[\s\S]*?(?:-->|\Z)', Comment.Multiline),
            # Parameters
            (
                r"""(?x)
                (\{{3})
                ([^|]*?)
                (?=\}{3}|\|)
                """,
                bygroups(Punctuation, Name.Variable),
                'parameter-inner',
            ),
            # Magic variables
            (r'(?i)(\{{\{{)(\s*)({})(\s*)(\}}\}})'.format('|'.join(magic_vars_i)),
             bygroups(Punctuation, Whitespace, Name.Function, Whitespace, Punctuation)),
            (r'(\{{\{{)(\s*)({})(\s*)(\}}\}})'.format('|'.join(magic_vars)),
             bygroups(Punctuation, Whitespace, Name.Function, Whitespace, Punctuation)),
            # Parser functions & templates
            (r'\{\{', Punctuation, 'template-begin-space'),
            # <tvar> legacy syntax
            (r'(?i)(<)(tvar)\b(\|)([^>]*?)(>)', bygroups(Punctuation,
             Name.Tag, Punctuation, String, Punctuation)),
            (r'</>', Punctuation, '#pop'),
            # <tvar>
            (r'(?i)(<)(tvar)\b', bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
            (r'(?i)(</)(tvar)\b(\s*)(>)',
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
        ],
        'parameter-inner': [
            (r'\}{3}', Punctuation, '#pop'),
            (r'\|', Punctuation),
            include('inline'),
            include('text'),
        ],
        'template-begin-space': [
            # Templates allow line breaks at the beginning, and due to how MediaWiki handles
            # comments, an extra state is required to handle things like {{\n<!---->\n name}}
            (r'<!--[\s\S]*?(?:-->|\Z)', Comment.Multiline),
            (r'\s+', Whitespace),
            # Parser functions
            (
                r'(?i)(\#[{}]*?|{})(:)'.format(title_char,
                                               '|'.join(parser_functions_i)),
                bygroups(Name.Function, Punctuation), ('#pop', 'template-inner')
            ),
            (
                r'({})(:)'.format('|'.join(parser_functions)),
                bygroups(Name.Function, Punctuation), ('#pop', 'template-inner')
            ),
            # Templates
            (
                rf'(?i)([{title_char}]*?)(:)',
                bygroups(Name.Namespace, Punctuation), ('#pop', 'template-name')
            ),
            default(('#pop', 'template-name'),),
        ],
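        # Inside {{...}}: 'template-name' covers the title, then
        # 'template-inner' lexes the |-separated arguments and marks
        # name= argument labels.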
        'template-name': [
            (r'(\s*?)(\|)', bygroups(Text, Punctuation), ('#pop', 'template-inner')),
            (r'\}\}', Punctuation, '#pop'),
            (r'\n', Text, '#pop'),
            include('replaceable'),
            *text_rules(Name.Tag),
        ],
        'template-inner': [
            (r'\}\}', Punctuation, '#pop'),
            (r'\|', Punctuation),
            (
                r"""(?x)
                (?<=\|)
                ( (?: (?! \{\{ | \}\} )[^=\|<])*? ) # Exclude templates and tags
                (=)
                """,
                bygroups(Name.Label, Operator)
            ),
            include('inline'),
            include('text'),
        ],
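        # Wikitext tables, e.g.:
        #   {| class="wikitable"
        #   |+ caption
        #   ! header !! header
        #   |-
        #   | cell || cell
        #   |}
        # Row/cell/caption markers may carry attributes terminated by a
        # single "|", which the rules below delegate to the 'attr' state.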
        'table': [
            # Use [ \t\n\r\0\x0B] instead of \s to follow PHP trim() behavior
            # Endings
            (r'^([ \t\n\r\0\x0B]*?)(\|\})',
             bygroups(Whitespace, Punctuation), '#pop'),
            # Table rows
            (r'^([ \t\n\r\0\x0B]*?)(\|-+)(.*)$', bygroups(Whitespace, Punctuation,
             using(this, state=['root', 'attr']))),
            # Captions
            (
                r"""(?x)
                ^([ \t\n\r\0\x0B]*?)(\|\+)
                # Exclude links, templates and tags
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|) )?
                (.*?)$
                """,
                bygroups(Whitespace, Punctuation, using(this, state=[
                    'root', 'attr']), Punctuation, Generic.Heading),
            ),
            # Table data
            (
                r"""(?x)
                ( ^(?:[ \t\n\r\0\x0B]*?)\| | \|\| )
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|)(?!\|) )?
                """,
                bygroups(Punctuation, using(this, state=[
                    'root', 'attr']), Punctuation),
            ),
            # Table headers
            (
                r"""(?x)
                ( ^(?:[ \t\n\r\0\x0B]*?)! )
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|)(?!\|) )?
                """,
                bygroups(Punctuation, using(this, state=[
                    'root', 'attr']), Punctuation),
                'table-header',
            ),
            include('list'),
            include('inline'),
            include('text'),
        ],
        'table-header': [
            # Requires another state for || handling inside headers
            (r'\n', Text, '#pop'),
            (
                r"""(?x)
                (!!|\|\|)
                (?:
                    ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )
                    (\|)(?!\|)
                )?
                """,
                bygroups(Punctuation, using(this, state=[
                    'root', 'attr']), Punctuation)
            ),
            *text_rules(Generic.Subheading),
        ],
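        # HTML character references such as &amp; or &#x27;.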
        'entity': [
            (r'&\S*?;', Name.Entity),
        ],
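        # Definition lists: in ";term : definition", the ":" following a
        # ";" item introduces the definition part.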
        'dt': [
            (r'\n', Text, '#pop'),
            include('inline'),
            (r':', Keyword, '#pop'),
            include('text'),
        ],
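        # Bracketed external links, e.g. [https://example.com label],
        # end at the first "]".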
        'extlink-inner': [
            (r'\]', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'nowiki-ish': [
            include('entity'),
            include('text'),
        ],
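        # HTML-like attribute lists: values may be double-quoted,
        # single-quoted, or bare, handled by 'attr-val-2', 'attr-val-1'
        # and 'attr-val-0' respectively.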
        'attr': [
            include('replaceable'),
            (r'\s+', Whitespace),
            (r'(=)(\s*)(")', bygroups(Operator, Whitespace, String.Double), 'attr-val-2'),
            (r"(=)(\s*)(')", bygroups(Operator, Whitespace, String.Single), 'attr-val-1'),
            (r'(=)(\s*)', bygroups(Operator, Whitespace), 'attr-val-0'),
            (r'[\w:-]+', Name.Attribute),
        ],
        'attr-val-0': [
            (r'\s', Whitespace, '#pop'),
            include('replaceable'),
            *text_rules(String),
        ],
        'attr-val-1': [
            (r"'", String.Single, '#pop'),
            include('replaceable'),
            *text_rules(String.Single),
        ],
        'attr-val-2': [
            (r'"', String.Double, '#pop'),
            include('replaceable'),
            *text_rules(String.Double),
        ],
        'tag-inner-ordinary': [
            (r'/?\s*>', Punctuation, '#pop'),
            include('tag-attr'),
        ],
        'tag-inner': [
            # Return to root state for self-closing tags
            (r'/\s*>', Punctuation, '#pop:2'),
            (r'\s*>', Punctuation, '#pop'),
            include('tag-attr'),
        ],
        # The states below are just like their non-tag variants; the key
        # difference is that they forcibly quit on tag-closing markup
        'tag-attr': [
            include('replaceable'),
            (r'\s+', Whitespace),
            (r'(=)(\s*)(")', bygroups(Operator,
             Whitespace, String.Double), 'tag-attr-val-2'),
            (r"(=)(\s*)(')", bygroups(Operator,
             Whitespace, String.Single), 'tag-attr-val-1'),
            (r'(=)(\s*)', bygroups(Operator, Whitespace), 'tag-attr-val-0'),
            (r'[\w:-]+', Name.Attribute),
        ],
        'tag-attr-val-0': [
            (r'\s', Whitespace, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String),
        ],
        'tag-attr-val-1': [
            (r"'", String.Single, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String.Single),
        ],
        'tag-attr-val-2': [
            (r'"', String.Double, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String.Double),
        ],
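        # Tag bodies fall into three families: <nowiki>/<pre> keep entity
        # highlighting, plaintext tags are lexed verbatim, and delegate tags
        # hand their contents to another lexer (TeX for <math>, <chem> and
        # <ce>; JSON for <templatedata>).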
        'tag-nowiki': nowiki_tag_rules('nowiki'),
        'tag-pre': nowiki_tag_rules('pre'),
        'tag-categorytree': plaintext_tag_rules('categorytree'),
        'tag-dynamicpagelist': plaintext_tag_rules('dynamicpagelist'),
        'tag-hiero': plaintext_tag_rules('hiero'),
        'tag-inputbox': plaintext_tag_rules('inputbox'),
        'tag-imagemap': plaintext_tag_rules('imagemap'),
        'tag-charinsert': plaintext_tag_rules('charinsert'),
        'tag-timeline': plaintext_tag_rules('timeline'),
        'tag-gallery': plaintext_tag_rules('gallery'),
        'tag-graph': plaintext_tag_rules('graph'),
        'tag-rss': plaintext_tag_rules('rss'),
        'tag-math': delegate_tag_rules('math', TexLexer, state='math'),
        'tag-chem': delegate_tag_rules('chem', TexLexer, state='math'),
        'tag-ce': delegate_tag_rules('ce', TexLexer, state='math'),
        'tag-templatedata': delegate_tag_rules('templatedata', JsonLexer),
        'text-italic': text_rules(Generic.Emph),
        'text-bold': text_rules(Generic.Strong),
        'text-bold-italic': text_rules(Generic.EmphStrong),
        'text': text_rules(Text),
    }
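# A minimal smoke test, assuming the enclosing class is Pygments'
# WikitextLexer (the class name is not shown in this excerpt):
#
#   from pygments import highlight
#   from pygments.formatters import TerminalFormatter
#   code = "'''Bold''' text, a [[Link|label]] and {{PAGENAME}}."
#   print(highlight(code, WikitextLexer(), TerminalFormatter()))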