rdf.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. # -*- coding: utf-8 -*-
  2. """
  3. pygments.lexers.rdf
  4. ~~~~~~~~~~~~~~~~~~~
  5. Lexers for semantic web and RDF query languages and markup.
  6. :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
  7. :license: BSD, see LICENSE for details.
  8. """
  9. import re
  10. from pygments.lexer import RegexLexer, bygroups, default
  11. from pygments.token import Keyword, Punctuation, String, Number, Operator, Generic, \
  12. Whitespace, Name, Literal, Comment, Text
# Public names exported by ``from pygments.lexers.rdf import *``: the three
# lexer classes defined in this module.
__all__ = ['SparqlLexer', 'TurtleLexer', 'ShExCLexer']
  14. class SparqlLexer(RegexLexer):
  15. """
  16. Lexer for `SPARQL <http://www.w3.org/TR/rdf-sparql-query/>`_ query language.
  17. .. versionadded:: 2.0
  18. """
  19. name = 'SPARQL'
  20. aliases = ['sparql']
  21. filenames = ['*.rq', '*.sparql']
  22. mimetypes = ['application/sparql-query']
  23. # character group definitions ::
  24. PN_CHARS_BASE_GRP = (u'a-zA-Z'
  25. u'\u00c0-\u00d6'
  26. u'\u00d8-\u00f6'
  27. u'\u00f8-\u02ff'
  28. u'\u0370-\u037d'
  29. u'\u037f-\u1fff'
  30. u'\u200c-\u200d'
  31. u'\u2070-\u218f'
  32. u'\u2c00-\u2fef'
  33. u'\u3001-\ud7ff'
  34. u'\uf900-\ufdcf'
  35. u'\ufdf0-\ufffd')
  36. PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
  37. PN_CHARS_GRP = (PN_CHARS_U_GRP +
  38. r'\-' +
  39. r'0-9' +
  40. u'\u00b7' +
  41. u'\u0300-\u036f' +
  42. u'\u203f-\u2040')
  43. HEX_GRP = '0-9A-Fa-f'
  44. PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&"()*+,;=/?#@%'
  45. # terminal productions ::
  46. PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
  47. PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
  48. PN_CHARS = '[' + PN_CHARS_GRP + ']'
  49. HEX = '[' + HEX_GRP + ']'
  50. PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
  51. IRIREF = r'<(?:[^<>"{}|^`\\\x00-\x20])*>'
  52. BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
  53. '.]*' + PN_CHARS + ')?'
  54. PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
  55. VARNAME = u'[0-9' + PN_CHARS_U_GRP + '][' + PN_CHARS_U_GRP + \
  56. u'0-9\u00b7\u0300-\u036f\u203f-\u2040]*'
  57. PERCENT = '%' + HEX + HEX
  58. PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
  59. PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
  60. PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
  61. '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
  62. PN_CHARS_GRP + ':]|' + PLX + '))?')
  63. EXPONENT = r'[eE][+-]?\d+'
  64. # Lexer token definitions ::
  65. tokens = {
  66. 'root': [
  67. (r'\s+', Text),
  68. # keywords ::
  69. (r'(?i)(select|construct|describe|ask|where|filter|group\s+by|minus|'
  70. r'distinct|reduced|from\s+named|from|order\s+by|desc|asc|limit|'
  71. r'offset|bindings|load|clear|drop|create|add|move|copy|'
  72. r'insert\s+data|delete\s+data|delete\s+where|delete|insert|'
  73. r'using\s+named|using|graph|default|named|all|optional|service|'
  74. r'silent|bind|union|not\s+in|in|as|having|to|prefix|base)\b', Keyword),
  75. (r'(a)\b', Keyword),
  76. # IRIs ::
  77. ('(' + IRIREF + ')', Name.Label),
  78. # blank nodes ::
  79. ('(' + BLANK_NODE_LABEL + ')', Name.Label),
  80. # # variables ::
  81. ('[?$]' + VARNAME, Name.Variable),
  82. # prefixed names ::
  83. (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
  84. bygroups(Name.Namespace, Punctuation, Name.Tag)),
  85. # function names ::
  86. (r'(?i)(str|lang|langmatches|datatype|bound|iri|uri|bnode|rand|abs|'
  87. r'ceil|floor|round|concat|strlen|ucase|lcase|encode_for_uri|'
  88. r'contains|strstarts|strends|strbefore|strafter|year|month|day|'
  89. r'hours|minutes|seconds|timezone|tz|now|md5|sha1|sha256|sha384|'
  90. r'sha512|coalesce|if|strlang|strdt|sameterm|isiri|isuri|isblank|'
  91. r'isliteral|isnumeric|regex|substr|replace|exists|not\s+exists|'
  92. r'count|sum|min|max|avg|sample|group_concat|separator)\b',
  93. Name.Function),
  94. # boolean literals ::
  95. (r'(true|false)', Keyword.Constant),
  96. # double literals ::
  97. (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
  98. # decimal literals ::
  99. (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
  100. # integer literals ::
  101. (r'[+\-]?\d+', Number.Integer),
  102. # operators ::
  103. (r'(\|\||&&|=|\*|\-|\+|/|!=|<=|>=|!|<|>)', Operator),
  104. # punctuation characters ::
  105. (r'[(){}.;,:^\[\]]', Punctuation),
  106. # line comments ::
  107. (r'#[^\n]*', Comment),
  108. # strings ::
  109. (r'"""', String, 'triple-double-quoted-string'),
  110. (r'"', String, 'single-double-quoted-string'),
  111. (r"'''", String, 'triple-single-quoted-string'),
  112. (r"'", String, 'single-single-quoted-string'),
  113. ],
  114. 'triple-double-quoted-string': [
  115. (r'"""', String, 'end-of-string'),
  116. (r'[^\\]+', String),
  117. (r'\\', String, 'string-escape'),
  118. ],
  119. 'single-double-quoted-string': [
  120. (r'"', String, 'end-of-string'),
  121. (r'[^"\\\n]+', String),
  122. (r'\\', String, 'string-escape'),
  123. ],
  124. 'triple-single-quoted-string': [
  125. (r"'''", String, 'end-of-string'),
  126. (r'[^\\]+', String),
  127. (r'\\', String.Escape, 'string-escape'),
  128. ],
  129. 'single-single-quoted-string': [
  130. (r"'", String, 'end-of-string'),
  131. (r"[^'\\\n]+", String),
  132. (r'\\', String, 'string-escape'),
  133. ],
  134. 'string-escape': [
  135. (r'u' + HEX + '{4}', String.Escape, '#pop'),
  136. (r'U' + HEX + '{8}', String.Escape, '#pop'),
  137. (r'.', String.Escape, '#pop'),
  138. ],
  139. 'end-of-string': [
  140. (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
  141. bygroups(Operator, Name.Function), '#pop:2'),
  142. (r'\^\^', Operator, '#pop:2'),
  143. default('#pop:2'),
  144. ],
  145. }
  146. class TurtleLexer(RegexLexer):
  147. """
  148. Lexer for `Turtle <http://www.w3.org/TR/turtle/>`_ data language.
  149. .. versionadded:: 2.1
  150. """
  151. name = 'Turtle'
  152. aliases = ['turtle']
  153. filenames = ['*.ttl']
  154. mimetypes = ['text/turtle', 'application/x-turtle']
  155. flags = re.IGNORECASE
  156. patterns = {
  157. 'PNAME_NS': r'((?:[a-z][\w-]*)?\:)', # Simplified character range
  158. 'IRIREF': r'(<[^<>"{}|^`\\\x00-\x20]*>)'
  159. }
  160. # PNAME_NS PN_LOCAL (with simplified character range)
  161. patterns['PrefixedName'] = r'%(PNAME_NS)s([a-z][\w-]*)' % patterns
  162. tokens = {
  163. 'root': [
  164. (r'\s+', Whitespace),
  165. # Base / prefix
  166. (r'(@base|BASE)(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
  167. bygroups(Keyword, Whitespace, Name.Variable, Whitespace,
  168. Punctuation)),
  169. (r'(@prefix|PREFIX)(\s+)%(PNAME_NS)s(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
  170. bygroups(Keyword, Whitespace, Name.Namespace, Whitespace,
  171. Name.Variable, Whitespace, Punctuation)),
  172. # The shorthand predicate 'a'
  173. (r'(?<=\s)a(?=\s)', Keyword.Type),
  174. # IRIREF
  175. (r'%(IRIREF)s' % patterns, Name.Variable),
  176. # PrefixedName
  177. (r'%(PrefixedName)s' % patterns,
  178. bygroups(Name.Namespace, Name.Tag)),
  179. # Comment
  180. (r'#[^\n]+', Comment),
  181. (r'\b(true|false)\b', Literal),
  182. (r'[+\-]?\d*\.\d+', Number.Float),
  183. (r'[+\-]?\d*(:?\.\d+)?E[+\-]?\d+', Number.Float),
  184. (r'[+\-]?\d+', Number.Integer),
  185. (r'[\[\](){}.;,:^]', Punctuation),
  186. (r'"""', String, 'triple-double-quoted-string'),
  187. (r'"', String, 'single-double-quoted-string'),
  188. (r"'''", String, 'triple-single-quoted-string'),
  189. (r"'", String, 'single-single-quoted-string'),
  190. ],
  191. 'triple-double-quoted-string': [
  192. (r'"""', String, 'end-of-string'),
  193. (r'[^\\]+', String),
  194. (r'\\', String, 'string-escape'),
  195. ],
  196. 'single-double-quoted-string': [
  197. (r'"', String, 'end-of-string'),
  198. (r'[^"\\\n]+', String),
  199. (r'\\', String, 'string-escape'),
  200. ],
  201. 'triple-single-quoted-string': [
  202. (r"'''", String, 'end-of-string'),
  203. (r'[^\\]+', String),
  204. (r'\\', String, 'string-escape'),
  205. ],
  206. 'single-single-quoted-string': [
  207. (r"'", String, 'end-of-string'),
  208. (r"[^'\\\n]+", String),
  209. (r'\\', String, 'string-escape'),
  210. ],
  211. 'string-escape': [
  212. (r'.', String, '#pop'),
  213. ],
  214. 'end-of-string': [
  215. (r'(@)([a-z]+(:?-[a-z0-9]+)*)',
  216. bygroups(Operator, Generic.Emph), '#pop:2'),
  217. (r'(\^\^)%(IRIREF)s' % patterns, bygroups(Operator, Generic.Emph), '#pop:2'),
  218. (r'(\^\^)%(PrefixedName)s' % patterns,
  219. bygroups(Operator, Generic.Emph, Generic.Emph), '#pop:2'),
  220. default('#pop:2'),
  221. ],
  222. }
  223. # Turtle and Tera Term macro files share the same file extension
  224. # but each has a recognizable and distinct syntax.
  225. def analyse_text(text):
  226. for t in ('@base ', 'BASE ', '@prefix ', 'PREFIX '):
  227. if re.search(r'^\s*%s' % t, text):
  228. return 0.80
  229. class ShExCLexer(RegexLexer):
  230. """
  231. Lexer for `ShExC <https://shex.io/shex-semantics/#shexc>`_ shape expressions language syntax.
  232. """
  233. name = 'ShExC'
  234. aliases = ['shexc', 'shex']
  235. filenames = ['*.shex']
  236. mimetypes = ['text/shex']
  237. # character group definitions ::
  238. PN_CHARS_BASE_GRP = (u'a-zA-Z'
  239. u'\u00c0-\u00d6'
  240. u'\u00d8-\u00f6'
  241. u'\u00f8-\u02ff'
  242. u'\u0370-\u037d'
  243. u'\u037f-\u1fff'
  244. u'\u200c-\u200d'
  245. u'\u2070-\u218f'
  246. u'\u2c00-\u2fef'
  247. u'\u3001-\ud7ff'
  248. u'\uf900-\ufdcf'
  249. u'\ufdf0-\ufffd')
  250. PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
  251. PN_CHARS_GRP = (PN_CHARS_U_GRP +
  252. r'\-' +
  253. r'0-9' +
  254. u'\u00b7' +
  255. u'\u0300-\u036f' +
  256. u'\u203f-\u2040')
  257. HEX_GRP = '0-9A-Fa-f'
  258. PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"
  259. # terminal productions ::
  260. PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
  261. PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
  262. PN_CHARS = '[' + PN_CHARS_GRP + ']'
  263. HEX = '[' + HEX_GRP + ']'
  264. PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
  265. UCHAR_NO_BACKSLASH = '(?:u' + HEX + '{4}|U' + HEX + '{8})'
  266. UCHAR = r'\\' + UCHAR_NO_BACKSLASH
  267. IRIREF = r'<(?:[^\x00-\x20<>"{}|^`\\]|' + UCHAR + ')*>'
  268. BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
  269. '.]*' + PN_CHARS + ')?'
  270. PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
  271. PERCENT = '%' + HEX + HEX
  272. PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
  273. PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
  274. PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
  275. '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
  276. PN_CHARS_GRP + ':]|' + PLX + '))?')
  277. EXPONENT = r'[eE][+-]?\d+'
  278. # Lexer token definitions ::
  279. tokens = {
  280. 'root': [
  281. (r'\s+', Text),
  282. # keywords ::
  283. (r'(?i)(base|prefix|start|external|'
  284. r'literal|iri|bnode|nonliteral|length|minlength|maxlength|'
  285. r'mininclusive|minexclusive|maxinclusive|maxexclusive|'
  286. r'totaldigits|fractiondigits|'
  287. r'closed|extra)\b', Keyword),
  288. (r'(a)\b', Keyword),
  289. # IRIs ::
  290. ('(' + IRIREF + ')', Name.Label),
  291. # blank nodes ::
  292. ('(' + BLANK_NODE_LABEL + ')', Name.Label),
  293. # prefixed names ::
  294. (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + ')?',
  295. bygroups(Name.Namespace, Punctuation, Name.Tag)),
  296. # boolean literals ::
  297. (r'(true|false)', Keyword.Constant),
  298. # double literals ::
  299. (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
  300. # decimal literals ::
  301. (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
  302. # integer literals ::
  303. (r'[+\-]?\d+', Number.Integer),
  304. # operators ::
  305. (r'[@|$&=*+?^\-~]', Operator),
  306. # operator keywords ::
  307. (r'(?i)(and|or|not)\b', Operator.Word),
  308. # punctuation characters ::
  309. (r'[(){}.;,:^\[\]]', Punctuation),
  310. # line comments ::
  311. (r'#[^\n]*', Comment),
  312. # strings ::
  313. (r'"""', String, 'triple-double-quoted-string'),
  314. (r'"', String, 'single-double-quoted-string'),
  315. (r"'''", String, 'triple-single-quoted-string'),
  316. (r"'", String, 'single-single-quoted-string'),
  317. ],
  318. 'triple-double-quoted-string': [
  319. (r'"""', String, 'end-of-string'),
  320. (r'[^\\]+', String),
  321. (r'\\', String, 'string-escape'),
  322. ],
  323. 'single-double-quoted-string': [
  324. (r'"', String, 'end-of-string'),
  325. (r'[^"\\\n]+', String),
  326. (r'\\', String, 'string-escape'),
  327. ],
  328. 'triple-single-quoted-string': [
  329. (r"'''", String, 'end-of-string'),
  330. (r'[^\\]+', String),
  331. (r'\\', String.Escape, 'string-escape'),
  332. ],
  333. 'single-single-quoted-string': [
  334. (r"'", String, 'end-of-string'),
  335. (r"[^'\\\n]+", String),
  336. (r'\\', String, 'string-escape'),
  337. ],
  338. 'string-escape': [
  339. (UCHAR_NO_BACKSLASH, String.Escape, '#pop'),
  340. (r'.', String.Escape, '#pop'),
  341. ],
  342. 'end-of-string': [
  343. (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
  344. bygroups(Operator, Name.Function), '#pop:2'),
  345. (r'\^\^', Operator, '#pop:2'),
  346. default('#pop:2'),
  347. ],
  348. }