# -*- test-case-name: twisted.words.test.test_xpath -*-
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.

# pylint: disable=W9401,W9402

# DO NOT EDIT xpathparser.py!
#
# It is generated from xpathparser.g using Yapps. Make needed changes there.
# This also means that the generated Python may not conform to Twisted's coding
# standards, so it is wrapped in exec to prevent automated checkers from
# complaining.

# HOWTO Generate me:
#
# 1.) Grab a copy of yapps2:
#         https://github.com/smurfix/yapps
#
#     Note: Do NOT use the package in debian/ubuntu as it has incompatible
#     modifications. The original at http://theory.stanford.edu/~amitp/yapps/
#     hasn't been touched since 2003 and has not been updated to work with
#     Python 3.
#
# 2.) Generate the grammar:
#
#         yapps2 xpathparser.g xpathparser.py.proto
#
# 3.) Edit the output to depend on the embedded runtime, and remove extraneous
#     imports:
#         sed -e '/^# Begin/,${/^[^ ].*mport/d}' -e 's/runtime\.//g' \
#             -e "s/^\(from __future\)/exec(r'''\n\1/" -e"\$a''')" \
#             xpathparser.py.proto > xpathparser.py
"""
XPath Parser.

Besides the parser code produced by Yapps, this module also defines the
parse-time exception classes, a scanner class, a base class for parsers
produced by Yapps, and a context class that keeps track of the parse stack.
These have been copied from the Yapps runtime module.
"""
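# Illustrative usage sketch (comment only, not executed): Yapps emits a
# convenience helper into the generated xpathparser.py -- conventionally
# ``parse(rule, text)`` -- which twisted.words.xish.xpath uses to build
# query objects. The helper name is an assumption about standard Yapps
# output, not something defined in this grammar file:
#
#     from twisted.words.xish import xpathparser
#     location = xpathparser.parse('XPATH', '/message/body')
#     # 'location' is the _Location chain built by the XPATH rule below.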
from __future__ import print_function
import sys, re

MIN_WINDOW = 4096  # File lookup window
class SyntaxError(Exception):
    """When we run into an unexpected token, this is the exception to use"""
    def __init__(self, pos=None, msg="Bad Token", context=None):
        Exception.__init__(self)
        self.pos = pos
        self.msg = msg
        self.context = context

    def __str__(self):
        if not self.pos: return 'SyntaxError'
        else: return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg)


class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""
    pass
class Token(object):
    """Yapps token.

    This is a container for a scanned token.
    """

    def __init__(self, type, value, pos=None):
        """Initialize a token."""
        self.type = type
        self.value = value
        self.pos = pos

    def __repr__(self):
        output = '<%s: %s' % (self.type, repr(self.value))
        if self.pos:
            output += " @ "
            if self.pos[0]:
                output += "%s:" % self.pos[0]
            if self.pos[1]:
                output += "%d" % self.pos[1]
            if self.pos[2] is not None:
                output += ".%d" % self.pos[2]
        output += ">"
        return output
in_name = 0
class Scanner(object):
    """Yapps scanner.

    The Yapps scanner can work in context sensitive or context
    insensitive modes. The token() method is used to retrieve the
    next token. It takes a restrict set that limits the set of tokens
    it is allowed to return. In context sensitive mode, this restrict
    set guides the scanner. In context insensitive mode, there is no
    restriction (the set is always the full set of tokens).
    """
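    # Sketch of the context sensitive mode described above (comment only):
    # the input "and" matches both the OP_AND and IDENTIFIER patterns of
    # the XPath token set below, but a restrict set makes the scanner try
    # only the allowed (plus ignored) patterns:
    #
    #     scanner.token(['OP_AND', 'OP_OR'])  # may only return OP_AND/OP_OR
    #     scanner.token(None)                 # no restriction: any token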
    def __init__(self, patterns, ignore, input="",
                 file=None, filename=None, stacked=False):
        """Initialize the scanner.

        Parameters:
          patterns : [(terminal, uncompiled regex), ...] or None
          ignore : {terminal:None, ...}
          input : string

        If patterns is None, we assume that the subclass has
        defined self.patterns : [(terminal, compiled regex), ...].
        Note that the patterns parameter expects uncompiled regexes,
        whereas the self.patterns field expects compiled regexes.

        The 'ignore' value is either None or a callable, which is called
        with the scanner and the to-be-ignored match object; this can
        be used for include file or comment handling.
        """

        if not filename:
            global in_name
            filename = "<f.%d>" % in_name
            in_name += 1

        self.input = input
        self.ignore = ignore
        self.file = file
        self.filename = filename
        self.pos = 0
        self.del_pos = 0  # skipped
        self.line = 1
        self.del_line = 0  # skipped
        self.col = 0
        self.tokens = []
        self.stack = None
        self.stacked = stacked
        self.last_read_token = None
        self.last_token = None
        self.last_types = None

        if patterns is not None:
            # Compile the regex strings into regex objects
            self.patterns = []
            for terminal, regex in patterns:
                self.patterns.append( (terminal, re.compile(regex)) )
    def stack_input(self, input="", file=None, filename=None):
        """Temporarily parse from a second file."""

        # Already reading from somewhere else: Go on top of that, please.
        if self.stack:
            # autogenerate a recursion-level-identifying filename
            if not filename:
                filename = 1
            else:
                try:
                    filename += 1
                except TypeError:
                    pass
            # now pass off to the include file
            self.stack.stack_input(input, file, filename)
        else:
            try:
                filename += 0
            except TypeError:
                pass
            else:
                filename = "<str_%d>" % filename

            # self.stack = object.__new__(self.__class__)
            # Scanner.__init__(self.stack, self.patterns, self.ignore,
            #                  input, file, filename, stacked=True)

            # Note that the pattern+ignore are added by the generated
            # scanner code
            self.stack = self.__class__(input, file, filename, stacked=True)
    def get_pos(self):
        """Return a file/line/char tuple."""
        if self.stack:
            return self.stack.get_pos()
        return (self.filename, self.line + self.del_line, self.col)

    # def __repr__(self):
    #     """Print the last few tokens that have been scanned in"""
    #     output = ''
    #     for t in self.tokens:
    #         output += '%s\n' % (repr(t),)
    #     return output
    def print_line_with_pointer(self, pos, length=0, out=sys.stderr):
        """Print the line of 'text' that includes position 'p',
        along with a second line with a single caret (^) at position p"""
        file, line, p = pos
        if file != self.filename:
            if self.stack:
                return self.stack.print_line_with_pointer(
                    pos, length=length, out=out)
            print("(%s: not in input buffer)" % file, file=out)
            return

        text = self.input
        p += length - 1  # starts at pos 1

        origline = line
        line -= self.del_line
        spos = 0
        if line > 0:
            while 1:
                line = line - 1
                try:
                    cr = text.index("\n", spos)
                except ValueError:
                    if line:
                        text = ""
                    break
                if line == 0:
                    text = text[spos:cr]
                    break
                spos = cr + 1
        else:
            print("(%s:%d not in input buffer)" % (file, origline), file=out)
            return

        # Now try printing part of the line
        text = text[max(p - 80, 0):p + 80]
        p = p - max(p - 80, 0)

        # Strip to the left
        i = text[:p].rfind('\n')
        j = text[:p].rfind('\r')
        if i < 0 or (0 <= j < i):
            i = j
        if 0 <= i < p:
            p = p - i - 1
            text = text[i + 1:]

        # Strip to the right
        i = text.find('\n', p)
        j = text.find('\r', p)
        if i < 0 or (0 <= j < i):
            i = j
        if i >= 0:
            text = text[:i]

        # Now shorten the text
        while len(text) > 70 and p > 60:
            # Cut off 10 chars
            text = "..." + text[10:]
            p = p - 7

        # Now print the string, along with an indicator
        print('> ', text, file=out)
        print('> ', ' ' * p + '^', file=out)
    def grab_input(self):
        """Get more input if possible."""
        if not self.file:
            return
        if len(self.input) - self.pos >= MIN_WINDOW:
            return

        data = self.file.read(MIN_WINDOW)
        if data is None or data == "":
            self.file = None

        # Drop bytes from the start, if necessary.
        if self.pos > 2 * MIN_WINDOW:
            self.del_pos += MIN_WINDOW
            self.del_line += self.input[:MIN_WINDOW].count("\n")

            self.pos -= MIN_WINDOW
            self.input = self.input[MIN_WINDOW:] + data
        else:
            self.input = self.input + data

    def getchar(self):
        """Return the next character."""
        self.grab_input()
        c = self.input[self.pos]
        self.pos += 1
        return c
    def token(self, restrict, context=None):
        """Scan for another token."""

        while 1:
            if self.stack:
                try:
                    return self.stack.token(restrict, context)
                except StopIteration:
                    self.stack = None

            # Keep looking for a token, ignoring any in self.ignore
            self.grab_input()

            # special handling for end-of-file
            if self.stacked and self.pos == len(self.input):
                raise StopIteration

            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            best_m = None
            for p, regexp in self.patterns:
                # First check to see if we're ignoring this token
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and m.end() - m.start() > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = m.end() - m.start()
                    best_m = m

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = 'Bad Token'
                if restrict:
                    msg = 'Trying to find one of ' + ', '.join(restrict)
                raise SyntaxError(self.get_pos(), msg, context=context)

            ignore = best_pat in self.ignore
            value = self.input[self.pos:self.pos + best_match]
            if not ignore:
                tok = Token(type=best_pat, value=value, pos=self.get_pos())

            self.pos += best_match
            npos = value.rfind("\n")
            if npos > -1:
                self.col = best_match - npos
                self.line += value.count("\n")
            else:
                self.col += best_match

            # If we found something that isn't to be ignored, return it
            if not ignore:
                if len(self.tokens) >= 10:
                    del self.tokens[0]
                self.tokens.append(tok)
                self.last_read_token = tok
                # print repr(tok)
                return tok
            else:
                ignore = self.ignore[best_pat]
                if ignore:
                    ignore(self, best_m)
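    # Longest-match illustration (comment only): with the XPath tokens
    # below, input "12ab" yields INDEX with value "12" (IDENTIFIER cannot
    # match a leading digit), while "foobar" is consumed whole by
    # IDENTIFIER. On equal match lengths the earlier entry in
    # self.patterns wins, since a later match must be strictly longer
    # (the ">" comparison above) to replace the current best.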
    def peek(self, *types, **kw):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        context = kw.get("context", None)
        if self.last_token is None:
            self.last_types = types
            self.last_token = self.token(types, context)
        elif self.last_types:
            for t in types:
                if t not in self.last_types:
                    raise NotImplementedError("Unimplemented: restriction set changed")
        return self.last_token.type

    def scan(self, type, **kw):
        """Returns the matched text, and moves to the next token"""
        context = kw.get("context", None)
        if self.last_token is None:
            tok = self.token([type], context)
        else:
            if self.last_types and type not in self.last_types:
                raise NotImplementedError("Unimplemented: restriction set changed")
            tok = self.last_token
            self.last_token = None
        if tok.type != type:
            if not self.last_types:
                self.last_types = []
            raise SyntaxError(tok.pos,
                              'Trying to find ' + type + ': ' +
                              ', '.join(self.last_types) + ", got " + tok.type,
                              context=context)
        return tok.value
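    # Sketch of the peek/scan protocol used by generated parsers (comment
    # only; token names are from the XPath grammar below):
    #
    #     if scanner.peek('IDENTIFIER', 'WILDCARD') == 'IDENTIFIER':
    #         name = scanner.scan('IDENTIFIER')  # consumes the cached token
    #     else:
    #         scanner.scan('WILDCARD')
    #
    # peek() caches its token in self.last_token; the following scan() must
    # ask for a type from the same restrict set, or NotImplementedError is
    # raised.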
class Parser(object):
    """Base class for Yapps-generated parsers.
    """

    def __init__(self, scanner):
        self._scanner = scanner

    def _stack(self, input="", file=None, filename=None):
        """Temporarily read from someplace else"""
        self._scanner.stack_input(input, file, filename)
        self._tok = None

    def _peek(self, *types, **kw):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        return self._scanner.peek(*types, **kw)

    def _scan(self, type, **kw):
        """Returns the matched text, and moves to the next token"""
        return self._scanner.scan(type, **kw)
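# Sketch of a Yapps-generated rule method built on this base class
# (approximate; the real methods are machine-generated into xpathparser.py
# and may differ in detail):
#
#     class XPathParser(Parser):
#         def BOOLOP(self, _parent=None):
#             _context = Context(_parent, self._scanner, 'BOOLOP', [])
#             _token = self._peek('OP_AND', 'OP_OR', context=_context)
#             if _token == 'OP_AND':
#                 return self._scan('OP_AND', context=_context)
#             else:   # OP_OR
#                 return self._scan('OP_OR', context=_context)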
class Context(object):
    """Class to represent the parser's call stack.

    Every rule creates a Context that links to its parent rule. The
    contexts can be used for debugging.
    """

    def __init__(self, parent, scanner, rule, args=()):
        """Create a new context.

        Args:
          parent: Context object or None
          scanner: Scanner object
          rule: string (name of the rule)
          args: tuple listing parameters to the rule
        """
        self.parent = parent
        self.scanner = scanner
        self.rule = rule
        self.args = args
        while scanner.stack: scanner = scanner.stack
        self.token = scanner.last_read_token

    def __str__(self):
        output = ''
        if self.parent: output = str(self.parent) + ' > '
        output += self.rule
        return output
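# For debugging: str() of a nested Context renders the rule stack, so a
# failure deep inside a query such as "/foo[@a='1']" would render roughly
# as "XPATH > PATH > PREDICATE > EXPR > FACTOR > TERM > VALUE".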
def print_error(err, scanner, max_ctx=None):
    """Print error messages, the parser stack, and the input text -- for
    human-readable error messages."""
    # NOTE: this function assumes 80 columns :-(

    # Figure out the line number
    pos = err.pos
    if not pos:
        pos = scanner.get_pos()

    file_name, line_number, column_number = pos
    print('%s:%d:%d: %s' % (file_name, line_number, column_number, err.msg),
          file=sys.stderr)

    scanner.print_line_with_pointer(pos)

    context = err.context
    token = None
    while context:
        print('while parsing %s%s:' % (context.rule, tuple(context.args)),
              file=sys.stderr)
        if context.token:
            token = context.token
        if token:
            scanner.print_line_with_pointer(token.pos, length=len(token.value))
        context = context.parent
        if max_ctx:
            max_ctx = max_ctx - 1
            if not max_ctx:
                break
def wrap_error_reporter(parser, rule, *args, **kw):
    try:
        return getattr(parser, rule)(*args, **kw)
    except SyntaxError as e:
        print_error(e, parser._scanner)
    except NoMoreTokens:
        print('Could not complete parsing; stopped around here:',
              file=sys.stderr)
        print(parser._scanner, file=sys.stderr)
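# Typical driver (illustrative sketch): invoke a rule through
# wrap_error_reporter so parse errors are printed with their context stack
# instead of propagating. The generated class names below follow the usual
# Yapps naming convention and are assumptions, not definitions from this
# file:
#
#     scanner = XPathParserScanner('/message/body')
#     parser = XPathParser(scanner)
#     result = wrap_error_reporter(parser, 'XPATH')  # None on SyntaxError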
from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
from twisted.words.xish.xpath import _AnyLocation, _Location

%%
parser XPathParser:
    ignore:             "\\s+"
    token INDEX:        "[0-9]+"
    token WILDCARD:     "\*"
    token IDENTIFIER:   "[a-zA-Z][a-zA-Z0-9_\-]*"
    token ATTRIBUTE:    "\@[a-zA-Z][a-zA-Z0-9_\-]*"
    token FUNCNAME:     "[a-zA-Z][a-zA-Z0-9_]*"
    token CMP_EQ:       "\="
    token CMP_NE:       "\!\="
    token STR_DQ:       '"([^"]|(\\"))*?"'
    token STR_SQ:       "'([^']|(\\'))*?'"
    token OP_AND:       "and"
    token OP_OR:        "or"
    token END:          "$"

    rule XPATH:     PATH {{ result = PATH; current = result }}
                    ( PATH {{ current.childLocation = PATH; current = current.childLocation }} )* END
                    {{ return result }}

    rule PATH:      ( "/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
                    ( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
                    ( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]" )*
                    {{ return result }}

    rule PREDICATE: EXPR  {{ return EXPR }} |
                    INDEX {{ return IndexValue(INDEX) }}

    rule EXPR:      FACTOR {{ e = FACTOR }}
                    ( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
                    {{ return e }}

    rule BOOLOP:    ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )

    rule FACTOR:    TERM {{ return TERM }}
                    | "\(" EXPR "\)" {{ return EXPR }}

    rule TERM:      VALUE {{ t = VALUE }}
                    [ CMP VALUE {{ t = CompareValue(t, CMP, VALUE) }} ]
                    {{ return t }}

    rule VALUE:     "@" IDENTIFIER {{ return AttribValue(IDENTIFIER) }} |
                    FUNCNAME       {{ f = Function(FUNCNAME); args = [] }}
                    "\(" [ VALUE {{ args.append(VALUE) }}
                           ( "," VALUE {{ args.append(VALUE) }} )*
                         ] "\)"    {{ f.setParams(*args); return f }} |
                    STR            {{ return LiteralValue(STR[1:len(STR)-1]) }}

    rule CMP: ( CMP_EQ {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }} )
    rule STR: ( STR_DQ {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }} )
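
# Examples of expressions accepted by this grammar (for reference; each
# form is derived from the rules above):
#
#     /foo/bar                    chained _Location steps
#     //error                     _AnyLocation: match at any depth
#     /foo/*                      wildcard element name
#     /foo[2]                     index predicate (IndexValue)
#     /foo[@attrib1]              attribute presence (AttribValue)
#     /foo[@type="get"]           comparison (CompareValue)
#     /foo[text()="body"]         function call (Function)
#     /foo[@a="1" and @b!="2"]    boolean expression (BooleanValue)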