xpathparser.g
# -*- test-case-name: twisted.words.test.test_xpath -*-
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.

# pylint: disable=W9401,W9402

# DO NOT EDIT xpathparser.py!
#
# It is generated from xpathparser.g using Yapps. Make needed changes there.
# This also means that the generated Python may not conform to Twisted's coding
# standards, so it is wrapped in exec to prevent automated checkers from
# complaining.
#
# HOWTO Generate me:
#
# 1.) Grab a copy of yapps2:
#         https://github.com/smurfix/yapps
#
#     Note: Do NOT use the package in debian/ubuntu as it has incompatible
#     modifications. The original at http://theory.stanford.edu/~amitp/yapps/
#     hasn't been touched since 2003 and has not been updated to work with
#     Python 3.
#
# 2.) Generate the grammar:
#
#         yapps2 xpathparser.g xpathparser.py.proto
#
# 3.) Edit the output to depend on the embedded runtime, and remove extraneous
#     imports:
#
#         sed -e '/^# Begin/,${/^[^ ].*mport/d}' -e 's/runtime\.//g' \
#             -e "s/^\(from __future\)/exec(r'''\n\1/" -e"\$a''')" \
#             xpathparser.py.proto > xpathparser.py
  31. """
  32. XPath Parser.
  33. Besides the parser code produced by Yapps, this module also defines the
  34. parse-time exception classes, a scanner class, a base class for parsers
  35. produced by Yapps, and a context class that keeps track of the parse stack.
  36. These have been copied from the Yapps runtime module.
  37. """
  38. import sys, re
  39. MIN_WINDOW=4096
  40. # File lookup window
  41. class SyntaxError(Exception):
  42. """When we run into an unexpected token, this is the exception to use"""
  43. def __init__(self, pos=None, msg="Bad Token", context=None):
  44. Exception.__init__(self)
  45. self.pos = pos
  46. self.msg = msg
  47. self.context = context
  48. def __str__(self):
  49. if not self.pos: return 'SyntaxError'
  50. else: return 'SyntaxError@%s(%s)' % (repr(self.pos), self.msg)
  51. class NoMoreTokens(Exception):
  52. """Another exception object, for when we run out of tokens"""
  53. pass
  54. class Token:
  55. """Yapps token.
  56. This is a container for a scanned token.
  57. """
  58. def __init__(self, type,value, pos=None):
  59. """Initialize a token."""
  60. self.type = type
  61. self.value = value
  62. self.pos = pos
  63. def __repr__(self):
  64. output = '<%s: %s' % (self.type, repr(self.value))
  65. if self.pos:
  66. output += " @ "
  67. if self.pos[0]:
  68. output += "%s:" % self.pos[0]
  69. if self.pos[1]:
  70. output += "%d" % self.pos[1]
  71. if self.pos[2] is not None:
  72. output += ".%d" % self.pos[2]
  73. output += ">"
  74. return output
  75. in_name=0
  76. class Scanner:
  77. """Yapps scanner.
  78. The Yapps scanner can work in context sensitive or context
  79. insensitive modes. The token(i) method is used to retrieve the
  80. i-th token. It takes a restrict set that limits the set of tokens
  81. it is allowed to return. In context sensitive mode, this restrict
  82. set guides the scanner. In context insensitive mode, there is no
  83. restriction (the set is always the full set of tokens).
  84. """
  85. def __init__(self, patterns, ignore, input="",
  86. file=None,filename=None,stacked=False):
  87. """Initialize the scanner.
  88. Parameters:
  89. patterns : [(terminal, uncompiled regex), ...] or None
  90. ignore : {terminal:None, ...}
  91. input : string
  92. If patterns is None, we assume that the subclass has
  93. defined self.patterns : [(terminal, compiled regex), ...].
  94. Note that the patterns parameter expects uncompiled regexes,
  95. whereas the self.patterns field expects compiled regexes.
  96. The 'ignore' value is either None or a callable, which is called
  97. with the scanner and the to-be-ignored match object; this can
  98. be used for include file or comment handling.
  99. """
  100. if not filename:
  101. global in_name
  102. filename="<f.%d>" % in_name
  103. in_name += 1
  104. self.input = input
  105. self.ignore = ignore
  106. self.file = file
  107. self.filename = filename
  108. self.pos = 0
  109. self.del_pos = 0 # skipped
  110. self.line = 1
  111. self.del_line = 0 # skipped
  112. self.col = 0
  113. self.tokens = []
  114. self.stack = None
  115. self.stacked = stacked
  116. self.last_read_token = None
  117. self.last_token = None
  118. self.last_types = None
  119. if patterns is not None:
  120. # Compile the regex strings into regex objects
  121. self.patterns = []
  122. for terminal, regex in patterns:
  123. self.patterns.append( (terminal, re.compile(regex)) )
  124. def stack_input(self, input="", file=None, filename=None):
  125. """Temporarily parse from a second file."""
  126. # Already reading from somewhere else: Go on top of that, please.
  127. if self.stack:
  128. # autogenerate a recursion-level-identifying filename
  129. if not filename:
  130. filename = 1
  131. else:
  132. try:
  133. filename += 1
  134. except TypeError:
  135. pass
  136. # now pass off to the include file
  137. self.stack.stack_input(input,file,filename)
  138. else:
  139. try:
  140. filename += 0
  141. except TypeError:
  142. pass
  143. else:
  144. filename = "<str_%d>" % filename
  145. # self.stack = object.__new__(self.__class__)
  146. # Scanner.__init__(self.stack,self.patterns,self.ignore,input,file,filename, stacked=True)
  147. # Note that the pattern+ignore are added by the generated
  148. # scanner code
  149. self.stack = self.__class__(input,file,filename, stacked=True)
  150. def get_pos(self):
  151. """Return a file/line/char tuple."""
  152. if self.stack: return self.stack.get_pos()
  153. return (self.filename, self.line+self.del_line, self.col)
  154. # def __repr__(self):
  155. # """Print the last few tokens that have been scanned in"""
  156. # output = ''
  157. # for t in self.tokens:
  158. # output += '%s\n' % (repr(t),)
  159. # return output
  160. def print_line_with_pointer(self, pos, length=0, out=sys.stderr):
  161. """Print the line of 'text' that includes position 'p',
  162. along with a second line with a single caret (^) at position p"""
  163. file,line,p = pos
  164. if file != self.filename:
  165. if self.stack: return self.stack.print_line_with_pointer(pos,length=length,out=out)
  166. print >>out, "(%s: not in input buffer)" % file
  167. return
  168. text = self.input
  169. p += length-1 # starts at pos 1
  170. origline=line
  171. line -= self.del_line
  172. spos=0
  173. if line > 0:
  174. while 1:
  175. line = line - 1
  176. try:
  177. cr = text.index("\n",spos)
  178. except ValueError:
  179. if line:
  180. text = ""
  181. break
  182. if line == 0:
  183. text = text[spos:cr]
  184. break
  185. spos = cr+1
  186. else:
  187. print >>out, "(%s:%d not in input buffer)" % (file,origline)
  188. return
  189. # Now try printing part of the line
  190. text = text[max(p-80, 0):p+80]
  191. p = p - max(p-80, 0)
  192. # Strip to the left
  193. i = text[:p].rfind('\n')
  194. j = text[:p].rfind('\r')
  195. if i < 0 or (0 <= j < i): i = j
  196. if 0 <= i < p:
  197. p = p - i - 1
  198. text = text[i+1:]
  199. # Strip to the right
  200. i = text.find('\n', p)
  201. j = text.find('\r', p)
  202. if i < 0 or (0 <= j < i): i = j
  203. if i >= 0:
  204. text = text[:i]
  205. # Now shorten the text
  206. while len(text) > 70 and p > 60:
  207. # Cut off 10 chars
  208. text = "..." + text[10:]
  209. p = p - 7
  210. # Now print the string, along with an indicator
  211. print >>out, '> ',text
  212. print >>out, '> ',' '*p + '^'
  213. def grab_input(self):
  214. """Get more input if possible."""
  215. if not self.file: return
  216. if len(self.input) - self.pos >= MIN_WINDOW: return
  217. data = self.file.read(MIN_WINDOW)
  218. if data is None or data == "":
  219. self.file = None
  220. # Drop bytes from the start, if necessary.
  221. if self.pos > 2*MIN_WINDOW:
  222. self.del_pos += MIN_WINDOW
  223. self.del_line += self.input[:MIN_WINDOW].count("\n")
  224. self.pos -= MIN_WINDOW
  225. self.input = self.input[MIN_WINDOW:] + data
  226. else:
  227. self.input = self.input + data
  228. def getchar(self):
  229. """Return the next character."""
  230. self.grab_input()
  231. c = self.input[self.pos]
  232. self.pos += 1
  233. return c
  234. def token(self, restrict, context=None):
  235. """Scan for another token."""
  236. while 1:
  237. if self.stack:
  238. try:
  239. return self.stack.token(restrict, context)
  240. except StopIteration:
  241. self.stack = None
  242. # Keep looking for a token, ignoring any in self.ignore
  243. self.grab_input()
  244. # special handling for end-of-file
  245. if self.stacked and self.pos==len(self.input):
  246. raise StopIteration
  247. # Search the patterns for the longest match, with earlier
  248. # tokens in the list having preference
  249. best_match = -1
  250. best_pat = '(error)'
  251. best_m = None
  252. for p, regexp in self.patterns:
  253. # First check to see if we're ignoring this token
  254. if restrict and p not in restrict and p not in self.ignore:
  255. continue
  256. m = regexp.match(self.input, self.pos)
  257. if m and m.end()-m.start() > best_match:
  258. # We got a match that's better than the previous one
  259. best_pat = p
  260. best_match = m.end()-m.start()
  261. best_m = m
  262. # If we didn't find anything, raise an error
  263. if best_pat == '(error)' and best_match < 0:
  264. msg = 'Bad Token'
  265. if restrict:
  266. msg = 'Trying to find one of '+', '.join(restrict)
  267. raise SyntaxError(self.get_pos(), msg, context=context)
  268. ignore = best_pat in self.ignore
  269. value = self.input[self.pos:self.pos+best_match]
  270. if not ignore:
  271. tok=Token(type=best_pat, value=value, pos=self.get_pos())
  272. self.pos += best_match
  273. npos = value.rfind("\n")
  274. if npos > -1:
  275. self.col = best_match-npos
  276. self.line += value.count("\n")
  277. else:
  278. self.col += best_match
  279. # If we found something that isn't to be ignored, return it
  280. if not ignore:
  281. if len(self.tokens) >= 10:
  282. del self.tokens[0]
  283. self.tokens.append(tok)
  284. self.last_read_token = tok
  285. # print repr(tok)
  286. return tok
  287. else:
  288. ignore = self.ignore[best_pat]
  289. if ignore:
  290. ignore(self, best_m)
  291. def peek(self, *types, **kw):
  292. """Returns the token type for lookahead; if there are any args
  293. then the list of args is the set of token types to allow"""
  294. context = kw.get("context",None)
  295. if self.last_token is None:
  296. self.last_types = types
  297. self.last_token = self.token(types,context)
  298. elif self.last_types:
  299. for t in types:
  300. if t not in self.last_types:
  301. raise NotImplementedError("Unimplemented: restriction set changed")
  302. return self.last_token.type
  303. def scan(self, type, **kw):
  304. """Returns the matched text, and moves to the next token"""
  305. context = kw.get("context",None)
  306. if self.last_token is None:
  307. tok = self.token([type],context)
  308. else:
  309. if self.last_types and type not in self.last_types:
  310. raise NotImplementedError("Unimplemented: restriction set changed")
  311. tok = self.last_token
  312. self.last_token = None
  313. if tok.type != type:
  314. if not self.last_types: self.last_types=[]
  315. raise SyntaxError(tok.pos, 'Trying to find '+type+': '+ ', '.join(self.last_types)+", got "+tok.type, context=context)
  316. return tok.value
  317. class Parser:
  318. """Base class for Yapps-generated parsers.
  319. """
  320. def __init__(self, scanner):
  321. self._scanner = scanner
  322. def _stack(self, input="",file=None,filename=None):
  323. """Temporarily read from someplace else"""
  324. self._scanner.stack_input(input,file,filename)
  325. self._tok = None
  326. def _peek(self, *types, **kw):
  327. """Returns the token type for lookahead; if there are any args
  328. then the list of args is the set of token types to allow"""
  329. return self._scanner.peek(*types, **kw)
  330. def _scan(self, type, **kw):
  331. """Returns the matched text, and moves to the next token"""
  332. return self._scanner.scan(type, **kw)
  333. class Context:
  334. """Class to represent the parser's call stack.
  335. Every rule creates a Context that links to its parent rule. The
  336. contexts can be used for debugging.
  337. """
  338. def __init__(self, parent, scanner, rule, args=()):
  339. """Create a new context.
  340. Args:
  341. parent: Context object or None
  342. scanner: Scanner object
  343. rule: string (name of the rule)
  344. args: tuple listing parameters to the rule
  345. """
  346. self.parent = parent
  347. self.scanner = scanner
  348. self.rule = rule
  349. self.args = args
  350. while scanner.stack: scanner = scanner.stack
  351. self.token = scanner.last_read_token
  352. def __str__(self):
  353. output = ''
  354. if self.parent: output = str(self.parent) + ' > '
  355. output += self.rule
  356. return output
  357. def print_error(err, scanner, max_ctx=None):
  358. """Print error messages, the parser stack, and the input text -- for human-readable error messages."""
  359. # NOTE: this function assumes 80 columns :-(
  360. # Figure out the line number
  361. pos = err.pos
  362. if not pos:
  363. pos = scanner.get_pos()
  364. file_name, line_number, column_number = pos
  365. print('%s:%d:%d: %s' % (file_name, line_number, column_number, err.msg), file=sys.stderr)
  366. scanner.print_line_with_pointer(pos)
  367. context = err.context
  368. token = None
  369. while context:
  370. print('while parsing %s%s:' % (context.rule, tuple(context.args)), file=sys.stderr)
  371. if context.token:
  372. token = context.token
  373. if token:
  374. scanner.print_line_with_pointer(token.pos, length=len(token.value))
  375. context = context.parent
  376. if max_ctx:
  377. max_ctx = max_ctx-1
  378. if not max_ctx:
  379. break
  380. def wrap_error_reporter(parser, rule, *args,**kw):
  381. try:
  382. return getattr(parser, rule)(*args,**kw)
  383. except SyntaxError as e:
  384. print_error(e, parser._scanner)
  385. except NoMoreTokens:
  386. print('Could not complete parsing; stopped around here:', file=sys.stderr)
  387. print(parser._scanner, file=sys.stderr)
  388. from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
  389. from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
  390. from twisted.words.xish.xpath import _AnyLocation, _Location
  391. %%
  392. parser XPathParser:
  393. ignore: "\\s+"
  394. token INDEX: "[0-9]+"
  395. token WILDCARD: "\*"
  396. token IDENTIFIER: "[a-zA-Z][a-zA-Z0-9_\-]*"
  397. token ATTRIBUTE: "\@[a-zA-Z][a-zA-Z0-9_\-]*"
  398. token FUNCNAME: "[a-zA-Z][a-zA-Z0-9_]*"
  399. token CMP_EQ: "\="
  400. token CMP_NE: "\!\="
  401. token STR_DQ: '"([^"]|(\\"))*?"'
  402. token STR_SQ: "'([^']|(\\'))*?'"
  403. token OP_AND: "and"
  404. token OP_OR: "or"
  405. token END: "$"
  406. rule XPATH: PATH {{ result = PATH; current = result }}
  407. ( PATH {{ current.childLocation = PATH; current = current.childLocation }} ) * END
  408. {{ return result }}
  409. rule PATH: ("/" {{ result = _Location() }} | "//" {{ result = _AnyLocation() }} )
  410. ( IDENTIFIER {{ result.elementName = IDENTIFIER }} | WILDCARD {{ result.elementName = None }} )
  411. ( "\[" PREDICATE {{ result.predicates.append(PREDICATE) }} "\]")*
  412. {{ return result }}
  413. rule PREDICATE: EXPR {{ return EXPR }} |
  414. INDEX {{ return IndexValue(INDEX) }}
  415. rule EXPR: FACTOR {{ e = FACTOR }}
  416. ( BOOLOP FACTOR {{ e = BooleanValue(e, BOOLOP, FACTOR) }} )*
  417. {{ return e }}
  418. rule BOOLOP: ( OP_AND {{ return OP_AND }} | OP_OR {{ return OP_OR }} )
  419. rule FACTOR: TERM {{ return TERM }}
  420. | "\(" EXPR "\)" {{ return EXPR }}
  421. rule TERM: VALUE {{ t = VALUE }}
  422. [ CMP VALUE {{ t = CompareValue(t, CMP, VALUE) }} ]
  423. {{ return t }}
  424. rule VALUE: "@" IDENTIFIER {{ return AttribValue(IDENTIFIER) }} |
  425. FUNCNAME {{ f = Function(FUNCNAME); args = [] }}
  426. "\(" [ VALUE {{ args.append(VALUE) }}
  427. (
  428. "," VALUE {{ args.append(VALUE) }}
  429. )*
  430. ] "\)" {{ f.setParams(*args); return f }} |
  431. STR {{ return LiteralValue(STR[1:len(STR)-1]) }}
  432. rule CMP: (CMP_EQ {{ return CMP_EQ }} | CMP_NE {{ return CMP_NE }})
  433. rule STR: (STR_DQ {{ return STR_DQ }} | STR_SQ {{ return STR_SQ }})