antlr3baserecognizer.hpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /** \file
  2. * Defines the basic structure to support recognizing by either a lexer,
  3. * parser, or tree parser.
  4. * \addtogroup BaseRecognizer
  5. * @{
  6. */
  7. #ifndef _ANTLR3_BASERECOGNIZER_HPP
  8. #define _ANTLR3_BASERECOGNIZER_HPP
  9. // [The "BSD licence"]
  10. // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
  11. //
  12. // All rights reserved.
  13. //
  14. // Redistribution and use in source and binary forms, with or without
  15. // modification, are permitted provided that the following conditions
  16. // are met:
  17. // 1. Redistributions of source code must retain the above copyright
  18. // notice, this list of conditions and the following disclaimer.
  19. // 2. Redistributions in binary form must reproduce the above copyright
  20. // notice, this list of conditions and the following disclaimer in the
  21. // documentation and/or other materials provided with the distribution.
  22. // 3. The name of the author may not be used to endorse or promote products
  23. // derived from this software without specific prior written permission.
  24. //
  25. // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  26. // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  27. // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  28. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  29. // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  30. // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  31. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  32. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  33. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  34. // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35. namespace antlr3 {
  36. /** \brief Base tracking context structure for all types of
  37. * recognizers.
  38. */
  39. template< class ImplTraits, class StreamType >
  40. class BaseRecognizer : public ImplTraits::AllocPolicyType
  41. {
  42. public:
  43. typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
  44. typedef typename StreamType::IntStreamType IntStreamType;
  45. typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType;
  46. typedef typename StreamType::UnitType UnitType;
  47. typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType;
  48. typedef typename ImplTraits::BitsetType BitsetType;
  49. typedef typename ImplTraits::BitsetListType BitsetListType;
  50. typedef typename ImplTraits::StringType StringType;
  51. typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType;
  52. typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
  53. typedef typename ImplTraits::LexerType LexerType;
  54. typedef typename ImplTraits::ParserType ParserType;
  55. typedef typename ImplTraits::TreeParserType TreeParserType;
  56. typedef typename AllocPolicyType::template StackType<StringType> StringStackType;
  57. typedef typename AllocPolicyType::template ListType<StringType> StringListType;
  58. private:
  59. /// A pointer to the shared recognizer state, such that multiple
  60. /// recognizers can use the same inputs streams and so on (in
  61. /// the case of grammar inheritance for instance.
  62. ///
  63. RecognizerSharedStateType* m_state;
  64. /// If set to something other than NULL, then this structure is
  65. /// points to an instance of the debugger interface. In general, the
  66. /// debugger is only referenced internally in recovery/error operations
  67. /// so that it does not cause overhead by having to check this pointer
  68. /// in every function/method
  69. ///
  70. DebugEventListenerType* m_debugger;
  71. public:
  72. BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
  73. SuperType* get_super();
  74. RecognizerSharedStateType* get_state() const;
  75. DebugEventListenerType* get_debugger() const;
  76. void set_state( RecognizerSharedStateType* state );
  77. void set_debugger( DebugEventListenerType* debugger );
  78. /// Match current input symbol against ttype. Upon error, do one token
  79. /// insertion or deletion if possible.
  80. /// To turn off single token insertion or deletion error
  81. /// recovery, override mismatchRecover() and have it call
  82. /// plain mismatch(), which does not recover. Then any error
  83. /// in a rule will cause an exception and immediate exit from
  84. /// rule. Rule would recover by resynchronizing to the set of
  85. /// symbols that can follow rule ref.
  86. ///
  87. const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow);
  88. /// Consumes the next token, whatever it is, and resets the recognizer state
  89. /// so that it is not in error.
  90. ///
  91. /// \param recognizer
  92. /// Recognizer context pointer
  93. ///
  94. void matchAny();
  95. /// function that decides if the token ahead of the current one is the
  96. /// one we were loking for, in which case the curernt one is very likely extraneous
  97. /// and can be reported that way.
  98. ///
  99. bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype);
  100. /// function that decides if the current token is one that can logically
  101. /// follow the one we were looking for, in which case the one we were looking for is
  102. /// probably missing from the input.
  103. ///
  104. bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow);
  105. /// Factor out what to do upon token mismatch so tree parsers can behave
  106. /// differently. Override and call mismatchRecover(input, ttype, follow)
  107. /// to get single token insertion and deletion. Use this to turn off
  108. /// single token insertion and deletion. Override mismatchRecover
  109. /// to call this instead.
  110. ///
  111. /// \remark mismatch only works for parsers and must be overridden for anything else.
  112. ///
  113. void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow);
  114. /// Report a recognition problem.
  115. ///
  116. /// This method sets errorRecovery to indicate the parser is recovering
  117. /// not parsing. Once in recovery mode, no errors are generated.
  118. /// To get out of recovery mode, the parser must successfully match
  119. /// a token (after a resync). So it will go:
  120. ///
  121. /// 1. error occurs
  122. /// 2. enter recovery mode, report error
  123. /// 3. consume until token found in resynch set
  124. /// 4. try to resume parsing
  125. /// 5. next match() will reset errorRecovery mode
  126. ///
  127. /// If you override, make sure to update errorCount if you care about that.
  128. ///
  129. void reportError();
  130. void reportError( ClassForwarder<LexerType> );
  131. template<typename CompType>
  132. void reportError( ClassForwarder<CompType> );
  133. /** Function that is called to display a recognition error message. You may
  134. * override this function independently of (*reportError)() above as that function calls
  135. * this one to do the actual exception printing.
  136. */
  137. void displayRecognitionError(ANTLR_UINT8** tokenNames);
  138. /// Get number of recognition errors (lexer, parser, tree parser). Each
  139. /// recognizer tracks its own number. So parser and lexer each have
  140. /// separate count. Does not count the spurious errors found between
  141. /// an error and next valid token match
  142. ///
  143. /// \see reportError()
  144. ///
  145. ANTLR_UINT32 getNumberOfSyntaxErrors();
  146. /** Function that recovers from an error found in the input stream.
  147. * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
  148. * be from a mismatched token that the (*match)() could not recover from.
  149. */
  150. void recover();
  151. /** function that is a hook to listen to token consumption during error recovery.
  152. * This is mainly used by the debug parser to send events to the listener.
  153. */
  154. void beginResync();
  155. /** function that is a hook to listen to token consumption during error recovery.
  156. * This is mainly used by the debug parser to send events to the listener.
  157. */
  158. void endResync();
  159. /** function that is a hook to listen to token consumption during error recovery.
  160. * This is mainly used by the debug parser to send events to the listener.
  161. */
  162. void beginBacktrack(ANTLR_UINT32 level);
  163. /** function that is a hook to listen to token consumption during error recovery.
  164. * This is mainly used by the debug parser to send events to the listener.
  165. */
  166. void endBacktrack(ANTLR_UINT32 level, bool successful);
  167. /// Compute the error recovery set for the current rule.
  168. /// Documentation below is from the Java implementation.
  169. ///
  170. /// During rule invocation, the parser pushes the set of tokens that can
  171. /// follow that rule reference on the stack; this amounts to
  172. /// computing FIRST of what follows the rule reference in the
  173. /// enclosing rule. This local follow set only includes tokens
  174. /// from within the rule; i.e., the FIRST computation done by
  175. /// ANTLR stops at the end of a rule.
  176. //
  177. /// EXAMPLE
  178. //
  179. /// When you find a "no viable alt exception", the input is not
  180. /// consistent with any of the alternatives for rule r. The best
  181. /// thing to do is to consume tokens until you see something that
  182. /// can legally follow a call to r *or* any rule that called r.
  183. /// You don't want the exact set of viable next tokens because the
  184. /// input might just be missing a token--you might consume the
  185. /// rest of the input looking for one of the missing tokens.
  186. ///
  187. /// Consider grammar:
  188. ///
  189. /// a : '[' b ']'
  190. /// | '(' b ')'
  191. /// ;
  192. /// b : c '^' INT ;
  193. /// c : ID
  194. /// | INT
  195. /// ;
  196. ///
  197. /// At each rule invocation, the set of tokens that could follow
  198. /// that rule is pushed on a stack. Here are the various "local"
  199. /// follow sets:
  200. ///
  201. /// FOLLOW(b1_in_a) = FIRST(']') = ']'
  202. /// FOLLOW(b2_in_a) = FIRST(')') = ')'
  203. /// FOLLOW(c_in_b) = FIRST('^') = '^'
  204. ///
  205. /// Upon erroneous input "[]", the call chain is
  206. ///
  207. /// a -> b -> c
  208. ///
  209. /// and, hence, the follow context stack is:
  210. ///
  211. /// depth local follow set after call to rule
  212. /// 0 <EOF> a (from main())
  213. /// 1 ']' b
  214. /// 3 '^' c
  215. ///
  216. /// Notice that ')' is not included, because b would have to have
  217. /// been called from a different context in rule a for ')' to be
  218. /// included.
  219. ///
  220. /// For error recovery, we cannot consider FOLLOW(c)
  221. /// (context-sensitive or otherwise). We need the combined set of
  222. /// all context-sensitive FOLLOW sets--the set of all tokens that
  223. /// could follow any reference in the call chain. We need to
  224. /// resync to one of those tokens. Note that FOLLOW(c)='^' and if
  225. /// we resync'd to that token, we'd consume until EOF. We need to
  226. /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  227. /// In this case, for input "[]", LA(1) is in this set so we would
  228. /// not consume anything and after printing an error rule c would
  229. /// return normally. It would not find the required '^' though.
  230. /// At this point, it gets a mismatched token error and throws an
  231. /// exception (since LA(1) is not in the viable following token
  232. /// set). The rule exception handler tries to recover, but finds
  233. /// the same recovery set and doesn't consume anything. Rule b
  234. /// exits normally returning to rule a. Now it finds the ']' (and
  235. /// with the successful match exits errorRecovery mode).
  236. ///
  237. /// So, you can see that the parser walks up call chain looking
  238. /// for the token that was a member of the recovery set.
  239. ///
  240. /// Errors are not generated in errorRecovery mode.
  241. ///
  242. /// ANTLR's error recovery mechanism is based upon original ideas:
  243. ///
  244. /// "Algorithms + Data Structures = Programs" by Niklaus Wirth
  245. ///
  246. /// and
  247. ///
  248. /// "A note on error recovery in recursive descent parsers":
  249. /// http://portal.acm.org/citation.cfm?id=947902.947905
  250. ///
  251. /// Later, Josef Grosch had some good ideas:
  252. ///
  253. /// "Efficient and Comfortable Error Recovery in Recursive Descent
  254. /// Parsers":
  255. /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
  256. ///
  257. /// Like Grosch I implemented local FOLLOW sets that are combined
  258. /// at run-time upon error to avoid overhead during parsing.
  259. ///
  260. BitsetType* computeErrorRecoverySet();
  261. /// Compute the context-sensitive FOLLOW set for current rule.
  262. /// Documentation below is from the Java runtime.
  263. ///
  264. /// This is the set of token types that can follow a specific rule
  265. /// reference given a specific call chain. You get the set of
  266. /// viable tokens that can possibly come next (look ahead depth 1)
  267. /// given the current call chain. Contrast this with the
  268. /// definition of plain FOLLOW for rule r:
  269. ///
  270. /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
  271. ///
  272. /// where x in T* and alpha, beta in V*; T is set of terminals and
  273. /// V is the set of terminals and non terminals. In other words,
  274. /// FOLLOW(r) is the set of all tokens that can possibly follow
  275. /// references to r in///any* sentential form (context). At
  276. /// runtime, however, we know precisely which context applies as
  277. /// we have the call chain. We may compute the exact (rather
  278. /// than covering superset) set of following tokens.
  279. ///
  280. /// For example, consider grammar:
  281. ///
  282. /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF}
  283. /// | "return" expr '.'
  284. /// ;
  285. /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'}
  286. /// atom : INT // FOLLOW(atom)=={'+',')',';','.'}
  287. /// | '(' expr ')'
  288. /// ;
  289. ///
  290. /// The FOLLOW sets are all inclusive whereas context-sensitive
  291. /// FOLLOW sets are precisely what could follow a rule reference.
  292. /// For input input "i=(3);", here is the derivation:
  293. ///
  294. /// stat => ID '=' expr ';'
  295. /// => ID '=' atom ('+' atom)* ';'
  296. /// => ID '=' '(' expr ')' ('+' atom)* ';'
  297. /// => ID '=' '(' atom ')' ('+' atom)* ';'
  298. /// => ID '=' '(' INT ')' ('+' atom)* ';'
  299. /// => ID '=' '(' INT ')' ';'
  300. ///
  301. /// At the "3" token, you'd have a call chain of
  302. ///
  303. /// stat -> expr -> atom -> expr -> atom
  304. ///
  305. /// What can follow that specific nested ref to atom? Exactly ')'
  306. /// as you can see by looking at the derivation of this specific
  307. /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
  308. ///
  309. /// You want the exact viable token set when recovering from a
  310. /// token mismatch. Upon token mismatch, if LA(1) is member of
  311. /// the viable next token set, then you know there is most likely
  312. /// a missing token in the input stream. "Insert" one by just not
  313. /// throwing an exception.
  314. ///
  315. BitsetType* computeCSRuleFollow();
  316. /// Compute the current followset for the input stream.
  317. ///
  318. BitsetType* combineFollows(bool exact);
  319. /// Attempt to recover from a single missing or extra token.
  320. ///
  321. /// EXTRA TOKEN
  322. ///
  323. /// LA(1) is not what we are looking for. If LA(2) has the right token,
  324. /// however, then assume LA(1) is some extra spurious token. Delete it
  325. /// and LA(2) as if we were doing a normal match(), which advances the
  326. /// input.
  327. ///
  328. /// MISSING TOKEN
  329. ///
  330. /// If current token is consistent with what could come after
  331. /// ttype then it is ok to "insert" the missing token, else throw
  332. /// exception For example, Input "i=(3;" is clearly missing the
  333. /// ')'. When the parser returns from the nested call to expr, it
  334. /// will have call chain:
  335. ///
  336. /// stat -> expr -> atom
  337. ///
  338. /// and it will be trying to match the ')' at this point in the
  339. /// derivation:
  340. ///
  341. /// => ID '=' '(' INT ')' ('+' atom)* ';'
  342. /// ^
  343. /// match() will see that ';' doesn't match ')' and report a
  344. /// mismatched token error. To recover, it sees that LA(1)==';'
  345. /// is in the set of tokens that can follow the ')' token
  346. /// reference in rule atom. It can assume that you forgot the ')'.
  347. ///
  348. /// The exception that was passed in, in the java implementation is
  349. /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the
  350. /// error flag and rules cascade back when this is set.
  351. ///
  352. const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow);
  353. /** Function that recovers from a mismatched set in the token stream, in a similar manner
  354. * to (*recoverFromMismatchedToken)
  355. */
  356. const UnitType* recoverFromMismatchedSet(BitsetListType* follow);
  357. /** common routine to handle single token insertion for recovery functions.
  358. */
  359. /// This code is factored out from mismatched token and mismatched set
  360. /// recovery. It handles "single token insertion" error recovery for
  361. /// both. No tokens are consumed to recover from insertions. Return
  362. /// true if recovery was possible else return false.
  363. ///
  364. bool recoverFromMismatchedElement(BitsetListType* follow);
  365. /** function that consumes input until the next token matches
  366. * the given token.
  367. */
  368. void consumeUntil(ANTLR_UINT32 tokenType);
  369. /** function that consumes input until the next token matches
  370. * one in the given set.
  371. */
  372. void consumeUntilSet(BitsetType* set);
  373. /** function that returns an ANTLR3_LIST of the strings that identify
  374. * the rules in the parser that got you to this point. Can be overridden by installing your
  375. * own function set.
  376. *
  377. * \todo Document how to override invocation stack functions.
  378. */
  379. StringStackType getRuleInvocationStack();
  380. StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name);
  381. /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of
  382. * string token names. As this is mostly used in string template processing it may not be useful
  383. * in the C runtime.
  384. */
  385. StringListType toStrings( const StringListType& );
  386. /** function to return whether the rule has parsed input starting at the supplied
  387. * start index before. If the rule has not parsed input starting from the supplied start index,
  388. * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point
  389. * then it will return the point where it last stopped parsing after that start point.
  390. */
  391. ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex,
  392. ANTLR_MARKER ruleParseStart);
  393. /** function that determines whether the rule has parsed input at the current index
  394. * in the input stream
  395. */
  396. bool alreadyParsedRule(ANTLR_MARKER ruleIndex);
  397. /** Function that records whether the rule has parsed the input at a
  398. * current position successfully or not.
  399. */
  400. void memoize(ANTLR_MARKER ruleIndex,
  401. ANTLR_MARKER ruleParseStart);
  402. /// Function that returns the current input symbol.
  403. /// The is placed into any label for the associated token ref; e.g., x=ID. Token
  404. /// and tree parsers need to return different objects. Rather than test
  405. /// for input stream type or change the IntStream interface, I use
  406. /// a simple method to ask the recognizer to tell me what the current
  407. /// input symbol is.
  408. ///
  409. /// This is ignored for lexers and the lexer implementation of this
  410. /// function should return NULL.
  411. ///
  412. const UnitType* getCurrentInputSymbol(IntStreamType* istream);
  413. const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>);
  414. const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>);
  415. const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>);
  416. /// Conjure up a missing token during error recovery.
  417. ///
  418. /// The recognizer attempts to recover from single missing
  419. /// symbols. But, actions might refer to that missing symbol.
  420. /// For example, x=ID {f($x);}. The action clearly assumes
  421. /// that there has been an identifier matched previously and that
  422. /// $x points at that token. If that token is missing, but
  423. /// the next token in the stream is what we want we assume that
  424. /// this token is missing and we keep going. Because we
  425. /// have to return some token to replace the missing token,
  426. /// we have to conjure one up. This method gives the user control
  427. /// over the tokens returned for missing tokens. Mostly,
  428. /// you will want to create something special for identifier
  429. /// tokens. For literals such as '{' and ',', the default
  430. /// action in the parser or tree parser works. It simply creates
  431. /// a CommonToken of the appropriate type. The text will be the token.
  432. /// If you change what tokens must be created by the lexer,
  433. /// override this method to create the appropriate tokens.
  434. ///
  435. UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
  436. ANTLR_UINT32 expectedTokenType,
  437. BitsetListType* follow);
  438. /** Function that returns whether the supplied grammar function
  439. * will parse the current input stream or not. This is the way that syntactic
  440. * predicates are evaluated. Unlike java, C is perfectly happy to invoke code
  441. * via a pointer to a function (hence that's what all the ANTLR3 C interfaces
  442. * do.
  443. */
  444. template<typename Predicate>
  445. bool synpred( ClassForwarder<Predicate> );
  446. //In place of exConstruct, just directly instantiate the Exception Object
  447. /** Reset the recognizer
  448. */
  449. void reset();
  450. void reset( ClassForwarder<LexerType> );
  451. template<typename CompType>
  452. void reset( ClassForwarder<CompType> );
  453. void exConstruct();
  454. ~BaseRecognizer();
  455. };
  456. }
  457. #include "antlr3baserecognizer.inl"
  458. /// @}
  459. ///
  460. #endif /* _ANTLR3_BASERECOGNIZER_H */