DefaultErrorStrategy.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. /* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
  2. * Use of this file is governed by the BSD 3-clause license that
  3. * can be found in the LICENSE.txt file in the project root.
  4. */
  5. #pragma once
  6. #include "ANTLRErrorStrategy.h"
  7. #include "misc/IntervalSet.h"
  8. namespace antlr4 {
  9. /**
  10. * This is the default implementation of {@link ANTLRErrorStrategy} used for
  11. * error reporting and recovery in ANTLR parsers.
  12. */
  13. class ANTLR4CPP_PUBLIC DefaultErrorStrategy : public ANTLRErrorStrategy {
  14. public:
  15. DefaultErrorStrategy();
  16. DefaultErrorStrategy(DefaultErrorStrategy const& other) = delete;
  17. virtual ~DefaultErrorStrategy();
  18. DefaultErrorStrategy& operator = (DefaultErrorStrategy const& other) = delete;
  19. protected:
  20. /**
  21. * Indicates whether the error strategy is currently "recovering from an
  22. * error". This is used to suppress reporting multiple error messages while
  23. * attempting to recover from a detected syntax error.
  24. *
  25. * @see #inErrorRecoveryMode
  26. */
  27. bool errorRecoveryMode;
  28. /** The index into the input stream where the last error occurred.
  29. * This is used to prevent infinite loops where an error is found
  30. * but no token is consumed during recovery...another error is found,
  31. * ad nauseum. This is a failsafe mechanism to guarantee that at least
  32. * one token/tree node is consumed for two errors.
  33. */
  34. int lastErrorIndex;
  35. misc::IntervalSet lastErrorStates;
  36. /// <summary>
  37. /// {@inheritDoc}
  38. /// <p/>
  39. /// The default implementation simply calls <seealso cref="#endErrorCondition"/> to
  40. /// ensure that the handler is not in error recovery mode.
  41. /// </summary>
  42. public:
  43. virtual void reset(Parser *recognizer) override;
  44. /// <summary>
  45. /// This method is called to enter error recovery mode when a recognition
  46. /// exception is reported.
  47. /// </summary>
  48. /// <param name="recognizer"> the parser instance </param>
  49. protected:
  50. virtual void beginErrorCondition(Parser *recognizer);
  51. /// <summary>
  52. /// {@inheritDoc}
  53. /// </summary>
  54. public:
  55. virtual bool inErrorRecoveryMode(Parser *recognizer) override;
  56. /// <summary>
  57. /// This method is called to leave error recovery mode after recovering from
  58. /// a recognition exception.
  59. /// </summary>
  60. /// <param name="recognizer"> </param>
  61. protected:
  62. virtual void endErrorCondition(Parser *recognizer);
  63. /// <summary>
  64. /// {@inheritDoc}
  65. /// <p/>
  66. /// The default implementation simply calls <seealso cref="#endErrorCondition"/>.
  67. /// </summary>
  68. public:
  69. virtual void reportMatch(Parser *recognizer) override;
  70. /// {@inheritDoc}
  71. /// <p/>
  72. /// The default implementation returns immediately if the handler is already
  73. /// in error recovery mode. Otherwise, it calls <seealso cref="#beginErrorCondition"/>
  74. /// and dispatches the reporting task based on the runtime type of {@code e}
  75. /// according to the following table.
  76. ///
  77. /// <ul>
  78. /// <li><seealso cref="NoViableAltException"/>: Dispatches the call to
  79. /// <seealso cref="#reportNoViableAlternative"/></li>
  80. /// <li><seealso cref="InputMismatchException"/>: Dispatches the call to
  81. /// <seealso cref="#reportInputMismatch"/></li>
  82. /// <li><seealso cref="FailedPredicateException"/>: Dispatches the call to
  83. /// <seealso cref="#reportFailedPredicate"/></li>
  84. /// <li>All other types: calls <seealso cref="Parser#notifyErrorListeners"/> to report
  85. /// the exception</li>
  86. /// </ul>
  87. virtual void reportError(Parser *recognizer, const RecognitionException &e) override;
  88. /// <summary>
  89. /// {@inheritDoc}
  90. /// <p/>
  91. /// The default implementation resynchronizes the parser by consuming tokens
  92. /// until we find one in the resynchronization set--loosely the set of tokens
  93. /// that can follow the current rule.
  94. /// </summary>
  95. virtual void recover(Parser *recognizer, std::exception_ptr e) override;
  96. /**
  97. * The default implementation of {@link ANTLRErrorStrategy#sync} makes sure
  98. * that the current lookahead symbol is consistent with what were expecting
  99. * at this point in the ATN. You can call this anytime but ANTLR only
  100. * generates code to check before subrules/loops and each iteration.
  101. *
  102. * <p>Implements Jim Idle's magic sync mechanism in closures and optional
  103. * subrules. E.g.,</p>
  104. *
  105. * <pre>
  106. * a : sync ( stuff sync )* ;
  107. * sync : {consume to what can follow sync} ;
  108. * </pre>
  109. *
  110. * At the start of a sub rule upon error, {@link #sync} performs single
  111. * token deletion, if possible. If it can't do that, it bails on the current
  112. * rule and uses the default error recovery, which consumes until the
  113. * resynchronization set of the current rule.
  114. *
  115. * <p>If the sub rule is optional ({@code (...)?}, {@code (...)*}, or block
  116. * with an empty alternative), then the expected set includes what follows
  117. * the subrule.</p>
  118. *
  119. * <p>During loop iteration, it consumes until it sees a token that can start a
  120. * sub rule or what follows loop. Yes, that is pretty aggressive. We opt to
  121. * stay in the loop as long as possible.</p>
  122. *
  123. * <p><strong>ORIGINS</strong></p>
  124. *
  125. * <p>Previous versions of ANTLR did a poor job of their recovery within loops.
  126. * A single mismatch token or missing token would force the parser to bail
  127. * out of the entire rules surrounding the loop. So, for rule</p>
  128. *
  129. * <pre>
  130. * classDef : 'class' ID '{' member* '}'
  131. * </pre>
  132. *
  133. * input with an extra token between members would force the parser to
  134. * consume until it found the next class definition rather than the next
  135. * member definition of the current class.
  136. *
  137. * <p>This functionality cost a little bit of effort because the parser has to
  138. * compare token set at the start of the loop and at each iteration. If for
  139. * some reason speed is suffering for you, you can turn off this
  140. * functionality by simply overriding this method as a blank { }.</p>
  141. */
  142. virtual void sync(Parser *recognizer) override;
  143. /// <summary>
  144. /// This is called by <seealso cref="#reportError"/> when the exception is a
  145. /// <seealso cref="NoViableAltException"/>.
  146. /// </summary>
  147. /// <seealso cref= #reportError
  148. /// </seealso>
  149. /// <param name="recognizer"> the parser instance </param>
  150. /// <param name="e"> the recognition exception </param>
  151. protected:
  152. virtual void reportNoViableAlternative(Parser *recognizer, const NoViableAltException &e);
  153. /// <summary>
  154. /// This is called by <seealso cref="#reportError"/> when the exception is an
  155. /// <seealso cref="InputMismatchException"/>.
  156. /// </summary>
  157. /// <seealso cref= #reportError
  158. /// </seealso>
  159. /// <param name="recognizer"> the parser instance </param>
  160. /// <param name="e"> the recognition exception </param>
  161. virtual void reportInputMismatch(Parser *recognizer, const InputMismatchException &e);
  162. /// <summary>
  163. /// This is called by <seealso cref="#reportError"/> when the exception is a
  164. /// <seealso cref="FailedPredicateException"/>.
  165. /// </summary>
  166. /// <seealso cref= #reportError
  167. /// </seealso>
  168. /// <param name="recognizer"> the parser instance </param>
  169. /// <param name="e"> the recognition exception </param>
  170. virtual void reportFailedPredicate(Parser *recognizer, const FailedPredicateException &e);
  171. /**
  172. * This method is called to report a syntax error which requires the removal
  173. * of a token from the input stream. At the time this method is called, the
  174. * erroneous symbol is current {@code LT(1)} symbol and has not yet been
  175. * removed from the input stream. When this method returns,
  176. * {@code recognizer} is in error recovery mode.
  177. *
  178. * <p>This method is called when {@link #singleTokenDeletion} identifies
  179. * single-token deletion as a viable recovery strategy for a mismatched
  180. * input error.</p>
  181. *
  182. * <p>The default implementation simply returns if the handler is already in
  183. * error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
  184. * enter error recovery mode, followed by calling
  185. * {@link Parser#notifyErrorListeners}.</p>
  186. *
  187. * @param recognizer the parser instance
  188. */
  189. virtual void reportUnwantedToken(Parser *recognizer);
  190. /**
  191. * This method is called to report a syntax error which requires the
  192. * insertion of a missing token into the input stream. At the time this
  193. * method is called, the missing token has not yet been inserted. When this
  194. * method returns, {@code recognizer} is in error recovery mode.
  195. *
  196. * <p>This method is called when {@link #singleTokenInsertion} identifies
  197. * single-token insertion as a viable recovery strategy for a mismatched
  198. * input error.</p>
  199. *
  200. * <p>The default implementation simply returns if the handler is already in
  201. * error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
  202. * enter error recovery mode, followed by calling
  203. * {@link Parser#notifyErrorListeners}.</p>
  204. *
  205. * @param recognizer the parser instance
  206. */
  207. virtual void reportMissingToken(Parser *recognizer);
  208. public:
  209. /**
  210. * {@inheritDoc}
  211. *
  212. * <p>The default implementation attempts to recover from the mismatched input
  213. * by using single token insertion and deletion as described below. If the
  214. * recovery attempt fails, this method throws an
  215. * {@link InputMismatchException}.</p>
  216. *
  217. * <p><strong>EXTRA TOKEN</strong> (single token deletion)</p>
  218. *
  219. * <p>{@code LA(1)} is not what we are looking for. If {@code LA(2)} has the
  220. * right token, however, then assume {@code LA(1)} is some extra spurious
  221. * token and delete it. Then consume and return the next token (which was
  222. * the {@code LA(2)} token) as the successful result of the match operation.</p>
  223. *
  224. * <p>This recovery strategy is implemented by {@link #singleTokenDeletion}.</p>
  225. *
  226. * <p><strong>MISSING TOKEN</strong> (single token insertion)</p>
  227. *
  228. * <p>If current token (at {@code LA(1)}) is consistent with what could come
  229. * after the expected {@code LA(1)} token, then assume the token is missing
  230. * and use the parser's {@link TokenFactory} to create it on the fly. The
  231. * "insertion" is performed by returning the created token as the successful
  232. * result of the match operation.</p>
  233. *
  234. * <p>This recovery strategy is implemented by {@link #singleTokenInsertion}.</p>
  235. *
  236. * <p><strong>EXAMPLE</strong></p>
  237. *
  238. * <p>For example, Input {@code i=(3;} is clearly missing the {@code ')'}. When
  239. * the parser returns from the nested call to {@code expr}, it will have
  240. * call chain:</p>
  241. *
  242. * <pre>
  243. * stat &rarr; expr &rarr; atom
  244. * </pre>
  245. *
  246. * and it will be trying to match the {@code ')'} at this point in the
  247. * derivation:
  248. *
  249. * <pre>
  250. * =&gt; ID '=' '(' INT ')' ('+' atom)* ';'
  251. * ^
  252. * </pre>
  253. *
  254. * The attempt to match {@code ')'} will fail when it sees {@code ';'} and
  255. * call {@link #recoverInline}. To recover, it sees that {@code LA(1)==';'}
  256. * is in the set of tokens that can follow the {@code ')'} token reference
  257. * in rule {@code atom}. It can assume that you forgot the {@code ')'}.
  258. */
  259. virtual Token* recoverInline(Parser *recognizer) override;
  260. /// <summary>
  261. /// This method implements the single-token insertion inline error recovery
  262. /// strategy. It is called by <seealso cref="#recoverInline"/> if the single-token
  263. /// deletion strategy fails to recover from the mismatched input. If this
  264. /// method returns {@code true}, {@code recognizer} will be in error recovery
  265. /// mode.
  266. /// <p/>
  267. /// This method determines whether or not single-token insertion is viable by
  268. /// checking if the {@code LA(1)} input symbol could be successfully matched
  269. /// if it were instead the {@code LA(2)} symbol. If this method returns
  270. /// {@code true}, the caller is responsible for creating and inserting a
  271. /// token with the correct type to produce this behavior.
  272. /// </summary>
  273. /// <param name="recognizer"> the parser instance </param>
  274. /// <returns> {@code true} if single-token insertion is a viable recovery
  275. /// strategy for the current mismatched input, otherwise {@code false} </returns>
  276. protected:
  277. virtual bool singleTokenInsertion(Parser *recognizer);
  278. /// <summary>
  279. /// This method implements the single-token deletion inline error recovery
  280. /// strategy. It is called by <seealso cref="#recoverInline"/> to attempt to recover
  281. /// from mismatched input. If this method returns null, the parser and error
  282. /// handler state will not have changed. If this method returns non-null,
  283. /// {@code recognizer} will <em>not</em> be in error recovery mode since the
  284. /// returned token was a successful match.
  285. /// <p/>
  286. /// If the single-token deletion is successful, this method calls
  287. /// <seealso cref="#reportUnwantedToken"/> to report the error, followed by
  288. /// <seealso cref="Parser#consume"/> to actually "delete" the extraneous token. Then,
  289. /// before returning <seealso cref="#reportMatch"/> is called to signal a successful
  290. /// match.
  291. /// </summary>
  292. /// <param name="recognizer"> the parser instance </param>
  293. /// <returns> the successfully matched <seealso cref="Token"/> instance if single-token
  294. /// deletion successfully recovers from the mismatched input, otherwise
  295. /// {@code null} </returns>
  296. virtual Token* singleTokenDeletion(Parser *recognizer);
  297. /// <summary>
  298. /// Conjure up a missing token during error recovery.
  299. ///
  300. /// The recognizer attempts to recover from single missing
  301. /// symbols. But, actions might refer to that missing symbol.
  302. /// For example, x=ID {f($x);}. The action clearly assumes
  303. /// that there has been an identifier matched previously and that
  304. /// $x points at that token. If that token is missing, but
  305. /// the next token in the stream is what we want we assume that
  306. /// this token is missing and we keep going. Because we
  307. /// have to return some token to replace the missing token,
  308. /// we have to conjure one up. This method gives the user control
  309. /// over the tokens returned for missing tokens. Mostly,
  310. /// you will want to create something special for identifier
  311. /// tokens. For literals such as '{' and ',', the default
  312. /// action in the parser or tree parser works. It simply creates
  313. /// a CommonToken of the appropriate type. The text will be the token.
  314. /// If you change what tokens must be created by the lexer,
  315. /// override this method to create the appropriate tokens.
  316. /// </summary>
  317. virtual Token* getMissingSymbol(Parser *recognizer);
  318. virtual misc::IntervalSet getExpectedTokens(Parser *recognizer);
  319. /// <summary>
  320. /// How should a token be displayed in an error message? The default
  321. /// is to display just the text, but during development you might
  322. /// want to have a lot of information spit out. Override in that case
  323. /// to use t.toString() (which, for CommonToken, dumps everything about
  324. /// the token). This is better than forcing you to override a method in
  325. /// your token objects because you don't have to go modify your lexer
  326. /// so that it creates a new class.
  327. /// </summary>
  328. virtual std::string getTokenErrorDisplay(Token *t);
  329. virtual std::string getSymbolText(Token *symbol);
  330. virtual size_t getSymbolType(Token *symbol);
  331. virtual std::string escapeWSAndQuote(const std::string &s) const;
  332. /* Compute the error recovery set for the current rule. During
  333. * rule invocation, the parser pushes the set of tokens that can
  334. * follow that rule reference on the stack; this amounts to
  335. * computing FIRST of what follows the rule reference in the
  336. * enclosing rule. See LinearApproximator.FIRST().
  337. * This local follow set only includes tokens
  338. * from within the rule; i.e., the FIRST computation done by
  339. * ANTLR stops at the end of a rule.
  340. *
  341. * EXAMPLE
  342. *
  343. * When you find a "no viable alt exception", the input is not
  344. * consistent with any of the alternatives for rule r. The best
  345. * thing to do is to consume tokens until you see something that
  346. * can legally follow a call to r *or* any rule that called r.
  347. * You don't want the exact set of viable next tokens because the
  348. * input might just be missing a token--you might consume the
  349. * rest of the input looking for one of the missing tokens.
  350. *
  351. * Consider grammar:
  352. *
  353. * a : '[' b ']'
  354. * | '(' b ')'
  355. * ;
  356. * b : c '^' INT ;
  357. * c : ID
  358. * | INT
  359. * ;
  360. *
  361. * At each rule invocation, the set of tokens that could follow
  362. * that rule is pushed on a stack. Here are the various
  363. * context-sensitive follow sets:
  364. *
  365. * FOLLOW(b1_in_a) = FIRST(']') = ']'
  366. * FOLLOW(b2_in_a) = FIRST(')') = ')'
  367. * FOLLOW(c_in_b) = FIRST('^') = '^'
  368. *
  369. * Upon erroneous input "[]", the call chain is
  370. *
  371. * a -> b -> c
  372. *
  373. * and, hence, the follow context stack is:
  374. *
  375. * depth follow set start of rule execution
  376. * 0 <EOF> a (from main())
  377. * 1 ']' b
  378. * 2 '^' c
  379. *
  380. * Notice that ')' is not included, because b would have to have
  381. * been called from a different context in rule a for ')' to be
  382. * included.
  383. *
  384. * For error recovery, we cannot consider FOLLOW(c)
  385. * (context-sensitive or otherwise). We need the combined set of
  386. * all context-sensitive FOLLOW sets--the set of all tokens that
  387. * could follow any reference in the call chain. We need to
  388. * resync to one of those tokens. Note that FOLLOW(c)='^' and if
  389. * we resync'd to that token, we'd consume until EOF. We need to
  390. * sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  391. * In this case, for input "[]", LA(1) is ']' and in the set, so we would
  392. * not consume anything. After printing an error, rule c would
  393. * return normally. Rule b would not find the required '^' though.
  394. * At this point, it gets a mismatched token error and throws an
  395. * exception (since LA(1) is not in the viable following token
  396. * set). The rule exception handler tries to recover, but finds
  397. * the same recovery set and doesn't consume anything. Rule b
  398. * exits normally returning to rule a. Now it finds the ']' (and
  399. * with the successful match exits errorRecovery mode).
  400. *
  401. * So, you can see that the parser walks up the call chain looking
  402. * for the token that was a member of the recovery set.
  403. *
  404. * Errors are not generated in errorRecovery mode.
  405. *
  406. * ANTLR's error recovery mechanism is based upon original ideas:
  407. *
  408. * "Algorithms + Data Structures = Programs" by Niklaus Wirth
  409. *
  410. * and
  411. *
  412. * "A note on error recovery in recursive descent parsers":
  413. * http://portal.acm.org/citation.cfm?id=947902.947905
  414. *
  415. * Later, Josef Grosch had some good ideas:
  416. *
  417. * "Efficient and Comfortable Error Recovery in Recursive Descent
  418. * Parsers":
  419. * ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
  420. *
  421. * Like Grosch I implement context-sensitive FOLLOW sets that are combined
  422. * at run-time upon error to avoid overhead during parsing.
  423. */
  424. virtual misc::IntervalSet getErrorRecoverySet(Parser *recognizer);
  425. /// <summary>
  426. /// Consume tokens until one matches the given token set. </summary>
  427. virtual void consumeUntil(Parser *recognizer, const misc::IntervalSet &set);
  428. private:
  429. std::vector<std::unique_ptr<Token>> _errorSymbols; // Temporarily created token.
  430. void InitializeInstanceFields();
  431. };
  432. } // namespace antlr4