antlr3lexer.hpp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  /** \file
   * Base interface for any ANTLR3 lexer.
   *
   * An ANTLR3 lexer builds from two sets of components:
   *
   * - The runtime components that provide common functionality such as
   *   traversing character streams, building tokens for output and so on.
   * - The generated rules and structure of the actual lexer, which call upon the
   *   runtime components.
   *
   * A lexer class contains a character input stream, a base recognizer interface
   * (which it will normally implement) and a token source interface (which it also
   * implements). The token source interface is called by a token consumer (such as
   * a parser, but in theory it can be anything that wants a set of abstract
   * tokens in place of a raw character stream).
   *
   * So then, we set up a lexer in a sequence akin to:
   *
   * - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
   *   and initialize it.
   * - Create a lexer interface and tell it where its input stream is.
   *   This will cause the creation of a base recognizer class, which it will
   *   override with its own implementations of some methods. The lexer creator
   *   can also then in turn override anything it likes.
   * - The lexer token source interface is then passed to some interface that
   *   knows how to use it, by calling for a next token.
   * - When a next token is called, let ze lexing begin.
   *
   */
  30. #ifndef _ANTLR3_LEXER_HPP
  31. #define _ANTLR3_LEXER_HPP
  32. // [The "BSD licence"]
  33. // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
  34. //
  35. // All rights reserved.
  36. //
  37. // Redistribution and use in source and binary forms, with or without
  38. // modification, are permitted provided that the following conditions
  39. // are met:
  40. // 1. Redistributions of source code must retain the above copyright
  41. // notice, this list of conditions and the following disclaimer.
  42. // 2. Redistributions in binary form must reproduce the above copyright
  43. // notice, this list of conditions and the following disclaimer in the
  44. // documentation and/or other materials provided with the distribution.
  45. // 3. The name of the author may not be used to endorse or promote products
  46. // derived from this software without specific prior written permission.
  47. //
  48. // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  49. // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  50. // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  51. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  52. // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  53. // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  54. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  55. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  56. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  57. // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  58. /* Definitions
  59. */
  60. namespace antlr3 {
  61. static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF;
  62. template<class ImplTraits>
  63. class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
  64. public ImplTraits::TokenSourceType
  65. {
  66. public:
  67. typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
  68. typedef typename ImplTraits::InputStreamType InputStreamType;
  69. typedef InputStreamType StreamType;
  70. typedef typename InputStreamType::IntStreamType IntStreamType;
  71. typedef typename ImplTraits::CommonTokenType CommonTokenType;
  72. typedef typename ImplTraits::StreamDataType TokenType;
  73. typedef typename ImplTraits::StringType StringType;
  74. typedef typename ImplTraits::StringStreamType StringStreamType;
  75. typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
  76. typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
  77. typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
  78. typedef typename ImplTraits::BitsetListType BitsetListType;
  79. typedef typename ImplTraits::TokenSourceType TokenSourceType;
  80. typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
  81. typedef typename RecognizerType::DebugEventListenerType DebuggerType;
  82. private:
  83. /** A pointer to the character stream whence this lexer is receiving
  84. * characters.
  85. * TODO: I may come back to this and implement charstream outside
  86. * the input stream as per the java implementation.
  87. */
  88. InputStreamType* m_input;
  89. public:
  90. Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
  91. Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);
  92. InputStreamType* get_input() const;
  93. IntStreamType* get_istream() const;
  94. RecognizerType* get_rec();
  95. const RecognizerType* get_rec() const;
  96. TokenSourceType* get_tokSource();
  97. //functions used in .stg file
  98. const RecognizerType* get_recognizer() const;
  99. RecognizerSharedStateType* get_lexstate() const;
  100. void set_lexstate( RecognizerSharedStateType* lexstate );
  101. const TokenSourceType* get_tokSource() const;
  102. CommonTokenType* get_ltoken() const;
  103. void set_ltoken( const CommonTokenType* ltoken );
  104. bool hasFailed() const;
  105. ANTLR_INT32 get_backtracking() const;
  106. void inc_backtracking();
  107. void dec_backtracking();
  108. bool get_failedflag() const;
  109. void set_failedflag( bool failed );
  110. InputStreamType* get_strstream() const;
  111. ANTLR_MARKER index() const;
  112. void seek(ANTLR_MARKER index);
  113. const CommonTokenType* EOF_Token() const;
  114. bool hasException() const;
  115. ExceptionBaseType* get_exception() const;
  116. void constructEx();
  117. void lrecover();
  118. ANTLR_MARKER mark();
  119. void rewind(ANTLR_MARKER marker);
  120. void rewindLast();
  121. void setText( const StringType& text );
  122. void skip();
  123. RuleMemoType* getRuleMemo() const;
  124. DebuggerType* get_debugger() const;
  125. void setRuleMemo(RuleMemoType* rulememo);
  126. ANTLR_UINT32 LA(ANTLR_INT32 i);
  127. void consume();
  128. void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart);
  129. bool haveParsedRule(ANTLR_MARKER ruleIndex);
  130. /** Pointer to a function that sets the charstream source for the lexer and
  131. * causes it to be reset.
  132. */
  133. void setCharStream(InputStreamType* input);
  134. /*!
  135. * \brief
  136. * Change to a new input stream, remembering the old one.
  137. *
  138. * \param lexer
  139. * Pointer to the lexer instance to switch input streams for.
  140. *
  141. * \param input
  142. * New input stream to install as the current one.
  143. *
  144. * Switches the current character input stream to
  145. * a new one, saving the old one, which we will revert to at the end of this
  146. * new one.
  147. */
  148. void pushCharStream(InputStreamType* input);
  149. /*!
  150. * \brief
  151. * Stops using the current input stream and reverts to any prior
  152. * input stream on the stack.
  153. *
  154. * \param lexer
  155. * Description of parameter lexer.
  156. *
  157. * Pointer to a function that abandons the current input stream, whether it
  158. * is empty or not and reverts to the previous stacked input stream.
  159. *
  160. * \remark
  161. * The function fails silently if there are no prior input streams.
  162. */
  163. void popCharStream();
  164. /** Function that emits (a copy of ) the supplied token as the next token in
  165. * the stream.
  166. */
  167. void emit(const CommonTokenType* token);
  168. /** Pointer to a function that constructs a new token from the lexer stored information
  169. */
  170. CommonTokenType* emit();
  171. /** Pointer to a function that attempts to match and consume the specified string from the input
  172. * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
  173. * with 0xFFFFFFFF, which is an invalid UTF32 character
  174. */
  175. bool matchs(ANTLR_UCHAR* string);
  176. /** Pointer to a function that matches and consumes the specified character from the input stream.
  177. * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
  178. * implementation is source encoding agnostic and so input streams do not generally need to
  179. * override the default implmentation.
  180. */
  181. bool matchc(ANTLR_UCHAR c);
  182. /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
  183. * but this would only be useful if the tokens were in tsome guaranteed order which is
  184. * only going to happen with a hand crafted token set).
  185. */
  186. bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);
  187. /** Pointer to a function that matches the next token/char in the input stream
  188. * regardless of what it actaully is.
  189. */
  190. void matchAny();
  191. /** Pointer to a function that recovers from an error found in the input stream.
  192. * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
  193. * be from a mismatched token that the (*match)() could not recover from.
  194. */
  195. void recover();
  196. /** Function to return the current line number in the input stream
  197. */
  198. ANTLR_UINT32 getLine();
  199. ANTLR_MARKER getCharIndex();
  200. ANTLR_UINT32 getCharPositionInLine();
  201. /** Function to return the text so far for the current token being generated
  202. */
  203. StringType getText();
  204. //Other utility functions
  205. void fillExceptionData( ExceptionBaseType* ex );
  206. /** Default lexer error handler (works for 8 bit streams only!!!)
  207. */
  208. void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
  209. void exConstruct();
  210. TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
  211. ANTLR_UINT32 expectedTokenType, BitsetListType* follow);
  212. /** Pointer to a function that knows how to free the resources of a lexer
  213. */
  214. ~Lexer();
  215. };
  216. }
  217. #include "antlr3lexer.inl"
  218. #endif