antlr3intstream.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
/** \file
 * Defines the class interface for an antlr3 INTSTREAM.
 *
 * Certain functionality (such as DFAs for instance) abstracts the stream of tokens
 * or characters into a stream of integers. Hence this structure should be included
 * in any stream that is able to provide the output as a stream of integers (which is
 * basically anything).
  8. *
 * There are no specific implementations of the methods in this interface in general. Though
 * for purposes of casting and so on, it may be necessary to implement a function with
 * the signature in this interface which abstracts the base implementation. In essence though
 * the base stream provides a pointer to this interface, within which it installs its
 * normal match() functions and so on. Interfaces such as DFA are then passed the pANTLR3_INT_STREAM
 * and can treat any input as an int stream.
  15. *
 * For instance, a lexer implements a pANTLR3_BASE_RECOGNIZER, within which there is a pANTLR3_INT_STREAM.
 * However, a pANTLR3_INPUT_STREAM also provides a pANTLR3_INT_STREAM, which it has constructed from
 * its normal interface when it was created. This is then pointed at by the pANTLR3_BASE_RECOGNIZER
 * when it is initialized with a pANTLR3_INPUT_STREAM.
  20. *
  21. * Similarly if a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TOKEN_STREAM, then the
  22. * pANTLR3_INT_STREAM is taken from the pANTLR3_TOKEN_STREAM.
  23. *
  24. * If a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TREENODE_STREAM, then guess where
  25. * the pANTLR3_INT_STREAM comes from?
  26. *
 * Note that because the context pointer points to the actual interface structure that is providing
 * the ANTLR3_INT_STREAM, it is defined as a (void *) in this interface. There is no direct implementation
 * of an ANTLR3_INT_STREAM (unless someone did not understand what I was doing here =;?P).
  30. */
  31. #ifndef _ANTLR3_INTSTREAM_HPP
  32. #define _ANTLR3_INTSTREAM_HPP
  33. // [The "BSD licence"]
  34. // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
  35. //
  36. // All rights reserved.
  37. //
  38. // Redistribution and use in source and binary forms, with or without
  39. // modification, are permitted provided that the following conditions
  40. // are met:
  41. // 1. Redistributions of source code must retain the above copyright
  42. // notice, this list of conditions and the following disclaimer.
  43. // 2. Redistributions in binary form must reproduce the above copyright
  44. // notice, this list of conditions and the following disclaimer in the
  45. // documentation and/or other materials provided with the distribution.
  46. // 3. The name of the author may not be used to endorse or promote products
  47. // derived from this software without specific prior written permission.
  48. //
  49. // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  50. // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  51. // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  52. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  53. // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  54. // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  55. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  56. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  57. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  58. // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  59. namespace antlr3 {
  /** Bit flags that identify the kind of an input stream.
   *
   *  Each stream kind occupies its own bit. As the remarks on the individual
   *  values say, a custom stream that can be treated as one of these kinds
   *  may OR the matching flag into its own type indicator.
   */
  enum STREAM_TYPE
  {
      /** Type indicator for a character stream.
       *  \remark If a custom stream is created but it can be treated as
       *          a char stream, you may OR this value into your type indicator.
       */
      CHARSTREAM = 0x0001,

      /** Type indicator for a token stream.
       *  \remark If a custom stream is created but it can be treated as
       *          a token stream, you may OR this value into your type indicator.
       */
      TOKENSTREAM = 0x0002,

      /** Type indicator for a common tree node stream.
       *  \remark If a custom stream is created but it can be treated as a
       *          common tree node stream, you may OR this value into your
       *          type indicator.
       */
      COMMONTREENODE = 0x0004,

      /** Mask covering the three stream-kind bits above, so that code can
       *  switch on the stream kind.
       *  \remark DO NOT USE 0x0000 as a stream type!
       */
      INPUT_MASK = 0x0007
  };
  // Empty tag types. Within this header they appear only as the template
  // argument of ClassForwarder<> in the protected LA()/consume() overloads of
  // the stream classes, where they select between the endian-specific variants.

  /// Tag: the endian handling is resolved at run time.
  class RESOLVE_ENDIAN_AT_RUNTIME
  {
  };

  /// Tag: byte-order-agnostic variant.
  class BYTE_AGNOSTIC
  {
  };

  /// Tag: little-endian variant.
  class ANTLR_LITTLE_ENDIAN
  {
  };

  /// Tag: big-endian variant.
  class ANTLR_BIG_ENDIAN
  {
  };
  86. template<class ImplTraits, class SuperType>
  87. class IntStream : public ImplTraits::AllocPolicyType
  88. {
  89. public:
  90. typedef typename ImplTraits::StringType StringType;
  91. protected:
  92. /** Potentially useful in error reporting and so on, this string is
  93. * an identification of the input source. It may be NULL, so anything
  94. * attempting to access it needs to check this and substitute a sensible
  95. * default.
  96. */
  97. StringType m_streamName;
  98. /** Last marker position allocated
  99. */
  100. ANTLR_MARKER m_lastMarker;
  101. bool m_upper_case; //if set, values should be returbed in upper case
  102. /// Indicates whether we should implement endian-specific logic
  103. /// 0 - Undefined 1 - Default(machine and input are both same), 2 - Little Endian, 3 - Big Endian
  104. ANTLR_UINT8 m_endian_spec;
  105. public:
  106. IntStream();
  107. // Return a string that identifies the input source
  108. //
  109. StringType getSourceName();
  110. StringType& get_streamName();
  111. const StringType& get_streamName() const;
  112. ANTLR_MARKER get_lastMarker() const;
  113. SuperType* get_super();
  114. /**
  115. * Function that installs a version of LA that always
  116. * returns upper case. Only valid for character streams and creates a case
  117. * insensitive lexer if the lexer tokens are described in upper case. The
  118. * tokens will preserve case in the token text.
  119. */
  120. void setUcaseLA(bool flag);
  121. /** Consume the next 'ANTR3_UINT32' in the stream
  122. */
  123. void consume();
  124. /** Get ANTLR3_UINT32 at current input pointer + i ahead where i=1 is next ANTLR3_UINT32
  125. */
  126. ANTLR_UINT32 LA( ANTLR_INT32 i);
  127. /** Tell the stream to start buffering if it hasn't already. Return
  128. * current input position, index(), or some other marker so that
  129. * when passed to rewind() you get back to the same spot.
  130. * rewind(mark()) should not affect the input cursor.
  131. */
  132. ANTLR_MARKER mark();
  133. /** Return the current input symbol index 0..n where n indicates the
  134. * last symbol has been read.
  135. */
  136. ANTLR_MARKER index();
  137. /** Reset the stream so that next call to index would return marker.
  138. * The marker will usually be index() but it doesn't have to be. It's
  139. * just a marker to indicate what state the stream was in. This is
  140. * essentially calling release() and seek(). If there are markers
  141. * created after this marker argument, this routine must unroll them
  142. * like a stack. Assume the state the stream was in when this marker
  143. * was created.
  144. */
  145. void rewind(ANTLR_MARKER marker);
  146. /** Reset the stream to the last marker position, witouh destryoing the
  147. * last marker position.
  148. */
  149. void rewindLast();
  150. /** You may want to commit to a backtrack but don't want to force the
  151. * stream to keep bookkeeping objects around for a marker that is
  152. * no longer necessary. This will have the same behavior as
  153. * rewind() except it releases resources without the backward seek.
  154. */
  155. void release(ANTLR_MARKER mark);
  156. /** Set the input cursor to the position indicated by index. This is
  157. * normally used to seek ahead in the input stream. No buffering is
  158. * required to do this unless you know your stream will use seek to
  159. * move backwards such as when backtracking.
  160. *
  161. * This is different from rewind in its multi-directional
  162. * requirement and in that its argument is strictly an input cursor (index).
  163. *
  164. * For char streams, seeking forward must update the stream state such
  165. * as line number. For seeking backwards, you will be presumably
  166. * backtracking using the mark/rewind mechanism that restores state and
  167. * so this method does not need to update state when seeking backwards.
  168. *
  169. * Currently, this method is only used for efficient backtracking, but
  170. * in the future it may be used for incremental parsing.
  171. */
  172. void seek(ANTLR_MARKER index);
  173. /// Debug only method to flag consumption of initial off-channel
  174. /// tokens in the input stream
  175. ///
  176. void consumeInitialHiddenTokens();
  177. void rewindMark(ANTLR_MARKER marker);
  178. ANTLR_MARKER tindex();
  179. /** Frees any resources that were allocated for the implementation of this
  180. * interface. Usually this is just releasing the memory allocated
  181. * for the structure itself, but it may of course do anything it need to
  182. * so long as it does not stamp on anything else.
  183. */
  184. ~IntStream();
  185. protected:
  186. void setupIntStream(bool machineBigEndian, bool inputBigEndian);
  187. void findout_endian_spec(bool machineBigEndian, bool inputBigEndian);
  188. //If the user chooses this option, then we will be resolving stuffs at run-time
  189. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
  190. //resolve into one of the three categories below at runtime
  191. void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
  192. };
  193. template<class ImplTraits, class SuperType>
  194. class EBCDIC_IntStream : public IntStream<ImplTraits, SuperType>
  195. {
  196. public:
  197. ANTLR_UINT32 LA( ANTLR_INT32 i);
  198. protected:
  199. void setupIntStream();
  200. };
  201. template<class ImplTraits, class SuperType>
  202. class UTF8_IntStream : public IntStream<ImplTraits, SuperType>
  203. {
  204. public:
  205. ANTLR_UINT32 LA( ANTLR_INT32 i);
  206. void consume();
  207. protected:
  208. void setupIntStream(bool machineBigEndian, bool inputBigEndian);
  209. private:
  210. static const ANTLR_UINT32* TrailingBytesForUTF8();
  211. static const UTF32* OffsetsFromUTF8();
  212. };
  213. template<class ImplTraits, class SuperType>
  214. class UTF16_IntStream : public IntStream<ImplTraits, SuperType>
  215. {
  216. public:
  217. ANTLR_UINT32 LA( ANTLR_INT32 i);
  218. void consume();
  219. ANTLR_MARKER index();
  220. void seek(ANTLR_MARKER seekPoint);
  221. protected:
  222. void setupIntStream(bool machineBigEndian, bool inputBigEndian);
  223. /// \brief Return the input element assuming an 8 bit ascii input
  224. ///
  225. /// \param[in] input Input stream context pointer
  226. /// \param[in] la 1 based offset of next input stream element
  227. ///
  228. /// \return Next input character in internal ANTLR3 encoding (UTF32)
  229. ///
  230. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );
  231. /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
  232. ///
  233. /// \param[in] input Input stream context pointer
  234. /// \param[in] la 1 based offset of next input stream element
  235. ///
  236. /// \return Next input character in internal ANTLR3 encoding (UTF32)
  237. ///
  238. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );
  239. /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
  240. ///
  241. /// \param[in] input Input stream context pointer
  242. /// \param[in] la 1 based offset of next input stream element
  243. ///
  244. /// \return Next input character in internal ANTLR3 encoding (UTF32)
  245. ///
  246. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );
  247. /// \brief Consume the next character in a UTF16 input stream
  248. ///
  249. /// \param input Input stream context pointer
  250. ///
  251. void consume( ClassForwarder<BYTE_AGNOSTIC> );
  252. /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
  253. /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
  254. /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
  255. /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
  256. /// is fubar but we just ignore that.
  257. ///
  258. /// \param input Input stream context pointer
  259. ///
  260. void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );
  261. /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
  262. ///
  263. /// \param input Input stream context pointer
  264. ///
  265. void consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
  266. };
  267. template<class ImplTraits, class SuperType>
  268. class UTF32_IntStream : public IntStream<ImplTraits, SuperType>
  269. {
  270. public:
  271. ANTLR_UINT32 LA( ANTLR_INT32 i);
  272. void consume();
  273. /// \brief Calculate the current index in the output stream.
  274. /// \param[in] input Input stream context pointer
  275. ///
  276. ANTLR_MARKER index();
  277. void seek(ANTLR_MARKER seekPoint);
  278. protected:
  279. void setupIntStream(bool machineBigEndian, bool inputBigEndian);
  280. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
  281. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );
  282. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );
  283. ANTLR_UINT32 LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );
  284. void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
  285. void consume( ClassForwarder<BYTE_AGNOSTIC> );
  286. void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );
  287. void consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
  288. };
  289. template<class ImplTraits>
  290. class TokenIntStream : public IntStream<ImplTraits, typename ImplTraits::TokenStreamType >
  291. {
  292. public:
  293. typedef typename ImplTraits::CommonTokenType CommonTokenType;
  294. typedef typename ImplTraits::StringType StringType;
  295. typedef typename ImplTraits::TokenStreamType TokenStreamType;
  296. typedef IntStream<ImplTraits, TokenStreamType > BaseType;
  297. private:
  298. /** Because the indirect call, though small in individual cases can
  299. * mount up if there are thousands of tokens (very large input streams), callers
  300. * of size can optionally use this cached size field.
  301. */
  302. ANTLR_UINT32 m_cachedSize;
  303. public:
  304. TokenIntStream();
  305. ANTLR_UINT32 get_cachedSize() const;
  306. void set_cachedSize( ANTLR_UINT32 cachedSize );
  307. void consume();
  308. void consumeInitialHiddenTokens();
  309. ANTLR_UINT32 LA( ANTLR_INT32 i );
  310. ANTLR_MARKER mark();
  311. ANTLR_UINT32 size();
  312. void release();
  313. ANTLR_MARKER tindex();
  314. void rewindLast();
  315. void rewind(ANTLR_MARKER marker);
  316. void seek(ANTLR_MARKER index);
  317. StringType getSourceName();
  318. };
  319. template<class ImplTraits>
  320. class TreeNodeIntStream : public IntStream<ImplTraits, typename ImplTraits::TreeNodeStreamType>
  321. {
  322. public:
  323. typedef typename ImplTraits::TreeNodeStreamType TreeNodeStreamType;
  324. typedef IntStream<ImplTraits, TreeNodeStreamType > BaseType;
  325. typedef typename ImplTraits::TreeType TreeType;
  326. typedef typename ImplTraits::TreeTypePtr TreeTypePtr;
  327. typedef typename ImplTraits::CommonTokenType CommonTokenType;
  328. public:
  329. void consume();
  330. ANTLR_MARKER tindex();
  331. ANTLR_UINT32 LA(ANTLR_INT32 i);
  332. ANTLR_MARKER mark();
  333. void release(ANTLR_MARKER marker);
  334. void rewindMark(ANTLR_MARKER marker);
  335. void rewindLast();
  336. void seek(ANTLR_MARKER index);
  337. ANTLR_UINT32 size();
  338. };
  339. }
  340. #include "antlr3intstream.inl"
  341. #endif