antlr3input.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
/** \file
 * Defines the basic structures used to manipulate character
 * streams from any input source. Any character size and encoding
 * can in theory be used, so long as a set of functions is provided that
 * can return a 32 bit Integer representation of their characters and
 * efficiently mark and revert to specific offsets into their input streams.
 */
  8. #ifndef _ANTLR_INPUT_HPP
  9. #define _ANTLR_INPUT_HPP
  10. // [The "BSD licence"]
  11. // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
  12. //
  13. // All rights reserved.
  14. //
  15. // Redistribution and use in source and binary forms, with or without
  16. // modification, are permitted provided that the following conditions
  17. // are met:
  18. // 1. Redistributions of source code must retain the above copyright
  19. // notice, this list of conditions and the following disclaimer.
  20. // 2. Redistributions in binary form must reproduce the above copyright
  21. // notice, this list of conditions and the following disclaimer in the
  22. // documentation and/or other materials provided with the distribution.
  23. // 3. The name of the author may not be used to endorse or promote products
  24. // derived from this software without specific prior written permission.
  25. //
  26. // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  27. // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  28. // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  29. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  30. // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  31. // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  32. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  33. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  34. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  35. // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36. namespace antlr3 {
  37. /// Master context structure for an ANTLR3 C runtime based input stream.
  38. /// \ingroup apistructures. Calling LT on this doesn't seem right. You would
  39. /// call it only with parser / TreeParser, and their respective input streams
  40. /// has that function. calling it from lexer will throw a compile time error
  41. ///
  42. template<class ImplTraits>
  43. class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >
  44. {
  45. public:
  46. typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
  47. typedef typename ImplTraits::LexStateType LexStateType;
  48. typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType;
  49. typedef IntStreamType BaseType;
  50. typedef typename ImplTraits::StreamDataType UnitType;
  51. typedef UnitType DataType;
  52. typedef UnitType TokenType;
  53. typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType;
  54. typedef typename ImplTraits::StringType StringType;
  55. private:
  56. /** Pointer the start of the input string, characters may be
  57. * taken as offsets from here and in original input format encoding.
  58. */
  59. const DataType* m_data;
  60. /** Pointer to the next character to be consumed from the input data
  61. * This is cast to point at the encoding of the original file that
  62. * was read by the functions installed as pointer in this input stream
  63. * context instance at file/string/whatever load time.
  64. */
  65. const DataType* m_nextChar;
  66. /** Number of characters that can be consumed at this point in time.
  67. * Mostly this is just what is left in the pre-read buffer, but if the
  68. * input source is a stream such as a socket or something then we may
  69. * call special read code to wait for more input.
  70. */
  71. ANTLR_UINT32 m_sizeBuf;
  72. /** The line number we are traversing in the input file. This gets incremented
  73. * by a newline() call in the lexer grammar actions.
  74. */
  75. ANTLR_UINT32 m_line;
  76. /** Pointer into the input buffer where the current line
  77. * started.
  78. */
  79. const DataType* m_currentLine;
  80. /** The offset within the current line of the current character
  81. */
  82. ANTLR_INT32 m_charPositionInLine;
  83. /** Tracks how deep mark() calls are nested
  84. */
  85. ANTLR_UINT32 m_markDepth;
  86. /** List of mark() points in the input stream
  87. */
  88. MarkersType m_markers;
  89. /** File name string, set to pointer to memory if
  90. * you set it manually as it will be free()d
  91. */
  92. StringType m_fileName;
  93. /** File number, needs to be set manually to some file index of your devising.
  94. */
  95. ANTLR_UINT32 m_fileNo;
  96. /// Character that automatically causes an internal line count
  97. /// increment.
  98. ///
  99. ANTLR_UCHAR m_newlineChar;
  100. /// Indicates the size, in 8 bit units, of a single character. Note that
  101. /// the C runtime does not deal with surrogates as this would be
  102. /// slow and complicated. If this is a UTF-8 stream then this field
  103. /// will be set to 0. Generally you are best working internally with 32 bit characters
  104. /// as this is the most efficient.
  105. ///
  106. ANTLR_UINT8 m_charByteSize;
  107. /** Indicates if the data pointer was allocated by us, and so should be freed
  108. * when the stream dies.
  109. */
  110. bool m_isAllocated;
  111. /// Indicates the encoding scheme used in this input stream
  112. ///
  113. ANTLR_UINT32 m_encoding;
  114. /* API */
  115. public:
  116. InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding);
  117. InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name);
  118. ~InputStream();
  119. const DataType* get_data() const;
  120. bool get_isAllocated() const;
  121. const DataType* get_nextChar() const;
  122. ANTLR_UINT32 get_sizeBuf() const;
  123. ANTLR_UINT32 get_line() const;
  124. const DataType* get_currentLine() const;
  125. ANTLR_INT32 get_charPositionInLine() const;
  126. ANTLR_UINT32 get_markDepth() const;
  127. MarkersType& get_markers();
  128. const StringType& get_fileName() const;
  129. ANTLR_UINT32 get_fileNo() const;
  130. ANTLR_UCHAR get_newlineChar() const;
  131. ANTLR_UINT8 get_charByteSize() const;
  132. ANTLR_UINT32 get_encoding() const;
  133. void set_data( DataType* data );
  134. void set_isAllocated( bool isAllocated );
  135. void set_nextChar( const DataType* nextChar );
  136. void set_sizeBuf( ANTLR_UINT32 sizeBuf );
  137. void set_line( ANTLR_UINT32 line );
  138. void set_currentLine( const DataType* currentLine );
  139. void set_charPositionInLine( ANTLR_INT32 charPositionInLine );
  140. void set_markDepth( ANTLR_UINT32 markDepth );
  141. void set_markers( const MarkersType& markers );
  142. void set_fileName( const StringType& fileName );
  143. void set_fileNo( ANTLR_UINT32 fileNo );
  144. void set_newlineChar( ANTLR_UCHAR newlineChar );
  145. void set_charByteSize( ANTLR_UINT8 charByteSize );
  146. void set_encoding( ANTLR_UINT32 encoding );
  147. void inc_charPositionInLine();
  148. void inc_line();
  149. void inc_markDepth();
  150. IntStreamType* get_istream();
  151. /** Function that resets the input stream
  152. */
  153. void reset();
  154. /** Pointer to a function that reuses and resets an input stream by
  155. * supplying a new 'source'
  156. */
  157. void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name);
  158. /** Function to return the total size of the input buffer. For streams
  159. * this may be just the total we have available so far. This means of course that
  160. * the input stream must be careful to accumulate enough input so that any backtracking
  161. * can be satisfied.
  162. */
  163. ANTLR_UINT32 size();
  164. /** Function to return a substring of the input stream. String is returned in allocated
  165. * memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form.
  166. */
  167. StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop);
  168. /** Function to return the current line number in the input stream
  169. */
  170. ANTLR_UINT32 get_line();
  171. /** Function to return the current line buffer in the input stream
  172. * The pointer returned is directly into the input stream so you must copy
  173. * it if you wish to manipulate it without damaging the input stream. Encoding
  174. * is obviously in the same form as the input stream.
  175. * \remark
  176. * - Note taht this function wil lbe inaccurate if setLine is called as there
  177. * is no way at the moment to position the input stream at a particular line
  178. * number offset.
  179. */
  180. const DataType* getLineBuf();
  181. /** Function to return the current offset in the current input stream line
  182. */
  183. ANTLR_UINT32 get_charPositionInLine();
  184. /** Function to set the current position in the current line.
  185. */
  186. void set_charPositionInLine(ANTLR_UINT32 position);
  187. /** Function to override the default newline character that the input stream
  188. * looks for to trigger the line/offset and line buffer recording information.
  189. * \remark
  190. * - By default the chracter '\n' will be installed as the newline trigger character. When this
  191. * character is seen by the consume() function then the current line number is incremented and the
  192. * current line offset is reset to 0. The Pointer for the line of input we are consuming
  193. * is updated to point to the next character after this one in the input stream (which means it
  194. * may become invalid if the last newline character in the file is seen (so watch out).
  195. * - If for some reason you do not want the counters and pointers to be restee, you can set the
  196. * chracter to some impossible character such as '\0' or whatever.
  197. * - This is a single character only, so choose the last character in a sequence of two or more.
  198. * - This is only a simple aid to error reporting - if you have a complicated binary input structure
  199. * it may not be adequate, but you can always override every function in the input stream with your
  200. * own of course, and can even write your own complete input stream set if you like.
  201. * - It is your responsiblity to set a valid character for the input stream type. There is no point
  202. * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
  203. * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
  204. */
  205. void set_newLineChar(ANTLR_UINT32 newlineChar);
  206. ANTLR_MARKER index_impl();
  207. private:
  208. /** \brief Use the contents of an operating system file as the input
  209. * for an input stream.
  210. *
  211. * \param fileName Name of operating system file to read.
  212. * \return
  213. * - Pointer to new input stream context upon success
  214. * - One of the ANTLR3_ERR_ defines on error.
  215. */
  216. void createFileStream(const ANTLR_UINT8* fileName);
  217. /** \brief Use the supplied 'string' as input to the stream
  218. *
  219. * \param data Pointer to the input data
  220. * \return
  221. * - Pointer to new input stream context upon success
  222. * - NULL defines on error.
  223. */
  224. void createStringStream(const ANTLR_UINT8* data);
  225. void genericSetupStream();
  226. /// Determine endianess of the input stream and install the
  227. /// API required for the encoding in that format.
  228. ///
  229. void setupInputStream();
  230. };
  231. /** \brief Structure for track lex input states as part of mark()
  232. * and rewind() of lexer.
  233. */
  234. template<class ImplTraits>
  235. class LexState : public ImplTraits::AllocPolicyType
  236. {
  237. public:
  238. typedef typename ImplTraits::StreamDataType DataType;
  239. private:
  240. /** Pointer to the next character to be consumed from the input data
  241. * This is cast to point at the encoding of the original file that
  242. * was read by the functions installed as pointer in this input stream
  243. * context instance at file/string/whatever load time.
  244. */
  245. const DataType* m_nextChar;
  246. /** The line number we are traversing in the input file. This gets incremented
  247. * by a newline() call in the lexer grammer actions.
  248. */
  249. ANTLR_UINT32 m_line;
  250. /** Pointer into the input buffer where the current line
  251. * started.
  252. */
  253. const DataType* m_currentLine;
  254. /** The offset within the current line of the current character
  255. */
  256. ANTLR_INT32 m_charPositionInLine;
  257. public:
  258. LexState();
  259. const DataType* get_nextChar() const;
  260. ANTLR_UINT32 get_line() const;
  261. const DataType* get_currentLine() const;
  262. ANTLR_INT32 get_charPositionInLine() const;
  263. void set_nextChar( const DataType* nextChar );
  264. void set_line( ANTLR_UINT32 line );
  265. void set_currentLine( const DataType* currentLine );
  266. void set_charPositionInLine( ANTLR_INT32 charPositionInLine );
  267. };
  /// Exception whose message is the fixed text "Null String".
  /// NOTE(review): presumably thrown when a null string reaches the input
  /// stream / parser -- confirm against the throw sites in the .inl files.
  class ParseNullStringException : public std::exception
  {
  public:
      /// Overrides std::exception::what().
      /// Declared public (it used to sit in the class's default-private
      /// section) so that it can also be called on the concrete type and not
      /// only through a std::exception reference.
      ///
      /// \return Pointer to the string literal "Null String".
      const char* what() const noexcept override
      {
          return "Null String";
      }
  };
  275. }
  276. #include "antlr3input.inl"
  277. #endif /* _ANTLR_INPUT_H */