// ruleiter.h
  // © 2016 and later: Unicode, Inc. and others.
  // License & terms of use: http://www.unicode.org/copyright.html
  /*
  **********************************************************************
  * Copyright (c) 2003-2011, International Business Machines
  * Corporation and others. All Rights Reserved.
  **********************************************************************
  * Author: Alan Liu
  * Created: September 24 2003
  * Since: ICU 2.8
  **********************************************************************
  */
  #ifndef _RULEITER_H_
  #define _RULEITER_H_

  #include "unicode/uobject.h"

  U_NAMESPACE_BEGIN

  class UnicodeString;
  class ParsePosition;
  class SymbolTable;

  20. /**
  21. * An iterator that returns 32-bit code points. This class is deliberately
  22. * <em>not</em> related to any of the ICU character iterator classes
  23. * in order to minimize complexity.
  24. * @author Alan Liu
  25. * @since ICU 2.8
  26. */
  27. class RuleCharacterIterator : public UMemory {
  28. // TODO: Ideas for later. (Do not implement if not needed, lest the
  29. // code coverage numbers go down due to unused methods.)
  30. // 1. Add a copy constructor, operator==() method.
  31. // 2. Rather than return DONE, throw an exception if the end
  32. // is reached -- this is an alternate usage model, probably not useful.
  33. private:
  34. /**
  35. * Text being iterated.
  36. */
  37. const UnicodeString& text;
  38. /**
  39. * Position of iterator.
  40. */
  41. ParsePosition& pos;
  42. /**
  43. * Symbol table used to parse and dereference variables. May be 0.
  44. */
  45. const SymbolTable* sym;
  46. /**
  47. * Current variable expansion, or 0 if none.
  48. */
  49. const UnicodeString* buf;
  50. /**
  51. * Position within buf. Meaningless if buf == 0.
  52. */
  53. int32_t bufPos;
  54. public:
  55. /**
  56. * Value returned when there are no more characters to iterate.
  57. */
  58. static constexpr int32_t DONE = -1;
  59. /**
  60. * Bitmask option to enable parsing of variable names. If (options &
  61. * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
  62. * its value. Variables are parsed using the SymbolTable API.
  63. */
  64. static constexpr int32_t PARSE_VARIABLES = 1;
  65. /**
  66. * Bitmask option to enable parsing of escape sequences. If (options &
  67. * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
  68. * to its value. Escapes are parsed using Utility.unescapeAt().
  69. */
  70. static constexpr int32_t PARSE_ESCAPES = 2;
  71. /**
  72. * Bitmask option to enable skipping of whitespace. If (options &
  73. * SKIP_WHITESPACE) != 0, then Pattern_White_Space characters will be silently
  74. * skipped, as if they were not present in the input.
  75. */
  76. static constexpr int32_t SKIP_WHITESPACE = 4;
  77. /**
  78. * Constructs an iterator over the given text, starting at the given
  79. * position.
  80. * @param text the text to be iterated
  81. * @param sym the symbol table, or null if there is none. If sym is null,
  82. * then variables will not be dereferenced, even if the PARSE_VARIABLES
  83. * option is set.
  84. * @param pos upon input, the index of the next character to return. If a
  85. * variable has been dereferenced, then pos will <em>not</em> increment as
  86. * characters of the variable value are iterated.
  87. */
  88. RuleCharacterIterator(const UnicodeString& text, const SymbolTable* sym,
  89. ParsePosition& pos);
  90. /**
  91. * Returns true if this iterator has no more characters to return.
  92. */
  93. UBool atEnd() const;
  94. /**
  95. * Returns the next character using the given options, or DONE if there
  96. * are no more characters, and advance the position to the next
  97. * character.
  98. * @param options one or more of the following options, bitwise-OR-ed
  99. * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
  100. * @param isEscaped output parameter set to true if the character
  101. * was escaped
  102. * @param ec input-output error code. An error will only be set by
  103. * this routing if options includes PARSE_VARIABLES and an unknown
  104. * variable name is seen, or if options includes PARSE_ESCAPES and
  105. * an invalid escape sequence is seen.
  106. * @return the current 32-bit code point, or DONE
  107. */
  108. UChar32 next(int32_t options, UBool& isEscaped, UErrorCode& ec);
  109. /**
  110. * Returns true if this iterator is currently within a variable expansion.
  111. */
  112. inline UBool inVariable() const;
  113. /**
  114. * An opaque object representing the position of a RuleCharacterIterator.
  115. */
  116. struct Pos : public UMemory {
  117. private:
  118. const UnicodeString* buf;
  119. int32_t pos;
  120. int32_t bufPos;
  121. friend class RuleCharacterIterator;
  122. };
  123. /**
  124. * Sets an object which, when later passed to setPos(), will
  125. * restore this iterator's position. Usage idiom:
  126. *
  127. * RuleCharacterIterator iterator = ...;
  128. * RuleCharacterIterator::Pos pos;
  129. * iterator.getPos(pos);
  130. * for (;;) {
  131. * iterator.getPos(pos);
  132. * int c = iterator.next(...);
  133. * ...
  134. * }
  135. * iterator.setPos(pos);
  136. *
  137. * @param p a position object to be set to this iterator's
  138. * current position.
  139. */
  140. void getPos(Pos& p) const;
  141. /**
  142. * Restores this iterator to the position it had when getPos()
  143. * set the given object.
  144. * @param p a position object previously set by getPos()
  145. */
  146. void setPos(const Pos& p);
  147. /**
  148. * Skips ahead past any ignored characters, as indicated by the given
  149. * options. This is useful in conjunction with the lookahead() method.
  150. *
  151. * Currently, this only has an effect for SKIP_WHITESPACE.
  152. * @param options one or more of the following options, bitwise-OR-ed
  153. * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
  154. */
  155. void skipIgnored(int32_t options);
  156. /**
  157. * Returns a string containing the remainder of the characters to be
  158. * returned by this iterator, without any option processing. If the
  159. * iterator is currently within a variable expansion, this will only
  160. * extend to the end of the variable expansion. This method is provided
  161. * so that iterators may interoperate with string-based APIs. The typical
  162. * sequence of calls is to call skipIgnored(), then call lookahead(), then
  163. * parse the string returned by lookahead(), then call jumpahead() to
  164. * resynchronize the iterator.
  165. * @param result a string to receive the characters to be returned
  166. * by future calls to next()
  167. * @param maxLookAhead The maximum to copy into the result.
  168. * @return a reference to result
  169. */
  170. UnicodeString& lookahead(UnicodeString& result, int32_t maxLookAhead = -1) const;
  171. /**
  172. * Advances the position by the given number of 16-bit code units.
  173. * This is useful in conjunction with the lookahead() method.
  174. * @param count the number of 16-bit code units to jump over
  175. */
  176. void jumpahead(int32_t count);
  177. /**
  178. * Returns a string representation of this object, consisting of the
  179. * characters being iterated, with a '|' marking the current position.
  180. * Position within an expanded variable is <em>not</em> indicated.
  181. * @param result output parameter to receive a string
  182. * representation of this object
  183. */
  184. // UnicodeString& toString(UnicodeString& result) const;
  185. private:
  186. /**
  187. * Returns the current 32-bit code point without parsing escapes, parsing
  188. * variables, or skipping whitespace.
  189. * @return the current 32-bit code point
  190. */
  191. UChar32 _current() const;
  192. /**
  193. * Advances the position by the given amount.
  194. * @param count the number of 16-bit code units to advance past
  195. */
  196. void _advance(int32_t count);
  197. };
  198. inline UBool RuleCharacterIterator::inVariable() const {
  199. return buf != 0;
  200. }
  U_NAMESPACE_END

  #endif // _RULEITER_H_
  //eof