rbt_pars.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2011, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/17/99 aliu Creation.
  10. **********************************************************************
  11. */
  12. #ifndef RBT_PARS_H
  13. #define RBT_PARS_H
  14. #include "unicode/utypes.h"
  15. #if !UCONFIG_NO_TRANSLITERATION
  16. #ifdef __cplusplus
  17. #include "unicode/uobject.h"
  18. #include "unicode/parseerr.h"
  19. #include "unicode/unorm.h"
  20. #include "rbt.h"
  21. #include "hash.h"
  22. #include "uvector.h"
  23. U_NAMESPACE_BEGIN
  24. class TransliterationRuleData; // type of TransliteratorParser::curData (held by pointer)
  25. class UnicodeFunctor; // adopted by TransliteratorParser::generateStandInFor()
  26. class ParseData; // type of TransliteratorParser::parseData (held by pointer)
  27. class RuleHalf; // declared a friend of TransliteratorParser
  28. class ParsePosition; // passed by reference to TransliteratorParser::parseSet()
  29. class StringMatcher; // adopted by TransliteratorParser::setSegmentObject()
  30. class TransliteratorParser : public UMemory { // Parses transliterator rules; after parse(), results are read from the public data members.
  31. public:
  32. /**
  33. * PUBLIC data member.
  34. * A Vector of TransliterationRuleData objects, one for each discrete group of rules in the rule set.
  35. */
  36. UVector dataVector;
  37. /**
  38. * PUBLIC data member.
  39. * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
  40. */
  41. UVector idBlockVector;
  42. /**
  43. * PUBLIC data member containing the parsed compound filter, if any. See orphanCompoundFilter().
  44. */
  45. UnicodeSet* compoundFilter;
  46. private:
  47. /**
  48. * The current data object for which we are parsing rules.
  49. */
  50. TransliterationRuleData* curData;
  51. UTransDirection direction; // NOTE(review): presumably the direction passed to parse() -- confirm in the implementation
  52. /**
  53. * Parse error information.
  54. */
  55. UParseError parseError;
  56. /**
  57. * Temporary symbol table used during parsing.
  58. */
  59. ParseData* parseData;
  60. /**
  61. * Temporary vector of matcher variables. When parsing is complete, this
  62. * is copied into the array data.variables. As with data.variables,
  63. * element 0 corresponds to character data.variablesBase.
  64. */
  65. UVector variablesVector;
  66. /**
  67. * Temporary table of variable names. When parsing is complete, this is
  68. * copied into data.variableNames.
  69. */
  70. Hashtable variableNames;
  71. /**
  72. * String of standins for segments. Used during the parsing of a single
  73. * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
  74. * to StringMatcher object segmentObjects.elementAt(0), etc.
  75. */
  76. UnicodeString segmentStandins;
  77. /**
  78. * Vector of StringMatcher objects for segments. Used during the
  79. * parsing of a single rule.
  80. * segmentStandins.charAt(0) is the standin for "$1" and corresponds
  81. * to StringMatcher object segmentObjects.elementAt(0), etc.
  82. */
  83. UVector segmentObjects;
  84. /**
  85. * The next available stand-in for variables. This starts at some point in
  86. * the private use area (discovered dynamically) and increments up toward
  87. * <code>variableLimit</code>. At any point during parsing, available
  88. * variables are <code>variableNext..variableLimit-1</code>.
  89. */
  90. char16_t variableNext;
  91. /**
  92. * The exclusive upper limit of the stand-ins available for variables
  93. * (one past the last available stand-in). This is discovered dynamically.
  94. * At any point during parsing, available variables are <code>variableNext..variableLimit-1</code>.
  95. */
  96. char16_t variableLimit;
  97. /**
  98. * When we encounter an undefined variable, we do not immediately signal
  99. * an error, in case we are defining this variable, e.g., "$a = [a-z];".
  100. * Instead, we save the name of the undefined variable, and substitute
  101. * in the placeholder char variableLimit - 1, and decrement
  102. * variableLimit.
  103. */
  104. UnicodeString undefinedVariableName;
  105. /**
  106. * The stand-in character for the 'dot' set, represented by '.' in
  107. * patterns. This is allocated the first time it is needed, and
  108. * reused thereafter.
  109. */
  110. char16_t dotStandIn;
  111. public:
  112. /**
  113. * Constructor. The error code is reported through statusReturn.
  114. */
  115. TransliteratorParser(UErrorCode &statusReturn);
  116. /**
  117. * Destructor.
  118. */
  119. ~TransliteratorParser();
  120. /**
  121. * Parse the given string as a sequence of rules, separated by newline
  122. * characters ('\n'), and cause this object to implement those rules. Any
  123. * previous rules are discarded. Typically this method is called exactly
  124. * once after construction.
  125. *
  126. * Parse the given rules, in the given direction. After this call
  127. * returns, query the public data members for results. The caller
  128. * owns the 'data' and 'compoundFilter' data members after this
  129. * call returns. NOTE(review): no member is named 'data'; presumably the contents of dataVector -- confirm.
  130. * @param rules rules, separated by ';'
  131. * @param direction either FORWARD or REVERSE.
  132. * @param pe Struct to receive information on position
  133. * of error if an error is encountered
  134. * @param ec Output param set to success/failure code.
  135. */
  136. void parse(const UnicodeString& rules,
  137. UTransDirection direction,
  138. UParseError& pe,
  139. UErrorCode& ec);
  140. /**
  141. * Return the compound filter parsed by parse(). Caller owns result.
  142. * @return the compound filter parsed by parse().
  143. */
  144. UnicodeSet* orphanCompoundFilter();
  145. private:
  146. /**
  147. * Parse the given rules, in the given direction. NOTE(review): presumably the worker behind parse() -- confirm.
  148. * @param rules the rules to parse (read-only input).
  149. * @param direction either FORWARD or REVERSE.
  150. */
  151. void parseRules(const UnicodeString& rules,
  152. UTransDirection direction,
  153. UErrorCode& status);
  154. /**
  155. * MAIN PARSER. Parse the next rule in the given rule string, starting
  156. * at pos. Return the index after the last character parsed. Do not
  157. * parse characters at or after limit.
  158. *
  159. * Important: The character at pos must be a non-whitespace character
  160. * that is not the comment character.
  161. *
  162. * This method handles quoting, escaping, and whitespace removal. It
  163. * parses the end-of-rule character. It recognizes context and cursor
  164. * indicators. Once it does a lexical breakdown of the rule at pos, it
  165. * creates a rule object and adds it to our rule list.
  166. * @param rule the rule string to parse (read-only input).
  167. * @param pos the starting position.
  168. * @param limit index just past the last character of the rule.
  169. * @return the index after the last character parsed.
  170. */
  171. int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
  172. /**
  173. * Set the variable range to [start, end] (inclusive).
  174. * @param start the start value of the range.
  175. * @param end the end value of the range.
  176. */
  177. void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
  178. /**
  179. * Assert that the given character is NOT within the variable range.
  180. * If it is, return false. This is necessary to ensure that the
  181. * variable range does not overlap characters used in a rule.
  182. * @param ch the given character.
  183. * @return True, if the given character is NOT within the variable range.
  184. */
  185. UBool checkVariableRange(UChar32 ch) const;
  186. /**
  187. * Set the maximum backup to 'backup', in response to a pragma
  188. * statement.
  189. * @param backup the new value to be set.
  190. */
  191. void pragmaMaximumBackup(int32_t backup);
  192. /**
  193. * Begin normalizing all rules using the given mode, in response
  194. * to a pragma statement.
  195. * @param mode the given mode.
  196. */
  197. void pragmaNormalizeRules(UNormalizationMode mode);
  198. /**
  199. * Return true if the given rule looks like a pragma.
  200. * @param pos offset to the first non-whitespace character
  201. * of the rule.
  202. * @param limit index just past the last character of the rule.
  203. * @return true if the given rule looks like a pragma.
  204. */
  205. static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
  206. /**
  207. * Parse a pragma. This method assumes resemblesPragma() has
  208. * already returned true.
  209. * @param pos offset to the first non-whitespace character
  210. * of the rule.
  211. * @param limit index just past the last character of the rule.
  212. * @return the position index after the final ';' of the pragma,
  213. * or -1 on failure.
  214. */
  215. int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
  216. /**
  217. * Called by main parser upon syntax error. Search the rule string
  218. * for the probable end of the rule. Of course, if the error is that
  219. * the end of rule marker is missing, then the rule end will not be found.
  220. * In any case the rule start will be correctly reported.
  221. * @param parseErrorCode error code.
  222. * @param msg error description (this parameter is unnamed in the declaration).
  223. * @param start position of first character of current rule.
  224. * @return start position of first character of current rule.
  225. */
  226. int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
  227. UErrorCode& status);
  228. /**
  229. * Parse a UnicodeSet out, store it, and return the stand-in character
  230. * used to represent it.
  231. *
  232. * @param rule the rule for UnicodeSet.
  233. * @param pos the position in pattern at which to start parsing.
  234. * @return the stand-in character used to represent it.
  235. */
  236. char16_t parseSet(const UnicodeString& rule,
  237. ParsePosition& pos,
  238. UErrorCode& status);
  239. /**
  240. * Generate and return a stand-in for a new UnicodeFunctor. Store
  241. * the matcher (adopt it).
  242. * @param adopted the UnicodeFunctor to be adopted.
  243. * @return a stand-in for a new UnicodeFunctor.
  244. */
  245. char16_t generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
  246. /**
  247. * Return the standin for segment seg (1-based).
  248. * @param seg the given segment.
  249. * @return the standIn character for the given segment.
  250. */
  251. char16_t getSegmentStandin(int32_t seg, UErrorCode& status);
  252. /**
  253. * Set the object for segment seg (1-based).
  254. * @param seg the given segment.
  255. * @param adopted the StringMatcher to be adopted.
  256. */
  257. void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
  258. /**
  259. * Return the stand-in for the dot set. It is allocated the first
  260. * time and reused thereafter.
  261. * @return the stand-in for the dot set.
  262. */
  263. char16_t getDotStandIn(UErrorCode& status);
  264. /**
  265. * Append the value of the given variable name to the given
  266. * UnicodeString.
  267. * @param name the variable name to be appended.
  268. * @param buf the given UnicodeString to append to.
  269. */
  270. void appendVariableDef(const UnicodeString& name,
  271. UnicodeString& buf,
  272. UErrorCode& status);
  273. /**
  274. * Glue method to get around access restrictions in C++.
  275. */
  276. /*static Transliterator* createBasicInstance(const UnicodeString& id,
  277. const UnicodeString* canonID);*/
  278. friend class RuleHalf;
  279. // Disallowed methods; declared private with no implementation so the parser cannot be copied.
  280. /**
  281. * Copy constructor (disallowed; no implementation).
  282. */
  283. TransliteratorParser(const TransliteratorParser&);
  284. /**
  285. * Assignment operator (disallowed; no implementation).
  286. */
  287. TransliteratorParser& operator=(const TransliteratorParser&);
  288. };
  289. U_NAMESPACE_END
  290. #endif /* #ifdef __cplusplus */
  291. /**
  292. * Strip/convert the following from the transliterator rules:
  293. * comments
  294. * newlines
  295. * white space at the beginning and end of a line
  296. * unescape \u notation
  297. *
  298. * The target must be equal in size to the source.
  299. * @internal
  300. */
  301. U_CAPI int32_t
  302. utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
  303. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  304. #endif