util_props.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (c) 2001-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/19/2001 aliu Creation.
  10. **********************************************************************
  11. */
  12. #include "unicode/uchar.h"
  13. #include "unicode/utf16.h"
  14. #include "patternprops.h"
  15. #include "util.h"
  16. U_NAMESPACE_BEGIN
  17. /**
  18. * Parse an integer at pos, either of the form \d+ or of the form
  19. * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
  20. * or octal format.
  21. * @param pos INPUT-OUTPUT parameter. On input, the first
  22. * character to parse. On output, the character after the last
  23. * parsed character.
  24. */
  25. int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
  26. int32_t count = 0;
  27. int32_t value = 0;
  28. int32_t p = pos;
  29. int8_t radix = 10;
  30. if (p < limit && rule.charAt(p) == 48 /*0*/) {
  31. if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
  32. p += 2;
  33. radix = 16;
  34. }
  35. else {
  36. p++;
  37. count = 1;
  38. radix = 8;
  39. }
  40. }
  41. while (p < limit) {
  42. int32_t d = u_digit(rule.charAt(p++), radix);
  43. if (d < 0) {
  44. --p;
  45. break;
  46. }
  47. ++count;
  48. int32_t v = (value * radix) + d;
  49. if (v <= value) {
  50. // If there are too many input digits, at some point
  51. // the value will go negative, e.g., if we have seen
  52. // "0x8000000" already and there is another '0', when
  53. // we parse the next 0 the value will go negative.
  54. return 0;
  55. }
  56. value = v;
  57. }
  58. if (count > 0) {
  59. pos = p;
  60. }
  61. return value;
  62. }
  63. /**
  64. * Parse a pattern string starting at offset pos. Keywords are
  65. * matched case-insensitively. Spaces may be skipped and may be
  66. * optional or required. Integer values may be parsed, and if
  67. * they are, they will be returned in the given array. If
  68. * successful, the offset of the next non-space character is
  69. * returned. On failure, -1 is returned.
  70. * @param pattern must only contain lowercase characters, which
  71. * will match their uppercase equivalents as well. A space
  72. * character matches one or more required spaces. A '~' character
  73. * matches zero or more optional spaces. A '#' character matches
  74. * an integer and stores it in parsedInts, which the caller must
  75. * ensure has enough capacity.
  76. * @param parsedInts array to receive parsed integers. Caller
  77. * must ensure that parsedInts.length is >= the number of '#'
  78. * signs in 'pattern'.
  79. * @return the position after the last character parsed, or -1 if
  80. * the parse failed
  81. */
  82. int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
  83. const UnicodeString& pattern, int32_t* parsedInts) {
  84. // TODO Update this to handle surrogates
  85. int32_t p;
  86. int32_t intCount = 0; // number of integers parsed
  87. for (int32_t i=0; i<pattern.length(); ++i) {
  88. char16_t cpat = pattern.charAt(i);
  89. char16_t c;
  90. switch (cpat) {
  91. case 32 /*' '*/:
  92. if (pos >= limit) {
  93. return -1;
  94. }
  95. c = rule.charAt(pos++);
  96. if (!PatternProps::isWhiteSpace(c)) {
  97. return -1;
  98. }
  99. // FALL THROUGH to skipWhitespace
  100. U_FALLTHROUGH;
  101. case 126 /*'~'*/:
  102. pos = skipWhitespace(rule, pos);
  103. break;
  104. case 35 /*'#'*/:
  105. p = pos;
  106. parsedInts[intCount++] = parseInteger(rule, p, limit);
  107. if (p == pos) {
  108. // Syntax error; failed to parse integer
  109. return -1;
  110. }
  111. pos = p;
  112. break;
  113. default:
  114. if (pos >= limit) {
  115. return -1;
  116. }
  117. c = (char16_t) u_tolower(rule.charAt(pos++));
  118. if (c != cpat) {
  119. return -1;
  120. }
  121. break;
  122. }
  123. }
  124. return pos;
  125. }
  126. /**
  127. * Parse a Unicode identifier from the given string at the given
  128. * position. Return the identifier, or an empty string if there
  129. * is no identifier.
  130. * @param str the string to parse
  131. * @param pos INPUT-OUTPUT parameter. On INPUT, pos is the
  132. * first character to examine. It must be less than str.length(),
  133. * and it must not point to a whitespace character. That is, must
  134. * have pos < str.length(). On
  135. * OUTPUT, the position after the last parsed character.
  136. * @return the Unicode identifier, or an empty string if there is
  137. * no valid identifier at pos.
  138. */
  139. UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
  140. // assert(pos < str.length());
  141. UnicodeString buf;
  142. int p = pos;
  143. while (p < str.length()) {
  144. UChar32 ch = str.char32At(p);
  145. if (buf.length() == 0) {
  146. if (u_isIDStart(ch)) {
  147. buf.append(ch);
  148. } else {
  149. buf.truncate(0);
  150. return buf;
  151. }
  152. } else {
  153. if (u_isIDPart(ch)) {
  154. buf.append(ch);
  155. } else {
  156. break;
  157. }
  158. }
  159. p += U16_LENGTH(ch);
  160. }
  161. pos = p;
  162. return buf;
  163. }
  164. /**
  165. * Parse an unsigned 31-bit integer at the given offset. Use
  166. * UCharacter.digit() to parse individual characters into digits.
  167. * @param text the text to be parsed
  168. * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
  169. * offset within text at which to start parsing; it should point
  170. * to a valid digit. On exit, pos[0] is the offset after the last
  171. * parsed character. If the parse failed, it will be unchanged on
  172. * exit. Must be >= 0 on entry.
  173. * @param radix the radix in which to parse; must be >= 2 and <=
  174. * 36.
  175. * @return a non-negative parsed number, or -1 upon parse failure.
  176. * Parse fails if there are no digits, that is, if pos[0] does not
  177. * point to a valid digit on entry, or if the number to be parsed
  178. * does not fit into a 31-bit unsigned integer.
  179. */
  180. int32_t ICU_Utility::parseNumber(const UnicodeString& text,
  181. int32_t& pos, int8_t radix) {
  182. // assert(pos[0] >= 0);
  183. // assert(radix >= 2);
  184. // assert(radix <= 36);
  185. int32_t n = 0;
  186. int32_t p = pos;
  187. while (p < text.length()) {
  188. UChar32 ch = text.char32At(p);
  189. int32_t d = u_digit(ch, radix);
  190. if (d < 0) {
  191. break;
  192. }
  193. n = radix*n + d;
  194. // ASSUME that when a 32-bit integer overflows it becomes
  195. // negative. E.g., 214748364 * 10 + 8 => negative value.
  196. if (n < 0) {
  197. return -1;
  198. }
  199. ++p;
  200. }
  201. if (p == pos) {
  202. return -1;
  203. }
  204. pos = p;
  205. return n;
  206. }
  207. U_NAMESPACE_END