unistr_case.cpp 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: unistr_case.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:2
  14. *
  15. * created on: 2004aug19
  16. * created by: Markus W. Scherer
  17. *
  18. * Case-mapping functions moved here from unistr.cpp
  19. */
  20. #include "unicode/utypes.h"
  21. #include "unicode/brkiter.h"
  22. #include "unicode/casemap.h"
  23. #include "unicode/edits.h"
  24. #include "unicode/putil.h"
  25. #include "cstring.h"
  26. #include "cmemory.h"
  27. #include "unicode/ustring.h"
  28. #include "unicode/unistr.h"
  29. #include "unicode/uchar.h"
  30. #include "uassert.h"
  31. #include "ucasemap_imp.h"
  32. #include "uelement.h"
  33. U_NAMESPACE_BEGIN
  34. //========================================
  35. // Read-only implementation
  36. //========================================
  37. int8_t
  38. UnicodeString::doCaseCompare(int32_t start,
  39. int32_t length,
  40. const char16_t *srcChars,
  41. int32_t srcStart,
  42. int32_t srcLength,
  43. uint32_t options) const
  44. {
  45. // compare illegal string values
  46. // treat const char16_t *srcChars==nullptr as an empty string
  47. if(isBogus()) {
  48. return -1;
  49. }
  50. // pin indices to legal values
  51. pinIndices(start, length);
  52. if(srcChars == nullptr) {
  53. srcStart = srcLength = 0;
  54. }
  55. // get the correct pointer
  56. const char16_t *chars = getArrayStart();
  57. chars += start;
  58. if(srcStart!=0) {
  59. srcChars += srcStart;
  60. }
  61. if(chars != srcChars) {
  62. UErrorCode errorCode=U_ZERO_ERROR;
  63. int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,
  64. options|U_COMPARE_IGNORE_CASE, &errorCode);
  65. if(result!=0) {
  66. return (int8_t)(result >> 24 | 1);
  67. }
  68. } else {
  69. // get the srcLength if necessary
  70. if(srcLength < 0) {
  71. srcLength = u_strlen(srcChars + srcStart);
  72. }
  73. if(length != srcLength) {
  74. return (int8_t)((length - srcLength) >> 24 | 1);
  75. }
  76. }
  77. return 0;
  78. }
  79. //========================================
  80. // Write implementation
  81. //========================================
  82. UnicodeString &
  83. UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  84. UStringCaseMapper *stringCaseMapper) {
  85. if(isEmpty() || !isWritable()) {
  86. // nothing to do
  87. return *this;
  88. }
  89. char16_t oldBuffer[2 * US_STACKBUF_SIZE];
  90. char16_t *oldArray;
  91. int32_t oldLength = length();
  92. int32_t newLength;
  93. UBool writable = isBufferWritable();
  94. UErrorCode errorCode = U_ZERO_ERROR;
  95. #if !UCONFIG_NO_BREAK_ITERATION
  96. // Read-only alias to the original string contents for the titlecasing BreakIterator.
  97. // We cannot set the iterator simply to *this because *this is being modified.
  98. UnicodeString oldString;
  99. #endif
  100. // Try to avoid heap-allocating a new character array for this string.
  101. if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
  102. // Short string: Copy the contents into a temporary buffer and
  103. // case-map back into the current array, or into the stack buffer.
  104. char16_t *buffer = getArrayStart();
  105. int32_t capacity;
  106. oldArray = oldBuffer;
  107. u_memcpy(oldBuffer, buffer, oldLength);
  108. if (writable) {
  109. capacity = getCapacity();
  110. } else {
  111. // Switch from the read-only alias or shared heap buffer to the stack buffer.
  112. if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ false)) {
  113. return *this;
  114. }
  115. U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
  116. buffer = fUnion.fStackFields.fBuffer;
  117. capacity = US_STACKBUF_SIZE;
  118. }
  119. #if !UCONFIG_NO_BREAK_ITERATION
  120. if (iter != nullptr) {
  121. oldString.setTo(false, oldArray, oldLength);
  122. iter->setText(oldString);
  123. }
  124. #endif
  125. newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  126. buffer, capacity,
  127. oldArray, oldLength, nullptr, errorCode);
  128. if (U_SUCCESS(errorCode)) {
  129. setLength(newLength);
  130. return *this;
  131. } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
  132. // common overflow handling below
  133. } else {
  134. setToBogus();
  135. return *this;
  136. }
  137. } else {
  138. // Longer string or read-only buffer:
  139. // Collect only changes and then apply them to this string.
  140. // Case mapping often changes only small parts of a string,
  141. // and often does not change its length.
  142. oldArray = getArrayStart();
  143. Edits edits;
  144. char16_t replacementChars[200];
  145. #if !UCONFIG_NO_BREAK_ITERATION
  146. if (iter != nullptr) {
  147. oldString.setTo(false, oldArray, oldLength);
  148. iter->setText(oldString);
  149. }
  150. #endif
  151. stringCaseMapper(caseLocale, options | U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
  152. replacementChars, UPRV_LENGTHOF(replacementChars),
  153. oldArray, oldLength, &edits, errorCode);
  154. if (U_SUCCESS(errorCode)) {
  155. // Grow the buffer at most once, not for multiple doReplace() calls.
  156. newLength = oldLength + edits.lengthDelta();
  157. if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
  158. return *this;
  159. }
  160. for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
  161. doReplace(ei.destinationIndex(), ei.oldLength(),
  162. replacementChars, ei.replacementIndex(), ei.newLength());
  163. }
  164. if (U_FAILURE(errorCode)) {
  165. setToBogus();
  166. }
  167. return *this;
  168. } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
  169. // common overflow handling below
  170. newLength = oldLength + edits.lengthDelta();
  171. } else {
  172. setToBogus();
  173. return *this;
  174. }
  175. }
  176. // Handle buffer overflow, newLength is known.
  177. // We need to allocate a new buffer for the internal string case mapping function.
  178. // This is very similar to how doReplace() keeps the old array pointer
  179. // and deletes the old array itself after it is done.
  180. // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
  181. int32_t *bufferToDelete = 0;
  182. if (!cloneArrayIfNeeded(newLength, newLength, false, &bufferToDelete, true)) {
  183. return *this;
  184. }
  185. errorCode = U_ZERO_ERROR;
  186. // No need to iter->setText() again: The case mapper restarts via iter->first().
  187. newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  188. getArrayStart(), getCapacity(),
  189. oldArray, oldLength, nullptr, errorCode);
  190. if (bufferToDelete) {
  191. uprv_free(bufferToDelete);
  192. }
  193. if (U_SUCCESS(errorCode)) {
  194. setLength(newLength);
  195. } else {
  196. setToBogus();
  197. }
  198. return *this;
  199. }
  200. UnicodeString &
  201. UnicodeString::foldCase(uint32_t options) {
  202. return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
  203. }
  204. U_NAMESPACE_END
  205. // Defined here to reduce dependencies on break iterator
  206. U_CAPI int32_t U_EXPORT2
  207. uhash_hashCaselessUnicodeString(const UElement key) {
  208. U_NAMESPACE_USE
  209. const UnicodeString *str = (const UnicodeString*) key.pointer;
  210. if (str == nullptr) {
  211. return 0;
  212. }
  213. // Inefficient; a better way would be to have a hash function in
  214. // UnicodeString that does case folding on the fly.
  215. UnicodeString copy(*str);
  216. return copy.foldCase().hashCode();
  217. }
  218. // Defined here to reduce dependencies on break iterator
  219. U_CAPI UBool U_EXPORT2
  220. uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) {
  221. U_NAMESPACE_USE
  222. const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
  223. const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
  224. if (str1 == str2) {
  225. return true;
  226. }
  227. if (str1 == nullptr || str2 == nullptr) {
  228. return false;
  229. }
  230. return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0;
  231. }