unistr.cpp 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 1999-2016, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. ******************************************************************************
  8. *
  9. * File unistr.cpp
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 09/25/98 stephen Creation.
  15. * 04/20/99 stephen Overhauled per 4/16 code review.
  16. * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  17. * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
  18. * Replaceable.
  19. * 06/25/01 grhoten Removed the dependency on iostream
  20. ******************************************************************************
  21. */
  22. #include <string_view>
  23. #include "unicode/utypes.h"
  24. #include "unicode/appendable.h"
  25. #include "unicode/putil.h"
  26. #include "cstring.h"
  27. #include "cmemory.h"
  28. #include "unicode/ustring.h"
  29. #include "unicode/unistr.h"
  30. #include "unicode/utf.h"
  31. #include "unicode/utf16.h"
  32. #include "uelement.h"
  33. #include "ustr_imp.h"
  34. #include "umutex.h"
  35. #include "uassert.h"
  #if 0
  // Debugging helpers; compiled out unless the condition above is changed.
  #include <iostream>

  // Writes `name`, then the code units of `s` between '|' delimiters, to stdout.
  // Code units in [0x20, 0x7E) are written as characters, all others as "[0x<hex>]".
  void
  print(const UnicodeString& s,
        const char *name)
  {
    std::cout << name << ":|";
    for(int32_t i = 0; i < s.length(); ++i) {
      const char16_t c = s[i];
      if(c >= 0x007E || c < 0x0020) {
        // Widen explicitly: inserting a char16_t into a narrow stream is
        // ill-formed from C++20 on. Reset the base so hex does not leak
        // into later output.
        std::cout << "[0x" << std::hex << static_cast<unsigned>(c) << std::dec << "]";
      } else {
        std::cout << static_cast<char>(c);
      }
    }
    // endl on purpose: flush so the output survives a subsequent crash.
    std::cout << '|' << std::endl;
  }

  // Same as above for a raw buffer `s` holding `len` code units.
  void
  print(const char16_t *s,
        int32_t len,
        const char *name)
  {
    std::cout << name << ":|";
    for(int32_t i = 0; i < len; ++i) {
      const char16_t c = s[i];
      if(c >= 0x007E || c < 0x0020) {
        // See the overload above for why the value is widened and the base reset.
        std::cout << "[0x" << std::hex << static_cast<unsigned>(c) << std::dec << "]";
      } else {
        std::cout << static_cast<char>(c);
      }
    }
    std::cout << '|' << std::endl;
  }
  // END DEBUGGING
  #endif
  73. // Local function definitions for now
  74. // need to copy areas that may overlap
  75. static
  76. inline void
  77. us_arrayCopy(const char16_t *src, int32_t srcStart,
  78. char16_t *dst, int32_t dstStart, int32_t count)
  79. {
  80. if(count>0) {
  81. uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
  82. }
  83. }
  84. // u_unescapeAt() callback to get a char16_t from a UnicodeString
  85. U_CDECL_BEGIN
  86. static char16_t U_CALLCONV
  87. UnicodeString_charAt(int32_t offset, void *context) {
  88. return ((icu::UnicodeString*) context)->charAt(offset);
  89. }
  90. U_CDECL_END
  91. U_NAMESPACE_BEGIN
  92. /* The Replaceable virtual destructor can't be defined in the header
  93. due to how AIX works with multiple definitions of virtual functions.
  94. */
  95. Replaceable::~Replaceable() {}
  96. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
  97. UnicodeString U_EXPORT2
  98. operator+ (const UnicodeString &s1, const UnicodeString &s2) {
  99. int32_t sumLengths;
  100. if (uprv_add32_overflow(s1.length(), s2.length(), &sumLengths)) {
  101. UnicodeString bogus;
  102. bogus.setToBogus();
  103. return bogus;
  104. }
  105. if (sumLengths != INT32_MAX) {
  106. ++sumLengths; // space for a terminating NUL if we need one
  107. }
  108. return UnicodeString(sumLengths, static_cast<UChar32>(0), 0).append(s1).append(s2);
  109. }
  110. U_COMMON_API UnicodeString U_EXPORT2
  111. unistr_internalConcat(const UnicodeString &s1, std::u16string_view s2) {
  112. int32_t sumLengths;
  113. if (s2.length() > INT32_MAX ||
  114. uprv_add32_overflow(s1.length(), static_cast<int32_t>(s2.length()), &sumLengths)) {
  115. UnicodeString bogus;
  116. bogus.setToBogus();
  117. return bogus;
  118. }
  119. if (sumLengths != INT32_MAX) {
  120. ++sumLengths; // space for a terminating NUL if we need one
  121. }
  122. return UnicodeString(sumLengths, static_cast<UChar32>(0), 0).append(s1).append(s2);
  123. }
  124. //========================================
  125. // Reference Counting functions, put at top of file so that optimizing compilers
  126. // have a chance to automatically inline.
  127. //========================================
  128. void
  129. UnicodeString::addRef() {
  130. umtx_atomic_inc(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1);
  131. }
  132. int32_t
  133. UnicodeString::removeRef() {
  134. return umtx_atomic_dec(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1);
  135. }
  136. int32_t
  137. UnicodeString::refCount() const {
  138. return umtx_loadAcquire(*(reinterpret_cast<u_atomic_int32_t*>(fUnion.fFields.fArray) - 1));
  139. }
  140. void
  141. UnicodeString::releaseArray() {
  142. if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
  143. uprv_free(reinterpret_cast<int32_t*>(fUnion.fFields.fArray) - 1);
  144. }
  145. }
  146. //========================================
  147. // Constructors
  148. //========================================
  149. // The default constructor is inline in unistr.h.
  150. UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
  151. fUnion.fFields.fLengthAndFlags = 0;
  152. if (count <= 0 || static_cast<uint32_t>(c) > 0x10ffff) {
  153. // just allocate and do not do anything else
  154. allocate(capacity);
  155. } else if(c <= 0xffff) {
  156. int32_t length = count;
  157. if(capacity < length) {
  158. capacity = length;
  159. }
  160. if(allocate(capacity)) {
  161. char16_t *array = getArrayStart();
  162. char16_t unit = static_cast<char16_t>(c);
  163. for(int32_t i = 0; i < length; ++i) {
  164. array[i] = unit;
  165. }
  166. setLength(length);
  167. }
  168. } else { // supplementary code point, write surrogate pairs
  169. if(count > (INT32_MAX / 2)) {
  170. // We would get more than 2G UChars.
  171. allocate(capacity);
  172. return;
  173. }
  174. int32_t length = count * 2;
  175. if(capacity < length) {
  176. capacity = length;
  177. }
  178. if(allocate(capacity)) {
  179. char16_t *array = getArrayStart();
  180. char16_t lead = U16_LEAD(c);
  181. char16_t trail = U16_TRAIL(c);
  182. for(int32_t i = 0; i < length; i += 2) {
  183. array[i] = lead;
  184. array[i + 1] = trail;
  185. }
  186. setLength(length);
  187. }
  188. }
  189. }
  190. UnicodeString::UnicodeString(char16_t ch) {
  191. fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
  192. fUnion.fStackFields.fBuffer[0] = ch;
  193. }
  194. UnicodeString::UnicodeString(UChar32 ch) {
  195. fUnion.fFields.fLengthAndFlags = kShortString;
  196. int32_t i = 0;
  197. UBool isError = false;
  198. U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
  199. // We test isError so that the compiler does not complain that we don't.
  200. // If isError then i==0 which is what we want anyway.
  201. if(!isError) {
  202. setShortLength(i);
  203. }
  204. }
  205. UnicodeString::UnicodeString(const char16_t *text,
  206. int32_t textLength) {
  207. fUnion.fFields.fLengthAndFlags = kShortString;
  208. doAppend(text, 0, textLength);
  209. }
  210. UnicodeString::UnicodeString(UBool isTerminated,
  211. ConstChar16Ptr textPtr,
  212. int32_t textLength) {
  213. fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
  214. const char16_t *text = textPtr;
  215. if(text == nullptr) {
  216. // treat as an empty string, do not alias
  217. setToEmpty();
  218. } else if(textLength < -1 ||
  219. (textLength == -1 && !isTerminated) ||
  220. (textLength >= 0 && isTerminated && text[textLength] != 0)
  221. ) {
  222. setToBogus();
  223. } else {
  224. if(textLength == -1) {
  225. // text is terminated, or else it would have failed the above test
  226. textLength = u_strlen(text);
  227. }
  228. setArray(const_cast<char16_t *>(text), textLength,
  229. isTerminated ? textLength + 1 : textLength);
  230. }
  231. }
  232. UnicodeString::UnicodeString(char16_t *buff,
  233. int32_t buffLength,
  234. int32_t buffCapacity) {
  235. fUnion.fFields.fLengthAndFlags = kWritableAlias;
  236. if(buff == nullptr) {
  237. // treat as an empty string, do not alias
  238. setToEmpty();
  239. } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
  240. setToBogus();
  241. } else {
  242. if(buffLength == -1) {
  243. // fLength = u_strlen(buff); but do not look beyond buffCapacity
  244. const char16_t *p = buff, *limit = buff + buffCapacity;
  245. while(p != limit && *p != 0) {
  246. ++p;
  247. }
  248. buffLength = static_cast<int32_t>(p - buff);
  249. }
  250. setArray(buff, buffLength, buffCapacity);
  251. }
  252. }
  253. UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
  254. fUnion.fFields.fLengthAndFlags = kShortString;
  255. if(src==nullptr) {
  256. // treat as an empty string
  257. } else {
  258. if(length<0) {
  259. length = static_cast<int32_t>(uprv_strlen(src));
  260. }
  261. if(cloneArrayIfNeeded(length, length, false)) {
  262. u_charsToUChars(src, getArrayStart(), length);
  263. setLength(length);
  264. } else {
  265. setToBogus();
  266. }
  267. }
  268. }
  269. UnicodeString UnicodeString::readOnlyAliasFromU16StringView(std::u16string_view text) {
  270. UnicodeString result;
  271. if (text.length() <= INT32_MAX) {
  272. result.setTo(false, text.data(), static_cast<int32_t>(text.length()));
  273. } else {
  274. result.setToBogus();
  275. }
  276. return result;
  277. }
  278. UnicodeString UnicodeString::readOnlyAliasFromUnicodeString(const UnicodeString &text) {
  279. UnicodeString result;
  280. if (text.isBogus()) {
  281. result.setToBogus();
  282. } else {
  283. result.setTo(false, text.getBuffer(), text.length());
  284. }
  285. return result;
  286. }
  287. #if U_CHARSET_IS_UTF8
  288. UnicodeString::UnicodeString(const char *codepageData) {
  289. fUnion.fFields.fLengthAndFlags = kShortString;
  290. if (codepageData != nullptr) {
  291. setToUTF8(codepageData);
  292. }
  293. }
  294. UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
  295. fUnion.fFields.fLengthAndFlags = kShortString;
  296. // if there's nothing to convert, do nothing
  297. if (codepageData == nullptr || dataLength == 0 || dataLength < -1) {
  298. return;
  299. }
  300. if(dataLength == -1) {
  301. dataLength = static_cast<int32_t>(uprv_strlen(codepageData));
  302. }
  303. setToUTF8(StringPiece(codepageData, dataLength));
  304. }
  305. // else see unistr_cnv.cpp
  306. #endif
  307. UnicodeString::UnicodeString(const UnicodeString& that) {
  308. fUnion.fFields.fLengthAndFlags = kShortString;
  309. copyFrom(that);
  310. }
  311. UnicodeString::UnicodeString(UnicodeString &&src) noexcept {
  312. copyFieldsFrom(src, true);
  313. }
  314. UnicodeString::UnicodeString(const UnicodeString& that,
  315. int32_t srcStart) {
  316. fUnion.fFields.fLengthAndFlags = kShortString;
  317. setTo(that, srcStart);
  318. }
  319. UnicodeString::UnicodeString(const UnicodeString& that,
  320. int32_t srcStart,
  321. int32_t srcLength) {
  322. fUnion.fFields.fLengthAndFlags = kShortString;
  323. setTo(that, srcStart, srcLength);
  324. }
  325. // Replaceable base class clone() default implementation, does not clone
  326. Replaceable *
  327. Replaceable::clone() const {
  328. return nullptr;
  329. }
  330. // UnicodeString overrides clone() with a real implementation
  331. UnicodeString *
  332. UnicodeString::clone() const {
  333. LocalPointer<UnicodeString> clonedString(new UnicodeString(*this));
  334. return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr;
  335. }
  336. //========================================
  337. // array allocation
  338. //========================================
  namespace {

  // Fixed part of the extra room added when a string grows.
  constexpr int32_t kGrowSize = 128;

  // The number of bytes for one int32_t reference counter and capacity UChars
  // must fit into a 32-bit size_t (at least when on a 32-bit platform).
  // One more unit is added for the NUL terminator, to avoid reallocation in
  // getTerminatedBuffer(), and the total is rounded up to a multiple of 16 bytes.
  // Hence capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
  // (More complicated checks could allow up to 0x7ffffffd without rounding up,
  // but that does not seem worth it.)
  constexpr int32_t kMaxCapacity = 0x7ffffff5;

  // Returns the capacity to allocate for a string of newLength units:
  // newLength plus a quarter of it plus kGrowSize, capped at kMaxCapacity.
  int32_t getGrowCapacity(int32_t newLength) {
    const int32_t extra = (newLength >> 2) + kGrowSize;
    // Compare against the remaining headroom so that the sum cannot overflow.
    const int32_t headroom = kMaxCapacity - newLength;
    return extra <= headroom ? newLength + extra : kMaxCapacity;
  }

  }  // namespace
  358. UBool
  359. UnicodeString::allocate(int32_t capacity) {
  360. if(capacity <= US_STACKBUF_SIZE) {
  361. fUnion.fFields.fLengthAndFlags = kShortString;
  362. return true;
  363. }
  364. if(capacity <= kMaxCapacity) {
  365. ++capacity; // for the NUL
  366. // Switch to size_t which is unsigned so that we can allocate up to 4GB.
  367. // Reference counter + UChars.
  368. size_t numBytes = sizeof(int32_t) + static_cast<size_t>(capacity) * U_SIZEOF_UCHAR;
  369. // Round up to a multiple of 16.
  370. numBytes = (numBytes + 15) & ~15;
  371. int32_t* array = static_cast<int32_t*>(uprv_malloc(numBytes));
  372. if(array != nullptr) {
  373. // set initial refCount and point behind the refCount
  374. *array++ = 1;
  375. numBytes -= sizeof(int32_t);
  376. // have fArray point to the first char16_t
  377. fUnion.fFields.fArray = reinterpret_cast<char16_t*>(array);
  378. fUnion.fFields.fCapacity = static_cast<int32_t>(numBytes / U_SIZEOF_UCHAR);
  379. fUnion.fFields.fLengthAndFlags = kLongString;
  380. return true;
  381. }
  382. }
  383. fUnion.fFields.fLengthAndFlags = kIsBogus;
  384. fUnion.fFields.fArray = nullptr;
  385. fUnion.fFields.fCapacity = 0;
  386. return false;
  387. }
  388. //========================================
  389. // Destructor
  390. //========================================
  #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
  // Statistics filled in by the destructor: one counter per short length,
  // plus one counter for strings without a short length.
  static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
  static u_atomic_int32_t beyondCount(0);

  // Prints one line per length 0..59 and a single ">59" line that sums all
  // remaining counters together with beyondCount.
  U_CAPI void unistr_printLengths() {
    const int32_t lastDetailedLength = 59;
    for(int32_t len = 0; len <= lastDetailedLength; ++len) {
      printf("%2d, %9d\n", len, (int32_t)finalLengthCounts[len]);
    }
    int32_t beyond = beyondCount;
    for(int32_t len = lastDetailedLength + 1; len < UPRV_LENGTHOF(finalLengthCounts); ++len) {
      beyond += finalLengthCounts[len];
    }
    printf(">59, %9d\n", beyond);
  }
  #endif
  406. UnicodeString::~UnicodeString()
  407. {
  408. #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
  409. // Count lengths of strings at the end of their lifetime.
  410. // Useful for discussion of a desirable stack buffer size.
  411. // Count the contents length, not the optional NUL terminator nor further capacity.
  412. // Ignore open-buffer strings and strings which alias external storage.
  413. if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
  414. if(hasShortLength()) {
  415. umtx_atomic_inc(finalLengthCounts + getShortLength());
  416. } else {
  417. umtx_atomic_inc(&beyondCount);
  418. }
  419. }
  420. #endif
  421. releaseArray();
  422. }
  423. //========================================
  424. // Factory methods
  425. //========================================
  426. UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
  427. UnicodeString result;
  428. result.setToUTF8(utf8);
  429. return result;
  430. }
  431. UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
  432. UnicodeString result;
  433. int32_t capacity;
  434. // Most UTF-32 strings will be BMP-only and result in a same-length
  435. // UTF-16 string. We overestimate the capacity just slightly,
  436. // just in case there are a few supplementary characters.
  437. if(length <= US_STACKBUF_SIZE) {
  438. capacity = US_STACKBUF_SIZE;
  439. } else {
  440. capacity = length + (length >> 4) + 4;
  441. }
  442. do {
  443. char16_t *utf16 = result.getBuffer(capacity);
  444. int32_t length16;
  445. UErrorCode errorCode = U_ZERO_ERROR;
  446. u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
  447. utf32, length,
  448. 0xfffd, // Substitution character.
  449. nullptr, // Don't care about number of substitutions.
  450. &errorCode);
  451. result.releaseBuffer(length16);
  452. if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
  453. capacity = length16 + 1; // +1 for the terminating NUL.
  454. continue;
  455. } else if(U_FAILURE(errorCode)) {
  456. result.setToBogus();
  457. }
  458. break;
  459. } while(true);
  460. return result;
  461. }
  462. //========================================
  463. // Assignment
  464. //========================================
  465. UnicodeString &
  466. UnicodeString::operator=(const UnicodeString &src) {
  467. return copyFrom(src);
  468. }
  469. UnicodeString &
  470. UnicodeString::fastCopyFrom(const UnicodeString &src) {
  471. return copyFrom(src, true);
  472. }
  473. UnicodeString &
  474. UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
  475. // if assigning to ourselves, do nothing
  476. if(this == &src) {
  477. return *this;
  478. }
  479. // is the right side bogus?
  480. if(src.isBogus()) {
  481. setToBogus();
  482. return *this;
  483. }
  484. // delete the current contents
  485. releaseArray();
  486. if(src.isEmpty()) {
  487. // empty string - use the stack buffer
  488. setToEmpty();
  489. return *this;
  490. }
  491. // fLength>0 and not an "open" src.getBuffer(minCapacity)
  492. fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
  493. switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
  494. case kShortString:
  495. // short string using the stack buffer, do the same
  496. uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
  497. getShortLength() * U_SIZEOF_UCHAR);
  498. break;
  499. case kLongString:
  500. // src uses a refCounted string buffer, use that buffer with refCount
  501. // src is const, use a cast - we don't actually change it
  502. const_cast<UnicodeString &>(src).addRef();
  503. // copy all fields, share the reference-counted buffer
  504. fUnion.fFields.fArray = src.fUnion.fFields.fArray;
  505. fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
  506. if(!hasShortLength()) {
  507. fUnion.fFields.fLength = src.fUnion.fFields.fLength;
  508. }
  509. break;
  510. case kReadonlyAlias:
  511. if(fastCopy) {
  512. // src is a readonly alias, do the same
  513. // -> maintain the readonly alias as such
  514. fUnion.fFields.fArray = src.fUnion.fFields.fArray;
  515. fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
  516. if(!hasShortLength()) {
  517. fUnion.fFields.fLength = src.fUnion.fFields.fLength;
  518. }
  519. break;
  520. }
  521. // else if(!fastCopy) fall through to case kWritableAlias
  522. // -> allocate a new buffer and copy the contents
  523. U_FALLTHROUGH;
  524. case kWritableAlias: {
  525. // src is a writable alias; we make a copy of that instead
  526. int32_t srcLength = src.length();
  527. if(allocate(srcLength)) {
  528. u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
  529. setLength(srcLength);
  530. break;
  531. }
  532. // if there is not enough memory, then fall through to setting to bogus
  533. U_FALLTHROUGH;
  534. }
  535. default:
  536. // if src is bogus, set ourselves to bogus
  537. // do not call setToBogus() here because fArray and flags are not consistent here
  538. fUnion.fFields.fLengthAndFlags = kIsBogus;
  539. fUnion.fFields.fArray = nullptr;
  540. fUnion.fFields.fCapacity = 0;
  541. break;
  542. }
  543. return *this;
  544. }
  545. UnicodeString &UnicodeString::operator=(UnicodeString &&src) noexcept {
  546. // No explicit check for self move assignment, consistent with standard library.
  547. // Self move assignment causes no crash nor leak but might make the object bogus.
  548. releaseArray();
  549. copyFieldsFrom(src, true);
  550. return *this;
  551. }
  552. // Same as move assignment except without memory management.
  553. void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) noexcept {
  554. int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
  555. if(lengthAndFlags & kUsingStackBuffer) {
  556. // Short string using the stack buffer, copy the contents.
  557. // Check for self assignment to prevent "overlap in memcpy" warnings,
  558. // although it should be harmless to copy a buffer to itself exactly.
  559. if(this != &src) {
  560. uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
  561. getShortLength() * U_SIZEOF_UCHAR);
  562. }
  563. } else {
  564. // In all other cases, copy all fields.
  565. fUnion.fFields.fArray = src.fUnion.fFields.fArray;
  566. fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
  567. if(!hasShortLength()) {
  568. fUnion.fFields.fLength = src.fUnion.fFields.fLength;
  569. }
  570. if(setSrcToBogus) {
  571. // Set src to bogus without releasing any memory.
  572. src.fUnion.fFields.fLengthAndFlags = kIsBogus;
  573. src.fUnion.fFields.fArray = nullptr;
  574. src.fUnion.fFields.fCapacity = 0;
  575. }
  576. }
  577. }
  578. void UnicodeString::swap(UnicodeString &other) noexcept {
  579. UnicodeString temp; // Empty short string: Known not to need releaseArray().
  580. // Copy fields without resetting source values in between.
  581. temp.copyFieldsFrom(*this, false);
  582. this->copyFieldsFrom(other, false);
  583. other.copyFieldsFrom(temp, false);
  584. // Set temp to an empty string so that other's memory is not released twice.
  585. temp.fUnion.fFields.fLengthAndFlags = kShortString;
  586. }
  587. //========================================
  588. // Miscellaneous operations
  589. //========================================
  590. UnicodeString UnicodeString::unescape() const {
  591. UnicodeString result(length(), static_cast<UChar32>(0), static_cast<int32_t>(0)); // construct with capacity
  592. if (result.isBogus()) {
  593. return result;
  594. }
  595. const char16_t *array = getBuffer();
  596. int32_t len = length();
  597. int32_t prev = 0;
  598. for (int32_t i=0;;) {
  599. if (i == len) {
  600. result.append(array, prev, len - prev);
  601. break;
  602. }
  603. if (array[i++] == 0x5C /*'\\'*/) {
  604. result.append(array, prev, (i - 1) - prev);
  605. UChar32 c = unescapeAt(i); // advances i
  606. if (c < 0) {
  607. result.remove(); // return empty string
  608. break; // invalid escape sequence
  609. }
  610. result.append(c);
  611. prev = i;
  612. }
  613. }
  614. return result;
  615. }
  616. UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
  617. return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
  618. }
  619. //========================================
  620. // Read-only implementation
  621. //========================================
  622. UBool
  623. UnicodeString::doEquals(const char16_t *text, int32_t len) const {
  624. // Requires: this not bogus and have same lengths.
  625. // Byte-wise comparison works for equality regardless of endianness.
  626. return uprv_memcmp(getArrayStart(), text, len * U_SIZEOF_UCHAR) == 0;
  627. }
  628. UBool
  629. UnicodeString::doEqualsSubstring( int32_t start,
  630. int32_t length,
  631. const char16_t *srcChars,
  632. int32_t srcStart,
  633. int32_t srcLength) const
  634. {
  635. // compare illegal string values
  636. if(isBogus()) {
  637. return false;
  638. }
  639. // pin indices to legal values
  640. pinIndices(start, length);
  641. if(srcChars == nullptr) {
  642. // treat const char16_t *srcChars==nullptr as an empty string
  643. return length == 0 ? true : false;
  644. }
  645. // get the correct pointer
  646. const char16_t *chars = getArrayStart();
  647. chars += start;
  648. srcChars += srcStart;
  649. // get the srcLength if necessary
  650. if(srcLength < 0) {
  651. srcLength = u_strlen(srcChars + srcStart);
  652. }
  653. if (length != srcLength) {
  654. return false;
  655. }
  656. if(length == 0 || chars == srcChars) {
  657. return true;
  658. }
  659. return u_memcmp(chars, srcChars, srcLength) == 0;
  660. }
  661. int8_t
  662. UnicodeString::doCompare( int32_t start,
  663. int32_t length,
  664. const char16_t *srcChars,
  665. int32_t srcStart,
  666. int32_t srcLength) const
  667. {
  668. // compare illegal string values
  669. if(isBogus()) {
  670. return -1;
  671. }
  672. // pin indices to legal values
  673. pinIndices(start, length);
  674. if(srcChars == nullptr) {
  675. // treat const char16_t *srcChars==nullptr as an empty string
  676. return length == 0 ? 0 : 1;
  677. }
  678. // get the correct pointer
  679. const char16_t *chars = getArrayStart();
  680. chars += start;
  681. srcChars += srcStart;
  682. int32_t minLength;
  683. int8_t lengthResult;
  684. // get the srcLength if necessary
  685. if(srcLength < 0) {
  686. srcLength = u_strlen(srcChars + srcStart);
  687. }
  688. // are we comparing different lengths?
  689. if(length != srcLength) {
  690. if(length < srcLength) {
  691. minLength = length;
  692. lengthResult = -1;
  693. } else {
  694. minLength = srcLength;
  695. lengthResult = 1;
  696. }
  697. } else {
  698. minLength = length;
  699. lengthResult = 0;
  700. }
  701. /*
  702. * note that uprv_memcmp() returns an int but we return an int8_t;
  703. * we need to take care not to truncate the result -
  704. * one way to do this is to right-shift the value to
  705. * move the sign bit into the lower 8 bits and making sure that this
  706. * does not become 0 itself
  707. */
  708. if(minLength > 0 && chars != srcChars) {
  709. int32_t result;
  710. # if U_IS_BIG_ENDIAN
  711. // big-endian: byte comparison works
  712. result = uprv_memcmp(chars, srcChars, minLength * sizeof(char16_t));
  713. if(result != 0) {
  714. return (int8_t)(result >> 15 | 1);
  715. }
  716. # else
  717. // little-endian: compare char16_t units
  718. do {
  719. result = static_cast<int32_t>(*(chars++)) - static_cast<int32_t>(*(srcChars++));
  720. if(result != 0) {
  721. return static_cast<int8_t>(result >> 15 | 1);
  722. }
  723. } while(--minLength > 0);
  724. # endif
  725. }
  726. return lengthResult;
  727. }
  728. /* String compare in code point order - doCompare() compares in code unit order. */
  729. int8_t
  730. UnicodeString::doCompareCodePointOrder(int32_t start,
  731. int32_t length,
  732. const char16_t *srcChars,
  733. int32_t srcStart,
  734. int32_t srcLength) const
  735. {
  736. // compare illegal string values
  737. // treat const char16_t *srcChars==nullptr as an empty string
  738. if(isBogus()) {
  739. return -1;
  740. }
  741. // pin indices to legal values
  742. pinIndices(start, length);
  743. if(srcChars == nullptr) {
  744. srcStart = srcLength = 0;
  745. }
  746. int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=nullptr)?(srcChars + srcStart):nullptr, srcLength, false, true);
  747. /* translate the 32-bit result into an 8-bit one */
  748. if(diff!=0) {
  749. return static_cast<int8_t>(diff >> 15 | 1);
  750. } else {
  751. return 0;
  752. }
  753. }
  754. int32_t
  755. UnicodeString::getLength() const {
  756. return length();
  757. }
  758. char16_t
  759. UnicodeString::getCharAt(int32_t offset) const {
  760. return charAt(offset);
  761. }
  762. UChar32
  763. UnicodeString::getChar32At(int32_t offset) const {
  764. return char32At(offset);
  765. }
  766. UChar32
  767. UnicodeString::char32At(int32_t offset) const
  768. {
  769. int32_t len = length();
  770. if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(len)) {
  771. const char16_t *array = getArrayStart();
  772. UChar32 c;
  773. U16_GET(array, 0, offset, len, c);
  774. return c;
  775. } else {
  776. return kInvalidUChar;
  777. }
  778. }
  779. int32_t
  780. UnicodeString::getChar32Start(int32_t offset) const {
  781. if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(length())) {
  782. const char16_t *array = getArrayStart();
  783. U16_SET_CP_START(array, 0, offset);
  784. return offset;
  785. } else {
  786. return 0;
  787. }
  788. }
  789. int32_t
  790. UnicodeString::getChar32Limit(int32_t offset) const {
  791. int32_t len = length();
  792. if (static_cast<uint32_t>(offset) < static_cast<uint32_t>(len)) {
  793. const char16_t *array = getArrayStart();
  794. U16_SET_CP_LIMIT(array, 0, offset, len);
  795. return offset;
  796. } else {
  797. return len;
  798. }
  799. }
  800. int32_t
  801. UnicodeString::countChar32(int32_t start, int32_t length) const {
  802. pinIndices(start, length);
  803. // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for nullptr
  804. return u_countChar32(getArrayStart()+start, length);
  805. }
  806. UBool
  807. UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
  808. pinIndices(start, length);
  809. // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for nullptr
  810. return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
  811. }
  812. int32_t
  813. UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
  814. // pin index
  815. int32_t len = length();
  816. if(index<0) {
  817. index=0;
  818. } else if(index>len) {
  819. index=len;
  820. }
  821. const char16_t *array = getArrayStart();
  822. if(delta>0) {
  823. U16_FWD_N(array, index, len, delta);
  824. } else {
  825. U16_BACK_N(array, 0, index, -delta);
  826. }
  827. return index;
  828. }
  829. void
  830. UnicodeString::doExtract(int32_t start,
  831. int32_t length,
  832. char16_t *dst,
  833. int32_t dstStart) const
  834. {
  835. // pin indices to legal values
  836. pinIndices(start, length);
  837. // do not copy anything if we alias dst itself
  838. const char16_t *array = getArrayStart();
  839. if(array + start != dst + dstStart) {
  840. us_arrayCopy(array, start, dst, dstStart, length);
  841. }
  842. }
  843. int32_t
  844. UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
  845. UErrorCode &errorCode) const {
  846. int32_t len = length();
  847. if(U_SUCCESS(errorCode)) {
  848. if (isBogus() || destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) {
  849. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  850. } else {
  851. const char16_t *array = getArrayStart();
  852. if(len>0 && len<=destCapacity && array!=dest) {
  853. u_memcpy(dest, array, len);
  854. }
  855. return u_terminateUChars(dest, destCapacity, len, &errorCode);
  856. }
  857. }
  858. return len;
  859. }
  860. int32_t
  861. UnicodeString::extract(int32_t start,
  862. int32_t length,
  863. char *target,
  864. int32_t targetCapacity,
  865. enum EInvariant) const
  866. {
  867. // if the arguments are illegal, then do nothing
  868. if(targetCapacity < 0 || (targetCapacity > 0 && target == nullptr)) {
  869. return 0;
  870. }
  871. // pin the indices to legal values
  872. pinIndices(start, length);
  873. if(length <= targetCapacity) {
  874. u_UCharsToChars(getArrayStart() + start, target, length);
  875. }
  876. UErrorCode status = U_ZERO_ERROR;
  877. return u_terminateChars(target, targetCapacity, length, &status);
  878. }
  879. UnicodeString
  880. UnicodeString::tempSubString(int32_t start, int32_t len) const {
  881. pinIndices(start, len);
  882. const char16_t *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
  883. if(array==nullptr) {
  884. array=fUnion.fStackFields.fBuffer; // anything not nullptr because that would make an empty string
  885. len=-2; // bogus result string
  886. }
  887. return UnicodeString(false, array + start, len);
  888. }
  889. int32_t
  890. UnicodeString::toUTF8(int32_t start, int32_t len,
  891. char *target, int32_t capacity) const {
  892. pinIndices(start, len);
  893. int32_t length8;
  894. UErrorCode errorCode = U_ZERO_ERROR;
  895. u_strToUTF8WithSub(target, capacity, &length8,
  896. getBuffer() + start, len,
  897. 0xFFFD, // Standard substitution character.
  898. nullptr, // Don't care about number of substitutions.
  899. &errorCode);
  900. return length8;
  901. }
  #if U_CHARSET_IS_UTF8

  // Forwards to toUTF8() after converting the unsigned dstSize into the
  // signed capacity that toUTF8() takes. Returns 0 for illegal arguments.
  int32_t
  UnicodeString::extract(int32_t start, int32_t len,
                         char *target, uint32_t dstSize) const {
    // if the arguments are illegal, then do nothing
    // (dstSize is unsigned, so there is no "dstSize < 0" case to reject)
    if (dstSize > 0 && target == nullptr) {
      return 0;
    }

    // Cap sizes above 0x7fffffff at 0x7fffffff.
    int32_t capacity = 0x7fffffff;
    if (dstSize <= 0x7fffffff) {
      capacity = static_cast<int32_t>(dstSize);
    }
    return toUTF8(start, len, target, capacity);
  }

  // else see unistr_cnv.cpp
  #endif
  914. void
  915. UnicodeString::extractBetween(int32_t start,
  916. int32_t limit,
  917. UnicodeString& target) const {
  918. pinIndex(start);
  919. pinIndex(limit);
  920. doExtract(start, limit - start, target);
  921. }
  922. // When converting from UTF-16 to UTF-8, the result will have at most 3 times
  923. // as many bytes as the source has UChars.
  924. // The "worst cases" are writing systems like Indic, Thai and CJK with
  925. // 3:1 bytes:UChars.
  926. void
  927. UnicodeString::toUTF8(ByteSink &sink) const {
  928. int32_t length16 = length();
  929. if(length16 != 0) {
  930. char stackBuffer[1024];
  931. int32_t capacity = static_cast<int32_t>(sizeof(stackBuffer));
  932. UBool utf8IsOwned = false;
  933. char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
  934. 3*length16,
  935. stackBuffer, capacity,
  936. &capacity);
  937. int32_t length8 = 0;
  938. UErrorCode errorCode = U_ZERO_ERROR;
  939. u_strToUTF8WithSub(utf8, capacity, &length8,
  940. getBuffer(), length16,
  941. 0xFFFD, // Standard substitution character.
  942. nullptr, // Don't care about number of substitutions.
  943. &errorCode);
  944. if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
  945. utf8 = static_cast<char*>(uprv_malloc(length8));
  946. if(utf8 != nullptr) {
  947. utf8IsOwned = true;
  948. errorCode = U_ZERO_ERROR;
  949. u_strToUTF8WithSub(utf8, length8, &length8,
  950. getBuffer(), length16,
  951. 0xFFFD, // Standard substitution character.
  952. nullptr, // Don't care about number of substitutions.
  953. &errorCode);
  954. } else {
  955. errorCode = U_MEMORY_ALLOCATION_ERROR;
  956. }
  957. }
  958. if(U_SUCCESS(errorCode)) {
  959. sink.Append(utf8, length8);
  960. sink.Flush();
  961. }
  962. if(utf8IsOwned) {
  963. uprv_free(utf8);
  964. }
  965. }
  966. }
  967. int32_t
  968. UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
  969. int32_t length32=0;
  970. if(U_SUCCESS(errorCode)) {
  971. // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
  972. u_strToUTF32WithSub(utf32, capacity, &length32,
  973. getBuffer(), length(),
  974. 0xfffd, // Substitution character.
  975. nullptr, // Don't care about number of substitutions.
  976. &errorCode);
  977. }
  978. return length32;
  979. }
  980. int32_t
  981. UnicodeString::indexOf(const char16_t *srcChars,
  982. int32_t srcStart,
  983. int32_t srcLength,
  984. int32_t start,
  985. int32_t length) const
  986. {
  987. if (isBogus() || srcChars == nullptr || srcStart < 0 || srcLength == 0) {
  988. return -1;
  989. }
  990. // UnicodeString does not find empty substrings
  991. if(srcLength < 0 && srcChars[srcStart] == 0) {
  992. return -1;
  993. }
  994. // get the indices within bounds
  995. pinIndices(start, length);
  996. // find the first occurrence of the substring
  997. const char16_t *array = getArrayStart();
  998. const char16_t *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
  999. if(match == nullptr) {
  1000. return -1;
  1001. } else {
  1002. return static_cast<int32_t>(match - array);
  1003. }
  1004. }
  1005. int32_t
  1006. UnicodeString::doIndexOf(char16_t c,
  1007. int32_t start,
  1008. int32_t length) const
  1009. {
  1010. // pin indices
  1011. pinIndices(start, length);
  1012. // find the first occurrence of c
  1013. const char16_t *array = getArrayStart();
  1014. const char16_t *match = u_memchr(array + start, c, length);
  1015. if(match == nullptr) {
  1016. return -1;
  1017. } else {
  1018. return static_cast<int32_t>(match - array);
  1019. }
  1020. }
  1021. int32_t
  1022. UnicodeString::doIndexOf(UChar32 c,
  1023. int32_t start,
  1024. int32_t length) const {
  1025. // pin indices
  1026. pinIndices(start, length);
  1027. // find the first occurrence of c
  1028. const char16_t *array = getArrayStart();
  1029. const char16_t *match = u_memchr32(array + start, c, length);
  1030. if(match == nullptr) {
  1031. return -1;
  1032. } else {
  1033. return static_cast<int32_t>(match - array);
  1034. }
  1035. }
  1036. int32_t
  1037. UnicodeString::lastIndexOf(const char16_t *srcChars,
  1038. int32_t srcStart,
  1039. int32_t srcLength,
  1040. int32_t start,
  1041. int32_t length) const
  1042. {
  1043. if (isBogus() || srcChars == nullptr || srcStart < 0 || srcLength == 0) {
  1044. return -1;
  1045. }
  1046. // UnicodeString does not find empty substrings
  1047. if(srcLength < 0 && srcChars[srcStart] == 0) {
  1048. return -1;
  1049. }
  1050. // get the indices within bounds
  1051. pinIndices(start, length);
  1052. // find the last occurrence of the substring
  1053. const char16_t *array = getArrayStart();
  1054. const char16_t *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
  1055. if(match == nullptr) {
  1056. return -1;
  1057. } else {
  1058. return static_cast<int32_t>(match - array);
  1059. }
  1060. }
  1061. int32_t
  1062. UnicodeString::doLastIndexOf(char16_t c,
  1063. int32_t start,
  1064. int32_t length) const
  1065. {
  1066. if(isBogus()) {
  1067. return -1;
  1068. }
  1069. // pin indices
  1070. pinIndices(start, length);
  1071. // find the last occurrence of c
  1072. const char16_t *array = getArrayStart();
  1073. const char16_t *match = u_memrchr(array + start, c, length);
  1074. if(match == nullptr) {
  1075. return -1;
  1076. } else {
  1077. return static_cast<int32_t>(match - array);
  1078. }
  1079. }
  1080. int32_t
  1081. UnicodeString::doLastIndexOf(UChar32 c,
  1082. int32_t start,
  1083. int32_t length) const {
  1084. // pin indices
  1085. pinIndices(start, length);
  1086. // find the last occurrence of c
  1087. const char16_t *array = getArrayStart();
  1088. const char16_t *match = u_memrchr32(array + start, c, length);
  1089. if(match == nullptr) {
  1090. return -1;
  1091. } else {
  1092. return static_cast<int32_t>(match - array);
  1093. }
  1094. }
  1095. //========================================
  1096. // Write implementation
  1097. //========================================
  1098. UnicodeString&
  1099. UnicodeString::findAndReplace(int32_t start,
  1100. int32_t length,
  1101. const UnicodeString& oldText,
  1102. int32_t oldStart,
  1103. int32_t oldLength,
  1104. const UnicodeString& newText,
  1105. int32_t newStart,
  1106. int32_t newLength)
  1107. {
  1108. if(isBogus() || oldText.isBogus() || newText.isBogus()) {
  1109. return *this;
  1110. }
  1111. pinIndices(start, length);
  1112. oldText.pinIndices(oldStart, oldLength);
  1113. newText.pinIndices(newStart, newLength);
  1114. if(oldLength == 0) {
  1115. return *this;
  1116. }
  1117. while(length > 0 && length >= oldLength) {
  1118. int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
  1119. if(pos < 0) {
  1120. // no more oldText's here: done
  1121. break;
  1122. } else {
  1123. // we found oldText, replace it by newText and go beyond it
  1124. replace(pos, oldLength, newText, newStart, newLength);
  1125. length -= pos + oldLength - start;
  1126. start = pos + newLength;
  1127. }
  1128. }
  1129. return *this;
  1130. }
  1131. void
  1132. UnicodeString::setToBogus()
  1133. {
  1134. releaseArray();
  1135. fUnion.fFields.fLengthAndFlags = kIsBogus;
  1136. fUnion.fFields.fArray = nullptr;
  1137. fUnion.fFields.fCapacity = 0;
  1138. }
  1139. // turn a bogus string into an empty one
  1140. void
  1141. UnicodeString::unBogus() {
  1142. if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
  1143. setToEmpty();
  1144. }
  1145. }
  1146. const char16_t *
  1147. UnicodeString::getTerminatedBuffer() {
  1148. if(!isWritable()) {
  1149. return nullptr;
  1150. }
  1151. char16_t *array = getArrayStart();
  1152. int32_t len = length();
  1153. if(len < getCapacity()) {
  1154. if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
  1155. // If len<capacity on a read-only alias, then array[len] is
  1156. // either the original NUL (if constructed with (true, s, length))
  1157. // or one of the original string contents characters (if later truncated),
  1158. // therefore we can assume that array[len] is initialized memory.
  1159. if(array[len] == 0) {
  1160. return array;
  1161. }
  1162. } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
  1163. // kRefCounted: Do not write the NUL if the buffer is shared.
  1164. // That is mostly safe, except when the length of one copy was modified
  1165. // without copy-on-write, e.g., via truncate(newLength) or remove().
  1166. // Then the NUL would be written into the middle of another copy's string.
  1167. // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
  1168. // Do not test if there is a NUL already because it might be uninitialized memory.
  1169. // (That would be safe, but tools like valgrind & Purify would complain.)
  1170. array[len] = 0;
  1171. return array;
  1172. }
  1173. }
  1174. if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
  1175. array = getArrayStart();
  1176. array[len] = 0;
  1177. return array;
  1178. } else {
  1179. return nullptr;
  1180. }
  1181. }
  1182. // setTo() analogous to the readonly-aliasing constructor with the same signature
  1183. UnicodeString &
  1184. UnicodeString::setTo(UBool isTerminated,
  1185. ConstChar16Ptr textPtr,
  1186. int32_t textLength)
  1187. {
  1188. if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
  1189. // do not modify a string that has an "open" getBuffer(minCapacity)
  1190. return *this;
  1191. }
  1192. const char16_t *text = textPtr;
  1193. if(text == nullptr) {
  1194. // treat as an empty string, do not alias
  1195. releaseArray();
  1196. setToEmpty();
  1197. return *this;
  1198. }
  1199. if( textLength < -1 ||
  1200. (textLength == -1 && !isTerminated) ||
  1201. (textLength >= 0 && isTerminated && text[textLength] != 0)
  1202. ) {
  1203. setToBogus();
  1204. return *this;
  1205. }
  1206. releaseArray();
  1207. if(textLength == -1) {
  1208. // text is terminated, or else it would have failed the above test
  1209. textLength = u_strlen(text);
  1210. }
  1211. fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
  1212. setArray(const_cast<char16_t*>(text), textLength, isTerminated ? textLength + 1 : textLength);
  1213. return *this;
  1214. }
  1215. // setTo() analogous to the writable-aliasing constructor with the same signature
  1216. UnicodeString &
  1217. UnicodeString::setTo(char16_t *buffer,
  1218. int32_t buffLength,
  1219. int32_t buffCapacity) {
  1220. if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
  1221. // do not modify a string that has an "open" getBuffer(minCapacity)
  1222. return *this;
  1223. }
  1224. if(buffer == nullptr) {
  1225. // treat as an empty string, do not alias
  1226. releaseArray();
  1227. setToEmpty();
  1228. return *this;
  1229. }
  1230. if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
  1231. setToBogus();
  1232. return *this;
  1233. } else if(buffLength == -1) {
  1234. // buffLength = u_strlen(buff); but do not look beyond buffCapacity
  1235. const char16_t *p = buffer, *limit = buffer + buffCapacity;
  1236. while(p != limit && *p != 0) {
  1237. ++p;
  1238. }
  1239. buffLength = static_cast<int32_t>(p - buffer);
  1240. }
  1241. releaseArray();
  1242. fUnion.fFields.fLengthAndFlags = kWritableAlias;
  1243. setArray(buffer, buffLength, buffCapacity);
  1244. return *this;
  1245. }
  1246. UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
  1247. unBogus();
  1248. int32_t length = utf8.length();
  1249. int32_t capacity;
  1250. // The UTF-16 string will be at most as long as the UTF-8 string.
  1251. if(length <= US_STACKBUF_SIZE) {
  1252. capacity = US_STACKBUF_SIZE;
  1253. } else {
  1254. capacity = length + 1; // +1 for the terminating NUL.
  1255. }
  1256. char16_t *utf16 = getBuffer(capacity);
  1257. int32_t length16;
  1258. UErrorCode errorCode = U_ZERO_ERROR;
  1259. u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
  1260. utf8.data(), length,
  1261. 0xfffd, // Substitution character.
  1262. nullptr, // Don't care about number of substitutions.
  1263. &errorCode);
  1264. releaseBuffer(length16);
  1265. if(U_FAILURE(errorCode)) {
  1266. setToBogus();
  1267. }
  1268. return *this;
  1269. }
  1270. UnicodeString&
  1271. UnicodeString::setCharAt(int32_t offset,
  1272. char16_t c)
  1273. {
  1274. int32_t len = length();
  1275. if(cloneArrayIfNeeded() && len > 0) {
  1276. if(offset < 0) {
  1277. offset = 0;
  1278. } else if(offset >= len) {
  1279. offset = len - 1;
  1280. }
  1281. getArrayStart()[offset] = c;
  1282. }
  1283. return *this;
  1284. }
  1285. UnicodeString&
  1286. UnicodeString::replace(int32_t start,
  1287. int32_t _length,
  1288. UChar32 srcChar) {
  1289. char16_t buffer[U16_MAX_LENGTH];
  1290. int32_t count = 0;
  1291. UBool isError = false;
  1292. U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
  1293. // We test isError so that the compiler does not complain that we don't.
  1294. // If isError (srcChar is not a valid code point) then count==0 which means
  1295. // we remove the source segment rather than replacing it with srcChar.
  1296. return doReplace(start, _length, buffer, 0, isError ? 0 : count);
  1297. }
  1298. UnicodeString&
  1299. UnicodeString::append(UChar32 srcChar) {
  1300. char16_t buffer[U16_MAX_LENGTH];
  1301. int32_t _length = 0;
  1302. UBool isError = false;
  1303. U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
  1304. // We test isError so that the compiler does not complain that we don't.
  1305. // If isError then _length==0 which turns the doAppend() into a no-op anyway.
  1306. return isError ? *this : doAppend(buffer, 0, _length);
  1307. }
  1308. UnicodeString&
  1309. UnicodeString::doReplace( int32_t start,
  1310. int32_t length,
  1311. const UnicodeString& src,
  1312. int32_t srcStart,
  1313. int32_t srcLength)
  1314. {
  1315. // pin the indices to legal values
  1316. src.pinIndices(srcStart, srcLength);
  1317. // get the characters from src
  1318. // and replace the range in ourselves with them
  1319. return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
  1320. }
  1321. UnicodeString&
  1322. UnicodeString::doReplace(int32_t start,
  1323. int32_t length,
  1324. const char16_t *srcChars,
  1325. int32_t srcStart,
  1326. int32_t srcLength)
  1327. {
  1328. if(!isWritable()) {
  1329. return *this;
  1330. }
  1331. int32_t oldLength = this->length();
  1332. // optimize (read-only alias).remove(0, start) and .remove(start, end)
  1333. if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
  1334. if(start == 0) {
  1335. // remove prefix by adjusting the array pointer
  1336. pinIndex(length);
  1337. fUnion.fFields.fArray += length;
  1338. fUnion.fFields.fCapacity -= length;
  1339. setLength(oldLength - length);
  1340. return *this;
  1341. } else {
  1342. pinIndex(start);
  1343. if(length >= (oldLength - start)) {
  1344. // remove suffix by reducing the length (like truncate())
  1345. setLength(start);
  1346. fUnion.fFields.fCapacity = start; // not NUL-terminated any more
  1347. return *this;
  1348. }
  1349. }
  1350. }
  1351. if(start == oldLength) {
  1352. return doAppend(srcChars, srcStart, srcLength);
  1353. }
  1354. if (srcChars == nullptr) {
  1355. srcLength = 0;
  1356. } else {
  1357. // Perform all remaining operations relative to srcChars + srcStart.
  1358. // From this point forward, do not use srcStart.
  1359. srcChars += srcStart;
  1360. if (srcLength < 0) {
  1361. // get the srcLength if necessary
  1362. srcLength = u_strlen(srcChars);
  1363. }
  1364. }
  1365. // pin the indices to legal values
  1366. pinIndices(start, length);
  1367. // Calculate the size of the string after the replace.
  1368. // Avoid int32_t overflow.
  1369. int32_t newLength = oldLength - length;
  1370. if(srcLength > (INT32_MAX - newLength)) {
  1371. setToBogus();
  1372. return *this;
  1373. }
  1374. newLength += srcLength;
  1375. // Check for insertion into ourself
  1376. const char16_t *oldArray = getArrayStart();
  1377. if (isBufferWritable() &&
  1378. oldArray < srcChars + srcLength &&
  1379. srcChars < oldArray + oldLength) {
  1380. // Copy into a new UnicodeString and start over
  1381. UnicodeString copy(srcChars, srcLength);
  1382. if (copy.isBogus()) {
  1383. setToBogus();
  1384. return *this;
  1385. }
  1386. return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
  1387. }
  1388. // cloneArrayIfNeeded(doCopyArray=false) may change fArray but will not copy the current contents;
  1389. // therefore we need to keep the current fArray
  1390. char16_t oldStackBuffer[US_STACKBUF_SIZE];
  1391. if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
  1392. // copy the stack buffer contents because it will be overwritten with
  1393. // fUnion.fFields values
  1394. u_memcpy(oldStackBuffer, oldArray, oldLength);
  1395. oldArray = oldStackBuffer;
  1396. }
  1397. // clone our array and allocate a bigger array if needed
  1398. int32_t *bufferToDelete = nullptr;
  1399. if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
  1400. false, &bufferToDelete)
  1401. ) {
  1402. return *this;
  1403. }
  1404. // now do the replace
  1405. char16_t *newArray = getArrayStart();
  1406. if(newArray != oldArray) {
  1407. // if fArray changed, then we need to copy everything except what will change
  1408. us_arrayCopy(oldArray, 0, newArray, 0, start);
  1409. us_arrayCopy(oldArray, start + length,
  1410. newArray, start + srcLength,
  1411. oldLength - (start + length));
  1412. } else if(length != srcLength) {
  1413. // fArray did not change; copy only the portion that isn't changing, leaving a hole
  1414. us_arrayCopy(oldArray, start + length,
  1415. newArray, start + srcLength,
  1416. oldLength - (start + length));
  1417. }
  1418. // now fill in the hole with the new string
  1419. us_arrayCopy(srcChars, 0, newArray, start, srcLength);
  1420. setLength(newLength);
  1421. // delayed delete in case srcChars == fArray when we started, and
  1422. // to keep oldArray alive for the above operations
  1423. if (bufferToDelete) {
  1424. uprv_free(bufferToDelete);
  1425. }
  1426. return *this;
  1427. }
  1428. UnicodeString&
  1429. UnicodeString::doReplace(int32_t start, int32_t length, std::u16string_view src) {
  1430. if (!isWritable()) {
  1431. return *this;
  1432. }
  1433. if (src.length() > INT32_MAX) {
  1434. setToBogus();
  1435. return *this;
  1436. }
  1437. return doReplace(start, length, src.data(), 0, static_cast<int32_t>(src.length()));
  1438. }
  1439. // Versions of doReplace() only for append() variants.
  1440. // doReplace() and doAppend() optimize for different cases.
  1441. UnicodeString&
  1442. UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
  1443. if(srcLength == 0) {
  1444. return *this;
  1445. }
  1446. // pin the indices to legal values
  1447. src.pinIndices(srcStart, srcLength);
  1448. return doAppend(src.getArrayStart(), srcStart, srcLength);
  1449. }
  1450. UnicodeString&
  1451. UnicodeString::doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) {
  1452. if(!isWritable() || srcLength == 0 || srcChars == nullptr) {
  1453. return *this;
  1454. }
  1455. // Perform all remaining operations relative to srcChars + srcStart.
  1456. // From this point forward, do not use srcStart.
  1457. srcChars += srcStart;
  1458. if(srcLength < 0) {
  1459. // get the srcLength if necessary
  1460. if((srcLength = u_strlen(srcChars)) == 0) {
  1461. return *this;
  1462. }
  1463. }
  1464. int32_t oldLength = length();
  1465. int32_t newLength;
  1466. if (srcLength <= getCapacity() - oldLength && isBufferWritable()) {
  1467. newLength = oldLength + srcLength;
  1468. // Faster than a memmove
  1469. if (srcLength <= 4) {
  1470. char16_t *arr = getArrayStart();
  1471. arr[oldLength] = srcChars[0];
  1472. if (srcLength > 1) arr[oldLength+1] = srcChars[1];
  1473. if (srcLength > 2) arr[oldLength+2] = srcChars[2];
  1474. if (srcLength > 3) arr[oldLength+3] = srcChars[3];
  1475. setLength(newLength);
  1476. return *this;
  1477. }
  1478. } else {
  1479. if (uprv_add32_overflow(oldLength, srcLength, &newLength)) {
  1480. setToBogus();
  1481. return *this;
  1482. }
  1483. // Check for append onto ourself
  1484. const char16_t* oldArray = getArrayStart();
  1485. if (isBufferWritable() &&
  1486. oldArray < srcChars + srcLength &&
  1487. srcChars < oldArray + oldLength) {
  1488. // Copy into a new UnicodeString and start over
  1489. UnicodeString copy(srcChars, srcLength);
  1490. if (copy.isBogus()) {
  1491. setToBogus();
  1492. return *this;
  1493. }
  1494. return doAppend(copy.getArrayStart(), 0, srcLength);
  1495. }
  1496. // optimize append() onto a large-enough, owned string
  1497. if (!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
  1498. return *this;
  1499. }
  1500. }
  1501. char16_t *newArray = getArrayStart();
  1502. // Do not copy characters when
  1503. // char16_t *buffer=str.getAppendBuffer(...);
  1504. // is followed by
  1505. // str.append(buffer, length);
  1506. // or
  1507. // str.appendString(buffer, length)
  1508. // or similar.
  1509. if(srcChars != newArray + oldLength) {
  1510. us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
  1511. }
  1512. setLength(newLength);
  1513. return *this;
  1514. }
  1515. UnicodeString&
  1516. UnicodeString::doAppend(std::u16string_view src) {
  1517. if (!isWritable() || src.empty()) {
  1518. return *this;
  1519. }
  1520. if (src.length() > INT32_MAX) {
  1521. setToBogus();
  1522. return *this;
  1523. }
  1524. return doAppend(src.data(), 0, static_cast<int32_t>(src.length()));
  1525. }
  1526. /**
  1527. * Replaceable API
  1528. */
  1529. void
  1530. UnicodeString::handleReplaceBetween(int32_t start,
  1531. int32_t limit,
  1532. const UnicodeString& text) {
  1533. replaceBetween(start, limit, text);
  1534. }
  1535. /**
  1536. * Replaceable API
  1537. */
  1538. void
  1539. UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
  1540. if (limit <= start) {
  1541. return; // Nothing to do; avoid bogus malloc call
  1542. }
  1543. char16_t* text = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (limit - start)));
  1544. // Check to make sure text is not null.
  1545. if (text != nullptr) {
  1546. extractBetween(start, limit, text, 0);
  1547. insert(dest, text, 0, limit - start);
  1548. uprv_free(text);
  1549. }
  1550. }
  1551. /**
  1552. * Replaceable API
  1553. *
  1554. * NOTE: This is for the Replaceable class. There is no rep.cpp,
  1555. * so we implement this function here.
  1556. */
  1557. UBool Replaceable::hasMetaData() const {
  1558. return true;
  1559. }
  1560. /**
  1561. * Replaceable API
  1562. */
  1563. UBool UnicodeString::hasMetaData() const {
  1564. return false;
  1565. }
  1566. UnicodeString&
  1567. UnicodeString::doReverse(int32_t start, int32_t length) {
  1568. if(length <= 1 || !cloneArrayIfNeeded()) {
  1569. return *this;
  1570. }
  1571. // pin the indices to legal values
  1572. pinIndices(start, length);
  1573. if(length <= 1) { // pinIndices() might have shrunk the length
  1574. return *this;
  1575. }
  1576. char16_t *left = getArrayStart() + start;
  1577. char16_t *right = left + length - 1; // -1 for inclusive boundary (length>=2)
  1578. char16_t swap;
  1579. UBool hasSupplementary = false;
  1580. // Before the loop we know left<right because length>=2.
  1581. do {
  1582. hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(swap = *left));
  1583. hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(*left++ = *right));
  1584. *right-- = swap;
  1585. } while(left < right);
  1586. // Make sure to test the middle code unit of an odd-length string.
  1587. // Redundant if the length is even.
  1588. hasSupplementary |= static_cast<UBool>(U16_IS_LEAD(*left));
  1589. /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
  1590. if(hasSupplementary) {
  1591. char16_t swap2;
  1592. left = getArrayStart() + start;
  1593. right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
  1594. while(left < right) {
  1595. if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
  1596. *left++ = swap2;
  1597. *left++ = swap;
  1598. } else {
  1599. ++left;
  1600. }
  1601. }
  1602. }
  1603. return *this;
  1604. }
  1605. UBool
  1606. UnicodeString::padLeading(int32_t targetLength,
  1607. char16_t padChar)
  1608. {
  1609. int32_t oldLength = length();
  1610. if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
  1611. return false;
  1612. } else {
  1613. // move contents up by padding width
  1614. char16_t *array = getArrayStart();
  1615. int32_t start = targetLength - oldLength;
  1616. us_arrayCopy(array, 0, array, start, oldLength);
  1617. // fill in padding character
  1618. while(--start >= 0) {
  1619. array[start] = padChar;
  1620. }
  1621. setLength(targetLength);
  1622. return true;
  1623. }
  1624. }
  1625. UBool
  1626. UnicodeString::padTrailing(int32_t targetLength,
  1627. char16_t padChar)
  1628. {
  1629. int32_t oldLength = length();
  1630. if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
  1631. return false;
  1632. } else {
  1633. // fill in padding character
  1634. char16_t *array = getArrayStart();
  1635. int32_t length = targetLength;
  1636. while(--length >= oldLength) {
  1637. array[length] = padChar;
  1638. }
  1639. setLength(targetLength);
  1640. return true;
  1641. }
  1642. }
  1643. //========================================
  1644. // Hashing
  1645. //========================================
  1646. int32_t
  1647. UnicodeString::doHashCode() const
  1648. {
  1649. /* Delegate hash computation to uhash. This makes UnicodeString
  1650. * hashing consistent with char16_t* hashing. */
  1651. int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
  1652. if (hashCode == kInvalidHashCode) {
  1653. hashCode = kEmptyHashCode;
  1654. }
  1655. return hashCode;
  1656. }
  1657. //========================================
  1658. // External Buffer
  1659. //========================================
  1660. char16_t *
  1661. UnicodeString::getBuffer(int32_t minCapacity) {
  1662. if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
  1663. fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
  1664. setZeroLength();
  1665. return getArrayStart();
  1666. } else {
  1667. return nullptr;
  1668. }
  1669. }
  1670. void
  1671. UnicodeString::releaseBuffer(int32_t newLength) {
  1672. if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
  1673. // set the new fLength
  1674. int32_t capacity=getCapacity();
  1675. if(newLength==-1) {
  1676. // the new length is the string length, capped by fCapacity
  1677. const char16_t *array=getArrayStart(), *p=array, *limit=array+capacity;
  1678. while(p<limit && *p!=0) {
  1679. ++p;
  1680. }
  1681. newLength = static_cast<int32_t>(p - array);
  1682. } else if(newLength>capacity) {
  1683. newLength=capacity;
  1684. }
  1685. setLength(newLength);
  1686. fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
  1687. }
  1688. }
  1689. //========================================
  1690. // Miscellaneous
  1691. //========================================
  1692. UBool
  1693. UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
  1694. int32_t growCapacity,
  1695. UBool doCopyArray,
  1696. int32_t **pBufferToDelete,
  1697. UBool forceClone) {
  1698. // default parameters need to be static, therefore
  1699. // the defaults are -1 to have convenience defaults
  1700. if(newCapacity == -1) {
  1701. newCapacity = getCapacity();
  1702. }
  1703. // while a getBuffer(minCapacity) is "open",
  1704. // prevent any modifications of the string by returning false here
  1705. // if the string is bogus, then only an assignment or similar can revive it
  1706. if(!isWritable()) {
  1707. return false;
  1708. }
  1709. /*
  1710. * We need to make a copy of the array if
  1711. * the buffer is read-only, or
  1712. * the buffer is refCounted (shared), and refCount>1, or
  1713. * the buffer is too small.
  1714. * Return false if memory could not be allocated.
  1715. */
  1716. if(forceClone ||
  1717. fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
  1718. (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
  1719. newCapacity > getCapacity()
  1720. ) {
  1721. // check growCapacity for default value and use of the stack buffer
  1722. if(growCapacity < 0) {
  1723. growCapacity = newCapacity;
  1724. } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
  1725. growCapacity = US_STACKBUF_SIZE;
  1726. }
  1727. // save old values
  1728. char16_t oldStackBuffer[US_STACKBUF_SIZE];
  1729. char16_t *oldArray;
  1730. int32_t oldLength = length();
  1731. int16_t flags = fUnion.fFields.fLengthAndFlags;
  1732. if(flags&kUsingStackBuffer) {
  1733. U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
  1734. if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
  1735. // copy the stack buffer contents because it will be overwritten with
  1736. // fUnion.fFields values
  1737. us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
  1738. oldArray = oldStackBuffer;
  1739. } else {
  1740. oldArray = nullptr; // no need to copy from the stack buffer to itself
  1741. }
  1742. } else {
  1743. oldArray = fUnion.fFields.fArray;
  1744. U_ASSERT(oldArray!=nullptr); /* when stack buffer is not used, oldArray must have a non-nullptr reference */
  1745. }
  1746. // allocate a new array
  1747. if(allocate(growCapacity) ||
  1748. (newCapacity < growCapacity && allocate(newCapacity))
  1749. ) {
  1750. if(doCopyArray) {
  1751. // copy the contents
  1752. // do not copy more than what fits - it may be smaller than before
  1753. int32_t minLength = oldLength;
  1754. newCapacity = getCapacity();
  1755. if(newCapacity < minLength) {
  1756. minLength = newCapacity;
  1757. }
  1758. if(oldArray != nullptr) {
  1759. us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
  1760. }
  1761. setLength(minLength);
  1762. } else {
  1763. setZeroLength();
  1764. }
  1765. // release the old array
  1766. if(flags & kRefCounted) {
  1767. // the array is refCounted; decrement and release if 0
  1768. u_atomic_int32_t* pRefCount = reinterpret_cast<u_atomic_int32_t*>(oldArray) - 1;
  1769. if(umtx_atomic_dec(pRefCount) == 0) {
  1770. if (pBufferToDelete == nullptr) {
  1771. // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
  1772. // is defined as volatile. (Volatile has useful non-standard behavior
  1773. // with this compiler.)
  1774. uprv_free((void *)pRefCount);
  1775. } else {
  1776. // the caller requested to delete it himself
  1777. *pBufferToDelete = reinterpret_cast<int32_t*>(pRefCount);
  1778. }
  1779. }
  1780. }
  1781. } else {
  1782. // not enough memory for growCapacity and not even for the smaller newCapacity
  1783. // reset the old values for setToBogus() to release the array
  1784. if(!(flags&kUsingStackBuffer)) {
  1785. fUnion.fFields.fArray = oldArray;
  1786. }
  1787. fUnion.fFields.fLengthAndFlags = flags;
  1788. setToBogus();
  1789. return false;
  1790. }
  1791. }
  1792. return true;
  1793. }
  1794. // UnicodeStringAppendable ------------------------------------------------- ***
  1795. UnicodeStringAppendable::~UnicodeStringAppendable() {}
  1796. UBool
  1797. UnicodeStringAppendable::appendCodeUnit(char16_t c) {
  1798. return str.doAppend(&c, 0, 1).isWritable();
  1799. }
  1800. UBool
  1801. UnicodeStringAppendable::appendCodePoint(UChar32 c) {
  1802. char16_t buffer[U16_MAX_LENGTH];
  1803. int32_t cLength = 0;
  1804. UBool isError = false;
  1805. U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
  1806. return !isError && str.doAppend(buffer, 0, cLength).isWritable();
  1807. }
  1808. UBool
  1809. UnicodeStringAppendable::appendString(const char16_t *s, int32_t length) {
  1810. return str.doAppend(s, 0, length).isWritable();
  1811. }
  1812. UBool
  1813. UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
  1814. return str.cloneArrayIfNeeded(str.length() + appendCapacity);
  1815. }
  1816. char16_t *
  1817. UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
  1818. int32_t desiredCapacityHint,
  1819. char16_t *scratch, int32_t scratchCapacity,
  1820. int32_t *resultCapacity) {
  1821. if(minCapacity < 1 || scratchCapacity < minCapacity) {
  1822. *resultCapacity = 0;
  1823. return nullptr;
  1824. }
  1825. int32_t oldLength = str.length();
  1826. if(minCapacity <= (kMaxCapacity - oldLength) &&
  1827. desiredCapacityHint <= (kMaxCapacity - oldLength) &&
  1828. str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
  1829. *resultCapacity = str.getCapacity() - oldLength;
  1830. return str.getArrayStart() + oldLength;
  1831. }
  1832. *resultCapacity = scratchCapacity;
  1833. return scratch;
  1834. }
  1835. U_NAMESPACE_END
  1836. U_NAMESPACE_USE
  1837. U_CAPI int32_t U_EXPORT2
  1838. uhash_hashUnicodeString(const UElement key) {
  1839. const UnicodeString *str = (const UnicodeString*) key.pointer;
  1840. return (str == nullptr) ? 0 : str->hashCode();
  1841. }
  1842. // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
  1843. // does not depend on hashtable code.
  1844. U_CAPI UBool U_EXPORT2
  1845. uhash_compareUnicodeString(const UElement key1, const UElement key2) {
  1846. const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
  1847. const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
  1848. if (str1 == str2) {
  1849. return true;
  1850. }
  1851. if (str1 == nullptr || str2 == nullptr) {
  1852. return false;
  1853. }
  1854. return *str1 == *str2;
  1855. }
  #ifdef U_STATIC_IMPLEMENTATION
  /*
   * Never meant to be called. The function exists so that the virtual vector
   * deleting destructor is defined within unistr.cpp.
   * UObject already has the vector deleting destructor, but defining it here
   * makes sure that it is included with this object file, which keeps static
   * library dependencies to a minimum.
   *
   * The pragmas silence the unused-function warning for this deliberately
   * unreferenced helper.
   *
   * NOTE(review): the compiler check below wraps the definition as well as
   * the pragmas, so the function is only compiled for clang and for
   * U_GCC_MAJOR_MINOR >= 1100 - confirm that this is intended.
   */
  #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wunused-function"
  static void uprv_UnicodeStringDummy() {
      UnicodeString *dummies = new UnicodeString[2];
      delete [] dummies;
  }
  #pragma GCC diagnostic pop
  #endif
  #endif