utext.cpp 97 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2005-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: utext.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2005apr12
  16. * created by: Markus W. Scherer
  17. */
  18. #include <cstddef>
  19. #include "unicode/utypes.h"
  20. #include "unicode/ustring.h"
  21. #include "unicode/unistr.h"
  22. #include "unicode/chariter.h"
  23. #include "unicode/utext.h"
  24. #include "unicode/utf.h"
  25. #include "unicode/utf8.h"
  26. #include "unicode/utf16.h"
  27. #include "ustr_imp.h"
  28. #include "cmemory.h"
  29. #include "cstring.h"
  30. #include "uassert.h"
  31. #include "putilimp.h"
  32. U_NAMESPACE_USE
  // I32_FLAG(n): an int32_t value in which only bit number n is set.
  // Used in this file to test and clear individual bits of providerProperties.
  #define I32_FLAG(bitIndex) (((int32_t)1) << (bitIndex))
  34. static UBool
  35. utext_access(UText *ut, int64_t index, UBool forward) {
  36. return ut->pFuncs->access(ut, index, forward);
  37. }
  38. U_CAPI UBool U_EXPORT2
  39. utext_moveIndex32(UText *ut, int32_t delta) {
  40. UChar32 c;
  41. if (delta > 0) {
  42. do {
  43. if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, true)) {
  44. return false;
  45. }
  46. c = ut->chunkContents[ut->chunkOffset];
  47. if (U16_IS_SURROGATE(c)) {
  48. c = utext_next32(ut);
  49. if (c == U_SENTINEL) {
  50. return false;
  51. }
  52. } else {
  53. ut->chunkOffset++;
  54. }
  55. } while(--delta>0);
  56. } else if (delta<0) {
  57. do {
  58. if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, false)) {
  59. return false;
  60. }
  61. c = ut->chunkContents[ut->chunkOffset-1];
  62. if (U16_IS_SURROGATE(c)) {
  63. c = utext_previous32(ut);
  64. if (c == U_SENTINEL) {
  65. return false;
  66. }
  67. } else {
  68. ut->chunkOffset--;
  69. }
  70. } while(++delta<0);
  71. }
  72. return true;
  73. }
  74. U_CAPI int64_t U_EXPORT2
  75. utext_nativeLength(UText *ut) {
  76. return ut->pFuncs->nativeLength(ut);
  77. }
  78. U_CAPI UBool U_EXPORT2
  79. utext_isLengthExpensive(const UText *ut) {
  80. UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
  81. return r;
  82. }
  83. U_CAPI int64_t U_EXPORT2
  84. utext_getNativeIndex(const UText *ut) {
  85. if(ut->chunkOffset <= ut->nativeIndexingLimit) {
  86. return ut->chunkNativeStart+ut->chunkOffset;
  87. } else {
  88. return ut->pFuncs->mapOffsetToNative(ut);
  89. }
  90. }
  91. U_CAPI void U_EXPORT2
  92. utext_setNativeIndex(UText *ut, int64_t index) {
  93. if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
  94. // The desired position is outside of the current chunk.
  95. // Access the new position. Assume a forward iteration from here,
  96. // which will also be optimimum for a single random access.
  97. // Reverse iterations may suffer slightly.
  98. ut->pFuncs->access(ut, index, true);
  99. } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
  100. // utf-16 indexing.
  101. ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
  102. } else {
  103. ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
  104. }
  105. // The convention is that the index must always be on a code point boundary.
  106. // Adjust the index position if it is in the middle of a surrogate pair.
  107. if (ut->chunkOffset<ut->chunkLength) {
  108. char16_t c= ut->chunkContents[ut->chunkOffset];
  109. if (U16_IS_TRAIL(c)) {
  110. if (ut->chunkOffset==0) {
  111. ut->pFuncs->access(ut, ut->chunkNativeStart, false);
  112. }
  113. if (ut->chunkOffset>0) {
  114. char16_t lead = ut->chunkContents[ut->chunkOffset-1];
  115. if (U16_IS_LEAD(lead)) {
  116. ut->chunkOffset--;
  117. }
  118. }
  119. }
  120. }
  121. }
  122. U_CAPI int64_t U_EXPORT2
  123. utext_getPreviousNativeIndex(UText *ut) {
  124. //
  125. // Fast-path the common case.
  126. // Common means current position is not at the beginning of a chunk
  127. // and the preceding character is not supplementary.
  128. //
  129. int32_t i = ut->chunkOffset - 1;
  130. int64_t result;
  131. if (i >= 0) {
  132. char16_t c = ut->chunkContents[i];
  133. if (U16_IS_TRAIL(c) == false) {
  134. if (i <= ut->nativeIndexingLimit) {
  135. result = ut->chunkNativeStart + i;
  136. } else {
  137. ut->chunkOffset = i;
  138. result = ut->pFuncs->mapOffsetToNative(ut);
  139. ut->chunkOffset++;
  140. }
  141. return result;
  142. }
  143. }
  144. // If at the start of text, simply return 0.
  145. if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
  146. return 0;
  147. }
  148. // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
  149. // Keep it simple, use other functions to handle the edges.
  150. //
  151. utext_previous32(ut);
  152. result = UTEXT_GETNATIVEINDEX(ut);
  153. utext_next32(ut);
  154. return result;
  155. }
  156. //
  157. // utext_current32. Get the UChar32 at the current position.
  158. // UText iteration position is always on a code point boundary,
  159. // never on the trail half of a surrogate pair.
  160. //
  161. U_CAPI UChar32 U_EXPORT2
  162. utext_current32(UText *ut) {
  163. UChar32 c;
  164. if (ut->chunkOffset==ut->chunkLength) {
  165. // Current position is just off the end of the chunk.
  166. if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
  167. // Off the end of the text.
  168. return U_SENTINEL;
  169. }
  170. }
  171. c = ut->chunkContents[ut->chunkOffset];
  172. if (U16_IS_LEAD(c) == false) {
  173. // Normal, non-supplementary case.
  174. return c;
  175. }
  176. //
  177. // Possible supplementary char.
  178. //
  179. UChar32 trail = 0;
  180. UChar32 supplementaryC = c;
  181. if ((ut->chunkOffset+1) < ut->chunkLength) {
  182. // The trail surrogate is in the same chunk.
  183. trail = ut->chunkContents[ut->chunkOffset+1];
  184. } else {
  185. // The trail surrogate is in a different chunk.
  186. // Because we must maintain the iteration position, we need to switch forward
  187. // into the new chunk, get the trail surrogate, then revert the chunk back to the
  188. // original one.
  189. // An edge case to be careful of: the entire text may end with an unpaired
  190. // leading surrogate. The attempt to access the trail will fail, but
  191. // the original position before the unpaired lead still needs to be restored.
  192. int64_t nativePosition = ut->chunkNativeLimit;
  193. if (ut->pFuncs->access(ut, nativePosition, true)) {
  194. trail = ut->chunkContents[ut->chunkOffset];
  195. }
  196. UBool r = ut->pFuncs->access(ut, nativePosition, false); // reverse iteration flag loads preceding chunk
  197. U_ASSERT(r);
  198. // Here we need to restore chunkOffset since the access functions were called with
  199. // chunkNativeLimit but that is not where we were (we were 1 code unit before the
  200. // limit). Restoring was originally added in ICU-4669 but did not support access
  201. // functions that changed the chunk size, the following does.
  202. ut->chunkOffset = ut->chunkLength - 1;
  203. if(!r) {
  204. return U_SENTINEL;
  205. }
  206. }
  207. if (U16_IS_TRAIL(trail)) {
  208. supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
  209. }
  210. return supplementaryC;
  211. }
  212. U_CAPI UChar32 U_EXPORT2
  213. utext_char32At(UText *ut, int64_t nativeIndex) {
  214. UChar32 c = U_SENTINEL;
  215. // Fast path the common case.
  216. if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
  217. ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
  218. c = ut->chunkContents[ut->chunkOffset];
  219. if (U16_IS_SURROGATE(c) == false) {
  220. return c;
  221. }
  222. }
  223. utext_setNativeIndex(ut, nativeIndex);
  224. if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
  225. c = ut->chunkContents[ut->chunkOffset];
  226. if (U16_IS_SURROGATE(c)) {
  227. // For surrogates, let current32() deal with the complications
  228. // of supplementaries that may span chunk boundaries.
  229. c = utext_current32(ut);
  230. }
  231. }
  232. return c;
  233. }
  234. U_CAPI UChar32 U_EXPORT2
  235. utext_next32(UText *ut) {
  236. UChar32 c;
  237. if (ut->chunkOffset >= ut->chunkLength) {
  238. if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
  239. return U_SENTINEL;
  240. }
  241. }
  242. c = ut->chunkContents[ut->chunkOffset++];
  243. if (U16_IS_LEAD(c) == false) {
  244. // Normal case, not supplementary.
  245. // (A trail surrogate seen here is just returned as is, as a surrogate value.
  246. // It cannot be part of a pair.)
  247. return c;
  248. }
  249. if (ut->chunkOffset >= ut->chunkLength) {
  250. if (ut->pFuncs->access(ut, ut->chunkNativeLimit, true) == false) {
  251. // c is an unpaired lead surrogate at the end of the text.
  252. // return it as it is.
  253. return c;
  254. }
  255. }
  256. UChar32 trail = ut->chunkContents[ut->chunkOffset];
  257. if (U16_IS_TRAIL(trail) == false) {
  258. // c was an unpaired lead surrogate, not at the end of the text.
  259. // return it as it is (unpaired). Iteration position is on the
  260. // following character, possibly in the next chunk, where the
  261. // trail surrogate would have been if it had existed.
  262. return c;
  263. }
  264. UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
  265. ut->chunkOffset++; // move iteration position over the trail surrogate.
  266. return supplementary;
  267. }
  268. U_CAPI UChar32 U_EXPORT2
  269. utext_previous32(UText *ut) {
  270. UChar32 c;
  271. if (ut->chunkOffset <= 0) {
  272. if (ut->pFuncs->access(ut, ut->chunkNativeStart, false) == false) {
  273. return U_SENTINEL;
  274. }
  275. }
  276. ut->chunkOffset--;
  277. c = ut->chunkContents[ut->chunkOffset];
  278. if (U16_IS_TRAIL(c) == false) {
  279. // Normal case, not supplementary.
  280. // (A lead surrogate seen here is just returned as is, as a surrogate value.
  281. // It cannot be part of a pair.)
  282. return c;
  283. }
  284. if (ut->chunkOffset <= 0) {
  285. if (ut->pFuncs->access(ut, ut->chunkNativeStart, false) == false) {
  286. // c is an unpaired trail surrogate at the start of the text.
  287. // return it as it is.
  288. return c;
  289. }
  290. }
  291. UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
  292. if (U16_IS_LEAD(lead) == false) {
  293. // c was an unpaired trail surrogate, not at the end of the text.
  294. // return it as it is (unpaired). Iteration position is at c
  295. return c;
  296. }
  297. UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
  298. ut->chunkOffset--; // move iteration position over the lead surrogate.
  299. return supplementary;
  300. }
  301. U_CAPI UChar32 U_EXPORT2
  302. utext_next32From(UText *ut, int64_t index) {
  303. UChar32 c = U_SENTINEL;
  304. if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
  305. // Desired position is outside of the current chunk.
  306. if(!ut->pFuncs->access(ut, index, true)) {
  307. // no chunk available here
  308. return U_SENTINEL;
  309. }
  310. } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
  311. // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
  312. ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
  313. } else {
  314. // Desired position is in chunk, with non-UTF16 indexing.
  315. ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
  316. }
  317. c = ut->chunkContents[ut->chunkOffset++];
  318. if (U16_IS_SURROGATE(c)) {
  319. // Surrogates. Many edge cases. Use other functions that already
  320. // deal with the problems.
  321. utext_setNativeIndex(ut, index);
  322. c = utext_next32(ut);
  323. }
  324. return c;
  325. }
  326. U_CAPI UChar32 U_EXPORT2
  327. utext_previous32From(UText *ut, int64_t index) {
  328. //
  329. // Return the character preceding the specified index.
  330. // Leave the iteration position at the start of the character that was returned.
  331. //
  332. UChar32 cPrev; // The character preceding cCurr, which is what we will return.
  333. // Address the chunk containing the position preceding the incoming index
  334. // A tricky edge case:
  335. // We try to test the requested native index against the chunkNativeStart to determine
  336. // whether the character preceding the one at the index is in the current chunk.
  337. // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
  338. // requested index is on something other than the first position of the first char.
  339. //
  340. if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
  341. // Requested native index is outside of the current chunk.
  342. if(!ut->pFuncs->access(ut, index, false)) {
  343. // no chunk available here
  344. return U_SENTINEL;
  345. }
  346. } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
  347. // Direct UTF-16 indexing.
  348. ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
  349. } else {
  350. ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
  351. if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, false)) {
  352. // no chunk available here
  353. return U_SENTINEL;
  354. }
  355. }
  356. //
  357. // Simple case with no surrogates.
  358. //
  359. ut->chunkOffset--;
  360. cPrev = ut->chunkContents[ut->chunkOffset];
  361. if (U16_IS_SURROGATE(cPrev)) {
  362. // Possible supplementary. Many edge cases.
  363. // Let other functions do the heavy lifting.
  364. utext_setNativeIndex(ut, index);
  365. cPrev = utext_previous32(ut);
  366. }
  367. return cPrev;
  368. }
  369. U_CAPI int32_t U_EXPORT2
  370. utext_extract(UText *ut,
  371. int64_t start, int64_t limit,
  372. char16_t *dest, int32_t destCapacity,
  373. UErrorCode *status) {
  374. return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
  375. }
  376. U_CAPI UBool U_EXPORT2
  377. utext_equals(const UText *a, const UText *b) {
  378. if (a==nullptr || b==nullptr ||
  379. a->magic != UTEXT_MAGIC ||
  380. b->magic != UTEXT_MAGIC) {
  381. // Null or invalid arguments don't compare equal to anything.
  382. return false;
  383. }
  384. if (a->pFuncs != b->pFuncs) {
  385. // Different types of text providers.
  386. return false;
  387. }
  388. if (a->context != b->context) {
  389. // Different sources (different strings)
  390. return false;
  391. }
  392. if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
  393. // Different current position in the string.
  394. return false;
  395. }
  396. return true;
  397. }
  398. U_CAPI UBool U_EXPORT2
  399. utext_isWritable(const UText *ut)
  400. {
  401. UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
  402. return b;
  403. }
  404. U_CAPI void U_EXPORT2
  405. utext_freeze(UText *ut) {
  406. // Zero out the WRITABLE flag.
  407. ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
  408. }
  409. U_CAPI UBool U_EXPORT2
  410. utext_hasMetaData(const UText *ut)
  411. {
  412. UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
  413. return b;
  414. }
  415. U_CAPI int32_t U_EXPORT2
  416. utext_replace(UText *ut,
  417. int64_t nativeStart, int64_t nativeLimit,
  418. const char16_t *replacementText, int32_t replacementLength,
  419. UErrorCode *status)
  420. {
  421. if (U_FAILURE(*status)) {
  422. return 0;
  423. }
  424. if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
  425. *status = U_NO_WRITE_PERMISSION;
  426. return 0;
  427. }
  428. int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
  429. return i;
  430. }
  431. U_CAPI void U_EXPORT2
  432. utext_copy(UText *ut,
  433. int64_t nativeStart, int64_t nativeLimit,
  434. int64_t destIndex,
  435. UBool move,
  436. UErrorCode *status)
  437. {
  438. if (U_FAILURE(*status)) {
  439. return;
  440. }
  441. if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
  442. *status = U_NO_WRITE_PERMISSION;
  443. return;
  444. }
  445. ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
  446. }
  447. U_CAPI UText * U_EXPORT2
  448. utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
  449. if (U_FAILURE(*status)) {
  450. return dest;
  451. }
  452. UText *result = src->pFuncs->clone(dest, src, deep, status);
  453. if (U_FAILURE(*status)) {
  454. return result;
  455. }
  456. if (result == nullptr) {
  457. *status = U_MEMORY_ALLOCATION_ERROR;
  458. return result;
  459. }
  460. if (readOnly) {
  461. utext_freeze(result);
  462. }
  463. return result;
  464. }
  //------------------------------------------------------------------------------
  //
  //     UText common functions implementation
  //
  //------------------------------------------------------------------------------

  //
  //  Bit masks stored in UText.flags
  //
  enum {
      // Set when ICU allocated the UText struct itself on the heap;
      // clear when the caller provided the storage for the UText.
      UTEXT_HEAP_ALLOCATED       = 1 << 0,

      // Set when ICU allocated the extra storage as a separate heap block;
      // clear when there is no separate allocation, i.e. either no extra storage
      // was requested or it is appended to the end of the main UText storage.
      UTEXT_EXTRA_HEAP_ALLOCATED = 1 << 1,

      // Set while this UText is open; clear when it is not open.
      UTEXT_OPEN                 = 1 << 2
  };
  484. //
  485. // Extended form of a UText. The purpose is to aid in computing the total size required
  486. // when a provider asks for a UText to be allocated with extra storage.
  487. struct ExtendedUText {
  488. UText ut;
  489. std::max_align_t extension;
  490. };
  491. static const UText emptyText = UTEXT_INITIALIZER;
  492. U_CAPI UText * U_EXPORT2
  493. utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
  494. if (U_FAILURE(*status)) {
  495. return ut;
  496. }
  497. if (ut == nullptr) {
  498. // We need to heap-allocate storage for the new UText
  499. int32_t spaceRequired = sizeof(UText);
  500. if (extraSpace > 0) {
  501. spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(std::max_align_t);
  502. }
  503. ut = (UText *)uprv_malloc(spaceRequired);
  504. if (ut == nullptr) {
  505. *status = U_MEMORY_ALLOCATION_ERROR;
  506. return nullptr;
  507. } else {
  508. *ut = emptyText;
  509. ut->flags |= UTEXT_HEAP_ALLOCATED;
  510. if (spaceRequired>0) {
  511. ut->extraSize = extraSpace;
  512. ut->pExtra = &((ExtendedUText *)ut)->extension;
  513. }
  514. }
  515. } else {
  516. // We have been supplied with an already existing UText.
  517. // Verify that it really appears to be a UText.
  518. if (ut->magic != UTEXT_MAGIC) {
  519. *status = U_ILLEGAL_ARGUMENT_ERROR;
  520. return ut;
  521. }
  522. // If the ut is already open and there's a provider supplied close
  523. // function, call it.
  524. if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != nullptr) {
  525. ut->pFuncs->close(ut);
  526. }
  527. ut->flags &= ~UTEXT_OPEN;
  528. // If extra space was requested by our caller, check whether
  529. // sufficient already exists, and allocate new if needed.
  530. if (extraSpace > ut->extraSize) {
  531. // Need more space. If there is existing separately allocated space,
  532. // delete it first, then allocate new space.
  533. if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
  534. uprv_free(ut->pExtra);
  535. ut->extraSize = 0;
  536. }
  537. ut->pExtra = uprv_malloc(extraSpace);
  538. if (ut->pExtra == nullptr) {
  539. *status = U_MEMORY_ALLOCATION_ERROR;
  540. } else {
  541. ut->extraSize = extraSpace;
  542. ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
  543. }
  544. }
  545. }
  546. if (U_SUCCESS(*status)) {
  547. ut->flags |= UTEXT_OPEN;
  548. // Initialize all remaining fields of the UText.
  549. //
  550. ut->context = nullptr;
  551. ut->chunkContents = nullptr;
  552. ut->p = nullptr;
  553. ut->q = nullptr;
  554. ut->r = nullptr;
  555. ut->a = 0;
  556. ut->b = 0;
  557. ut->c = 0;
  558. ut->chunkOffset = 0;
  559. ut->chunkLength = 0;
  560. ut->chunkNativeStart = 0;
  561. ut->chunkNativeLimit = 0;
  562. ut->nativeIndexingLimit = 0;
  563. ut->providerProperties = 0;
  564. ut->privA = 0;
  565. ut->privB = 0;
  566. ut->privC = 0;
  567. ut->privP = nullptr;
  568. if (ut->pExtra!=nullptr && ut->extraSize>0)
  569. uprv_memset(ut->pExtra, 0, ut->extraSize);
  570. }
  571. return ut;
  572. }
  573. U_CAPI UText * U_EXPORT2
  574. utext_close(UText *ut) {
  575. if (ut==nullptr ||
  576. ut->magic != UTEXT_MAGIC ||
  577. (ut->flags & UTEXT_OPEN) == 0)
  578. {
  579. // The supplied ut is not an open UText.
  580. // Do nothing.
  581. return ut;
  582. }
  583. // If the provider gave us a close function, call it now.
  584. // This will clean up anything allocated specifically by the provider.
  585. if (ut->pFuncs->close != nullptr) {
  586. ut->pFuncs->close(ut);
  587. }
  588. ut->flags &= ~UTEXT_OPEN;
  589. // If we (the framework) allocated the UText or subsidiary storage,
  590. // delete it.
  591. if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
  592. uprv_free(ut->pExtra);
  593. ut->pExtra = nullptr;
  594. ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
  595. ut->extraSize = 0;
  596. }
  597. // Zero out function table of the closed UText. This is a defensive move,
  598. // intended to cause applications that inadvertently use a closed
  599. // utext to crash with null pointer errors.
  600. ut->pFuncs = nullptr;
  601. if (ut->flags & UTEXT_HEAP_ALLOCATED) {
  602. // This UText was allocated by UText setup. We need to free it.
  603. // Clear magic, so we can detect if the user messes up and immediately
  604. // tries to reopen another UText using the deleted storage.
  605. ut->magic = 0;
  606. uprv_free(ut);
  607. ut = nullptr;
  608. }
  609. return ut;
  610. }
  611. //
  612. // invalidateChunk Reset a chunk to have no contents, so that the next call
  613. // to access will cause new data to load.
  614. // This is needed when copy/move/replace operate directly on the
  615. // backing text, potentially putting it out of sync with the
  616. // contents in the chunk.
  617. //
  618. static void
  619. invalidateChunk(UText *ut) {
  620. ut->chunkLength = 0;
  621. ut->chunkNativeLimit = 0;
  622. ut->chunkNativeStart = 0;
  623. ut->chunkOffset = 0;
  624. ut->nativeIndexingLimit = 0;
  625. }
  //
  // pinIndex    Range-pin a native index parameter.
  //             A negative index becomes 0; otherwise an index greater than
  //             limit becomes limit. The 64 bit value is updated in place,
  //             and the result truncated to 32 bits is returned as a
  //             convenience for providers that don't need 64 bits.
  //
  static int32_t
  pinIndex(int64_t &index, int64_t limit) {
      index = (index < 0) ? 0
                          : ((index > limit) ? limit : index);
      return static_cast<int32_t>(index);
  }
  640. U_CDECL_BEGIN
  641. //
  642. // Pointer relocation function,
  643. // a utility used by shallow clone.
  644. // Adjust a pointer that refers to something within one UText (the source)
  645. // to refer to the same relative offset within a another UText (the target)
  646. //
  647. static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
  648. // convert all pointers to (char *) so that byte address arithmetic will work.
  649. char *dptr = (char *)*destPtr;
  650. char *dUText = (char *)dest;
  651. char *sUText = (char *)src;
  652. if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
  653. // target ptr was to something within the src UText's pExtra storage.
  654. // relocate it into the target UText's pExtra region.
  655. *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
  656. } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
  657. // target ptr was pointing to somewhere within the source UText itself.
  658. // Move it to the same offset within the target UText.
  659. *destPtr = dUText + (dptr-sUText);
  660. }
  661. }
  662. //
  663. // Clone. This is a generic copy-the-utext-by-value clone function that can be
  664. // used as-is with some utext types, and as a helper by other clones.
  665. //
  666. static UText * U_CALLCONV
  667. shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
  668. if (U_FAILURE(*status)) {
  669. return nullptr;
  670. }
  671. int32_t srcExtraSize = src->extraSize;
  672. //
  673. // Use the generic text_setup to allocate storage if required.
  674. //
  675. dest = utext_setup(dest, srcExtraSize, status);
  676. if (U_FAILURE(*status)) {
  677. return dest;
  678. }
  679. //
  680. // flags (how the UText was allocated) and the pointer to the
  681. // extra storage must retain the values in the cloned utext that
  682. // were set up by utext_setup. Save them separately before
  683. // copying the whole struct.
  684. //
  685. void *destExtra = dest->pExtra;
  686. int32_t flags = dest->flags;
  687. //
  688. // Copy the whole UText struct by value.
  689. // Any "Extra" storage is copied also.
  690. //
  691. int sizeToCopy = src->sizeOfStruct;
  692. if (sizeToCopy > dest->sizeOfStruct) {
  693. sizeToCopy = dest->sizeOfStruct;
  694. }
  695. uprv_memcpy(dest, src, sizeToCopy);
  696. dest->pExtra = destExtra;
  697. dest->flags = flags;
  698. if (srcExtraSize > 0) {
  699. uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
  700. }
  701. //
  702. // Relocate any pointers in the target that refer to the UText itself
  703. // to point to the cloned copy rather than the original source.
  704. //
  705. adjustPointer(dest, &dest->context, src);
  706. adjustPointer(dest, &dest->p, src);
  707. adjustPointer(dest, &dest->q, src);
  708. adjustPointer(dest, &dest->r, src);
  709. adjustPointer(dest, (const void **)&dest->chunkContents, src);
  710. // The newly shallow-cloned UText does _not_ own the underlying storage for the text.
  711. // (The source for the clone may or may not have owned the text.)
  712. dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  713. return dest;
  714. }
  715. U_CDECL_END
  //------------------------------------------------------------------------------
  //
  //     UText implementation for UTF-8 char * strings (read-only)
  //     Limitation:  string length must be <= 0x7fffffff.
  //                  (the length must fit in an int32_t variable)
  //
  //         Use of UText data members:
  //              context    pointer to UTF-8 string
  //              utext.b    is the input string length (bytes).
  //              utext.c    Length scanned so far in string
  //                           (for optimizing finding length of zero terminated strings.)
  //              utext.p    pointer to the current buffer
  //              utext.q    pointer to the other buffer.
  //
  //------------------------------------------------------------------------------
  // Chunk size, in char16_t units, of each UTF-8 provider buffer.
  //    Must be less than 85 (256/3), because the maps between char16_t indexes
  //    and native indexes are stored as bytes.
  //    Worst case is three native bytes to one char16_t. (Supplementaries are
  //    4 native bytes to two UChars.)
  //    The longest illegal byte sequence treated as a single error (and
  //    converted to U+FFFD) is a three-byte sequence (a truncated four-byte
  //    sequence).
  //
  enum {
      UTF8_TEXT_CHUNK_SIZE = 32
  };
  739. //
  740. // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
  741. // Each contains the char16_t chunk buffer, the to and from native maps, and
  742. // header info.
  743. //
  744. // because backwards iteration fills the buffers starting at the end and
  745. // working towards the front, the filled part of the buffers may not begin
  746. // at the start of the available storage for the buffers.
  747. //
  748. // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
  749. // the last character added being a supplementary, and thus requiring a surrogate
  750. // pair. Doing this is simpler than checking for the edge case.
  751. //
  752. struct UTF8Buf {
  753. int32_t bufNativeStart; // Native index of first char in char16_t buf
  754. int32_t bufNativeLimit; // Native index following last char in buf.
  755. int32_t bufStartIdx; // First filled position in buf.
  756. int32_t bufLimitIdx; // Limit of filled range in buf.
  757. int32_t bufNILimit; // Limit of native indexing part of buf
  758. int32_t toUCharsMapStart; // Native index corresponding to
  759. // mapToUChars[0].
  760. // Set to bufNativeStart when filling forwards.
  761. // Set to computed value when filling backwards.
  762. char16_t buf[UTF8_TEXT_CHUNK_SIZE+4]; // The char16_t buffer. Requires one extra position beyond the
  763. // the chunk size, to allow for surrogate at the end.
  764. // Length must be identical to mapToNative array, below,
  765. // because of the way indexing works when the array is
  766. // filled backwards during a reverse iteration. Thus,
  767. // the additional extra size.
  768. uint8_t mapToNative[UTF8_TEXT_CHUNK_SIZE+4]; // map char16_t index in buf to
  769. // native offset from bufNativeStart.
  770. // Requires two extra slots,
  771. // one for a supplementary starting in the last normal position,
  772. // and one for an entry for the buffer limit position.
  773. uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
  774. // corresponding offset in filled part of buf.
  775. int32_t align;
  776. };
  777. U_CDECL_BEGIN
  778. //
  779. // utf8TextLength
  780. //
  781. // Get the length of the string. If we don't already know it,
  782. // we'll need to scan for the trailing nul.
  783. //
  784. static int64_t U_CALLCONV
  785. utf8TextLength(UText *ut) {
  786. if (ut->b < 0) {
  787. // Zero terminated string, and we haven't scanned to the end yet.
  788. // Scan it now.
  789. const char *r = (const char *)ut->context + ut->c;
  790. while (*r != 0) {
  791. r++;
  792. }
  793. if ((r - (const char *)ut->context) < 0x7fffffff) {
  794. ut->b = (int32_t)(r - (const char *)ut->context);
  795. } else {
  796. // Actual string was bigger (more than 2 gig) than we
  797. // can handle. Clip it to 2 GB.
  798. ut->b = 0x7fffffff;
  799. }
  800. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  801. }
  802. return ut->b;
  803. }
  804. static UBool U_CALLCONV
  805. utf8TextAccess(UText *ut, int64_t index, UBool forward) {
  806. //
  807. // Apologies to those who are allergic to goto statements.
  808. // Consider each goto to a labelled block to be the equivalent of
  809. // call the named block as if it were a function();
  810. // return;
  811. //
  812. const uint8_t *s8=(const uint8_t *)ut->context;
  813. UTF8Buf *u8b = nullptr;
  814. int32_t length = ut->b; // Length of original utf-8
  815. int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
  816. int32_t mapIndex = 0;
  817. if (index<0) {
  818. ix=0;
  819. } else if (index > 0x7fffffff) {
  820. // Strings with 64 bit lengths not supported by this UTF-8 provider.
  821. ix = 0x7fffffff;
  822. }
  823. // Pin requested index to the string length.
  824. if (ix>length) {
  825. if (length>=0) {
  826. ix=length;
  827. } else if (ix>=ut->c) {
  828. // Zero terminated string, and requested index is beyond
  829. // the region that has already been scanned.
  830. // Scan up to either the end of the string or to the
  831. // requested position, whichever comes first.
  832. while (ut->c<ix && s8[ut->c]!=0) {
  833. ut->c++;
  834. }
  835. // TODO: support for null terminated string length > 32 bits.
  836. if (s8[ut->c] == 0) {
  837. // We just found the actual length of the string.
  838. // Trim the requested index back to that.
  839. ix = ut->c;
  840. ut->b = ut->c;
  841. length = ut->c;
  842. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  843. }
  844. }
  845. }
  846. //
  847. // Dispatch to the appropriate action for a forward iteration request.
  848. //
  849. if (forward) {
  850. if (ix==ut->chunkNativeLimit) {
  851. // Check for normal sequential iteration cases first.
  852. if (ix==length) {
  853. // Just reached end of string
  854. // Don't swap buffers, but do set the
  855. // current buffer position.
  856. ut->chunkOffset = ut->chunkLength;
  857. return false;
  858. } else {
  859. // End of current buffer.
  860. // check whether other buffer already has what we need.
  861. UTF8Buf *altB = (UTF8Buf *)ut->q;
  862. if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
  863. goto swapBuffers;
  864. }
  865. }
  866. }
  867. // A random access. Desired index could be in either or niether buf.
  868. // For optimizing the order of testing, first check for the index
  869. // being in the other buffer. This will be the case for uses that
  870. // move back and forth over a fairly limited range
  871. {
  872. u8b = (UTF8Buf *)ut->q; // the alternate buffer
  873. if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
  874. // Requested index is in the other buffer.
  875. goto swapBuffers;
  876. }
  877. if (ix == length) {
  878. // Requested index is end-of-string.
  879. // (this is the case of randomly seeking to the end.
  880. // The case of iterating off the end is handled earlier.)
  881. if (ix == ut->chunkNativeLimit) {
  882. // Current buffer extends up to the end of the string.
  883. // Leave it as the current buffer.
  884. ut->chunkOffset = ut->chunkLength;
  885. return false;
  886. }
  887. if (ix == u8b->bufNativeLimit) {
  888. // Alternate buffer extends to the end of string.
  889. // Swap it in as the current buffer.
  890. goto swapBuffersAndFail;
  891. }
  892. // Neither existing buffer extends to the end of the string.
  893. goto makeStubBuffer;
  894. }
  895. if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
  896. // Requested index is in neither buffer.
  897. goto fillForward;
  898. }
  899. // Requested index is in this buffer.
  900. u8b = (UTF8Buf *)ut->p; // the current buffer
  901. mapIndex = ix - u8b->toUCharsMapStart;
  902. U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
  903. ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  904. return true;
  905. }
  906. }
  907. //
  908. // Dispatch to the appropriate action for a
  909. // Backwards Direction iteration request.
  910. //
  911. if (ix==ut->chunkNativeStart) {
  912. // Check for normal sequential iteration cases first.
  913. if (ix==0) {
  914. // Just reached the start of string
  915. // Don't swap buffers, but do set the
  916. // current buffer position.
  917. ut->chunkOffset = 0;
  918. return false;
  919. } else {
  920. // Start of current buffer.
  921. // check whether other buffer already has what we need.
  922. UTF8Buf *altB = (UTF8Buf *)ut->q;
  923. if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
  924. goto swapBuffers;
  925. }
  926. }
  927. }
  928. // A random access. Desired index could be in either or niether buf.
  929. // For optimizing the order of testing,
  930. // Most likely case: in the other buffer.
  931. // Second most likely: in neither buffer.
  932. // Unlikely, but must work: in the current buffer.
  933. u8b = (UTF8Buf *)ut->q; // the alternate buffer
  934. if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
  935. // Requested index is in the other buffer.
  936. goto swapBuffers;
  937. }
  938. // Requested index is start-of-string.
  939. // (this is the case of randomly seeking to the start.
  940. // The case of iterating off the start is handled earlier.)
  941. if (ix==0) {
  942. if (u8b->bufNativeStart==0) {
  943. // Alternate buffer contains the data for the start string.
  944. // Make it be the current buffer.
  945. goto swapBuffersAndFail;
  946. } else {
  947. // Request for data before the start of string,
  948. // neither buffer is usable.
  949. // set up a zero-length buffer.
  950. goto makeStubBuffer;
  951. }
  952. }
  953. if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
  954. // Requested index is in neither buffer.
  955. goto fillReverse;
  956. }
  957. // Requested index is in this buffer.
  958. // Set the utf16 buffer index.
  959. u8b = (UTF8Buf *)ut->p;
  960. mapIndex = ix - u8b->toUCharsMapStart;
  961. ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  962. if (ut->chunkOffset==0) {
  963. // This occurs when the first character in the text is
  964. // a multi-byte UTF-8 char, and the requested index is to
  965. // one of the trailing bytes. Because there is no preceding ,
  966. // character, this access fails. We can't pick up on the
  967. // situation sooner because the requested index is not zero.
  968. return false;
  969. } else {
  970. return true;
  971. }
  972. swapBuffers:
  973. // The alternate buffer (ut->q) has the string data that was requested.
  974. // Swap the primary and alternate buffers, and set the
  975. // chunk index into the new primary buffer.
  976. {
  977. u8b = (UTF8Buf *)ut->q;
  978. ut->q = ut->p;
  979. ut->p = u8b;
  980. ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
  981. ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
  982. ut->chunkNativeStart = u8b->bufNativeStart;
  983. ut->chunkNativeLimit = u8b->bufNativeLimit;
  984. ut->nativeIndexingLimit = u8b->bufNILimit;
  985. // Index into the (now current) chunk
  986. // Use the map to set the chunk index. It's more trouble than it's worth
  987. // to check whether native indexing can be used.
  988. U_ASSERT(ix>=u8b->bufNativeStart);
  989. U_ASSERT(ix<=u8b->bufNativeLimit);
  990. mapIndex = ix - u8b->toUCharsMapStart;
  991. U_ASSERT(mapIndex>=0);
  992. U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
  993. ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  994. return true;
  995. }
  996. swapBuffersAndFail:
  997. // We got a request for either the start or end of the string,
  998. // with iteration continuing in the out-of-bounds direction.
  999. // The alternate buffer already contains the data up to the
  1000. // start/end.
  1001. // Swap the buffers, then return failure, indicating that we couldn't
  1002. // make things correct for continuing the iteration in the requested
  1003. // direction. The position & buffer are correct should the
  1004. // user decide to iterate in the opposite direction.
  1005. u8b = (UTF8Buf *)ut->q;
  1006. ut->q = ut->p;
  1007. ut->p = u8b;
  1008. ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
  1009. ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
  1010. ut->chunkNativeStart = u8b->bufNativeStart;
  1011. ut->chunkNativeLimit = u8b->bufNativeLimit;
  1012. ut->nativeIndexingLimit = u8b->bufNILimit;
  1013. // Index into the (now current) chunk
  1014. // For this function (swapBuffersAndFail), the requested index
  1015. // will always be at either the start or end of the chunk.
  1016. if (ix==u8b->bufNativeLimit) {
  1017. ut->chunkOffset = ut->chunkLength;
  1018. } else {
  1019. ut->chunkOffset = 0;
  1020. U_ASSERT(ix == u8b->bufNativeStart);
  1021. }
  1022. return false;
  1023. makeStubBuffer:
  1024. // The user has done a seek/access past the start or end
  1025. // of the string. Rather than loading data that is likely
  1026. // to never be used, just set up a zero-length buffer at
  1027. // the position.
  1028. u8b = (UTF8Buf *)ut->q;
  1029. u8b->bufNativeStart = ix;
  1030. u8b->bufNativeLimit = ix;
  1031. u8b->bufStartIdx = 0;
  1032. u8b->bufLimitIdx = 0;
  1033. u8b->bufNILimit = 0;
  1034. u8b->toUCharsMapStart = ix;
  1035. u8b->mapToNative[0] = 0;
  1036. u8b->mapToUChars[0] = 0;
  1037. goto swapBuffersAndFail;
  1038. fillForward:
  1039. {
  1040. // Move the incoming index to a code point boundary.
  1041. U8_SET_CP_START(s8, 0, ix);
  1042. // Swap the UText buffers.
  1043. // We want to fill what was previously the alternate buffer,
  1044. // and make what was the current buffer be the new alternate.
  1045. UTF8Buf *u8b_swap = (UTF8Buf *)ut->q;
  1046. ut->q = ut->p;
  1047. ut->p = u8b_swap;
  1048. int32_t strLen = ut->b;
  1049. UBool nulTerminated = false;
  1050. if (strLen < 0) {
  1051. strLen = 0x7fffffff;
  1052. nulTerminated = true;
  1053. }
  1054. char16_t *buf = u8b_swap->buf;
  1055. uint8_t *mapToNative = u8b_swap->mapToNative;
  1056. uint8_t *mapToUChars = u8b_swap->mapToUChars;
  1057. int32_t destIx = 0;
  1058. int32_t srcIx = ix;
  1059. UBool seenNonAscii = false;
  1060. UChar32 c = 0;
  1061. // Fill the chunk buffer and mapping arrays.
  1062. while (destIx<UTF8_TEXT_CHUNK_SIZE) {
  1063. c = s8[srcIx];
  1064. if (c>0 && c<0x80) {
  1065. // Special case ASCII range for speed.
  1066. // zero is excluded to simplify bounds checking.
  1067. buf[destIx] = (char16_t)c;
  1068. mapToNative[destIx] = (uint8_t)(srcIx - ix);
  1069. mapToUChars[srcIx-ix] = (uint8_t)destIx;
  1070. srcIx++;
  1071. destIx++;
  1072. } else {
  1073. // General case, handle everything.
  1074. if (seenNonAscii == false) {
  1075. seenNonAscii = true;
  1076. u8b_swap->bufNILimit = destIx;
  1077. }
  1078. int32_t cIx = srcIx;
  1079. int32_t dIx = destIx;
  1080. int32_t dIxSaved = destIx;
  1081. U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
  1082. if (c==0 && nulTerminated) {
  1083. srcIx--;
  1084. break;
  1085. }
  1086. U16_APPEND_UNSAFE(buf, destIx, c);
  1087. do {
  1088. mapToNative[dIx++] = (uint8_t)(cIx - ix);
  1089. } while (dIx < destIx);
  1090. do {
  1091. mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
  1092. } while (cIx < srcIx);
  1093. }
  1094. if (srcIx>=strLen) {
  1095. break;
  1096. }
  1097. }
  1098. // store Native <--> Chunk Map entries for the end of the buffer.
  1099. // There is no actual character here, but the index position is valid.
  1100. mapToNative[destIx] = (uint8_t)(srcIx - ix);
  1101. mapToUChars[srcIx - ix] = (uint8_t)destIx;
  1102. // fill in Buffer descriptor
  1103. u8b_swap->bufNativeStart = ix;
  1104. u8b_swap->bufNativeLimit = srcIx;
  1105. u8b_swap->bufStartIdx = 0;
  1106. u8b_swap->bufLimitIdx = destIx;
  1107. if (seenNonAscii == false) {
  1108. u8b_swap->bufNILimit = destIx;
  1109. }
  1110. u8b_swap->toUCharsMapStart = u8b_swap->bufNativeStart;
  1111. // Set UText chunk to refer to this buffer.
  1112. ut->chunkContents = buf;
  1113. ut->chunkOffset = 0;
  1114. ut->chunkLength = u8b_swap->bufLimitIdx;
  1115. ut->chunkNativeStart = u8b_swap->bufNativeStart;
  1116. ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
  1117. ut->nativeIndexingLimit = u8b_swap->bufNILimit;
  1118. // For zero terminated strings, keep track of the maximum point
  1119. // scanned so far.
  1120. if (nulTerminated && srcIx>ut->c) {
  1121. ut->c = srcIx;
  1122. if (c==0) {
  1123. // We scanned to the end.
  1124. // Remember the actual length.
  1125. ut->b = srcIx;
  1126. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1127. }
  1128. }
  1129. return true;
  1130. }
  1131. fillReverse:
  1132. {
  1133. // Move the incoming index to a code point boundary.
  1134. // Can only do this if the incoming index is somewhere in the interior of the string.
  1135. // If index is at the end, there is no character there to look at.
  1136. if (ix != ut->b) {
  1137. // Note: this function will only move the index back if it is on a trail byte
  1138. // and there is a preceding lead byte and the sequence from the lead
  1139. // through this trail could be part of a valid UTF-8 sequence
  1140. // Otherwise the index remains unchanged.
  1141. U8_SET_CP_START(s8, 0, ix);
  1142. }
  1143. // Swap the UText buffers.
  1144. // We want to fill what was previously the alternate buffer,
  1145. // and make what was the current buffer be the new alternate.
  1146. UTF8Buf *u8b_swap = (UTF8Buf *)ut->q;
  1147. ut->q = ut->p;
  1148. ut->p = u8b_swap;
  1149. char16_t *buf = u8b_swap->buf;
  1150. uint8_t *mapToNative = u8b_swap->mapToNative;
  1151. uint8_t *mapToUChars = u8b_swap->mapToUChars;
  1152. int32_t toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + 1;
  1153. // Note that toUCharsMapStart can be negative. Happens when the remaining
  1154. // text from current position to the beginning is less than the buffer size.
  1155. // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
  1156. int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
  1157. // at end of buffer to leave room
  1158. // for a surrogate pair at the
  1159. // buffer start.
  1160. int32_t srcIx = ix;
  1161. int32_t bufNILimit = destIx;
  1162. UChar32 c;
  1163. // Map to/from Native Indexes, fill in for the position at the end of
  1164. // the buffer.
  1165. //
  1166. mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1167. mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1168. // Fill the chunk buffer
  1169. // Work backwards, filling from the end of the buffer towards the front.
  1170. //
  1171. while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
  1172. srcIx--;
  1173. destIx--;
  1174. // Get last byte of the UTF-8 character
  1175. c = s8[srcIx];
  1176. if (c<0x80) {
  1177. // Special case ASCII range for speed.
  1178. buf[destIx] = (char16_t)c;
  1179. U_ASSERT(toUCharsMapStart <= srcIx);
  1180. mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1181. mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1182. } else {
  1183. // General case, handle everything non-ASCII.
  1184. int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
  1185. // Get the full character from the UTF8 string.
  1186. // use code derived from the macros in utf8.h
  1187. // Leaves srcIx pointing at the first byte of the UTF-8 char.
  1188. //
  1189. c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
  1190. // leaves srcIx at first byte of the multi-byte char.
  1191. // Store the character in UTF-16 buffer.
  1192. if (c<0x10000) {
  1193. buf[destIx] = (char16_t)c;
  1194. mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1195. } else {
  1196. buf[destIx] = U16_TRAIL(c);
  1197. mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1198. buf[--destIx] = U16_LEAD(c);
  1199. mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1200. }
  1201. // Fill in the map from native indexes to UChars buf index.
  1202. do {
  1203. mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
  1204. } while (sIx >= srcIx);
  1205. U_ASSERT(toUCharsMapStart <= (srcIx+1));
  1206. // Set native indexing limit to be the current position.
  1207. // We are processing a non-ascii, non-native-indexing char now;
  1208. // the limit will be here if the rest of the chars to be
  1209. // added to this buffer are ascii.
  1210. bufNILimit = destIx;
  1211. }
  1212. }
  1213. u8b_swap->bufNativeStart = srcIx;
  1214. u8b_swap->bufNativeLimit = ix;
  1215. u8b_swap->bufStartIdx = destIx;
  1216. u8b_swap->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
  1217. u8b_swap->bufNILimit = bufNILimit - u8b_swap->bufStartIdx;
  1218. u8b_swap->toUCharsMapStart = toUCharsMapStart;
  1219. ut->chunkContents = &buf[u8b_swap->bufStartIdx];
  1220. ut->chunkLength = u8b_swap->bufLimitIdx - u8b_swap->bufStartIdx;
  1221. ut->chunkOffset = ut->chunkLength;
  1222. ut->chunkNativeStart = u8b_swap->bufNativeStart;
  1223. ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
  1224. ut->nativeIndexingLimit = u8b_swap->bufNILimit;
  1225. return true;
  1226. }
  1227. }
  1228. //
  1229. // This is a slightly modified copy of u_strFromUTF8,
  1230. // Inserts a Replacement Char rather than failing on invalid UTF-8
  1231. // Removes unnecessary features.
  1232. //
  1233. static char16_t*
  1234. utext_strFromUTF8(char16_t *dest,
  1235. int32_t destCapacity,
  1236. int32_t *pDestLength,
  1237. const char* src,
  1238. int32_t srcLength, // required. NUL terminated not supported.
  1239. UErrorCode *pErrorCode
  1240. )
  1241. {
  1242. char16_t *pDest = dest;
  1243. char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr;
  1244. UChar32 ch=0;
  1245. int32_t index = 0;
  1246. int32_t reqLength = 0;
  1247. uint8_t* pSrc = (uint8_t*) src;
  1248. while((index < srcLength)&&(pDest<pDestLimit)){
  1249. ch = pSrc[index++];
  1250. if(ch <=0x7f){
  1251. *pDest++=(char16_t)ch;
  1252. }else{
  1253. ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
  1254. if(U_IS_BMP(ch)){
  1255. *(pDest++)=(char16_t)ch;
  1256. }else{
  1257. *(pDest++)=U16_LEAD(ch);
  1258. if(pDest<pDestLimit){
  1259. *(pDest++)=U16_TRAIL(ch);
  1260. }else{
  1261. reqLength++;
  1262. break;
  1263. }
  1264. }
  1265. }
  1266. }
  1267. /* donot fill the dest buffer just count the UChars needed */
  1268. while(index < srcLength){
  1269. ch = pSrc[index++];
  1270. if(ch <= 0x7f){
  1271. reqLength++;
  1272. }else{
  1273. ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
  1274. reqLength+=U16_LENGTH(ch);
  1275. }
  1276. }
  1277. reqLength+=(int32_t)(pDest - dest);
  1278. if(pDestLength){
  1279. *pDestLength = reqLength;
  1280. }
  1281. /* Terminate the buffer */
  1282. u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  1283. return dest;
  1284. }
  1285. static int32_t U_CALLCONV
  1286. utf8TextExtract(UText *ut,
  1287. int64_t start, int64_t limit,
  1288. char16_t *dest, int32_t destCapacity,
  1289. UErrorCode *pErrorCode) {
  1290. if(U_FAILURE(*pErrorCode)) {
  1291. return 0;
  1292. }
  1293. if(destCapacity<0 || (dest==nullptr && destCapacity>0)) {
  1294. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1295. return 0;
  1296. }
  1297. int32_t length = ut->b;
  1298. int32_t start32 = pinIndex(start, length);
  1299. int32_t limit32 = pinIndex(limit, length);
  1300. if(start32>limit32) {
  1301. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1302. return 0;
  1303. }
  1304. // adjust the incoming indexes to land on code point boundaries if needed.
  1305. // adjust by no more than three, because that is the largest number of trail bytes
  1306. // in a well formed UTF8 character.
  1307. const uint8_t *buf = (const uint8_t *)ut->context;
  1308. int i;
  1309. if (start32 < ut->chunkNativeLimit) {
  1310. for (i=0; i<3; i++) {
  1311. if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
  1312. break;
  1313. }
  1314. start32--;
  1315. }
  1316. }
  1317. if (limit32 < ut->chunkNativeLimit) {
  1318. for (i=0; i<3; i++) {
  1319. if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
  1320. break;
  1321. }
  1322. limit32--;
  1323. }
  1324. }
  1325. // Do the actual extract.
  1326. int32_t destLength=0;
  1327. utext_strFromUTF8(dest, destCapacity, &destLength,
  1328. (const char *)ut->context+start32, limit32-start32,
  1329. pErrorCode);
  1330. utf8TextAccess(ut, limit32, true);
  1331. return destLength;
  1332. }
  1333. //
  1334. // utf8TextMapOffsetToNative
  1335. //
  1336. // Map a chunk (UTF-16) offset to a native index.
  1337. static int64_t U_CALLCONV
  1338. utf8TextMapOffsetToNative(const UText *ut) {
  1339. //
  1340. UTF8Buf *u8b = (UTF8Buf *)ut->p;
  1341. U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
  1342. int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
  1343. U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
  1344. return nativeOffset;
  1345. }
  1346. //
  1347. // Map a native index to the corresponding chunk offset
  1348. //
  1349. static int32_t U_CALLCONV
  1350. utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
  1351. U_ASSERT(index64 <= 0x7fffffff);
  1352. int32_t index = (int32_t)index64;
  1353. UTF8Buf *u8b = (UTF8Buf *)ut->p;
  1354. U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
  1355. U_ASSERT(index<=ut->chunkNativeLimit);
  1356. int32_t mapIndex = index - u8b->toUCharsMapStart;
  1357. U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
  1358. int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1359. U_ASSERT(offset>=0 && offset<=ut->chunkLength);
  1360. return offset;
  1361. }
  1362. static UText * U_CALLCONV
  1363. utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
  1364. {
  1365. // First do a generic shallow clone. Does everything needed for the UText struct itself.
  1366. dest = shallowTextClone(dest, src, status);
  1367. // For deep clones, make a copy of the string.
  1368. // The copied storage is owned by the newly created clone.
  1369. //
  1370. // TODO: There is an issue with using utext_nativeLength().
  1371. // That function is non-const in cases where the input was NUL terminated
  1372. // and the length has not yet been determined.
  1373. // This function (clone()) is const.
  1374. // There potentially a thread safety issue lurking here.
  1375. //
  1376. if (deep && U_SUCCESS(*status)) {
  1377. int32_t len = (int32_t)utext_nativeLength((UText *)src);
  1378. char *copyStr = (char *)uprv_malloc(len+1);
  1379. if (copyStr == nullptr) {
  1380. *status = U_MEMORY_ALLOCATION_ERROR;
  1381. } else {
  1382. uprv_memcpy(copyStr, src->context, len+1);
  1383. dest->context = copyStr;
  1384. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1385. }
  1386. }
  1387. return dest;
  1388. }
  1389. static void U_CALLCONV
  1390. utf8TextClose(UText *ut) {
  1391. // Most of the work of close is done by the generic UText framework close.
  1392. // All that needs to be done here is to delete the UTF8 string if the UText
  1393. // owns it. This occurs if the UText was created by cloning.
  1394. if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  1395. char *s = (char *)ut->context;
  1396. uprv_free(s);
  1397. ut->context = nullptr;
  1398. }
  1399. }
  1400. U_CDECL_END
  1401. static const struct UTextFuncs utf8Funcs =
  1402. {
  1403. sizeof(UTextFuncs),
  1404. 0, 0, 0, // Reserved alignment padding
  1405. utf8TextClone,
  1406. utf8TextLength,
  1407. utf8TextAccess,
  1408. utf8TextExtract,
  1409. nullptr, /* replace*/
  1410. nullptr, /* copy */
  1411. utf8TextMapOffsetToNative,
  1412. utf8TextMapIndexToUTF16,
  1413. utf8TextClose,
  1414. nullptr, // spare 1
  1415. nullptr, // spare 2
  1416. nullptr // spare 3
  1417. };
  1418. static const char gEmptyString[] = {0};
  1419. U_CAPI UText * U_EXPORT2
  1420. utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
  1421. if(U_FAILURE(*status)) {
  1422. return nullptr;
  1423. }
  1424. if(s==nullptr && length==0) {
  1425. s = gEmptyString;
  1426. }
  1427. if(s==nullptr || length<-1 || length>INT32_MAX) {
  1428. *status=U_ILLEGAL_ARGUMENT_ERROR;
  1429. return nullptr;
  1430. }
  1431. ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
  1432. if (U_FAILURE(*status)) {
  1433. return ut;
  1434. }
  1435. ut->pFuncs = &utf8Funcs;
  1436. ut->context = s;
  1437. ut->b = (int32_t)length;
  1438. ut->c = (int32_t)length;
  1439. if (ut->c < 0) {
  1440. ut->c = 0;
  1441. ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1442. }
  1443. ut->p = ut->pExtra;
  1444. ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
  1445. return ut;
  1446. }
  1447. //------------------------------------------------------------------------------
  1448. //
  1449. // UText implementation wrapper for Replaceable (read/write)
  1450. //
  1451. // Use of UText data members:
  1452. // context pointer to Replaceable.
//      providerProperties  UTEXT_PROVIDER_OWNS_TEXT is set when the Replaceable is owned
//                          by the UText (deep clone); close() then deletes it.
  1454. //
  1455. //------------------------------------------------------------------------------
  1456. // minimum chunk size for this implementation: 3
  1457. // to allow for possible trimming for code point boundaries
  1458. enum { REP_TEXT_CHUNK_SIZE=10 };
  1459. struct ReplExtra {
  1460. /*
  1461. * Chunk UChars.
  1462. * +1 to simplify filling with surrogate pair at the end.
  1463. */
  1464. char16_t s[REP_TEXT_CHUNK_SIZE+1];
  1465. };
  1466. U_CDECL_BEGIN
  1467. static UText * U_CALLCONV
  1468. repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  1469. // First do a generic shallow clone. Does everything needed for the UText struct itself.
  1470. dest = shallowTextClone(dest, src, status);
  1471. // For deep clones, make a copy of the Replaceable.
  1472. // The copied Replaceable storage is owned by the newly created UText clone.
  1473. // A non-nullptr pointer in UText.p is the signal to the close() function to delete
  1474. // it.
  1475. //
  1476. if (deep && U_SUCCESS(*status)) {
  1477. const Replaceable *replSrc = (const Replaceable *)src->context;
  1478. dest->context = replSrc->clone();
  1479. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1480. // with deep clone, the copy is writable, even when the source is not.
  1481. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1482. }
  1483. return dest;
  1484. }
  1485. static void U_CALLCONV
  1486. repTextClose(UText *ut) {
  1487. // Most of the work of close is done by the generic UText framework close.
  1488. // All that needs to be done here is delete the Replaceable if the UText
  1489. // owns it. This occurs if the UText was created by cloning.
  1490. if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  1491. Replaceable *rep = (Replaceable *)ut->context;
  1492. delete rep;
  1493. ut->context = nullptr;
  1494. }
  1495. }
  1496. static int64_t U_CALLCONV
  1497. repTextLength(UText *ut) {
  1498. const Replaceable *replSrc = (const Replaceable *)ut->context;
  1499. int32_t len = replSrc->length();
  1500. return len;
  1501. }
  1502. static UBool U_CALLCONV
  1503. repTextAccess(UText *ut, int64_t index, UBool forward) {
  1504. const Replaceable *rep=(const Replaceable *)ut->context;
  1505. int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
  1506. // clip the requested index to the limits of the text.
  1507. int32_t index32 = pinIndex(index, length);
  1508. U_ASSERT(index<=INT32_MAX);
  1509. /*
  1510. * Compute start/limit boundaries around index, for a segment of text
  1511. * to be extracted.
  1512. * To allow for the possibility that our user gave an index to the trailing
  1513. * half of a surrogate pair, we must request one extra preceding char16_t when
  1514. * going in the forward direction. This will ensure that the buffer has the
  1515. * entire code point at the specified index.
  1516. */
  1517. if(forward) {
  1518. if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
  1519. // Buffer already contains the requested position.
  1520. ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
  1521. return true;
  1522. }
  1523. if (index32>=length && ut->chunkNativeLimit==length) {
  1524. // Request for end of string, and buffer already extends up to it.
  1525. // Can't get the data, but don't change the buffer.
  1526. ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
  1527. return false;
  1528. }
  1529. ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
  1530. // Going forward, so we want to have the buffer with stuff at and beyond
  1531. // the requested index. The -1 gets us one code point before the
  1532. // requested index also, to handle the case of the index being on
  1533. // a trail surrogate of a surrogate pair.
  1534. if(ut->chunkNativeLimit > length) {
  1535. ut->chunkNativeLimit = length;
  1536. }
  1537. // unless buffer ran off end, start is index-1.
  1538. ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
  1539. if(ut->chunkNativeStart < 0) {
  1540. ut->chunkNativeStart = 0;
  1541. }
  1542. } else {
  1543. // Reverse iteration. Fill buffer with data preceding the requested index.
  1544. if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
  1545. // Requested position already in buffer.
  1546. ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
  1547. return true;
  1548. }
  1549. if (index32==0 && ut->chunkNativeStart==0) {
  1550. // Request for start, buffer already begins at start.
  1551. // No data, but keep the buffer as is.
  1552. ut->chunkOffset = 0;
  1553. return false;
  1554. }
  1555. // Figure out the bounds of the chunk to extract for reverse iteration.
  1556. // Need to worry about chunk not splitting surrogate pairs, and while still
  1557. // containing the data we need.
  1558. // Fix by requesting a chunk that includes an extra char16_t at the end.
  1559. // If this turns out to be a lead surrogate, we can lop it off and still have
  1560. // the data we wanted.
  1561. ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
  1562. if (ut->chunkNativeStart < 0) {
  1563. ut->chunkNativeStart = 0;
  1564. }
  1565. ut->chunkNativeLimit = index32 + 1;
  1566. if (ut->chunkNativeLimit > length) {
  1567. ut->chunkNativeLimit = length;
  1568. }
  1569. }
  1570. // Extract the new chunk of text from the Replaceable source.
  1571. ReplExtra *ex = (ReplExtra *)ut->pExtra;
  1572. // UnicodeString with its buffer a writable alias to the chunk buffer
  1573. UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
  1574. rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
  1575. ut->chunkContents = ex->s;
  1576. ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
  1577. ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
  1578. // Surrogate pairs from the input text must not span chunk boundaries.
  1579. // If end of chunk could be the start of a surrogate, trim it off.
  1580. if (ut->chunkNativeLimit < length &&
  1581. U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
  1582. ut->chunkLength--;
  1583. ut->chunkNativeLimit--;
  1584. if (ut->chunkOffset > ut->chunkLength) {
  1585. ut->chunkOffset = ut->chunkLength;
  1586. }
  1587. }
  1588. // if the first char16_t in the chunk could be the trailing half of a surrogate pair,
  1589. // trim it off.
  1590. if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
  1591. ++(ut->chunkContents);
  1592. ++(ut->chunkNativeStart);
  1593. --(ut->chunkLength);
  1594. --(ut->chunkOffset);
  1595. }
  1596. // adjust the index/chunkOffset to a code point boundary
  1597. U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
  1598. // Use fast indexing for get/setNativeIndex()
  1599. ut->nativeIndexingLimit = ut->chunkLength;
  1600. return true;
  1601. }
  1602. static int32_t U_CALLCONV
  1603. repTextExtract(UText *ut,
  1604. int64_t start, int64_t limit,
  1605. char16_t *dest, int32_t destCapacity,
  1606. UErrorCode *status) {
  1607. const Replaceable *rep=(const Replaceable *)ut->context;
  1608. int32_t length=rep->length();
  1609. if(U_FAILURE(*status)) {
  1610. return 0;
  1611. }
  1612. if(destCapacity<0 || (dest==nullptr && destCapacity>0)) {
  1613. *status=U_ILLEGAL_ARGUMENT_ERROR;
  1614. }
  1615. if(start>limit) {
  1616. *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1617. return 0;
  1618. }
  1619. int32_t start32 = pinIndex(start, length);
  1620. int32_t limit32 = pinIndex(limit, length);
  1621. // adjust start, limit if they point to trail half of surrogates
  1622. if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
  1623. U_IS_SUPPLEMENTARY(rep->char32At(start32))){
  1624. start32--;
  1625. }
  1626. if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
  1627. U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
  1628. limit32--;
  1629. }
  1630. length=limit32-start32;
  1631. if(length>destCapacity) {
  1632. limit32 = start32 + destCapacity;
  1633. }
  1634. UnicodeString buffer(dest, 0, destCapacity); // writable alias
  1635. rep->extractBetween(start32, limit32, buffer);
  1636. repTextAccess(ut, limit32, true);
  1637. return u_terminateUChars(dest, destCapacity, length, status);
  1638. }
  1639. static int32_t U_CALLCONV
  1640. repTextReplace(UText *ut,
  1641. int64_t start, int64_t limit,
  1642. const char16_t *src, int32_t length,
  1643. UErrorCode *status) {
  1644. Replaceable *rep=(Replaceable *)ut->context;
  1645. int32_t oldLength;
  1646. if(U_FAILURE(*status)) {
  1647. return 0;
  1648. }
  1649. if(src==nullptr && length!=0) {
  1650. *status=U_ILLEGAL_ARGUMENT_ERROR;
  1651. return 0;
  1652. }
  1653. oldLength=rep->length(); // will subtract from new length
  1654. if(start>limit ) {
  1655. *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1656. return 0;
  1657. }
  1658. int32_t start32 = pinIndex(start, oldLength);
  1659. int32_t limit32 = pinIndex(limit, oldLength);
  1660. // Snap start & limit to code point boundaries.
  1661. if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
  1662. start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
  1663. {
  1664. start32--;
  1665. }
  1666. if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
  1667. U16_IS_TRAIL(rep->charAt(limit32)))
  1668. {
  1669. limit32++;
  1670. }
  1671. // Do the actual replace operation using methods of the Replaceable class
  1672. UnicodeString replStr(length < 0, src, length); // read-only alias
  1673. rep->handleReplaceBetween(start32, limit32, replStr);
  1674. int32_t newLength = rep->length();
  1675. int32_t lengthDelta = newLength - oldLength;
  1676. // Is the UText chunk buffer OK?
  1677. if (ut->chunkNativeLimit > start32) {
  1678. // this replace operation may have impacted the current chunk.
  1679. // invalidate it, which will force a reload on the next access.
  1680. invalidateChunk(ut);
  1681. }
  1682. // set the iteration position to the end of the newly inserted replacement text.
  1683. int32_t newIndexPos = limit32 + lengthDelta;
  1684. repTextAccess(ut, newIndexPos, true);
  1685. return lengthDelta;
  1686. }
  1687. static void U_CALLCONV
  1688. repTextCopy(UText *ut,
  1689. int64_t start, int64_t limit,
  1690. int64_t destIndex,
  1691. UBool move,
  1692. UErrorCode *status)
  1693. {
  1694. Replaceable *rep=(Replaceable *)ut->context;
  1695. int32_t length=rep->length();
  1696. if(U_FAILURE(*status)) {
  1697. return;
  1698. }
  1699. if (start>limit || (start<destIndex && destIndex<limit))
  1700. {
  1701. *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1702. return;
  1703. }
  1704. int32_t start32 = pinIndex(start, length);
  1705. int32_t limit32 = pinIndex(limit, length);
  1706. int32_t destIndex32 = pinIndex(destIndex, length);
  1707. // TODO: snap input parameters to code point boundaries.
  1708. if(move) {
  1709. // move: copy to destIndex, then replace original with nothing
  1710. int32_t segLength=limit32-start32;
  1711. rep->copy(start32, limit32, destIndex32);
  1712. if(destIndex32<start32) {
  1713. start32+=segLength;
  1714. limit32+=segLength;
  1715. }
  1716. rep->handleReplaceBetween(start32, limit32, UnicodeString());
  1717. } else {
  1718. // copy
  1719. rep->copy(start32, limit32, destIndex32);
  1720. }
  1721. // If the change to the text touched the region in the chunk buffer,
  1722. // invalidate the buffer.
  1723. int32_t firstAffectedIndex = destIndex32;
  1724. if (move && start32<firstAffectedIndex) {
  1725. firstAffectedIndex = start32;
  1726. }
  1727. if (firstAffectedIndex < ut->chunkNativeLimit) {
  1728. // changes may have affected range covered by the chunk
  1729. invalidateChunk(ut);
  1730. }
  1731. // Put iteration position at the newly inserted (moved) block,
  1732. int32_t nativeIterIndex = destIndex32 + limit32 - start32;
  1733. if (move && destIndex32>start32) {
  1734. // moved a block of text towards the end of the string.
  1735. nativeIterIndex = destIndex32;
  1736. }
  1737. // Set position, reload chunk if needed.
  1738. repTextAccess(ut, nativeIterIndex, true);
  1739. }
  1740. static const struct UTextFuncs repFuncs =
  1741. {
  1742. sizeof(UTextFuncs),
  1743. 0, 0, 0, // Reserved alignment padding
  1744. repTextClone,
  1745. repTextLength,
  1746. repTextAccess,
  1747. repTextExtract,
  1748. repTextReplace,
  1749. repTextCopy,
  1750. nullptr, // MapOffsetToNative,
  1751. nullptr, // MapIndexToUTF16,
  1752. repTextClose,
  1753. nullptr, // spare 1
  1754. nullptr, // spare 2
  1755. nullptr // spare 3
  1756. };
  1757. U_CAPI UText * U_EXPORT2
  1758. utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
  1759. {
  1760. if(U_FAILURE(*status)) {
  1761. return nullptr;
  1762. }
  1763. if(rep==nullptr) {
  1764. *status=U_ILLEGAL_ARGUMENT_ERROR;
  1765. return nullptr;
  1766. }
  1767. ut = utext_setup(ut, sizeof(ReplExtra), status);
  1768. if(U_FAILURE(*status)) {
  1769. return ut;
  1770. }
  1771. ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1772. if(rep->hasMetaData()) {
  1773. ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
  1774. }
  1775. ut->pFuncs = &repFuncs;
  1776. ut->context = rep;
  1777. return ut;
  1778. }
  1779. U_CDECL_END
  1780. //------------------------------------------------------------------------------
  1781. //
  1782. // UText implementation for UnicodeString (read/write) and
  1783. // for const UnicodeString (read only)
  1784. // (same implementation, only the flags are different)
  1785. //
  1786. // Use of UText data members:
  1787. // context pointer to UnicodeString
//      providerProperties  UTEXT_PROVIDER_OWNS_TEXT is set when this UText owns the
//                          UnicodeString (deep clone); close() then deletes it.
  1790. //
  1791. //------------------------------------------------------------------------------
  1792. U_CDECL_BEGIN
  1793. static UText * U_CALLCONV
  1794. unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  1795. // First do a generic shallow clone. Does everything needed for the UText struct itself.
  1796. dest = shallowTextClone(dest, src, status);
  1797. // For deep clones, make a copy of the UnicodeSring.
  1798. // The copied UnicodeString storage is owned by the newly created UText clone.
  1799. // A non-nullptr pointer in UText.p is the signal to the close() function to delete
  1800. // the UText.
  1801. //
  1802. if (deep && U_SUCCESS(*status)) {
  1803. const UnicodeString *srcString = (const UnicodeString *)src->context;
  1804. dest->context = new UnicodeString(*srcString);
  1805. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1806. // with deep clone, the copy is writable, even when the source is not.
  1807. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1808. }
  1809. return dest;
  1810. }
  1811. static void U_CALLCONV
  1812. unistrTextClose(UText *ut) {
  1813. // Most of the work of close is done by the generic UText framework close.
  1814. // All that needs to be done here is delete the UnicodeString if the UText
  1815. // owns it. This occurs if the UText was created by cloning.
  1816. if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  1817. UnicodeString *str = (UnicodeString *)ut->context;
  1818. delete str;
  1819. ut->context = nullptr;
  1820. }
  1821. }
  1822. static int64_t U_CALLCONV
  1823. unistrTextLength(UText *t) {
  1824. return ((const UnicodeString *)t->context)->length();
  1825. }
  1826. static UBool U_CALLCONV
  1827. unistrTextAccess(UText *ut, int64_t index, UBool forward) {
  1828. int32_t length = ut->chunkLength;
  1829. ut->chunkOffset = pinIndex(index, length);
  1830. // Check whether request is at the start or end
  1831. UBool retVal = (forward && index<length) || (!forward && index>0);
  1832. return retVal;
  1833. }
  1834. static int32_t U_CALLCONV
  1835. unistrTextExtract(UText *t,
  1836. int64_t start, int64_t limit,
  1837. char16_t *dest, int32_t destCapacity,
  1838. UErrorCode *pErrorCode) {
  1839. const UnicodeString *us=(const UnicodeString *)t->context;
  1840. int32_t length=us->length();
  1841. if(U_FAILURE(*pErrorCode)) {
  1842. return 0;
  1843. }
  1844. if(destCapacity<0 || (dest==nullptr && destCapacity>0)) {
  1845. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1846. }
  1847. if(start<0 || start>limit) {
  1848. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1849. return 0;
  1850. }
  1851. int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
  1852. int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
  1853. length=limit32-start32;
  1854. if (destCapacity>0 && dest!=nullptr) {
  1855. int32_t trimmedLength = length;
  1856. if(trimmedLength>destCapacity) {
  1857. trimmedLength=destCapacity;
  1858. }
  1859. us->extract(start32, trimmedLength, dest);
  1860. t->chunkOffset = start32+trimmedLength;
  1861. } else {
  1862. t->chunkOffset = start32;
  1863. }
  1864. u_terminateUChars(dest, destCapacity, length, pErrorCode);
  1865. return length;
  1866. }
  1867. static int32_t U_CALLCONV
  1868. unistrTextReplace(UText *ut,
  1869. int64_t start, int64_t limit,
  1870. const char16_t *src, int32_t length,
  1871. UErrorCode *pErrorCode) {
  1872. UnicodeString *us=(UnicodeString *)ut->context;
  1873. int32_t oldLength;
  1874. if(U_FAILURE(*pErrorCode)) {
  1875. return 0;
  1876. }
  1877. if(src==nullptr && length!=0) {
  1878. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1879. }
  1880. if(start>limit) {
  1881. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1882. return 0;
  1883. }
  1884. oldLength=us->length();
  1885. int32_t start32 = pinIndex(start, oldLength);
  1886. int32_t limit32 = pinIndex(limit, oldLength);
  1887. if (start32 < oldLength) {
  1888. start32 = us->getChar32Start(start32);
  1889. }
  1890. if (limit32 < oldLength) {
  1891. limit32 = us->getChar32Start(limit32);
  1892. }
  1893. // replace
  1894. us->replace(start32, limit32-start32, src, length);
  1895. int32_t newLength = us->length();
  1896. // Update the chunk description.
  1897. ut->chunkContents = us->getBuffer();
  1898. ut->chunkLength = newLength;
  1899. ut->chunkNativeLimit = newLength;
  1900. ut->nativeIndexingLimit = newLength;
  1901. // Set iteration position to the point just following the newly inserted text.
  1902. int32_t lengthDelta = newLength - oldLength;
  1903. ut->chunkOffset = limit32 + lengthDelta;
  1904. return lengthDelta;
  1905. }
  1906. static void U_CALLCONV
  1907. unistrTextCopy(UText *ut,
  1908. int64_t start, int64_t limit,
  1909. int64_t destIndex,
  1910. UBool move,
  1911. UErrorCode *pErrorCode) {
  1912. UnicodeString *us=(UnicodeString *)ut->context;
  1913. int32_t length=us->length();
  1914. if(U_FAILURE(*pErrorCode)) {
  1915. return;
  1916. }
  1917. int32_t start32 = pinIndex(start, length);
  1918. int32_t limit32 = pinIndex(limit, length);
  1919. int32_t destIndex32 = pinIndex(destIndex, length);
  1920. if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
  1921. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1922. return;
  1923. }
  1924. if(move) {
  1925. // move: copy to destIndex, then remove original
  1926. int32_t segLength=limit32-start32;
  1927. us->copy(start32, limit32, destIndex32);
  1928. if(destIndex32<start32) {
  1929. start32+=segLength;
  1930. }
  1931. us->remove(start32, segLength);
  1932. } else {
  1933. // copy
  1934. us->copy(start32, limit32, destIndex32);
  1935. }
  1936. // update chunk description, set iteration position.
  1937. ut->chunkContents = us->getBuffer();
  1938. if (move==false) {
  1939. // copy operation, string length grows
  1940. ut->chunkLength += limit32-start32;
  1941. ut->chunkNativeLimit = ut->chunkLength;
  1942. ut->nativeIndexingLimit = ut->chunkLength;
  1943. }
  1944. // Iteration position to end of the newly inserted text.
  1945. ut->chunkOffset = destIndex32+limit32-start32;
  1946. if (move && destIndex32>start32) {
  1947. ut->chunkOffset = destIndex32;
  1948. }
  1949. }
  1950. static const struct UTextFuncs unistrFuncs =
  1951. {
  1952. sizeof(UTextFuncs),
  1953. 0, 0, 0, // Reserved alignment padding
  1954. unistrTextClone,
  1955. unistrTextLength,
  1956. unistrTextAccess,
  1957. unistrTextExtract,
  1958. unistrTextReplace,
  1959. unistrTextCopy,
  1960. nullptr, // MapOffsetToNative,
  1961. nullptr, // MapIndexToUTF16,
  1962. unistrTextClose,
  1963. nullptr, // spare 1
  1964. nullptr, // spare 2
  1965. nullptr // spare 3
  1966. };
  1967. U_CDECL_END
  1968. U_CAPI UText * U_EXPORT2
  1969. utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
  1970. ut = utext_openConstUnicodeString(ut, s, status);
  1971. if (U_SUCCESS(*status)) {
  1972. ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1973. }
  1974. return ut;
  1975. }
  1976. U_CAPI UText * U_EXPORT2
  1977. utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
  1978. if (U_SUCCESS(*status) && s->isBogus()) {
  1979. // The UnicodeString is bogus, but we still need to detach the UText
  1980. // from whatever it was hooked to before, if anything.
  1981. utext_openUChars(ut, nullptr, 0, status);
  1982. *status = U_ILLEGAL_ARGUMENT_ERROR;
  1983. return ut;
  1984. }
  1985. ut = utext_setup(ut, 0, status);
  1986. // note: use the standard (writable) function table for UnicodeString.
  1987. // The flag settings disable writing, so having the functions in
  1988. // the table is harmless.
  1989. if (U_SUCCESS(*status)) {
  1990. ut->pFuncs = &unistrFuncs;
  1991. ut->context = s;
  1992. ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
  1993. ut->chunkContents = s->getBuffer();
  1994. ut->chunkLength = s->length();
  1995. ut->chunkNativeStart = 0;
  1996. ut->chunkNativeLimit = ut->chunkLength;
  1997. ut->nativeIndexingLimit = ut->chunkLength;
  1998. }
  1999. return ut;
  2000. }
  2001. //------------------------------------------------------------------------------
  2002. //
  2003. // UText implementation for const char16_t * strings
  2004. //
  2005. // Use of UText data members:
//      context    pointer to the const char16_t string
  2007. // a length. -1 if not yet known.
  2008. //
  2009. // TODO: support 64 bit lengths.
  2010. //
  2011. //------------------------------------------------------------------------------
  2012. U_CDECL_BEGIN
  2013. static UText * U_CALLCONV
  2014. ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
  2015. // First do a generic shallow clone.
  2016. dest = shallowTextClone(dest, src, status);
  2017. // For deep clones, make a copy of the string.
  2018. // The copied storage is owned by the newly created clone.
  2019. // A non-nullptr pointer in UText.p is the signal to the close() function to delete
  2020. // it.
  2021. //
  2022. if (deep && U_SUCCESS(*status)) {
  2023. U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
  2024. int32_t len = (int32_t)utext_nativeLength(dest);
  2025. // The cloned string IS going to be NUL terminated, whether or not the original was.
  2026. const char16_t *srcStr = (const char16_t *)src->context;
  2027. char16_t *copyStr = (char16_t *)uprv_malloc((len+1) * sizeof(char16_t));
  2028. if (copyStr == nullptr) {
  2029. *status = U_MEMORY_ALLOCATION_ERROR;
  2030. } else {
  2031. int64_t i;
  2032. for (i=0; i<len; i++) {
  2033. copyStr[i] = srcStr[i];
  2034. }
  2035. copyStr[len] = 0;
  2036. dest->context = copyStr;
  2037. dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  2038. }
  2039. }
  2040. return dest;
  2041. }
  2042. static void U_CALLCONV
  2043. ucstrTextClose(UText *ut) {
  2044. // Most of the work of close is done by the generic UText framework close.
  2045. // All that needs to be done here is delete the string if the UText
  2046. // owns it. This occurs if the UText was created by cloning.
  2047. if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  2048. char16_t *s = (char16_t *)ut->context;
  2049. uprv_free(s);
  2050. ut->context = nullptr;
  2051. }
  2052. }
  2053. static int64_t U_CALLCONV
  2054. ucstrTextLength(UText *ut) {
  2055. if (ut->a < 0) {
  2056. // null terminated, we don't yet know the length. Scan for it.
  2057. // Access is not convenient for doing this
  2058. // because the current iteration position can't be changed.
  2059. const char16_t *str = (const char16_t *)ut->context;
  2060. for (;;) {
  2061. if (str[ut->chunkNativeLimit] == 0) {
  2062. break;
  2063. }
  2064. ut->chunkNativeLimit++;
  2065. }
  2066. ut->a = ut->chunkNativeLimit;
  2067. ut->chunkLength = (int32_t)ut->chunkNativeLimit;
  2068. ut->nativeIndexingLimit = ut->chunkLength;
  2069. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2070. }
  2071. return ut->a;
  2072. }
  2073. static UBool U_CALLCONV
  2074. ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
  2075. const char16_t *str = (const char16_t *)ut->context;
  2076. // pin the requested index to the bounds of the string,
  2077. // and set current iteration position.
  2078. if (index<0) {
  2079. index = 0;
  2080. } else if (index < ut->chunkNativeLimit) {
  2081. // The request data is within the chunk as it is known so far.
  2082. // Put index on a code point boundary.
  2083. U16_SET_CP_START(str, 0, index);
  2084. } else if (ut->a >= 0) {
  2085. // We know the length of this string, and the user is requesting something
  2086. // at or beyond the length. Pin the requested index to the length.
  2087. index = ut->a;
  2088. } else {
  2089. // Null terminated string, length not yet known, and the requested index
  2090. // is beyond where we have scanned so far.
  2091. // Scan to 32 UChars beyond the requested index. The strategy here is
  2092. // to avoid fully scanning a long string when the caller only wants to
  2093. // see a few characters at its beginning.
  2094. int32_t scanLimit = (int32_t)index + 32;
  2095. if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
  2096. scanLimit = INT32_MAX;
  2097. }
  2098. int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
  2099. for (; chunkLimit<scanLimit; chunkLimit++) {
  2100. if (str[chunkLimit] == 0) {
  2101. // We found the end of the string. Remember it, pin the requested index to it,
  2102. // and bail out of here.
  2103. ut->a = chunkLimit;
  2104. ut->chunkLength = chunkLimit;
  2105. ut->nativeIndexingLimit = chunkLimit;
  2106. if (index >= chunkLimit) {
  2107. index = chunkLimit;
  2108. } else {
  2109. U16_SET_CP_START(str, 0, index);
  2110. }
  2111. ut->chunkNativeLimit = chunkLimit;
  2112. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2113. goto breakout;
  2114. }
  2115. }
  2116. // We scanned through the next batch of UChars without finding the end.
  2117. U16_SET_CP_START(str, 0, index);
  2118. if (chunkLimit == INT32_MAX) {
  2119. // Scanned to the limit of a 32 bit length.
  2120. // Forceably trim the overlength string back so length fits in int32
  2121. // TODO: add support for 64 bit strings.
  2122. ut->a = chunkLimit;
  2123. ut->chunkLength = chunkLimit;
  2124. ut->nativeIndexingLimit = chunkLimit;
  2125. if (index > chunkLimit) {
  2126. index = chunkLimit;
  2127. }
  2128. ut->chunkNativeLimit = chunkLimit;
  2129. ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2130. } else {
  2131. // The endpoint of a chunk must not be left in the middle of a surrogate pair.
  2132. // If the current end is on a lead surrogate, back the end up by one.
  2133. // It doesn't matter if the end char happens to be an unpaired surrogate,
  2134. // and it's simpler not to worry about it.
  2135. if (U16_IS_LEAD(str[chunkLimit-1])) {
  2136. --chunkLimit;
  2137. }
  2138. // Null-terminated chunk with end still unknown.
  2139. // Update the chunk length to reflect what has been scanned thus far.
  2140. // That the full length is still unknown is (still) flagged by
  2141. // ut->a being < 0.
  2142. ut->chunkNativeLimit = chunkLimit;
  2143. ut->nativeIndexingLimit = chunkLimit;
  2144. ut->chunkLength = chunkLimit;
  2145. }
  2146. }
  2147. breakout:
  2148. U_ASSERT(index<=INT32_MAX);
  2149. ut->chunkOffset = (int32_t)index;
  2150. // Check whether request is at the start or end
  2151. UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
  2152. return retVal;
  2153. }
  2154. static int32_t U_CALLCONV
  2155. ucstrTextExtract(UText *ut,
  2156. int64_t start, int64_t limit,
  2157. char16_t *dest, int32_t destCapacity,
  2158. UErrorCode *pErrorCode)
  2159. {
  2160. if(U_FAILURE(*pErrorCode)) {
  2161. return 0;
  2162. }
  2163. if(destCapacity<0 || (dest==nullptr && destCapacity>0) || start>limit) {
  2164. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  2165. return 0;
  2166. }
  2167. //const char16_t *s=(const char16_t *)ut->context;
  2168. int32_t si, di;
  2169. int32_t start32;
  2170. int32_t limit32;
  2171. // Access the start. Does two things we need:
  2172. // Pins 'start' to the length of the string, if it came in out-of-bounds.
  2173. // Snaps 'start' to the beginning of a code point.
  2174. ucstrTextAccess(ut, start, true);
  2175. const char16_t *s=ut->chunkContents;
  2176. start32 = ut->chunkOffset;
  2177. int32_t strLength=(int32_t)ut->a;
  2178. if (strLength >= 0) {
  2179. limit32 = pinIndex(limit, strLength);
  2180. } else {
  2181. limit32 = pinIndex(limit, INT32_MAX);
  2182. }
  2183. di = 0;
  2184. for (si=start32; si<limit32; si++) {
  2185. if (strLength<0 && s[si]==0) {
  2186. // Just hit the end of a null-terminated string.
  2187. ut->a = si; // set string length for this UText
  2188. ut->chunkNativeLimit = si;
  2189. ut->chunkLength = si;
  2190. ut->nativeIndexingLimit = si;
  2191. strLength = si;
  2192. limit32 = si;
  2193. break;
  2194. }
  2195. U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
  2196. if (di<destCapacity) {
  2197. // only store if there is space.
  2198. dest[di] = s[si];
  2199. } else {
  2200. if (strLength>=0) {
  2201. // We have filled the destination buffer, and the string length is known.
  2202. // Cut the loop short. There is no need to scan string termination.
  2203. di = limit32 - start32;
  2204. si = limit32;
  2205. break;
  2206. }
  2207. }
  2208. di++;
  2209. }
  2210. // If the limit index points to a lead surrogate of a pair,
  2211. // add the corresponding trail surrogate to the destination.
  2212. if (si>0 && U16_IS_LEAD(s[si-1]) &&
  2213. ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
  2214. {
  2215. if (di<destCapacity) {
  2216. // store only if there is space in the output buffer.
  2217. dest[di++] = s[si];
  2218. }
  2219. si++;
  2220. }
  2221. // Put iteration position at the point just following the extracted text
  2222. if (si <= ut->chunkNativeLimit) {
  2223. ut->chunkOffset = si;
  2224. } else {
  2225. ucstrTextAccess(ut, si, true);
  2226. }
  2227. // Add a terminating NUL if space in the buffer permits,
  2228. // and set the error status as required.
  2229. u_terminateUChars(dest, destCapacity, di, pErrorCode);
  2230. return di;
  2231. }
  2232. static const struct UTextFuncs ucstrFuncs =
  2233. {
  2234. sizeof(UTextFuncs),
  2235. 0, 0, 0, // Reserved alignment padding
  2236. ucstrTextClone,
  2237. ucstrTextLength,
  2238. ucstrTextAccess,
  2239. ucstrTextExtract,
  2240. nullptr, // Replace
  2241. nullptr, // Copy
  2242. nullptr, // MapOffsetToNative,
  2243. nullptr, // MapIndexToUTF16,
  2244. ucstrTextClose,
  2245. nullptr, // spare 1
  2246. nullptr, // spare 2
  2247. nullptr, // spare 3
  2248. };
  2249. U_CDECL_END
  2250. static const char16_t gEmptyUString[] = {0};
  2251. U_CAPI UText * U_EXPORT2
  2252. utext_openUChars(UText *ut, const char16_t *s, int64_t length, UErrorCode *status) {
  2253. if (U_FAILURE(*status)) {
  2254. return nullptr;
  2255. }
  2256. if(s==nullptr && length==0) {
  2257. s = gEmptyUString;
  2258. }
  2259. if (s==nullptr || length < -1 || length>INT32_MAX) {
  2260. *status = U_ILLEGAL_ARGUMENT_ERROR;
  2261. return nullptr;
  2262. }
  2263. ut = utext_setup(ut, 0, status);
  2264. if (U_SUCCESS(*status)) {
  2265. ut->pFuncs = &ucstrFuncs;
  2266. ut->context = s;
  2267. ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
  2268. if (length==-1) {
  2269. ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2270. }
  2271. ut->a = length;
  2272. ut->chunkContents = s;
  2273. ut->chunkNativeStart = 0;
  2274. ut->chunkNativeLimit = length>=0? length : 0;
  2275. ut->chunkLength = (int32_t)ut->chunkNativeLimit;
  2276. ut->chunkOffset = 0;
  2277. ut->nativeIndexingLimit = ut->chunkLength;
  2278. }
  2279. return ut;
  2280. }
  2281. //------------------------------------------------------------------------------
  2282. //
  2283. // UText implementation for text from ICU CharacterIterators
  2284. //
  2285. // Use of UText data members:
  2286. // context pointer to the CharacterIterator
  2287. // a length of the full text.
  2288. // p pointer to buffer 1
  2289. // b start index of local buffer 1 contents
  2290. // q pointer to buffer 2
  2291. // c start index of local buffer 2 contents
  2292. // r pointer to the character iterator if the UText owns it.
  2293. // Null otherwise.
  2294. //
  2295. //------------------------------------------------------------------------------
  2296. #define CIBufSize 16
  2297. U_CDECL_BEGIN
  2298. static void U_CALLCONV
  2299. charIterTextClose(UText *ut) {
  2300. // Most of the work of close is done by the generic UText framework close.
  2301. // All that needs to be done here is delete the CharacterIterator if the UText
  2302. // owns it. This occurs if the UText was created by cloning.
  2303. CharacterIterator *ci = (CharacterIterator *)ut->r;
  2304. delete ci;
  2305. ut->r = nullptr;
  2306. }
  2307. static int64_t U_CALLCONV
  2308. charIterTextLength(UText *ut) {
  2309. return (int32_t)ut->a;
  2310. }
  2311. static UBool U_CALLCONV
  2312. charIterTextAccess(UText *ut, int64_t index, UBool forward) {
  2313. CharacterIterator *ci = (CharacterIterator *)ut->context;
  2314. int32_t clippedIndex = (int32_t)index;
  2315. if (clippedIndex<0) {
  2316. clippedIndex=0;
  2317. } else if (clippedIndex>=ut->a) {
  2318. clippedIndex=(int32_t)ut->a;
  2319. }
  2320. int32_t neededIndex = clippedIndex;
  2321. if (!forward && neededIndex>0) {
  2322. // reverse iteration, want the position just before what was asked for.
  2323. neededIndex--;
  2324. } else if (forward && neededIndex==ut->a && neededIndex>0) {
  2325. // Forward iteration, don't ask for something past the end of the text.
  2326. neededIndex--;
  2327. }
  2328. // Find the native index of the start of the buffer containing what we want.
  2329. neededIndex -= neededIndex % CIBufSize;
  2330. char16_t *buf = nullptr;
  2331. UBool needChunkSetup = true;
  2332. int i;
  2333. if (ut->chunkNativeStart == neededIndex) {
  2334. // The buffer we want is already the current chunk.
  2335. needChunkSetup = false;
  2336. } else if (ut->b == neededIndex) {
  2337. // The first buffer (buffer p) has what we need.
  2338. buf = (char16_t *)ut->p;
  2339. } else if (ut->c == neededIndex) {
  2340. // The second buffer (buffer q) has what we need.
  2341. buf = (char16_t *)ut->q;
  2342. } else {
  2343. // Neither buffer already has what we need.
  2344. // Load new data from the character iterator.
  2345. // Use the buf that is not the current buffer.
  2346. buf = (char16_t *)ut->p;
  2347. if (ut->p == ut->chunkContents) {
  2348. buf = (char16_t *)ut->q;
  2349. }
  2350. ci->setIndex(neededIndex);
  2351. for (i=0; i<CIBufSize; i++) {
  2352. buf[i] = ci->nextPostInc();
  2353. if (i+neededIndex > ut->a) {
  2354. break;
  2355. }
  2356. }
  2357. }
  2358. // We have a buffer with the data we need.
  2359. // Set it up as the current chunk, if it wasn't already.
  2360. if (needChunkSetup) {
  2361. ut->chunkContents = buf;
  2362. ut->chunkLength = CIBufSize;
  2363. ut->chunkNativeStart = neededIndex;
  2364. ut->chunkNativeLimit = neededIndex + CIBufSize;
  2365. if (ut->chunkNativeLimit > ut->a) {
  2366. ut->chunkNativeLimit = ut->a;
  2367. ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
  2368. }
  2369. ut->nativeIndexingLimit = ut->chunkLength;
  2370. U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
  2371. }
  2372. ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
  2373. UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
  2374. return success;
  2375. }
  2376. static UText * U_CALLCONV
  2377. charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
  2378. if (U_FAILURE(*status)) {
  2379. return nullptr;
  2380. }
  2381. if (deep) {
  2382. // There is no CharacterIterator API for cloning the underlying text storage.
  2383. *status = U_UNSUPPORTED_ERROR;
  2384. return nullptr;
  2385. } else {
  2386. CharacterIterator *srcCI =(CharacterIterator *)src->context;
  2387. srcCI = srcCI->clone();
  2388. dest = utext_openCharacterIterator(dest, srcCI, status);
  2389. if (U_FAILURE(*status)) {
  2390. return dest;
  2391. }
  2392. // cast off const on getNativeIndex.
  2393. // For CharacterIterator based UTexts, this is safe, the operation is const.
  2394. int64_t ix = utext_getNativeIndex((UText *)src);
  2395. utext_setNativeIndex(dest, ix);
  2396. dest->r = srcCI; // flags that this UText owns the CharacterIterator
  2397. }
  2398. return dest;
  2399. }
  2400. static int32_t U_CALLCONV
  2401. charIterTextExtract(UText *ut,
  2402. int64_t start, int64_t limit,
  2403. char16_t *dest, int32_t destCapacity,
  2404. UErrorCode *status)
  2405. {
  2406. if(U_FAILURE(*status)) {
  2407. return 0;
  2408. }
  2409. if(destCapacity<0 || (dest==nullptr && destCapacity>0) || start>limit) {
  2410. *status=U_ILLEGAL_ARGUMENT_ERROR;
  2411. return 0;
  2412. }
  2413. int32_t length = (int32_t)ut->a;
  2414. int32_t start32 = pinIndex(start, length);
  2415. int32_t limit32 = pinIndex(limit, length);
  2416. int32_t desti = 0;
  2417. int32_t srci;
  2418. int32_t copyLimit;
  2419. CharacterIterator *ci = (CharacterIterator *)ut->context;
  2420. ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
  2421. srci = ci->getIndex();
  2422. copyLimit = srci;
  2423. while (srci<limit32) {
  2424. UChar32 c = ci->next32PostInc();
  2425. int32_t len = U16_LENGTH(c);
  2426. U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
  2427. if (desti+len <= destCapacity) {
  2428. U16_APPEND_UNSAFE(dest, desti, c);
  2429. copyLimit = srci+len;
  2430. } else {
  2431. desti += len;
  2432. *status = U_BUFFER_OVERFLOW_ERROR;
  2433. }
  2434. srci += len;
  2435. }
  2436. charIterTextAccess(ut, copyLimit, true);
  2437. u_terminateUChars(dest, destCapacity, desti, status);
  2438. return desti;
  2439. }
  2440. static const struct UTextFuncs charIterFuncs =
  2441. {
  2442. sizeof(UTextFuncs),
  2443. 0, 0, 0, // Reserved alignment padding
  2444. charIterTextClone,
  2445. charIterTextLength,
  2446. charIterTextAccess,
  2447. charIterTextExtract,
  2448. nullptr, // Replace
  2449. nullptr, // Copy
  2450. nullptr, // MapOffsetToNative,
  2451. nullptr, // MapIndexToUTF16,
  2452. charIterTextClose,
  2453. nullptr, // spare 1
  2454. nullptr, // spare 2
  2455. nullptr // spare 3
  2456. };
  2457. U_CDECL_END
  2458. U_CAPI UText * U_EXPORT2
  2459. utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
  2460. if (U_FAILURE(*status)) {
  2461. return nullptr;
  2462. }
  2463. if (ci->startIndex() > 0) {
  2464. // No support for CharacterIterators that do not start indexing from zero.
  2465. *status = U_UNSUPPORTED_ERROR;
  2466. return nullptr;
  2467. }
  2468. // Extra space in UText for 2 buffers of CIBufSize UChars each.
  2469. int32_t extraSpace = 2 * CIBufSize * sizeof(char16_t);
  2470. ut = utext_setup(ut, extraSpace, status);
  2471. if (U_SUCCESS(*status)) {
  2472. ut->pFuncs = &charIterFuncs;
  2473. ut->context = ci;
  2474. ut->providerProperties = 0;
  2475. ut->a = ci->endIndex(); // Length of text
  2476. ut->p = ut->pExtra; // First buffer
  2477. ut->b = -1; // Native index of first buffer contents
  2478. ut->q = (char16_t*)ut->pExtra+CIBufSize; // Second buffer
  2479. ut->c = -1; // Native index of second buffer contents
  2480. // Initialize current chunk contents to be empty.
  2481. // First access will fault something in.
  2482. // Note: The initial nativeStart and chunkOffset must sum to zero
  2483. // so that getNativeIndex() will correctly compute to zero
  2484. // if no call to Access() has ever been made. They can't be both
  2485. // zero without Access() thinking that the chunk is valid.
  2486. ut->chunkContents = (char16_t *)ut->p;
  2487. ut->chunkNativeStart = -1;
  2488. ut->chunkOffset = 1;
  2489. ut->chunkNativeLimit = 0;
  2490. ut->chunkLength = 0;
  2491. ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
  2492. }
  2493. return ut;
  2494. }