utrie2_builder.cpp 47 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2001-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: utrie2_builder.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2008sep26 (split off from utrie2.c)
  16. * created by: Markus W. Scherer
  17. *
  18. * This is a common implementation of a Unicode trie.
  19. * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
  20. * Unicode code points (0..0x10ffff).
  21. * This is the second common version of a Unicode trie (hence the name UTrie2).
  22. * See utrie2.h for a comparison.
  23. *
  24. * This file contains only the builder code.
  25. * See utrie2.c for the runtime and enumeration code.
  26. */
  27. // #define UTRIE2_DEBUG
  28. #ifdef UTRIE2_DEBUG
  29. # include <stdio.h>
  30. #endif
  31. // #define UCPTRIE_DEBUG
  32. #include "unicode/utypes.h"
  33. #ifdef UCPTRIE_DEBUG
  34. #include "unicode/ucptrie.h"
  35. #include "unicode/umutablecptrie.h"
  36. #include "ucptrie_impl.h"
  37. #endif
  38. #include "cmemory.h"
  39. #include "utrie2.h"
  40. #include "utrie2_impl.h"
  41. #include "utrie.h" // for utrie2_fromUTrie()
  42. /* Implementation notes ----------------------------------------------------- */
  43. /*
  44. * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
  45. * have been chosen to minimize trie sizes overall.
  46. * Most of the code is flexible enough to work with a range of values,
  47. * within certain limits.
  48. *
  49. * Exception: Support for separate values for lead surrogate code _units_
  50. * vs. code _points_ was added after the constants were fixed,
  51. * and has not been tested nor particularly designed for different constant values.
  52. * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
  53. * part and back.)
  54. *
  55. * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
  56. * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
  57. * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
  58. * remapping) stops working.
  59. *
  60. * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
  61. * assumes that a single index-2 block is used for 0x400 code points
  62. * corresponding to one lead surrogate.
  63. *
  64. * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
  65. * more than one Unicode plane, and the split of the index-2 table into a BMP
  66. * part and a supplementary part, with a gap in between, would not work.
  67. *
  68. * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
  69. * there is data with more than 64k distinct values,
  70. * for example for Unihan collation with a separate collation weight per
  71. * Han character.
  72. */
  73. /* Building a trie ----------------------------------------------------------*/
  74. enum {
  75. /** The null index-2 block, following the gap in the index-2 table. */
  76. UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
  77. /** The start of allocated index-2 blocks. */
  78. UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
  79. /**
  80. * The null data block.
  81. * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
  82. * to work with 6-bit trail bytes from 2-byte UTF-8.
  83. */
  84. UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
  85. /** The start of allocated data blocks. */
  86. UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
  87. /**
  88. * The start of data blocks for U+0800 and above.
  89. * Below, compaction uses a block length of 64 for 2-byte UTF-8.
  90. * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
  91. * Data values for 0x780 code points beyond ASCII.
  92. */
  93. UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
  94. };
  95. /* Start with allocation of 16k data entries. */
  96. #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
  97. /* Grow about 8x each time. */
  98. #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
  99. static int32_t
  100. allocIndex2Block(UNewTrie2 *trie);
  101. U_CAPI UTrie2 * U_EXPORT2
  102. utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
  103. UTrie2 *trie;
  104. UNewTrie2 *newTrie;
  105. uint32_t *data;
  106. int32_t i, j;
  107. if(U_FAILURE(*pErrorCode)) {
  108. return nullptr;
  109. }
  110. trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
  111. newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
  112. data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
  113. if(trie==nullptr || newTrie==nullptr || data==nullptr) {
  114. uprv_free(trie);
  115. uprv_free(newTrie);
  116. uprv_free(data);
  117. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  118. return 0;
  119. }
  120. uprv_memset(trie, 0, sizeof(UTrie2));
  121. trie->initialValue=initialValue;
  122. trie->errorValue=errorValue;
  123. trie->highStart=0x110000;
  124. trie->newTrie=newTrie;
  125. #ifdef UTRIE2_DEBUG
  126. trie->name="open";
  127. #endif
  128. newTrie->data=data;
  129. #ifdef UCPTRIE_DEBUG
  130. newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode);
  131. #endif
  132. newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
  133. newTrie->initialValue=initialValue;
  134. newTrie->errorValue=errorValue;
  135. newTrie->highStart=0x110000;
  136. newTrie->firstFreeBlock=0; /* no free block in the list */
  137. newTrie->isCompacted=false;
  138. /*
  139. * preallocate and reset
  140. * - ASCII
  141. * - the bad-UTF-8-data block
  142. * - the null data block
  143. */
  144. for(i=0; i<0x80; ++i) {
  145. newTrie->data[i]=initialValue;
  146. }
  147. for(; i<0xc0; ++i) {
  148. newTrie->data[i]=errorValue;
  149. }
  150. for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
  151. newTrie->data[i]=initialValue;
  152. }
  153. newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
  154. newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
  155. /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
  156. for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
  157. newTrie->index2[i]=j;
  158. newTrie->map[i]=1;
  159. }
  160. /* reference counts for the bad-UTF-8-data block */
  161. for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
  162. newTrie->map[i]=0;
  163. }
  164. /*
  165. * Reference counts for the null data block: all blocks except for the ASCII blocks.
  166. * Plus 1 so that we don't drop this block during compaction.
  167. * Plus as many as needed for lead surrogate code points.
  168. */
  169. /* i==newTrie->dataNullOffset */
  170. newTrie->map[i++]=
  171. (0x110000>>UTRIE2_SHIFT_2)-
  172. (0x80>>UTRIE2_SHIFT_2)+
  173. 1+
  174. UTRIE2_LSCP_INDEX_2_LENGTH;
  175. j+=UTRIE2_DATA_BLOCK_LENGTH;
  176. for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
  177. newTrie->map[i]=0;
  178. }
  179. /*
  180. * set the remaining indexes in the BMP index-2 block
  181. * to the null data block
  182. */
  183. for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
  184. newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
  185. }
  186. /*
  187. * Fill the index gap with impossible values so that compaction
  188. * does not overlap other index-2 blocks with the gap.
  189. */
  190. for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
  191. newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
  192. }
  193. /* set the indexes in the null index-2 block */
  194. for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
  195. newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
  196. }
  197. newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
  198. newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
  199. /* set the index-1 indexes for the linear index-2 block */
  200. for(i=0, j=0;
  201. i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
  202. ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
  203. ) {
  204. newTrie->index1[i]=j;
  205. }
  206. /* set the remaining index-1 indexes to the null index-2 block */
  207. for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
  208. newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
  209. }
  210. /*
  211. * Preallocate and reset data for U+0080..U+07ff,
  212. * for 2-byte UTF-8 which will be compacted in 64-blocks
  213. * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
  214. */
  215. for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
  216. utrie2_set32(trie, i, initialValue, pErrorCode);
  217. }
  218. return trie;
  219. }
  220. static UNewTrie2 *
  221. cloneBuilder(const UNewTrie2 *other) {
  222. UNewTrie2 *trie;
  223. trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
  224. if(trie==nullptr) {
  225. return nullptr;
  226. }
  227. trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
  228. if(trie->data==nullptr) {
  229. uprv_free(trie);
  230. return nullptr;
  231. }
  232. #ifdef UCPTRIE_DEBUG
  233. if(other->t3==nullptr) {
  234. trie->t3=nullptr;
  235. } else {
  236. UErrorCode errorCode=U_ZERO_ERROR;
  237. trie->t3=umutablecptrie_clone(other->t3, &errorCode);
  238. }
  239. #endif
  240. trie->dataCapacity=other->dataCapacity;
  241. /* clone data */
  242. uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
  243. uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
  244. trie->index2NullOffset=other->index2NullOffset;
  245. trie->index2Length=other->index2Length;
  246. uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
  247. trie->dataNullOffset=other->dataNullOffset;
  248. trie->dataLength=other->dataLength;
  249. /* reference counters */
  250. if(other->isCompacted) {
  251. trie->firstFreeBlock=0;
  252. } else {
  253. uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
  254. trie->firstFreeBlock=other->firstFreeBlock;
  255. }
  256. trie->initialValue=other->initialValue;
  257. trie->errorValue=other->errorValue;
  258. trie->highStart=other->highStart;
  259. trie->isCompacted=other->isCompacted;
  260. return trie;
  261. }
  262. U_CAPI UTrie2 * U_EXPORT2
  263. utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
  264. UTrie2 *trie;
  265. if(U_FAILURE(*pErrorCode)) {
  266. return nullptr;
  267. }
  268. if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
  269. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  270. return nullptr;
  271. }
  272. trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
  273. if(trie==nullptr) {
  274. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  275. return nullptr;
  276. }
  277. uprv_memcpy(trie, other, sizeof(UTrie2));
  278. if(other->memory!=nullptr) {
  279. trie->memory=uprv_malloc(other->length);
  280. if(trie->memory!=nullptr) {
  281. trie->isMemoryOwned=true;
  282. uprv_memcpy(trie->memory, other->memory, other->length);
  283. /* make the clone's pointers point to its own memory */
  284. trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
  285. if(other->data16!=nullptr) {
  286. trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
  287. }
  288. if(other->data32!=nullptr) {
  289. trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
  290. }
  291. }
  292. } else /* other->newTrie!=nullptr */ {
  293. trie->newTrie=cloneBuilder(other->newTrie);
  294. }
  295. if(trie->memory==nullptr && trie->newTrie==nullptr) {
  296. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  297. uprv_free(trie);
  298. trie=nullptr;
  299. }
  300. return trie;
  301. }
  302. typedef struct NewTrieAndStatus {
  303. UTrie2 *trie;
  304. UErrorCode errorCode;
  305. UBool exclusiveLimit; /* rather than inclusive range end */
  306. } NewTrieAndStatus;
  307. static UBool U_CALLCONV
  308. copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  309. NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
  310. if(value!=nt->trie->initialValue) {
  311. if(nt->exclusiveLimit) {
  312. --end;
  313. }
  314. if(start==end) {
  315. utrie2_set32(nt->trie, start, value, &nt->errorCode);
  316. } else {
  317. utrie2_setRange32(nt->trie, start, end, value, true, &nt->errorCode);
  318. }
  319. return U_SUCCESS(nt->errorCode);
  320. } else {
  321. return true;
  322. }
  323. }
  324. #ifdef UTRIE2_DEBUG
  325. static long countInitial(const UTrie2 *trie) {
  326. uint32_t initialValue=trie->initialValue;
  327. int32_t length=trie->dataLength;
  328. long count=0;
  329. if(trie->data16!=nullptr) {
  330. for(int32_t i=0; i<length; ++i) {
  331. if(trie->data16[i]==initialValue) { ++count; }
  332. }
  333. } else {
  334. for(int32_t i=0; i<length; ++i) {
  335. if(trie->data32[i]==initialValue) { ++count; }
  336. }
  337. }
  338. return count;
  339. }
  340. static void
  341. utrie_printLengths(const UTrie *trie) {
  342. long indexLength=trie->indexLength;
  343. long dataLength=(long)trie->dataLength;
  344. long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
  345. printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n",
  346. indexLength, dataLength, totalLength);
  347. }
  348. static void
  349. utrie2_printLengths(const UTrie2 *trie, const char *which) {
  350. long indexLength=trie->indexLength;
  351. long dataLength=(long)trie->dataLength;
  352. long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
  353. printf("**UTrie2Lengths(%s %s)** index:%6ld data:%6ld countInitial:%6ld serialized:%6ld\n",
  354. which, trie->name, indexLength, dataLength, countInitial(trie), totalLength);
  355. }
  356. #endif
  357. U_CAPI UTrie2 * U_EXPORT2
  358. utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
  359. NewTrieAndStatus context;
  360. char16_t lead;
  361. if(U_FAILURE(*pErrorCode)) {
  362. return nullptr;
  363. }
  364. if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
  365. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  366. return nullptr;
  367. }
  368. if(other->newTrie!=nullptr && !other->newTrie->isCompacted) {
  369. return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */
  370. }
  371. /* Clone the frozen trie by enumerating it and building a new one. */
  372. context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
  373. if(U_FAILURE(*pErrorCode)) {
  374. return nullptr;
  375. }
  376. context.exclusiveLimit=false;
  377. context.errorCode=*pErrorCode;
  378. utrie2_enum(other, nullptr, copyEnumRange, &context);
  379. *pErrorCode=context.errorCode;
  380. for(lead=0xd800; lead<0xdc00; ++lead) {
  381. uint32_t value;
  382. if(other->data32==nullptr) {
  383. value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
  384. } else {
  385. value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
  386. }
  387. if(value!=other->initialValue) {
  388. utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
  389. }
  390. }
  391. if(U_FAILURE(*pErrorCode)) {
  392. utrie2_close(context.trie);
  393. context.trie=nullptr;
  394. }
  395. return context.trie;
  396. }
  397. /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
  398. U_CAPI UTrie2 * U_EXPORT2
  399. utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
  400. NewTrieAndStatus context;
  401. char16_t lead;
  402. if(U_FAILURE(*pErrorCode)) {
  403. return nullptr;
  404. }
  405. if(trie1==nullptr) {
  406. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  407. return nullptr;
  408. }
  409. context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
  410. if(U_FAILURE(*pErrorCode)) {
  411. return nullptr;
  412. }
  413. context.exclusiveLimit=true;
  414. context.errorCode=*pErrorCode;
  415. utrie_enum(trie1, nullptr, copyEnumRange, &context);
  416. *pErrorCode=context.errorCode;
  417. for(lead=0xd800; lead<0xdc00; ++lead) {
  418. uint32_t value;
  419. if(trie1->data32==nullptr) {
  420. value=UTRIE_GET16_FROM_LEAD(trie1, lead);
  421. } else {
  422. value=UTRIE_GET32_FROM_LEAD(trie1, lead);
  423. }
  424. if(value!=trie1->initialValue) {
  425. utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
  426. }
  427. }
  428. if(U_SUCCESS(*pErrorCode)) {
  429. utrie2_freeze(context.trie,
  430. trie1->data32!=nullptr ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
  431. pErrorCode);
  432. }
  433. #ifdef UTRIE2_DEBUG
  434. if(U_SUCCESS(*pErrorCode)) {
  435. utrie_printLengths(trie1);
  436. utrie2_printLengths(context.trie, "fromUTrie");
  437. }
  438. #endif
  439. if(U_FAILURE(*pErrorCode)) {
  440. utrie2_close(context.trie);
  441. context.trie=nullptr;
  442. }
  443. return context.trie;
  444. }
  445. static inline UBool
  446. isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
  447. int32_t i2, block;
  448. if(U_IS_LEAD(c) && forLSCP) {
  449. i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
  450. (c>>UTRIE2_SHIFT_2);
  451. } else {
  452. i2=trie->index1[c>>UTRIE2_SHIFT_1]+
  453. ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
  454. }
  455. block=trie->index2[i2];
  456. return (UBool)(block==trie->dataNullOffset);
  457. }
  458. static int32_t
  459. allocIndex2Block(UNewTrie2 *trie) {
  460. int32_t newBlock, newTop;
  461. newBlock=trie->index2Length;
  462. newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
  463. if(newTop>UPRV_LENGTHOF(trie->index2)) {
  464. /*
  465. * Should never occur.
  466. * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
  467. * or the code writes more values than should be possible.
  468. */
  469. return -1;
  470. }
  471. trie->index2Length=newTop;
  472. uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
  473. return newBlock;
  474. }
  475. static int32_t
  476. getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
  477. int32_t i1, i2;
  478. if(U_IS_LEAD(c) && forLSCP) {
  479. return UTRIE2_LSCP_INDEX_2_OFFSET;
  480. }
  481. i1=c>>UTRIE2_SHIFT_1;
  482. i2=trie->index1[i1];
  483. if(i2==trie->index2NullOffset) {
  484. i2=allocIndex2Block(trie);
  485. if(i2<0) {
  486. return -1; /* program error */
  487. }
  488. trie->index1[i1]=i2;
  489. }
  490. return i2;
  491. }
  492. static int32_t
  493. allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
  494. int32_t newBlock, newTop;
  495. if(trie->firstFreeBlock!=0) {
  496. /* get the first free block */
  497. newBlock=trie->firstFreeBlock;
  498. trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
  499. } else {
  500. /* get a new block from the high end */
  501. newBlock=trie->dataLength;
  502. newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
  503. if(newTop>trie->dataCapacity) {
  504. /* out of memory in the data array */
  505. int32_t capacity;
  506. uint32_t *data;
  507. if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
  508. capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
  509. } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
  510. capacity=UNEWTRIE2_MAX_DATA_LENGTH;
  511. } else {
  512. /*
  513. * Should never occur.
  514. * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
  515. * or the code writes more values than should be possible.
  516. */
  517. return -1;
  518. }
  519. data=(uint32_t *)uprv_malloc(capacity*4);
  520. if(data==nullptr) {
  521. return -1;
  522. }
  523. uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
  524. uprv_free(trie->data);
  525. trie->data=data;
  526. trie->dataCapacity=capacity;
  527. }
  528. trie->dataLength=newTop;
  529. }
  530. uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
  531. trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
  532. return newBlock;
  533. }
  534. /* call when the block's reference counter reaches 0 */
  535. static void
  536. releaseDataBlock(UNewTrie2 *trie, int32_t block) {
  537. /* put this block at the front of the free-block chain */
  538. trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
  539. trie->firstFreeBlock=block;
  540. }
  541. static inline UBool
  542. isWritableBlock(UNewTrie2 *trie, int32_t block) {
  543. return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
  544. }
  545. static inline void
  546. setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
  547. int32_t oldBlock;
  548. ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
  549. oldBlock=trie->index2[i2];
  550. if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
  551. releaseDataBlock(trie, oldBlock);
  552. }
  553. trie->index2[i2]=block;
  554. }
  555. /**
  556. * No error checking for illegal arguments.
  557. *
  558. * @return -1 if no new data block available (out of memory in data array)
  559. * @internal
  560. */
  561. static int32_t
  562. getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
  563. int32_t i2, oldBlock, newBlock;
  564. i2=getIndex2Block(trie, c, forLSCP);
  565. if(i2<0) {
  566. return -1; /* program error */
  567. }
  568. i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
  569. oldBlock=trie->index2[i2];
  570. if(isWritableBlock(trie, oldBlock)) {
  571. return oldBlock;
  572. }
  573. /* allocate a new data block */
  574. newBlock=allocDataBlock(trie, oldBlock);
  575. if(newBlock<0) {
  576. /* out of memory in the data array */
  577. return -1;
  578. }
  579. setIndex2Entry(trie, i2, newBlock);
  580. return newBlock;
  581. }
  582. /**
  583. * @return true if the value was successfully set
  584. */
  585. static void
  586. set32(UNewTrie2 *trie,
  587. UChar32 c, UBool forLSCP, uint32_t value,
  588. UErrorCode *pErrorCode) {
  589. int32_t block;
  590. if(trie==nullptr || trie->isCompacted) {
  591. *pErrorCode=U_NO_WRITE_PERMISSION;
  592. return;
  593. }
  594. #ifdef UCPTRIE_DEBUG
  595. umutablecptrie_set(trie->t3, c, value, pErrorCode);
  596. #endif
  597. block=getDataBlock(trie, c, forLSCP);
  598. if(block<0) {
  599. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  600. return;
  601. }
  602. trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
  603. }
  604. U_CAPI void U_EXPORT2
  605. utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
  606. if(U_FAILURE(*pErrorCode)) {
  607. return;
  608. }
  609. if((uint32_t)c>0x10ffff) {
  610. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  611. return;
  612. }
  613. set32(trie->newTrie, c, true, value, pErrorCode);
  614. }
  615. U_CAPI void U_EXPORT2
  616. utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
  617. UChar32 c, uint32_t value,
  618. UErrorCode *pErrorCode) {
  619. if(U_FAILURE(*pErrorCode)) {
  620. return;
  621. }
  622. if(!U_IS_LEAD(c)) {
  623. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  624. return;
  625. }
  626. set32(trie->newTrie, c, false, value, pErrorCode);
  627. }
  628. static void
  629. writeBlock(uint32_t *block, uint32_t value) {
  630. uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
  631. while(block<limit) {
  632. *block++=value;
  633. }
  634. }
  635. /**
  636. * initialValue is ignored if overwrite=true
  637. * @internal
  638. */
  639. static void
  640. fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
  641. uint32_t value, uint32_t initialValue, UBool overwrite) {
  642. uint32_t *pLimit;
  643. pLimit=block+limit;
  644. block+=start;
  645. if(overwrite) {
  646. while(block<pLimit) {
  647. *block++=value;
  648. }
  649. } else {
  650. while(block<pLimit) {
  651. if(*block==initialValue) {
  652. *block=value;
  653. }
  654. ++block;
  655. }
  656. }
  657. }
  658. U_CAPI void U_EXPORT2
  659. utrie2_setRange32(UTrie2 *trie,
  660. UChar32 start, UChar32 end,
  661. uint32_t value, UBool overwrite,
  662. UErrorCode *pErrorCode) {
  663. /*
  664. * repeat value in [start..end]
  665. * mark index values for repeat-data blocks by setting bit 31 of the index values
  666. * fill around existing values if any, if(overwrite)
  667. */
  668. UNewTrie2 *newTrie;
  669. int32_t block, rest, repeatBlock;
  670. UChar32 limit;
  671. if(U_FAILURE(*pErrorCode)) {
  672. return;
  673. }
  674. if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
  675. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  676. return;
  677. }
  678. newTrie=trie->newTrie;
  679. if(newTrie==nullptr || newTrie->isCompacted) {
  680. *pErrorCode=U_NO_WRITE_PERMISSION;
  681. return;
  682. }
  683. #ifdef UCPTRIE_DEBUG
  684. umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode);
  685. #endif
  686. if(!overwrite && value==newTrie->initialValue) {
  687. return; /* nothing to do */
  688. }
  689. limit=end+1;
  690. if(start&UTRIE2_DATA_MASK) {
  691. UChar32 nextStart;
  692. /* set partial block at [start..following block boundary[ */
  693. block=getDataBlock(newTrie, start, true);
  694. if(block<0) {
  695. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  696. return;
  697. }
  698. nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK;
  699. if(nextStart<=limit) {
  700. fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
  701. value, newTrie->initialValue, overwrite);
  702. start=nextStart;
  703. } else {
  704. fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
  705. value, newTrie->initialValue, overwrite);
  706. return;
  707. }
  708. }
  709. /* number of positions in the last, partial block */
  710. rest=limit&UTRIE2_DATA_MASK;
  711. /* round down limit to a block boundary */
  712. limit&=~UTRIE2_DATA_MASK;
  713. /* iterate over all-value blocks */
  714. if(value==newTrie->initialValue) {
  715. repeatBlock=newTrie->dataNullOffset;
  716. } else {
  717. repeatBlock=-1;
  718. }
  719. while(start<limit) {
  720. int32_t i2;
  721. UBool setRepeatBlock=false;
  722. if(value==newTrie->initialValue && isInNullBlock(newTrie, start, true)) {
  723. start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
  724. continue;
  725. }
  726. /* get index value */
  727. i2=getIndex2Block(newTrie, start, true);
  728. if(i2<0) {
  729. *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
  730. return;
  731. }
  732. i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
  733. block=newTrie->index2[i2];
  734. if(isWritableBlock(newTrie, block)) {
  735. /* already allocated */
  736. if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
  737. /*
  738. * We overwrite all values, and it's not a
  739. * protected (ASCII-linear or 2-byte UTF-8) block:
  740. * replace with the repeatBlock.
  741. */
  742. setRepeatBlock=true;
  743. } else {
  744. /* !overwrite, or protected block: just write the values into this block */
  745. fillBlock(newTrie->data+block,
  746. 0, UTRIE2_DATA_BLOCK_LENGTH,
  747. value, newTrie->initialValue, overwrite);
  748. }
  749. } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
  750. /*
  751. * Set the repeatBlock instead of the null block or previous repeat block:
  752. *
  753. * If !isWritableBlock() then all entries in the block have the same value
  754. * because it's the null block or a range block (the repeatBlock from a previous
  755. * call to utrie2_setRange32()).
  756. * No other blocks are used multiple times before compacting.
  757. *
  758. * The null block is the only non-writable block with the initialValue because
  759. * of the repeatBlock initialization above. (If value==initialValue, then
  760. * the repeatBlock will be the null data block.)
  761. *
  762. * We set our repeatBlock if the desired value differs from the block's value,
  763. * and if we overwrite any data or if the data is all initial values
  764. * (which is the same as the block being the null block, see above).
  765. */
  766. setRepeatBlock=true;
  767. }
  768. if(setRepeatBlock) {
  769. if(repeatBlock>=0) {
  770. setIndex2Entry(newTrie, i2, repeatBlock);
  771. } else {
  772. /* create and set and fill the repeatBlock */
  773. repeatBlock=getDataBlock(newTrie, start, true);
  774. if(repeatBlock<0) {
  775. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  776. return;
  777. }
  778. writeBlock(newTrie->data+repeatBlock, value);
  779. }
  780. }
  781. start+=UTRIE2_DATA_BLOCK_LENGTH;
  782. }
  783. if(rest>0) {
  784. /* set partial block at [last block boundary..limit[ */
  785. block=getDataBlock(newTrie, start, true);
  786. if(block<0) {
  787. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  788. return;
  789. }
  790. fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
  791. }
  792. return;
  793. }
  794. /* compaction --------------------------------------------------------------- */
  795. static inline UBool
  796. equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
  797. while(length>0 && *s==*t) {
  798. ++s;
  799. ++t;
  800. --length;
  801. }
  802. return (UBool)(length==0);
  803. }
  804. static inline UBool
  805. equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
  806. while(length>0 && *s==*t) {
  807. ++s;
  808. ++t;
  809. --length;
  810. }
  811. return (UBool)(length==0);
  812. }
  813. static int32_t
  814. findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
  815. int32_t block;
  816. /* ensure that we do not even partially get past index2Length */
  817. index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
  818. for(block=0; block<=index2Length; ++block) {
  819. if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
  820. return block;
  821. }
  822. }
  823. return -1;
  824. }
  825. static int32_t
  826. findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
  827. int32_t block;
  828. /* ensure that we do not even partially get past dataLength */
  829. dataLength-=blockLength;
  830. for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
  831. if(equal_uint32(data+block, data+otherBlock, blockLength)) {
  832. return block;
  833. }
  834. }
  835. return -1;
  836. }
  837. /*
  838. * Find the start of the last range in the trie by enumerating backward.
  839. * Indexes for supplementary code points higher than this will be omitted.
  840. */
  841. static UChar32
  842. findHighStart(UNewTrie2 *trie, uint32_t highValue) {
  843. const uint32_t *data32;
  844. uint32_t value, initialValue;
  845. UChar32 c, prev;
  846. int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
  847. data32=trie->data;
  848. initialValue=trie->initialValue;
  849. index2NullOffset=trie->index2NullOffset;
  850. nullBlock=trie->dataNullOffset;
  851. /* set variables for previous range */
  852. if(highValue==initialValue) {
  853. prevI2Block=index2NullOffset;
  854. prevBlock=nullBlock;
  855. } else {
  856. prevI2Block=-1;
  857. prevBlock=-1;
  858. }
  859. prev=0x110000;
  860. /* enumerate index-2 blocks */
  861. i1=UNEWTRIE2_INDEX_1_LENGTH;
  862. c=prev;
  863. while(c>0) {
  864. i2Block=trie->index1[--i1];
  865. if(i2Block==prevI2Block) {
  866. /* the index-2 block is the same as the previous one, and filled with highValue */
  867. c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
  868. continue;
  869. }
  870. prevI2Block=i2Block;
  871. if(i2Block==index2NullOffset) {
  872. /* this is the null index-2 block */
  873. if(highValue!=initialValue) {
  874. return c;
  875. }
  876. c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
  877. } else {
  878. /* enumerate data blocks for one index-2 block */
  879. for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
  880. block=trie->index2[i2Block+ --i2];
  881. if(block==prevBlock) {
  882. /* the block is the same as the previous one, and filled with highValue */
  883. c-=UTRIE2_DATA_BLOCK_LENGTH;
  884. continue;
  885. }
  886. prevBlock=block;
  887. if(block==nullBlock) {
  888. /* this is the null data block */
  889. if(highValue!=initialValue) {
  890. return c;
  891. }
  892. c-=UTRIE2_DATA_BLOCK_LENGTH;
  893. } else {
  894. for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
  895. value=data32[block+ --j];
  896. if(value!=highValue) {
  897. return c;
  898. }
  899. --c;
  900. }
  901. }
  902. }
  903. }
  904. }
  905. /* deliver last range */
  906. return 0;
  907. }
  908. /*
  909. * Compact a build-time trie.
  910. *
  911. * The compaction
  912. * - removes blocks that are identical with earlier ones
  913. * - overlaps adjacent blocks as much as possible (if overlap==true)
  914. * - moves blocks in steps of the data granularity
  915. * - moves and overlaps blocks that overlap with multiple values in the overlap region
  916. *
  917. * It does not
  918. * - try to move and overlap blocks that are not already adjacent
  919. */
  920. static void
  921. compactData(UNewTrie2 *trie) {
  922. #ifdef UTRIE2_DEBUG
  923. int32_t countSame=0, sumOverlaps=0;
  924. #endif
  925. int32_t start, newStart, movedStart;
  926. int32_t blockLength, overlap;
  927. int32_t i, mapIndex, blockCount;
  928. /* do not compact linear-ASCII data */
  929. newStart=UTRIE2_DATA_START_OFFSET;
  930. for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
  931. trie->map[i]=start;
  932. }
  933. /*
  934. * Start with a block length of 64 for 2-byte UTF-8,
  935. * then switch to UTRIE2_DATA_BLOCK_LENGTH.
  936. */
  937. blockLength=64;
  938. blockCount=blockLength>>UTRIE2_SHIFT_2;
  939. for(start=newStart; start<trie->dataLength;) {
  940. /*
  941. * start: index of first entry of current block
  942. * newStart: index where the current block is to be moved
  943. * (right after current end of already-compacted data)
  944. */
  945. if(start==UNEWTRIE2_DATA_0800_OFFSET) {
  946. blockLength=UTRIE2_DATA_BLOCK_LENGTH;
  947. blockCount=1;
  948. }
  949. /* skip blocks that are not used */
  950. if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
  951. /* advance start to the next block */
  952. start+=blockLength;
  953. /* leave newStart with the previous block! */
  954. continue;
  955. }
  956. /* search for an identical block */
  957. if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
  958. >=0
  959. ) {
  960. #ifdef UTRIE2_DEBUG
  961. ++countSame;
  962. #endif
  963. /* found an identical block, set the other block's index value for the current block */
  964. for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  965. trie->map[mapIndex++]=movedStart;
  966. movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
  967. }
  968. /* advance start to the next block */
  969. start+=blockLength;
  970. /* leave newStart with the previous block! */
  971. continue;
  972. }
  973. /* see if the beginning of this block can be overlapped with the end of the previous block */
  974. /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
  975. for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
  976. overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
  977. overlap-=UTRIE2_DATA_GRANULARITY) {}
  978. #ifdef UTRIE2_DEBUG
  979. sumOverlaps+=overlap;
  980. #endif
  981. if(overlap>0 || newStart<start) {
  982. /* some overlap, or just move the whole block */
  983. movedStart=newStart-overlap;
  984. for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  985. trie->map[mapIndex++]=movedStart;
  986. movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
  987. }
  988. /* move the non-overlapping indexes to their new positions */
  989. start+=overlap;
  990. for(i=blockLength-overlap; i>0; --i) {
  991. trie->data[newStart++]=trie->data[start++];
  992. }
  993. } else /* no overlap && newStart==start */ {
  994. for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
  995. trie->map[mapIndex++]=start;
  996. start+=UTRIE2_DATA_BLOCK_LENGTH;
  997. }
  998. newStart=start;
  999. }
  1000. }
  1001. /* now adjust the index-2 table */
  1002. for(i=0; i<trie->index2Length; ++i) {
  1003. if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
  1004. /* Gap indexes are invalid (-1). Skip over the gap. */
  1005. i+=UNEWTRIE2_INDEX_GAP_LENGTH;
  1006. }
  1007. trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
  1008. }
  1009. trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
  1010. /* ensure dataLength alignment */
  1011. while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
  1012. trie->data[newStart++]=trie->initialValue;
  1013. }
  1014. #ifdef UTRIE2_DEBUG
  1015. /* we saved some space */
  1016. printf("compacting UTrie2: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
  1017. (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps);
  1018. #endif
  1019. trie->dataLength=newStart;
  1020. }
  1021. static void
  1022. compactIndex2(UNewTrie2 *trie) {
  1023. int32_t i, start, newStart, movedStart, overlap;
  1024. /* do not compact linear-BMP index-2 blocks */
  1025. newStart=UTRIE2_INDEX_2_BMP_LENGTH;
  1026. for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
  1027. trie->map[i]=start;
  1028. }
  1029. /* Reduce the index table gap to what will be needed at runtime. */
  1030. newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
  1031. for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
  1032. /*
  1033. * start: index of first entry of current block
  1034. * newStart: index where the current block is to be moved
  1035. * (right after current end of already-compacted data)
  1036. */
  1037. /* search for an identical block */
  1038. if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
  1039. >=0
  1040. ) {
  1041. /* found an identical block, set the other block's index value for the current block */
  1042. trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
  1043. /* advance start to the next block */
  1044. start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
  1045. /* leave newStart with the previous block! */
  1046. continue;
  1047. }
  1048. /* see if the beginning of this block can be overlapped with the end of the previous block */
  1049. /* look for maximum overlap with the previous, adjacent block */
  1050. for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
  1051. overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
  1052. --overlap) {}
  1053. if(overlap>0 || newStart<start) {
  1054. /* some overlap, or just move the whole block */
  1055. trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
  1056. /* move the non-overlapping indexes to their new positions */
  1057. start+=overlap;
  1058. for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
  1059. trie->index2[newStart++]=trie->index2[start++];
  1060. }
  1061. } else /* no overlap && newStart==start */ {
  1062. trie->map[start>>UTRIE2_SHIFT_1_2]=start;
  1063. start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
  1064. newStart=start;
  1065. }
  1066. }
  1067. /* now adjust the index-1 table */
  1068. for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
  1069. trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
  1070. }
  1071. trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
  1072. /*
  1073. * Ensure data table alignment:
  1074. * Needs to be granularity-aligned for 16-bit trie
  1075. * (so that dataMove will be down-shiftable),
  1076. * and 2-aligned for uint32_t data.
  1077. */
  1078. while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
  1079. /* Arbitrary value: 0x3fffc not possible for real data. */
  1080. trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
  1081. }
  1082. #ifdef UTRIE2_DEBUG
  1083. /* we saved some space */
  1084. printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n",
  1085. (long)trie->index2Length, (long)newStart);
  1086. #endif
  1087. trie->index2Length=newStart;
  1088. }
  1089. static void
  1090. compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
  1091. UNewTrie2 *newTrie;
  1092. UChar32 highStart, suppHighStart;
  1093. uint32_t highValue;
  1094. newTrie=trie->newTrie;
  1095. /* find highStart and round it up */
  1096. highValue=utrie2_get32(trie, 0x10ffff);
  1097. highStart=findHighStart(newTrie, highValue);
  1098. highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
  1099. if(highStart==0x110000) {
  1100. highValue=trie->errorValue;
  1101. }
  1102. /*
  1103. * Set trie->highStart only after utrie2_get32(trie, highStart).
  1104. * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
  1105. */
  1106. trie->highStart=newTrie->highStart=highStart;
  1107. #ifdef UTRIE2_DEBUG
  1108. printf("UTrie2: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
  1109. (long)highStart, (long)highValue, (long)trie->initialValue);
  1110. #endif
  1111. if(highStart<0x110000) {
  1112. /* Blank out [highStart..10ffff] to release associated data blocks. */
  1113. suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
  1114. utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, true, pErrorCode);
  1115. if(U_FAILURE(*pErrorCode)) {
  1116. return;
  1117. }
  1118. }
  1119. compactData(newTrie);
  1120. if(highStart>0x10000) {
  1121. compactIndex2(newTrie);
  1122. #ifdef UTRIE2_DEBUG
  1123. } else {
  1124. printf("UTrie2: highStart U+%04lx count of 16-bit index words %lu->%lu\n",
  1125. (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
  1126. #endif
  1127. }
  1128. /*
  1129. * Store the highValue in the data array and round up the dataLength.
  1130. * Must be done after compactData() because that assumes that dataLength
  1131. * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
  1132. */
  1133. newTrie->data[newTrie->dataLength++]=highValue;
  1134. while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
  1135. newTrie->data[newTrie->dataLength++]=trie->initialValue;
  1136. }
  1137. newTrie->isCompacted=true;
  1138. }
  1139. /* serialization ------------------------------------------------------------ */
  1140. /**
  1141. * Maximum length of the runtime index array.
  1142. * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
  1143. * (The actual maximum length is lower,
  1144. * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
  1145. */
  1146. #define UTRIE2_MAX_INDEX_LENGTH 0xffff
  1147. /**
  1148. * Maximum length of the runtime data array.
  1149. * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
  1150. * and by uint16_t UTrie2Header.shiftedDataLength.
  1151. */
  1152. #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
  1153. /* Compact and internally serialize the trie. */
  1154. U_CAPI void U_EXPORT2
  1155. utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
  1156. UNewTrie2 *newTrie;
  1157. UTrie2Header *header;
  1158. uint32_t *p;
  1159. uint16_t *dest16;
  1160. int32_t i, length;
  1161. int32_t allIndexesLength;
  1162. int32_t dataMove; /* >0 if the data is moved to the end of the index array */
  1163. UChar32 highStart;
  1164. /* argument check */
  1165. if(U_FAILURE(*pErrorCode)) {
  1166. return;
  1167. }
  1168. if( trie==nullptr ||
  1169. valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
  1170. ) {
  1171. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1172. return;
  1173. }
  1174. newTrie=trie->newTrie;
  1175. if(newTrie==nullptr) {
  1176. /* already frozen */
  1177. UTrie2ValueBits frozenValueBits=
  1178. trie->data16!=nullptr ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
  1179. if(valueBits!=frozenValueBits) {
  1180. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1181. }
  1182. return;
  1183. }
  1184. /* compact if necessary */
  1185. if(!newTrie->isCompacted) {
  1186. compactTrie(trie, pErrorCode);
  1187. if(U_FAILURE(*pErrorCode)) {
  1188. return;
  1189. }
  1190. }
  1191. highStart=trie->highStart;
  1192. if(highStart<=0x10000) {
  1193. allIndexesLength=UTRIE2_INDEX_1_OFFSET;
  1194. } else {
  1195. allIndexesLength=newTrie->index2Length;
  1196. }
  1197. if(valueBits==UTRIE2_16_VALUE_BITS) {
  1198. dataMove=allIndexesLength;
  1199. } else {
  1200. dataMove=0;
  1201. }
  1202. /* are indexLength and dataLength within limits? */
  1203. if( /* for unshifted indexLength */
  1204. allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
  1205. /* for unshifted dataNullOffset */
  1206. (dataMove+newTrie->dataNullOffset)>0xffff ||
  1207. /* for unshifted 2-byte UTF-8 index-2 values */
  1208. (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
  1209. /* for shiftedDataLength */
  1210. (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
  1211. ) {
  1212. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1213. return;
  1214. }
  1215. /* calculate the total serialized length */
  1216. length=sizeof(UTrie2Header)+allIndexesLength*2;
  1217. if(valueBits==UTRIE2_16_VALUE_BITS) {
  1218. length+=newTrie->dataLength*2;
  1219. } else {
  1220. length+=newTrie->dataLength*4;
  1221. }
  1222. trie->memory=uprv_malloc(length);
  1223. if(trie->memory==nullptr) {
  1224. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  1225. return;
  1226. }
  1227. trie->length=length;
  1228. trie->isMemoryOwned=true;
  1229. trie->indexLength=allIndexesLength;
  1230. trie->dataLength=newTrie->dataLength;
  1231. if(highStart<=0x10000) {
  1232. trie->index2NullOffset=0xffff;
  1233. } else {
  1234. trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset);
  1235. }
  1236. trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
  1237. trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
  1238. /* set the header fields */
  1239. header=(UTrie2Header *)trie->memory;
  1240. header->signature=UTRIE2_SIG; /* "Tri2" */
  1241. header->options=(uint16_t)valueBits;
  1242. header->indexLength=(uint16_t)trie->indexLength;
  1243. header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
  1244. header->index2NullOffset=trie->index2NullOffset;
  1245. header->dataNullOffset=trie->dataNullOffset;
  1246. header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
  1247. /* fill the index and data arrays */
  1248. dest16=(uint16_t *)(header+1);
  1249. trie->index=dest16;
  1250. /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
  1251. p=(uint32_t *)newTrie->index2;
  1252. for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
  1253. *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
  1254. }
  1255. /* write UTF-8 2-byte index-2 values, not right-shifted */
  1256. for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
  1257. *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
  1258. }
  1259. for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
  1260. *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
  1261. }
  1262. if(highStart>0x10000) {
  1263. int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
  1264. int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
  1265. /* write 16-bit index-1 values for supplementary code points */
  1266. p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
  1267. for(i=index1Length; i>0; --i) {
  1268. *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
  1269. }
  1270. /*
  1271. * write the index-2 array values for supplementary code points,
  1272. * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
  1273. */
  1274. p=(uint32_t *)newTrie->index2+index2Offset;
  1275. for(i=newTrie->index2Length-index2Offset; i>0; --i) {
  1276. *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
  1277. }
  1278. }
  1279. /* write the 16/32-bit data array */
  1280. switch(valueBits) {
  1281. case UTRIE2_16_VALUE_BITS:
  1282. /* write 16-bit data values */
  1283. trie->data16=dest16;
  1284. trie->data32=nullptr;
  1285. p=newTrie->data;
  1286. for(i=newTrie->dataLength; i>0; --i) {
  1287. *dest16++=(uint16_t)*p++;
  1288. }
  1289. break;
  1290. case UTRIE2_32_VALUE_BITS:
  1291. /* write 32-bit data values */
  1292. trie->data16=nullptr;
  1293. trie->data32=(uint32_t *)dest16;
  1294. uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
  1295. break;
  1296. default:
  1297. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1298. return;
  1299. }
  1300. #ifdef UTRIE2_DEBUG
  1301. utrie2_printLengths(trie, "");
  1302. #endif
  1303. #ifdef UCPTRIE_DEBUG
  1304. umutablecptrie_setName(newTrie->t3, trie->name);
  1305. ucptrie_close(
  1306. umutablecptrie_buildImmutable(
  1307. newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode));
  1308. #endif
  1309. /* Delete the UNewTrie2. */
  1310. uprv_free(newTrie->data);
  1311. uprv_free(newTrie);
  1312. trie->newTrie=nullptr;
  1313. }