umutablecptrie.cpp 64 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852
  1. // © 2017 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // umutablecptrie.cpp (inspired by utrie2_builder.cpp)
  4. // created: 2017dec29 Markus W. Scherer
  5. // #define UCPTRIE_DEBUG
  6. #ifdef UCPTRIE_DEBUG
  7. # include <stdio.h>
  8. #endif
  9. #include "unicode/utypes.h"
  10. #include "unicode/ucptrie.h"
  11. #include "unicode/umutablecptrie.h"
  12. #include "unicode/uobject.h"
  13. #include "unicode/utf16.h"
  14. #include "cmemory.h"
  15. #include "uassert.h"
  16. #include "ucptrie_impl.h"
  17. // ICU-20235 In case Microsoft math.h has defined this, undefine it.
  18. #ifdef OVERFLOW
  19. #undef OVERFLOW
  20. #endif
  21. U_NAMESPACE_BEGIN
  22. namespace {
  23. constexpr int32_t MAX_UNICODE = 0x10ffff;
  24. constexpr int32_t UNICODE_LIMIT = 0x110000;
  25. constexpr int32_t BMP_LIMIT = 0x10000;
  26. constexpr int32_t ASCII_LIMIT = 0x80;
  27. constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
  28. constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
  29. constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;
  30. constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));
  31. // Flag values for data blocks.
  32. constexpr uint8_t ALL_SAME = 0;
  33. constexpr uint8_t MIXED = 1;
  34. constexpr uint8_t SAME_AS = 2;
  35. /** Start with allocation of 16k data entries. */
  36. constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);
  37. /** Grow about 8x each time. */
  38. constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);
  39. /**
  40. * Maximum length of the build-time data array.
  41. * One entry per 0x110000 code points.
  42. */
  43. constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;
  44. // Flag values for index-3 blocks while compacting/building.
  45. constexpr uint8_t I3_NULL = 0;
  46. constexpr uint8_t I3_BMP = 1;
  47. constexpr uint8_t I3_16 = 2;
  48. constexpr uint8_t I3_18 = 3;
  49. constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
  50. class AllSameBlocks;
  51. class MixedBlocks;
  52. class MutableCodePointTrie : public UMemory {
  53. public:
  54. MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
  55. MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
  56. MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
  57. ~MutableCodePointTrie();
  58. MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
  59. static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
  60. static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
  61. uint32_t get(UChar32 c) const;
  62. int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
  63. uint32_t *pValue) const;
  64. void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
  65. void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);
  66. UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);
  67. private:
  68. void clear();
  69. bool ensureHighStart(UChar32 c);
  70. int32_t allocDataBlock(int32_t blockLength);
  71. int32_t getDataBlock(int32_t i);
  72. void maskValues(uint32_t mask);
  73. UChar32 findHighStart() const;
  74. int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
  75. int32_t compactData(
  76. int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
  77. int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
  78. int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
  79. int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
  80. uint32_t *index = nullptr;
  81. int32_t indexCapacity = 0;
  82. int32_t index3NullOffset = -1;
  83. uint32_t *data = nullptr;
  84. int32_t dataCapacity = 0;
  85. int32_t dataLength = 0;
  86. int32_t dataNullOffset = -1;
  87. uint32_t origInitialValue;
  88. uint32_t initialValue;
  89. uint32_t errorValue;
  90. UChar32 highStart;
  91. uint32_t highValue;
  92. #ifdef UCPTRIE_DEBUG
  93. public:
  94. const char *name;
  95. #endif
  96. private:
  97. /** Temporary array while building the final data. */
  98. uint16_t *index16 = nullptr;
  99. uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
  100. };
  101. MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
  102. origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
  103. highStart(0), highValue(initialValue)
  104. #ifdef UCPTRIE_DEBUG
  105. , name("open")
  106. #endif
  107. {
  108. if (U_FAILURE(errorCode)) { return; }
  109. index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
  110. data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
  111. if (index == nullptr || data == nullptr) {
  112. errorCode = U_MEMORY_ALLOCATION_ERROR;
  113. return;
  114. }
  115. indexCapacity = BMP_I_LIMIT;
  116. dataCapacity = INITIAL_DATA_LENGTH;
  117. }
  118. MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
  119. index3NullOffset(other.index3NullOffset),
  120. dataNullOffset(other.dataNullOffset),
  121. origInitialValue(other.origInitialValue), initialValue(other.initialValue),
  122. errorValue(other.errorValue),
  123. highStart(other.highStart), highValue(other.highValue)
  124. #ifdef UCPTRIE_DEBUG
  125. , name("mutable clone")
  126. #endif
  127. {
  128. if (U_FAILURE(errorCode)) { return; }
  129. int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
  130. index = (uint32_t *)uprv_malloc(iCapacity * 4);
  131. data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
  132. if (index == nullptr || data == nullptr) {
  133. errorCode = U_MEMORY_ALLOCATION_ERROR;
  134. return;
  135. }
  136. indexCapacity = iCapacity;
  137. dataCapacity = other.dataCapacity;
  138. int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
  139. uprv_memcpy(flags, other.flags, iLimit);
  140. uprv_memcpy(index, other.index, iLimit * 4);
  141. uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
  142. dataLength = other.dataLength;
  143. U_ASSERT(other.index16 == nullptr);
  144. }
  145. MutableCodePointTrie::~MutableCodePointTrie() {
  146. uprv_free(index);
  147. uprv_free(data);
  148. uprv_free(index16);
  149. }
  150. MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
  151. // Use the highValue as the initialValue to reduce the highStart.
  152. uint32_t errorValue = ucpmap_get(map, -1);
  153. uint32_t initialValue = ucpmap_get(map, 0x10ffff);
  154. LocalPointer<MutableCodePointTrie> mutableTrie(
  155. new MutableCodePointTrie(initialValue, errorValue, errorCode),
  156. errorCode);
  157. if (U_FAILURE(errorCode)) {
  158. return nullptr;
  159. }
  160. UChar32 start = 0, end;
  161. uint32_t value;
  162. while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
  163. nullptr, nullptr, &value)) >= 0) {
  164. if (value != initialValue) {
  165. if (start == end) {
  166. mutableTrie->set(start, value, errorCode);
  167. } else {
  168. mutableTrie->setRange(start, end, value, errorCode);
  169. }
  170. }
  171. start = end + 1;
  172. }
  173. if (U_SUCCESS(errorCode)) {
  174. return mutableTrie.orphan();
  175. } else {
  176. return nullptr;
  177. }
  178. }
  179. MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
  180. // Use the highValue as the initialValue to reduce the highStart.
  181. uint32_t errorValue;
  182. uint32_t initialValue;
  183. switch (trie->valueWidth) {
  184. case UCPTRIE_VALUE_BITS_16:
  185. errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
  186. initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
  187. break;
  188. case UCPTRIE_VALUE_BITS_32:
  189. errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
  190. initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
  191. break;
  192. case UCPTRIE_VALUE_BITS_8:
  193. errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
  194. initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
  195. break;
  196. default:
  197. // Unreachable if the trie is properly initialized.
  198. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  199. return nullptr;
  200. }
  201. LocalPointer<MutableCodePointTrie> mutableTrie(
  202. new MutableCodePointTrie(initialValue, errorValue, errorCode),
  203. errorCode);
  204. if (U_FAILURE(errorCode)) {
  205. return nullptr;
  206. }
  207. UChar32 start = 0, end;
  208. uint32_t value;
  209. while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
  210. nullptr, nullptr, &value)) >= 0) {
  211. if (value != initialValue) {
  212. if (start == end) {
  213. mutableTrie->set(start, value, errorCode);
  214. } else {
  215. mutableTrie->setRange(start, end, value, errorCode);
  216. }
  217. }
  218. start = end + 1;
  219. }
  220. if (U_SUCCESS(errorCode)) {
  221. return mutableTrie.orphan();
  222. } else {
  223. return nullptr;
  224. }
  225. }
  226. void MutableCodePointTrie::clear() {
  227. index3NullOffset = dataNullOffset = -1;
  228. dataLength = 0;
  229. highValue = initialValue = origInitialValue;
  230. highStart = 0;
  231. uprv_free(index16);
  232. index16 = nullptr;
  233. }
  234. uint32_t MutableCodePointTrie::get(UChar32 c) const {
  235. if ((uint32_t)c > MAX_UNICODE) {
  236. return errorValue;
  237. }
  238. if (c >= highStart) {
  239. return highValue;
  240. }
  241. int32_t i = c >> UCPTRIE_SHIFT_3;
  242. if (flags[i] == ALL_SAME) {
  243. return index[i];
  244. } else {
  245. return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
  246. }
  247. }
  248. inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
  249. UCPMapValueFilter *filter, const void *context) {
  250. if (value == initialValue) {
  251. value = nullValue;
  252. } else if (filter != nullptr) {
  253. value = filter(context, value);
  254. }
  255. return value;
  256. }
  257. UChar32 MutableCodePointTrie::getRange(
  258. UChar32 start, UCPMapValueFilter *filter, const void *context,
  259. uint32_t *pValue) const {
  260. if ((uint32_t)start > MAX_UNICODE) {
  261. return U_SENTINEL;
  262. }
  263. if (start >= highStart) {
  264. if (pValue != nullptr) {
  265. uint32_t value = highValue;
  266. if (filter != nullptr) { value = filter(context, value); }
  267. *pValue = value;
  268. }
  269. return MAX_UNICODE;
  270. }
  271. uint32_t nullValue = initialValue;
  272. if (filter != nullptr) { nullValue = filter(context, nullValue); }
  273. UChar32 c = start;
  274. uint32_t trieValue, value;
  275. bool haveValue = false;
  276. int32_t i = c >> UCPTRIE_SHIFT_3;
  277. do {
  278. if (flags[i] == ALL_SAME) {
  279. uint32_t trieValue2 = index[i];
  280. if (haveValue) {
  281. if (trieValue2 != trieValue) {
  282. if (filter == nullptr ||
  283. maybeFilterValue(trieValue2, initialValue, nullValue,
  284. filter, context) != value) {
  285. return c - 1;
  286. }
  287. trieValue = trieValue2; // may or may not help
  288. }
  289. } else {
  290. trieValue = trieValue2;
  291. value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
  292. if (pValue != nullptr) { *pValue = value; }
  293. haveValue = true;
  294. }
  295. c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
  296. } else /* MIXED */ {
  297. int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
  298. uint32_t trieValue2 = data[di];
  299. if (haveValue) {
  300. if (trieValue2 != trieValue) {
  301. if (filter == nullptr ||
  302. maybeFilterValue(trieValue2, initialValue, nullValue,
  303. filter, context) != value) {
  304. return c - 1;
  305. }
  306. trieValue = trieValue2; // may or may not help
  307. }
  308. } else {
  309. trieValue = trieValue2;
  310. value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
  311. if (pValue != nullptr) { *pValue = value; }
  312. haveValue = true;
  313. }
  314. while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
  315. trieValue2 = data[++di];
  316. if (trieValue2 != trieValue) {
  317. if (filter == nullptr ||
  318. maybeFilterValue(trieValue2, initialValue, nullValue,
  319. filter, context) != value) {
  320. return c - 1;
  321. }
  322. }
  323. trieValue = trieValue2; // may or may not help
  324. }
  325. }
  326. ++i;
  327. } while (c < highStart);
  328. U_ASSERT(haveValue);
  329. if (maybeFilterValue(highValue, initialValue, nullValue,
  330. filter, context) != value) {
  331. return c - 1;
  332. } else {
  333. return MAX_UNICODE;
  334. }
  335. }
  336. void
  337. writeBlock(uint32_t *block, uint32_t value) {
  338. uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  339. while (block < limit) {
  340. *block++ = value;
  341. }
  342. }
  343. bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
  344. if (c >= highStart) {
  345. // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
  346. c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
  347. int32_t i = highStart >> UCPTRIE_SHIFT_3;
  348. int32_t iLimit = c >> UCPTRIE_SHIFT_3;
  349. if (iLimit > indexCapacity) {
  350. uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
  351. if (newIndex == nullptr) { return false; }
  352. uprv_memcpy(newIndex, index, i * 4);
  353. uprv_free(index);
  354. index = newIndex;
  355. indexCapacity = I_LIMIT;
  356. }
  357. do {
  358. flags[i] = ALL_SAME;
  359. index[i] = initialValue;
  360. } while(++i < iLimit);
  361. highStart = c;
  362. }
  363. return true;
  364. }
  365. int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
  366. int32_t newBlock = dataLength;
  367. int32_t newTop = newBlock + blockLength;
  368. if (newTop > dataCapacity) {
  369. int32_t capacity;
  370. if (dataCapacity < MEDIUM_DATA_LENGTH) {
  371. capacity = MEDIUM_DATA_LENGTH;
  372. } else if (dataCapacity < MAX_DATA_LENGTH) {
  373. capacity = MAX_DATA_LENGTH;
  374. } else {
  375. // Should never occur.
  376. // Either MAX_DATA_LENGTH is incorrect,
  377. // or the code writes more values than should be possible.
  378. return -1;
  379. }
  380. uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
  381. if (newData == nullptr) {
  382. return -1;
  383. }
  384. uprv_memcpy(newData, data, (size_t)dataLength * 4);
  385. uprv_free(data);
  386. data = newData;
  387. dataCapacity = capacity;
  388. }
  389. dataLength = newTop;
  390. return newBlock;
  391. }
  392. /**
  393. * No error checking for illegal arguments.
  394. *
  395. * @return -1 if no new data block available (out of memory in data array)
  396. * @internal
  397. */
  398. int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
  399. if (flags[i] == MIXED) {
  400. return index[i];
  401. }
  402. if (i < BMP_I_LIMIT) {
  403. int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
  404. if (newBlock < 0) { return newBlock; }
  405. int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
  406. int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
  407. do {
  408. U_ASSERT(flags[iStart] == ALL_SAME);
  409. writeBlock(data + newBlock, index[iStart]);
  410. flags[iStart] = MIXED;
  411. index[iStart++] = newBlock;
  412. newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  413. } while (iStart < iLimit);
  414. return index[i];
  415. } else {
  416. int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
  417. if (newBlock < 0) { return newBlock; }
  418. writeBlock(data + newBlock, index[i]);
  419. flags[i] = MIXED;
  420. index[i] = newBlock;
  421. return newBlock;
  422. }
  423. }
  424. void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
  425. if (U_FAILURE(errorCode)) {
  426. return;
  427. }
  428. if ((uint32_t)c > MAX_UNICODE) {
  429. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  430. return;
  431. }
  432. int32_t block;
  433. if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
  434. errorCode = U_MEMORY_ALLOCATION_ERROR;
  435. return;
  436. }
  437. data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
  438. }
  439. void
  440. fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
  441. uint32_t *pLimit = block + limit;
  442. block += start;
  443. while (block < pLimit) {
  444. *block++ = value;
  445. }
  446. }
  447. void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
  448. if (U_FAILURE(errorCode)) {
  449. return;
  450. }
  451. if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
  452. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  453. return;
  454. }
  455. if (!ensureHighStart(end)) {
  456. errorCode = U_MEMORY_ALLOCATION_ERROR;
  457. return;
  458. }
  459. UChar32 limit = end + 1;
  460. if (start & UCPTRIE_SMALL_DATA_MASK) {
  461. // Set partial block at [start..following block boundary[.
  462. int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
  463. if (block < 0) {
  464. errorCode = U_MEMORY_ALLOCATION_ERROR;
  465. return;
  466. }
  467. UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
  468. if (nextStart <= limit) {
  469. fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
  470. value);
  471. start = nextStart;
  472. } else {
  473. fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
  474. value);
  475. return;
  476. }
  477. }
  478. // Number of positions in the last, partial block.
  479. int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;
  480. // Round down limit to a block boundary.
  481. limit &= ~UCPTRIE_SMALL_DATA_MASK;
  482. // Iterate over all-value blocks.
  483. while (start < limit) {
  484. int32_t i = start >> UCPTRIE_SHIFT_3;
  485. if (flags[i] == ALL_SAME) {
  486. index[i] = value;
  487. } else /* MIXED */ {
  488. fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
  489. }
  490. start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  491. }
  492. if (rest > 0) {
  493. // Set partial block at [last block boundary..limit[.
  494. int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
  495. if (block < 0) {
  496. errorCode = U_MEMORY_ALLOCATION_ERROR;
  497. return;
  498. }
  499. fillBlock(data + block, 0, rest, value);
  500. }
  501. }
  502. /* compaction --------------------------------------------------------------- */
  503. void MutableCodePointTrie::maskValues(uint32_t mask) {
  504. initialValue &= mask;
  505. errorValue &= mask;
  506. highValue &= mask;
  507. int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
  508. for (int32_t i = 0; i < iLimit; ++i) {
  509. if (flags[i] == ALL_SAME) {
  510. index[i] &= mask;
  511. }
  512. }
  513. for (int32_t i = 0; i < dataLength; ++i) {
  514. data[i] &= mask;
  515. }
  516. }
  517. template<typename UIntA, typename UIntB>
  518. bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
  519. while (length > 0 && *s == *t) {
  520. ++s;
  521. ++t;
  522. --length;
  523. }
  524. return length == 0;
  525. }
  526. bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
  527. const uint32_t *pLimit = p + length;
  528. while (p < pLimit && *p == value) { ++p; }
  529. return p == pLimit;
  530. }
  531. /** Search for an identical block. */
  532. int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
  533. const uint16_t *q, int32_t qStart, int32_t blockLength) {
  534. // Ensure that we do not even partially get past length.
  535. length -= blockLength;
  536. q += qStart;
  537. while (pStart <= length) {
  538. if (equalBlocks(p + pStart, q, blockLength)) {
  539. return pStart;
  540. }
  541. ++pStart;
  542. }
  543. return -1;
  544. }
  545. int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
  546. uint32_t value, int32_t blockLength) {
  547. // Ensure that we do not even partially get past limit.
  548. limit -= blockLength;
  549. for (int32_t block = start; block <= limit; ++block) {
  550. if (p[block] == value) {
  551. for (int32_t i = 1;; ++i) {
  552. if (i == blockLength) {
  553. return block;
  554. }
  555. if (p[block + i] != value) {
  556. block += i;
  557. break;
  558. }
  559. }
  560. }
  561. }
  562. return -1;
  563. }
  564. /**
  565. * Look for maximum overlap of the beginning of the other block
  566. * with the previous, adjacent block.
  567. */
  568. template<typename UIntA, typename UIntB>
  569. int32_t getOverlap(const UIntA *p, int32_t length,
  570. const UIntB *q, int32_t qStart, int32_t blockLength) {
  571. int32_t overlap = blockLength - 1;
  572. U_ASSERT(overlap <= length);
  573. q += qStart;
  574. while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
  575. --overlap;
  576. }
  577. return overlap;
  578. }
  579. int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
  580. int32_t blockLength) {
  581. int32_t min = length - (blockLength - 1);
  582. int32_t i = length;
  583. while (min < i && p[i - 1] == value) { --i; }
  584. return length - i;
  585. }
  586. bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
  587. for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
  588. if (index[i] == dataOffset) {
  589. return true;
  590. }
  591. }
  592. return false;
  593. }
  594. /**
  595. * Finds the start of the last range in the trie by enumerating backward.
  596. * Indexes for code points higher than this will be omitted.
  597. */
  598. UChar32 MutableCodePointTrie::findHighStart() const {
  599. int32_t i = highStart >> UCPTRIE_SHIFT_3;
  600. while (i > 0) {
  601. bool match;
  602. if (flags[--i] == ALL_SAME) {
  603. match = index[i] == highValue;
  604. } else /* MIXED */ {
  605. const uint32_t *p = data + index[i];
  606. for (int32_t j = 0;; ++j) {
  607. if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
  608. match = true;
  609. break;
  610. }
  611. if (p[j] != highValue) {
  612. match = false;
  613. break;
  614. }
  615. }
  616. }
  617. if (!match) {
  618. return (i + 1) << UCPTRIE_SHIFT_3;
  619. }
  620. }
  621. return 0;
  622. }
  623. class AllSameBlocks {
  624. public:
  625. static constexpr int32_t NEW_UNIQUE = -1;
  626. static constexpr int32_t OVERFLOW = -2;
  627. AllSameBlocks() : length(0), mostRecent(-1) {}
  628. int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
  629. if (mostRecent >= 0 && values[mostRecent] == value) {
  630. refCounts[mostRecent] += count;
  631. return indexes[mostRecent];
  632. }
  633. for (int32_t i = 0; i < length; ++i) {
  634. if (values[i] == value) {
  635. mostRecent = i;
  636. refCounts[i] += count;
  637. return indexes[i];
  638. }
  639. }
  640. if (length == CAPACITY) {
  641. return OVERFLOW;
  642. }
  643. mostRecent = length;
  644. indexes[length] = index;
  645. values[length] = value;
  646. refCounts[length++] = count;
  647. return NEW_UNIQUE;
  648. }
  649. /** Replaces the block which has the lowest reference count. */
  650. void add(int32_t index, int32_t count, uint32_t value) {
  651. U_ASSERT(length == CAPACITY);
  652. int32_t least = -1;
  653. int32_t leastCount = I_LIMIT;
  654. for (int32_t i = 0; i < length; ++i) {
  655. U_ASSERT(values[i] != value);
  656. if (refCounts[i] < leastCount) {
  657. least = i;
  658. leastCount = refCounts[i];
  659. }
  660. }
  661. U_ASSERT(least >= 0);
  662. mostRecent = least;
  663. indexes[least] = index;
  664. values[least] = value;
  665. refCounts[least] = count;
  666. }
  667. int32_t findMostUsed() const {
  668. if (length == 0) { return -1; }
  669. int32_t max = -1;
  670. int32_t maxCount = 0;
  671. for (int32_t i = 0; i < length; ++i) {
  672. if (refCounts[i] > maxCount) {
  673. max = i;
  674. maxCount = refCounts[i];
  675. }
  676. }
  677. return indexes[max];
  678. }
  679. private:
  680. static constexpr int32_t CAPACITY = 32;
  681. int32_t length;
  682. int32_t mostRecent;
  683. int32_t indexes[CAPACITY];
  684. uint32_t values[CAPACITY];
  685. int32_t refCounts[CAPACITY];
  686. };
  687. // Custom hash table for mixed-value blocks to be found anywhere in the
  688. // compacted data or index so far.
  689. class MixedBlocks {
  690. public:
  691. MixedBlocks() {}
  692. ~MixedBlocks() {
  693. uprv_free(table);
  694. }
  695. bool init(int32_t maxLength, int32_t newBlockLength) {
  696. // We store actual data indexes + 1 to reserve 0 for empty entries.
  697. int32_t maxDataIndex = maxLength - newBlockLength + 1;
  698. int32_t newLength;
  699. if (maxDataIndex <= 0xfff) { // 4k
  700. newLength = 6007;
  701. shift = 12;
  702. mask = 0xfff;
  703. } else if (maxDataIndex <= 0x7fff) { // 32k
  704. newLength = 50021;
  705. shift = 15;
  706. mask = 0x7fff;
  707. } else if (maxDataIndex <= 0x1ffff) { // 128k
  708. newLength = 200003;
  709. shift = 17;
  710. mask = 0x1ffff;
  711. } else {
  712. // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
  713. newLength = 1500007;
  714. shift = 21;
  715. mask = 0x1fffff;
  716. }
  717. if (newLength > capacity) {
  718. uprv_free(table);
  719. table = (uint32_t *)uprv_malloc(newLength * 4);
  720. if (table == nullptr) {
  721. return false;
  722. }
  723. capacity = newLength;
  724. }
  725. length = newLength;
  726. uprv_memset(table, 0, length * 4);
  727. blockLength = newBlockLength;
  728. return true;
  729. }
  730. template<typename UInt>
  731. void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
  732. int32_t start = prevDataLength - blockLength;
  733. if (start >= minStart) {
  734. ++start; // Skip the last block that we added last time.
  735. } else {
  736. start = minStart; // Begin with the first full block.
  737. }
  738. for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
  739. uint32_t hashCode = makeHashCode(data, start);
  740. addEntry(data, start, hashCode, start);
  741. }
  742. }
  743. template<typename UIntA, typename UIntB>
  744. int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
  745. uint32_t hashCode = makeHashCode(blockData, blockStart);
  746. int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
  747. if (entryIndex >= 0) {
  748. return (table[entryIndex] & mask) - 1;
  749. } else {
  750. return -1;
  751. }
  752. }
  753. int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
  754. uint32_t hashCode = makeHashCode(blockValue);
  755. int32_t entryIndex = findEntry(data, blockValue, hashCode);
  756. if (entryIndex >= 0) {
  757. return (table[entryIndex] & mask) - 1;
  758. } else {
  759. return -1;
  760. }
  761. }
  762. private:
  763. template<typename UInt>
  764. uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
  765. int32_t blockLimit = blockStart + blockLength;
  766. uint32_t hashCode = blockData[blockStart++];
  767. do {
  768. hashCode = 37 * hashCode + blockData[blockStart++];
  769. } while (blockStart < blockLimit);
  770. return hashCode;
  771. }
  772. uint32_t makeHashCode(uint32_t blockValue) const {
  773. uint32_t hashCode = blockValue;
  774. for (int32_t i = 1; i < blockLength; ++i) {
  775. hashCode = 37 * hashCode + blockValue;
  776. }
  777. return hashCode;
  778. }
  779. template<typename UInt>
  780. void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
  781. U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
  782. int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
  783. if (entryIndex < 0) {
  784. table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
  785. }
  786. }
  787. template<typename UIntA, typename UIntB>
  788. int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
  789. uint32_t hashCode) const {
  790. uint32_t shiftedHashCode = hashCode << shift;
  791. int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
  792. for (int32_t entryIndex = initialEntryIndex;;) {
  793. uint32_t entry = table[entryIndex];
  794. if (entry == 0) {
  795. return ~entryIndex;
  796. }
  797. if ((entry & ~mask) == shiftedHashCode) {
  798. int32_t dataIndex = (entry & mask) - 1;
  799. if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
  800. return entryIndex;
  801. }
  802. }
  803. entryIndex = nextIndex(initialEntryIndex, entryIndex);
  804. }
  805. }
  806. int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
  807. uint32_t shiftedHashCode = hashCode << shift;
  808. int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
  809. for (int32_t entryIndex = initialEntryIndex;;) {
  810. uint32_t entry = table[entryIndex];
  811. if (entry == 0) {
  812. return ~entryIndex;
  813. }
  814. if ((entry & ~mask) == shiftedHashCode) {
  815. int32_t dataIndex = (entry & mask) - 1;
  816. if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
  817. return entryIndex;
  818. }
  819. }
  820. entryIndex = nextIndex(initialEntryIndex, entryIndex);
  821. }
  822. }
  823. inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
  824. // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
  825. return (entryIndex + initialEntryIndex) % length;
  826. }
  827. // Hash table.
  828. // The length is a prime number, larger than the maximum data length.
  829. // The "shift" lower bits store a data index + 1.
  830. // The remaining upper bits store a partial hashCode of the block data values.
  831. uint32_t *table = nullptr;
  832. int32_t capacity = 0;
  833. int32_t length = 0;
  834. int32_t shift = 0;
  835. uint32_t mask = 0;
  836. int32_t blockLength = 0;
  837. };
  838. int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
  839. #ifdef UCPTRIE_DEBUG
  840. bool overflow = false;
  841. #endif
  842. // ASCII data will be stored as a linear table, even if the following code
  843. // does not yet count it that way.
  844. int32_t newDataCapacity = ASCII_LIMIT;
  845. // Add room for a small data null block in case it would match the start of
  846. // a fast data block where dataNullOffset must not be set in that case.
  847. newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  848. // Add room for special values (errorValue, highValue) and padding.
  849. newDataCapacity += 4;
  850. int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
  851. int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
  852. int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
  853. for (int32_t i = 0; i < iLimit; i += inc) {
  854. if (i == fastILimit) {
  855. blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  856. inc = 1;
  857. }
  858. uint32_t value = index[i];
  859. if (flags[i] == MIXED) {
  860. // Really mixed?
  861. const uint32_t *p = data + value;
  862. value = *p;
  863. if (allValuesSameAs(p + 1, blockLength - 1, value)) {
  864. flags[i] = ALL_SAME;
  865. index[i] = value;
  866. // Fall through to ALL_SAME handling.
  867. } else {
  868. newDataCapacity += blockLength;
  869. continue;
  870. }
  871. } else {
  872. U_ASSERT(flags[i] == ALL_SAME);
  873. if (inc > 1) {
  874. // Do all of the fast-range data block's ALL_SAME parts have the same value?
  875. bool allSame = true;
  876. int32_t next_i = i + inc;
  877. for (int32_t j = i + 1; j < next_i; ++j) {
  878. U_ASSERT(flags[j] == ALL_SAME);
  879. if (index[j] != value) {
  880. allSame = false;
  881. break;
  882. }
  883. }
  884. if (!allSame) {
  885. // Turn it into a MIXED block.
  886. if (getDataBlock(i) < 0) {
  887. return -1;
  888. }
  889. newDataCapacity += blockLength;
  890. continue;
  891. }
  892. }
  893. }
  894. // Is there another ALL_SAME block with the same value?
  895. int32_t other = allSameBlocks.findOrAdd(i, inc, value);
  896. if (other == AllSameBlocks::OVERFLOW) {
  897. // The fixed-size array overflowed. Slow check for a duplicate block.
  898. #ifdef UCPTRIE_DEBUG
  899. if (!overflow) {
  900. puts("UCPTrie AllSameBlocks overflow");
  901. overflow = true;
  902. }
  903. #endif
  904. int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
  905. for (int32_t j = 0;; j += jInc) {
  906. if (j == i) {
  907. allSameBlocks.add(i, inc, value);
  908. break;
  909. }
  910. if (j == fastILimit) {
  911. jInc = 1;
  912. }
  913. if (flags[j] == ALL_SAME && index[j] == value) {
  914. allSameBlocks.add(j, jInc + inc, value);
  915. other = j;
  916. break;
  917. // We could keep counting blocks with the same value
  918. // before we add the first one, which may improve compaction in rare cases,
  919. // but it would make it slower.
  920. }
  921. }
  922. }
  923. if (other >= 0) {
  924. flags[i] = SAME_AS;
  925. index[i] = other;
  926. } else {
  927. // New unique same-value block.
  928. newDataCapacity += blockLength;
  929. }
  930. }
  931. return newDataCapacity;
  932. }
  933. #ifdef UCPTRIE_DEBUG
  934. # define DEBUG_DO(expr) expr
  935. #else
  936. # define DEBUG_DO(expr)
  937. #endif
  938. #ifdef UCPTRIE_DEBUG
  939. // Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
  940. int32_t appendValue(char s[], int32_t length, uint32_t value) {
  941. value ^= value >> 16;
  942. value ^= value >> 8;
  943. s[length] = 0xE2;
  944. s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
  945. s[length + 2] = (char)(0x80 + (value & 0x3F));
  946. return length + 3;
  947. }
  948. void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
  949. UChar32 start, int32_t overlap, uint32_t initialValue) {
  950. char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
  951. int32_t length = 0;
  952. int32_t i;
  953. for (i = 0; i < overlap; ++i) {
  954. length = appendValue(s, length, 0); // Braille blank
  955. }
  956. s[length++] = '|';
  957. for (; i < blockLength; ++i) {
  958. if (block != nullptr) {
  959. value = block[i];
  960. }
  961. if (value == initialValue) {
  962. value = 0x40; // Braille lower left dot
  963. }
  964. length = appendValue(s, length, value);
  965. }
  966. s[length] = 0;
  967. start += overlap;
  968. if (start <= 0xffff) {
  969. printf(" %04lX %s|\n", (long)start, s);
  970. } else if (start <= 0xfffff) {
  971. printf(" %5lX %s|\n", (long)start, s);
  972. } else {
  973. printf(" %6lX %s|\n", (long)start, s);
  974. }
  975. }
  976. #endif
  977. /**
  978. * Compacts a build-time trie.
  979. *
  980. * The compaction
  981. * - removes blocks that are identical with earlier ones
  982. * - overlaps each new non-duplicate block as much as possible with the previously-written one
  983. * - works with fast-range data blocks whose length is a multiple of that of
  984. * higher-code-point data blocks
  985. *
  986. * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
  987. */
  988. int32_t MutableCodePointTrie::compactData(
  989. int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
  990. int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
  991. #ifdef UCPTRIE_DEBUG
  992. int32_t countSame=0, sumOverlaps=0;
  993. bool printData = dataLength == 29088 /* line.brk */ ||
  994. // dataLength == 30048 /* CanonIterData */ ||
  995. dataLength == 50400 /* zh.txt~stroke */;
  996. #endif
  997. // The linear ASCII data has been copied into newData already.
  998. int32_t newDataLength = 0;
  999. for (int32_t i = 0; newDataLength < ASCII_LIMIT;
  1000. newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
  1001. index[i] = newDataLength;
  1002. #ifdef UCPTRIE_DEBUG
  1003. if (printData) {
  1004. printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
  1005. }
  1006. #endif
  1007. }
  1008. int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
  1009. if (!mixedBlocks.init(newDataCapacity, blockLength)) {
  1010. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1011. return 0;
  1012. }
  1013. mixedBlocks.extend(newData, 0, 0, newDataLength);
  1014. int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
  1015. int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
  1016. int32_t fastLength = 0;
  1017. for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
  1018. if (i == fastILimit) {
  1019. blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  1020. inc = 1;
  1021. fastLength = newDataLength;
  1022. if (!mixedBlocks.init(newDataCapacity, blockLength)) {
  1023. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1024. return 0;
  1025. }
  1026. mixedBlocks.extend(newData, 0, 0, newDataLength);
  1027. }
  1028. if (flags[i] == ALL_SAME) {
  1029. uint32_t value = index[i];
  1030. // Find an earlier part of the data array of length blockLength
  1031. // that is filled with this value.
  1032. int32_t n = mixedBlocks.findAllSameBlock(newData, value);
  1033. // If we find a match, and the current block is the data null block,
  1034. // and it is not a fast block but matches the start of a fast block,
  1035. // then we need to continue looking.
  1036. // This is because this small block is shorter than the fast block,
  1037. // and not all of the rest of the fast block is filled with this value.
  1038. // Otherwise trie.getRange() would detect that the fast block starts at
  1039. // dataNullOffset and assume incorrectly that it is filled with the null value.
  1040. while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
  1041. isStartOfSomeFastBlock(n, index, fastILimit)) {
  1042. n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
  1043. }
  1044. if (n >= 0) {
  1045. DEBUG_DO(++countSame);
  1046. index[i] = n;
  1047. } else {
  1048. n = getAllSameOverlap(newData, newDataLength, value, blockLength);
  1049. DEBUG_DO(sumOverlaps += n);
  1050. #ifdef UCPTRIE_DEBUG
  1051. if (printData) {
  1052. printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
  1053. }
  1054. #endif
  1055. index[i] = newDataLength - n;
  1056. int32_t prevDataLength = newDataLength;
  1057. while (n < blockLength) {
  1058. newData[newDataLength++] = value;
  1059. ++n;
  1060. }
  1061. mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
  1062. }
  1063. } else if (flags[i] == MIXED) {
  1064. const uint32_t *block = data + index[i];
  1065. int32_t n = mixedBlocks.findBlock(newData, block, 0);
  1066. if (n >= 0) {
  1067. DEBUG_DO(++countSame);
  1068. index[i] = n;
  1069. } else {
  1070. n = getOverlap(newData, newDataLength, block, 0, blockLength);
  1071. DEBUG_DO(sumOverlaps += n);
  1072. #ifdef UCPTRIE_DEBUG
  1073. if (printData) {
  1074. printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
  1075. }
  1076. #endif
  1077. index[i] = newDataLength - n;
  1078. int32_t prevDataLength = newDataLength;
  1079. while (n < blockLength) {
  1080. newData[newDataLength++] = block[n++];
  1081. }
  1082. mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
  1083. }
  1084. } else /* SAME_AS */ {
  1085. uint32_t j = index[i];
  1086. index[i] = index[j];
  1087. }
  1088. }
  1089. #ifdef UCPTRIE_DEBUG
  1090. /* we saved some space */
  1091. printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
  1092. (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
  1093. #endif
  1094. return newDataLength;
  1095. }
  1096. int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
  1097. UErrorCode &errorCode) {
  1098. int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
  1099. if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
  1100. // Only the linear fast index, no multi-stage index tables.
  1101. index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
  1102. return fastIndexLength;
  1103. }
  1104. // Condense the fast index table.
  1105. // Also, does it contain an index-3 block with all dataNullOffset?
  1106. uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength
  1107. int32_t i3FirstNull = -1;
  1108. for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
  1109. uint32_t i3 = index[i];
  1110. fastIndex[j] = (uint16_t)i3;
  1111. if (i3 == (uint32_t)dataNullOffset) {
  1112. if (i3FirstNull < 0) {
  1113. i3FirstNull = j;
  1114. } else if (index3NullOffset < 0 &&
  1115. (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
  1116. index3NullOffset = i3FirstNull;
  1117. }
  1118. } else {
  1119. i3FirstNull = -1;
  1120. }
  1121. // Set the index entries that compactData() skipped.
  1122. // Needed when the multi-stage index covers the fast index range as well.
  1123. int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
  1124. while (++i < iNext) {
  1125. i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
  1126. index[i] = i3;
  1127. }
  1128. }
  1129. if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
  1130. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1131. return 0;
  1132. }
  1133. mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
  1134. // Examine index-3 blocks. For each determine one of:
  1135. // - same as the index-3 null block
  1136. // - same as a fast-index block
  1137. // - 16-bit indexes
  1138. // - 18-bit indexes
  1139. // We store this in the first flags entry for the index-3 block.
  1140. //
  1141. // Also determine an upper limit for the index-3 table length.
  1142. int32_t index3Capacity = 0;
  1143. i3FirstNull = index3NullOffset;
  1144. bool hasLongI3Blocks = false;
  1145. // If the fast index covers the whole BMP, then
  1146. // the multi-stage index is only for supplementary code points.
  1147. // Otherwise, the multi-stage index covers all of Unicode.
  1148. int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
  1149. int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
  1150. for (int32_t i = iStart; i < iLimit;) {
  1151. int32_t j = i;
  1152. int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
  1153. uint32_t oredI3 = 0;
  1154. bool isNull = true;
  1155. do {
  1156. uint32_t i3 = index[j];
  1157. oredI3 |= i3;
  1158. if (i3 != (uint32_t)dataNullOffset) {
  1159. isNull = false;
  1160. }
  1161. } while (++j < jLimit);
  1162. if (isNull) {
  1163. flags[i] = I3_NULL;
  1164. if (i3FirstNull < 0) {
  1165. if (oredI3 <= 0xffff) {
  1166. index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
  1167. } else {
  1168. index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
  1169. hasLongI3Blocks = true;
  1170. }
  1171. i3FirstNull = 0;
  1172. }
  1173. } else {
  1174. if (oredI3 <= 0xffff) {
  1175. int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
  1176. if (n >= 0) {
  1177. flags[i] = I3_BMP;
  1178. index[i] = n;
  1179. } else {
  1180. flags[i] = I3_16;
  1181. index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
  1182. }
  1183. } else {
  1184. flags[i] = I3_18;
  1185. index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
  1186. hasLongI3Blocks = true;
  1187. }
  1188. }
  1189. i = j;
  1190. }
  1191. int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;
  1192. // Length of the index-1 table, rounded up.
  1193. int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;
  1194. // Index table: Fast index, index-1, index-3, index-2.
  1195. // +1 for possible index table padding.
  1196. int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
  1197. index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
  1198. if (index16 == nullptr) {
  1199. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1200. return 0;
  1201. }
  1202. uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
  1203. if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
  1204. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1205. return 0;
  1206. }
  1207. MixedBlocks longI3Blocks;
  1208. if (hasLongI3Blocks) {
  1209. if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
  1210. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1211. return 0;
  1212. }
  1213. }
  1214. // Compact the index-3 table and write an uncompacted version of the index-2 table.
  1215. uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
  1216. int32_t i2Length = 0;
  1217. i3FirstNull = index3NullOffset;
  1218. int32_t index3Start = fastIndexLength + index1Length;
  1219. int32_t indexLength = index3Start;
  1220. for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
  1221. int32_t i3;
  1222. uint8_t f = flags[i];
  1223. if (f == I3_NULL && i3FirstNull < 0) {
  1224. // First index-3 null block. Write & overlap it like a normal block, then remember it.
  1225. f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
  1226. i3FirstNull = 0;
  1227. }
  1228. if (f == I3_NULL) {
  1229. i3 = index3NullOffset;
  1230. } else if (f == I3_BMP) {
  1231. i3 = index[i];
  1232. } else if (f == I3_16) {
  1233. int32_t n = mixedBlocks.findBlock(index16, index, i);
  1234. if (n >= 0) {
  1235. i3 = n;
  1236. } else {
  1237. if (indexLength == index3Start) {
  1238. // No overlap at the boundary between the index-1 and index-3 tables.
  1239. n = 0;
  1240. } else {
  1241. n = getOverlap(index16, indexLength,
  1242. index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
  1243. }
  1244. i3 = indexLength - n;
  1245. int32_t prevIndexLength = indexLength;
  1246. while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
  1247. index16[indexLength++] = index[i + n++];
  1248. }
  1249. mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
  1250. if (hasLongI3Blocks) {
  1251. longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
  1252. }
  1253. }
  1254. } else {
  1255. U_ASSERT(f == I3_18);
  1256. U_ASSERT(hasLongI3Blocks);
  1257. // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
  1258. int32_t j = i;
  1259. int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
  1260. int32_t k = indexLength;
  1261. do {
  1262. ++k;
  1263. uint32_t v = index[j++];
  1264. uint32_t upperBits = (v & 0x30000) >> 2;
  1265. index16[k++] = v;
  1266. v = index[j++];
  1267. upperBits |= (v & 0x30000) >> 4;
  1268. index16[k++] = v;
  1269. v = index[j++];
  1270. upperBits |= (v & 0x30000) >> 6;
  1271. index16[k++] = v;
  1272. v = index[j++];
  1273. upperBits |= (v & 0x30000) >> 8;
  1274. index16[k++] = v;
  1275. v = index[j++];
  1276. upperBits |= (v & 0x30000) >> 10;
  1277. index16[k++] = v;
  1278. v = index[j++];
  1279. upperBits |= (v & 0x30000) >> 12;
  1280. index16[k++] = v;
  1281. v = index[j++];
  1282. upperBits |= (v & 0x30000) >> 14;
  1283. index16[k++] = v;
  1284. v = index[j++];
  1285. upperBits |= (v & 0x30000) >> 16;
  1286. index16[k++] = v;
  1287. index16[k - 9] = upperBits;
  1288. } while (j < jLimit);
  1289. int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
  1290. if (n >= 0) {
  1291. i3 = n | 0x8000;
  1292. } else {
  1293. if (indexLength == index3Start) {
  1294. // No overlap at the boundary between the index-1 and index-3 tables.
  1295. n = 0;
  1296. } else {
  1297. n = getOverlap(index16, indexLength,
  1298. index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
  1299. }
  1300. i3 = (indexLength - n) | 0x8000;
  1301. int32_t prevIndexLength = indexLength;
  1302. if (n > 0) {
  1303. int32_t start = indexLength;
  1304. while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
  1305. index16[indexLength++] = index16[start + n++];
  1306. }
  1307. } else {
  1308. indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
  1309. }
  1310. mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
  1311. if (hasLongI3Blocks) {
  1312. longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
  1313. }
  1314. }
  1315. }
  1316. if (index3NullOffset < 0 && i3FirstNull >= 0) {
  1317. index3NullOffset = i3;
  1318. }
  1319. // Set the index-2 table entry.
  1320. index2[i2Length++] = i3;
  1321. }
  1322. U_ASSERT(i2Length == index2Capacity);
  1323. U_ASSERT(indexLength <= index3Start + index3Capacity);
  1324. if (index3NullOffset < 0) {
  1325. index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
  1326. }
  1327. if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
  1328. // The index-3 offsets exceed 15 bits, or
  1329. // the last one cannot be distinguished from the no-null-block value.
  1330. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  1331. return 0;
  1332. }
  1333. // Compact the index-2 table and write the index-1 table.
  1334. static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
  1335. "must re-init mixedBlocks");
  1336. int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
  1337. int32_t i1 = fastIndexLength;
  1338. for (int32_t i = 0; i < i2Length; i += blockLength) {
  1339. int32_t n;
  1340. if ((i2Length - i) >= blockLength) {
  1341. // normal block
  1342. U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
  1343. n = mixedBlocks.findBlock(index16, index2, i);
  1344. } else {
  1345. // highStart is inside the last index-2 block. Shorten it.
  1346. blockLength = i2Length - i;
  1347. n = findSameBlock(index16, index3Start, indexLength,
  1348. index2, i, blockLength);
  1349. }
  1350. int32_t i2;
  1351. if (n >= 0) {
  1352. i2 = n;
  1353. } else {
  1354. if (indexLength == index3Start) {
  1355. // No overlap at the boundary between the index-1 and index-3/2 tables.
  1356. n = 0;
  1357. } else {
  1358. n = getOverlap(index16, indexLength, index2, i, blockLength);
  1359. }
  1360. i2 = indexLength - n;
  1361. int32_t prevIndexLength = indexLength;
  1362. while (n < blockLength) {
  1363. index16[indexLength++] = index2[i + n++];
  1364. }
  1365. mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
  1366. }
  1367. // Set the index-1 table entry.
  1368. index16[i1++] = i2;
  1369. }
  1370. U_ASSERT(i1 == index3Start);
  1371. U_ASSERT(indexLength <= index16Capacity);
  1372. #ifdef UCPTRIE_DEBUG
  1373. /* we saved some space */
  1374. printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
  1375. (long)iLimit, (long)indexLength);
  1376. #endif
  1377. return indexLength;
  1378. }
  1379. int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
  1380. // Find the real highStart and round it up.
  1381. U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
  1382. highValue = get(MAX_UNICODE);
  1383. int32_t realHighStart = findHighStart();
  1384. realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
  1385. ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
  1386. if (realHighStart == UNICODE_LIMIT) {
  1387. highValue = initialValue;
  1388. }
  1389. #ifdef UCPTRIE_DEBUG
  1390. printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
  1391. (long)realHighStart, (long)highValue, (long)initialValue);
  1392. #endif
  1393. // We always store indexes and data values for the fast range.
  1394. // Pin highStart to the top of that range while building.
  1395. UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
  1396. if (realHighStart < fastLimit) {
  1397. for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
  1398. flags[i] = ALL_SAME;
  1399. index[i] = highValue;
  1400. }
  1401. highStart = fastLimit;
  1402. } else {
  1403. highStart = realHighStart;
  1404. }
  1405. uint32_t asciiData[ASCII_LIMIT];
  1406. for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
  1407. asciiData[i] = get(i);
  1408. }
  1409. // First we look for which data blocks have the same value repeated over the whole block,
  1410. // deduplicate such blocks, find a good null data block (for faster enumeration),
  1411. // and get an upper bound for the necessary data array length.
  1412. AllSameBlocks allSameBlocks;
  1413. int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
  1414. if (newDataCapacity < 0) {
  1415. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1416. return 0;
  1417. }
  1418. uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
  1419. if (newData == nullptr) {
  1420. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1421. return 0;
  1422. }
  1423. uprv_memcpy(newData, asciiData, sizeof(asciiData));
  1424. int32_t dataNullIndex = allSameBlocks.findMostUsed();
  1425. MixedBlocks mixedBlocks;
  1426. int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
  1427. dataNullIndex, mixedBlocks, errorCode);
  1428. if (U_FAILURE(errorCode)) { return 0; }
  1429. U_ASSERT(newDataLength <= newDataCapacity);
  1430. uprv_free(data);
  1431. data = newData;
  1432. dataCapacity = newDataCapacity;
  1433. dataLength = newDataLength;
  1434. if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
  1435. // The offset of the last data block is too high to be stored in the index table.
  1436. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  1437. return 0;
  1438. }
  1439. if (dataNullIndex >= 0) {
  1440. dataNullOffset = index[dataNullIndex];
  1441. #ifdef UCPTRIE_DEBUG
  1442. if (data[dataNullOffset] != initialValue) {
  1443. printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
  1444. (long)initialValue, (long)data[dataNullOffset]);
  1445. }
  1446. #endif
  1447. initialValue = data[dataNullOffset];
  1448. } else {
  1449. dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
  1450. }
  1451. int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
  1452. highStart = realHighStart;
  1453. return indexLength;
  1454. }
  1455. UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
  1456. if (U_FAILURE(errorCode)) {
  1457. return nullptr;
  1458. }
  1459. if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
  1460. valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
  1461. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1462. return nullptr;
  1463. }
  1464. // The mutable trie always stores 32-bit values.
  1465. // When we build a UCPTrie for a smaller value width, we first mask off unused bits
  1466. // before compacting the data.
  1467. switch (valueWidth) {
  1468. case UCPTRIE_VALUE_BITS_32:
  1469. break;
  1470. case UCPTRIE_VALUE_BITS_16:
  1471. maskValues(0xffff);
  1472. break;
  1473. case UCPTRIE_VALUE_BITS_8:
  1474. maskValues(0xff);
  1475. break;
  1476. default:
  1477. break;
  1478. }
  1479. UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
  1480. int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
  1481. if (U_FAILURE(errorCode)) {
  1482. clear();
  1483. return nullptr;
  1484. }
  1485. // Ensure data table alignment: The index length must be even for uint32_t data.
  1486. if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
  1487. index16[indexLength++] = 0xffee; // arbitrary value
  1488. }
  1489. // Make the total trie structure length a multiple of 4 bytes by padding the data table,
  1490. // and store special values as the last two data values.
  1491. int32_t length = indexLength * 2;
  1492. if (valueWidth == UCPTRIE_VALUE_BITS_16) {
  1493. if (((indexLength ^ dataLength) & 1) != 0) {
  1494. // padding
  1495. data[dataLength++] = errorValue;
  1496. }
  1497. if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
  1498. data[dataLength++] = highValue;
  1499. data[dataLength++] = errorValue;
  1500. }
  1501. length += dataLength * 2;
  1502. } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
  1503. // 32-bit data words never need padding to a multiple of 4 bytes.
  1504. if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
  1505. if (data[dataLength - 1] != highValue) {
  1506. data[dataLength++] = highValue;
  1507. }
  1508. data[dataLength++] = errorValue;
  1509. }
  1510. length += dataLength * 4;
  1511. } else {
  1512. int32_t and3 = (length + dataLength) & 3;
  1513. if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
  1514. // all set
  1515. } else if(and3 == 3 && data[dataLength - 1] == highValue) {
  1516. data[dataLength++] = errorValue;
  1517. } else {
  1518. while (and3 != 2) {
  1519. data[dataLength++] = highValue;
  1520. and3 = (and3 + 1) & 3;
  1521. }
  1522. data[dataLength++] = highValue;
  1523. data[dataLength++] = errorValue;
  1524. }
  1525. length += dataLength;
  1526. }
  1527. // Calculate the total length of the UCPTrie as a single memory block.
  1528. length += sizeof(UCPTrie);
  1529. U_ASSERT((length & 3) == 0);
  1530. uint8_t *bytes = (uint8_t *)uprv_malloc(length);
  1531. if (bytes == nullptr) {
  1532. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1533. clear();
  1534. return nullptr;
  1535. }
  1536. UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
  1537. uprv_memset(trie, 0, sizeof(UCPTrie));
  1538. trie->indexLength = indexLength;
  1539. trie->dataLength = dataLength;
  1540. trie->highStart = highStart;
  1541. // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
  1542. // Runtime code needs to then test for the real highStart as well.
  1543. trie->shifted12HighStart = (highStart + 0xfff) >> 12;
  1544. trie->type = type;
  1545. trie->valueWidth = valueWidth;
  1546. trie->index3NullOffset = index3NullOffset;
  1547. trie->dataNullOffset = dataNullOffset;
  1548. trie->nullValue = initialValue;
  1549. bytes += sizeof(UCPTrie);
  1550. // Fill the index and data arrays.
  1551. uint16_t *dest16 = (uint16_t *)bytes;
  1552. trie->index = dest16;
  1553. if (highStart <= fastLimit) {
  1554. // Condense only the fast index from the mutable-trie index.
  1555. for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
  1556. *dest16++ = (uint16_t)index[i]; // dest16[j]
  1557. }
  1558. } else {
  1559. uprv_memcpy(dest16, index16, indexLength * 2);
  1560. dest16 += indexLength;
  1561. }
  1562. bytes += indexLength * 2;
  1563. // Write the data array.
  1564. const uint32_t *p = data;
  1565. switch (valueWidth) {
  1566. case UCPTRIE_VALUE_BITS_16:
  1567. // Write 16-bit data values.
  1568. trie->data.ptr16 = dest16;
  1569. for (int32_t i = dataLength; i > 0; --i) {
  1570. *dest16++ = (uint16_t)*p++;
  1571. }
  1572. break;
  1573. case UCPTRIE_VALUE_BITS_32:
  1574. // Write 32-bit data values.
  1575. trie->data.ptr32 = (uint32_t *)bytes;
  1576. uprv_memcpy(bytes, p, (size_t)dataLength * 4);
  1577. break;
  1578. case UCPTRIE_VALUE_BITS_8:
  1579. // Write 8-bit data values.
  1580. trie->data.ptr8 = bytes;
  1581. for (int32_t i = dataLength; i > 0; --i) {
  1582. *bytes++ = (uint8_t)*p++;
  1583. }
  1584. break;
  1585. default:
  1586. // Will not occur, valueWidth checked at the beginning.
  1587. break;
  1588. }
  1589. #ifdef UCPTRIE_DEBUG
  1590. trie->name = name;
  1591. ucptrie_printLengths(trie, "");
  1592. #endif
  1593. clear();
  1594. return trie;
  1595. }
  1596. } // namespace
  1597. U_NAMESPACE_END
  1598. U_NAMESPACE_USE
  1599. U_CAPI UMutableCPTrie * U_EXPORT2
  1600. umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
  1601. if (U_FAILURE(*pErrorCode)) {
  1602. return nullptr;
  1603. }
  1604. LocalPointer<MutableCodePointTrie> trie(
  1605. new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
  1606. if (U_FAILURE(*pErrorCode)) {
  1607. return nullptr;
  1608. }
  1609. return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
  1610. }
  1611. U_CAPI UMutableCPTrie * U_EXPORT2
  1612. umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
  1613. if (U_FAILURE(*pErrorCode)) {
  1614. return nullptr;
  1615. }
  1616. if (other == nullptr) {
  1617. return nullptr;
  1618. }
  1619. LocalPointer<MutableCodePointTrie> clone(
  1620. new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
  1621. if (U_FAILURE(*pErrorCode)) {
  1622. return nullptr;
  1623. }
  1624. return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
  1625. }
  1626. U_CAPI void U_EXPORT2
  1627. umutablecptrie_close(UMutableCPTrie *trie) {
  1628. delete reinterpret_cast<MutableCodePointTrie *>(trie);
  1629. }
  1630. U_CAPI UMutableCPTrie * U_EXPORT2
  1631. umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
  1632. if (U_FAILURE(*pErrorCode)) {
  1633. return nullptr;
  1634. }
  1635. if (map == nullptr) {
  1636. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1637. return nullptr;
  1638. }
  1639. return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
  1640. }
  1641. U_CAPI UMutableCPTrie * U_EXPORT2
  1642. umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
  1643. if (U_FAILURE(*pErrorCode)) {
  1644. return nullptr;
  1645. }
  1646. if (trie == nullptr) {
  1647. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1648. return nullptr;
  1649. }
  1650. return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
  1651. }
  1652. U_CAPI uint32_t U_EXPORT2
  1653. umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
  1654. return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
  1655. }
  1656. namespace {
  1657. UChar32 getRange(const void *trie, UChar32 start,
  1658. UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
  1659. return reinterpret_cast<const MutableCodePointTrie *>(trie)->
  1660. getRange(start, filter, context, pValue);
  1661. }
  1662. } // namespace
  1663. U_CAPI UChar32 U_EXPORT2
  1664. umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
  1665. UCPMapRangeOption option, uint32_t surrogateValue,
  1666. UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
  1667. return ucptrie_internalGetRange(getRange, trie, start,
  1668. option, surrogateValue,
  1669. filter, context, pValue);
  1670. }
  1671. U_CAPI void U_EXPORT2
  1672. umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
  1673. if (U_FAILURE(*pErrorCode)) {
  1674. return;
  1675. }
  1676. reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
  1677. }
  1678. U_CAPI void U_EXPORT2
  1679. umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
  1680. uint32_t value, UErrorCode *pErrorCode) {
  1681. if (U_FAILURE(*pErrorCode)) {
  1682. return;
  1683. }
  1684. reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
  1685. }
  1686. /* Compact and internally serialize the trie. */
  1687. U_CAPI UCPTrie * U_EXPORT2
  1688. umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
  1689. UErrorCode *pErrorCode) {
  1690. if (U_FAILURE(*pErrorCode)) {
  1691. return nullptr;
  1692. }
  1693. return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
  1694. }
  1695. #ifdef UCPTRIE_DEBUG
  1696. U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
  1697. reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
  1698. }
  1699. #endif