collationweights.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2015, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: collationweights.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2001mar08 as ucol_wgt.cpp
  16. * created by: Markus W. Scherer
  17. *
  18. * This file contains code for allocating n collation element weights
  19. * between two exclusive limits.
  20. * It is used only internally by the collation tailoring builder.
  21. */
  22. #include "unicode/utypes.h"
  23. #if !UCONFIG_NO_COLLATION
  24. #include "cmemory.h"
  25. #include "collation.h"
  26. #include "collationweights.h"
  27. #include "uarrsort.h"
  28. #include "uassert.h"
  29. #ifdef UCOL_DEBUG
  30. # include <stdio.h>
  31. #endif
  32. U_NAMESPACE_BEGIN
  33. /* collation element weight allocation -------------------------------------- */
  34. /* helper functions for CE weights */
  35. static inline uint32_t
  36. getWeightTrail(uint32_t weight, int32_t length) {
  37. return (uint32_t)(weight>>(8*(4-length)))&0xff;
  38. }
  39. static inline uint32_t
  40. setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
  41. length=8*(4-length);
  42. return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
  43. }
  44. static inline uint32_t
  45. getWeightByte(uint32_t weight, int32_t idx) {
  46. return getWeightTrail(weight, idx); /* same calculation */
  47. }
  48. static inline uint32_t
  49. setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
  50. uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
  51. idx*=8;
  52. if(idx<32) {
  53. mask=((uint32_t)0xffffffff)>>idx;
  54. } else {
  55. // Do not use uint32_t>>32 because on some platforms that does not shift at all
  56. // while we need it to become 0.
  57. // PowerPC: 0xffffffff>>32 = 0 (wanted)
  58. // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
  59. //
  60. // ANSI C99 6.5.7 Bitwise shift operators:
  61. // "If the value of the right operand is negative
  62. // or is greater than or equal to the width of the promoted left operand,
  63. // the behavior is undefined."
  64. mask=0;
  65. }
  66. idx=32-idx;
  67. mask|=0xffffff00<<idx;
  68. return (uint32_t)((weight&mask)|(byte<<idx));
  69. }
  70. static inline uint32_t
  71. truncateWeight(uint32_t weight, int32_t length) {
  72. return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
  73. }
  74. static inline uint32_t
  75. incWeightTrail(uint32_t weight, int32_t length) {
  76. return (uint32_t)(weight+(1UL<<(8*(4-length))));
  77. }
  78. static inline uint32_t
  79. decWeightTrail(uint32_t weight, int32_t length) {
  80. return (uint32_t)(weight-(1UL<<(8*(4-length))));
  81. }
  82. CollationWeights::CollationWeights()
  83. : middleLength(0), rangeIndex(0), rangeCount(0) {
  84. for(int32_t i = 0; i < 5; ++i) {
  85. minBytes[i] = maxBytes[i] = 0;
  86. }
  87. }
  88. void
  89. CollationWeights::initForPrimary(UBool compressible) {
  90. middleLength=1;
  91. minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
  92. maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
  93. if(compressible) {
  94. minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
  95. maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
  96. } else {
  97. minBytes[2] = 2;
  98. maxBytes[2] = 0xff;
  99. }
  100. minBytes[3] = 2;
  101. maxBytes[3] = 0xff;
  102. minBytes[4] = 2;
  103. maxBytes[4] = 0xff;
  104. }
  105. void
  106. CollationWeights::initForSecondary() {
  107. // We use only the lower 16 bits for secondary weights.
  108. middleLength=3;
  109. minBytes[1] = 0;
  110. maxBytes[1] = 0;
  111. minBytes[2] = 0;
  112. maxBytes[2] = 0;
  113. minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
  114. maxBytes[3] = 0xff;
  115. minBytes[4] = 2;
  116. maxBytes[4] = 0xff;
  117. }
  118. void
  119. CollationWeights::initForTertiary() {
  120. // We use only the lower 16 bits for tertiary weights.
  121. middleLength=3;
  122. minBytes[1] = 0;
  123. maxBytes[1] = 0;
  124. minBytes[2] = 0;
  125. maxBytes[2] = 0;
  126. // We use only 6 bits per byte.
  127. // The other bits are used for case & quaternary weights.
  128. minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
  129. maxBytes[3] = 0x3f;
  130. minBytes[4] = 2;
  131. maxBytes[4] = 0x3f;
  132. }
  133. uint32_t
  134. CollationWeights::incWeight(uint32_t weight, int32_t length) const {
  135. for(;;) {
  136. uint32_t byte=getWeightByte(weight, length);
  137. if(byte<maxBytes[length]) {
  138. return setWeightByte(weight, length, byte+1);
  139. } else {
  140. // Roll over, set this byte to the minimum and increment the previous one.
  141. weight=setWeightByte(weight, length, minBytes[length]);
  142. --length;
  143. U_ASSERT(length > 0);
  144. }
  145. }
  146. }
  147. uint32_t
  148. CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
  149. for(;;) {
  150. offset += getWeightByte(weight, length);
  151. if((uint32_t)offset <= maxBytes[length]) {
  152. return setWeightByte(weight, length, offset);
  153. } else {
  154. // Split the offset between this byte and the previous one.
  155. offset -= minBytes[length];
  156. weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
  157. offset /= countBytes(length);
  158. --length;
  159. U_ASSERT(length > 0);
  160. }
  161. }
  162. }
  163. void
  164. CollationWeights::lengthenRange(WeightRange &range) const {
  165. int32_t length=range.length+1;
  166. range.start=setWeightTrail(range.start, length, minBytes[length]);
  167. range.end=setWeightTrail(range.end, length, maxBytes[length]);
  168. range.count*=countBytes(length);
  169. range.length=length;
  170. }
  171. /* for uprv_sortArray: sort ranges in weight order */
  172. static int32_t U_CALLCONV
  173. compareRanges(const void * /*context*/, const void *left, const void *right) {
  174. uint32_t l, r;
  175. l=((const CollationWeights::WeightRange *)left)->start;
  176. r=((const CollationWeights::WeightRange *)right)->start;
  177. if(l<r) {
  178. return -1;
  179. } else if(l>r) {
  180. return 1;
  181. } else {
  182. return 0;
  183. }
  184. }
  185. UBool
  186. CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
  187. U_ASSERT(lowerLimit != 0);
  188. U_ASSERT(upperLimit != 0);
  189. /* get the lengths of the limits */
  190. int32_t lowerLength=lengthOfWeight(lowerLimit);
  191. int32_t upperLength=lengthOfWeight(upperLimit);
  192. #ifdef UCOL_DEBUG
  193. printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
  194. printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
  195. #endif
  196. U_ASSERT(lowerLength>=middleLength);
  197. // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
  198. if(lowerLimit>=upperLimit) {
  199. #ifdef UCOL_DEBUG
  200. printf("error: no space between lower & upper limits\n");
  201. #endif
  202. return false;
  203. }
  204. /* check that neither is a prefix of the other */
  205. if(lowerLength<upperLength) {
  206. if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
  207. #ifdef UCOL_DEBUG
  208. printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
  209. #endif
  210. return false;
  211. }
  212. }
  213. /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
  214. WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
  215. uprv_memset(lower, 0, sizeof(lower));
  216. uprv_memset(&middle, 0, sizeof(middle));
  217. uprv_memset(upper, 0, sizeof(upper));
  218. /*
  219. * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
  220. * range minimum length
  221. * lower[4] 4
  222. * lower[3] 3
  223. * lower[2] 2
  224. * middle 1
  225. * upper[2] 2
  226. * upper[3] 3
  227. * upper[4] 4
  228. *
  229. * We are now going to calculate up to 7 ranges.
  230. * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
  231. */
  232. uint32_t weight=lowerLimit;
  233. for(int32_t length=lowerLength; length>middleLength; --length) {
  234. uint32_t trail=getWeightTrail(weight, length);
  235. if(trail<maxBytes[length]) {
  236. lower[length].start=incWeightTrail(weight, length);
  237. lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
  238. lower[length].length=length;
  239. lower[length].count=maxBytes[length]-trail;
  240. }
  241. weight=truncateWeight(weight, length-1);
  242. }
  243. if(weight<0xff000000) {
  244. middle.start=incWeightTrail(weight, middleLength);
  245. } else {
  246. // Prevent overflow for primary lead byte FF
  247. // which would yield a middle range starting at 0.
  248. middle.start=0xffffffff; // no middle range
  249. }
  250. weight=upperLimit;
  251. for(int32_t length=upperLength; length>middleLength; --length) {
  252. uint32_t trail=getWeightTrail(weight, length);
  253. if(trail>minBytes[length]) {
  254. upper[length].start=setWeightTrail(weight, length, minBytes[length]);
  255. upper[length].end=decWeightTrail(weight, length);
  256. upper[length].length=length;
  257. upper[length].count=trail-minBytes[length];
  258. }
  259. weight=truncateWeight(weight, length-1);
  260. }
  261. middle.end=decWeightTrail(weight, middleLength);
  262. /* set the middle range */
  263. middle.length=middleLength;
  264. if(middle.end>=middle.start) {
  265. middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
  266. } else {
  267. /* no middle range, eliminate overlaps */
  268. for(int32_t length=4; length>middleLength; --length) {
  269. if(lower[length].count>0 && upper[length].count>0) {
  270. // Note: The lowerEnd and upperStart weights are versions of
  271. // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
  272. // truncated (still less-or-equal)
  273. // and then with their last bytes changed to the
  274. // maxByte (for lowerEnd) or minByte (for upperStart).
  275. const uint32_t lowerEnd=lower[length].end;
  276. const uint32_t upperStart=upper[length].start;
  277. UBool merged=false;
  278. if(lowerEnd>upperStart) {
  279. // These two lower and upper ranges collide.
  280. // Since lowerLimit<upperLimit and lowerEnd and upperStart
  281. // are versions with only their last bytes modified
  282. // (and following ones removed/reset to 0),
  283. // lowerEnd>upperStart is only possible
  284. // if the leading bytes are equal
  285. // and lastByte(lowerEnd)>lastByte(upperStart).
  286. U_ASSERT(truncateWeight(lowerEnd, length-1)==
  287. truncateWeight(upperStart, length-1));
  288. // Intersect these two ranges.
  289. lower[length].end=upper[length].end;
  290. lower[length].count=
  291. (int32_t)getWeightTrail(lower[length].end, length)-
  292. (int32_t)getWeightTrail(lower[length].start, length)+1;
  293. // count might be <=0 in which case there is no room,
  294. // and the range-collecting code below will ignore this range.
  295. merged=true;
  296. } else if(lowerEnd==upperStart) {
  297. // Not possible, unless minByte==maxByte which is not allowed.
  298. U_ASSERT(minBytes[length]<maxBytes[length]);
  299. } else /* lowerEnd<upperStart */ {
  300. if(incWeight(lowerEnd, length)==upperStart) {
  301. // Merge adjacent ranges.
  302. lower[length].end=upper[length].end;
  303. lower[length].count+=upper[length].count; // might be >countBytes
  304. merged=true;
  305. }
  306. }
  307. if(merged) {
  308. // Remove all shorter ranges.
  309. // There was no room available for them between the ranges we just merged.
  310. upper[length].count=0;
  311. while(--length>middleLength) {
  312. lower[length].count=upper[length].count=0;
  313. }
  314. break;
  315. }
  316. }
  317. }
  318. }
  319. #ifdef UCOL_DEBUG
  320. /* print ranges */
  321. for(int32_t length=4; length>=2; --length) {
  322. if(lower[length].count>0) {
  323. printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
  324. }
  325. }
  326. if(middle.count>0) {
  327. printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
  328. }
  329. for(int32_t length=2; length<=4; ++length) {
  330. if(upper[length].count>0) {
  331. printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
  332. }
  333. }
  334. #endif
  335. /* copy the ranges, shortest first, into the result array */
  336. rangeCount=0;
  337. if(middle.count>0) {
  338. uprv_memcpy(ranges, &middle, sizeof(WeightRange));
  339. rangeCount=1;
  340. }
  341. for(int32_t length=middleLength+1; length<=4; ++length) {
  342. /* copy upper first so that later the middle range is more likely the first one to use */
  343. if(upper[length].count>0) {
  344. uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
  345. ++rangeCount;
  346. }
  347. if(lower[length].count>0) {
  348. uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
  349. ++rangeCount;
  350. }
  351. }
  352. return rangeCount>0;
  353. }
  354. UBool
  355. CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
  356. // See if the first few minLength and minLength+1 ranges have enough weights.
  357. for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
  358. if(n <= ranges[i].count) {
  359. // Use the first few minLength and minLength+1 ranges.
  360. if(ranges[i].length > minLength) {
  361. // Reduce the number of weights from the last minLength+1 range
  362. // which might sort before some minLength ranges,
  363. // so that we use all weights in the minLength ranges.
  364. ranges[i].count = n;
  365. }
  366. rangeCount = i + 1;
  367. #ifdef UCOL_DEBUG
  368. printf("take first %ld ranges\n", rangeCount);
  369. #endif
  370. if(rangeCount>1) {
  371. /* sort the ranges by weight values */
  372. UErrorCode errorCode=U_ZERO_ERROR;
  373. uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
  374. compareRanges, nullptr, false, &errorCode);
  375. /* ignore error code: we know that the internal sort function will not fail here */
  376. }
  377. return true;
  378. }
  379. n -= ranges[i].count; // still >0
  380. }
  381. return false;
  382. }
  383. UBool
  384. CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
  385. // See if the minLength ranges have enough weights
  386. // when we split one and lengthen the following ones.
  387. int32_t count = 0;
  388. int32_t minLengthRangeCount;
  389. for(minLengthRangeCount = 0;
  390. minLengthRangeCount < rangeCount &&
  391. ranges[minLengthRangeCount].length == minLength;
  392. ++minLengthRangeCount) {
  393. count += ranges[minLengthRangeCount].count;
  394. }
  395. int32_t nextCountBytes = countBytes(minLength + 1);
  396. if(n > count * nextCountBytes) { return false; }
  397. // Use the minLength ranges. Merge them, and then split again as necessary.
  398. uint32_t start = ranges[0].start;
  399. uint32_t end = ranges[0].end;
  400. for(int32_t i = 1; i < minLengthRangeCount; ++i) {
  401. if(ranges[i].start < start) { start = ranges[i].start; }
  402. if(ranges[i].end > end) { end = ranges[i].end; }
  403. }
  404. // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
  405. // Goal:
  406. // count1 + count2 * nextCountBytes = n
  407. // count1 + count2 = count
  408. // These turn into
  409. // (count - count2) + count2 * nextCountBytes = n
  410. // and then into the following count1 & count2 computations.
  411. int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
  412. int32_t count1 = count - count2; // number of minLength weights
  413. if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
  414. // round up
  415. ++count2;
  416. --count1;
  417. U_ASSERT((count1 + count2 * nextCountBytes) >= n);
  418. }
  419. ranges[0].start = start;
  420. if(count1 == 0) {
  421. // Make one long range.
  422. ranges[0].end = end;
  423. ranges[0].count = count;
  424. lengthenRange(ranges[0]);
  425. rangeCount = 1;
  426. } else {
  427. // Split the range, lengthen the second part.
  428. #ifdef UCOL_DEBUG
  429. printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
  430. splitRange, rangeCount, count1, count2);
  431. #endif
  432. // Next start = start + count1. First end = 1 before that.
  433. ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
  434. ranges[0].count = count1;
  435. ranges[1].start = incWeight(ranges[0].end, minLength);
  436. ranges[1].end = end;
  437. ranges[1].length = minLength; // +1 when lengthened
  438. ranges[1].count = count2; // *countBytes when lengthened
  439. lengthenRange(ranges[1]);
  440. rangeCount = 2;
  441. }
  442. return true;
  443. }
  444. /*
  445. * call getWeightRanges and then determine heuristically
  446. * which ranges to use for a given number of weights between (excluding)
  447. * two limits
  448. */
  449. UBool
  450. CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
  451. #ifdef UCOL_DEBUG
  452. puts("");
  453. #endif
  454. if(!getWeightRanges(lowerLimit, upperLimit)) {
  455. #ifdef UCOL_DEBUG
  456. printf("error: unable to get Weight ranges\n");
  457. #endif
  458. return false;
  459. }
  460. /* try until we find suitably large ranges */
  461. for(;;) {
  462. /* get the smallest number of bytes in a range */
  463. int32_t minLength=ranges[0].length;
  464. if(allocWeightsInShortRanges(n, minLength)) { break; }
  465. if(minLength == 4) {
  466. #ifdef UCOL_DEBUG
  467. printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
  468. minLengthCount, n);
  469. #endif
  470. return false;
  471. }
  472. if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
  473. /* no good match, lengthen all minLength ranges and iterate */
  474. #ifdef UCOL_DEBUG
  475. printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
  476. #endif
  477. for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
  478. lengthenRange(ranges[i]);
  479. }
  480. }
  481. #ifdef UCOL_DEBUG
  482. puts("final ranges:");
  483. for(int32_t i=0; i<rangeCount; ++i) {
  484. printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
  485. i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
  486. }
  487. #endif
  488. rangeIndex = 0;
  489. return true;
  490. }
  491. uint32_t
  492. CollationWeights::nextWeight() {
  493. if(rangeIndex >= rangeCount) {
  494. return 0xffffffff;
  495. } else {
  496. /* get the next weight */
  497. WeightRange &range = ranges[rangeIndex];
  498. uint32_t weight = range.start;
  499. if(--range.count == 0) {
  500. /* this range is finished */
  501. ++rangeIndex;
  502. } else {
  503. /* increment the weight for the next value */
  504. range.start = incWeight(weight, range.length);
  505. U_ASSERT(range.start <= range.end);
  506. }
  507. return weight;
  508. }
  509. }
  510. U_NAMESPACE_END
  511. #endif /* #if !UCONFIG_NO_COLLATION */