bocsu.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2001-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: bocsu.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * Author: Markus W. Scherer
  14. *
  15. * Modification history:
  16. * 05/18/2001 weiv Made into separate module
  17. */
  18. #ifndef BOCSU_H
  19. #define BOCSU_H
  20. #include "unicode/utypes.h"
  21. #if !UCONFIG_NO_COLLATION
  22. U_NAMESPACE_BEGIN
  23. class ByteSink;
  24. U_NAMESPACE_END
  25. /*
  26. * "BOCSU"
  27. * Binary Ordered Compression Scheme for Unicode
  28. *
  29. * Specific application:
  30. *
  31. * Encode a Unicode string for the identical level of a sort key.
  32. * Restrictions:
  33. * - byte stream (unsigned 8-bit bytes)
  34. * - lexical order of the identical-level run must be
  35. * the same as code point order for the string
  36. * - avoid byte values 0, 1, 2
  37. *
  38. * Method: Slope Detection
  39. * Remember the previous code point (initial 0).
  40. * For each cp in the string, encode the difference to the previous one.
  41. *
  42. * With a compact encoding of differences, this yields good results for
  43. * small scripts and UTF-like results otherwise.
  44. *
  45. * Encoding of differences:
  46. * - Similar to a UTF, encoding the length of the byte sequence in the lead bytes.
  47. * - Does not need to be friendly for decoding or random access
  48. * (trail byte values may overlap with lead/single byte values).
  49. * - The signedness must be encoded as the most significant part.
  50. *
  51. * We encode differences with few bytes if their absolute values are small.
  52. * For correct ordering, we must treat the entire value range -10ffff..+10ffff
  53. * in ascending order, which forbids encoding the sign and the absolute value separately.
  54. * Instead, we split the lead byte range in the middle and encode non-negative values
  55. * going up and negative values going down.
  56. *
  57. * For very small absolute values, the difference is added to a middle byte value
  58. * for single-byte encoded differences.
  59. * For somewhat larger absolute values, the difference is divided by the number
  60. * of byte values available, the modulo is used for one trail byte, and the remainder
  61. * is added to a lead byte avoiding the single-byte range.
  62. * For large absolute values, the difference is similarly encoded in three bytes.
  63. *
  64. * This encoding does not use byte values 0, 1, 2, but uses all other byte values
  65. * for lead/single bytes so that the middle range of single bytes is as large
  66. * as possible.
  67. * Note that the lead byte ranges overlap some, but that the sequences as a whole
  68. * are well ordered. I.e., even if the lead byte is the same for sequences of different
  69. * lengths, the trail bytes establish correct order.
  70. * It would be possible to encode slightly larger ranges for each length (>1) by
  71. * subtracting the lower bound of the range. However, that would also slow down the
  72. * calculation.
  73. *
  74. * For the actual string encoding, an optimization moves the previous code point value
  75. * to the middle of its Unicode script block to minimize the differences in
  76. * same-script text runs.
  77. */
  78. /* Do not use byte values 0, 1, 2 because they are separators in sort keys. */
  79. #define SLOPE_MIN 3
  80. #define SLOPE_MAX 0xff
  81. #define SLOPE_MIDDLE 0x81
  82. #define SLOPE_TAIL_COUNT (SLOPE_MAX-SLOPE_MIN+1)
  83. #define SLOPE_MAX_BYTES 4
  84. /*
  85. * Number of lead bytes:
  86. * 1 middle byte for 0
  87. * 2*80=160 single bytes for !=0
  88. * 2*42=84 for double-byte values
  89. * 2*3=6 for 3-byte values
  90. * 2*1=2 for 4-byte values
  91. *
  92. * The sum must be <=SLOPE_TAIL_COUNT.
  93. *
  94. * Why these numbers?
  95. * - There should be >=128 single-byte values to cover 128-blocks
  96. * with small scripts.
  97. * - There should be >=20902 single/double-byte values to cover Unihan.
  98. * - It helps CJK Extension B some if there are 3-byte values that cover
  99. * the distance between them and Unihan.
  100. * This also helps to jump among distant places in the BMP.
  101. * - Four-byte values are necessary to cover the rest of Unicode.
  102. *
  103. * Symmetrical lead byte counts are for convenience.
  104. * With an equal distribution of even and odd differences there is also
  105. * no advantage to asymmetrical lead byte counts.
  106. */
  107. #define SLOPE_SINGLE 80
  108. #define SLOPE_LEAD_2 42
  109. #define SLOPE_LEAD_3 3
  110. #define SLOPE_LEAD_4 1
  111. /* The difference value range for single-byters. */
  112. #define SLOPE_REACH_POS_1 SLOPE_SINGLE
  113. #define SLOPE_REACH_NEG_1 (-SLOPE_SINGLE)
  114. /* The difference value range for double-byters. */
  115. #define SLOPE_REACH_POS_2 (SLOPE_LEAD_2*SLOPE_TAIL_COUNT+(SLOPE_LEAD_2-1))
  116. #define SLOPE_REACH_NEG_2 (-SLOPE_REACH_POS_2-1)
  117. /* The difference value range for 3-byters. */
  118. #define SLOPE_REACH_POS_3 (SLOPE_LEAD_3*SLOPE_TAIL_COUNT*SLOPE_TAIL_COUNT+(SLOPE_LEAD_3-1)*SLOPE_TAIL_COUNT+(SLOPE_TAIL_COUNT-1))
  119. #define SLOPE_REACH_NEG_3 (-SLOPE_REACH_POS_3-1)
  120. /* The lead byte start values. */
  121. #define SLOPE_START_POS_2 (SLOPE_MIDDLE+SLOPE_SINGLE+1)
  122. #define SLOPE_START_POS_3 (SLOPE_START_POS_2+SLOPE_LEAD_2)
  123. #define SLOPE_START_NEG_2 (SLOPE_MIDDLE+SLOPE_REACH_NEG_1)
  124. #define SLOPE_START_NEG_3 (SLOPE_START_NEG_2-SLOPE_LEAD_2)
  125. /*
  126. * Integer division and modulo with negative numerators
  127. * yields negative modulo results and quotients that are one more than
  128. * what we need here.
  129. */
  130. #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
  131. (m)=(n)%(d); \
  132. (n)/=(d); \
  133. if((m)<0) { \
  134. --(n); \
  135. (m)+=(d); \
  136. } \
  137. } UPRV_BLOCK_MACRO_END
  138. U_CFUNC UChar32
  139. u_writeIdenticalLevelRun(UChar32 prev, const UChar *s, int32_t length, icu::ByteSink &sink);
  140. #endif /* #if !UCONFIG_NO_COLLATION */
  141. #endif