// mkql_bit_utils.h — bit-level helpers for MiniKQL (sparse/dense bitmap ops).
#pragma once

#include <util/system/types.h>
#include <yql/essentials/public/udf/arrow/bit_util.h>

namespace NKikimr {
namespace NMiniKQL {
  6. inline ui64 SaturationSub(ui64 x, ui64 y) {
  7. // simple code produces cmov (same result as commented one)
  8. return (x > y) ? x - y : 0;
  9. //ui64 res = x - y;
  10. //res &= -(res <= x);
  11. //return res;
  12. }
  13. inline ui8 LoadByteUnaligned(const ui8* bitmap, size_t bitmapOffset) {
  14. size_t byteOffset = bitmapOffset >> 3;
  15. ui8 bit = ui8(bitmapOffset & 7u);
  16. ui8 first = bitmap[byteOffset];
  17. // extend to ui32 to avoid left UB in case of left shift of byte by 8 bit
  18. ui32 second = bitmap[byteOffset + (bit != 0)];
  19. return (first >> bit) | ui8(second << (8 - bit));
  20. }
  21. template<typename T>
  22. inline T SelectArg(ui8 isFirst, T first, T second) {
  23. if constexpr (std::is_arithmetic_v<T> && !std::is_floating_point_v<T>) {
  24. // isFirst == 1 -> mask 0xFF..FF, isFirst == 0 -> mask 0x00..00
  25. T mask = -T(isFirst);
  26. return (first & mask) | (second & ~mask);
  27. } else {
  28. return isFirst ? first : second;
  29. }
  30. }
  31. inline ui8 CompressByte(ui8 x, ui8 m) {
  32. // algorithm 7-4 (Compress or Generalized Extract) from second edition of "Hacker's Delight" (single byte version)
  33. // compresses bits from x according to mask m
  34. // X: 01101011
  35. // M: 00110010
  36. // MASKED VALUES: --10--1-
  37. // RESULT: 00000101
  38. // TODO: should be replaced by PEXT instruction from BMI2 instruction set
  39. ui8 mk, mp, mv, t;
  40. x = x & m; // Clear irrelevant bits.
  41. mk = ~m << 1; // We will count 0's to right.
  42. for (ui8 i = 0; i < 3; i++) {
  43. mp = mk ^ (mk << 1); // Parallel suffix.
  44. mp = mp ^ (mp << 2);
  45. mp = mp ^ (mp << 4);
  46. mv = mp & m; // Bits to move.
  47. m = m ^ mv | (mv >> (1 << i)); // Compress m.
  48. t = x & mv;
  49. x = x ^ t | (t >> (1 << i)); // Compress x.
  50. mk = mk & ~mp;
  51. }
  52. return x;
  53. }
  54. inline ui8 PopCountByte(ui8 value) {
  55. static constexpr uint8_t bytePopCounts[] = {
  56. 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3,
  57. 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
  58. 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4,
  59. 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,
  60. 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
  61. 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
  62. 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4,
  63. 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6,
  64. 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
  65. return bytePopCounts[value];
  66. }
  67. inline size_t GetSparseBitmapPopCount(const ui8* src, size_t len) {
  68. // auto-vectorization is performed here
  69. size_t result = 0;
  70. while (len--) {
  71. result += *src++;
  72. }
  73. return result;
  74. }
using NYql::NUdf::CompressSparseBitmap;
using NYql::NUdf::CompressSparseBitmapNegate;
  77. inline void NegateSparseBitmap(ui8* dst, const ui8* src, size_t len) {
  78. while (len--) {
  79. *dst++ = *src++ ^ ui8(1);
  80. }
  81. }
  82. inline void XorSparseBitmapScalar(ui8* dst, ui8 value, const ui8* src, size_t len) {
  83. while (len--) {
  84. *dst++ = *src++ ^ value;
  85. }
  86. }
  87. inline void XorSparseBitmaps(ui8* dst, const ui8* src1, const ui8* src2, size_t len) {
  88. while (len--) {
  89. *dst++ = *src1++ ^ *src2++;
  90. }
  91. }
  92. inline void AndSparseBitmaps(ui8* dst, const ui8* src1, const ui8* src2, size_t len) {
  93. while (len--) {
  94. *dst++ = *src1++ & *src2++;
  95. }
  96. }
  97. inline void OrSparseBitmaps(ui8* dst, const ui8* src1, const ui8* src2, size_t len) {
  98. while (len--) {
  99. *dst++ = *src1++ | *src2++;
  100. }
  101. }
// Packs the bits of `src` that are selected by `bitmap` (both addressed at
// bit granularity, starting at srcOffset/bitmapOffset) into the dense bitmap
// `dst`, starting at bit `dstOffset`. Scans `count` input bits, 8 per
// iteration. Bits of `dst` below `dstOffset` in the first touched byte are
// preserved (read-modify-write). Returns the bit offset in `dst` just past
// the last bit written.
inline size_t CompressBitmap(const ui8* src, size_t srcOffset,
    const ui8* bitmap, size_t bitmapOffset, ui8* dst, size_t dstOffset, size_t count)
{
    // TODO: 1) aligned version (srcOffset % 8 == 0, dstOffset % 8 == 0)
    //       2) 64 bit processing (instead of 8)
    ui8* target = dst + (dstOffset >> 3);
    // `state` accumulates the output byte being built; `stateBits` is how
    // many of its low bits are already valid.
    ui8 state = *target;
    ui8 stateBits = dstOffset & 7u;
    // Keep only the previously written low bits of the first output byte.
    state &= (ui32(1) << stateBits) - 1u;
    while (count) {
        ui8 srcByte = LoadByteUnaligned(src, srcOffset);
        ui8 bitmapByte = LoadByteUnaligned(bitmap, bitmapOffset);
        // zero all bits outside of input range (handles the final partial byte)
        bitmapByte &= ui8(0xff) >> ui8(SaturationSub(8u, count));
        ui8 compressed = CompressByte(srcByte, bitmapByte);
        ui8 compressedBits = PopCountByte(bitmapByte);
        // Append the freshly compressed bits above the accumulated ones.
        state |= (compressed << stateBits);
        *target = state;
        stateBits += compressedBits;
        // Advance one byte iff the accumulator crossed the 8-bit boundary.
        target += (stateBits >> 3);
        // When stateBits >= 8, the top (stateBits - 8) bits of `compressed`
        // did not fit into the byte just stored; they spill into the next one.
        // When stateBits < 8 this value is discarded by the mask select below.
        ui8 overflowState = ui32(compressed) >> (compressedBits - SaturationSub(stateBits, 8));
        // mask is 0xFF while we stayed inside the byte, 0x00 after advancing.
        ui8 mask = (stateBits >> 3) - 1;
        state = (state & mask) | (overflowState & ~mask);
        stateBits &= 7;
        dstOffset += compressedBits;
        srcOffset += 8;
        bitmapOffset += 8;
        count = SaturationSub(count, 8u);
    }
    // Flush the (possibly partial) final byte.
    *target = state;
    return dstOffset;
}
using NYql::NUdf::CompressAsSparseBitmap;
using NYql::NUdf::CompressArray;
using NYql::NUdf::DecompressToSparseBitmap;

} // namespace NMiniKQL
} // namespace NKikimr