bitset_util.h

#ifndef CROARING_BITSET_UTIL_H
#define CROARING_BITSET_UTIL_H

#include <stdint.h>

#include <roaring/portability.h>
#include <roaring/utilasm.h>

#if CROARING_IS_X64
#ifndef CROARING_COMPILER_SUPPORTS_AVX512
#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
#endif  // CROARING_COMPILER_SUPPORTS_AVX512
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

#ifdef __cplusplus
extern "C" {
namespace roaring {
namespace internal {
#endif

/*
 * Set all bits in indexes [begin,end) to true.
 */
static inline void bitset_set_range(uint64_t *words, uint32_t start,
                                    uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    if (firstword == endword) {
        words[firstword] |= ((~UINT64_C(0)) << (start % 64)) &
                            ((~UINT64_C(0)) >> ((~end + 1) % 64));
        return;
    }
    words[firstword] |= (~UINT64_C(0)) << (start % 64);
    for (uint32_t i = firstword + 1; i < endword; i++) {
        words[i] = ~UINT64_C(0);
    }
    words[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64);
}
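
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * setting the half-open range [3, 130) in a 4-word bitset touches three
 * words.
 *
 *     uint64_t words[4] = {0, 0, 0, 0};
 *     bitset_set_range(words, 3, 130);
 *     // words[0] == UINT64_C(0xFFFFFFFFFFFFFFF8)  (bits 3..63)
 *     // words[1] == UINT64_C(0xFFFFFFFFFFFFFFFF)  (bits 64..127)
 *     // words[2] == UINT64_C(0x0000000000000003)  (bits 128..129)
 *     // words[3] == 0                             (untouched)
 */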
/*
 * Find the cardinality of the bitset in [begin,begin+lenminusone]
 */
static inline int bitset_lenrange_cardinality(const uint64_t *words,
                                              uint32_t start,
                                              uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        return roaring_hamming(words[firstword] &
                               ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                                   << (start % 64));
    }
    int answer =
        roaring_hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64)));
    for (uint32_t i = firstword + 1; i < endword; i++) {
        answer += roaring_hamming(words[i]);
    }
    answer += roaring_hamming(words[endword] &
                              (~UINT64_C(0)) >>
                                  (((~start + 1) - lenminusone - 1) % 64));
    return answer;
}
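
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * the range here is closed, [start, start + lenminusone].
 *
 *     uint64_t words[2] = {0, 0};
 *     bitset_set_range(words, 10, 80);                    // set bits 10..79
 *     int c = bitset_lenrange_cardinality(words, 0, 63);  // count bits 0..63
 *     // c == 54  (bits 10..63 are set)
 */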
/*
 * Check whether the cardinality of the bitset in [begin,begin+lenminusone]
 * is 0.
 */
static inline bool bitset_lenrange_empty(const uint64_t *words, uint32_t start,
                                         uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        return (words[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                                       << (start % 64)) == 0;
    }
    if (((words[firstword] & ((~UINT64_C(0)) << (start % 64)))) != 0) {
        return false;
    }
    for (uint32_t i = firstword + 1; i < endword; i++) {
        if (words[i] != 0) {
            return false;
        }
    }
    if ((words[endword] &
         (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) {
        return false;
    }
    return true;
}
/*
 * Set all bits in indexes [begin,begin+lenminusone] to true.
 */
static inline void bitset_set_lenrange(uint64_t *words, uint32_t start,
                                       uint32_t lenminusone) {
    uint32_t firstword = start / 64;
    uint32_t endword = (start + lenminusone) / 64;
    if (firstword == endword) {
        words[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                            << (start % 64);
        return;
    }
    uint64_t temp = words[endword];
    words[firstword] |= (~UINT64_C(0)) << (start % 64);
    for (uint32_t i = firstword + 1; i < endword; i += 2)
        words[i] = words[i + 1] = ~UINT64_C(0);
    words[endword] =
        temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64);
}
/*
 * Flip all the bits in indexes [begin,end).
 */
static inline void bitset_flip_range(uint64_t *words, uint32_t start,
                                     uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    words[firstword] ^= ~((~UINT64_C(0)) << (start % 64));
    for (uint32_t i = firstword; i < endword; i++) {
        words[i] = ~words[i];
    }
    words[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64));
}
/*
 * Set all bits in indexes [begin,end) to false.
 */
static inline void bitset_reset_range(uint64_t *words, uint32_t start,
                                      uint32_t end) {
    if (start == end) return;
    uint32_t firstword = start / 64;
    uint32_t endword = (end - 1) / 64;
    if (firstword == endword) {
        words[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) &
                              ((~UINT64_C(0)) >> ((~end + 1) % 64)));
        return;
    }
    words[firstword] &= ~((~UINT64_C(0)) << (start % 64));
    for (uint32_t i = firstword + 1; i < endword; i++) {
        words[i] = UINT64_C(0);
    }
    words[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64));
}
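
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * bitset_reset_range and bitset_flip_range use the same half-open
 * [start, end) convention as bitset_set_range.
 *
 *     uint64_t words[2] = {0, 0};
 *     bitset_set_range(words, 0, 128);     // set bits 0..127
 *     bitset_reset_range(words, 64, 128);  // clear bits 64..127: words[1] == 0
 *     bitset_flip_range(words, 0, 2);      // flip bits 0..1:
 *                                          // words[0] == 0xFFFFFFFFFFFFFFFC
 */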
/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out", values start at "base".
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits when the density of the bitset is high.
 *
 * This function uses AVX2 decoding.
 */
size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
                                   uint32_t *out, size_t outcapacity,
                                   uint32_t base);

size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
                                     uint32_t *out, size_t outcapacity,
                                     uint32_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out", values start at "base".
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits(const uint64_t *words, size_t length,
                              uint32_t *out, uint32_t base);
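
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * decoding the set bit positions of a small bitset into 32-bit values,
 * offset by "base".
 *
 *     uint64_t words[2] = {UINT64_C(0x5), 0};  // bits 0 and 2 are set
 *     uint32_t out[2];
 *     size_t n = bitset_extract_setbits(words, 2, out, 1000);
 *     // n == 2, out[0] == 1000, out[1] == 1002
 */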
/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out" as 16-bit integers, values start at "base"
 * (can be set to zero).
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 *
 * This function should only be expected to be faster than
 * bitset_extract_setbits_uint16 when the density of the bitset is high.
 *
 * This function uses SSE decoding.
 */
size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
                                         uint16_t *out, size_t outcapacity,
                                         uint16_t base);

size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words,
                                            size_t length, uint16_t *out,
                                            size_t outcapacity, uint16_t base);

/*
 * Given a bitset containing "length" 64-bit words, write out the position
 * of all the set bits to "out", values start at "base"
 * (can be set to zero).
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
                                     uint16_t *out, uint16_t base);
/*
 * Given two bitsets containing "length" 64-bit words, write out the position
 * of all the common set bits to "out", values start at "base"
 * (can be set to zero).
 *
 * The "out" pointer should be sufficient to store the actual number of bits
 * set.
 *
 * Returns how many values were actually decoded.
 */
size_t bitset_extract_intersection_setbits_uint16(
    const uint64_t *__restrict__ words1, const uint64_t *__restrict__ words2,
    size_t length, uint16_t *out, uint16_t base);
/*
 * Given a bitset having cardinality card, set all bit values in the list
 * (there are length of them) and return the updated cardinality. This
 * evidently assumes that the bitset already contained data.
 */
uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
                                  const uint16_t *list, uint64_t length);
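
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * adding a few 16-bit values to a bitset container while keeping its
 * cardinality up to date.
 *
 *     // `words` is a bitset of 1024 uint64_t words (65536 bits) currently
 *     // holding `card` values
 *     uint16_t vals[3] = {7, 42, 300};
 *     card = bitset_set_list_withcard(words, card, vals, 3);
 *     // card grew by the number of values that were not already present
 */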
/*
 * Given a bitset, set all bit values in the list (there are length of them).
 */
void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length);

/*
 * Given a bitset having cardinality card, unset all bit values in the list
 * (there are length of them) and return the updated cardinality. This
 * evidently assumes that the bitset already contained data.
 */
uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
                           uint64_t length);

/*
 * Given a bitset having cardinality card, toggle all bit values in the list
 * (there are length of them) and return the updated cardinality. This
 * evidently assumes that the bitset already contained data.
 */
uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
                                   const uint16_t *list, uint64_t length);

void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length);
#if CROARING_IS_X64
/***
 * BEGIN Harley-Seal popcount functions.
 */
CROARING_TARGET_AVX2
/**
 * Compute the population count of a 256-bit word.
 * This is not especially fast, but it is convenient as part of other
 * functions.
 */
static inline __m256i popcount256(__m256i v) {
    const __m256i lookuppos = _mm256_setr_epi8(
        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4,
        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4);
    const __m256i lookupneg = _mm256_setr_epi8(
        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4,
        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);
    const __m256i lo = _mm256_and_si256(v, low_mask);
    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
    const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo);
    const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);
    return _mm256_sad_epu8(popcnt1, popcnt2);
}
CROARING_UNTARGET_AVX2
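
/*
 * Note (added commentary, not part of the original header): per byte,
 * popcnt1 holds 4 + popcount(low nibble) and popcnt2 holds
 * 4 - popcount(high nibble), so |popcnt1 - popcnt2| is that byte's popcount.
 * _mm256_sad_epu8 then sums these absolute differences into one 64-bit
 * counter per 8-byte lane, giving four partial counts per 256-bit word.
 */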
CROARING_TARGET_AVX2
/**
 * Simple CSA over 256 bits
 */
static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
                       __m256i c) {
    const __m256i u = _mm256_xor_si256(a, b);
    *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
    *l = _mm256_xor_si256(u, c);
}
CROARING_UNTARGET_AVX2
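
/*
 * Note (added commentary, not part of the original header): CSA is a bitwise
 * carry-save adder. For each of the 256 bit positions it adds the three input
 * bits a, b, c and splits the two-bit result into a carry bit (*h) and a sum
 * bit (*l), i.e. a + b + c == 2*(*h) + (*l) at every position. Chaining CSAs
 * lets the Harley-Seal routine below fold 16 input words together while
 * calling the comparatively slow popcount256 only once per iteration.
 */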
CROARING_TARGET_AVX2
/**
 * Fast Harley-Seal AVX population count function
 */
inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
                                                    const uint64_t size) {
    __m256i total = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
    const uint64_t limit = size - size % 16;
    uint64_t i = 0;

    for (; i < limit; i += 16) {
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
            _mm256_lddqu_si256(data + i + 1));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
            _mm256_lddqu_si256(data + i + 3));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
            _mm256_lddqu_si256(data + i + 5));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
            _mm256_lddqu_si256(data + i + 7));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsA, &fours, fours, foursA, foursB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
            _mm256_lddqu_si256(data + i + 9));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
            _mm256_lddqu_si256(data + i + 11));
        CSA(&foursA, &twos, twos, twosA, twosB);
        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
            _mm256_lddqu_si256(data + i + 13));
        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
            _mm256_lddqu_si256(data + i + 15));
        CSA(&foursB, &twos, twos, twosA, twosB);
        CSA(&eightsB, &fours, fours, foursA, foursB);
        CSA(&sixteens, &eights, eights, eightsA, eightsB);
        total = _mm256_add_epi64(total, popcount256(sixteens));
    }

    total = _mm256_slli_epi64(total, 4);  // * 16
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(eights), 3));  // += 8 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(fours), 2));  // += 4 * ...
    total = _mm256_add_epi64(
        total, _mm256_slli_epi64(popcount256(twos), 1));  // += 2 * ...
    total = _mm256_add_epi64(total, popcount256(ones));
    for (; i < size; i++)
        total =
            _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));

    return (uint64_t)(_mm256_extract_epi64(total, 0)) +
           (uint64_t)(_mm256_extract_epi64(total, 1)) +
           (uint64_t)(_mm256_extract_epi64(total, 2)) +
           (uint64_t)(_mm256_extract_epi64(total, 3));
}
CROARING_UNTARGET_AVX2
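
/*
 * Illustrative sketch (added commentary, not part of the original header):
 * "size" is counted in 256-bit words, so a bitset of 1024 uint64_t words is
 * 256 __m256i values.
 *
 *     // assumes `words` points to 1024 uint64_t
 *     uint64_t card =
 *         avx2_harley_seal_popcount256((const __m256i *)words, 256);
 */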
#define CROARING_AVXPOPCNTFNC(opname, avx_intrinsic) \
    static inline uint64_t avx2_harley_seal_popcount256_##opname( \
        const __m256i *data1, const __m256i *data2, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    } \
    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \
        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \
        __m256i *__restrict__ out, const uint64_t size) { \
        __m256i total = _mm256_setzero_si256(); \
        __m256i ones = _mm256_setzero_si256(); \
        __m256i twos = _mm256_setzero_si256(); \
        __m256i fours = _mm256_setzero_si256(); \
        __m256i eights = _mm256_setzero_si256(); \
        __m256i sixteens = _mm256_setzero_si256(); \
        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
        __m256i A1, A2; \
        const uint64_t limit = size - size % 16; \
        uint64_t i = 0; \
        for (; i < limit; i += 16) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
                               _mm256_lddqu_si256(data2 + i + 1)); \
            _mm256_storeu_si256(out + i + 1, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
                               _mm256_lddqu_si256(data2 + i + 2)); \
            _mm256_storeu_si256(out + i + 2, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
                               _mm256_lddqu_si256(data2 + i + 3)); \
            _mm256_storeu_si256(out + i + 3, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
                               _mm256_lddqu_si256(data2 + i + 4)); \
            _mm256_storeu_si256(out + i + 4, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
                               _mm256_lddqu_si256(data2 + i + 5)); \
            _mm256_storeu_si256(out + i + 5, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
                               _mm256_lddqu_si256(data2 + i + 6)); \
            _mm256_storeu_si256(out + i + 6, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
                               _mm256_lddqu_si256(data2 + i + 7)); \
            _mm256_storeu_si256(out + i + 7, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsA, &fours, fours, foursA, foursB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
                               _mm256_lddqu_si256(data2 + i + 8)); \
            _mm256_storeu_si256(out + i + 8, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
                               _mm256_lddqu_si256(data2 + i + 9)); \
            _mm256_storeu_si256(out + i + 9, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
                               _mm256_lddqu_si256(data2 + i + 10)); \
            _mm256_storeu_si256(out + i + 10, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
                               _mm256_lddqu_si256(data2 + i + 11)); \
            _mm256_storeu_si256(out + i + 11, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursA, &twos, twos, twosA, twosB); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
                               _mm256_lddqu_si256(data2 + i + 12)); \
            _mm256_storeu_si256(out + i + 12, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
                               _mm256_lddqu_si256(data2 + i + 13)); \
            _mm256_storeu_si256(out + i + 13, A2); \
            CSA(&twosA, &ones, ones, A1, A2); \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
                               _mm256_lddqu_si256(data2 + i + 14)); \
            _mm256_storeu_si256(out + i + 14, A1); \
            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
                               _mm256_lddqu_si256(data2 + i + 15)); \
            _mm256_storeu_si256(out + i + 15, A2); \
            CSA(&twosB, &ones, ones, A1, A2); \
            CSA(&foursB, &twos, twos, twosA, twosB); \
            CSA(&eightsB, &fours, fours, foursA, foursB); \
            CSA(&sixteens, &eights, eights, eightsA, eightsB); \
            total = _mm256_add_epi64(total, popcount256(sixteens)); \
        } \
        total = _mm256_slli_epi64(total, 4); \
        total = _mm256_add_epi64(total, \
                                 _mm256_slli_epi64(popcount256(eights), 3)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
        total = \
            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
        total = _mm256_add_epi64(total, popcount256(ones)); \
        for (; i < size; i++) { \
            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
                               _mm256_lddqu_si256(data2 + i)); \
            _mm256_storeu_si256(out + i, A1); \
            total = _mm256_add_epi64(total, popcount256(A1)); \
        } \
        return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
               (uint64_t)(_mm256_extract_epi64(total, 1)) + \
               (uint64_t)(_mm256_extract_epi64(total, 2)) + \
               (uint64_t)(_mm256_extract_epi64(total, 3)); \
    }
CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(or, _mm256_or_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(union, _mm256_or_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(and, _mm256_and_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(intersection, _mm256_and_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(xor, _mm256_xor_si256)
CROARING_UNTARGET_AVX2

CROARING_TARGET_AVX2
CROARING_AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
CROARING_UNTARGET_AVX2
#define VPOPCNT_AND_ADD(ptr, i, accu) \
    const __m512i v##i = _mm512_loadu_si512((const __m512i *)ptr + i); \
    const __m512i p##i = _mm512_popcnt_epi64(v##i); \
    accu = _mm512_add_epi64(accu, p##i);
#if CROARING_COMPILER_SUPPORTS_AVX512
CROARING_TARGET_AVX512
static inline uint64_t sum_epu64_256(const __m256i v) {
    return (uint64_t)(_mm256_extract_epi64(v, 0)) +
           (uint64_t)(_mm256_extract_epi64(v, 1)) +
           (uint64_t)(_mm256_extract_epi64(v, 2)) +
           (uint64_t)(_mm256_extract_epi64(v, 3));
}

static inline uint64_t simd_sum_epu64(const __m512i v) {
    __m256i lo = _mm512_extracti64x4_epi64(v, 0);
    __m256i hi = _mm512_extracti64x4_epi64(v, 1);
    return sum_epu64_256(lo) + sum_epu64_256(hi);
}

static inline uint64_t avx512_vpopcount(const __m512i *data,
                                        const uint64_t size) {
    const uint64_t limit = size - size % 4;
    __m512i total = _mm512_setzero_si512();
    uint64_t i = 0;
    for (; i < limit; i += 4) {
        VPOPCNT_AND_ADD(data + i, 0, total);
        VPOPCNT_AND_ADD(data + i, 1, total);
        VPOPCNT_AND_ADD(data + i, 2, total);
        VPOPCNT_AND_ADD(data + i, 3, total);
    }
    for (; i < size; i++) {
        total = _mm512_add_epi64(
            total, _mm512_popcnt_epi64(_mm512_loadu_si512(data + i)));
    }
    return simd_sum_epu64(total);
}
CROARING_UNTARGET_AVX512
#endif
#define CROARING_AVXPOPCNTFNC512(opname, avx_intrinsic) \
    static inline uint64_t avx512_harley_seal_popcount512_##opname( \
        const __m512i *data1, const __m512i *data2, const uint64_t size) { \
        __m512i total = _mm512_setzero_si512(); \
        const uint64_t limit = size - size % 4; \
        uint64_t i = 0; \
        for (; i < limit; i += 4) { \
            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                       _mm512_loadu_si512(data2 + i)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
                                       _mm512_loadu_si512(data2 + i + 1)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
                                       _mm512_loadu_si512(data2 + i + 2)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
                                       _mm512_loadu_si512(data2 + i + 3)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
        } \
        for (; i < size; i++) { \
            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                      _mm512_loadu_si512(data2 + i)); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
        } \
        return simd_sum_epu64(total); \
    } \
    static inline uint64_t avx512_harley_seal_popcount512andstore_##opname( \
        const __m512i *__restrict__ data1, const __m512i *__restrict__ data2, \
        __m512i *__restrict__ out, const uint64_t size) { \
        __m512i total = _mm512_setzero_si512(); \
        const uint64_t limit = size - size % 4; \
        uint64_t i = 0; \
        for (; i < limit; i += 4) { \
            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                       _mm512_loadu_si512(data2 + i)); \
            _mm512_storeu_si512(out + i, a1); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
                                       _mm512_loadu_si512(data2 + i + 1)); \
            _mm512_storeu_si512(out + i + 1, a2); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
                                       _mm512_loadu_si512(data2 + i + 2)); \
            _mm512_storeu_si512(out + i + 2, a3); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
                                       _mm512_loadu_si512(data2 + i + 3)); \
            _mm512_storeu_si512(out + i + 3, a4); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
        } \
        for (; i < size; i++) { \
            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
                                      _mm512_loadu_si512(data2 + i)); \
            _mm512_storeu_si512(out + i, a); \
            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
        } \
        return simd_sum_epu64(total); \
    }
#if CROARING_COMPILER_SUPPORTS_AVX512
CROARING_TARGET_AVX512
CROARING_AVXPOPCNTFNC512(or, _mm512_or_si512)
CROARING_AVXPOPCNTFNC512(union, _mm512_or_si512)
CROARING_AVXPOPCNTFNC512(and, _mm512_and_si512)
CROARING_AVXPOPCNTFNC512(intersection, _mm512_and_si512)
CROARING_AVXPOPCNTFNC512(xor, _mm512_xor_si512)
CROARING_AVXPOPCNTFNC512(andnot, _mm512_andnot_si512)
CROARING_UNTARGET_AVX512
#endif
/***
 * END Harley-Seal popcount functions.
 */
#endif  // CROARING_IS_X64

#ifdef __cplusplus
}
}
}  // extern "C" { namespace roaring { namespace internal
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

#endif