- #ifndef CROARING_BITSET_UTIL_H
- #define CROARING_BITSET_UTIL_H
- #include <stdint.h>
- #include <roaring/portability.h>
- #include <roaring/utilasm.h>
- #if CROARING_IS_X64
- #ifndef CROARING_COMPILER_SUPPORTS_AVX512
- #error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
- #endif // CROARING_COMPILER_SUPPORTS_AVX512
- #endif
- #if defined(__GNUC__) && !defined(__clang__)
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wuninitialized"
- #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
- #endif
- #ifdef __cplusplus
- extern "C" {
- namespace roaring {
- namespace internal {
- #endif
- /*
- * Set all bits at indexes [start, end) to true.
- */
- static inline void bitset_set_range(uint64_t *words, uint32_t start,
- uint32_t end) {
- if (start == end) return;
- uint32_t firstword = start / 64;
- uint32_t endword = (end - 1) / 64;
- if (firstword == endword) {
- words[firstword] |= ((~UINT64_C(0)) << (start % 64)) &
- ((~UINT64_C(0)) >> ((~end + 1) % 64));
- return;
- }
- words[firstword] |= (~UINT64_C(0)) << (start % 64);
- for (uint32_t i = firstword + 1; i < endword; i++) {
- words[i] = ~UINT64_C(0);
- }
- words[endword] |= (~UINT64_C(0)) >> ((~end + 1) % 64);
- }
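- /*
- * Illustrative usage sketch (not part of the library): the interval is
- * half-open, so the call below sets bits 10 through 999 inclusive. The
- * caller must ensure "words" covers the whole range.
- *
- * uint64_t words[16] = {0}; // 1024 bits
- * bitset_set_range(words, 10, 1000); // sets bits [10, 1000)
- */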
- /*
- * Compute the cardinality of the bitset in the closed interval
- * [start, start + lenminusone].
- */
- static inline int bitset_lenrange_cardinality(const uint64_t *words,
- uint32_t start,
- uint32_t lenminusone) {
- uint32_t firstword = start / 64;
- uint32_t endword = (start + lenminusone) / 64;
- if (firstword == endword) {
- return roaring_hamming(words[firstword] &
- ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
- << (start % 64));
- }
- int answer =
- roaring_hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64)));
- for (uint32_t i = firstword + 1; i < endword; i++) {
- answer += roaring_hamming(words[i]);
- }
- answer += roaring_hamming(words[endword] &
- (~UINT64_C(0)) >>
- (((~start + 1) - lenminusone - 1) % 64));
- return answer;
- }
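- /*
- * Illustrative sketch (not part of the library): unlike the half-open range
- * functions, the "lenrange" functions take a closed interval encoded as a
- * start index plus a length minus one. Given uint64_t words[16] as above,
- * this counts the set bits among bits 10..999 (990 bits examined):
- *
- * int count = bitset_lenrange_cardinality(words, 10, 989);
- */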
- /*
- * Check whether the cardinality of the bitset in the closed interval
- * [start, start + lenminusone] is 0.
- */
- static inline bool bitset_lenrange_empty(const uint64_t *words, uint32_t start,
- uint32_t lenminusone) {
- uint32_t firstword = start / 64;
- uint32_t endword = (start + lenminusone) / 64;
- if (firstword == endword) {
- return (words[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
- << (start % 64)) == 0;
- }
- if (((words[firstword] & ((~UINT64_C(0)) << (start % 64)))) != 0) {
- return false;
- }
- for (uint32_t i = firstword + 1; i < endword; i++) {
- if (words[i] != 0) {
- return false;
- }
- }
- if ((words[endword] &
- (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) {
- return false;
- }
- return true;
- }
- /*
- * Set all bits at indexes in the closed interval
- * [start, start + lenminusone] to true.
- */
- static inline void bitset_set_lenrange(uint64_t *words, uint32_t start,
- uint32_t lenminusone) {
- uint32_t firstword = start / 64;
- uint32_t endword = (start + lenminusone) / 64;
- if (firstword == endword) {
- words[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
- << (start % 64);
- return;
- }
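- // The unrolled loop below writes two words per iteration and may clobber
- // words[endword]; save its original value so the final update can OR the
- // end mask into the word as it was before the loop.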
- uint64_t temp = words[endword];
- words[firstword] |= (~UINT64_C(0)) << (start % 64);
- for (uint32_t i = firstword + 1; i < endword; i += 2)
- words[i] = words[i + 1] = ~UINT64_C(0);
- words[endword] =
- temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64);
- }
- /*
- * Flip all the bits at indexes [start, end).
- */
- static inline void bitset_flip_range(uint64_t *words, uint32_t start,
- uint32_t end) {
- if (start == end) return;
- uint32_t firstword = start / 64;
- uint32_t endword = (end - 1) / 64;
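- // Flipping the bits below "start" first lets the loop below flip
- // words[firstword] in full: the low bits get flipped twice (unchanged)
- // while bits from "start" onward are flipped exactly once. When firstword
- // equals endword, the loop does not run and the two XOR masks combine to
- // flip exactly [start, end).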
- words[firstword] ^= ~((~UINT64_C(0)) << (start % 64));
- for (uint32_t i = firstword; i < endword; i++) {
- words[i] = ~words[i];
- }
- words[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64));
- }
- /*
- * Set all bits at indexes [start, end) to false.
- */
- static inline void bitset_reset_range(uint64_t *words, uint32_t start,
- uint32_t end) {
- if (start == end) return;
- uint32_t firstword = start / 64;
- uint32_t endword = (end - 1) / 64;
- if (firstword == endword) {
- words[firstword] &= ~(((~UINT64_C(0)) << (start % 64)) &
- ((~UINT64_C(0)) >> ((~end + 1) % 64)));
- return;
- }
- words[firstword] &= ~((~UINT64_C(0)) << (start % 64));
- for (uint32_t i = firstword + 1; i < endword; i++) {
- words[i] = UINT64_C(0);
- }
- words[endword] &= ~((~UINT64_C(0)) >> ((~end + 1) % 64));
- }
- /*
- * Given a bitset containing "length" 64-bit words, write out the positions
- * of all the set bits to "out", with values starting at "base".
- *
- * The "out" buffer must be large enough to store the actual number of bits
- * set.
- *
- * Returns how many values were actually decoded.
- *
- * This function should only be expected to be faster than
- * bitset_extract_setbits when the density of the bitset is high.
- *
- * This function uses AVX2 decoding.
- */
- size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
- uint32_t *out, size_t outcapacity,
- uint32_t base);
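- /* Variant of bitset_extract_setbits_avx2 that uses AVX-512 decoding. */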
- size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
- uint32_t *out, size_t outcapacity,
- uint32_t base);
- /*
- * Given a bitset containing "length" 64-bit words, write out the positions
- * of all the set bits to "out", with values starting at "base".
- *
- * The "out" buffer must be large enough to store the actual number of bits
- * set.
- *
- * Returns how many values were actually decoded.
- */
- size_t bitset_extract_setbits(const uint64_t *words, size_t length,
- uint32_t *out, uint32_t base);
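- /*
- * Illustrative sketch (not part of the library): decode every set bit of a
- * 4-word bitset into 32-bit values offset by "base":
- *
- * uint64_t words[4] = {0x5, 0, 0, 0x1}; // bits 0, 2 and 192 are set
- * uint32_t out[3];
- * size_t n = bitset_extract_setbits(words, 4, out, 100);
- * // n == 3, out == {100, 102, 292}
- */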
- /*
- * Given a bitset containing "length" 64-bit words, write out the positions
- * of all the set bits to "out" as 16-bit integers, with values starting at
- * "base" (which can be set to zero).
- *
- * The "out" buffer must be large enough to store the actual number of bits
- * set.
- *
- * Returns how many values were actually decoded.
- *
- * This function should only be expected to be faster than
- * bitset_extract_setbits_uint16 when the density of the bitset is high.
- *
- * This function uses SSE decoding.
- */
- size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
- uint16_t *out, size_t outcapacity,
- uint16_t base);
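- /* Variant of bitset_extract_setbits_sse_uint16 that uses AVX-512 decoding. */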
- size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words,
- size_t length, uint16_t *out,
- size_t outcapacity, uint16_t base);
- /*
- * Given a bitset containing "length" 64-bit words, write out the positions
- * of all the set bits to "out", with values starting at "base"
- * (which can be set to zero).
- *
- * The "out" buffer must be large enough to store the actual number of bits
- * set.
- *
- * Returns how many values were actually decoded.
- */
- size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
- uint16_t *out, uint16_t base);
- /*
- * Given two bitsets containing "length" 64-bit words each, write out the
- * positions of all the bits set in both to "out", with values starting at
- * "base" (which can be set to zero).
- *
- * The "out" buffer must be large enough to store the actual number of bits
- * set.
- *
- * Returns how many values were actually decoded.
- */
- size_t bitset_extract_intersection_setbits_uint16(
- const uint64_t *__restrict__ words1, const uint64_t *__restrict__ words2,
- size_t length, uint16_t *out, uint16_t base);
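- /*
- * Illustrative sketch (not part of the library): positions set in both
- * bitsets are emitted as 16-bit values:
- *
- * uint64_t a[1] = {0xF}; // bits 0-3 set
- * uint64_t b[1] = {0xC}; // bits 2-3 set
- * uint16_t out[2];
- * size_t n = bitset_extract_intersection_setbits_uint16(a, b, 1, out, 0);
- * // n == 2, out == {2, 3}
- */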
- /*
- * Given a bitset of cardinality "card", set the bit at each of the "length"
- * indexes stored in "list", and return the updated cardinality. The bitset
- * is assumed to already contain valid data (hence the incoming cardinality).
- */
- uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
- const uint16_t *list, uint64_t length);
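- /*
- * Illustrative sketch (not part of the library): starting from a zeroed
- * 1024-word (65536-bit) container with cardinality 0, set two distinct
- * bits; the duplicate index contributes nothing to the count:
- *
- * uint16_t list[3] = {1, 5, 5};
- * uint64_t card = bitset_set_list_withcard(words, 0, list, 3); // card == 2
- */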
- /*
- * Given a bitset, set the bit at each of the "length" indexes stored in
- * "list".
- */
- void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length);
- /*
- * Given a bitset of cardinality "card", clear the bit at each of the
- * "length" indexes stored in "list", and return the updated cardinality.
- * The bitset is assumed to already contain valid data.
- */
- uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
- uint64_t length);
- /*
- * Given a bitset of cardinality "card", toggle the bit at each of the
- * "length" indexes stored in "list", and return the updated cardinality.
- * The bitset is assumed to already contain valid data.
- */
- uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
- const uint16_t *list, uint64_t length);
- void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length);
- #if CROARING_IS_X64
- /***
- * BEGIN Harley-Seal popcount functions.
- */
- CROARING_TARGET_AVX2
- /**
- * Compute the population count of a 256-bit word.
- * This is not especially fast, but it is convenient as part of other
- * functions.
- */
- static inline __m256i popcount256(__m256i v) {
- const __m256i lookuppos = _mm256_setr_epi8(
- /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
- /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
- /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
- /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4,
- /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
- /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
- /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
- /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4);
- const __m256i lookupneg = _mm256_setr_epi8(
- /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
- /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
- /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
- /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4,
- /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
- /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
- /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
- /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4);
- const __m256i low_mask = _mm256_set1_epi8(0x0f);
- const __m256i lo = _mm256_and_si256(v, low_mask);
- const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
- const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo);
- const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);
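- // popcnt1[j] = 4 + popcount(low nibble of byte j) and popcnt2[j] =
- // 4 - popcount(high nibble), so popcnt1 - popcnt2 is the per-byte popcount
- // and is never negative; _mm256_sad_epu8 then sums these absolute
- // differences over each 8-byte lane into a 64-bit counter.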
- return _mm256_sad_epu8(popcnt1, popcnt2);
- }
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- /**
- * Simple carry-save adder (CSA) over 256 bits: *l receives the bitwise sum
- * a ^ b ^ c and *h the carries (the majority of a, b, c). For example, in a
- * position where a, b and c are all set, *h = 1 and *l = 1 (binary 11,
- * i.e., three).
- */
- static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
- __m256i c) {
- const __m256i u = _mm256_xor_si256(a, b);
- *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
- *l = _mm256_xor_si256(u, c);
- }
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- /**
- * Fast Harley-Seal AVX population count function. A carry-save adder
- * network compresses 16 input vectors at a time, so the relatively slow
- * popcount256 only runs once per 16 loads; "ones", "twos", "fours" and
- * "eights" accumulate partial counts with weights 1, 2, 4 and 8.
- */
- inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
- const uint64_t size) {
- __m256i total = _mm256_setzero_si256();
- __m256i ones = _mm256_setzero_si256();
- __m256i twos = _mm256_setzero_si256();
- __m256i fours = _mm256_setzero_si256();
- __m256i eights = _mm256_setzero_si256();
- __m256i sixteens = _mm256_setzero_si256();
- __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
- const uint64_t limit = size - size % 16;
- uint64_t i = 0;
- for (; i < limit; i += 16) {
- CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
- _mm256_lddqu_si256(data + i + 1));
- CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
- _mm256_lddqu_si256(data + i + 3));
- CSA(&foursA, &twos, twos, twosA, twosB);
- CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
- _mm256_lddqu_si256(data + i + 5));
- CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
- _mm256_lddqu_si256(data + i + 7));
- CSA(&foursB, &twos, twos, twosA, twosB);
- CSA(&eightsA, &fours, fours, foursA, foursB);
- CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
- _mm256_lddqu_si256(data + i + 9));
- CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
- _mm256_lddqu_si256(data + i + 11));
- CSA(&foursA, &twos, twos, twosA, twosB);
- CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
- _mm256_lddqu_si256(data + i + 13));
- CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
- _mm256_lddqu_si256(data + i + 15));
- CSA(&foursB, &twos, twos, twosA, twosB);
- CSA(&eightsB, &fours, fours, foursA, foursB);
- CSA(&sixteens, &eights, eights, eightsA, eightsB);
- total = _mm256_add_epi64(total, popcount256(sixteens));
- }
- total = _mm256_slli_epi64(total, 4); // * 16
- total = _mm256_add_epi64(
- total, _mm256_slli_epi64(popcount256(eights), 3)); // += 8 * ...
- total = _mm256_add_epi64(
- total, _mm256_slli_epi64(popcount256(fours), 2)); // += 4 * ...
- total = _mm256_add_epi64(
- total, _mm256_slli_epi64(popcount256(twos), 1)); // += 2 * ...
- total = _mm256_add_epi64(total, popcount256(ones));
- for (; i < size; i++)
- total =
- _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));
- return (uint64_t)(_mm256_extract_epi64(total, 0)) +
- (uint64_t)(_mm256_extract_epi64(total, 1)) +
- (uint64_t)(_mm256_extract_epi64(total, 2)) +
- (uint64_t)(_mm256_extract_epi64(total, 3));
- }
- CROARING_UNTARGET_AVX2
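- /*
- * Illustrative sketch (not part of the library): "size" counts 256-bit
- * vectors, not 64-bit words, so a bitset of n words (n a multiple of 4) is
- * counted as below. Alignment is not required since the loads use
- * _mm256_lddqu_si256:
- *
- * uint64_t cnt = avx2_harley_seal_popcount256((const __m256i *)words, n / 4);
- */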
- #define CROARING_AVXPOPCNTFNC(opname, avx_intrinsic) \
- static inline uint64_t avx2_harley_seal_popcount256_##opname( \
- const __m256i *data1, const __m256i *data2, const uint64_t size) { \
- __m256i total = _mm256_setzero_si256(); \
- __m256i ones = _mm256_setzero_si256(); \
- __m256i twos = _mm256_setzero_si256(); \
- __m256i fours = _mm256_setzero_si256(); \
- __m256i eights = _mm256_setzero_si256(); \
- __m256i sixteens = _mm256_setzero_si256(); \
- __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
- __m256i A1, A2; \
- const uint64_t limit = size - size % 16; \
- uint64_t i = 0; \
- for (; i < limit; i += 16) { \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
- _mm256_lddqu_si256(data2 + i)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
- _mm256_lddqu_si256(data2 + i + 1)); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
- _mm256_lddqu_si256(data2 + i + 2)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
- _mm256_lddqu_si256(data2 + i + 3)); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursA, &twos, twos, twosA, twosB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
- _mm256_lddqu_si256(data2 + i + 4)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
- _mm256_lddqu_si256(data2 + i + 5)); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
- _mm256_lddqu_si256(data2 + i + 6)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
- _mm256_lddqu_si256(data2 + i + 7)); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursB, &twos, twos, twosA, twosB); \
- CSA(&eightsA, &fours, fours, foursA, foursB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
- _mm256_lddqu_si256(data2 + i + 8)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
- _mm256_lddqu_si256(data2 + i + 9)); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
- _mm256_lddqu_si256(data2 + i + 10)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
- _mm256_lddqu_si256(data2 + i + 11)); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursA, &twos, twos, twosA, twosB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
- _mm256_lddqu_si256(data2 + i + 12)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
- _mm256_lddqu_si256(data2 + i + 13)); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
- _mm256_lddqu_si256(data2 + i + 14)); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
- _mm256_lddqu_si256(data2 + i + 15)); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursB, &twos, twos, twosA, twosB); \
- CSA(&eightsB, &fours, fours, foursA, foursB); \
- CSA(&sixteens, &eights, eights, eightsA, eightsB); \
- total = _mm256_add_epi64(total, popcount256(sixteens)); \
- } \
- total = _mm256_slli_epi64(total, 4); \
- total = _mm256_add_epi64(total, \
- _mm256_slli_epi64(popcount256(eights), 3)); \
- total = \
- _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
- total = \
- _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
- total = _mm256_add_epi64(total, popcount256(ones)); \
- for (; i < size; i++) { \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
- _mm256_lddqu_si256(data2 + i)); \
- total = _mm256_add_epi64(total, popcount256(A1)); \
- } \
- return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
- (uint64_t)(_mm256_extract_epi64(total, 1)) + \
- (uint64_t)(_mm256_extract_epi64(total, 2)) + \
- (uint64_t)(_mm256_extract_epi64(total, 3)); \
- } \
- static inline uint64_t avx2_harley_seal_popcount256andstore_##opname( \
- const __m256i *__restrict__ data1, const __m256i *__restrict__ data2, \
- __m256i *__restrict__ out, const uint64_t size) { \
- __m256i total = _mm256_setzero_si256(); \
- __m256i ones = _mm256_setzero_si256(); \
- __m256i twos = _mm256_setzero_si256(); \
- __m256i fours = _mm256_setzero_si256(); \
- __m256i eights = _mm256_setzero_si256(); \
- __m256i sixteens = _mm256_setzero_si256(); \
- __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; \
- __m256i A1, A2; \
- const uint64_t limit = size - size % 16; \
- uint64_t i = 0; \
- for (; i < limit; i += 16) { \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
- _mm256_lddqu_si256(data2 + i)); \
- _mm256_storeu_si256(out + i, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1), \
- _mm256_lddqu_si256(data2 + i + 1)); \
- _mm256_storeu_si256(out + i + 1, A2); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2), \
- _mm256_lddqu_si256(data2 + i + 2)); \
- _mm256_storeu_si256(out + i + 2, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3), \
- _mm256_lddqu_si256(data2 + i + 3)); \
- _mm256_storeu_si256(out + i + 3, A2); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursA, &twos, twos, twosA, twosB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4), \
- _mm256_lddqu_si256(data2 + i + 4)); \
- _mm256_storeu_si256(out + i + 4, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5), \
- _mm256_lddqu_si256(data2 + i + 5)); \
- _mm256_storeu_si256(out + i + 5, A2); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6), \
- _mm256_lddqu_si256(data2 + i + 6)); \
- _mm256_storeu_si256(out + i + 6, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7), \
- _mm256_lddqu_si256(data2 + i + 7)); \
- _mm256_storeu_si256(out + i + 7, A2); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursB, &twos, twos, twosA, twosB); \
- CSA(&eightsA, &fours, fours, foursA, foursB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8), \
- _mm256_lddqu_si256(data2 + i + 8)); \
- _mm256_storeu_si256(out + i + 8, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9), \
- _mm256_lddqu_si256(data2 + i + 9)); \
- _mm256_storeu_si256(out + i + 9, A2); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10), \
- _mm256_lddqu_si256(data2 + i + 10)); \
- _mm256_storeu_si256(out + i + 10, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11), \
- _mm256_lddqu_si256(data2 + i + 11)); \
- _mm256_storeu_si256(out + i + 11, A2); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursA, &twos, twos, twosA, twosB); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12), \
- _mm256_lddqu_si256(data2 + i + 12)); \
- _mm256_storeu_si256(out + i + 12, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13), \
- _mm256_lddqu_si256(data2 + i + 13)); \
- _mm256_storeu_si256(out + i + 13, A2); \
- CSA(&twosA, &ones, ones, A1, A2); \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14), \
- _mm256_lddqu_si256(data2 + i + 14)); \
- _mm256_storeu_si256(out + i + 14, A1); \
- A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15), \
- _mm256_lddqu_si256(data2 + i + 15)); \
- _mm256_storeu_si256(out + i + 15, A2); \
- CSA(&twosB, &ones, ones, A1, A2); \
- CSA(&foursB, &twos, twos, twosA, twosB); \
- CSA(&eightsB, &fours, fours, foursA, foursB); \
- CSA(&sixteens, &eights, eights, eightsA, eightsB); \
- total = _mm256_add_epi64(total, popcount256(sixteens)); \
- } \
- total = _mm256_slli_epi64(total, 4); \
- total = _mm256_add_epi64(total, \
- _mm256_slli_epi64(popcount256(eights), 3)); \
- total = \
- _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
- total = \
- _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1)); \
- total = _mm256_add_epi64(total, popcount256(ones)); \
- for (; i < size; i++) { \
- A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i), \
- _mm256_lddqu_si256(data2 + i)); \
- _mm256_storeu_si256(out + i, A1); \
- total = _mm256_add_epi64(total, popcount256(A1)); \
- } \
- return (uint64_t)(_mm256_extract_epi64(total, 0)) + \
- (uint64_t)(_mm256_extract_epi64(total, 1)) + \
- (uint64_t)(_mm256_extract_epi64(total, 2)) + \
- (uint64_t)(_mm256_extract_epi64(total, 3)); \
- }
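- /*
- * Each instantiation below generates avx2_harley_seal_popcount256_<opname>,
- * returning the population count of the word-wise <avx_intrinsic> of two
- * bitsets, and avx2_harley_seal_popcount256andstore_<opname>, which also
- * writes the combined vectors to "out".
- */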
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(or, _mm256_or_si256)
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(union, _mm256_or_si256)
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(and, _mm256_and_si256)
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(intersection, _mm256_and_si256)
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(xor, _mm256_xor_si256)
- CROARING_UNTARGET_AVX2
- CROARING_TARGET_AVX2
- CROARING_AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
- CROARING_UNTARGET_AVX2
- #define VPOPCNT_AND_ADD(ptr, i, accu) \
- const __m512i v##i = _mm512_loadu_si512((const __m512i *)ptr + i); \
- const __m512i p##i = _mm512_popcnt_epi64(v##i); \
- accu = _mm512_add_epi64(accu, p##i);
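- // Note: VPOPCNT_AND_ADD declares const locals v##i and p##i, so each value
- // of "i" can be used at most once per scope. _mm512_popcnt_epi64 requires
- // the AVX512VPOPCNTDQ extension.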
- #if CROARING_COMPILER_SUPPORTS_AVX512
- CROARING_TARGET_AVX512
- static inline uint64_t sum_epu64_256(const __m256i v) {
- return (uint64_t)(_mm256_extract_epi64(v, 0)) +
- (uint64_t)(_mm256_extract_epi64(v, 1)) +
- (uint64_t)(_mm256_extract_epi64(v, 2)) +
- (uint64_t)(_mm256_extract_epi64(v, 3));
- }
- static inline uint64_t simd_sum_epu64(const __m512i v) {
- __m256i lo = _mm512_extracti64x4_epi64(v, 0);
- __m256i hi = _mm512_extracti64x4_epi64(v, 1);
- return sum_epu64_256(lo) + sum_epu64_256(hi);
- }
- static inline uint64_t avx512_vpopcount(const __m512i *data,
- const uint64_t size) {
- const uint64_t limit = size - size % 4;
- __m512i total = _mm512_setzero_si512();
- uint64_t i = 0;
- for (; i < limit; i += 4) {
- VPOPCNT_AND_ADD(data + i, 0, total);
- VPOPCNT_AND_ADD(data + i, 1, total);
- VPOPCNT_AND_ADD(data + i, 2, total);
- VPOPCNT_AND_ADD(data + i, 3, total);
- }
- for (; i < size; i++) {
- total = _mm512_add_epi64(
- total, _mm512_popcnt_epi64(_mm512_loadu_si512(data + i)));
- }
- return simd_sum_epu64(total);
- }
- CROARING_UNTARGET_AVX512
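- /*
- * Illustrative sketch (not part of the library): as with the AVX2 routine,
- * "size" counts 512-bit vectors, i.e. n / 8 for a bitset of n words:
- *
- * uint64_t cnt = avx512_vpopcount((const __m512i *)words, n / 8);
- */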
- #endif
- #define CROARING_AVXPOPCNTFNC512(opname, avx_intrinsic) \
- static inline uint64_t avx512_harley_seal_popcount512_##opname( \
- const __m512i *data1, const __m512i *data2, const uint64_t size) { \
- __m512i total = _mm512_setzero_si512(); \
- const uint64_t limit = size - size % 4; \
- uint64_t i = 0; \
- for (; i < limit; i += 4) { \
- __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
- _mm512_loadu_si512(data2 + i)); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
- __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
- _mm512_loadu_si512(data2 + i + 1)); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
- __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
- _mm512_loadu_si512(data2 + i + 2)); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
- __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
- _mm512_loadu_si512(data2 + i + 3)); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
- } \
- for (; i < size; i++) { \
- __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
- _mm512_loadu_si512(data2 + i)); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
- } \
- return simd_sum_epu64(total); \
- } \
- static inline uint64_t avx512_harley_seal_popcount512andstore_##opname( \
- const __m512i *__restrict__ data1, const __m512i *__restrict__ data2, \
- __m512i *__restrict__ out, const uint64_t size) { \
- __m512i total = _mm512_setzero_si512(); \
- const uint64_t limit = size - size % 4; \
- uint64_t i = 0; \
- for (; i < limit; i += 4) { \
- __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
- _mm512_loadu_si512(data2 + i)); \
- _mm512_storeu_si512(out + i, a1); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1)); \
- __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1), \
- _mm512_loadu_si512(data2 + i + 1)); \
- _mm512_storeu_si512(out + i + 1, a2); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2)); \
- __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2), \
- _mm512_loadu_si512(data2 + i + 2)); \
- _mm512_storeu_si512(out + i + 2, a3); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3)); \
- __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3), \
- _mm512_loadu_si512(data2 + i + 3)); \
- _mm512_storeu_si512(out + i + 3, a4); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4)); \
- } \
- for (; i < size; i++) { \
- __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i), \
- _mm512_loadu_si512(data2 + i)); \
- _mm512_storeu_si512(out + i, a); \
- total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a)); \
- } \
- return simd_sum_epu64(total); \
- }
- #if CROARING_COMPILER_SUPPORTS_AVX512
- CROARING_TARGET_AVX512
- CROARING_AVXPOPCNTFNC512(or, _mm512_or_si512)
- CROARING_AVXPOPCNTFNC512(union, _mm512_or_si512)
- CROARING_AVXPOPCNTFNC512(and, _mm512_and_si512)
- CROARING_AVXPOPCNTFNC512(intersection, _mm512_and_si512)
- CROARING_AVXPOPCNTFNC512(xor, _mm512_xor_si512)
- CROARING_AVXPOPCNTFNC512(andnot, _mm512_andnot_si512)
- CROARING_UNTARGET_AVX512
- #endif
- /***
- * END Harley-Seal popcount functions.
- */
- #endif // CROARING_IS_X64
- #ifdef __cplusplus
- }
- }
- } // extern "C" { namespace roaring { namespace internal
- #endif
- #if defined(__GNUC__) && !defined(__clang__)
- #pragma GCC diagnostic pop
- #endif
- #endif  // CROARING_BITSET_UTIL_H