123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420 |
- #include <stdint.h>
- #include <string.h>
- #include "kyber512r3_params.h"
- #include "kyber512r3_consts_avx2.h"
- #include "kyber512r3_rejsample_avx2.h"
- #if defined(S2N_KYBER512R3_AVX2_BMI2)
- #include <immintrin.h>
- //#define BMI
- #ifndef BMI
/*
 * Compaction lookup table for the non-BMI path.
 *
 * Row i is indexed by an 8-bit rejection mask (bit k set = 16-bit lane k
 * accepted) and lists the byte offsets 2*k of the accepted lanes, lowest
 * lane first, padded with -1. The -1 initializers wrap to 0xFF in uint8_t
 * (well-defined conversion); with the high bit set, pshufb zeroes that
 * output byte, so padding slots produce don't-care lanes that the caller
 * overwrites on the next store (ctr only advances by popcount of the mask).
 */
static const uint8_t idx[256][8] = {
    {-1, -1, -1, -1, -1, -1, -1, -1},
    { 0, -1, -1, -1, -1, -1, -1, -1},
    { 2, -1, -1, -1, -1, -1, -1, -1},
    { 0, 2, -1, -1, -1, -1, -1, -1},
    { 4, -1, -1, -1, -1, -1, -1, -1},
    { 0, 4, -1, -1, -1, -1, -1, -1},
    { 2, 4, -1, -1, -1, -1, -1, -1},
    { 0, 2, 4, -1, -1, -1, -1, -1},
    { 6, -1, -1, -1, -1, -1, -1, -1},
    { 0, 6, -1, -1, -1, -1, -1, -1},
    { 2, 6, -1, -1, -1, -1, -1, -1},
    { 0, 2, 6, -1, -1, -1, -1, -1},
    { 4, 6, -1, -1, -1, -1, -1, -1},
    { 0, 4, 6, -1, -1, -1, -1, -1},
    { 2, 4, 6, -1, -1, -1, -1, -1},
    { 0, 2, 4, 6, -1, -1, -1, -1},
    { 8, -1, -1, -1, -1, -1, -1, -1},
    { 0, 8, -1, -1, -1, -1, -1, -1},
    { 2, 8, -1, -1, -1, -1, -1, -1},
    { 0, 2, 8, -1, -1, -1, -1, -1},
    { 4, 8, -1, -1, -1, -1, -1, -1},
    { 0, 4, 8, -1, -1, -1, -1, -1},
    { 2, 4, 8, -1, -1, -1, -1, -1},
    { 0, 2, 4, 8, -1, -1, -1, -1},
    { 6, 8, -1, -1, -1, -1, -1, -1},
    { 0, 6, 8, -1, -1, -1, -1, -1},
    { 2, 6, 8, -1, -1, -1, -1, -1},
    { 0, 2, 6, 8, -1, -1, -1, -1},
    { 4, 6, 8, -1, -1, -1, -1, -1},
    { 0, 4, 6, 8, -1, -1, -1, -1},
    { 2, 4, 6, 8, -1, -1, -1, -1},
    { 0, 2, 4, 6, 8, -1, -1, -1},
    {10, -1, -1, -1, -1, -1, -1, -1},
    { 0, 10, -1, -1, -1, -1, -1, -1},
    { 2, 10, -1, -1, -1, -1, -1, -1},
    { 0, 2, 10, -1, -1, -1, -1, -1},
    { 4, 10, -1, -1, -1, -1, -1, -1},
    { 0, 4, 10, -1, -1, -1, -1, -1},
    { 2, 4, 10, -1, -1, -1, -1, -1},
    { 0, 2, 4, 10, -1, -1, -1, -1},
    { 6, 10, -1, -1, -1, -1, -1, -1},
    { 0, 6, 10, -1, -1, -1, -1, -1},
    { 2, 6, 10, -1, -1, -1, -1, -1},
    { 0, 2, 6, 10, -1, -1, -1, -1},
    { 4, 6, 10, -1, -1, -1, -1, -1},
    { 0, 4, 6, 10, -1, -1, -1, -1},
    { 2, 4, 6, 10, -1, -1, -1, -1},
    { 0, 2, 4, 6, 10, -1, -1, -1},
    { 8, 10, -1, -1, -1, -1, -1, -1},
    { 0, 8, 10, -1, -1, -1, -1, -1},
    { 2, 8, 10, -1, -1, -1, -1, -1},
    { 0, 2, 8, 10, -1, -1, -1, -1},
    { 4, 8, 10, -1, -1, -1, -1, -1},
    { 0, 4, 8, 10, -1, -1, -1, -1},
    { 2, 4, 8, 10, -1, -1, -1, -1},
    { 0, 2, 4, 8, 10, -1, -1, -1},
    { 6, 8, 10, -1, -1, -1, -1, -1},
    { 0, 6, 8, 10, -1, -1, -1, -1},
    { 2, 6, 8, 10, -1, -1, -1, -1},
    { 0, 2, 6, 8, 10, -1, -1, -1},
    { 4, 6, 8, 10, -1, -1, -1, -1},
    { 0, 4, 6, 8, 10, -1, -1, -1},
    { 2, 4, 6, 8, 10, -1, -1, -1},
    { 0, 2, 4, 6, 8, 10, -1, -1},
    {12, -1, -1, -1, -1, -1, -1, -1},
    { 0, 12, -1, -1, -1, -1, -1, -1},
    { 2, 12, -1, -1, -1, -1, -1, -1},
    { 0, 2, 12, -1, -1, -1, -1, -1},
    { 4, 12, -1, -1, -1, -1, -1, -1},
    { 0, 4, 12, -1, -1, -1, -1, -1},
    { 2, 4, 12, -1, -1, -1, -1, -1},
    { 0, 2, 4, 12, -1, -1, -1, -1},
    { 6, 12, -1, -1, -1, -1, -1, -1},
    { 0, 6, 12, -1, -1, -1, -1, -1},
    { 2, 6, 12, -1, -1, -1, -1, -1},
    { 0, 2, 6, 12, -1, -1, -1, -1},
    { 4, 6, 12, -1, -1, -1, -1, -1},
    { 0, 4, 6, 12, -1, -1, -1, -1},
    { 2, 4, 6, 12, -1, -1, -1, -1},
    { 0, 2, 4, 6, 12, -1, -1, -1},
    { 8, 12, -1, -1, -1, -1, -1, -1},
    { 0, 8, 12, -1, -1, -1, -1, -1},
    { 2, 8, 12, -1, -1, -1, -1, -1},
    { 0, 2, 8, 12, -1, -1, -1, -1},
    { 4, 8, 12, -1, -1, -1, -1, -1},
    { 0, 4, 8, 12, -1, -1, -1, -1},
    { 2, 4, 8, 12, -1, -1, -1, -1},
    { 0, 2, 4, 8, 12, -1, -1, -1},
    { 6, 8, 12, -1, -1, -1, -1, -1},
    { 0, 6, 8, 12, -1, -1, -1, -1},
    { 2, 6, 8, 12, -1, -1, -1, -1},
    { 0, 2, 6, 8, 12, -1, -1, -1},
    { 4, 6, 8, 12, -1, -1, -1, -1},
    { 0, 4, 6, 8, 12, -1, -1, -1},
    { 2, 4, 6, 8, 12, -1, -1, -1},
    { 0, 2, 4, 6, 8, 12, -1, -1},
    {10, 12, -1, -1, -1, -1, -1, -1},
    { 0, 10, 12, -1, -1, -1, -1, -1},
    { 2, 10, 12, -1, -1, -1, -1, -1},
    { 0, 2, 10, 12, -1, -1, -1, -1},
    { 4, 10, 12, -1, -1, -1, -1, -1},
    { 0, 4, 10, 12, -1, -1, -1, -1},
    { 2, 4, 10, 12, -1, -1, -1, -1},
    { 0, 2, 4, 10, 12, -1, -1, -1},
    { 6, 10, 12, -1, -1, -1, -1, -1},
    { 0, 6, 10, 12, -1, -1, -1, -1},
    { 2, 6, 10, 12, -1, -1, -1, -1},
    { 0, 2, 6, 10, 12, -1, -1, -1},
    { 4, 6, 10, 12, -1, -1, -1, -1},
    { 0, 4, 6, 10, 12, -1, -1, -1},
    { 2, 4, 6, 10, 12, -1, -1, -1},
    { 0, 2, 4, 6, 10, 12, -1, -1},
    { 8, 10, 12, -1, -1, -1, -1, -1},
    { 0, 8, 10, 12, -1, -1, -1, -1},
    { 2, 8, 10, 12, -1, -1, -1, -1},
    { 0, 2, 8, 10, 12, -1, -1, -1},
    { 4, 8, 10, 12, -1, -1, -1, -1},
    { 0, 4, 8, 10, 12, -1, -1, -1},
    { 2, 4, 8, 10, 12, -1, -1, -1},
    { 0, 2, 4, 8, 10, 12, -1, -1},
    { 6, 8, 10, 12, -1, -1, -1, -1},
    { 0, 6, 8, 10, 12, -1, -1, -1},
    { 2, 6, 8, 10, 12, -1, -1, -1},
    { 0, 2, 6, 8, 10, 12, -1, -1},
    { 4, 6, 8, 10, 12, -1, -1, -1},
    { 0, 4, 6, 8, 10, 12, -1, -1},
    { 2, 4, 6, 8, 10, 12, -1, -1},
    { 0, 2, 4, 6, 8, 10, 12, -1},
    {14, -1, -1, -1, -1, -1, -1, -1},
    { 0, 14, -1, -1, -1, -1, -1, -1},
    { 2, 14, -1, -1, -1, -1, -1, -1},
    { 0, 2, 14, -1, -1, -1, -1, -1},
    { 4, 14, -1, -1, -1, -1, -1, -1},
    { 0, 4, 14, -1, -1, -1, -1, -1},
    { 2, 4, 14, -1, -1, -1, -1, -1},
    { 0, 2, 4, 14, -1, -1, -1, -1},
    { 6, 14, -1, -1, -1, -1, -1, -1},
    { 0, 6, 14, -1, -1, -1, -1, -1},
    { 2, 6, 14, -1, -1, -1, -1, -1},
    { 0, 2, 6, 14, -1, -1, -1, -1},
    { 4, 6, 14, -1, -1, -1, -1, -1},
    { 0, 4, 6, 14, -1, -1, -1, -1},
    { 2, 4, 6, 14, -1, -1, -1, -1},
    { 0, 2, 4, 6, 14, -1, -1, -1},
    { 8, 14, -1, -1, -1, -1, -1, -1},
    { 0, 8, 14, -1, -1, -1, -1, -1},
    { 2, 8, 14, -1, -1, -1, -1, -1},
    { 0, 2, 8, 14, -1, -1, -1, -1},
    { 4, 8, 14, -1, -1, -1, -1, -1},
    { 0, 4, 8, 14, -1, -1, -1, -1},
    { 2, 4, 8, 14, -1, -1, -1, -1},
    { 0, 2, 4, 8, 14, -1, -1, -1},
    { 6, 8, 14, -1, -1, -1, -1, -1},
    { 0, 6, 8, 14, -1, -1, -1, -1},
    { 2, 6, 8, 14, -1, -1, -1, -1},
    { 0, 2, 6, 8, 14, -1, -1, -1},
    { 4, 6, 8, 14, -1, -1, -1, -1},
    { 0, 4, 6, 8, 14, -1, -1, -1},
    { 2, 4, 6, 8, 14, -1, -1, -1},
    { 0, 2, 4, 6, 8, 14, -1, -1},
    {10, 14, -1, -1, -1, -1, -1, -1},
    { 0, 10, 14, -1, -1, -1, -1, -1},
    { 2, 10, 14, -1, -1, -1, -1, -1},
    { 0, 2, 10, 14, -1, -1, -1, -1},
    { 4, 10, 14, -1, -1, -1, -1, -1},
    { 0, 4, 10, 14, -1, -1, -1, -1},
    { 2, 4, 10, 14, -1, -1, -1, -1},
    { 0, 2, 4, 10, 14, -1, -1, -1},
    { 6, 10, 14, -1, -1, -1, -1, -1},
    { 0, 6, 10, 14, -1, -1, -1, -1},
    { 2, 6, 10, 14, -1, -1, -1, -1},
    { 0, 2, 6, 10, 14, -1, -1, -1},
    { 4, 6, 10, 14, -1, -1, -1, -1},
    { 0, 4, 6, 10, 14, -1, -1, -1},
    { 2, 4, 6, 10, 14, -1, -1, -1},
    { 0, 2, 4, 6, 10, 14, -1, -1},
    { 8, 10, 14, -1, -1, -1, -1, -1},
    { 0, 8, 10, 14, -1, -1, -1, -1},
    { 2, 8, 10, 14, -1, -1, -1, -1},
    { 0, 2, 8, 10, 14, -1, -1, -1},
    { 4, 8, 10, 14, -1, -1, -1, -1},
    { 0, 4, 8, 10, 14, -1, -1, -1},
    { 2, 4, 8, 10, 14, -1, -1, -1},
    { 0, 2, 4, 8, 10, 14, -1, -1},
    { 6, 8, 10, 14, -1, -1, -1, -1},
    { 0, 6, 8, 10, 14, -1, -1, -1},
    { 2, 6, 8, 10, 14, -1, -1, -1},
    { 0, 2, 6, 8, 10, 14, -1, -1},
    { 4, 6, 8, 10, 14, -1, -1, -1},
    { 0, 4, 6, 8, 10, 14, -1, -1},
    { 2, 4, 6, 8, 10, 14, -1, -1},
    { 0, 2, 4, 6, 8, 10, 14, -1},
    {12, 14, -1, -1, -1, -1, -1, -1},
    { 0, 12, 14, -1, -1, -1, -1, -1},
    { 2, 12, 14, -1, -1, -1, -1, -1},
    { 0, 2, 12, 14, -1, -1, -1, -1},
    { 4, 12, 14, -1, -1, -1, -1, -1},
    { 0, 4, 12, 14, -1, -1, -1, -1},
    { 2, 4, 12, 14, -1, -1, -1, -1},
    { 0, 2, 4, 12, 14, -1, -1, -1},
    { 6, 12, 14, -1, -1, -1, -1, -1},
    { 0, 6, 12, 14, -1, -1, -1, -1},
    { 2, 6, 12, 14, -1, -1, -1, -1},
    { 0, 2, 6, 12, 14, -1, -1, -1},
    { 4, 6, 12, 14, -1, -1, -1, -1},
    { 0, 4, 6, 12, 14, -1, -1, -1},
    { 2, 4, 6, 12, 14, -1, -1, -1},
    { 0, 2, 4, 6, 12, 14, -1, -1},
    { 8, 12, 14, -1, -1, -1, -1, -1},
    { 0, 8, 12, 14, -1, -1, -1, -1},
    { 2, 8, 12, 14, -1, -1, -1, -1},
    { 0, 2, 8, 12, 14, -1, -1, -1},
    { 4, 8, 12, 14, -1, -1, -1, -1},
    { 0, 4, 8, 12, 14, -1, -1, -1},
    { 2, 4, 8, 12, 14, -1, -1, -1},
    { 0, 2, 4, 8, 12, 14, -1, -1},
    { 6, 8, 12, 14, -1, -1, -1, -1},
    { 0, 6, 8, 12, 14, -1, -1, -1},
    { 2, 6, 8, 12, 14, -1, -1, -1},
    { 0, 2, 6, 8, 12, 14, -1, -1},
    { 4, 6, 8, 12, 14, -1, -1, -1},
    { 0, 4, 6, 8, 12, 14, -1, -1},
    { 2, 4, 6, 8, 12, 14, -1, -1},
    { 0, 2, 4, 6, 8, 12, 14, -1},
    {10, 12, 14, -1, -1, -1, -1, -1},
    { 0, 10, 12, 14, -1, -1, -1, -1},
    { 2, 10, 12, 14, -1, -1, -1, -1},
    { 0, 2, 10, 12, 14, -1, -1, -1},
    { 4, 10, 12, 14, -1, -1, -1, -1},
    { 0, 4, 10, 12, 14, -1, -1, -1},
    { 2, 4, 10, 12, 14, -1, -1, -1},
    { 0, 2, 4, 10, 12, 14, -1, -1},
    { 6, 10, 12, 14, -1, -1, -1, -1},
    { 0, 6, 10, 12, 14, -1, -1, -1},
    { 2, 6, 10, 12, 14, -1, -1, -1},
    { 0, 2, 6, 10, 12, 14, -1, -1},
    { 4, 6, 10, 12, 14, -1, -1, -1},
    { 0, 4, 6, 10, 12, 14, -1, -1},
    { 2, 4, 6, 10, 12, 14, -1, -1},
    { 0, 2, 4, 6, 10, 12, 14, -1},
    { 8, 10, 12, 14, -1, -1, -1, -1},
    { 0, 8, 10, 12, 14, -1, -1, -1},
    { 2, 8, 10, 12, 14, -1, -1, -1},
    { 0, 2, 8, 10, 12, 14, -1, -1},
    { 4, 8, 10, 12, 14, -1, -1, -1},
    { 0, 4, 8, 10, 12, 14, -1, -1},
    { 2, 4, 8, 10, 12, 14, -1, -1},
    { 0, 2, 4, 8, 10, 12, 14, -1},
    { 6, 8, 10, 12, 14, -1, -1, -1},
    { 0, 6, 8, 10, 12, 14, -1, -1},
    { 2, 6, 8, 10, 12, 14, -1, -1},
    { 0, 2, 6, 8, 10, 12, 14, -1},
    { 4, 6, 8, 10, 12, 14, -1, -1},
    { 0, 4, 6, 8, 10, 12, 14, -1},
    { 2, 4, 6, 8, 10, 12, 14, -1},
    { 0, 2, 4, 6, 8, 10, 12, 14}
};
- #endif
/*
 * Lane-wise unsigned 16-bit "a >= b": max_epu16(a,b) == a exactly when
 * a >= b (the plain cmpgt intrinsics compare as signed).
 * NOTE(review): neither macro is referenced in this chunk; presumably
 * kept for parity with the upstream AVX2 reference code — confirm before
 * removing.
 */
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
/*
 * Rejection sampling on a buffer of random bytes: interprets buf as a
 * stream of packed little-endian 12-bit values and copies each value that
 * is < S2N_KYBER_512_R3_Q into r, stopping when S2N_KYBER_512_R3_N
 * coefficients have been produced or the buffer is exhausted.
 *
 * r   - output array; receives up to S2N_KYBER_512_R3_N int16_t values,
 *       each in [0, S2N_KYBER_512_R3_Q)
 * buf - S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN bytes of randomness
 *
 * Returns the number of coefficients written (may be less than
 * S2N_KYBER_512_R3_N if too many candidates were rejected; the caller
 * must handle topping up).
 */
unsigned int rej_uniform_avx2(int16_t * restrict r, const uint8_t *buf)
{
    unsigned int ctr, pos;
    uint16_t val0, val1;
    uint32_t good;
#ifdef BMI
    uint64_t idx0, idx1, idx2, idx3;
#endif
    /* rejection bound broadcast into all 16-bit lanes (the _16XQ entry of
       the shared constants table — presumably q in every lane; defined in
       kyber512r3_consts_avx2.h) */
    const __m256i bound = _mm256_load_si256(&qdata.vec[_16XQ/16]);
    const __m256i ones = _mm256_set1_epi8(1);
    /* keep only the low 12 bits of each 16-bit lane */
    const __m256i mask = _mm256_set1_epi16(0xFFF);
    /* per-128-bit-lane byte shuffle: spreads each 3-byte group (two packed
       12-bit values) across two overlapping 16-bit lanes */
    const __m256i idx8 = _mm256_set_epi8(15,14,14,13,12,11,11,10,
                                         9, 8, 8, 7, 6, 5, 5, 4,
                                         11,10,10, 9, 8, 7, 7, 6,
                                         5, 4, 4, 3, 2, 1, 1, 0);
    __m256i f0, f1, g0, g1, g2, g3;
    __m128i f, t, pilo, pihi;

    ctr = pos = 0;
    /* Fast path: 48 input bytes -> 32 candidate values per iteration. */
    while(ctr <= S2N_KYBER_512_R3_N - 32 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 48) {
        /* unaligned loads; void* casts avoid cast-align/cast-qual warnings
           (previously (__m256i *)&buf[pos]) */
        f0 = _mm256_loadu_si256((const void *)&buf[pos]);
        f1 = _mm256_loadu_si256((const void *)&buf[pos+24]);
        /* 0x94 selects qwords {0,1,1,2}: low 128-bit lane gets bytes 0..11,
           high lane bytes 12..23 of the 24 useful input bytes */
        f0 = _mm256_permute4x64_epi64(f0, 0x94);
        f1 = _mm256_permute4x64_epi64(f1, 0x94);
        f0 = _mm256_shuffle_epi8(f0, idx8);
        f1 = _mm256_shuffle_epi8(f1, idx8);
        /* even lanes keep bits 0..11; odd lanes take bits 12..23 via the
           4-bit shift; mask trims every lane to 12 bits */
        g0 = _mm256_srli_epi16(f0, 4);
        g1 = _mm256_srli_epi16(f1, 4);
        f0 = _mm256_blend_epi16(f0, g0, 0xAA);
        f1 = _mm256_blend_epi16(f1, g1, 0xAA);
        f0 = _mm256_and_si256(f0, mask);
        f1 = _mm256_and_si256(f1, mask);
        pos += 48;

        /* acceptance test: one byte of mask per candidate, 32 bits total */
        g0 = _mm256_cmpgt_epi16(bound, f0);
        g1 = _mm256_cmpgt_epi16(bound, f1);
        g0 = _mm256_packs_epi16(g0, g1);
        good = _mm256_movemask_epi8(g0);
#ifdef BMI
        /* build pshufb controls arithmetically: pdep spreads each mask bit
           to a byte, (x<<8)-x turns 0x01 bytes into 0xFF runs, and pext
           extracts the byte offsets 0,2,...,14 of the accepted lanes */
        idx0 = _pdep_u64(good >> 0, 0x0101010101010101);
        idx1 = _pdep_u64(good >> 8, 0x0101010101010101);
        idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
        idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
        idx0 = (idx0 << 8) - idx0;
        idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
        idx1 = (idx1 << 8) - idx1;
        idx1 = _pext_u64(0x0E0C0A0806040200, idx1);
        idx2 = (idx2 << 8) - idx2;
        idx2 = _pext_u64(0x0E0C0A0806040200, idx2);
        idx3 = (idx3 << 8) - idx3;
        idx3 = _pext_u64(0x0E0C0A0806040200, idx3);
        g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
        g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
        g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
        g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
#else
        /* table lookup of the same pshufb controls, one 8-bit mask at a
           time; void* casts avoid cast-align/cast-qual warnings */
        g0 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 0) & 0xFF]));
        g1 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 8) & 0xFF]));
        g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((const void *)&idx[(good >> 16) & 0xFF]), 1);
        g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((const void *)&idx[(good >> 24) & 0xFF]), 1);
#endif
        /* expand byte offsets to 16-bit lane controls (lo byte, lo byte+1)
           and compact accepted coefficients to the front of each lane */
        g2 = _mm256_add_epi8(g0, ones);
        g3 = _mm256_add_epi8(g1, ones);
        g0 = _mm256_unpacklo_epi8(g0, g2);
        g1 = _mm256_unpacklo_epi8(g1, g3);
        f0 = _mm256_shuffle_epi8(f0, g0);
        f1 = _mm256_shuffle_epi8(f1, g1);

        /* packs_epi16 interleaved the 128-bit lanes, so the mask bytes are
           ordered f0.lo, f1.lo, f0.hi, f1.hi — hence the 0/16/8/24 bit
           offsets below. Each store writes 8 lanes; lanes beyond the
           popcount are don't-care and get overwritten by the next store. */
        _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f0));
        ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
        _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f0, 1));
        ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
        _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f1));
        ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
        _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f1, 1));
        ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
    }

    /* Medium path: 12 input bytes -> 8 candidate values per iteration. */
    while(ctr <= S2N_KYBER_512_R3_N - 8 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 12) {
        /* same unpack scheme as above, on a single 128-bit lane */
        f = _mm_loadu_si128((const void *)&buf[pos]);
        f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
        t = _mm_srli_epi16(f, 4);
        f = _mm_blend_epi16(f, t, 0xAA);
        f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
        pos += 12;
        t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
        good = _mm_movemask_epi8(t);
#ifdef BMI
        good &= 0x5555;  /* keep one mask bit per 16-bit lane */
        idx0 = _pdep_u64(good, 0x1111111111111111);
        idx0 = (idx0 << 8) - idx0;
        idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
        pilo = _mm_cvtsi64_si128(idx0);
#else
        /* movemask yields two identical bits per 16-bit lane; compress the
           even bits into an 8-bit table index (BMI2 is available: whole
           file is under the S2N_KYBER512R3_AVX2_BMI2 guard) */
        good = _pext_u32(good, 0x5555);
        pilo = _mm_loadl_epi64((const void *)&idx[good]);
#endif
        pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
        pilo = _mm_unpacklo_epi8(pilo, pihi);
        f = _mm_shuffle_epi8(f, pilo);
        _mm_storeu_si128((void *)&r[ctr], f);
        ctr += _mm_popcnt_u32(good);
    }

    /* Scalar tail: every 3 bytes hold two little-endian 12-bit values. */
    while(ctr < S2N_KYBER_512_R3_N && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 3) {
        val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
        val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));  /* already <= 0xFFF, no mask needed */
        pos += 3;
        if(val0 < S2N_KYBER_512_R3_Q)
            r[ctr++] = val0;
        if(val1 < S2N_KYBER_512_R3_Q && ctr < S2N_KYBER_512_R3_N)
            r[ctr++] = val1;
    }
    return ctr;
}
- #endif
|