123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
- #include <stdbool.h>
- #include <stddef.h>
- #include <stdint.h>
- #include "blake3_impl.h"
- #if defined(IS_X86)
- #if defined(_MSC_VER)
- #include <intrin.h>
- #elif defined(__GNUC__)
- #include <immintrin.h>
- #else
- #error "Unimplemented!"
- #endif
- #endif
/* Evaluates x and discards the result, so that a variable which is only
 * referenced inside conditionally compiled code does not trigger an
 * unused-variable warning. */
#define MAYBE_UNUSED(x) ((void)(x))
- #if defined(IS_X86)
/* Executes XGETBV with ECX = 0 and returns the 64-bit result, assembled from
 * EDX (upper 32 bits) and EAX (lower 32 bits). */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
/* Executes CPUID for leaf `id` and stores EAX, EBX, ECX, EDX into
 * out[0..3], in that order. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a = 0;
  uint32_t b = 0;
  uint32_t c = 0;
  uint32_t d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit targets EBX is not named as an output: its value is moved to a
   * scratch register before CPUID and swapped back afterwards, leaving the
   * CPUID result in the scratch register. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
/* Executes CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and stores
 * EAX, EBX, ECX, EDX into out[0..3], in that order. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a = 0;
  uint32_t b = 0;
  uint32_t c = 0;
  uint32_t d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit targets EBX is not named as an output: its value is moved to a
   * scratch register before CPUID and swapped back afterwards, leaving the
   * CPUID result in the scratch register. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
- #endif
/* Bit flags describing the SIMD instruction sets found at runtime. Each
 * feature occupies its own bit so that several can be OR-ed together into a
 * single value; UNDEFINED is a separate high bit used as a "not detected
 * yet" marker. */
enum cpu_feature {
  SSE2 = 0x00000001,
  SSSE3 = 0x00000002,
  SSE41 = 0x00000004,
  AVX = 0x00000008,
  AVX2 = 0x00000010,
  AVX512F = 0x00000020,
  AVX512VL = 0x00000040,
  /* ... */
  UNDEFINED = 0x40000000
};
- #if !defined(BLAKE3_TESTING)
- static /* Allow the variable to be controlled manually for testing */
- #endif
- enum cpu_feature g_cpu_features = UNDEFINED;
- LLVM_ATTRIBUTE_USED
- #if !defined(BLAKE3_TESTING)
- static
- #endif
- enum cpu_feature
- get_cpu_features(void) {
- if (g_cpu_features != UNDEFINED) {
- return g_cpu_features;
- } else {
- #if defined(IS_X86)
- uint32_t regs[4] = {0};
- uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
- (void)edx;
- enum cpu_feature features = 0;
- cpuid(regs, 0);
- const int max_id = *eax;
- cpuid(regs, 1);
- #if defined(__amd64__) || defined(_M_X64)
- features |= SSE2;
- #else
- if (*edx & (1UL << 26))
- features |= SSE2;
- #endif
- if (*ecx & (1UL << 0))
- features |= SSSE3;
- if (*ecx & (1UL << 19))
- features |= SSE41;
- if (*ecx & (1UL << 27)) { // OSXSAVE
- const uint64_t mask = xgetbv();
- if ((mask & 6) == 6) { // SSE and AVX states
- if (*ecx & (1UL << 28))
- features |= AVX;
- if (max_id >= 7) {
- cpuidex(regs, 7, 0);
- if (*ebx & (1UL << 5))
- features |= AVX2;
- if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
- if (*ebx & (1UL << 31))
- features |= AVX512VL;
- if (*ebx & (1UL << 16))
- features |= AVX512F;
- }
- }
- }
- }
- g_cpu_features = features;
- return features;
- #else
- /* How to detect NEON? */
- return 0;
- #endif
- }
- }
- void blake3_compress_in_place(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags) {
- #if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
- #if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512VL) {
- blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
- return;
- }
- #endif
- #if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
- return;
- }
- #endif
- #if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
- return;
- }
- #endif
- #endif
- blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
- }
- void blake3_compress_xof(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter, uint8_t flags,
- uint8_t out[64]) {
- #if defined(IS_X86)
- const enum cpu_feature features = get_cpu_features();
- MAYBE_UNUSED(features);
- #if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512VL) {
- blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
- return;
- }
- #endif
- #if !defined(BLAKE3_NO_SSE41)
- if (features & SSE41) {
- blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
- return;
- }
- #endif
- #if !defined(BLAKE3_NO_SSE2)
- if (features & SSE2) {
- blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
- return;
- }
- #endif
- #endif
- blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
- }
/* Hashes many inputs by dispatching to one implementation. On x86 the first
 * match wins, in the order AVX-512 (requires both AVX512F and AVX512VL),
 * AVX2, SSE4.1, SSE2; each candidate must be compiled in (not disabled by the
 * matching BLAKE3_NO_* macro) and reported by get_cpu_features(). If no x86
 * path is taken, the NEON implementation is used when BLAKE3_USE_NEON == 1,
 * and the portable implementation otherwise. All arguments are forwarded
 * unchanged. */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_both = AVX512F | AVX512VL;
  if ((cpu & avx512_both) == avx512_both) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when both AVX512F and AVX512VL are available, 8 for AVX2, and 4
// for SSE4.1 or SSE2 (each subject to the matching BLAKE3_NO_* macro not
// being defined). If no x86 path applies, returns 4 when
// BLAKE3_USE_NEON == 1 and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_both = AVX512F | AVX512VL;
  if ((cpu & avx512_both) == avx512_both) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
|