blake3_dispatch.c 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. #include <stdbool.h>
  2. #include <stddef.h>
  3. #include <stdint.h>
  4. #include "blake3_impl.h"
  5. #if defined(IS_X86)
  6. #if defined(_MSC_VER)
  7. #include <intrin.h>
  8. #elif defined(__GNUC__)
  9. #include <immintrin.h>
  10. #else
  11. #error "Unimplemented!"
  12. #endif
  13. #endif
  /* Evaluates x and discards the result. The dispatch functions in this file
   * apply it to their local feature mask so the variable still counts as used
   * when every BLAKE3_NO_* switch compiles its consumers out. */
  #define MAYBE_UNUSED(x) (void)((x))
  15. #if defined(IS_X86)
  /*
   * Executes XGETBV with index 0 and returns the 64-bit result.
   *
   * get_cpu_features() tests bits of the returned value to decide whether the
   * SSE/AVX and AVX-512 register states are enabled before reporting the
   * corresponding features.
   */
  static uint64_t xgetbv(void) {
  #if defined(_MSC_VER)
    return _xgetbv(0);
  #else
    uint32_t eax = 0, edx = 0;
    /* ECX carries the register index (0); the result comes back split across
     * EAX (low half) and EDX (high half). */
    __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
    return ((uint64_t)edx << 32) | eax;
  #endif
  }
  /*
   * Executes CPUID for leaf `id` and stores the resulting EAX, EBX, ECX and EDX
   * values in out[0], out[1], out[2] and out[3] respectively.
   *
   * On the inline-asm paths only EAX is loaded as an input, so ECX is not set
   * to any particular sub-leaf; use cpuidex() for leaves that need one.
   */
  static void cpuid(uint32_t out[4], uint32_t id) {
  #if defined(_MSC_VER)
    __cpuid((int *)out, id);
  #elif defined(__i386__) || defined(_M_IX86)
    /* 32-bit path: EBX is not named as an output operand. Its current value is
     * copied into a scratch register first; after CPUID the xchgl puts CPUID's
     * EBX result into that scratch register (out[1]) and restores the original
     * EBX. NOTE(review): presumably needed because EBX can be reserved on
     * 32-bit targets (e.g. as the PIC base register) - confirm. */
    __asm__ __volatile__("movl %%ebx, %1\n"
                         "cpuid\n"
                         "xchgl %1, %%ebx\n"
                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id));
  #else
    __asm__ __volatile__("cpuid\n"
                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id));
  #endif
  }
  /*
   * Executes CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and stores
   * the resulting EAX, EBX, ECX and EDX values in out[0..3], in that order.
   */
  static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
  #if defined(_MSC_VER)
    __cpuidex((int *)out, id, sid);
  #elif defined(__i386__) || defined(_M_IX86)
    /* 32-bit path: EBX is saved into a scratch register before CPUID and
     * swapped back afterwards, so out[1] receives CPUID's EBX result while the
     * original EBX value is restored. NOTE(review): presumably needed because
     * EBX can be reserved on 32-bit targets (e.g. as the PIC base register) -
     * confirm. */
    __asm__ __volatile__("movl %%ebx, %1\n"
                         "cpuid\n"
                         "xchgl %1, %%ebx\n"
                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id), "c"(sid));
  #else
    __asm__ __volatile__("cpuid\n"
                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id), "c"(sid));
  #endif
  }
  55. #endif
  /*
   * Bit flags naming the x86 SIMD extensions the dispatcher can select between.
   * Each feature occupies its own bit, so a detected feature set is the bitwise
   * OR of the individual flags.
   */
  enum cpu_feature {
    SSE2 = 0x00000001,     /* bit 0 */
    SSSE3 = 0x00000002,    /* bit 1 */
    SSE41 = 0x00000004,    /* bit 2 */
    AVX = 0x00000008,      /* bit 3 */
    AVX2 = 0x00000010,     /* bit 4 */
    AVX512F = 0x00000020,  /* bit 5 */
    AVX512VL = 0x00000040, /* bit 6 */
    /* ... */
    /* Bit 30: sentinel stored in the feature cache before detection has run. */
    UNDEFINED = 0x40000000
  };
  /*
   * Cached CPU feature mask. It starts out as UNDEFINED; on x86 builds
   * get_cpu_features() overwrites it with the detected mask the first time it
   * runs. The variable has internal linkage unless BLAKE3_TESTING is defined.
   */
  #if !defined(BLAKE3_TESTING)
  static /* Allow the variable to be controlled manually for testing */
  #endif
  enum cpu_feature g_cpu_features = UNDEFINED;
  71. LLVM_ATTRIBUTE_USED
  72. #if !defined(BLAKE3_TESTING)
  73. static
  74. #endif
  75. enum cpu_feature
  76. get_cpu_features(void) {
  77. if (g_cpu_features != UNDEFINED) {
  78. return g_cpu_features;
  79. } else {
  80. #if defined(IS_X86)
  81. uint32_t regs[4] = {0};
  82. uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
  83. (void)edx;
  84. enum cpu_feature features = 0;
  85. cpuid(regs, 0);
  86. const int max_id = *eax;
  87. cpuid(regs, 1);
  88. #if defined(__amd64__) || defined(_M_X64)
  89. features |= SSE2;
  90. #else
  91. if (*edx & (1UL << 26))
  92. features |= SSE2;
  93. #endif
  94. if (*ecx & (1UL << 0))
  95. features |= SSSE3;
  96. if (*ecx & (1UL << 19))
  97. features |= SSE41;
  98. if (*ecx & (1UL << 27)) { // OSXSAVE
  99. const uint64_t mask = xgetbv();
  100. if ((mask & 6) == 6) { // SSE and AVX states
  101. if (*ecx & (1UL << 28))
  102. features |= AVX;
  103. if (max_id >= 7) {
  104. cpuidex(regs, 7, 0);
  105. if (*ebx & (1UL << 5))
  106. features |= AVX2;
  107. if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
  108. if (*ebx & (1UL << 31))
  109. features |= AVX512VL;
  110. if (*ebx & (1UL << 16))
  111. features |= AVX512F;
  112. }
  113. }
  114. }
  115. }
  116. g_cpu_features = features;
  117. return features;
  118. #else
  119. /* How to detect NEON? */
  120. return 0;
  121. #endif
  122. }
  123. }
  124. void blake3_compress_in_place(uint32_t cv[8],
  125. const uint8_t block[BLAKE3_BLOCK_LEN],
  126. uint8_t block_len, uint64_t counter,
  127. uint8_t flags) {
  128. #if defined(IS_X86)
  129. const enum cpu_feature features = get_cpu_features();
  130. MAYBE_UNUSED(features);
  131. #if !defined(BLAKE3_NO_AVX512)
  132. if (features & AVX512VL) {
  133. blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
  134. return;
  135. }
  136. #endif
  137. #if !defined(BLAKE3_NO_SSE41)
  138. if (features & SSE41) {
  139. blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
  140. return;
  141. }
  142. #endif
  143. #if !defined(BLAKE3_NO_SSE2)
  144. if (features & SSE2) {
  145. blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
  146. return;
  147. }
  148. #endif
  149. #endif
  150. blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
  151. }
  152. void blake3_compress_xof(const uint32_t cv[8],
  153. const uint8_t block[BLAKE3_BLOCK_LEN],
  154. uint8_t block_len, uint64_t counter, uint8_t flags,
  155. uint8_t out[64]) {
  156. #if defined(IS_X86)
  157. const enum cpu_feature features = get_cpu_features();
  158. MAYBE_UNUSED(features);
  159. #if !defined(BLAKE3_NO_AVX512)
  160. if (features & AVX512VL) {
  161. blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
  162. return;
  163. }
  164. #endif
  165. #if !defined(BLAKE3_NO_SSE41)
  166. if (features & SSE41) {
  167. blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
  168. return;
  169. }
  170. #endif
  171. #if !defined(BLAKE3_NO_SSE2)
  172. if (features & SSE2) {
  173. blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
  174. return;
  175. }
  176. #endif
  177. #endif
  178. blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
  179. }
  /*
   * Runtime dispatcher for hashing several inputs in one call.
   *
   * On x86 builds the arguments are forwarded unchanged to the first backend,
   * in the order AVX-512 (requires both AVX512F and AVX512VL), AVX2, SSE4.1,
   * SSE2, whose feature flags are reported by get_cpu_features() and which has
   * not been compiled out through its BLAKE3_NO_* macro. If none is taken, the
   * NEON backend is used when BLAKE3_USE_NEON == 1, and the portable
   * implementation otherwise.
   */
  void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                        size_t blocks, const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  #if defined(IS_X86)
    const enum cpu_feature cpu_caps = get_cpu_features();
    MAYBE_UNUSED(cpu_caps);

  #if !defined(BLAKE3_NO_AVX512)
    /* Both AVX-512 flags must be present for this backend. */
    const int avx512_needed = AVX512F | AVX512VL;
    if ((cpu_caps & avx512_needed) == avx512_needed) {
      blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start, flags_end,
                              out);
      return;
    }
  #endif /* !BLAKE3_NO_AVX512 */

  #if !defined(BLAKE3_NO_AVX2)
    if ((cpu_caps & AVX2) != 0) {
      blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
      return;
    }
  #endif /* !BLAKE3_NO_AVX2 */

  #if !defined(BLAKE3_NO_SSE41)
    if ((cpu_caps & SSE41) != 0) {
      blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                             increment_counter, flags, flags_start, flags_end,
                             out);
      return;
    }
  #endif /* !BLAKE3_NO_SSE41 */

  #if !defined(BLAKE3_NO_SSE2)
    if ((cpu_caps & SSE2) != 0) {
      blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
      return;
    }
  #endif /* !BLAKE3_NO_SSE2 */
  #endif /* IS_X86 */

  #if BLAKE3_USE_NEON == 1
    blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end, out);
  #else
    blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start, flags_end,
                              out);
  #endif
  }
  // Reports the SIMD degree selected at run time for the current platform.
  //
  // On x86 builds the result follows the same priority order as
  // blake3_hash_many(): 16 when both AVX512F and AVX512VL are available, 8 for
  // AVX2, 4 for SSE4.1 or SSE2 (each subject to its BLAKE3_NO_* macro). If no
  // x86 backend applies, the result is 4 when BLAKE3_USE_NEON == 1 and 1
  // otherwise.
  size_t blake3_simd_degree(void) {
  #if defined(IS_X86)
    const enum cpu_feature cpu_caps = get_cpu_features();
    MAYBE_UNUSED(cpu_caps);

  #if !defined(BLAKE3_NO_AVX512)
    /* Both AVX-512 flags must be present. */
    const int avx512_needed = AVX512F | AVX512VL;
    if ((cpu_caps & avx512_needed) == avx512_needed) {
      return 16;
    }
  #endif /* !BLAKE3_NO_AVX512 */

  #if !defined(BLAKE3_NO_AVX2)
    if ((cpu_caps & AVX2) != 0) {
      return 8;
    }
  #endif /* !BLAKE3_NO_AVX2 */

  #if !defined(BLAKE3_NO_SSE41)
    if ((cpu_caps & SSE41) != 0) {
      return 4;
    }
  #endif /* !BLAKE3_NO_SSE41 */

  #if !defined(BLAKE3_NO_SSE2)
    if ((cpu_caps & SSE2) != 0) {
      return 4;
    }
  #endif /* !BLAKE3_NO_SSE2 */
  #endif /* IS_X86 */

  #if BLAKE3_USE_NEON == 1
    return 4;
  #else
    return 1;
  #endif
  }