/* blake3_impl.h: internal flags, platform-detection macros, small helper
 * functions, and prototypes for the BLAKE3 compression routines. */
  1. #ifndef BLAKE3_IMPL_H
  2. #define BLAKE3_IMPL_H
  3. #include <assert.h>
  4. #include <stdbool.h>
  5. #include <stddef.h>
  6. #include <stdint.h>
  7. #include <string.h>
  8. #include "llvm-c/blake3.h"
  9. // For \p LLVM_LIBRARY_VISIBILITY
  10. #include "llvm/Support/Compiler.h"
  11. #include "llvm_blake3_prefix.h"
  12. // internal flags
  13. enum blake3_flags {
  14. CHUNK_START = 1 << 0,
  15. CHUNK_END = 1 << 1,
  16. PARENT = 1 << 2,
  17. ROOT = 1 << 3,
  18. KEYED_HASH = 1 << 4,
  19. DERIVE_KEY_CONTEXT = 1 << 5,
  20. DERIVE_KEY_MATERIAL = 1 << 6,
  21. };
  22. // This C implementation tries to support recent versions of GCC, Clang, and
  23. // MSVC.
  24. #if defined(_MSC_VER)
  25. #define INLINE static __forceinline
  26. #else
  27. #define INLINE static inline __attribute__((always_inline))
  28. #endif
  29. #if defined(__x86_64__) || defined(_M_X64)
  30. #define IS_X86
  31. #define IS_X86_64
  32. #endif
  33. #if defined(__i386__) || defined(_M_IX86)
  34. #define IS_X86
  35. #define IS_X86_32
  36. #endif
  37. #if defined(__aarch64__) || defined(_M_ARM64)
  38. #define IS_AARCH64
  39. #endif
  40. #if defined(IS_X86)
  41. #if defined(_MSC_VER)
  42. #include <intrin.h>
  43. #endif
  44. #include <immintrin.h>
  45. #endif
  46. #if !defined(BLAKE3_USE_NEON)
  47. // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
  48. #if defined(IS_AARCH64)
  49. #define BLAKE3_USE_NEON 1
  50. #else
  51. #define BLAKE3_USE_NEON 0
  52. #endif
  53. #endif
  54. #if defined(IS_X86)
  55. #define MAX_SIMD_DEGREE 16
  56. #elif BLAKE3_USE_NEON == 1
  57. #define MAX_SIMD_DEGREE 4
  58. #else
  59. #define MAX_SIMD_DEGREE 1
  60. #endif
  61. // There are some places where we want a static size that's equal to the
  62. // MAX_SIMD_DEGREE, but also at least 2.
  63. #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
  64. static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
  65. 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
  66. 0x1F83D9ABUL, 0x5BE0CD19UL};
  67. static const uint8_t MSG_SCHEDULE[7][16] = {
  68. {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
  69. {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
  70. {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
  71. {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
  72. {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
  73. {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
  74. {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
  75. };
  76. /* Find index of the highest set bit */
  77. /* x is assumed to be nonzero. */
  78. static unsigned int highest_one(uint64_t x) {
  79. #if defined(__GNUC__) || defined(__clang__)
  80. return 63 ^ __builtin_clzll(x);
  81. #elif defined(_MSC_VER) && defined(IS_X86_64)
  82. unsigned long index;
  83. _BitScanReverse64(&index, x);
  84. return index;
  85. #elif defined(_MSC_VER) && defined(IS_X86_32)
  86. if(x >> 32) {
  87. unsigned long index;
  88. _BitScanReverse(&index, (unsigned long)(x >> 32));
  89. return 32 + index;
  90. } else {
  91. unsigned long index;
  92. _BitScanReverse(&index, (unsigned long)x);
  93. return index;
  94. }
  95. #else
  96. unsigned int c = 0;
  97. if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  98. if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  99. if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
  100. if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
  101. if(x & 0x000000000000000cULL) { x >>= 2; c += 2; }
  102. if(x & 0x0000000000000002ULL) { c += 1; }
  103. return c;
  104. #endif
  105. }
  106. // Count the number of 1 bits.
  107. INLINE unsigned int popcnt(uint64_t x) {
  108. #if defined(__GNUC__) || defined(__clang__)
  109. return __builtin_popcountll(x);
  110. #else
  111. unsigned int count = 0;
  112. while (x != 0) {
  113. count += 1;
  114. x &= x - 1;
  115. }
  116. return count;
  117. #endif
  118. }
  119. // Largest power of two less than or equal to x. As a special case, returns 1
  120. // when x is 0.
  121. INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
  122. return 1ULL << highest_one(x | 1);
  123. }
  124. INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
  125. INLINE uint32_t counter_high(uint64_t counter) {
  126. return (uint32_t)(counter >> 32);
  127. }
  128. INLINE uint32_t load32(const void *src) {
  129. const uint8_t *p = (const uint8_t *)src;
  130. return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
  131. ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
  132. }
  133. INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
  134. uint32_t key_words[8]) {
  135. key_words[0] = load32(&key[0 * 4]);
  136. key_words[1] = load32(&key[1 * 4]);
  137. key_words[2] = load32(&key[2 * 4]);
  138. key_words[3] = load32(&key[3 * 4]);
  139. key_words[4] = load32(&key[4 * 4]);
  140. key_words[5] = load32(&key[5 * 4]);
  141. key_words[6] = load32(&key[6 * 4]);
  142. key_words[7] = load32(&key[7 * 4]);
  143. }
  144. INLINE void store32(void *dst, uint32_t w) {
  145. uint8_t *p = (uint8_t *)dst;
  146. p[0] = (uint8_t)(w >> 0);
  147. p[1] = (uint8_t)(w >> 8);
  148. p[2] = (uint8_t)(w >> 16);
  149. p[3] = (uint8_t)(w >> 24);
  150. }
  151. INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
  152. store32(&bytes_out[0 * 4], cv_words[0]);
  153. store32(&bytes_out[1 * 4], cv_words[1]);
  154. store32(&bytes_out[2 * 4], cv_words[2]);
  155. store32(&bytes_out[3 * 4], cv_words[3]);
  156. store32(&bytes_out[4 * 4], cv_words[4]);
  157. store32(&bytes_out[5 * 4], cv_words[5]);
  158. store32(&bytes_out[6 * 4], cv_words[6]);
  159. store32(&bytes_out[7 * 4], cv_words[7]);
  160. }
  161. LLVM_LIBRARY_VISIBILITY
  162. void blake3_compress_in_place(uint32_t cv[8],
  163. const uint8_t block[BLAKE3_BLOCK_LEN],
  164. uint8_t block_len, uint64_t counter,
  165. uint8_t flags);
  166. LLVM_LIBRARY_VISIBILITY
  167. void blake3_compress_xof(const uint32_t cv[8],
  168. const uint8_t block[BLAKE3_BLOCK_LEN],
  169. uint8_t block_len, uint64_t counter, uint8_t flags,
  170. uint8_t out[64]);
  171. LLVM_LIBRARY_VISIBILITY
  172. void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
  173. size_t blocks, const uint32_t key[8], uint64_t counter,
  174. bool increment_counter, uint8_t flags,
  175. uint8_t flags_start, uint8_t flags_end, uint8_t *out);
  176. LLVM_LIBRARY_VISIBILITY
  177. size_t blake3_simd_degree(void);
  178. // Declarations for implementation-specific functions.
  179. LLVM_LIBRARY_VISIBILITY
  180. void blake3_compress_in_place_portable(uint32_t cv[8],
  181. const uint8_t block[BLAKE3_BLOCK_LEN],
  182. uint8_t block_len, uint64_t counter,
  183. uint8_t flags);
  184. LLVM_LIBRARY_VISIBILITY
  185. void blake3_compress_xof_portable(const uint32_t cv[8],
  186. const uint8_t block[BLAKE3_BLOCK_LEN],
  187. uint8_t block_len, uint64_t counter,
  188. uint8_t flags, uint8_t out[64]);
  189. LLVM_LIBRARY_VISIBILITY
  190. void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
  191. size_t blocks, const uint32_t key[8],
  192. uint64_t counter, bool increment_counter,
  193. uint8_t flags, uint8_t flags_start,
  194. uint8_t flags_end, uint8_t *out);
  195. #if defined(IS_X86)
  196. #if !defined(BLAKE3_NO_SSE2)
  197. LLVM_LIBRARY_VISIBILITY
  198. void blake3_compress_in_place_sse2(uint32_t cv[8],
  199. const uint8_t block[BLAKE3_BLOCK_LEN],
  200. uint8_t block_len, uint64_t counter,
  201. uint8_t flags);
  202. LLVM_LIBRARY_VISIBILITY
  203. void blake3_compress_xof_sse2(const uint32_t cv[8],
  204. const uint8_t block[BLAKE3_BLOCK_LEN],
  205. uint8_t block_len, uint64_t counter,
  206. uint8_t flags, uint8_t out[64]);
  207. LLVM_LIBRARY_VISIBILITY
  208. void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
  209. size_t blocks, const uint32_t key[8],
  210. uint64_t counter, bool increment_counter,
  211. uint8_t flags, uint8_t flags_start,
  212. uint8_t flags_end, uint8_t *out);
  213. #endif
  214. #if !defined(BLAKE3_NO_SSE41)
  215. LLVM_LIBRARY_VISIBILITY
  216. void blake3_compress_in_place_sse41(uint32_t cv[8],
  217. const uint8_t block[BLAKE3_BLOCK_LEN],
  218. uint8_t block_len, uint64_t counter,
  219. uint8_t flags);
  220. LLVM_LIBRARY_VISIBILITY
  221. void blake3_compress_xof_sse41(const uint32_t cv[8],
  222. const uint8_t block[BLAKE3_BLOCK_LEN],
  223. uint8_t block_len, uint64_t counter,
  224. uint8_t flags, uint8_t out[64]);
  225. LLVM_LIBRARY_VISIBILITY
  226. void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
  227. size_t blocks, const uint32_t key[8],
  228. uint64_t counter, bool increment_counter,
  229. uint8_t flags, uint8_t flags_start,
  230. uint8_t flags_end, uint8_t *out);
  231. #endif
  232. #if !defined(BLAKE3_NO_AVX2)
  233. LLVM_LIBRARY_VISIBILITY
  234. void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
  235. size_t blocks, const uint32_t key[8],
  236. uint64_t counter, bool increment_counter,
  237. uint8_t flags, uint8_t flags_start,
  238. uint8_t flags_end, uint8_t *out);
  239. #endif
  240. #if !defined(BLAKE3_NO_AVX512)
  241. LLVM_LIBRARY_VISIBILITY
  242. void blake3_compress_in_place_avx512(uint32_t cv[8],
  243. const uint8_t block[BLAKE3_BLOCK_LEN],
  244. uint8_t block_len, uint64_t counter,
  245. uint8_t flags);
  246. LLVM_LIBRARY_VISIBILITY
  247. void blake3_compress_xof_avx512(const uint32_t cv[8],
  248. const uint8_t block[BLAKE3_BLOCK_LEN],
  249. uint8_t block_len, uint64_t counter,
  250. uint8_t flags, uint8_t out[64]);
  251. LLVM_LIBRARY_VISIBILITY
  252. void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
  253. size_t blocks, const uint32_t key[8],
  254. uint64_t counter, bool increment_counter,
  255. uint8_t flags, uint8_t flags_start,
  256. uint8_t flags_end, uint8_t *out);
  257. #endif
  258. #endif
  259. #if BLAKE3_USE_NEON == 1
  260. LLVM_LIBRARY_VISIBILITY
  261. void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
  262. size_t blocks, const uint32_t key[8],
  263. uint64_t counter, bool increment_counter,
  264. uint8_t flags, uint8_t flags_start,
  265. uint8_t flags_end, uint8_t *out);
  266. #endif
  267. #endif /* BLAKE3_IMPL_H */