/* KeccakP-1600-times4-SIMD256_avx2.c */
/*
Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
denoted as "the implementer".
For more information, feedback or questions, please refer to our websites:
http://keccak.noekeon.org/
http://keyak.noekeon.org/
http://ketje.noekeon.org/
To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// extra headers are removed: smmintrin.h, wmmintrin.h and emmintrin.h
#if defined(S2N_KYBER512R3_AVX2_BMI2)
#include <immintrin.h>
#include "KeccakP-align_avx2.h"
#include "KeccakP-1600-times4-SnP_avx2.h"
#include "KeccakP-SIMD256-config_avx2.h"
#include "KeccakP-brg_endian_avx2.h"
#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
#error Expecting a little-endian platform
#endif
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
typedef __m128i V128;
typedef __m256i V256;
/* The four 1600-bit states are interleaved lane by lane: 64-bit lane
   `lanePosition` of instance `instanceIndex` lives at word index
   lanePosition*4 + instanceIndex.  One V256 therefore holds the same lane
   of all 4 instances. */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
#if defined(KeccakP1600times4_useAVX2)
#define ANDnu256(a, b) _mm256_andnot_si256(a, b)
// correcting cast-align error
// old version: #define CONST256(a) _mm256_load_si256((const V256 *)&(a))
#define CONST256(a) _mm256_load_si256((const void *)&(a))
#define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
#define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
// correcting cast-align error
// old version: #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
#define LOAD256u(a) _mm256_loadu_si256((const void *)&(a))
#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
/* Rotate each 64-bit lane left by o bits (0 < o < 64). */
#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
/* Rotations by 8 and 56 are whole-byte moves, done with a byte shuffle
   instead of two shifts and an OR. */
#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
/* Byte-permutation masks for the two shuffle-based rotations above. */
static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
#define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
// correcting cast-align error
// old version: #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
#define STORE256u(a, b) _mm256_storeu_si256((void *)&(a), b)
#define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
#define XOR256(a, b) _mm256_xor_si256(a, b)
#define XOReq256(a, b) a = _mm256_xor_si256(a, b)
#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
#define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
#define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
/* 4x4 transpose of 64-bit elements in lanes0..lanes3, converting from the
   interleaved (one lane of all 4 instances per V256) layout back to the
   per-instance layout; INTLEAVE() below is its inverse. */
#define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
    lanesH01 = UNPACKH( lanes0, lanes1 ), \
    lanesL23 = UNPACKL( lanes2, lanes3 ), \
    lanesH23 = UNPACKH( lanes2, lanes3 ), \
    lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
    lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
    lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
    lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
/* Inverse transpose: per-instance vectors -> interleaved lane vectors. */
#define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
    lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
    lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
    lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
    lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
    lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
    lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
    lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
#endif
/* A Keccak-p[1600] lane is 64 bits. */
#define SnP_laneLengthInBytes 8
/* Set all four interleaved Keccak-p[1600] state instances to all-zero. */
void KeccakP1600times4_InitializeAll(void *states)
{
    memset(states, 0, KeccakP1600times4_statesSizeInBytes);
}
  80. void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
  81. {
  82. unsigned int sizeLeft = length;
  83. unsigned int lanePosition = offset/SnP_laneLengthInBytes;
  84. unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
  85. const unsigned char *curData = data;
  86. UINT64 *statesAsLanes = (UINT64 *)states;
  87. if ((sizeLeft > 0) && (offsetInLane != 0)) {
  88. unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
  89. UINT64 lane = 0;
  90. if (bytesInLane > sizeLeft)
  91. bytesInLane = sizeLeft;
  92. memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
  93. statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
  94. sizeLeft -= bytesInLane;
  95. lanePosition++;
  96. curData += bytesInLane;
  97. }
  98. while(sizeLeft >= SnP_laneLengthInBytes) {
  99. // correcting cast-align error
  100. // old version: UINT64 lane = *((const UINT64*)curData);
  101. UINT64 lane = *((const UINT64*)(const void *)curData);
  102. statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
  103. sizeLeft -= SnP_laneLengthInBytes;
  104. lanePosition++;
  105. curData += SnP_laneLengthInBytes;
  106. }
  107. if (sizeLeft > 0) {
  108. UINT64 lane = 0;
  109. memcpy(&lane, curData, sizeLeft);
  110. statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
  111. }
  112. }
/* XOR `laneCount` whole lanes into all four state instances at once.
   `data` carries four input streams spaced `laneOffset` lanes apart. */
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    // correcting cast-align errors
    // old version: const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData0 = (const void *)data;
    // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
    // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
    // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    /* XOR one lane of all 4 instances, gathered element by element. */
    #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
    /* XOR 4 consecutive lanes via unaligned vector loads + in-register transpose. */
    #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
        lanes1 = LOAD256u( curData1[argIndex]),\
        lanes2 = LOAD256u( curData2[argIndex]),\
        lanes3 = LOAD256u( curData3[argIndex]),\
        INTLEAVE(),\
        XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
        XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
        XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
        XOReq256( stateAsLanes[argIndex+3], lanes3 )
    /* Unrolled fast paths for laneCount >= 16 / >= 20; scalar tail otherwise. */
    if ( laneCount >= 16 ) {
        Xor_In4( 0 );
        Xor_In4( 4 );
        Xor_In4( 8 );
        Xor_In4( 12 );
        if ( laneCount >= 20 ) {
            Xor_In4( 16 );
            for(i=20; i<laneCount; i++)
                Xor_In( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Xor_In( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Xor_In( i );
    }
    #undef Xor_In
    #undef Xor_In4
}
  159. void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
  160. {
  161. unsigned int sizeLeft = length;
  162. unsigned int lanePosition = offset/SnP_laneLengthInBytes;
  163. unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
  164. const unsigned char *curData = data;
  165. UINT64 *statesAsLanes = (UINT64 *)states;
  166. if ((sizeLeft > 0) && (offsetInLane != 0)) {
  167. unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
  168. if (bytesInLane > sizeLeft)
  169. bytesInLane = sizeLeft;
  170. memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
  171. sizeLeft -= bytesInLane;
  172. lanePosition++;
  173. curData += bytesInLane;
  174. }
  175. while(sizeLeft >= SnP_laneLengthInBytes) {
  176. // correcting cast-align error
  177. // old version: UINT64 lane = *((const UINT64*)curData);
  178. UINT64 lane = *((const UINT64*)(const void*)curData);
  179. statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
  180. sizeLeft -= SnP_laneLengthInBytes;
  181. lanePosition++;
  182. curData += SnP_laneLengthInBytes;
  183. }
  184. if (sizeLeft > 0) {
  185. memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
  186. }
  187. }
/* Overwrite `laneCount` whole lanes of all four state instances at once.
   `data` carries four input streams spaced `laneOffset` lanes apart. */
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *stateAsLanes = (V256 *)states;
    unsigned int i;
    // correcting cast-align errors
    // old version: const UINT64 *curData0 = (const UINT64 *)data;
    const UINT64 *curData0 = (const void *)data;
    // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
    // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
    // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    /* Store one lane of all 4 instances, gathered element by element. */
    #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
    /* Store 4 consecutive lanes via unaligned vector loads + in-register transpose. */
    #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
        lanes1 = LOAD256u( curData1[argIndex]),\
        lanes2 = LOAD256u( curData2[argIndex]),\
        lanes3 = LOAD256u( curData3[argIndex]),\
        INTLEAVE(),\
        STORE256( stateAsLanes[argIndex+0], lanes0 ),\
        STORE256( stateAsLanes[argIndex+1], lanes1 ),\
        STORE256( stateAsLanes[argIndex+2], lanes2 ),\
        STORE256( stateAsLanes[argIndex+3], lanes3 )
    /* Unrolled fast paths for laneCount >= 16 / >= 20; scalar tail otherwise. */
    if ( laneCount >= 16 ) {
        OverWr4( 0 );
        OverWr4( 4 );
        OverWr4( 8 );
        OverWr4( 12 );
        if ( laneCount >= 20 ) {
            OverWr4( 16 );
            for(i=20; i<laneCount; i++)
                OverWr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                OverWr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            OverWr( i );
    }
    #undef OverWr
    #undef OverWr4
}
  234. void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
  235. {
  236. unsigned int sizeLeft = byteCount;
  237. unsigned int lanePosition = 0;
  238. UINT64 *statesAsLanes = (UINT64 *)states;
  239. while(sizeLeft >= SnP_laneLengthInBytes) {
  240. statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
  241. sizeLeft -= SnP_laneLengthInBytes;
  242. lanePosition++;
  243. }
  244. if (sizeLeft > 0) {
  245. memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
  246. }
  247. }
  248. void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
  249. {
  250. unsigned int sizeLeft = length;
  251. unsigned int lanePosition = offset/SnP_laneLengthInBytes;
  252. unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
  253. unsigned char *curData = data;
  254. const UINT64 *statesAsLanes = (const UINT64 *)states;
  255. if ((sizeLeft > 0) && (offsetInLane != 0)) {
  256. unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
  257. if (bytesInLane > sizeLeft)
  258. bytesInLane = sizeLeft;
  259. // correcting cast-qual error
  260. // old version: memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
  261. memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
  262. sizeLeft -= bytesInLane;
  263. lanePosition++;
  264. curData += bytesInLane;
  265. }
  266. while(sizeLeft >= SnP_laneLengthInBytes) {
  267. // correcting cast-align error
  268. // old version: *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
  269. *(UINT64*)(void*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
  270. sizeLeft -= SnP_laneLengthInBytes;
  271. lanePosition++;
  272. curData += SnP_laneLengthInBytes;
  273. }
  274. if (sizeLeft > 0) {
  275. memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
  276. }
  277. }
/* Copy `laneCount` whole lanes from all four state instances into four
   output streams spaced `laneOffset` lanes apart in `data`. */
void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    // correcting cast-align errors
    // old version: UINT64 *curData0 = (UINT64 *)data;
    UINT64 *curData0 = (void *)data;
    // old version: UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curData1 = (void *)(data+laneOffset*1*SnP_laneLengthInBytes);
    // old version: UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curData2 = (void *)(data+laneOffset*2*SnP_laneLengthInBytes);
    // old version: UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curData3 = (void *)(data+laneOffset*3*SnP_laneLengthInBytes);
    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;
    /* Scalar copy of one lane per instance (state words interleaved by 4). */
    #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
        curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
        curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
        curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
    /* Vector copy of 4 lanes: load interleaved, transpose, store per instance. */
    #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
        lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
        lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
        lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
        UNINTLEAVE(), \
        STORE256u( curData0[argIndex], lanes0 ), \
        STORE256u( curData1[argIndex], lanes1 ), \
        STORE256u( curData2[argIndex], lanes2 ), \
        STORE256u( curData3[argIndex], lanes3 )
    /* Unrolled fast paths for laneCount >= 16 / >= 20; scalar tail otherwise. */
    if ( laneCount >= 16 ) {
        Extr4( 0 );
        Extr4( 4 );
        Extr4( 8 );
        Extr4( 12 );
        if ( laneCount >= 20 ) {
            Extr4( 16 );
            for(i=20; i<laneCount; i++)
                Extr( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                Extr( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            Extr( i );
    }
    #undef Extr
    #undef Extr4
}
/* output = input XOR state bytes of instance `instanceIndex`, for `length`
   bytes starting at byte `offset` within that instance. */
void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const UINT64 *statesAsLanes = (const UINT64 *)states;
    /* Leading partial lane: shift the lane so its low byte is the first
       byte needed, then peel bytes off (lanes are little-endian). */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }
    /* Whole lanes, 8 bytes at a time. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        // correcting cast-align and cast-qual errors
        // old version: *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        *((UINT64*)(void*)curOutput) = *((const UINT64*)(const void*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }
    /* Trailing partial lane, byte by byte from the low end. */
    if (sizeLeft != 0) {
        UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}
/* output streams = input streams XOR state lanes, for `laneCount` whole
   lanes of all four instances; streams are `laneOffset` lanes apart. */
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
    // correcting cast-align and cast-qual errors
    // old version: const UINT64 *curInput0 = (UINT64 *)input;
    const UINT64 *curInput0 = (const void *)input;
    // old version: const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
    const UINT64 *curInput1 = (const void *)(input+laneOffset*1*SnP_laneLengthInBytes);
    // old version: const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curInput2 = (const void *)(input+laneOffset*2*SnP_laneLengthInBytes);
    // old version: const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
    const UINT64 *curInput3 = (const void *)(input+laneOffset*3*SnP_laneLengthInBytes);
    // correcting cast-align errors
    // old version: UINT64 *curOutput0 = (UINT64 *)output;
    UINT64 *curOutput0 = (void *)output;
    // old version: UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curOutput1 = (void *)(output+laneOffset*1*SnP_laneLengthInBytes);
    // old version: UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curOutput2 = (void *)(output+laneOffset*2*SnP_laneLengthInBytes);
    // old version: UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curOutput3 = (void *)(output+laneOffset*3*SnP_laneLengthInBytes);
    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;
    /* Scalar: XOR one lane per instance (state words interleaved by 4). */
    #define ExtrXor( argIndex ) \
        curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
        curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
        curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
        curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
    /* Vector: 4 lanes at once — load interleaved state, transpose to
       per-instance order, XOR the inputs, store the outputs. */
    #define ExtrXor4( argIndex ) \
        lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
        lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
        lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
        lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
        UNINTLEAVE(),\
        lanesL01 = LOAD256u( curInput0[argIndex]),\
        lanesH01 = LOAD256u( curInput1[argIndex]),\
        lanesL23 = LOAD256u( curInput2[argIndex]),\
        lanesH23 = LOAD256u( curInput3[argIndex]),\
        XOReq256( lanes0, lanesL01 ),\
        XOReq256( lanes1, lanesH01 ),\
        XOReq256( lanes2, lanesL23 ),\
        XOReq256( lanes3, lanesH23 ),\
        STORE256u( curOutput0[argIndex], lanes0 ),\
        STORE256u( curOutput1[argIndex], lanes1 ),\
        STORE256u( curOutput2[argIndex], lanes2 ),\
        STORE256u( curOutput3[argIndex], lanes3 )
    /* Unrolled fast paths for laneCount >= 16 / >= 20; scalar tail otherwise. */
    if ( laneCount >= 16 ) {
        ExtrXor4( 0 );
        ExtrXor4( 4 );
        ExtrXor4( 8 );
        ExtrXor4( 12 );
        if ( laneCount >= 20 ) {
            ExtrXor4( 16 );
            for(i=20; i<laneCount; i++)
                ExtrXor( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                ExtrXor( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            ExtrXor( i );
    }
    #undef ExtrXor
    #undef ExtrXor4
}
  434. #define declareABCDE \
  435. V256 Aba, Abe, Abi, Abo, Abu; \
  436. V256 Aga, Age, Agi, Ago, Agu; \
  437. V256 Aka, Ake, Aki, Ako, Aku; \
  438. V256 Ama, Ame, Ami, Amo, Amu; \
  439. V256 Asa, Ase, Asi, Aso, Asu; \
  440. V256 Bba, Bbe, Bbi, Bbo, Bbu; \
  441. V256 Bga, Bge, Bgi, Bgo, Bgu; \
  442. V256 Bka, Bke, Bki, Bko, Bku; \
  443. V256 Bma, Bme, Bmi, Bmo, Bmu; \
  444. V256 Bsa, Bse, Bsi, Bso, Bsu; \
  445. V256 Ca, Ce, Ci, Co, Cu; \
  446. V256 Ca1, Ce1, Ci1, Co1, Cu1; \
  447. V256 Da, De, Di, Do, Du; \
  448. V256 Eba, Ebe, Ebi, Ebo, Ebu; \
  449. V256 Ega, Ege, Egi, Ego, Egu; \
  450. V256 Eka, Eke, Eki, Eko, Eku; \
  451. V256 Ema, Eme, Emi, Emo, Emu; \
  452. V256 Esa, Ese, Esi, Eso, Esu; \
  453. #define prepareTheta \
  454. Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
  455. Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
  456. Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
  457. Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
  458. Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
  459. /* --- Theta Rho Pi Chi Iota Prepare-theta */
  460. /* --- 64-bit lanes mapped to 64-bit words */
  461. #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
  462. ROL64in256(Ce1, Ce, 1); \
  463. Da = XOR256(Cu, Ce1); \
  464. ROL64in256(Ci1, Ci, 1); \
  465. De = XOR256(Ca, Ci1); \
  466. ROL64in256(Co1, Co, 1); \
  467. Di = XOR256(Ce, Co1); \
  468. ROL64in256(Cu1, Cu, 1); \
  469. Do = XOR256(Ci, Cu1); \
  470. ROL64in256(Ca1, Ca, 1); \
  471. Du = XOR256(Co, Ca1); \
  472. \
  473. XOReq256(A##ba, Da); \
  474. Bba = A##ba; \
  475. XOReq256(A##ge, De); \
  476. ROL64in256(Bbe, A##ge, 44); \
  477. XOReq256(A##ki, Di); \
  478. ROL64in256(Bbi, A##ki, 43); \
  479. E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
  480. XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
  481. Ca = E##ba; \
  482. XOReq256(A##mo, Do); \
  483. ROL64in256(Bbo, A##mo, 21); \
  484. E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
  485. Ce = E##be; \
  486. XOReq256(A##su, Du); \
  487. ROL64in256(Bbu, A##su, 14); \
  488. E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
  489. Ci = E##bi; \
  490. E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
  491. Co = E##bo; \
  492. E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
  493. Cu = E##bu; \
  494. \
  495. XOReq256(A##bo, Do); \
  496. ROL64in256(Bga, A##bo, 28); \
  497. XOReq256(A##gu, Du); \
  498. ROL64in256(Bge, A##gu, 20); \
  499. XOReq256(A##ka, Da); \
  500. ROL64in256(Bgi, A##ka, 3); \
  501. E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
  502. XOReq256(Ca, E##ga); \
  503. XOReq256(A##me, De); \
  504. ROL64in256(Bgo, A##me, 45); \
  505. E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
  506. XOReq256(Ce, E##ge); \
  507. XOReq256(A##si, Di); \
  508. ROL64in256(Bgu, A##si, 61); \
  509. E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
  510. XOReq256(Ci, E##gi); \
  511. E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
  512. XOReq256(Co, E##go); \
  513. E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
  514. XOReq256(Cu, E##gu); \
  515. \
  516. XOReq256(A##be, De); \
  517. ROL64in256(Bka, A##be, 1); \
  518. XOReq256(A##gi, Di); \
  519. ROL64in256(Bke, A##gi, 6); \
  520. XOReq256(A##ko, Do); \
  521. ROL64in256(Bki, A##ko, 25); \
  522. E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
  523. XOReq256(Ca, E##ka); \
  524. XOReq256(A##mu, Du); \
  525. ROL64in256_8(Bko, A##mu); \
  526. E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
  527. XOReq256(Ce, E##ke); \
  528. XOReq256(A##sa, Da); \
  529. ROL64in256(Bku, A##sa, 18); \
  530. E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
  531. XOReq256(Ci, E##ki); \
  532. E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
  533. XOReq256(Co, E##ko); \
  534. E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
  535. XOReq256(Cu, E##ku); \
  536. \
  537. XOReq256(A##bu, Du); \
  538. ROL64in256(Bma, A##bu, 27); \
  539. XOReq256(A##ga, Da); \
  540. ROL64in256(Bme, A##ga, 36); \
  541. XOReq256(A##ke, De); \
  542. ROL64in256(Bmi, A##ke, 10); \
  543. E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
  544. XOReq256(Ca, E##ma); \
  545. XOReq256(A##mi, Di); \
  546. ROL64in256(Bmo, A##mi, 15); \
  547. E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
  548. XOReq256(Ce, E##me); \
  549. XOReq256(A##so, Do); \
  550. ROL64in256_56(Bmu, A##so); \
  551. E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
  552. XOReq256(Ci, E##mi); \
  553. E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
  554. XOReq256(Co, E##mo); \
  555. E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
  556. XOReq256(Cu, E##mu); \
  557. \
  558. XOReq256(A##bi, Di); \
  559. ROL64in256(Bsa, A##bi, 62); \
  560. XOReq256(A##go, Do); \
  561. ROL64in256(Bse, A##go, 55); \
  562. XOReq256(A##ku, Du); \
  563. ROL64in256(Bsi, A##ku, 39); \
  564. E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
  565. XOReq256(Ca, E##sa); \
  566. XOReq256(A##ma, Da); \
  567. ROL64in256(Bso, A##ma, 41); \
  568. E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
  569. XOReq256(Ce, E##se); \
  570. XOReq256(A##se, De); \
  571. ROL64in256(Bsu, A##se, 2); \
  572. E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
  573. XOReq256(Ci, E##si); \
  574. E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
  575. XOReq256(Co, E##so); \
  576. E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
  577. XOReq256(Cu, E##su); \
  578. \
  579. /* --- Theta Rho Pi Chi Iota */
  580. /* --- 64-bit lanes mapped to 64-bit words */
#define thetaRhoPiChiIota(i, A, E) \
/* One full round (theta, rho, pi, chi, iota) WITHOUT recomputing the column */ \
/* parities for the next round: used only for the very last round.  Reads the */ \
/* 25 lanes A##ba..A##su and the parities Ca..Cu left by the previous round */ \
/* (or by prepareTheta), writes the 25 lanes E##ba..E##su. */ \
/* Theta, part 2: D[x] = C[x-1] ^ ROL64(C[x+1], 1). */ \
ROL64in256(Ce1, Ce, 1); \
Da = XOR256(Cu, Ce1); \
ROL64in256(Ci1, Ci, 1); \
De = XOR256(Ca, Ci1); \
ROL64in256(Co1, Co, 1); \
Di = XOR256(Ce, Co1); \
ROL64in256(Cu1, Cu, 1); \
Do = XOR256(Ci, Cu1); \
ROL64in256(Ca1, Ca, 1); \
Du = XOR256(Co, Ca1); \
\
/* Output plane E##b*: theta-add D, rho-rotate, pi-permute into Bb*, then */ \
/* chi (x ^ (~y & z)); iota XORs the round-i constant into E##ba only. */ \
XOReq256(A##ba, Da); \
Bba = A##ba; \
XOReq256(A##ge, De); \
ROL64in256(Bbe, A##ge, 44); \
XOReq256(A##ki, Di); \
ROL64in256(Bbi, A##ki, 43); \
E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
XOReq256(A##mo, Do); \
ROL64in256(Bbo, A##mo, 21); \
E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
XOReq256(A##su, Du); \
ROL64in256(Bbu, A##su, 14); \
E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
/* Output plane E##g*. */ \
XOReq256(A##bo, Do); \
ROL64in256(Bga, A##bo, 28); \
XOReq256(A##gu, Du); \
ROL64in256(Bge, A##gu, 20); \
XOReq256(A##ka, Da); \
ROL64in256(Bgi, A##ka, 3); \
E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
XOReq256(A##me, De); \
ROL64in256(Bgo, A##me, 45); \
E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
XOReq256(A##si, Di); \
ROL64in256(Bgu, A##si, 61); \
E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
/* Output plane E##k* (rotations by 8 and 56 use byte-shuffle variants). */ \
XOReq256(A##be, De); \
ROL64in256(Bka, A##be, 1); \
XOReq256(A##gi, Di); \
ROL64in256(Bke, A##gi, 6); \
XOReq256(A##ko, Do); \
ROL64in256(Bki, A##ko, 25); \
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
XOReq256(A##mu, Du); \
ROL64in256_8(Bko, A##mu); \
E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
XOReq256(A##sa, Da); \
ROL64in256(Bku, A##sa, 18); \
E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
/* Output plane E##m*. */ \
XOReq256(A##bu, Du); \
ROL64in256(Bma, A##bu, 27); \
XOReq256(A##ga, Da); \
ROL64in256(Bme, A##ga, 36); \
XOReq256(A##ke, De); \
ROL64in256(Bmi, A##ke, 10); \
E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
XOReq256(A##mi, Di); \
ROL64in256(Bmo, A##mi, 15); \
E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
XOReq256(A##so, Do); \
ROL64in256_56(Bmu, A##so); \
E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
/* Output plane E##s*. */ \
XOReq256(A##bi, Di); \
ROL64in256(Bsa, A##bi, 62); \
XOReq256(A##go, Do); \
ROL64in256(Bse, A##go, 55); \
XOReq256(A##ku, Du); \
ROL64in256(Bsi, A##ku, 39); \
E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
XOReq256(A##ma, Da); \
ROL64in256(Bso, A##ma, 41); \
E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
XOReq256(A##se, De); \
ROL64in256(Bsu, A##se, 2); \
E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \

/* The 24 round constants of Keccak-f[1600] (iota step), as specified in
   FIPS 202, indexed by round number.  Each 64-bit constant is broadcast to
   all four parallel states via CONST256_64 in thetaRhoPiChiIota[PrepareTheta]. */
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL};
/* Load all 25 lanes (x4 interleaved states) from memory into the X##..
   register variables declared by declareABCDE. */
#define copyFromState(X, state) \
X##ba = LOAD256(state[ 0]); \
X##be = LOAD256(state[ 1]); \
X##bi = LOAD256(state[ 2]); \
X##bo = LOAD256(state[ 3]); \
X##bu = LOAD256(state[ 4]); \
X##ga = LOAD256(state[ 5]); \
X##ge = LOAD256(state[ 6]); \
X##gi = LOAD256(state[ 7]); \
X##go = LOAD256(state[ 8]); \
X##gu = LOAD256(state[ 9]); \
X##ka = LOAD256(state[10]); \
X##ke = LOAD256(state[11]); \
X##ki = LOAD256(state[12]); \
X##ko = LOAD256(state[13]); \
X##ku = LOAD256(state[14]); \
X##ma = LOAD256(state[15]); \
X##me = LOAD256(state[16]); \
X##mi = LOAD256(state[17]); \
X##mo = LOAD256(state[18]); \
X##mu = LOAD256(state[19]); \
X##sa = LOAD256(state[20]); \
X##se = LOAD256(state[21]); \
X##si = LOAD256(state[22]); \
X##so = LOAD256(state[23]); \
X##su = LOAD256(state[24]); \

/* Store all 25 lane variables X##.. back to the in-memory state;
   inverse of copyFromState. */
#define copyToState(state, X) \
STORE256(state[ 0], X##ba); \
STORE256(state[ 1], X##be); \
STORE256(state[ 2], X##bi); \
STORE256(state[ 3], X##bo); \
STORE256(state[ 4], X##bu); \
STORE256(state[ 5], X##ga); \
STORE256(state[ 6], X##ge); \
STORE256(state[ 7], X##gi); \
STORE256(state[ 8], X##go); \
STORE256(state[ 9], X##gu); \
STORE256(state[10], X##ka); \
STORE256(state[11], X##ke); \
STORE256(state[12], X##ki); \
STORE256(state[13], X##ko); \
STORE256(state[14], X##ku); \
STORE256(state[15], X##ma); \
STORE256(state[16], X##me); \
STORE256(state[17], X##mi); \
STORE256(state[18], X##mo); \
STORE256(state[19], X##mu); \
STORE256(state[20], X##sa); \
STORE256(state[21], X##se); \
STORE256(state[22], X##si); \
STORE256(state[23], X##so); \
STORE256(state[24], X##su); \

/* Copy the 25 lane variables Y##.. into X##..; used by the unrolled round
   loops to fold the E set back into the A set after an odd round count. */
#define copyStateVariables(X, Y) \
X##ba = Y##ba; \
X##be = Y##be; \
X##bi = Y##bi; \
X##bo = Y##bo; \
X##bu = Y##bu; \
X##ga = Y##ga; \
X##ge = Y##ge; \
X##gi = Y##gi; \
X##go = Y##go; \
X##gu = Y##gu; \
X##ka = Y##ka; \
X##ke = Y##ke; \
X##ki = Y##ki; \
X##ko = Y##ko; \
X##ku = Y##ku; \
X##ma = Y##ma; \
X##me = Y##me; \
X##mi = Y##mi; \
X##mo = Y##mo; \
X##mu = Y##mu; \
X##sa = Y##sa; \
X##se = Y##se; \
X##si = Y##si; \
X##so = Y##so; \
X##su = Y##su; \

/* Select the round-loop unrolling strategy: either fully unrolled
   (KeccakP1600times4_fullUnrolling defined) or partially unrolled by the
   factor KeccakP1600times4_unrolling (must be 12, 6, 4, 3, 2 or 1). */
#ifdef KeccakP1600times4_fullUnrolling
#define FullUnrolling
#else
#define Unrolling KeccakP1600times4_unrolling
#endif
// The contents of the unrolling macro file are inlined directly below
/*****#include "KeccakP-1600-unrolling_avx2.macros"*****/
  784. /*******************************************************/
  785. /*
  786. Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
  787. Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
  788. denoted as "the implementer".
  789. For more information, feedback or questions, please refer to our websites:
  790. http://keccak.noekeon.org/
  791. http://keyak.noekeon.org/
  792. http://ketje.noekeon.org/
  793. To the extent possible under law, the implementer has waived all copyright
  794. and related or neighboring rights to the source code in this file.
  795. http://creativecommons.org/publicdomain/zero/1.0/
  796. */
/* Define rounds24 (all 24 rounds) and rounds12 (the last 12 rounds, for the
   Keccak-p[1600,12] variant) according to the selected unrolling factor.
   Rounds alternate the A and E lane sets to avoid copies; branches with an
   odd number of rounds per iteration restore A from E via copyStateVariables. */
#if (defined(FullUnrolling))
#define rounds24 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(10, A, E) \
thetaRhoPiChiIotaPrepareTheta(11, E, A) \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \

#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \

#elif (Unrolling == 12)
/* 12 rounds per loop iteration: even count, so no A<-E copy is needed. */
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=12) { \
thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
} \

#define rounds12 \
prepareTheta \
thetaRhoPiChiIotaPrepareTheta(12, A, E) \
thetaRhoPiChiIotaPrepareTheta(13, E, A) \
thetaRhoPiChiIotaPrepareTheta(14, A, E) \
thetaRhoPiChiIotaPrepareTheta(15, E, A) \
thetaRhoPiChiIotaPrepareTheta(16, A, E) \
thetaRhoPiChiIotaPrepareTheta(17, E, A) \
thetaRhoPiChiIotaPrepareTheta(18, A, E) \
thetaRhoPiChiIotaPrepareTheta(19, E, A) \
thetaRhoPiChiIotaPrepareTheta(20, A, E) \
thetaRhoPiChiIotaPrepareTheta(21, E, A) \
thetaRhoPiChiIotaPrepareTheta(22, A, E) \
thetaRhoPiChiIota(23, E, A) \

#elif (Unrolling == 6)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \

#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=6) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
} \

#elif (Unrolling == 4)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \

#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=4) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
} \

#elif (Unrolling == 3)
/* Odd round count per iteration: results land in E, so copy E back to A. */
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \

#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=3) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
copyStateVariables(A, E) \
} \

#elif (Unrolling == 2)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \

#define rounds12 \
prepareTheta \
for(i=12; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
} \

#elif (Unrolling == 1)
#define rounds24 \
prepareTheta \
for(i=0; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
copyStateVariables(A, E) \
} \

#define rounds12 \
prepareTheta \
for(i=12; i<24; i++) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
copyStateVariables(A, E) \
} \

#else
#error "Unrolling is not correctly specified!"
#endif
/* Perform the LAST __nrounds rounds of Keccak-p[1600] (rounds 24-__nrounds
   to 23).  A leading odd round is handled separately so the main loop can
   process rounds in A/E pairs without a copy.  Requires `i` in scope. */
#define roundsN(__nrounds) \
prepareTheta \
i = 24 - (__nrounds); \
if ((i&1) != 0) { \
thetaRhoPiChiIotaPrepareTheta(i, A, E) \
copyStateVariables(A, E) \
++i; \
} \
for( /* empty */; i<24; i+=2) { \
thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
}
  965. /*******************************************************/
/* Apply the full 24-round Keccak-f[1600] permutation to each of the four
   interleaved states.  `states` points to 25 interleaved 256-bit lanes
   (one 64-bit lane per state, side by side) and must satisfy the alignment
   implied by V256 loads/stores. */
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
/* Round index consumed by the partially-unrolled rounds24 loop. */
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds24
copyToState(statesAsLanes, A)
}
/* Apply the 12-round permutation Keccak-p[1600,12] (the last 12 rounds of
   Keccak-f[1600], rounds 12..23) to each of the four interleaved states.
   Same state layout and alignment requirements as PermuteAll_24rounds. */
void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
V256 *statesAsLanes = (V256 *)states;
declareABCDE
#ifndef KeccakP1600times4_fullUnrolling
/* Round index consumed by the partially-unrolled rounds12 loop. */
unsigned int i;
#endif
copyFromState(A, statesAsLanes)
rounds12
copyToState(statesAsLanes, A)
}
  988. size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
  989. {
  990. if (laneCount == 21) {
  991. #if 0
  992. const unsigned char *dataStart = data;
  993. const UINT64 *curData0 = (const UINT64 *)data;
  994. const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  995. const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  996. const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  997. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  998. V256 *stateAsLanes = (V256 *)states;
  999. V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
  1000. #define Xor_In( argIndex ) \
  1001. XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
  1002. #define Xor_In4( argIndex ) \
  1003. lanes0 = LOAD256u( curData0[argIndex]),\
  1004. lanes1 = LOAD256u( curData1[argIndex]),\
  1005. lanes2 = LOAD256u( curData2[argIndex]),\
  1006. lanes3 = LOAD256u( curData3[argIndex]),\
  1007. INTLEAVE(),\
  1008. XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
  1009. XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
  1010. XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
  1011. XOReq256( stateAsLanes[argIndex+3], lanes3 )
  1012. Xor_In4( 0 );
  1013. Xor_In4( 4 );
  1014. Xor_In4( 8 );
  1015. Xor_In4( 12 );
  1016. Xor_In4( 16 );
  1017. Xor_In( 20 );
  1018. #undef Xor_In
  1019. #undef Xor_In4
  1020. KeccakP1600times4_PermuteAll_24rounds(states);
  1021. curData0 += laneOffsetSerial;
  1022. curData1 += laneOffsetSerial;
  1023. curData2 += laneOffsetSerial;
  1024. curData3 += laneOffsetSerial;
  1025. dataByteLen -= laneOffsetSerial*8;
  1026. }
  1027. return (const unsigned char *)curData0 - dataStart;
  1028. #else
  1029. // unsigned int i;
  1030. const unsigned char *dataStart = data;
  1031. // correcting cast-align errors
  1032. // old version: const UINT64 *curData0 = (const UINT64 *)data;
  1033. const UINT64 *curData0 = (const void *)data;
  1034. // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  1035. const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  1036. // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  1037. const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  1038. // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  1039. const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  1040. V256 *statesAsLanes = (V256 *)states;
  1041. declareABCDE
  1042. copyFromState(A, statesAsLanes)
  1043. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  1044. #define XOR_In( Xxx, argIndex ) \
  1045. XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
  1046. XOR_In( Aba, 0 );
  1047. XOR_In( Abe, 1 );
  1048. XOR_In( Abi, 2 );
  1049. XOR_In( Abo, 3 );
  1050. XOR_In( Abu, 4 );
  1051. XOR_In( Aga, 5 );
  1052. XOR_In( Age, 6 );
  1053. XOR_In( Agi, 7 );
  1054. XOR_In( Ago, 8 );
  1055. XOR_In( Agu, 9 );
  1056. XOR_In( Aka, 10 );
  1057. XOR_In( Ake, 11 );
  1058. XOR_In( Aki, 12 );
  1059. XOR_In( Ako, 13 );
  1060. XOR_In( Aku, 14 );
  1061. XOR_In( Ama, 15 );
  1062. XOR_In( Ame, 16 );
  1063. XOR_In( Ami, 17 );
  1064. XOR_In( Amo, 18 );
  1065. XOR_In( Amu, 19 );
  1066. XOR_In( Asa, 20 );
  1067. #undef XOR_In
  1068. rounds24
  1069. curData0 += laneOffsetSerial;
  1070. curData1 += laneOffsetSerial;
  1071. curData2 += laneOffsetSerial;
  1072. curData3 += laneOffsetSerial;
  1073. dataByteLen -= laneOffsetSerial*8;
  1074. }
  1075. copyToState(statesAsLanes, A)
  1076. return (const unsigned char *)curData0 - dataStart;
  1077. #endif
  1078. }
  1079. else {
  1080. // unsigned int i;
  1081. const unsigned char *dataStart = data;
  1082. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  1083. KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
  1084. KeccakP1600times4_PermuteAll_24rounds(states);
  1085. data += laneOffsetSerial*8;
  1086. dataByteLen -= laneOffsetSerial*8;
  1087. }
  1088. return data - dataStart;
  1089. }
  1090. }
  1091. size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
  1092. {
  1093. if (laneCount == 21) {
  1094. #if 0
  1095. const unsigned char *dataStart = data;
  1096. const UINT64 *curData0 = (const UINT64 *)data;
  1097. const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  1098. const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  1099. const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  1100. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  1101. V256 *stateAsLanes = states;
  1102. V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
  1103. #define Xor_In( argIndex ) \
  1104. XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
  1105. #define Xor_In4( argIndex ) \
  1106. lanes0 = LOAD256u( curData0[argIndex]),\
  1107. lanes1 = LOAD256u( curData1[argIndex]),\
  1108. lanes2 = LOAD256u( curData2[argIndex]),\
  1109. lanes3 = LOAD256u( curData3[argIndex]),\
  1110. INTLEAVE(),\
  1111. XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
  1112. XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
  1113. XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
  1114. XOReq256( stateAsLanes[argIndex+3], lanes3 )
  1115. Xor_In4( 0 );
  1116. Xor_In4( 4 );
  1117. Xor_In4( 8 );
  1118. Xor_In4( 12 );
  1119. Xor_In4( 16 );
  1120. Xor_In( 20 );
  1121. #undef Xor_In
  1122. #undef Xor_In4
  1123. KeccakP1600times4_PermuteAll_12rounds(states);
  1124. curData0 += laneOffsetSerial;
  1125. curData1 += laneOffsetSerial;
  1126. curData2 += laneOffsetSerial;
  1127. curData3 += laneOffsetSerial;
  1128. dataByteLen -= laneOffsetSerial*8;
  1129. }
  1130. return (const unsigned char *)curData0 - dataStart;
  1131. #else
  1132. // unsigned int i;
  1133. const unsigned char *dataStart = data;
  1134. // correcting cast-align errors
  1135. // old version: const UINT64 *curData0 = (const UINT64 *)data;
  1136. const UINT64 *curData0 = (const void *)data;
  1137. // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  1138. const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
  1139. // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  1140. const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
  1141. // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  1142. const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
  1143. V256 *statesAsLanes = states;
  1144. declareABCDE
  1145. copyFromState(A, statesAsLanes)
  1146. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  1147. #define XOR_In( Xxx, argIndex ) \
  1148. XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
  1149. XOR_In( Aba, 0 );
  1150. XOR_In( Abe, 1 );
  1151. XOR_In( Abi, 2 );
  1152. XOR_In( Abo, 3 );
  1153. XOR_In( Abu, 4 );
  1154. XOR_In( Aga, 5 );
  1155. XOR_In( Age, 6 );
  1156. XOR_In( Agi, 7 );
  1157. XOR_In( Ago, 8 );
  1158. XOR_In( Agu, 9 );
  1159. XOR_In( Aka, 10 );
  1160. XOR_In( Ake, 11 );
  1161. XOR_In( Aki, 12 );
  1162. XOR_In( Ako, 13 );
  1163. XOR_In( Aku, 14 );
  1164. XOR_In( Ama, 15 );
  1165. XOR_In( Ame, 16 );
  1166. XOR_In( Ami, 17 );
  1167. XOR_In( Amo, 18 );
  1168. XOR_In( Amu, 19 );
  1169. XOR_In( Asa, 20 );
  1170. #undef XOR_In
  1171. rounds12
  1172. curData0 += laneOffsetSerial;
  1173. curData1 += laneOffsetSerial;
  1174. curData2 += laneOffsetSerial;
  1175. curData3 += laneOffsetSerial;
  1176. dataByteLen -= laneOffsetSerial*8;
  1177. }
  1178. copyToState(statesAsLanes, A)
  1179. return (const unsigned char *)curData0 - dataStart;
  1180. #endif
  1181. }
  1182. else {
  1183. // unsigned int i;
  1184. const unsigned char *dataStart = data;
  1185. while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
  1186. KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
  1187. KeccakP1600times4_PermuteAll_12rounds(states);
  1188. data += laneOffsetSerial*8;
  1189. dataByteLen -= laneOffsetSerial*8;
  1190. }
  1191. return data - dataStart;
  1192. }
  1193. }
  1194. #endif