- /*
- Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
- Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
- denoted as "the implementer".
- For more information, feedback or questions, please refer to our websites:
- http://keccak.noekeon.org/
- http://keyak.noekeon.org/
- http://ketje.noekeon.org/
- To the extent possible under law, the implementer has waived all copyright
- and related or neighboring rights to the source code in this file.
- http://creativecommons.org/publicdomain/zero/1.0/
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- // The extra headers smmintrin.h, wmmintrin.h and emmintrin.h are not needed here and have been removed
- #if defined(S2N_KYBER512R3_AVX2_BMI2)
- #include <immintrin.h>
- #include "KeccakP-align_avx2.h"
- #include "KeccakP-1600-times4-SnP_avx2.h"
- #include "KeccakP-SIMD256-config_avx2.h"
- #include "KeccakP-brg_endian_avx2.h"
- #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
- #error Expecting a little-endian platform
- #endif
- typedef unsigned char UINT8;
- typedef unsigned long long int UINT64;
- typedef __m128i V128;
- typedef __m256i V256;
- #define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
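- /*
- Note on the state layout: the four 1600-bit states are interleaved lane by lane, so lane
- `lanePosition` of instance `instanceIndex` is the 64-bit word at index
- lanePosition*4 + instanceIndex. Viewed as 256-bit words, word `lanePosition` holds that
- lane for all four instances, which lets a single AVX2 operation advance all four
- permutations at once.
- */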
- #if defined(KeccakP1600times4_useAVX2)
- #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
- // correcting cast-align error
- // old version: #define CONST256(a) _mm256_load_si256((const V256 *)&(a))
- #define CONST256(a) _mm256_load_si256((const void *)&(a))
- #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a))
- #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
- // correcting cast-align error
- // old version: #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
- #define LOAD256u(a) _mm256_loadu_si256((const void *)&(a))
- #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
- #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
- #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
- #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
- static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
- static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
- #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
- // correcting cast-align error
- // old version: #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
- #define STORE256u(a, b) _mm256_storeu_si256((void *)&(a), b)
- #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v)
- #define XOR256(a, b) _mm256_xor_si256(a, b)
- #define XOReq256(a, b) a = _mm256_xor_si256(a, b)
- #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
- #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
- #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
- #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)
- #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
- lanesH01 = UNPACKH( lanes0, lanes1 ), \
- lanesL23 = UNPACKL( lanes2, lanes3 ), \
- lanesH23 = UNPACKH( lanes2, lanes3 ), \
- lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
- lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
- lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
- lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
- #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
- lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
- lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
- lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
- lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
- lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
- lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
- lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
- #endif
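- /*
- Note on INTLEAVE()/UNINTLEAVE(): INTLEAVE() is a 4x4 transpose of 64-bit lanes that takes
- lanes0..lanes3, each loaded as four consecutive lanes of one input stream, and rearranges
- them so that each register holds one lane position for all four instances (the interleaved
- state layout described above); UNINTLEAVE() is the inverse transpose used when extracting.
- Both are built from unpacklo/unpackhi, permute2f128 and shuffle_pd operations.
- */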
- #define SnP_laneLengthInBytes 8
- void KeccakP1600times4_InitializeAll(void *states)
- {
- memset(states, 0, KeccakP1600times4_statesSizeInBytes);
- }
- void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
- {
- unsigned int sizeLeft = length;
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
- const unsigned char *curData = data;
- UINT64 *statesAsLanes = (UINT64 *)states;
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
- UINT64 lane = 0;
- if (bytesInLane > sizeLeft)
- bytesInLane = sizeLeft;
- memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
- sizeLeft -= bytesInLane;
- lanePosition++;
- curData += bytesInLane;
- }
- while(sizeLeft >= SnP_laneLengthInBytes) {
- // correcting cast-align error
- // old version: UINT64 lane = *((const UINT64*)curData);
- UINT64 lane = *((const UINT64*)(const void *)curData);
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
- sizeLeft -= SnP_laneLengthInBytes;
- lanePosition++;
- curData += SnP_laneLengthInBytes;
- }
- if (sizeLeft > 0) {
- UINT64 lane = 0;
- memcpy(&lane, curData, sizeLeft);
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
- }
- }
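- /*
- Usage sketch (illustrative only, not part of the original file; `block` and `rateInBytes`
- are hypothetical caller-side names): a typical absorb step zeroes the interleaved states,
- XORs one rate-sized block into each of the four instances, then permutes them all at once:
-
-     ALIGN(KeccakP1600times4_statesAlignment) unsigned char states[KeccakP1600times4_statesSizeInBytes];
-     KeccakP1600times4_InitializeAll(states);
-     for (unsigned int j = 0; j < 4; j++)
-         KeccakP1600times4_AddBytes(states, j, block[j], 0, rateInBytes);
-     KeccakP1600times4_PermuteAll_24rounds(states);
- */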
- void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
- {
- V256 *stateAsLanes = (V256 *)states;
- unsigned int i;
- // correcting cast-align errors
- // old version: const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData0 = (const void *)data;
- // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
- const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
- lanes1 = LOAD256u( curData1[argIndex]),\
- lanes2 = LOAD256u( curData2[argIndex]),\
- lanes3 = LOAD256u( curData3[argIndex]),\
- INTLEAVE(),\
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
- if ( laneCount >= 16 ) {
- Xor_In4( 0 );
- Xor_In4( 4 );
- Xor_In4( 8 );
- Xor_In4( 12 );
- if ( laneCount >= 20 ) {
- Xor_In4( 16 );
- for(i=20; i<laneCount; i++)
- Xor_In( i );
- }
- else {
- for(i=16; i<laneCount; i++)
- Xor_In( i );
- }
- }
- else {
- for(i=0; i<laneCount; i++)
- Xor_In( i );
- }
- #undef Xor_In
- #undef Xor_In4
- }
- void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
- {
- unsigned int sizeLeft = length;
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
- const unsigned char *curData = data;
- UINT64 *statesAsLanes = (UINT64 *)states;
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
- if (bytesInLane > sizeLeft)
- bytesInLane = sizeLeft;
- memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
- sizeLeft -= bytesInLane;
- lanePosition++;
- curData += bytesInLane;
- }
- while(sizeLeft >= SnP_laneLengthInBytes) {
- // correcting cast-align error
- // old version: UINT64 lane = *((const UINT64*)curData);
- UINT64 lane = *((const UINT64*)(const void*)curData);
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
- sizeLeft -= SnP_laneLengthInBytes;
- lanePosition++;
- curData += SnP_laneLengthInBytes;
- }
- if (sizeLeft > 0) {
- memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
- }
- }
- void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
- {
- V256 *stateAsLanes = (V256 *)states;
- unsigned int i;
- // correcting cast-align errors
- // old version: const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData0 = (const void *)data;
- // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
- const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes);
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
- lanes1 = LOAD256u( curData1[argIndex]),\
- lanes2 = LOAD256u( curData2[argIndex]),\
- lanes3 = LOAD256u( curData3[argIndex]),\
- INTLEAVE(),\
- STORE256( stateAsLanes[argIndex+0], lanes0 ),\
- STORE256( stateAsLanes[argIndex+1], lanes1 ),\
- STORE256( stateAsLanes[argIndex+2], lanes2 ),\
- STORE256( stateAsLanes[argIndex+3], lanes3 )
- if ( laneCount >= 16 ) {
- OverWr4( 0 );
- OverWr4( 4 );
- OverWr4( 8 );
- OverWr4( 12 );
- if ( laneCount >= 20 ) {
- OverWr4( 16 );
- for(i=20; i<laneCount; i++)
- OverWr( i );
- }
- else {
- for(i=16; i<laneCount; i++)
- OverWr( i );
- }
- }
- else {
- for(i=0; i<laneCount; i++)
- OverWr( i );
- }
- #undef OverWr
- #undef OverWr4
- }
- void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
- {
- unsigned int sizeLeft = byteCount;
- unsigned int lanePosition = 0;
- UINT64 *statesAsLanes = (UINT64 *)states;
- while(sizeLeft >= SnP_laneLengthInBytes) {
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
- sizeLeft -= SnP_laneLengthInBytes;
- lanePosition++;
- }
- if (sizeLeft > 0) {
- memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
- }
- }
- void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
- {
- unsigned int sizeLeft = length;
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
- unsigned char *curData = data;
- const UINT64 *statesAsLanes = (const UINT64 *)states;
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
- if (bytesInLane > sizeLeft)
- bytesInLane = sizeLeft;
- // correcting cast-qual error
- // old version: memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
- memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
- sizeLeft -= bytesInLane;
- lanePosition++;
- curData += bytesInLane;
- }
- while(sizeLeft >= SnP_laneLengthInBytes) {
- // correcting cast-align error
- // old version: *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
- *(UINT64*)(void*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
- sizeLeft -= SnP_laneLengthInBytes;
- lanePosition++;
- curData += SnP_laneLengthInBytes;
- }
- if (sizeLeft > 0) {
- memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
- }
- }
- void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
- {
- // correcting cast-align errors
- // old version: UINT64 *curData0 = (UINT64 *)data;
- UINT64 *curData0 = (void *)data;
- // old version: UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
- UINT64 *curData1 = (void *)(data+laneOffset*1*SnP_laneLengthInBytes);
- // old version: UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
- UINT64 *curData2 = (void *)(data+laneOffset*2*SnP_laneLengthInBytes);
- // old version: UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
- UINT64 *curData3 = (void *)(data+laneOffset*3*SnP_laneLengthInBytes);
- const V256 *stateAsLanes = (const V256 *)states;
- const UINT64 *stateAsLanes64 = (const UINT64*)states;
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- unsigned int i;
- #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
- curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
- curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
- curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
- #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
- lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
- lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
- lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
- UNINTLEAVE(), \
- STORE256u( curData0[argIndex], lanes0 ), \
- STORE256u( curData1[argIndex], lanes1 ), \
- STORE256u( curData2[argIndex], lanes2 ), \
- STORE256u( curData3[argIndex], lanes3 )
- if ( laneCount >= 16 ) {
- Extr4( 0 );
- Extr4( 4 );
- Extr4( 8 );
- Extr4( 12 );
- if ( laneCount >= 20 ) {
- Extr4( 16 );
- for(i=20; i<laneCount; i++)
- Extr( i );
- }
- else {
- for(i=16; i<laneCount; i++)
- Extr( i );
- }
- }
- else {
- for(i=0; i<laneCount; i++)
- Extr( i );
- }
- #undef Extr
- #undef Extr4
- }
- void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
- {
- unsigned int sizeLeft = length;
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
- const unsigned char *curInput = input;
- unsigned char *curOutput = output;
- const UINT64 *statesAsLanes = (const UINT64 *)states;
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
- UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
- if (bytesInLane > sizeLeft)
- bytesInLane = sizeLeft;
- sizeLeft -= bytesInLane;
- do {
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
- lane >>= 8;
- } while ( --bytesInLane != 0);
- lanePosition++;
- }
- while(sizeLeft >= SnP_laneLengthInBytes) {
- // correcting cast-align and cast-qual errors
- // old version: *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
- *((UINT64*)(void*)curOutput) = *((const UINT64*)(const void*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
- sizeLeft -= SnP_laneLengthInBytes;
- lanePosition++;
- curInput += SnP_laneLengthInBytes;
- curOutput += SnP_laneLengthInBytes;
- }
- if (sizeLeft != 0) {
- UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
- do {
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
- lane >>= 8;
- } while ( --sizeLeft != 0);
- }
- }
- void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
- {
- // correcting cast-align and cast-qual errors
- // old version: const UINT64 *curInput0 = (UINT64 *)input;
- const UINT64 *curInput0 = (const void *)input;
- // old version: const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
- const UINT64 *curInput1 = (const void *)(input+laneOffset*1*SnP_laneLengthInBytes);
- // old version: const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
- const UINT64 *curInput2 = (const void *)(input+laneOffset*2*SnP_laneLengthInBytes);
- // old version: const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
- const UINT64 *curInput3 = (const void *)(input+laneOffset*3*SnP_laneLengthInBytes);
- // correcting cast-align errors
- // old version: UINT64 *curOutput0 = (UINT64 *)output;
- UINT64 *curOutput0 = (void *)output;
- // old version: UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
- UINT64 *curOutput1 = (void *)(output+laneOffset*1*SnP_laneLengthInBytes);
- // old version: UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
- UINT64 *curOutput2 = (void *)(output+laneOffset*2*SnP_laneLengthInBytes);
- // old version: UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
- UINT64 *curOutput3 = (void *)(output+laneOffset*3*SnP_laneLengthInBytes);
- const V256 *stateAsLanes = (const V256 *)states;
- const UINT64 *stateAsLanes64 = (const UINT64*)states;
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- unsigned int i;
- #define ExtrXor( argIndex ) \
- curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
- curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
- curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
- curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
- #define ExtrXor4( argIndex ) \
- lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
- lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
- lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
- lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
- UNINTLEAVE(),\
- lanesL01 = LOAD256u( curInput0[argIndex]),\
- lanesH01 = LOAD256u( curInput1[argIndex]),\
- lanesL23 = LOAD256u( curInput2[argIndex]),\
- lanesH23 = LOAD256u( curInput3[argIndex]),\
- XOReq256( lanes0, lanesL01 ),\
- XOReq256( lanes1, lanesH01 ),\
- XOReq256( lanes2, lanesL23 ),\
- XOReq256( lanes3, lanesH23 ),\
- STORE256u( curOutput0[argIndex], lanes0 ),\
- STORE256u( curOutput1[argIndex], lanes1 ),\
- STORE256u( curOutput2[argIndex], lanes2 ),\
- STORE256u( curOutput3[argIndex], lanes3 )
- if ( laneCount >= 16 ) {
- ExtrXor4( 0 );
- ExtrXor4( 4 );
- ExtrXor4( 8 );
- ExtrXor4( 12 );
- if ( laneCount >= 20 ) {
- ExtrXor4( 16 );
- for(i=20; i<laneCount; i++)
- ExtrXor( i );
- }
- else {
- for(i=16; i<laneCount; i++)
- ExtrXor( i );
- }
- }
- else {
- for(i=0; i<laneCount; i++)
- ExtrXor( i );
- }
- #undef ExtrXor
- #undef ExtrXor4
- }
- #define declareABCDE \
- V256 Aba, Abe, Abi, Abo, Abu; \
- V256 Aga, Age, Agi, Ago, Agu; \
- V256 Aka, Ake, Aki, Ako, Aku; \
- V256 Ama, Ame, Ami, Amo, Amu; \
- V256 Asa, Ase, Asi, Aso, Asu; \
- V256 Bba, Bbe, Bbi, Bbo, Bbu; \
- V256 Bga, Bge, Bgi, Bgo, Bgu; \
- V256 Bka, Bke, Bki, Bko, Bku; \
- V256 Bma, Bme, Bmi, Bmo, Bmu; \
- V256 Bsa, Bse, Bsi, Bso, Bsu; \
- V256 Ca, Ce, Ci, Co, Cu; \
- V256 Ca1, Ce1, Ci1, Co1, Cu1; \
- V256 Da, De, Di, Do, Du; \
- V256 Eba, Ebe, Ebi, Ebo, Ebu; \
- V256 Ega, Ege, Egi, Ego, Egu; \
- V256 Eka, Eke, Eki, Eko, Eku; \
- V256 Ema, Eme, Emi, Emo, Emu; \
- V256 Esa, Ese, Esi, Eso, Esu;
- #define prepareTheta \
- Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
- Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
- Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
- Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
- Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \
- /* --- Theta Rho Pi Chi Iota Prepare-theta */
- /* --- 64-bit lanes mapped to 64-bit words */
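- /*
- Note: one invocation of the macro below performs a complete Keccak-f round on all four
- states. The Da..Du values implement theta (D[x] = C[x-1] ^ ROL64(C[x+1], 1)), the
- ROL64in256 calls implement rho, renaming the A lanes into the B variables implements pi,
- the XOR256/ANDnu256 pattern implements chi (a ^ (~b & c)), and the round constant XORed
- into E##ba implements iota. The Ca..Cu column parities of the result are accumulated on
- the fly for the next round, hence "PrepareTheta" in the name.
- */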
- #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
- ROL64in256(Ce1, Ce, 1); \
- Da = XOR256(Cu, Ce1); \
- ROL64in256(Ci1, Ci, 1); \
- De = XOR256(Ca, Ci1); \
- ROL64in256(Co1, Co, 1); \
- Di = XOR256(Ce, Co1); \
- ROL64in256(Cu1, Cu, 1); \
- Do = XOR256(Ci, Cu1); \
- ROL64in256(Ca1, Ca, 1); \
- Du = XOR256(Co, Ca1); \
- \
- XOReq256(A##ba, Da); \
- Bba = A##ba; \
- XOReq256(A##ge, De); \
- ROL64in256(Bbe, A##ge, 44); \
- XOReq256(A##ki, Di); \
- ROL64in256(Bbi, A##ki, 43); \
- E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
- XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
- Ca = E##ba; \
- XOReq256(A##mo, Do); \
- ROL64in256(Bbo, A##mo, 21); \
- E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
- Ce = E##be; \
- XOReq256(A##su, Du); \
- ROL64in256(Bbu, A##su, 14); \
- E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
- Ci = E##bi; \
- E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
- Co = E##bo; \
- E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
- Cu = E##bu; \
- \
- XOReq256(A##bo, Do); \
- ROL64in256(Bga, A##bo, 28); \
- XOReq256(A##gu, Du); \
- ROL64in256(Bge, A##gu, 20); \
- XOReq256(A##ka, Da); \
- ROL64in256(Bgi, A##ka, 3); \
- E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
- XOReq256(Ca, E##ga); \
- XOReq256(A##me, De); \
- ROL64in256(Bgo, A##me, 45); \
- E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
- XOReq256(Ce, E##ge); \
- XOReq256(A##si, Di); \
- ROL64in256(Bgu, A##si, 61); \
- E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
- XOReq256(Ci, E##gi); \
- E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
- XOReq256(Co, E##go); \
- E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
- XOReq256(Cu, E##gu); \
- \
- XOReq256(A##be, De); \
- ROL64in256(Bka, A##be, 1); \
- XOReq256(A##gi, Di); \
- ROL64in256(Bke, A##gi, 6); \
- XOReq256(A##ko, Do); \
- ROL64in256(Bki, A##ko, 25); \
- E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
- XOReq256(Ca, E##ka); \
- XOReq256(A##mu, Du); \
- ROL64in256_8(Bko, A##mu); \
- E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
- XOReq256(Ce, E##ke); \
- XOReq256(A##sa, Da); \
- ROL64in256(Bku, A##sa, 18); \
- E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
- XOReq256(Ci, E##ki); \
- E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
- XOReq256(Co, E##ko); \
- E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
- XOReq256(Cu, E##ku); \
- \
- XOReq256(A##bu, Du); \
- ROL64in256(Bma, A##bu, 27); \
- XOReq256(A##ga, Da); \
- ROL64in256(Bme, A##ga, 36); \
- XOReq256(A##ke, De); \
- ROL64in256(Bmi, A##ke, 10); \
- E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
- XOReq256(Ca, E##ma); \
- XOReq256(A##mi, Di); \
- ROL64in256(Bmo, A##mi, 15); \
- E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
- XOReq256(Ce, E##me); \
- XOReq256(A##so, Do); \
- ROL64in256_56(Bmu, A##so); \
- E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
- XOReq256(Ci, E##mi); \
- E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
- XOReq256(Co, E##mo); \
- E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
- XOReq256(Cu, E##mu); \
- \
- XOReq256(A##bi, Di); \
- ROL64in256(Bsa, A##bi, 62); \
- XOReq256(A##go, Do); \
- ROL64in256(Bse, A##go, 55); \
- XOReq256(A##ku, Du); \
- ROL64in256(Bsi, A##ku, 39); \
- E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
- XOReq256(Ca, E##sa); \
- XOReq256(A##ma, Da); \
- ROL64in256(Bso, A##ma, 41); \
- E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
- XOReq256(Ce, E##se); \
- XOReq256(A##se, De); \
- ROL64in256(Bsu, A##se, 2); \
- E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
- XOReq256(Ci, E##si); \
- E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
- XOReq256(Co, E##so); \
- E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
- XOReq256(Cu, E##su); \
- \
- /* --- Theta Rho Pi Chi Iota */
- /* --- 64-bit lanes mapped to 64-bit words */
- #define thetaRhoPiChiIota(i, A, E) \
- ROL64in256(Ce1, Ce, 1); \
- Da = XOR256(Cu, Ce1); \
- ROL64in256(Ci1, Ci, 1); \
- De = XOR256(Ca, Ci1); \
- ROL64in256(Co1, Co, 1); \
- Di = XOR256(Ce, Co1); \
- ROL64in256(Cu1, Cu, 1); \
- Do = XOR256(Ci, Cu1); \
- ROL64in256(Ca1, Ca, 1); \
- Du = XOR256(Co, Ca1); \
- \
- XOReq256(A##ba, Da); \
- Bba = A##ba; \
- XOReq256(A##ge, De); \
- ROL64in256(Bbe, A##ge, 44); \
- XOReq256(A##ki, Di); \
- ROL64in256(Bbi, A##ki, 43); \
- E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
- XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
- XOReq256(A##mo, Do); \
- ROL64in256(Bbo, A##mo, 21); \
- E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
- XOReq256(A##su, Du); \
- ROL64in256(Bbu, A##su, 14); \
- E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
- E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
- E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
- \
- XOReq256(A##bo, Do); \
- ROL64in256(Bga, A##bo, 28); \
- XOReq256(A##gu, Du); \
- ROL64in256(Bge, A##gu, 20); \
- XOReq256(A##ka, Da); \
- ROL64in256(Bgi, A##ka, 3); \
- E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
- XOReq256(A##me, De); \
- ROL64in256(Bgo, A##me, 45); \
- E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
- XOReq256(A##si, Di); \
- ROL64in256(Bgu, A##si, 61); \
- E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
- E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
- E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
- \
- XOReq256(A##be, De); \
- ROL64in256(Bka, A##be, 1); \
- XOReq256(A##gi, Di); \
- ROL64in256(Bke, A##gi, 6); \
- XOReq256(A##ko, Do); \
- ROL64in256(Bki, A##ko, 25); \
- E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
- XOReq256(A##mu, Du); \
- ROL64in256_8(Bko, A##mu); \
- E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
- XOReq256(A##sa, Da); \
- ROL64in256(Bku, A##sa, 18); \
- E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
- E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
- E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
- \
- XOReq256(A##bu, Du); \
- ROL64in256(Bma, A##bu, 27); \
- XOReq256(A##ga, Da); \
- ROL64in256(Bme, A##ga, 36); \
- XOReq256(A##ke, De); \
- ROL64in256(Bmi, A##ke, 10); \
- E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
- XOReq256(A##mi, Di); \
- ROL64in256(Bmo, A##mi, 15); \
- E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
- XOReq256(A##so, Do); \
- ROL64in256_56(Bmu, A##so); \
- E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
- E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
- E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
- \
- XOReq256(A##bi, Di); \
- ROL64in256(Bsa, A##bi, 62); \
- XOReq256(A##go, Do); \
- ROL64in256(Bse, A##go, 55); \
- XOReq256(A##ku, Du); \
- ROL64in256(Bsi, A##ku, 39); \
- E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
- XOReq256(A##ma, Da); \
- ROL64in256(Bso, A##ma, 41); \
- E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
- XOReq256(A##se, De); \
- ROL64in256(Bsu, A##se, 2); \
- E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
- E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
- E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));
- static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
- 0x0000000000000001ULL,
- 0x0000000000008082ULL,
- 0x800000000000808aULL,
- 0x8000000080008000ULL,
- 0x000000000000808bULL,
- 0x0000000080000001ULL,
- 0x8000000080008081ULL,
- 0x8000000000008009ULL,
- 0x000000000000008aULL,
- 0x0000000000000088ULL,
- 0x0000000080008009ULL,
- 0x000000008000000aULL,
- 0x000000008000808bULL,
- 0x800000000000008bULL,
- 0x8000000000008089ULL,
- 0x8000000000008003ULL,
- 0x8000000000008002ULL,
- 0x8000000000000080ULL,
- 0x000000000000800aULL,
- 0x800000008000000aULL,
- 0x8000000080008081ULL,
- 0x8000000000008080ULL,
- 0x0000000080000001ULL,
- 0x8000000080008008ULL};
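- /*
- Note: these are the 24 standard Keccak-f[1600] iota round constants. rounds24 below uses
- all of them; rounds12 uses only indices 12..23, i.e. the last twelve rounds, which is the
- Keccak-p[1600,12] permutation.
- */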
- #define copyFromState(X, state) \
- X##ba = LOAD256(state[ 0]); \
- X##be = LOAD256(state[ 1]); \
- X##bi = LOAD256(state[ 2]); \
- X##bo = LOAD256(state[ 3]); \
- X##bu = LOAD256(state[ 4]); \
- X##ga = LOAD256(state[ 5]); \
- X##ge = LOAD256(state[ 6]); \
- X##gi = LOAD256(state[ 7]); \
- X##go = LOAD256(state[ 8]); \
- X##gu = LOAD256(state[ 9]); \
- X##ka = LOAD256(state[10]); \
- X##ke = LOAD256(state[11]); \
- X##ki = LOAD256(state[12]); \
- X##ko = LOAD256(state[13]); \
- X##ku = LOAD256(state[14]); \
- X##ma = LOAD256(state[15]); \
- X##me = LOAD256(state[16]); \
- X##mi = LOAD256(state[17]); \
- X##mo = LOAD256(state[18]); \
- X##mu = LOAD256(state[19]); \
- X##sa = LOAD256(state[20]); \
- X##se = LOAD256(state[21]); \
- X##si = LOAD256(state[22]); \
- X##so = LOAD256(state[23]); \
- X##su = LOAD256(state[24]);
- #define copyToState(state, X) \
- STORE256(state[ 0], X##ba); \
- STORE256(state[ 1], X##be); \
- STORE256(state[ 2], X##bi); \
- STORE256(state[ 3], X##bo); \
- STORE256(state[ 4], X##bu); \
- STORE256(state[ 5], X##ga); \
- STORE256(state[ 6], X##ge); \
- STORE256(state[ 7], X##gi); \
- STORE256(state[ 8], X##go); \
- STORE256(state[ 9], X##gu); \
- STORE256(state[10], X##ka); \
- STORE256(state[11], X##ke); \
- STORE256(state[12], X##ki); \
- STORE256(state[13], X##ko); \
- STORE256(state[14], X##ku); \
- STORE256(state[15], X##ma); \
- STORE256(state[16], X##me); \
- STORE256(state[17], X##mi); \
- STORE256(state[18], X##mo); \
- STORE256(state[19], X##mu); \
- STORE256(state[20], X##sa); \
- STORE256(state[21], X##se); \
- STORE256(state[22], X##si); \
- STORE256(state[23], X##so); \
- STORE256(state[24], X##su);
- #define copyStateVariables(X, Y) \
- X##ba = Y##ba; \
- X##be = Y##be; \
- X##bi = Y##bi; \
- X##bo = Y##bo; \
- X##bu = Y##bu; \
- X##ga = Y##ga; \
- X##ge = Y##ge; \
- X##gi = Y##gi; \
- X##go = Y##go; \
- X##gu = Y##gu; \
- X##ka = Y##ka; \
- X##ke = Y##ke; \
- X##ki = Y##ki; \
- X##ko = Y##ko; \
- X##ku = Y##ku; \
- X##ma = Y##ma; \
- X##me = Y##me; \
- X##mi = Y##mi; \
- X##mo = Y##mo; \
- X##mu = Y##mu; \
- X##sa = Y##sa; \
- X##se = Y##se; \
- X##si = Y##si; \
- X##so = Y##so; \
- X##su = Y##su;
- #ifdef KeccakP1600times4_fullUnrolling
- #define FullUnrolling
- #else
- #define Unrolling KeccakP1600times4_unrolling
- #endif
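- /*
- Note (assumption): KeccakP1600times4_fullUnrolling and KeccakP1600times4_unrolling are
- expected to come from the configuration header included above
- (KeccakP-SIMD256-config_avx2.h); they control how far the round loops in rounds24 and
- rounds12 below are unrolled.
- */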
- // The unrolling macro file is inlined directly into this source file below
- /*****#include "KeccakP-1600-unrolling_avx2.macros"*****/
- /*******************************************************/
- /*
- Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
- Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
- denoted as "the implementer".
- For more information, feedback or questions, please refer to our websites:
- http://keccak.noekeon.org/
- http://keyak.noekeon.org/
- http://ketje.noekeon.org/
- To the extent possible under law, the implementer has waived all copyright
- and related or neighboring rights to the source code in this file.
- http://creativecommons.org/publicdomain/zero/1.0/
- */
- #if (defined(FullUnrolling))
- #define rounds24 \
- prepareTheta \
- thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
- thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
- thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
- thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
- thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
- thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
- thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
- thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
- thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
- thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
- thetaRhoPiChiIotaPrepareTheta(10, A, E) \
- thetaRhoPiChiIotaPrepareTheta(11, E, A) \
- thetaRhoPiChiIotaPrepareTheta(12, A, E) \
- thetaRhoPiChiIotaPrepareTheta(13, E, A) \
- thetaRhoPiChiIotaPrepareTheta(14, A, E) \
- thetaRhoPiChiIotaPrepareTheta(15, E, A) \
- thetaRhoPiChiIotaPrepareTheta(16, A, E) \
- thetaRhoPiChiIotaPrepareTheta(17, E, A) \
- thetaRhoPiChiIotaPrepareTheta(18, A, E) \
- thetaRhoPiChiIotaPrepareTheta(19, E, A) \
- thetaRhoPiChiIotaPrepareTheta(20, A, E) \
- thetaRhoPiChiIotaPrepareTheta(21, E, A) \
- thetaRhoPiChiIotaPrepareTheta(22, A, E) \
- thetaRhoPiChiIota(23, E, A)
- #define rounds12 \
- prepareTheta \
- thetaRhoPiChiIotaPrepareTheta(12, A, E) \
- thetaRhoPiChiIotaPrepareTheta(13, E, A) \
- thetaRhoPiChiIotaPrepareTheta(14, A, E) \
- thetaRhoPiChiIotaPrepareTheta(15, E, A) \
- thetaRhoPiChiIotaPrepareTheta(16, A, E) \
- thetaRhoPiChiIotaPrepareTheta(17, E, A) \
- thetaRhoPiChiIotaPrepareTheta(18, A, E) \
- thetaRhoPiChiIotaPrepareTheta(19, E, A) \
- thetaRhoPiChiIotaPrepareTheta(20, A, E) \
- thetaRhoPiChiIotaPrepareTheta(21, E, A) \
- thetaRhoPiChiIotaPrepareTheta(22, A, E) \
- thetaRhoPiChiIota(23, E, A)
- #elif (Unrolling == 12)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i+=12) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
- }
- #define rounds12 \
- prepareTheta \
- thetaRhoPiChiIotaPrepareTheta(12, A, E) \
- thetaRhoPiChiIotaPrepareTheta(13, E, A) \
- thetaRhoPiChiIotaPrepareTheta(14, A, E) \
- thetaRhoPiChiIotaPrepareTheta(15, E, A) \
- thetaRhoPiChiIotaPrepareTheta(16, A, E) \
- thetaRhoPiChiIotaPrepareTheta(17, E, A) \
- thetaRhoPiChiIotaPrepareTheta(18, A, E) \
- thetaRhoPiChiIotaPrepareTheta(19, E, A) \
- thetaRhoPiChiIotaPrepareTheta(20, A, E) \
- thetaRhoPiChiIotaPrepareTheta(21, E, A) \
- thetaRhoPiChiIotaPrepareTheta(22, A, E) \
- thetaRhoPiChiIota(23, E, A)
- #elif (Unrolling == 6)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i+=6) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
- }
- #define rounds12 \
- prepareTheta \
- for(i=12; i<24; i+=6) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
- }
- #elif (Unrolling == 4)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i+=4) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
- }
- #define rounds12 \
- prepareTheta \
- for(i=12; i<24; i+=4) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
- }
- #elif (Unrolling == 3)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i+=3) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- copyStateVariables(A, E) \
- }
- #define rounds12 \
- prepareTheta \
- for(i=12; i<24; i+=3) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
- copyStateVariables(A, E) \
- }
- #elif (Unrolling == 2)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i+=2) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- }
- #define rounds12 \
- prepareTheta \
- for(i=12; i<24; i+=2) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- }
- #elif (Unrolling == 1)
- #define rounds24 \
- prepareTheta \
- for(i=0; i<24; i++) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- copyStateVariables(A, E) \
- }
- #define rounds12 \
- prepareTheta \
- for(i=12; i<24; i++) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- copyStateVariables(A, E) \
- }
- #else
- #error "Unrolling is not correctly specified!"
- #endif
- #define roundsN(__nrounds) \
- prepareTheta \
- i = 24 - (__nrounds); \
- if ((i&1) != 0) { \
- thetaRhoPiChiIotaPrepareTheta(i, A, E) \
- copyStateVariables(A, E) \
- ++i; \
- } \
- for( /* empty */; i<24; i+=2) { \
- thetaRhoPiChiIotaPrepareTheta(i , A, E) \
- thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
- }
- /*******************************************************/
- void KeccakP1600times4_PermuteAll_24rounds(void *states)
- {
- V256 *statesAsLanes = (V256 *)states;
- declareABCDE
- #ifndef KeccakP1600times4_fullUnrolling
- unsigned int i;
- #endif
- copyFromState(A, statesAsLanes)
- rounds24
- copyToState(statesAsLanes, A)
- }
- void KeccakP1600times4_PermuteAll_12rounds(void *states)
- {
- V256 *statesAsLanes = (V256 *)states;
- declareABCDE
- #ifndef KeccakP1600times4_fullUnrolling
- unsigned int i;
- #endif
- copyFromState(A, statesAsLanes)
- rounds12
- copyToState(statesAsLanes, A)
- }
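- /*
- Note: the two FastLoop_Absorb routines below absorb as many full blocks as fit in
- dataByteLen into the four states (24-round and 12-round variants respectively) and return
- the number of bytes consumed from the first data stream. The laneCount == 21 special case
- (the 168-byte SHAKE128 rate) keeps the whole state in AVX2 registers across consecutive
- blocks instead of storing and reloading it, which is the point of the fast path.
- */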
- size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
- {
- if (laneCount == 21) {
- #if 0
- const unsigned char *dataStart = data;
- const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- V256 *stateAsLanes = (V256 *)states;
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- #define Xor_In( argIndex ) \
- XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- #define Xor_In4( argIndex ) \
- lanes0 = LOAD256u( curData0[argIndex]),\
- lanes1 = LOAD256u( curData1[argIndex]),\
- lanes2 = LOAD256u( curData2[argIndex]),\
- lanes3 = LOAD256u( curData3[argIndex]),\
- INTLEAVE(),\
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
- Xor_In4( 0 );
- Xor_In4( 4 );
- Xor_In4( 8 );
- Xor_In4( 12 );
- Xor_In4( 16 );
- Xor_In( 20 );
- #undef Xor_In
- #undef Xor_In4
- KeccakP1600times4_PermuteAll_24rounds(states);
- curData0 += laneOffsetSerial;
- curData1 += laneOffsetSerial;
- curData2 += laneOffsetSerial;
- curData3 += laneOffsetSerial;
- dataByteLen -= laneOffsetSerial*8;
- }
- return (const unsigned char *)curData0 - dataStart;
- #else
- // unsigned int i;
- const unsigned char *dataStart = data;
- // correcting cast-align errors
- // old version: const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData0 = (const void *)data;
- // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- V256 *statesAsLanes = (V256 *)states;
- declareABCDE
- copyFromState(A, statesAsLanes)
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- #define XOR_In( Xxx, argIndex ) \
- XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- XOR_In( Aba, 0 );
- XOR_In( Abe, 1 );
- XOR_In( Abi, 2 );
- XOR_In( Abo, 3 );
- XOR_In( Abu, 4 );
- XOR_In( Aga, 5 );
- XOR_In( Age, 6 );
- XOR_In( Agi, 7 );
- XOR_In( Ago, 8 );
- XOR_In( Agu, 9 );
- XOR_In( Aka, 10 );
- XOR_In( Ake, 11 );
- XOR_In( Aki, 12 );
- XOR_In( Ako, 13 );
- XOR_In( Aku, 14 );
- XOR_In( Ama, 15 );
- XOR_In( Ame, 16 );
- XOR_In( Ami, 17 );
- XOR_In( Amo, 18 );
- XOR_In( Amu, 19 );
- XOR_In( Asa, 20 );
- #undef XOR_In
- rounds24
- curData0 += laneOffsetSerial;
- curData1 += laneOffsetSerial;
- curData2 += laneOffsetSerial;
- curData3 += laneOffsetSerial;
- dataByteLen -= laneOffsetSerial*8;
- }
- copyToState(statesAsLanes, A)
- return (const unsigned char *)curData0 - dataStart;
- #endif
- }
- else {
- // unsigned int i;
- const unsigned char *dataStart = data;
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
- KeccakP1600times4_PermuteAll_24rounds(states);
- data += laneOffsetSerial*8;
- dataByteLen -= laneOffsetSerial*8;
- }
- return data - dataStart;
- }
- }
- size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
- {
- if (laneCount == 21) {
- #if 0
- const unsigned char *dataStart = data;
- const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- V256 *stateAsLanes = states;
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
- #define Xor_In( argIndex ) \
- XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- #define Xor_In4( argIndex ) \
- lanes0 = LOAD256u( curData0[argIndex]),\
- lanes1 = LOAD256u( curData1[argIndex]),\
- lanes2 = LOAD256u( curData2[argIndex]),\
- lanes3 = LOAD256u( curData3[argIndex]),\
- INTLEAVE(),\
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
- Xor_In4( 0 );
- Xor_In4( 4 );
- Xor_In4( 8 );
- Xor_In4( 12 );
- Xor_In4( 16 );
- Xor_In( 20 );
- #undef Xor_In
- #undef Xor_In4
- KeccakP1600times4_PermuteAll_12rounds(states);
- curData0 += laneOffsetSerial;
- curData1 += laneOffsetSerial;
- curData2 += laneOffsetSerial;
- curData3 += laneOffsetSerial;
- dataByteLen -= laneOffsetSerial*8;
- }
- return (const unsigned char *)curData0 - dataStart;
- #else
- // unsigned int i;
- const unsigned char *dataStart = data;
- // correcting cast-align errors
- // old version: const UINT64 *curData0 = (const UINT64 *)data;
- const UINT64 *curData0 = (const void *)data;
- // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
- // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
- V256 *statesAsLanes = states;
- declareABCDE
- copyFromState(A, statesAsLanes)
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- #define XOR_In( Xxx, argIndex ) \
- XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
- XOR_In( Aba, 0 );
- XOR_In( Abe, 1 );
- XOR_In( Abi, 2 );
- XOR_In( Abo, 3 );
- XOR_In( Abu, 4 );
- XOR_In( Aga, 5 );
- XOR_In( Age, 6 );
- XOR_In( Agi, 7 );
- XOR_In( Ago, 8 );
- XOR_In( Agu, 9 );
- XOR_In( Aka, 10 );
- XOR_In( Ake, 11 );
- XOR_In( Aki, 12 );
- XOR_In( Ako, 13 );
- XOR_In( Aku, 14 );
- XOR_In( Ama, 15 );
- XOR_In( Ame, 16 );
- XOR_In( Ami, 17 );
- XOR_In( Amo, 18 );
- XOR_In( Amu, 19 );
- XOR_In( Asa, 20 );
- #undef XOR_In
- rounds12
- curData0 += laneOffsetSerial;
- curData1 += laneOffsetSerial;
- curData2 += laneOffsetSerial;
- curData3 += laneOffsetSerial;
- dataByteLen -= laneOffsetSerial*8;
- }
- copyToState(statesAsLanes, A)
- return (const unsigned char *)curData0 - dataStart;
- #endif
- }
- else {
- // unsigned int i;
- const unsigned char *dataStart = data;
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
- KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
- KeccakP1600times4_PermuteAll_12rounds(states);
- data += laneOffsetSerial*8;
- dataByteLen -= laneOffsetSerial*8;
- }
- return data - dataStart;
- }
- }
- #endif