#include "shavite-hash-2way.h" #include "algo/sha/sph_types.h" #include // This is a fake, it actually does not do parallel AES, that requires VAES. // This is only intended when the preceding and folllowing functions use the // same 2x128 interleave. #if defined(__AVX2__) static const uint32_t IV512[] = { 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; #define mm256_ror2x256hi_1x32( a, b ) \ _mm256_blend_epi32( mm256_shuflr128_32( a ), \ mm256_shuflr128_32( b ), 0x88 ) #if defined(__VAES__) #define mm256_aesenc_2x128( x, k ) \ _mm256_aesenc_epi128( x, _mm256_castsi128_si256( k ) ) #else #define mm256_aesenc_2x128( x, k ) \ mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \ _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) ) #endif static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { const __m128i zero = _mm_setzero_si128(); __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; __m256i *h = (__m256i*)ctx->h; int r; p0 = h[0]; p1 = h[1]; p2 = h[2]; p3 = h[3]; // round k00 = m[0]; x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero ); k01 = m[1]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = m[2]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = m[3]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p0 = _mm256_xor_si256( p0, x ); k10 = m[4]; x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero ); k11 = m[5]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = m[6]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = m[7]; x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); p2 = _mm256_xor_si256( p2, x ); for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) k00 = _mm256_xor_si256( k00, _mm256_set_epi32( ~ctx->count3, ctx->count2, ctx->count1, ctx->count0, ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( ~ctx->count0, ctx->count1, ctx->count2, ctx->count3, ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( ~ctx->count1, ctx->count0, ctx->count3, ctx->count2, ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) 

static void c512_2way( shavite512_2way_context *ctx, const void *msg )
{
   const __m128i zero = _mm_setzero_si128();
   __m256i p0, p1, p2, p3, x;
   __m256i k00, k01, k02, k03, k10, k11, k12, k13;
   __m256i *m = (__m256i*)msg;
   __m256i *h = (__m256i*)ctx->h;
   int r;

   p0 = h[0];
   p1 = h[1];
   p2 = h[2];
   p3 = h[3];

   // round
   k00 = m[0];
   x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
   k01 = m[1];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
   k02 = m[2];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
   k03 = m[3];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

   p0 = _mm256_xor_si256( p0, x );

   k10 = m[4];
   x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
   k11 = m[5];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
   k12 = m[6];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
   k13 = m[7];
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

   p2 = _mm256_xor_si256( p2, x );

   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
      k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k00, zero ) ) );

      if ( r == 0 )
         k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
                   ~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
                   ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
      k01 = _mm256_xor_si256( k00, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k01, zero ) ) );

      if ( r == 1 )
         k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
                   ~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
                   ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k01, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k02, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k02, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k03, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p3 = _mm256_xor_si256( p3, x );

      k10 = _mm256_xor_si256( k03, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k10, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
      k11 = _mm256_xor_si256( k10, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k11, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k11, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k12, zero ) ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k12, mm256_shuflr128_32(
                                   mm256_aesenc_2x128( k13, zero ) ) );

      if ( r == 2 )
         k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
                   ~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
                   ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );

      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p1 = _mm256_xor_si256( p1, x );

      // round 2, 6, 10
      k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
      k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p2 = _mm256_xor_si256( p2, x );

      k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
      k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p0 = _mm256_xor_si256( p0, x );

      // round 3, 7, 11
      k00 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k00, zero ) ), k13 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
      k01 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k01, zero ) ), k00 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k02, zero ) ), k01 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k03, zero ) ), k02 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p1 = _mm256_xor_si256( p1, x );

      k10 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k10, zero ) ), k03 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
      k11 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k11, zero ) ), k10 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k12, zero ) ), k11 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( mm256_shuflr128_32(
                              mm256_aesenc_2x128( k13, zero ) ), k12 );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p3 = _mm256_xor_si256( p3, x );

      // round 4, 8, 12
      k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
      k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
      k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
      k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

      p0 = _mm256_xor_si256( p0, x );

      k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
      k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
      k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
      k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
      x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

      p2 = _mm256_xor_si256( p2, x );
   }

   // round 13
   k00 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k00, zero ) ), k13 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
   k01 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k01, zero ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
   k02 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k02, zero ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
   k03 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k03, zero ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

   p3 = _mm256_xor_si256( p3, x );

   k10 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k10, zero ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
   k11 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k11, zero ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
   k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
                 ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
                 ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
   k13 = _mm256_xor_si256( mm256_shuflr128_32(
                           mm256_aesenc_2x128( k13, zero ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

   p1 = _mm256_xor_si256( p1, x );

   h[0] = _mm256_xor_si256( h[0], p2 );
   h[1] = _mm256_xor_si256( h[1], p3 );
   h[2] = _mm256_xor_si256( h[2], p0 );
   h[3] = _mm256_xor_si256( h[3], p1 );
}
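
// Data layout note: every __m256i used by c512_2way holds the same 128-bit
// word of two independent SHAvite-512 states, one stream in the low 128 bits
// and the other in the high 128 bits, so h[0..3] is a 2x128-interleaved pair
// of 512-bit chaining values and m[0..7] a pair of 1024-bit message blocks.
// A caller could build such vectors from two separate streams roughly as
// sketched below (hypothetical helper, plain intrinsics; the stream-to-lane
// mapping only has to be consistent with the rest of the 2x128 code).
#if 0
static inline __m256i gather_2x128( const void *stream0, const void *stream1,
                                    int i )
{
   return _mm256_set_m128i(
            _mm_loadu_si128( (const __m128i*)stream1 + i ),    // high lane
            _mm_loadu_si128( (const __m128i*)stream0 + i ) );  // low lane
}
#endif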

void shavite512_2way_init( shavite512_2way_context *ctx )
{
   __m256i *h = (__m256i*)ctx->h;
   __m128i *iv = (__m128i*)IV512;

   h[0] = m256_const1_128( iv[0] );
   h[1] = m256_const1_128( iv[1] );
   h[2] = m256_const1_128( iv[2] );
   h[3] = m256_const1_128( iv[3] );

   ctx->ptr    = 0;
   ctx->count0 = 0;
   ctx->count1 = 0;
   ctx->count2 = 0;
   ctx->count3 = 0;
}

// not tested, use update_close
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
                             size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 1 )
         clen = len << 1;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= clen >> 1;
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_2way( ctx, buf );
         ptr = 0;
      }
   }
   ctx->ptr = ptr;
}
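
// A note on the bookkeeping above: buf holds 256 interleaved bytes, i.e. one
// 128-byte block per lane, so clen counts interleaved bytes while len counts
// bytes per lane (hence the << 1 / >> 1 conversions). count0..count3 form a
// 128-bit counter of message bits per lane, bumped by 1024 for every full
// block. The same carry chain is repeated in update_close() and full() below;
// a hypothetical refactor (sketch only, not compiled) would be:
#if 0
static inline void shavite512_2way_count_block( shavite512_2way_context *ctx )
{
   if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )  // 1024 bits per lane block
      if ( ++ctx->count1 == 0 )
         if ( ++ctx->count2 == 0 )
            ctx->count3++;
}
#endif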

// not tested
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
{
   unsigned char *buf;
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   buf = ctx->buf;
   uint32_t vp = ctx->ptr>>5;

   // Terminating byte then zero pad
   casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );

   // Zero pad full vectors up to count
   for ( ; vp < 6; vp++ )
      casti_m256i( buf, vp ) = m256_zero;

   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
   // Count is misaligned to 16 bits and straddles a vector.
   // Use u32 overlay to stage then u16 to load buf.
   count.u32[0] = ctx->count0 += (ctx->ptr << 2);   // ptr/2 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   casti_m256i( buf, 6 ) = m256_const1_128(
            _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
            0x0200,       count.u16[7], count.u16[6], count.u16[5],
            count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

   c512_2way( ctx, buf );

   casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
   casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
   casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
   casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}

void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
                                   const void *data, size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   // process full blocks and load buf with remainder.
   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 1 )
         clen = len << 1;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= (clen >> 1);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_2way( ctx, buf );
         ptr = 0;
      }
   }

   uint32_t vp = ptr>>5;

   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   count.u32[0] = ctx->count0 += (ptr << 2);   // ptr/2 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   if ( vp == 0 )    // empty buf, xevan.
   {
      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
      casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }

   casti_m256i( buf, 6 ) = m256_const1_128(
            _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
            0x0200,       count.u16[7], count.u16[6], count.u16[5],
            count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

   c512_2way( ctx, buf );

   casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
   casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
   casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
   casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
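
// Final-block layout per 128-byte lane, as assembled by close() and
// update_close() above and by full() below (byte offsets derived from the
// vector stores; this follows the SHAvite-3 512 padding rule):
//
//   lane bytes   0 .. n-1 : message remainder, n = ptr/2
//   lane byte    n        : 0x80 terminator
//   lane bytes n+1 .. 109 : zero padding
//   lane bytes 110 .. 125 : 128-bit message bit count, count0..count3
//   lane bytes 126 .. 127 : digest size in bits, 0x0200 = 512
//
// The terminator vector is written at the first unused 16-byte chunk; for the
// per-lane input sizes noted in the comments ( 0, 16, 64, 80 bytes ) the
// remainder is a whole number of chunks, so it lands directly after the data.
// The bit count starts 16 bits before a chunk boundary, which is why it is
// staged through the u32/u16 union instead of being stored as a vector.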

void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
                           const void *data, size_t len )
{
   __m256i *h = (__m256i*)ctx->h;
   __m128i *iv = (__m128i*)IV512;

   h[0] = m256_const1_128( iv[0] );
   h[1] = m256_const1_128( iv[1] );
   h[2] = m256_const1_128( iv[2] );
   h[3] = m256_const1_128( iv[3] );

   ctx->ptr = ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;

   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   // process full blocks and load buf with remainder.
   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 1 )
         clen = len << 1;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= (clen >> 1);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_2way( ctx, buf );
         ptr = 0;
      }
   }

   uint32_t vp = ptr>>5;

   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   count.u32[0] = ctx->count0 += (ptr << 2);   // ptr/2 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   if ( vp == 0 )    // empty buf, xevan.
   {
      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
      casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }

   casti_m256i( buf, 6 ) = m256_const1_128(
            _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
            0x0200,       count.u16[7], count.u16[6], count.u16[5],
            count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

   c512_2way( ctx, buf );

   casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
   casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
   casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
   casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}

#endif // AVX2
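
#if 0
// Illustrative usage sketch, not part of the build. It assumes the caller has
// already interleaved two input streams into 2x128 form (e.g. with the
// intrlv_2x128()/dintrlv_2x128() helpers elsewhere in this tree; names
// assumed) and wants both 64-byte digests back in the same interleaved form.
// 80 bytes per lane is just an example length.
static void shavite512_2way_example( void *hash_2x128, const void *data_2x128 )
{
   shavite512_2way_context ctx;

   // One shot: init, absorb 80 bytes per lane, pad and close.
   shavite512_2way_full( &ctx, hash_2x128, data_2x128, 80 );

   // Equivalent incremental form:
   shavite512_2way_init( &ctx );
   shavite512_2way_update_close( &ctx, hash_2x128, data_2x128, 80 );
}
#endif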