diff --git a/blake2-simd.cpp b/blake2-simd.cpp index 32d81cab..f1297a72 100644 --- a/blake2-simd.cpp +++ b/blake2-simd.cpp @@ -28,6 +28,19 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif +// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008. +// Also see http://stackoverflow.com/a/38547909/608639 +#if CRYPTOPP_SSE2_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) +/* Fallback for compilers without _mm_set_epi64x: copies the pair {b, a} into a __m128i with memcpy, so b becomes the first 64-bit element in memory and a the second. The other preprocessor branch maps the same name straight onto _mm_set_epi64x(a, b). */ inline __m128i MM_SET_EPI64X(const word64 a, const word64 b) +{ + const word64 t[2] = {b,a}; __m128i r; + memcpy(&r, &t, sizeof(t)); + return r; +} +#else +# define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b) +#endif + NAMESPACE_BEGIN(CryptoPP) ANONYMOUS_NAMESPACE_BEGIN @@ -2161,4 +2174,1318 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state } #endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#if CRYPTOPP_SSE2_AVAILABLE +/* Compresses one message block into state using SSE2 intrinsics on 32-bit lanes. input: block bytes, read through GetBlock as sixteen word32 values. state: read via 128-bit loads at state.h[0], state.h[4] and state.t[0]; updated in place via 128-bit stores at state.h[0] and state.h[4]. The body is ten fully unrolled rounds, each made of a column step and a diagonal step that differ only in which message words they add. */ void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state) +{ + /* Message words m0..m15 of this block. */ word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; + GetBlock get(input); + get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); + + __m128i row1,row2,row3,row4; + __m128i buf1,buf2,buf3,buf4; + __m128i ff0,ff1; + + /* row1 and row2 start from the 128-bit values at state.h[0] and state.h[4]; ff0 and ff1 keep copies for the final feed-forward XOR. */ row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); + row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); + /* row3 is BLAKE2S_IV[0..3]; row4 is BLAKE2S_IV[4..7] XORed with the 128 bits starting at state.t[0] (NOTE(review): presumably the counter and finalization words; confirm against the BLAKE2_State layout). */ row3 = _mm_setr_epi32(BLAKE2S_IV[0],BLAKE2S_IV[1],BLAKE2S_IV[2],BLAKE2S_IV[3]); + row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4],BLAKE2S_IV[5],BLAKE2S_IV[6],BLAKE2S_IV[7]),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); + /* Round 1, column step, first half: add four message words and row2 into row1, XOR row1 into row4 and rotate each lane by 16, add row4 into row3, XOR row3 into row2 and rotate by 12. Each rotate is built as srli XOR slli with counts that sum to 32. */ buf1 = _mm_set_epi32(m6,m4,m2,m0); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = 
_mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + /* Round 1, column step, second half: same pattern with rotate counts 8 and 7. */ buf2 = _mm_set_epi32(m7,m5,m3,m1); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + /* Rotate the lanes of row2, row3 and row4 by one, two and three positions so the lane-wise operations below combine the diagonals of the 4x4 state. */ row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + /* Round 1, diagonal step: identical arithmetic to the column step, using the remaining eight message words. */ buf3 = _mm_set_epi32(m14,m12,m10,m8); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m15,m13,m11,m9); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + /* Inverse lane rotation: put row2, row3 and row4 back into their original lane order. */ row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 2: same structure as round 1; only the order of the message words changes. */ buf1 = _mm_set_epi32(m13,m9,m4,m14); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m6,m15,m8,m10); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = 
_mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m5,m11,m0,m1); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m3,m7,m2,m12); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 3. */ buf1 = _mm_set_epi32(m15,m5,m12,m11); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m13,m2,m0,m8); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = 
_mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m9,m7,m3,m10); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m4,m1,m6,m14); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 4. */ buf1 = _mm_set_epi32(m11,m13,m3,m7); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m14,m12,m1,m9); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m15,m4,m5,m2); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); 
+ row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m8,m0,m10,m6); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 5. */ buf1 = _mm_set_epi32(m10,m2,m5,m9); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m15,m4,m7,m0); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m3,m6,m11,m14); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m13,m8,m12,m1); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = 
_mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 6. */ buf1 = _mm_set_epi32(m8,m0,m6,m2); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m3,m11,m10,m12); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m1,m15,m7,m4); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m9,m14,m5,m13); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = 
_mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 7. */ buf1 = _mm_set_epi32(m4,m14,m1,m12); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m10,m13,m15,m5); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m8,m9,m6,m0); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m11,m2,m3,m7); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 8. */ buf1 = _mm_set_epi32(m3,m12,m7,m13); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = 
_mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m9,m1,m14,m11); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m2,m8,m15,m5); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m10,m6,m4,m0); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 9. */ buf1 = _mm_set_epi32(m0,m11,m14,m6); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m8,m3,m9,m15); + row1 = 
_mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m10,m1,m13,m12); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m5,m4,m7,m2); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Round 10 (last). */ buf1 = _mm_set_epi32(m1,m7,m8,m10); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf2 = _mm_set_epi32(m5,m6,m4,m2); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = 
_mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_set_epi32(m13,m3,m9,m15); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); + + buf4 = _mm_set_epi32(m0,m12,m14,m11); + row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); + row4 = _mm_xor_si128(row4,row1); + row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); + row3 = _mm_add_epi32(row3,row4); + row2 = _mm_xor_si128(row2,row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); + + row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); + + /* Feed-forward: the value stored at state.h[0] is ff0 XOR row1 XOR row3, and the value stored at state.h[4] is ff1 XOR row2 XOR row4. */ _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3))); + _mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4))); +} + +# if (__SUNPRO_CC != 0x5120) +void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state) +{ + word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; + GetBlock get(input); + get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); + + __m128i row1l, row1h, row2l, row2h; + __m128i row3l, row3h, row4l, row4h; + __m128i b0, b1, t0, t1; + + row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); + row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])); + row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); + row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])); + row3l = _mm_loadu_si128((const 
__m128i*)(const void*)(&BLAKE2B_IV[0])); + row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2])); + row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); + row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]))); + + b0 = MM_SET_EPI64X(m2, m0); + b1 = MM_SET_EPI64X(m6, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40)); + + b0 = MM_SET_EPI64X(m3, m1); + b1 = MM_SET_EPI64X(m7, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = 
_mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m10, m8); + b1 = MM_SET_EPI64X(m14, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m11, m9); + b1 = MM_SET_EPI64X(m15, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m4, 
m14); + b1 = MM_SET_EPI64X(m13, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m8, m10); + b1 = MM_SET_EPI64X(m6, m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = MM_SET_EPI64X(m0, m1); + b1 = MM_SET_EPI64X(m5, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, 
row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m2, m12); + b1 = MM_SET_EPI64X(m3, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m12, m11); + b1 = MM_SET_EPI64X(m15, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = 
_mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m0, m8); + b1 = MM_SET_EPI64X(m13, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = MM_SET_EPI64X(m3, m10); + b1 = MM_SET_EPI64X(m9, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = 
_mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m6, m14); + b1 = MM_SET_EPI64X(m4, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m3, m7); + b1 = MM_SET_EPI64X(m11, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m1, m9); + b1 = MM_SET_EPI64X(m14, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = 
_mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = MM_SET_EPI64X(m5, m2); + b1 = MM_SET_EPI64X(m15, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m10, m6); + b1 = MM_SET_EPI64X(m8, m0); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = 
_mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m5, m9); + b1 = MM_SET_EPI64X(m10, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m7, m0); + b1 = MM_SET_EPI64X(m15, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + 
row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m11, m14); + b1 = MM_SET_EPI64X(m3, m6); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = MM_SET_EPI64X(m12, m1); + b1 = MM_SET_EPI64X(m13, m8); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + 
row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m6, m2); + b1 = MM_SET_EPI64X(m8, m0); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m10, m12); + b1 = MM_SET_EPI64X(m3, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = 
_mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m7, m4); + b1 = MM_SET_EPI64X(m1, m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m5, m13); + b1 = MM_SET_EPI64X(m9, m14); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m1, m12); + b1 = MM_SET_EPI64X(m4, m14); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, 
b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m15, m5); + b1 = MM_SET_EPI64X(m10, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m6, m0); + b1 = MM_SET_EPI64X(m8, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = 
_mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m3, m7); + b1 = MM_SET_EPI64X(m11, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m7, m13); + b1 = MM_SET_EPI64X(m3, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + 
row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m14, m11); + b1 = MM_SET_EPI64X(m9, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m15, m5); + b1 = MM_SET_EPI64X(m2, m8); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m4, m0); + b1 = MM_SET_EPI64X(m10, m6); + row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m14, m6); + b1 = MM_SET_EPI64X(m0, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m9, m15); + b1 = MM_SET_EPI64X(m8, m3); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = 
_mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m13, m12); + b1 = MM_SET_EPI64X(m10, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m7, m2); + b1 = MM_SET_EPI64X(m5, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + 
row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m8, m10); + b1 = MM_SET_EPI64X(m1, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m4, m2); + b1 = MM_SET_EPI64X(m5, m6); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); 
+ + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m9, m15); + b1 = MM_SET_EPI64X(m13, m3); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m14, m11); + b1 = MM_SET_EPI64X(m0, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = 
_mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m2, m0); + b1 = MM_SET_EPI64X(m6, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m3, m1); + b1 = MM_SET_EPI64X(m7, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m10, m8); + b1 = MM_SET_EPI64X(m14, m12); + row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m11, m9); + b1 = MM_SET_EPI64X(m15, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m4, m14); + b1 = MM_SET_EPI64X(m13, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = 
_mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m8, m10); + b1 = MM_SET_EPI64X(m6, m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + + b0 = MM_SET_EPI64X(m0, m1); + b1 = MM_SET_EPI64X(m5, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + 
row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + b0 = MM_SET_EPI64X(m2, m12); + b1 = MM_SET_EPI64X(m3, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h)); + + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h)); +} +# 
endif // (__SUNPRO_CC != 0x5120) +#endif // CRYPTOPP_SSE2_AVAILABLE + NAMESPACE_END \ No newline at end of file diff --git a/blake2.cpp b/blake2.cpp index 16442928..d0fbf2e8 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -22,32 +22,15 @@ NAMESPACE_BEGIN(CryptoPP) # undef CRYPTOPP_SSE42_AVAILABLE #endif -#if (CRYPTOPP_SSE42_AVAILABLE) -# include "nmmintrin.h" -#endif - -// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008. -// Also see http://stackoverflow.com/a/38547909/608639 -#if CRYPTOPP_SSE2_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) -inline __m128i MM_SET_EPI64X(const word64 a, const word64 b) -{ - const word64 t[2] = {b,a}; __m128i r; - memcpy(&r, &t, sizeof(t)); - return r; -} -#else -# define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b) -#endif - // C/C++ implementation static void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state); static void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state); // Also see http://github.com/weidai11/cryptopp/issues/247 for SunCC 5.12 #if CRYPTOPP_SSE2_AVAILABLE -static void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state); +extern void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state); # if (__SUNPRO_CC != 0x5120) -static void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state); +extern void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state); # endif #endif @@ -573,1320 +556,6 @@ void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]); } -#if CRYPTOPP_SSE2_AVAILABLE -static void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state) -{ - word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; - GetBlock get(input); - 
get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1,row2,row3,row4; - __m128i buf1,buf2,buf3,buf4; - __m128i ff0,ff1; - - row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); - row3 = _mm_setr_epi32(BLAKE2S_IV[0],BLAKE2S_IV[1],BLAKE2S_IV[2],BLAKE2S_IV[3]); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4],BLAKE2S_IV[5],BLAKE2S_IV[6],BLAKE2S_IV[7]),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); - buf1 = _mm_set_epi32(m6,m4,m2,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m7,m5,m3,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m14,m12,m10,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m15,m13,m11,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = 
_mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m13,m9,m4,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m6,m15,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m5,m11,m0,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m3,m7,m2,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = 
_mm_set_epi32(m15,m5,m12,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m13,m2,m0,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m9,m7,m3,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m4,m1,m6,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m11,m13,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = 
_mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m14,m12,m1,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m15,m4,m5,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m8,m0,m10,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m10,m2,m5,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m15,m4,m7,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = 
_mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m3,m6,m11,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m13,m8,m12,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m8,m0,m6,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m3,m11,m10,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = 
_mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m1,m15,m7,m4); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m9,m14,m5,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m4,m14,m1,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m10,m13,m15,m5); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m8,m9,m6,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = 
_mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m11,m2,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m3,m12,m7,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m9,m1,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m2,m8,m15,m5); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m10,m6,m4,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = 
_mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m0,m11,m14,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m8,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m10,m1,m13,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m5,m4,m7,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = 
_mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m1,m7,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m5,m6,m4,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m13,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m0,m12,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3))); - _mm_storeu_si128((__m128i 
*)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4))); -} - -# if (__SUNPRO_CC != 0x5120) -static void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state) -{ - word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; - GetBlock get(input); - get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1l, row1h, row2l, row2h; - __m128i row3l, row3h, row4l, row4h; - __m128i b0, b1, t0, t1; - - row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])); - row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); - row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])); - row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[0])); - row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2])); - row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]))); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m10, m8); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m11, m9); - b1 = MM_SET_EPI64X(m15, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = 
_mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m4, m14); - b1 = MM_SET_EPI64X(m13, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m6, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - 
row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m0, m1); - b1 = MM_SET_EPI64X(m5, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m2, m12); - b1 = MM_SET_EPI64X(m3, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - 
- t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m12, m11); - b1 = MM_SET_EPI64X(m15, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m0, m8); - b1 = MM_SET_EPI64X(m13, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = 
_mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m3, m10); - b1 = MM_SET_EPI64X(m9, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m6, m14); - b1 = MM_SET_EPI64X(m4, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m13); - row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m1, m9); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m5, m2); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = 
_mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m6); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m5, m9); - b1 = MM_SET_EPI64X(m10, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - 
row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m0); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m11, m14); - b1 = MM_SET_EPI64X(m3, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = 
_mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - - b0 = MM_SET_EPI64X(m12, m1); - b1 = MM_SET_EPI64X(m13, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m2); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m12); - b1 = MM_SET_EPI64X(m3, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = 
_mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m4); - b1 = MM_SET_EPI64X(m1, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m5, m13); - b1 = MM_SET_EPI64X(m9, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = 
_mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m1, m12); - b1 = MM_SET_EPI64X(m4, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m10, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); 
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m0); - b1 = MM_SET_EPI64X(m8, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = 
_mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m13); - b1 = MM_SET_EPI64X(m3, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m9, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = 
_mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m2, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m0); - b1 = MM_SET_EPI64X(m10, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m14, m6); - b1 = MM_SET_EPI64X(m0, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, 
b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m8, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m13, m12); - b1 = MM_SET_EPI64X(m10, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = 
_mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m2); - b1 = MM_SET_EPI64X(m5, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m1, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - 
row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m2); - b1 = MM_SET_EPI64X(m5, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m13, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m0, m12); - row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = 
_mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m10, m8); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m11, m9); - b1 = MM_SET_EPI64X(m15, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - 
row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m4, m14); - b1 = MM_SET_EPI64X(m13, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m6, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = 
_mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m0, m1); - b1 = MM_SET_EPI64X(m5, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m2, m12); - b1 = MM_SET_EPI64X(m3, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = 
_mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - row1l = _mm_xor_si128(row3l, row1l); - row1h = _mm_xor_si128(row3h, row1h); - _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l)); - _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h)); - - row2l = _mm_xor_si128(row4l, row2l); - row2h = _mm_xor_si128(row4h, row2h); - _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l)); - _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h)); -} -# endif // (__SUNPRO_CC != 0x5120) -#endif // CRYPTOPP_SSE2_AVAILABLE - template class BLAKE2_Base; template class BLAKE2_Base;