diff --git a/blake2-simd.cpp b/blake2-simd.cpp index dc3f407a..151be928 100644 --- a/blake2-simd.cpp +++ b/blake2-simd.cpp @@ -11,11 +11,8 @@ #include "misc.h" #include "blake2.h" -#if (CRYPTOPP_SSE2_AVAILABLE) -# include "emmintrin.h" -#endif - #if (CRYPTOPP_SSE42_AVAILABLE) +# include "emmintrin.h" # include "nmmintrin.h" #endif @@ -2178,1318 +2175,4 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state } #endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE -#if CRYPTOPP_SSE2_AVAILABLE -void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state) -{ - word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; - GetBlock get(input); - get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1,row2,row3,row4; - __m128i buf1,buf2,buf3,buf4; - __m128i ff0,ff1; - - row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); - row3 = _mm_setr_epi32(BLAKE2S_IV[0],BLAKE2S_IV[1],BLAKE2S_IV[2],BLAKE2S_IV[3]); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4],BLAKE2S_IV[5],BLAKE2S_IV[6],BLAKE2S_IV[7]),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); - buf1 = _mm_set_epi32(m6,m4,m2,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m7,m5,m3,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m14,m12,m10,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m15,m13,m11,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m13,m9,m4,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m6,m15,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = 
_mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m5,m11,m0,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m3,m7,m2,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m15,m5,m12,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m13,m2,m0,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m9,m7,m3,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m4,m1,m6,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m11,m13,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m14,m12,m1,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = 
_mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m15,m4,m5,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m8,m0,m10,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m10,m2,m5,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m15,m4,m7,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m3,m6,m11,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m13,m8,m12,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m8,m0,m6,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m3,m11,m10,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = 
_mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m1,m15,m7,m4); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m9,m14,m5,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m4,m14,m1,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m10,m13,m15,m5); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m8,m9,m6,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m11,m2,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m3,m12,m7,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m9,m1,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m2,m8,m15,m5); - row1 = 
_mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m10,m6,m4,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m0,m11,m14,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m8,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m10,m1,m13,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m5,m4,m7,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m1,m7,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m5,m6,m4,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m13,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = 
_mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m0,m12,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3))); - _mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4))); -} - -# if (__SUNPRO_CC != 0x5120) -void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state) -{ - word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; - GetBlock get(input); - get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1l, row1h, row2l, row2h; - __m128i row3l, row3h, row4l, row4h; - __m128i b0, b1, t0, t1; - - row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])); - row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); - row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])); - row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[0])); - row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2])); - row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]))); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m10, m8); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m11, m9); - b1 = MM_SET_EPI64X(m15, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m4, m14); - b1 = MM_SET_EPI64X(m13, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m6, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m0, m1); - b1 = MM_SET_EPI64X(m5, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m2, m12); - b1 = MM_SET_EPI64X(m3, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m12, m11); - b1 = MM_SET_EPI64X(m15, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m0, m8); - b1 = MM_SET_EPI64X(m13, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m3, m10); - b1 = MM_SET_EPI64X(m9, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m6, m14); - b1 = MM_SET_EPI64X(m4, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m1, m9); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m5, m2); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m6); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m5, m9); - b1 = MM_SET_EPI64X(m10, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m0); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m11, m14); - b1 = MM_SET_EPI64X(m3, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - - b0 = MM_SET_EPI64X(m12, m1); - b1 = MM_SET_EPI64X(m13, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m2); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m12); - b1 = MM_SET_EPI64X(m3, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m4); - b1 = MM_SET_EPI64X(m1, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m5, m13); - b1 = MM_SET_EPI64X(m9, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m1, m12); - b1 = MM_SET_EPI64X(m4, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m10, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m0); - b1 = MM_SET_EPI64X(m8, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m13); - b1 = MM_SET_EPI64X(m3, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m9, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m2, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m0); - b1 = MM_SET_EPI64X(m10, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m14, m6); - b1 = MM_SET_EPI64X(m0, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m8, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m13, m12); - b1 = MM_SET_EPI64X(m10, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m2); - b1 = MM_SET_EPI64X(m5, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m1, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m2); - b1 = MM_SET_EPI64X(m5, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m13, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m0, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = 
_mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
- row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
- row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
- row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
-
- b0 = MM_SET_EPI64X(m10, m8);
- b1 = MM_SET_EPI64X(m14, m12);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
-
- b0 = MM_SET_EPI64X(m11, m9);
- b1 = MM_SET_EPI64X(m15, m13);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
-
- t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
- row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
- row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
- row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
- row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
-
- b0 = MM_SET_EPI64X(m4, m14);
- b1 = MM_SET_EPI64X(m13, m9);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
-
- b0 = MM_SET_EPI64X(m8, m10);
- b1 = MM_SET_EPI64X(m6, m15);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
-
- t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
- row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
- row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
- row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
- row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
-
- b0 = MM_SET_EPI64X(m0, m1);
- b1 = MM_SET_EPI64X(m5, m11);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
-
- b0 = MM_SET_EPI64X(m2, m12);
- b1 = MM_SET_EPI64X(m3, m7);
- row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
- row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
- row4l = _mm_xor_si128(row4l, row1l);
- row4h = _mm_xor_si128(row4h, row1h);
- row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
- row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
- row3l = _mm_add_epi64(row3l, row4l);
- row3h = _mm_add_epi64(row3h, row4h);
- row2l = _mm_xor_si128(row2l, row3l);
- row2h = _mm_xor_si128(row2h, row3h);
- row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
- row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
-
- t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
- row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
- row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
- row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
- row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
-
- row1l = _mm_xor_si128(row3l, row1l);
- row1h = _mm_xor_si128(row3h, row1h);
- _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
- _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
-
- row2l = _mm_xor_si128(row4l, row2l);
- row2h = _mm_xor_si128(row4h, row2h);
- _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
- _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
-}
-# endif // (__SUNPRO_CC != 0x5120)
-#endif // CRYPTOPP_SSE2_AVAILABLE
-
 NAMESPACE_END
\ No newline at end of file
diff --git a/blake2.cpp b/blake2.cpp
index d0fbf2e8..3e1712eb 100644
--- a/blake2.cpp
+++ b/blake2.cpp
@@ -22,17 +22,8 @@ NAMESPACE_BEGIN(CryptoPP)
 # undef CRYPTOPP_SSE42_AVAILABLE
 #endif
 
-// C/C++ implementation
-static void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state);
-static void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state);
-
-// Also see http://github.com/weidai11/cryptopp/issues/247 for SunCC 5.12
-#if CRYPTOPP_SSE2_AVAILABLE
-extern void BLAKE2_Compress32_SSE2(const byte* input, BLAKE2_State& state);
-# if (__SUNPRO_CC != 0x5120)
-extern void BLAKE2_Compress64_SSE2(const byte* input, BLAKE2_State& state);
-# endif
-#endif
+void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state);
+void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state);
 
 #if CRYPTOPP_SSE42_AVAILABLE
 extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state);
@@ -101,13 +92,6 @@ pfnCompress64 InitializeCompress64Fn()
 return &BLAKE2_Compress64_SSE4;
 else
 #endif
-#if CRYPTOPP_SSE2_AVAILABLE
-# if (__SUNPRO_CC != 0x5120)
- if (HasSSE2())
- return &BLAKE2_Compress64_SSE2;
- else
-# endif
-#endif
 #if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
 if (HasNEON())
 return &BLAKE2_Compress64_NEON;
@@ -123,11 +107,6 @@ pfnCompress32 InitializeCompress32Fn()
 return &BLAKE2_Compress32_SSE4;
 else
 #endif
-#if CRYPTOPP_SSE2_AVAILABLE
- if (HasSSE2())
- return &BLAKE2_Compress32_SSE2;
- else
-#endif
 #if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
 if (HasNEON())
 return &BLAKE2_Compress32_NEON;