diff --git a/blake2.cpp b/blake2.cpp
index b10fa7de..7c1b9b8d 100644
--- a/blake2.cpp
+++ b/blake2.cpp
@@ -1,4 +1,5 @@
-// blake2.cpp - written and placed in the public domain by Jeffrey Walton and Zooko Wilcox-O'Hearn
+// blake2.cpp - written and placed in the public domain by Jeffrey Walton and Zooko Wilcox-O'Hearn.
+// Copyright assigned to the Crypto++ project.
 // Based on Aumasson, Neves, Wilcox-O’Hearn and Winnerlein's reference BLAKE2
 // implementation at http://github.com/BLAKE2/BLAKE2.
@@ -588,7 +589,6 @@ static inline void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State::iv[4]) ), _mm_loadu_si128( (const __m128i *)(&state.t[0]) ) );
 row4h = _mm_xor_si128( _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV::iv[6]) ), _mm_loadu_si128( (const __m128i *)(&state.f[0]) ) );
-
 b0 = _mm_set_epi64x(m2, m0);
 b1 = _mm_set_epi64x(m6, m4);
 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
@@ -1012,7 +1002,6 @@ static inline void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State& state)
 {
- // TODO... fall back to C++
+ // Fallback to C++
 BLAKE2_CXX_Compress32(input, state);
-};
+
+#if 0
+ __m128i row1, row2, row3, row4;
+ __m128i buf1, buf2, buf3, buf4;
+
+ __m128i t0, t1, t2;
+ __m128i ff0, ff1;
+
+ const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
+ const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+
+ const __m128i m0 = _mm_loadu_si128((const __m128i *)(input + 00));
+ const __m128i m1 = _mm_loadu_si128((const __m128i *)(input + 16));
+ const __m128i m2 = _mm_loadu_si128((const __m128i *)(input + 32));
+ const __m128i m3 = _mm_loadu_si128((const __m128i *)(input + 48));
+
+ row1 = ff0 = _mm_loadu_si128((const __m128i *)(&state.h[0]));
+ row2 = ff1 = _mm_loadu_si128((const __m128i *)(&state.h[4]));
+ row3 = _mm_setr_epi32(BLAKE2_IV::iv[0], BLAKE2_IV::iv[1], BLAKE2_IV::iv[2], BLAKE2_IV::iv[3]);
+ row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2_IV::iv[4], BLAKE2_IV::iv[5], BLAKE2_IV::iv[6], BLAKE2_IV::iv[7]), _mm_loadu_si128((const __m128i *)(&state.t[0])));
+ buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_blend_epi16(m1, m2, 0x0C);
+ t1 = _mm_slli_si128(m3, 4);
+ t2 = _mm_blend_epi16(t0, t1, 0xF0);
+ buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
+ t1 = _mm_blend_epi16(m1,m3,0xC0);
+ t2 = _mm_blend_epi16(t0, t1, 0xF0);
+ buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_slli_si128(m1, 4);
+ t1 = _mm_blend_epi16(m2, t0, 0x30);
+ t2 = _mm_blend_epi16(m0, t1, 0xF0);
+ buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpackhi_epi32(m0,m1);
+ t1 = _mm_slli_si128(m3, 4);
+ t2 = _mm_blend_epi16(t0, t1, 0x0C);
+ buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpackhi_epi32(m2,m3);
+ t1 = _mm_blend_epi16(m3,m1,0x0C);
+ t2 = _mm_blend_epi16(t0, t1, 0x0F);
+ buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpacklo_epi32(m2,m0);
+ t1 = _mm_blend_epi16(t0, m0, 0xF0);
+ t2 = _mm_slli_si128(m3, 8);
+ buf2 = _mm_blend_epi16(t1, t2, 0xC0);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_blend_epi16(m0, m2, 0x3C);
+ t1 = _mm_srli_si128(m1, 12);
+ t2 = _mm_blend_epi16(t0,t1,0x03);
+ buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_slli_si128(m3, 4);
+ t1 = _mm_blend_epi16(m0, m1, 0x33);
+ t2 = _mm_blend_epi16(t1, t0, 0xC0);
+ buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpackhi_epi32(m0,m1);
+ t1 = _mm_unpackhi_epi32(t0, m2);
+ t2 = _mm_blend_epi16(t1, m3, 0x0C);
+ buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_slli_si128(m2, 8);
+ t1 = _mm_blend_epi16(m3,m0,0x0C);
+ t2 = _mm_blend_epi16(t1, t0, 0xC0);
+ buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_blend_epi16(m0,m1,0x0F);
+ t1 = _mm_blend_epi16(t0, m3, 0xC0);
+ buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpacklo_epi32(m0,m2);
+ t1 = _mm_unpackhi_epi32(m1,m2);
+ buf4 = _mm_unpacklo_epi64(t1,t0);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpacklo_epi64(m1,m2);
+ t1 = _mm_unpackhi_epi64(m0,m2);
+ t2 = _mm_blend_epi16(t0,t1,0x33);
+ buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpackhi_epi64(m1,m3);
+ t1 = _mm_unpacklo_epi64(m0,m1);
+ buf2 = _mm_blend_epi16(t0,t1,0x33);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_unpackhi_epi64(m3,m1);
+ t1 = _mm_unpackhi_epi64(m2,m0);
+ buf3 = _mm_blend_epi16(t1,t0,0x33);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_blend_epi16(m0,m2,0x03);
+ t1 = _mm_slli_si128(t0, 8);
+ t2 = _mm_blend_epi16(t1,m3,0x0F);
+ buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpackhi_epi32(m0,m1);
+ t1 = _mm_unpacklo_epi32(m0,m2);
+ buf1 = _mm_unpacklo_epi64(t0,t1);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_srli_si128(m2, 4);
+ t1 = _mm_blend_epi16(m0,m3,0x03);
+ buf2 = _mm_blend_epi16(t1,t0,0x3C);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_blend_epi16(m1,m0,0x0C);
+ t1 = _mm_srli_si128(m3, 4);
+ t2 = _mm_blend_epi16(t0,t1,0x30);
+ buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpacklo_epi64(m1,m2);
+ t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
+ buf4 = _mm_blend_epi16(t0,t1,0x33);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_slli_si128(m1, 12);
+ t1 = _mm_blend_epi16(m0,m3,0x33);
+ buf1 = _mm_blend_epi16(t1,t0,0xC0);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_blend_epi16(m3,m2,0x30);
+ t1 = _mm_srli_si128(m1, 4);
+ t2 = _mm_blend_epi16(t0,t1,0x03);
+ buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_unpacklo_epi64(m0,m2);
+ t1 = _mm_srli_si128(m1, 4);
+ buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpackhi_epi32(m1,m2);
+ t1 = _mm_unpackhi_epi64(m0,t0);
+ buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpackhi_epi32(m0,m1);
+ t1 = _mm_blend_epi16(t0,m3,0x0F);
+ buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_blend_epi16(m2,m3,0x30);
+ t1 = _mm_srli_si128(m0,4);
+ t2 = _mm_blend_epi16(t0,t1,0x03);
+ buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_unpackhi_epi64(m0,m3);
+ t1 = _mm_unpacklo_epi64(m1,m2);
+ t2 = _mm_blend_epi16(t0,t1,0x3C);
+ buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpacklo_epi32(m0,m1);
+ t1 = _mm_unpackhi_epi32(m1,m2);
+ buf4 = _mm_unpacklo_epi64(t0,t1);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_unpackhi_epi32(m1,m3);
+ t1 = _mm_unpacklo_epi64(t0,m0);
+ t2 = _mm_blend_epi16(t1,m2,0xC0);
+ buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_unpackhi_epi32(m0,m3);
+ t1 = _mm_blend_epi16(m2,t0,0xF0);
+ buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_blend_epi16(m2,m0,0x0C);
+ t1 = _mm_slli_si128(t0,4);
+ buf3 = _mm_blend_epi16(t1,m3,0x0F);
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_blend_epi16(m1,m0,0x30);
+ buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ t0 = _mm_blend_epi16(m0,m2,0x03);
+ t1 = _mm_blend_epi16(m1,m2,0x30);
+ t2 = _mm_blend_epi16(t1,t0,0x0F);
+ buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_slli_si128(m0,4);
+ t1 = _mm_blend_epi16(m1,t0,0xC0);
+ buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
+
+ t0 = _mm_unpackhi_epi32(m0,m3);
+ t1 = _mm_unpacklo_epi32(m2,m3);
+ t2 = _mm_unpackhi_epi64(t0,t1);
+ buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r16);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
+
+ t0 = _mm_blend_epi16(m3,m2,0xC0);
+ t1 = _mm_unpacklo_epi32(m0,m3);
+ t2 = _mm_blend_epi16(t0,t1,0x0F);
+ buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
+
+ row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
+ row4 = _mm_xor_si128(row4, row1);
+ row4 = _mm_shuffle_epi8(row4,r8);
+ row3 = _mm_add_epi32(row3, row4);
+ row2 = _mm_xor_si128(row2, row3);
+ row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
+
+ row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
+ row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
+ row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
+
+ _mm_storeu_si128((__m128i *)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
+ _mm_storeu_si128((__m128i *)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
+#endif
+}
 static inline void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State& state)
 {
@@ -1945,6 +2432,7 @@ static inline void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State::iv[2]));
 row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[4])), _mm_loadu_si128((const __m128i *)(&state.t[0])));
 row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[6])), _mm_loadu_si128((const __m128i *)(&state.f[0])));
+
 b0 = _mm_unpacklo_epi64(m0, m1);
 b1 = _mm_unpacklo_epi64(m2, m3);
-
 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
 row4l = _mm_xor_si128(row4l, row1l);
@@ -2030,7 +2518,6 @@ static inline void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State;
 template class BLAKE2_Base;
 NAMESPACE_END
-
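For reference, the disabled 32-bit block above follows the BLAKE2s round structure: each buf load gathers four message words, the six-statement groups apply one G mixing step with rotations of 16, 12, 8 and 7 (the r16/r8 byte shuffles and the paired shift/xor lines), and the _mm_shuffle_epi32 triples diagonalize and undiagonalize the state between column and diagonal steps. A minimal scalar sketch of that G step follows; it assumes a plain uint32_t working state, and the names rotr32 and G are illustrative rather than taken from the patch.

#include <stdint.h>

// Rotate a 32-bit word right by n bits (0 < n < 32).
static inline uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

// One BLAKE2s G step on the 16-word working state v, mixing message words x and y.
// The SSE path above performs four such steps at once, one per 32-bit lane.
static inline void G(uint32_t v[16], int a, int b, int c, int d, uint32_t x, uint32_t y)
{
    v[a] = v[a] + v[b] + x;  v[d] = rotr32(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];      v[b] = rotr32(v[b] ^ v[c], 12);
    v[a] = v[a] + v[b] + y;  v[d] = rotr32(v[d] ^ v[a],  8);
    v[c] = v[c] + v[d];      v[b] = rotr32(v[b] ^ v[c],  7);
}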