From bdeaae3ac940b166ea13035d7d08fc6b53440231 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sat, 3 Nov 2018 10:49:22 -0400 Subject: [PATCH] Fix disjoint t[] and f[] when using SIMD implementations --- blake2.cpp | 44 ++++++++++++++++++++++---------------------- blake2.h | 12 ++++++------ blake2b-simd.cpp | 12 ++++++------ blake2s-simd.cpp | 6 +++--- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/blake2.cpp b/blake2.cpp index 04e1316a..e80985d4 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -455,12 +455,12 @@ void BLAKE2s::Restart(const BLAKE2s_ParameterBlock& block, const word32 counter[ } State& state = *m_state.data(); - state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0; + state.tf[0] = state.tf[1] = 0, state.tf[2] = state.tf[3] = 0, state.length = 0; if (counter != NULLPTR) { - state.t[0] = counter[0]; - state.t[1] = counter[1]; + state.tf[0] = counter[0]; + state.tf[1] = counter[1]; } const word32* iv = BLAKE2S_IV; @@ -486,12 +486,12 @@ void BLAKE2b::Restart(const BLAKE2b_ParameterBlock& block, const word64 counter[ } State& state = *m_state.data(); - state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0; + state.tf[0] = state.tf[1] = 0, state.tf[2] = state.tf[3] = 0, state.length = 0; if (counter != NULLPTR) { - state.t[0] = counter[0]; - state.t[1] = counter[1]; + state.tf[0] = counter[0]; + state.tf[1] = counter[1]; } const word64* iv = BLAKE2B_IV; @@ -584,11 +584,11 @@ void BLAKE2s::TruncatedFinal(byte *hash, size_t size) // Set last block unconditionally State& state = *m_state.data(); - state.f[0] = ~static_cast(0); + state.tf[2] = ~static_cast(0); // Set last node if tree mode if (m_treeMode) - state.f[1] = ~static_cast(0); + state.tf[3] = ~static_cast(0); // Increment counter for tail bytes only IncrementCounter(state.length); @@ -609,11 +609,11 @@ void BLAKE2b::TruncatedFinal(byte *hash, size_t size) // Set last block unconditionally State& state = *m_state.data(); - state.f[0] = ~static_cast(0); + state.tf[2] = ~static_cast(0); // Set last node if tree mode if (m_treeMode) - state.f[1] = ~static_cast(0); + state.tf[3] = ~static_cast(0); // Increment counter for tail bytes only IncrementCounter(state.length); @@ -630,15 +630,15 @@ void BLAKE2b::TruncatedFinal(byte *hash, size_t size) void BLAKE2s::IncrementCounter(size_t count) { State& state = *m_state.data(); - state.t[0] += static_cast(count); - state.t[1] += !!(state.t[0] < count); + state.tf[0] += static_cast(count); + state.tf[1] += !!(state.tf[0] < count); } void BLAKE2b::IncrementCounter(size_t count) { State& state = *m_state.data(); - state.t[0] += static_cast(count); - state.t[1] += !!(state.t[0] < count); + state.tf[0] += static_cast(count); + state.tf[1] += !!(state.tf[0] < count); } void BLAKE2s::Compress(const byte *input) @@ -702,10 +702,10 @@ void BLAKE2_Compress64_CXX(const byte* input, BLAKE2b_State& state) v[ 9] = iv[1]; v[10] = iv[2]; v[11] = iv[3]; - v[12] = state.t[0] ^ iv[4]; - v[13] = state.t[1] ^ iv[5]; - v[14] = state.f[0] ^ iv[6]; - v[15] = state.f[1] ^ iv[7]; + v[12] = state.tf[0] ^ iv[4]; + v[13] = state.tf[1] ^ iv[5]; + v[14] = state.tf[2] ^ iv[6]; + v[15] = state.tf[3] ^ iv[7]; BLAKE2B_ROUND<0>(m, v); BLAKE2B_ROUND<1>(m, v); @@ -739,10 +739,10 @@ void BLAKE2_Compress32_CXX(const byte* input, BLAKE2s_State& state) v[ 9] = iv[1]; v[10] = iv[2]; v[11] = iv[3]; - v[12] = state.t[0] ^ iv[4]; - v[13] = state.t[1] ^ iv[5]; - v[14] = state.f[0] ^ iv[6]; - v[15] = state.f[1] ^ iv[7]; + v[12] = state.tf[0] ^ iv[4]; + v[13] = state.tf[1] ^ iv[5]; + v[14] = state.tf[2] ^ iv[6]; + v[15] = state.tf[3] ^ iv[7]; BLAKE2S_ROUND<0>(m, v); BLAKE2S_ROUND<1>(m, v); diff --git a/blake2.h b/blake2.h index a7be88e5..dbb3d731 100644 --- a/blake2.h +++ b/blake2.h @@ -134,12 +134,12 @@ struct CRYPTOPP_NO_VTABLE BLAKE2s_State { // Set all members except scratch buffer[] h[0]=h[1]=h[2]=h[3]=h[4]=h[5]=h[6]=h[7] = 0; - t[0]=t[1]=f[0]=f[1] = 0; + tf[0]=tf[1]=tf[2]=tf[3] = 0; length = 0; } - // SSE2, SSE4 and NEON depend upon t[] and f[] being side-by-side - word32 h[8], t[2], f[2]; + // SSE4, Power7 and NEON depend upon t[] and f[] being side-by-side + word32 h[8], tf[4]; // t[2], f[2]; byte buffer[BLAKE2s_Info::BLOCKSIZE]; size_t length; }; @@ -152,12 +152,12 @@ struct CRYPTOPP_NO_VTABLE BLAKE2b_State { // Set all members except scratch buffer[] h[0]=h[1]=h[2]=h[3]=h[4]=h[5]=h[6]=h[7] = 0; - t[0]=t[1]=f[0]=f[1] = 0; + tf[0]=tf[1]=tf[2]=tf[3] = 0; length = 0; } - // SSE2, SSE4 and NEON depend upon t[] and f[] being side-by-side - word64 h[8], t[2], f[2]; + // SSE4, Power8 and NEON depend upon t[] and f[] being side-by-side + word64 h[8], tf[4]; // t[2], f[2]; byte buffer[BLAKE2b_Info::BLOCKSIZE]; size_t length; }; diff --git a/blake2b-simd.cpp b/blake2b-simd.cpp index eaf6e7f0..c95b46ac 100644 --- a/blake2b-simd.cpp +++ b/blake2b-simd.cpp @@ -457,8 +457,8 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state) row2h = LOADU( &state.h[6] ); row3l = LOADU( &BLAKE2B_IV[0] ); row3h = LOADU( &BLAKE2B_IV[2] ); - row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.t[0] ) ); - row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.f[0] ) ); + row4l = _mm_xor_si128( LOADU( &BLAKE2B_IV[4] ), LOADU( &state.tf[0] ) ); + row4h = _mm_xor_si128( LOADU( &BLAKE2B_IV[6] ), LOADU( &state.tf[2] ) ); BLAKE2B_ROUND( 0 ); BLAKE2B_ROUND( 1 ); @@ -717,8 +717,8 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state) row3l = vld1q_u64(&BLAKE2B_IV[0]); row3h = vld1q_u64(&BLAKE2B_IV[2]); - row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.t[0])); - row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.f[0])); + row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.tf[0])); + row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.tf[2])); BLAKE2B_ROUND(0); BLAKE2B_ROUND(1); @@ -1194,8 +1194,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state) row3l = VectorLoad64(&BLAKE2B_IV[0]); row3h = VectorLoad64(&BLAKE2B_IV[2]); - row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.t[0])); - row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.f[0])); + row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0])); + row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2])); BLAKE2B_ROUND(0); BLAKE2B_ROUND(1); diff --git a/blake2s-simd.cpp b/blake2s-simd.cpp index af5e47e8..e2c000ad 100644 --- a/blake2s-simd.cpp +++ b/blake2s-simd.cpp @@ -335,7 +335,7 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state) row1 = ff0 = LOADU( &state.h[0] ); row2 = ff1 = LOADU( &state.h[4] ); row3 = LOADU( &BLAKE2S_IV[0] ); - row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.t[0] ) ); + row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.tf[0] ) ); BLAKE2S_ROUND( 0 ); BLAKE2S_ROUND( 1 ); @@ -653,7 +653,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state) const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]); const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]); row3 = vld1q_u32(&BLAKE2S_IV[0]); - row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0])); + row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.tf[0])); BLAKE2S_ROUND(0); BLAKE2S_ROUND(1); @@ -1000,7 +1000,7 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) row1 = ff0 = VectorLoad32LE( &state.h[0] ); row2 = ff1 = VectorLoad32LE( &state.h[4] ); row3 = VectorLoad32( &BLAKE2S_IV[0] ); - row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.t[0] ) ); + row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.tf[0] ) ); BLAKE2S_ROUND( 0 ); BLAKE2S_ROUND( 1 );