diff --git a/blake2.cpp b/blake2.cpp index d25eca9a..1f59210b 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -543,27 +543,27 @@ static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State::iv[0],BLAKE2_IV::iv[1],BLAKE2_IV::iv[2],BLAKE2_IV::iv[3]); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2_IV::iv[4],BLAKE2_IV::iv[5],BLAKE2_IV::iv[6],BLAKE2_IV::iv[7]),_mm_loadu_si128((const __m128i *)(&state.t[0]))); + row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2_IV::iv[4],BLAKE2_IV::iv[5],BLAKE2_IV::iv[6],BLAKE2_IV::iv[7]),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); buf1 = _mm_set_epi32(m6,m4,m2,m0); row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); row4 = _mm_xor_si128(row4,row1); @@ -964,8 +964,8 @@ static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State& state) @@ -974,31 +974,31 @@ static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State __m128i row3l, row3h, row4l, row4h; __m128i b0, b1, t0, t1; - const word64 m0 = ((const word64*)input)[ 0]; - const word64 m1 = ((const word64*)input)[ 1]; - const word64 m2 = ((const word64*)input)[ 2]; - const word64 m3 = ((const word64*)input)[ 3]; - const word64 m4 = ((const word64*)input)[ 4]; - const word64 m5 = ((const word64*)input)[ 5]; - const word64 m6 = ((const word64*)input)[ 6]; - const word64 m7 = ((const word64*)input)[ 7]; - const word64 m8 = ((const word64*)input)[ 8]; - const word64 m9 = ((const word64*)input)[ 9]; - const word64 m10 = ((const word64*)input)[10]; - const word64 m11 = ((const word64*)input)[11]; - const word64 m12 = ((const word64*)input)[12]; - const word64 m13 = ((const word64*)input)[13]; - const word64 m14 = ((const word64*)input)[14]; - const word64 m15 = ((const word64*)input)[15]; + const word64 m0 = ((const word64*)(const void*)input)[ 0]; + const word64 m1 = ((const word64*)(const void*)input)[ 1]; + const word64 m2 = ((const word64*)(const void*)input)[ 2]; + const word64 m3 = ((const word64*)(const void*)input)[ 3]; + const word64 m4 = ((const word64*)(const void*)input)[ 4]; + const word64 m5 = ((const word64*)(const void*)input)[ 5]; + const word64 m6 = ((const word64*)(const void*)input)[ 6]; + const word64 m7 = ((const word64*)(const void*)input)[ 7]; + const word64 m8 = ((const word64*)(const void*)input)[ 8]; + const word64 m9 = ((const word64*)(const void*)input)[ 9]; + const word64 m10 = ((const word64*)(const void*)input)[10]; + const word64 m11 = ((const word64*)(const void*)input)[11]; + const word64 m12 = ((const word64*)(const void*)input)[12]; + const word64 m13 = ((const word64*)(const void*)input)[13]; + const word64 m14 = ((const word64*)(const void*)input)[14]; + const word64 m15 = ((const word64*)(const void*)input)[15]; - row1l = _mm_loadu_si128( (const __m128i *)(&state.h[0]) ); - row1h = _mm_loadu_si128( (const __m128i *)(&state.h[2]) ); - row2l = _mm_loadu_si128( (const __m128i *)(&state.h[4]) ); - row2h = _mm_loadu_si128( (const __m128i *)(&state.h[6]) ); - row3l = _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV::iv[0]) ); - row3h = _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV::iv[2]) ); - row4l = _mm_xor_si128( _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV::iv[4]) ), _mm_loadu_si128( (const __m128i *)(&state.t[0]) ) ); - row4h = _mm_xor_si128( _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV::iv[6]) ), _mm_loadu_si128( (const __m128i *)(&state.f[0]) ) ); + row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]) ); + row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]) ); + row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]) ); + row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]) ); + row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[0]) ); + row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[2]) ); + row4l = _mm_xor_si128( _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[4]) ), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]) ) ); + row4h = _mm_xor_si128( _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[6]) ), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]) ) ); b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4); @@ -1855,13 +1855,13 @@ static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State row1l = _mm_xor_si128( row3l, row1l ); row1h = _mm_xor_si128( row3h, row1h ); - _mm_storeu_si128((__m128i *)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[0]) ), row1l)); - _mm_storeu_si128((__m128i *)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[2]) ), row1h)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]) ), row1l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]) ), row1h)); row2l = _mm_xor_si128( row4l, row2l ); row2h = _mm_xor_si128( row4h, row2h ); - _mm_storeu_si128((__m128i *)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[4]) ), row2l)); - _mm_storeu_si128((__m128i *)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[6]) ), row2h)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]) ), row2l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]) ), row2h)); } #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE @@ -1877,15 +1877,15 @@ static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State::iv[0], BLAKE2_IV::iv[1], BLAKE2_IV::iv[2], BLAKE2_IV::iv[3]); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2_IV::iv[4], BLAKE2_IV::iv[5], BLAKE2_IV::iv[6], BLAKE2_IV::iv[7]), _mm_loadu_si128((const __m128i *)(&state.t[0]))); + row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2_IV::iv[4], BLAKE2_IV::iv[5], BLAKE2_IV::iv[6], BLAKE2_IV::iv[7]), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0)))); row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); @@ -2417,8 +2417,8 @@ static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State& state) @@ -2432,23 +2432,23 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - const __m128i m0 = _mm_loadu_si128((const __m128i *)(input + 00)); - const __m128i m1 = _mm_loadu_si128((const __m128i *)(input + 16)); - const __m128i m2 = _mm_loadu_si128((const __m128i *)(input + 32)); - const __m128i m3 = _mm_loadu_si128((const __m128i *)(input + 48)); - const __m128i m4 = _mm_loadu_si128((const __m128i *)(input + 64)); - const __m128i m5 = _mm_loadu_si128((const __m128i *)(input + 80)); - const __m128i m6 = _mm_loadu_si128((const __m128i *)(input + 96)); - const __m128i m7 = _mm_loadu_si128((const __m128i *)(input + 112)); + const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00)); + const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16)); + const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32)); + const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48)); + const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64)); + const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80)); + const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96)); + const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112)); - row1l = _mm_loadu_si128((const __m128i *)(&state.h[0])); - row1h = _mm_loadu_si128((const __m128i *)(&state.h[2])); - row2l = _mm_loadu_si128((const __m128i *)(&state.h[4])); - row2h = _mm_loadu_si128((const __m128i *)(&state.h[6])); - row3l = _mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[0])); - row3h = _mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[2])); - row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[4])), _mm_loadu_si128((const __m128i *)(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&BLAKE2_IV::iv[6])), _mm_loadu_si128((const __m128i *)(&state.f[0]))); + row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); + row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])); + row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); + row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])); + row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[0])); + row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[2])); + row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); + row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2_IV::iv[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]))); b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); @@ -3387,12 +3387,13 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State row1l = _mm_xor_si128(row3l, row1l); row1h = _mm_xor_si128(row3h, row1h); - _mm_storeu_si128((__m128i *)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&state.h[0])), row1l)); - _mm_storeu_si128((__m128i *)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&state.h[2])), row1h)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h)); + row2l = _mm_xor_si128(row4l, row2l); row2h = _mm_xor_si128(row4h, row2h); - _mm_storeu_si128((__m128i *)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&state.h[4])), row2l)); - _mm_storeu_si128((__m128i *)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i *)(&state.h[6])), row2h)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l)); + _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h)); } #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE