From 5f441d28e57426900d5538e0987f4acb50d6e987 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 13 Aug 2017 06:53:35 -0400 Subject: [PATCH] Cleanup __m128 casts due to Clang --- blake2-simd.cpp | 62 +++++++++++++++++++++++++---------------------- gcm-simd.cpp | 48 ++++++++++++++++++------------------ rijndael-simd.cpp | 60 ++++++++++++++++++++++++--------------------- 3 files changed, 89 insertions(+), 81 deletions(-) diff --git a/blake2-simd.cpp b/blake2-simd.cpp index 151be928..1aff1ff8 100644 --- a/blake2-simd.cpp +++ b/blake2-simd.cpp @@ -29,6 +29,10 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + NAMESPACE_BEGIN(CryptoPP) // Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008. @@ -74,15 +78,15 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& stat const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1); const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00)); - const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16)); - const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32)); - const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48)); + const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00)); + const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16)); + const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32)); + const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48)); - row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); + row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0])); + row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4])); row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); + row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0]))); buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0)))); row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); @@ -614,8 +618,8 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& stat row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3))); - _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4))); + _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3))); + _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4))); } void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state) @@ -629,23 +633,23 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - const __m128i m0 = _mm_loadu_si128((const 
__m128i*)(const void*)(input + 00)); - const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16)); - const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32)); - const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48)); - const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64)); - const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80)); - const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96)); - const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112)); + const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00)); + const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16)); + const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32)); + const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48)); + const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64)); + const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80)); + const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96)); + const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112)); - row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])); - row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])); - row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])); - row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])); - row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[0])); - row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2])); - row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0]))); + row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0])); + row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2])); + row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4])); + row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6])); + row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0])); + row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2])); + row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0]))); + row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0]))); b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); @@ -1584,13 +1588,13 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state row1l = _mm_xor_si128(row3l, row1l); row1h = _mm_xor_si128(row3h, row1h); - _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l)); - _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h)); + _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l)); + _mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h)); row2l = _mm_xor_si128(row4l, row2l); row2h = _mm_xor_si128(row4h, row2h); - _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l)); - _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h)); + 
_mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l)); + _mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h)); } #endif // CRYPTOPP_SSE42_AVAILABLE diff --git a/gcm-simd.cpp b/gcm-simd.cpp index c8b9daa5..f2a975cf 100644 --- a/gcm-simd.cpp +++ b/gcm-simd.cpp @@ -50,6 +50,10 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + ANONYMOUS_NAMESPACE_BEGIN // GCC 4.8 is missing PMULL gear @@ -438,7 +442,7 @@ const word64 s_clmulConstants64[] = { W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; -const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; +const __m128i *s_clmulConstants = CONST_M128_CAST(s_clmulConstants64); const unsigned int s_cltableSizeInBlocks = 8; ANONYMOUS_NAMESPACE_END @@ -497,11 +501,7 @@ __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) c2t ^= c1b shift c2 left 1 bit and xor in lowest bit of c1t */ -#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 - c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); -#else c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); -#endif c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); c0 = _mm_srli_si128(c0, 8); c0 = _mm_xor_si128(c0, c1); @@ -527,37 +527,37 @@ __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r) void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize) { const __m128i r = s_clmulConstants[0]; - const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)(const void *)hashKey), s_clmulConstants[1]); + const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), s_clmulConstants[1]); __m128i h = h0; unsigned int i; for (i=0; i= 16) { size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0; - __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2); + __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-1)*16)), mask2); __m128i c0 = _mm_setzero_si128(); __m128i c1 = _mm_setzero_si128(); __m128i c2 = _mm_setzero_si128(); @@ -570,7 +570,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt if (++i == s) { - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1); d1 = _mm_xor_si128(d1, x); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); @@ -579,7 +579,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt break; } - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2); + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask2); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); d2 = _mm_xor_si128(d2, d1); @@ -587,7 +587,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt if (++i == s) { - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1); d1 = _mm_xor_si128(d1, x); c0 = 
_mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11)); @@ -596,7 +596,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt break; } - d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1); + d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask1); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); d1 = _mm_xor_si128(d1, d2); @@ -609,15 +609,15 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt x = GCM_Reduce_CLMUL(c0, c1, c2, r); } - _mm_store_si128((__m128i *)(void *)hbuffer, x); + _mm_store_si128(M128_CAST(hbuffer), x); return len; } #endif -#if CRYPTOPP_SSSE3_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE void GCM_ReverseHashBufferIfNeeded_SSSE3(byte *hashBuffer) { - __m128i &x = *(__m128i *)(void *)hashBuffer; + __m128i &x = *M128_CAST(hashBuffer); x = _mm_shuffle_epi8(x, s_clmulConstants[1]); } #endif diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 611c6eb2..aedd4b32 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -70,6 +70,10 @@ # define MAYBE_CONST const #endif +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + NAMESPACE_BEGIN(CryptoPP) #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY @@ -373,23 +377,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, { while (length >= 4*blockSize) { - __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3; + __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; if (flags & BlockTransformation::BT_InBlockIsCounter) { - const __m128i be1 = *(const __m128i *)(const void *)s_one; + const __m128i be1 = *CONST_M128_CAST(s_one); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1)); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); } else { inBlocks += inIncrement; - block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks); + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; - block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks); + block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; - block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks); + block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; } @@ -397,13 +401,13 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, { // Coverity finding, appears to be false positive. Assert the condition. 
CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; } @@ -411,23 +415,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { - block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; } - _mm_storeu_si128((__m128i *)(void *)outBlocks, block0); + _mm_storeu_si128(M128_CAST(outBlocks), block0); outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)(void *)outBlocks, block1); + _mm_storeu_si128(M128_CAST(outBlocks), block1); outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)(void *)outBlocks, block2); + _mm_storeu_si128(M128_CAST(outBlocks), block2); outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)(void *)outBlocks, block3); + _mm_storeu_si128(M128_CAST(outBlocks), block3); outBlocks += outIncrement; length -= 4*blockSize; @@ -436,10 +440,10 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, while (length >= blockSize) { - __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks); + __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_XorInput) - block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); if (flags & BlockTransformation::BT_InBlockIsCounter) const_cast(inBlocks)[15]++; @@ -447,9 +451,9 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, func1(block, subkeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks)); + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - _mm_storeu_si128((__m128i *)(void *)outBlocks, block); + _mm_storeu_si128(M128_CAST(outBlocks), block); inBlocks += inIncrement; outBlocks += outIncrement; @@ -486,7 +490,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor const word32 *ro = rcLE, *rc 
= rcLE;
 	CRYPTOPP_UNUSED(ro);
-	__m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keyLen-16));
+	__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
 	std::memcpy(rk, userKey, keyLen);
 	// keySize: m_key allocates 4*(rounds+1) word32's.
@@ -543,16 +547,16 @@ void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds)
 	// SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
 	vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds));
 #else
-	std::swap(*(__m128i *)(void *)(key), *(__m128i *)(void *)(key+4*rounds));
+	std::swap(*M128_CAST(key), *M128_CAST(key+4*rounds));
 #endif
 	for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
 	{
-		temp = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
-		*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+j));
-		*(__m128i *)(void *)(key+j) = temp;
+		temp = _mm_aesimc_si128(*M128_CAST(key+i));
+		*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j));
+		*M128_CAST(key+j) = temp;
 	}
-	*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
+	*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i));
 }

#endif // CRYPTOPP_AESNI_AVAILABLE
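
A note on the cast pattern itself. The two macros introduced at the top of each file, M128_CAST and CONST_M128_CAST, simply centralize the (__m128i *)(void *) and (const __m128i *)(const void *) spellings that were previously written out long-hand at every load and store. Routing the pointer through void* first means the compiler never sees a direct byte* to __m128i* conversion, which Clang (and GCC) can flag under -Wcast-align style diagnostics even though the unaligned intrinsics make the access itself legal for arbitrary byte buffers.

The sketch below is a minimal, self-contained illustration of the pattern under those assumptions; AddOneToBlock and the byte typedef are made up for the example and are not library code.

#include <emmintrin.h>   // SSE2 intrinsics: _mm_loadu_si128, _mm_add_epi8, _mm_storeu_si128
#include <cstring>

typedef unsigned char byte;   // stand-in for the library's byte typedef

// Same helpers as in the patch: route the pointer through (const) void* so the
// compiler never sees a direct byte* -> __m128i* conversion.
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))

// Hypothetical helper, not library code: add 1 to each byte of a 16-byte block.
inline void AddOneToBlock(byte *out, const byte *in)
{
    // Unaligned load/store keep the access valid for arbitrary byte buffers;
    // the casts exist only to satisfy the intrinsic signatures.
    const __m128i b = _mm_loadu_si128(CONST_M128_CAST(in));
    const __m128i r = _mm_add_epi8(b, _mm_set1_epi8(1));
    _mm_storeu_si128(M128_CAST(out), r);
}

int main()
{
    byte in[16], out[16];
    std::memset(in, 0x41, sizeof(in));
    AddOneToBlock(out, in);
    return (out[0] == 0x42) ? 0 : 1;
}

Compiled with, for example, clang++ -msse2 -Wcast-align, the macro spellings stay quiet where a bare (__m128i *)(out) cast may draw a warning.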