Cleanup __m128 casts due to Clang

pull/461/head
Jeffrey Walton 2017-08-13 06:53:35 -04:00
parent d8e56b8250
commit 5f441d28e5
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 89 additions and 81 deletions

View File

@@ -29,6 +29,10 @@
 # define EXCEPTION_EXECUTE_HANDLER 1
 #endif
 
+// Clang __m128i casts
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
+
 NAMESPACE_BEGIN(CryptoPP)
 
 // Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008.
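These macros route the pointer conversion through void*, the usual way to keep Clang's cast-alignment diagnostics quiet when a byte pointer is reinterpreted as __m128i*; the unaligned loads and stores behave the same either way. A minimal sketch of the pattern (demo code, not part of the commit; the function names are illustrative, and the warning assumed here is Clang's -Wcast-align family):

    #include <emmintrin.h>
    typedef unsigned char byte;

    __m128i load_block(const byte* p)
    {
        // return _mm_loadu_si128((const __m128i*)p);          // Clang may warn: cast increases required alignment
        return _mm_loadu_si128((const __m128i*)(const void*)p); // quiet; loadu tolerates any alignment anyway
    }

    void store_block(byte* p, __m128i v)
    {
        _mm_storeu_si128((__m128i*)(void*)p, v);                // same idiom, non-const flavor
    }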
@@ -74,15 +78,15 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& state)
 const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
 const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
 
-const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
-const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
-const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
-const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
+const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
+const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
+const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
+const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
 
-row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
+row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
+row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
 row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]);
-row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
+row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
 
 buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
@@ -614,8 +618,8 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& state)
 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
 
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
+_mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
+_mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
 }
 
 void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state)
@@ -629,23 +633,23 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state)
 const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
 
-const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
-const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
-const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
-const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
-const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
-const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
-const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
-const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
+const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
+const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
+const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
+const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
+const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64));
+const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80));
+const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96));
+const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112));
 
-row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
-row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
-row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
-row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[0]));
-row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2]));
-row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
-row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
+row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
+row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2]));
+row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
+row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6]));
+row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0]));
+row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2]));
+row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
+row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0])));
 
 b0 = _mm_unpacklo_epi64(m0, m1);
 b1 = _mm_unpacklo_epi64(m2, m3);
@@ -1584,13 +1588,13 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state)
 row1l = _mm_xor_si128(row3l, row1l);
 row1h = _mm_xor_si128(row3h, row1h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
+_mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l));
+_mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h));
 
 row2l = _mm_xor_si128(row4l, row2l);
 row2h = _mm_xor_si128(row4h, row2h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
+_mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l));
+_mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h));
 }
 
 #endif // CRYPTOPP_SSE42_AVAILABLE
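Aside for readers tracing the BLAKE2 hunks above: the r8/r16 byte masks implement per-lane rotations with a single SSSE3 shuffle instead of two shifts and an OR. A standalone sketch (demo code, not commit code; build with SSSE3 enabled, e.g. -mssse3) of the 32-bit r16 mask rotating each lane right by 16:

    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        // Same mask as the r16 constant in the diff: within each 32-bit
        // lane, result byte i comes from source byte (i+2) mod 4.
        const __m128i r16 = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
        __m128i v = _mm_set1_epi32(0x11223344);
        v = _mm_shuffle_epi8(v, r16);

        uint32_t out[4];
        _mm_storeu_si128((__m128i*)(void*)out, v);
        printf("%08x\n", out[0]);   // 33441122, i.e. 0x11223344 rotated right by 16
        return 0;
    }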

View File

@@ -50,6 +50,10 @@
 # define EXCEPTION_EXECUTE_HANDLER 1
 #endif
 
+// Clang __m128i casts
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
+
 ANONYMOUS_NAMESPACE_BEGIN
 
 // GCC 4.8 is missing PMULL gear
@@ -438,7 +442,7 @@ const word64 s_clmulConstants64[] = {
 W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
 W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
 
-const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
+const __m128i *s_clmulConstants = CONST_M128_CAST(s_clmulConstants64);
 const unsigned int s_cltableSizeInBlocks = 8;
 
 ANONYMOUS_NAMESPACE_END
@@ -497,11 +501,7 @@ __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
 c2t ^= c1b
 shift c2 left 1 bit and xor in lowest bit of c1t
 */
-#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
-c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
-#else
 c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
-#endif
 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
 c0 = _mm_srli_si128(c0, 8);
 c0 = _mm_xor_si128(c0, c1);
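The hunk above drops a long-dead `#if 0` MSVC 2010 branch, leaving only the _mm_slli_si128 path. That intrinsic shifts by bytes, not bits: shifting by 8 moves c0's low qword into the high half, which is what lets the XOR fold it into c1. A quick standalone demo of that semantics (not commit code; SSE2 only, so it sidesteps the _mm_set_epi64x portability note from the first file):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        // low qword = 0x1111111111111111, high qword = 0xAAAAAAAAAAAAAAAA
        __m128i v = _mm_set_epi32((int)0xAAAAAAAA, (int)0xAAAAAAAA, 0x11111111, 0x11111111);
        __m128i s = _mm_slli_si128(v, 8);   // shift left by 8 *bytes*
        uint64_t out[2];
        _mm_storeu_si128((__m128i*)(void*)out, s);
        // The old low qword now sits in the high half; the low half is zero.
        printf("%016llx %016llx\n", (unsigned long long)out[1], (unsigned long long)out[0]);
        return 0;   // prints: 1111111111111111 0000000000000000
    }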
@@ -527,37 +527,37 @@ __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
 
 void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
 {
 const __m128i r = s_clmulConstants[0];
-const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)(const void *)hashKey), s_clmulConstants[1]);
+const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), s_clmulConstants[1]);
 __m128i h = h0;
 unsigned int i;
 for (i=0; i<tableSize-32; i+=32)
 {
 const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
+_mm_storel_epi64(M128_CAST(mulTable+i), h);
+_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
+_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
+_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
 h = GCM_Multiply_CLMUL(h1, h0, r);
 }
 
 const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
+_mm_storel_epi64(M128_CAST(mulTable+i), h);
+_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
+_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
+_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
 }
 
 size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 {
-const __m128i *table = (const __m128i *)(const void *)mtable;
-__m128i x = _mm_load_si128((__m128i *)(void *)hbuffer);
+const __m128i *table = CONST_M128_CAST(mtable);
+__m128i x = _mm_load_si128(M128_CAST(hbuffer));
 const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
 
 while (len >= 16)
 {
 size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0;
-__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);
+__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-1)*16)), mask2);
 __m128i c0 = _mm_setzero_si128();
 __m128i c1 = _mm_setzero_si128();
 __m128i c2 = _mm_setzero_si128();
@@ -570,7 +570,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 if (++i == s)
 {
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
 d1 = _mm_xor_si128(d1, x);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
@@ -579,7 +579,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 break;
 }
 
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask2);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
 d2 = _mm_xor_si128(d2, d1);
@@ -587,7 +587,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 if (++i == s)
 {
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
 d1 = _mm_xor_si128(d1, x);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11));
@@ -596,7 +596,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 break;
 }
 
-d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1);
+d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask1);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
 d1 = _mm_xor_si128(d1, d2);
@@ -609,15 +609,15 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 x = GCM_Reduce_CLMUL(c0, c1, c2, r);
 }
 
-_mm_store_si128((__m128i *)(void *)hbuffer, x);
+_mm_store_si128(M128_CAST(hbuffer), x);
 return len;
 }
 #endif
 
-#if CRYPTOPP_SSSE3_AVAILABLE
+#if CRYPTOPP_CLMUL_AVAILABLE
 void GCM_ReverseHashBufferIfNeeded_SSSE3(byte *hashBuffer)
 {
-__m128i &x = *(__m128i *)(void *)hashBuffer;
+__m128i &x = *M128_CAST(hashBuffer);
 x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
 }
 #endif
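Note the guard change in the last hunk: the reversal routine reads s_clmulConstants, so it now sits behind CRYPTOPP_CLMUL_AVAILABLE rather than the SSSE3 guard alone. The routine itself is one shuffle: judging by the table excerpt in the -438 hunk, s_clmulConstants[1] is a full 16-byte reversal mask. An equivalent standalone sketch (demo code, not the library's):

    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
    #include <stdio.h>
    typedef unsigned char byte;

    int main()
    {
        // Mask byte i holds 15-i, so the shuffle reverses all 16 bytes;
        // the same mask as the 0x08090a0b0c0d0e0f / 0x0001020304050607 pair.
        const __m128i rev = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
        byte buf[16];
        for (int i = 0; i < 16; ++i) buf[i] = (byte)i;

        __m128i x = _mm_loadu_si128((const __m128i*)(const void*)buf);
        x = _mm_shuffle_epi8(x, rev);
        _mm_storeu_si128((__m128i*)(void*)buf, x);

        for (int i = 0; i < 16; ++i) printf("%02x ", buf[i]);  // 0f 0e ... 00
        printf("\n");
        return 0;
    }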

View File

@@ -70,6 +70,10 @@
 # define MAYBE_CONST const
 #endif
 
+// Clang __m128i casts
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
+
 NAMESPACE_BEGIN(CryptoPP)
 
 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
@@ -373,23 +377,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
 {
 while (length >= 4*blockSize)
 {
-__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
+__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3;
 if (flags & BlockTransformation::BT_InBlockIsCounter)
 {
-const __m128i be1 = *(const __m128i *)(const void *)s_one;
+const __m128i be1 = *CONST_M128_CAST(s_one);
 block1 = _mm_add_epi32(block0, be1);
 block2 = _mm_add_epi32(block1, be1);
 block3 = _mm_add_epi32(block2, be1);
-_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
+_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
 }
 else
 {
 inBlocks += inIncrement;
-block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
 inBlocks += inIncrement;
-block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
 inBlocks += inIncrement;
-block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
 inBlocks += inIncrement;
 }
@@ -397,13 +401,13 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
 {
 // Coverity finding, appears to be false positive. Assert the condition.
 CRYPTOPP_ASSERT(xorBlocks);
-block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
 }
@@ -411,23 +415,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
 if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
 {
-block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
 }
 
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
+_mm_storeu_si128(M128_CAST(outBlocks), block0);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
+_mm_storeu_si128(M128_CAST(outBlocks), block1);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
+_mm_storeu_si128(M128_CAST(outBlocks), block2);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
+_mm_storeu_si128(M128_CAST(outBlocks), block3);
 outBlocks += outIncrement;
 
 length -= 4*blockSize;
@@ -436,10 +440,10 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
 while (length >= blockSize)
 {
-__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
 if (flags & BlockTransformation::BT_XorInput)
-block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 
 if (flags & BlockTransformation::BT_InBlockIsCounter)
 const_cast<byte *>(inBlocks)[15]++;
@@ -447,9 +451,9 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
 func1(block, subkeys, static_cast<unsigned int>(rounds));
 if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
-block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
 
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block);
+_mm_storeu_si128(M128_CAST(outBlocks), block);
 inBlocks += inIncrement;
 outBlocks += outIncrement;
@@ -486,7 +490,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
 const word32 *ro = rcLE, *rc = rcLE;
 CRYPTOPP_UNUSED(ro);
 
-__m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keyLen-16));
+__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
 std::memcpy(rk, userKey, keyLen);
 
 // keySize: m_key allocates 4*(rounds+1) word32's.
@@ -543,16 +547,16 @@ void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds)
 // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
 vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds));
 #else
-std::swap(*(__m128i *)(void *)(key), *(__m128i *)(void *)(key+4*rounds));
+std::swap(*M128_CAST(key), *M128_CAST(key+4*rounds));
 #endif
 
 for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
 {
-temp = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
-*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+j));
-*(__m128i *)(void *)(key+j) = temp;
+temp = _mm_aesimc_si128(*M128_CAST(key+i));
+*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j));
+*M128_CAST(key+j) = temp;
 }
 
-*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
+*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i));
 }
 
 #endif // CRYPTOPP_AESNI_AVAILABLE
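The key-reversal hunk is the AES "Equivalent Inverse Cipher" setup: the decryption schedule is the encryption schedule walked backwards, with InvMixColumns (aesimc) applied to every round key except the first and last. A self-contained sketch of that shape (my own demo, assuming word32 is unsigned int and a fully expanded schedule of 4*(rounds+1) words; build with AES-NI enabled, e.g. -maes):

    #include <wmmintrin.h>   // AES-NI: _mm_aesimc_si128

    #define M128_CAST(x) ((__m128i *)(void *)(x))

    void reverse_keys_for_decrypt(unsigned int *key, unsigned int rounds)
    {
        __m128i temp;
        unsigned int i, j;

        // First and last round keys swap places and are used unmodified.
        temp = *M128_CAST(key);
        *M128_CAST(key) = *M128_CAST(key + 4*rounds);
        *M128_CAST(key + 4*rounds) = temp;

        // Inner round keys swap pairwise, each passing through InvMixColumns.
        for (i = 4, j = 4*rounds - 4; i < j; i += 4, j -= 4)
        {
            temp = _mm_aesimc_si128(*M128_CAST(key + i));
            *M128_CAST(key + i) = _mm_aesimc_si128(*M128_CAST(key + j));
            *M128_CAST(key + j) = temp;
        }

        // Middle round key (when i meets j) is transformed in place.
        *M128_CAST(key + i) = _mm_aesimc_si128(*M128_CAST(key + i));
    }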