Cleanup __m128 casts due to Clang
parent
d8e56b8250
commit
5f441d28e5
|
|
@ -29,6 +29,10 @@
|
|||
# define EXCEPTION_EXECUTE_HANDLER 1
|
||||
#endif
|
||||
|
||||
// Clang __m128i casts: convert through void* before casting to an __m128i pointer
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008.
|
||||
|
|
@ -74,15 +78,15 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& stat
|
|||
const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
|
||||
const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
|
||||
|
||||
const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
|
||||
const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
|
||||
const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
|
||||
const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
|
||||
const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
|
||||
const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
|
||||
const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
|
||||
const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
|
||||
|
||||
row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
|
||||
row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
|
||||
row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
|
||||
row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
|
||||
row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]);
|
||||
row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
|
||||
row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
|
||||
buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
|
||||
|
||||
row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
|
||||
|
|
@ -614,8 +618,8 @@ void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& stat
|
|||
row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
|
||||
row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
|
||||
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
|
||||
}
|
||||
|
||||
void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state)
|
||||
|
|
@ -629,23 +633,23 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state
|
|||
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
|
||||
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
|
||||
|
||||
const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
|
||||
const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
|
||||
const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
|
||||
const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
|
||||
const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
|
||||
const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
|
||||
const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
|
||||
const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
|
||||
const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
|
||||
const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
|
||||
const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
|
||||
const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
|
||||
const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64));
|
||||
const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80));
|
||||
const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96));
|
||||
const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112));
|
||||
|
||||
row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
|
||||
row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
|
||||
row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
|
||||
row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
|
||||
row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[0]));
|
||||
row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[2]));
|
||||
row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[4])), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
|
||||
row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV[6])), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
|
||||
row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
|
||||
row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2]));
|
||||
row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
|
||||
row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6]));
|
||||
row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0]));
|
||||
row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2]));
|
||||
row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
|
||||
row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0])));
|
||||
|
||||
b0 = _mm_unpacklo_epi64(m0, m1);
|
||||
b1 = _mm_unpacklo_epi64(m2, m3);
|
||||
|
|
@ -1584,13 +1588,13 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state
|
|||
|
||||
row1l = _mm_xor_si128(row3l, row1l);
|
||||
row1h = _mm_xor_si128(row3h, row1h);
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h));
|
||||
|
||||
row2l = _mm_xor_si128(row4l, row2l);
|
||||
row2h = _mm_xor_si128(row4h, row2h);
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
|
||||
_mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l));
|
||||
_mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h));
|
||||
}
|
||||
#endif // CRYPTOPP_SSE42_AVAILABLE
|
||||
|
||||
|
|
|
|||
48
gcm-simd.cpp
48
gcm-simd.cpp
|
|
@ -50,6 +50,10 @@
|
|||
# define EXCEPTION_EXECUTE_HANDLER 1
|
||||
#endif
|
||||
|
||||
// Clang __m128i casts: convert through void* before casting to an __m128i pointer
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
ANONYMOUS_NAMESPACE_BEGIN
|
||||
|
||||
// GCC 4.8 is missing PMULL gear
|
||||
|
|
@ -438,7 +442,7 @@ const word64 s_clmulConstants64[] = {
|
|||
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
|
||||
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
|
||||
|
||||
const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
|
||||
const __m128i *s_clmulConstants = CONST_M128_CAST(s_clmulConstants64);
|
||||
const unsigned int s_cltableSizeInBlocks = 8;
|
||||
|
||||
ANONYMOUS_NAMESPACE_END
|
||||
|
|
@ -497,11 +501,7 @@ __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
|
|||
c2t ^= c1b
|
||||
shift c2 left 1 bit and xor in lowest bit of c1t
|
||||
*/
|
||||
#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
|
||||
c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
|
||||
#else
|
||||
c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
|
||||
#endif
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
|
||||
c0 = _mm_srli_si128(c0, 8);
|
||||
c0 = _mm_xor_si128(c0, c1);
|
||||
|
|
@ -527,37 +527,37 @@ __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
|
|||
void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
|
||||
{
|
||||
const __m128i r = s_clmulConstants[0];
|
||||
const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)(const void *)hashKey), s_clmulConstants[1]);
|
||||
const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), s_clmulConstants[1]);
|
||||
|
||||
__m128i h = h0;
|
||||
unsigned int i;
|
||||
for (i=0; i<tableSize-32; i+=32)
|
||||
{
|
||||
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
|
||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
|
||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
|
||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
|
||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
|
||||
_mm_storel_epi64(M128_CAST(mulTable+i), h);
|
||||
_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
|
||||
_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
|
||||
_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
|
||||
h = GCM_Multiply_CLMUL(h1, h0, r);
|
||||
}
|
||||
|
||||
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
|
||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
|
||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
|
||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
|
||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
|
||||
_mm_storel_epi64(M128_CAST(mulTable+i), h);
|
||||
_mm_storeu_si128(M128_CAST(mulTable+i+16), h1);
|
||||
_mm_storeu_si128(M128_CAST(mulTable+i+8), h);
|
||||
_mm_storel_epi64(M128_CAST(mulTable+i+8), h1);
|
||||
}
|
||||
|
||||
size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||
{
|
||||
const __m128i *table = (const __m128i *)(const void *)mtable;
|
||||
__m128i x = _mm_load_si128((__m128i *)(void *)hbuffer);
|
||||
const __m128i *table = CONST_M128_CAST(mtable);
|
||||
__m128i x = _mm_load_si128(M128_CAST(hbuffer));
|
||||
const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0;
|
||||
__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);
|
||||
__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-1)*16)), mask2);
|
||||
__m128i c0 = _mm_setzero_si128();
|
||||
__m128i c1 = _mm_setzero_si128();
|
||||
__m128i c2 = _mm_setzero_si128();
|
||||
|
|
@ -570,7 +570,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
|||
|
||||
if (++i == s)
|
||||
{
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
|
||||
d1 = _mm_xor_si128(d1, x);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
|
||||
|
|
@ -579,7 +579,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
|||
break;
|
||||
}
|
||||
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2);
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask2);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
|
||||
d2 = _mm_xor_si128(d2, d1);
|
||||
|
|
@ -587,7 +587,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
|||
|
||||
if (++i == s)
|
||||
{
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
|
||||
d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1);
|
||||
d1 = _mm_xor_si128(d1, x);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11));
|
||||
|
|
@ -596,7 +596,7 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
|||
break;
|
||||
}
|
||||
|
||||
d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1);
|
||||
d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask1);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
|
||||
d1 = _mm_xor_si128(d1, d2);
|
||||
|
|
@ -609,15 +609,15 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
|
|||
x = GCM_Reduce_CLMUL(c0, c1, c2, r);
|
||||
}
|
||||
|
||||
_mm_store_si128((__m128i *)(void *)hbuffer, x);
|
||||
_mm_store_si128(M128_CAST(hbuffer), x);
|
||||
return len;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_SSSE3_AVAILABLE
|
||||
#if CRYPTOPP_CLMUL_AVAILABLE
|
||||
void GCM_ReverseHashBufferIfNeeded_SSSE3(byte *hashBuffer)
|
||||
{
|
||||
__m128i &x = *(__m128i *)(void *)hashBuffer;
|
||||
__m128i &x = *M128_CAST(hashBuffer);
|
||||
x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -70,6 +70,10 @@
|
|||
# define MAYBE_CONST const
|
||||
#endif
|
||||
|
||||
// Clang __m128i casts: convert through void* before casting to an __m128i pointer
|
||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
|
||||
|
|
@ -373,23 +377,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
|
|||
{
|
||||
while (length >= 4*blockSize)
|
||||
{
|
||||
__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
|
||||
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3;
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
{
|
||||
const __m128i be1 = *(const __m128i *)(const void *)s_one;
|
||||
const __m128i be1 = *CONST_M128_CAST(s_one);
|
||||
block1 = _mm_add_epi32(block0, be1);
|
||||
block2 = _mm_add_epi32(block1, be1);
|
||||
block3 = _mm_add_epi32(block2, be1);
|
||||
_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
|
||||
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
|
||||
}
|
||||
else
|
||||
{
|
||||
inBlocks += inIncrement;
|
||||
block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
|
||||
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
|
||||
block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
|
||||
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
}
|
||||
|
||||
|
|
@ -397,13 +401,13 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
|
|||
{
|
||||
// Coverity finding, appears to be false positive. Assert the condition.
|
||||
CRYPTOPP_ASSERT(xorBlocks);
|
||||
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
}
|
||||
|
||||
|
|
@ -411,23 +415,23 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
|
|||
|
||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||
{
|
||||
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
}
|
||||
|
||||
_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
|
||||
_mm_storeu_si128(M128_CAST(outBlocks), block0);
|
||||
outBlocks += outIncrement;
|
||||
_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
|
||||
_mm_storeu_si128(M128_CAST(outBlocks), block1);
|
||||
outBlocks += outIncrement;
|
||||
_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
|
||||
_mm_storeu_si128(M128_CAST(outBlocks), block2);
|
||||
outBlocks += outIncrement;
|
||||
_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
|
||||
_mm_storeu_si128(M128_CAST(outBlocks), block3);
|
||||
outBlocks += outIncrement;
|
||||
|
||||
length -= 4*blockSize;
|
||||
|
|
@ -436,10 +440,10 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
|
|||
|
||||
while (length >= blockSize)
|
||||
{
|
||||
__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
|
||||
__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
|
||||
|
||||
if (flags & BlockTransformation::BT_XorInput)
|
||||
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
|
||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||
const_cast<byte *>(inBlocks)[15]++;
|
||||
|
|
@ -447,9 +451,9 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
|
|||
func1(block, subkeys, static_cast<unsigned int>(rounds));
|
||||
|
||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
|
||||
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
|
||||
|
||||
_mm_storeu_si128((__m128i *)(void *)outBlocks, block);
|
||||
_mm_storeu_si128(M128_CAST(outBlocks), block);
|
||||
|
||||
inBlocks += inIncrement;
|
||||
outBlocks += outIncrement;
|
||||
|
|
@ -486,7 +490,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
|
|||
const word32 *ro = rcLE, *rc = rcLE;
|
||||
CRYPTOPP_UNUSED(ro);
|
||||
|
||||
__m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keyLen-16));
|
||||
__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
|
||||
std::memcpy(rk, userKey, keyLen);
|
||||
|
||||
// keySize: m_key allocates 4*(rounds+1) word32's.
|
||||
|
|
@ -543,16 +547,16 @@ void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds)
|
|||
// SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
|
||||
vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds));
|
||||
#else
|
||||
std::swap(*(__m128i *)(void *)(key), *(__m128i *)(void *)(key+4*rounds));
|
||||
std::swap(*M128_CAST(key), *M128_CAST(key+4*rounds));
|
||||
#endif
|
||||
for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
|
||||
{
|
||||
temp = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
|
||||
*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+j));
|
||||
*(__m128i *)(void *)(key+j) = temp;
|
||||
temp = _mm_aesimc_si128(*M128_CAST(key+i));
|
||||
*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j));
|
||||
*M128_CAST(key+j) = temp;
|
||||
}
|
||||
|
||||
*(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+i));
|
||||
*M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i));
|
||||
}
|
||||
#endif // CRYPTOPP_AESNI_AVAILABLE
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue