Cleanup casts due to Clang

pull/461/head
Jeffrey Walton 2017-08-13 06:32:09 -04:00
parent f02bf91ee5
commit 863bf9133c
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 103 additions and 91 deletions

blake2.cpp

@@ -35,6 +35,10 @@ inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
 # define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b)
 #endif
+// Clang casts
+#define M128I_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
 // C/C++ implementation
 static void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
 static void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
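A quick illustration of why the new macros route through void*: Clang's -Wcast-align fires when a cast increases the required alignment (for example, word32* to __m128i*), but a cast that passes through void* carries no alignment expectation and draws no diagnostic. A minimal sketch, assuming Clang with -Wcast-align enabled; LoadState is a hypothetical helper, not a library function.

#include <emmintrin.h>

typedef unsigned int word32;
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

__m128i LoadState(const word32 *h)
{
    // Direct cast: Clang warns that required alignment jumps from 4 to 16.
    // return _mm_loadu_si128((const __m128i *)h);

    // Through void*: the same unaligned load, no -Wcast-align diagnostic.
    return _mm_loadu_si128(CONST_M128I_CAST(h));
}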
@@ -626,10 +630,10 @@ static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false
 __m128i buf1,buf2,buf3,buf4;
 __m128i ff0,ff1;
-row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
+row1 = ff0 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0]));
+row2 = ff1 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4]));
 row3 = _mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3));
-row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
+row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128(CONST_M128I_CAST(&state.t[0])));
 buf1 = _mm_set_epi32(m6,m4,m2,m0);
 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
 row4 = _mm_xor_si128(row4,row1);
@@ -1030,8 +1034,8 @@ static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false
 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
+_mm_storeu_si128(M128I_CAST(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
+_mm_storeu_si128(M128I_CAST(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
 }
 # if (__SUNPRO_CC != 0x5120)
@@ -1045,14 +1049,14 @@ static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>
 __m128i row3l, row3h, row4l, row4h;
 __m128i b0, b1, t0, t1;
-row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
-row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
-row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
-row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
-row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
-row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
-row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
+row1l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0]));
+row1h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[2]));
+row2l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4]));
+row2h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[6]));
+row3l = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(0)));
+row3h = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(2)));
+row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(4))), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0])));
+row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(6))), _mm_loadu_si128(CONST_M128I_CAST(&state.f[0])));
 b0 = MM_SET_EPI64X(m2, m0);
 b1 = MM_SET_EPI64X(m6, m4);
@@ -1918,13 +1922,13 @@ static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>
 row1l = _mm_xor_si128(row3l, row1l);
 row1h = _mm_xor_si128(row3h, row1h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
+_mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[0])), row1l));
+_mm_storeu_si128(M128I_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[2])), row1h));
 row2l = _mm_xor_si128(row4l, row2l);
 row2h = _mm_xor_si128(row4h, row2h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
+_mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[4])), row2l));
+_mm_storeu_si128(M128I_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[6])), row2h));
 }
 # endif // (__SUNPRO_CC != 0x5120)
 #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
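The pattern in the hunks above recurs throughout the BLAKE2 compress functions: state words are loaded with the unaligned-load intrinsic, mixed in registers, and stored back unaligned. A minimal sketch of that round trip, assuming the state words form a plain word32[8] with no alignment guarantee; the helper name is illustrative only.

#include <emmintrin.h>

typedef unsigned int word32;
#define M128I_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

void XorIntoState(word32 h[8], __m128i lo, __m128i hi)
{
    // Load h[0..3] and h[4..7], fold in the new values, store back.
    const __m128i h0 = _mm_loadu_si128(CONST_M128I_CAST(&h[0]));
    const __m128i h1 = _mm_loadu_si128(CONST_M128I_CAST(&h[4]));
    _mm_storeu_si128(M128I_CAST(&h[0]), _mm_xor_si128(h0, lo));
    _mm_storeu_si128(M128I_CAST(&h[4]), _mm_xor_si128(h1, hi));
}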
@@ -1941,15 +1945,15 @@ static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false
 const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
 const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
-const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
-const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
-const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
+const __m128i m0 = _mm_loadu_si128(CONST_M128I_CAST(input + 00));
+const __m128i m1 = _mm_loadu_si128(CONST_M128I_CAST(input + 16));
+const __m128i m2 = _mm_loadu_si128(CONST_M128I_CAST(input + 32));
+const __m128i m3 = _mm_loadu_si128(CONST_M128I_CAST(input + 48));
-row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
+row1 = ff0 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0]));
+row2 = ff1 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4]));
 row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3));
-row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
+row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0])));
 buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
@@ -2481,8 +2485,8 @@ static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false
 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
+_mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
+_mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
 }
 static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
@@ -2496,23 +2500,23 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>
 const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
-const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
-const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
-const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
-const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
-const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
-const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
-const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
-const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
+const __m128i m0 = _mm_loadu_si128(CONST_M128I_CAST(input + 00));
+const __m128i m1 = _mm_loadu_si128(CONST_M128I_CAST(input + 16));
+const __m128i m2 = _mm_loadu_si128(CONST_M128I_CAST(input + 32));
+const __m128i m3 = _mm_loadu_si128(CONST_M128I_CAST(input + 48));
+const __m128i m4 = _mm_loadu_si128(CONST_M128I_CAST(input + 64));
+const __m128i m5 = _mm_loadu_si128(CONST_M128I_CAST(input + 80));
+const __m128i m6 = _mm_loadu_si128(CONST_M128I_CAST(input + 96));
+const __m128i m7 = _mm_loadu_si128(CONST_M128I_CAST(input + 112));
-row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
-row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
-row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
-row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
-row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
-row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
-row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
-row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
+row1l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0]));
+row1h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[2]));
+row2l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4]));
+row2h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[6]));
+row3l = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(0)));
+row3h = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(2)));
+row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(4))), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0])));
+row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(6))), _mm_loadu_si128(CONST_M128I_CAST(&state.f[0])));
 b0 = _mm_unpacklo_epi64(m0, m1);
 b1 = _mm_unpacklo_epi64(m2, m3);
@@ -3451,13 +3455,13 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>
 row1l = _mm_xor_si128(row3l, row1l);
 row1h = _mm_xor_si128(row3h, row1h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
+_mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[0])), row1l));
+_mm_storeu_si128(M128I_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[2])), row1h));
 row2l = _mm_xor_si128(row4l, row2l);
 row2h = _mm_xor_si128(row4h, row2h);
-_mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
-_mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
+_mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[4])), row2l));
+_mm_storeu_si128(M128I_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[6])), row2h));
 }
 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE

gcm.cpp

@@ -27,6 +27,10 @@
 # undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
 #endif
+// Clang casts
+#define M128I_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
 #include "gcm.h"
 #include "cpu.h"
@@ -199,12 +203,12 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
 // SunCC 5.14 crash (bewildering since asserts are not in effect in release builds)
 // Also see http://github.com/weidai11/cryptopp/issues/226 and http://github.com/weidai11/cryptopp/issues/284
 # if __SUNPRO_CC
-*(__m128i *)(void *)a = _mm_xor_si128(*(__m128i *)(void *)b, *(__m128i *)(void *)c);
+*M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c));
 # elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
 CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<__m128i>()));
 CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<__m128i>()));
 CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<__m128i>()));
-*(__m128i *)(void *)a = _mm_xor_si128(*(__m128i *)(void *)b, *(__m128i *)(void *)c);
+*M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c));
 # else
 asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0]));
 # endif
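SSE2_Xor16 dereferences __m128i pointers directly rather than using the load/store intrinsics, which is only defined behavior when all three pointers are 16-byte aligned (hence the asserts). A standalone sketch under that same alignment assumption; Xor16 is a hypothetical name, and the const operands use the const-preserving macro rather than casting constness away as the in-tree code does.

#include <emmintrin.h>

typedef unsigned char byte;
#define M128I_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

void Xor16(byte *a, const byte *b, const byte *c)
{
    // a = b ^ c over 16 bytes; each dereference compiles to an aligned access.
    *M128I_CAST(a) = _mm_xor_si128(*CONST_M128I_CAST(b), *CONST_M128I_CAST(c));
}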
@@ -237,7 +241,7 @@ static const word64 s_clmulConstants64[] = {
 W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
 W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
-static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
+static const __m128i *s_clmulConstants = CONST_M128I_CAST(s_clmulConstants64);
 static const unsigned int s_clmulTableSizeInBlocks = 8;
 inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
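s_clmulConstants overlays __m128i elements onto the word64 table, so each 128-bit constant spans two adjacent 64-bit entries. A sketch of the overlay with hypothetical names; alignas(16) stands in for whatever alignment attribute the real table carries, since dereferencing the elements directly requires 16-byte alignment.

#include <emmintrin.h>

typedef unsigned long long word64;
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

alignas(16) static const word64 table64[4] = { 1, 2, 3, 4 };
static const __m128i *table128 = CONST_M128I_CAST(table64);
// table128[0] packs {1, 2} and table128[1] packs {3, 4}; on a little-endian
// target the first word64 becomes the low quadword of the vector.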
@@ -369,16 +373,16 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
 if (HasCLMUL())
 {
 const __m128i r = s_clmulConstants[0];
-__m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]);
+__m128i h0 = _mm_shuffle_epi8(_mm_load_si128(M128I_CAST(hashKey)), s_clmulConstants[1]);
 __m128i h = h0;
 for (i=0; i<tableSize; i+=32)
 {
 __m128i h1 = CLMUL_GF_Mul(h, h0, r);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
-_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
-_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
+_mm_storel_epi64(M128I_CAST(mulTable+i), h);
+_mm_storeu_si128(M128I_CAST(mulTable+i+16), h1);
+_mm_storeu_si128(M128I_CAST(mulTable+i+8), h);
+_mm_storel_epi64(M128I_CAST(mulTable+i+8), h1);
 h = CLMUL_GF_Mul(h1, h0, r);
 }
@@ -517,7 +521,7 @@ inline void GCM_Base::ReverseHashBufferIfNeeded()
 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
 if (HasCLMUL())
 {
-__m128i &x = *(__m128i *)(void *)HashBuffer();
+__m128i &x = *M128I_CAST(HashBuffer());
 x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
 }
 #elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
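s_clmulConstants[1] is built from the byte values 0x0f down to 0x00, so the _mm_shuffle_epi8 above reverses all 16 bytes of the hash buffer, which is what the byte-order swap the function name promises amounts to. An equivalent standalone sketch (SSSE3 required); the function name is illustrative.

#include <tmmintrin.h>

__m128i ByteReverse128(__m128i x)
{
    // Mask byte i holds 15-i, so result byte i = source byte 15-i.
    const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    return _mm_shuffle_epi8(x, mask);
}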
@@ -608,14 +612,14 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
 if (HasCLMUL())
 {
-const __m128i *mulTable = (const __m128i *)(const void *)MulTable();
-__m128i x = _mm_load_si128((__m128i *)(void *)HashBuffer());
+const __m128i *mulTable = CONST_M128I_CAST(MulTable());
+__m128i x = _mm_load_si128(M128I_CAST(HashBuffer()));
 const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
 while (len >= 16)
 {
 size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
-__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);
+__m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-1)*16)), mask2);
 __m128i c0 = _mm_setzero_si128();
 __m128i c1 = _mm_setzero_si128();
 __m128i c2 = _mm_setzero_si128();
@@ -628,7 +632,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 if (++i == s)
 {
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1);
 d1 = _mm_xor_si128(d1, x);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
@@ -637,7 +641,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 break;
 }
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask2);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1));
 d2 = _mm_xor_si128(d2, d1);
@@ -645,7 +649,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 if (++i == s)
 {
-d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
+d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1);
 d1 = _mm_xor_si128(d1, x);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11));
@@ -654,7 +658,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 break;
 }
-d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1);
+d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask1);
 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10));
 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
 d1 = _mm_xor_si128(d1, d2);
@@ -667,7 +671,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
 x = CLMUL_Reduce(c0, c1, c2, r);
 }
-_mm_store_si128((__m128i *)(void *)HashBuffer(), x);
+_mm_store_si128(M128I_CAST(HashBuffer()), x);
 return len;
 }
 #elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE

rijndael.cpp

@@ -95,6 +95,10 @@ static void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byt
 # define MAYBE_CONST const
 #endif
+// Clang casts
+#define M128I_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
@@ -244,7 +248,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 const word32 *ro = rcLE, *rc = rcLE;
 CRYPTOPP_UNUSED(ro);
-__m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
+__m128i temp = _mm_loadu_si128(M128I_CAST(userKey+keylen-16));
 memcpy(rk, userKey, keylen);
 while (true)
@@ -300,16 +304,16 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
 vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
 #else
-std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
+std::swap(*M128I_CAST(rk), *M128I_CAST(rk+4*m_rounds));
 #endif
 for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
 {
-temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
-*(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
-*(__m128i *)(void *)(rk+j) = temp;
+temp = _mm_aesimc_si128(*M128I_CAST(rk+i));
+*M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+j));
+*M128I_CAST(rk+j) = temp;
 }
-*(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
+*M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+i));
 }
 return;
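The loop above converts an AES-NI encryption key schedule into a decryption schedule: the first and last round keys are swapped, and every interior key gets InvMixColumns via _mm_aesimc_si128, walking inward from both ends until the middle key is handled once. A standalone sketch of the same inversion, assuming rk points to 16-byte-aligned round keys of 4*(rounds+1) word32 each; the function name is hypothetical.

#include <utility>
#include <wmmintrin.h>

typedef unsigned int word32;
#define M128I_CAST(x) ((__m128i *)(void *)(x))

void InvertKeySchedule(word32 *rk, unsigned int rounds)
{
    std::swap(*M128I_CAST(rk), *M128I_CAST(rk + 4*rounds));
    unsigned int i, j;
    for (i = 4, j = 4*rounds - 4; i < j; i += 4, j -= 4)
    {
        const __m128i temp = _mm_aesimc_si128(*M128I_CAST(rk + i));
        *M128I_CAST(rk + i) = _mm_aesimc_si128(*M128I_CAST(rk + j));
        *M128I_CAST(rk + j) = temp;
    }
    *M128I_CAST(rk + i) = _mm_aesimc_si128(*M128I_CAST(rk + i));  // middle key
}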
@@ -1203,23 +1207,23 @@ inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128
 {
 while (length >= 4*blockSize)
 {
-__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
+__m128i block0 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)), block1, block2, block3;
 if (flags & BlockTransformation::BT_InBlockIsCounter)
 {
-const __m128i be1 = *(const __m128i *)(const void *)s_one;
+const __m128i be1 = *CONST_M128I_CAST(s_one);
 block1 = _mm_add_epi32(block0, be1);
 block2 = _mm_add_epi32(block1, be1);
 block3 = _mm_add_epi32(block2, be1);
-_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
+_mm_storeu_si128(M128I_CAST(inBlocks), _mm_add_epi32(block3, be1));
 }
 else
 {
 inBlocks += inIncrement;
-block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block1 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
 inBlocks += inIncrement;
-block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block2 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
 inBlocks += inIncrement;
-block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+block3 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
 inBlocks += inIncrement;
 }
@@ -1227,13 +1231,13 @@ inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128
 {
 // Coverity finding, appears to be false positive. Assert the condition.
 CRYPTOPP_ASSERT(xorBlocks);
-block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
 }
@@ -1241,23 +1245,23 @@ inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128
 if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
 {
-block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
-block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 xorBlocks += xorIncrement;
 }
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
+_mm_storeu_si128(M128I_CAST(outBlocks), block0);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
+_mm_storeu_si128(M128I_CAST(outBlocks), block1);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
+_mm_storeu_si128(M128I_CAST(outBlocks), block2);
 outBlocks += outIncrement;
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
+_mm_storeu_si128(M128I_CAST(outBlocks), block3);
 outBlocks += outIncrement;
 length -= 4*blockSize;
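In the counter-mode branch above, be1 comes from s_one; the diff does not show its definition, but for the arithmetic to match the single-block path (which does inBlocks[15]++), the constant must place a 1 in the top byte of the last 32-bit lane. A sketch under that assumption; the s_one layout here is assumed, not taken from the patch.

#include <emmintrin.h>

typedef unsigned int word32;
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

alignas(16) static const word32 s_one_assumed[4] = {0, 0, 0, 1u << 24};  // assumed layout

void NextCounters(__m128i ctr0, __m128i out[4])
{
    // Adding 1<<24 to the last little-endian lane increments byte 15 of the
    // block, i.e. the low-order byte of a big-endian counter. Like the
    // single-block path, this does not propagate a carry out of byte 15.
    const __m128i be1 = *CONST_M128I_CAST(s_one_assumed);
    out[0] = ctr0;
    out[1] = _mm_add_epi32(out[0], be1);
    out[2] = _mm_add_epi32(out[1], be1);
    out[3] = _mm_add_epi32(out[2], be1);
}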
@@ -1266,10 +1270,10 @@ inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128
 while (length >= blockSize)
 {
-__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
+__m128i block = _mm_loadu_si128(CONST_M128I_CAST(inBlocks));
 if (flags & BlockTransformation::BT_XorInput)
-block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
+block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
 if (flags & BlockTransformation::BT_InBlockIsCounter)
 const_cast<byte *>(inBlocks)[15]++;
@@ -1277,9 +1281,9 @@ inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128
 func1(block, subkeys, rounds);
 if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
-block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
-_mm_storeu_si128((__m128i *)(void *)outBlocks, block);
+block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks)));
+_mm_storeu_si128(M128I_CAST(outBlocks), block);
 inBlocks += inIncrement;
 outBlocks += outIncrement;