From 3fff9e85dfcaea7fd3e1aa31d41c2062bc44cd95 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Tue, 26 Dec 2017 12:41:04 -0500
Subject: [PATCH] Fix unaligned load for _mm_loaddup_pd with GCC and UBsan

---
 simon-simd.cpp | 42 ++++++++++++++++++++++++++++++------------
 speck-simd.cpp | 36 +++++++++++++++++++++++++++---------
 2 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/simon-simd.cpp b/simon-simd.cpp
index 8662acf7..e87e2996 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -583,6 +583,24 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+  typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+  typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
+
 inline void Swap128(__m128i& a,__m128i& b)
 {
 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
@@ -660,18 +678,18 @@ inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
     }
 
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+rounds-1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));
 
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         Swap128(x1, y1);
@@ -710,13 +728,13 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
@@ -725,7 +743,7 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
@@ -764,7 +782,7 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -774,11 +792,11 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
     }
 
@@ -815,7 +833,7 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -827,13 +845,13 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
diff --git a/speck-simd.cpp b/speck-simd.cpp
index bcd49604..c622f147 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -157,7 +157,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
 
@@ -249,7 +249,7 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
 
@@ -458,7 +458,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
 
     x1 = Shuffle64(x1); y1 = Shuffle64(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
 
@@ -495,7 +495,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     x2 = Shuffle64(x2); y2 = Shuffle64(y2);
     x3 = Shuffle64(x3); y3 = Shuffle64(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
 
@@ -543,6 +543,24 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+  typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+  typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
+
 #if defined(CRYPTOPP_AVX512_ROTATE)
 template <unsigned int R>
 inline __m128i RotateLeft64(const __m128i& val)
@@ -605,7 +623,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x1 = _mm_add_epi64(x1, y1);
@@ -648,7 +666,7 @@ inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x2 = RotateRight64<8>(x2);
@@ -697,10 +715,10 @@ inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
     x1 = _mm_shuffle_epi8(x1, mask);
     y1 = _mm_shuffle_epi8(y1, mask);
 
-    for (int i = static_cast<int>(rounds-1); i >=0 ; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y1 = RotateRight64<3>(y1);
@@ -743,7 +761,7 @@ inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y2 = _mm_xor_si128(y2, x2);
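
For reference, the cast technique the patch introduces can be exercised in a minimal
standalone sketch, assuming GCC 4.9 or later and an SSE3 target (compile with -msse3).
LoadKey below is a hypothetical helper name used only for illustration; it is not part
of this patch or of Crypto++. Plain double promises natural 8-byte alignment, so UBsan
flags _mm_loaddup_pd(reinterpret_cast<const double*>(p)) whenever the word64 subkey
pointer p is not 8-byte aligned; the __aligned__(1) typedef withdraws that promise, so
GCC emits an unaligned load and UBsan stays quiet.

// Sketch only: broadcast one 64-bit subkey into both lanes of an
// __m128i without tripping UBsan's alignment check under GCC.
#include <pmmintrin.h>   // SSE3: _mm_loaddup_pd
#include <stdint.h>

// A double that may live at any address; GCC >= 4.9 honors the
// reduced alignment declared on the typedef.
typedef double __attribute__((__aligned__(1))) double_u;
#define CONST_DOUBLE_CAST(x) ((const double_u *)(const void *)(x))

// Hypothetical helper (illustration only): duplicate the 8 bytes
// at subkey into both 64-bit lanes of the result.
static inline __m128i LoadKey(const uint64_t* subkey)
{
    // _mm_loaddup_pd reads one double and duplicates it into both
    // lanes; _mm_castpd_si128 reinterprets the bits as integers and
    // compiles to no instruction.
    return _mm_castpd_si128(_mm_loaddup_pd(CONST_DOUBLE_CAST(subkey)));
}

The intermediate (const void *) cast matches the CONST_M128_CAST pattern already in
these files and avoids -Wcast-align noise; the typedef trick follows the gcc-help
thread linked in the patch.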
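
Separately, the speck-simd.cpp hunks at -157, -249, -458 and -495 rewrite the countdown
loops with a signed counter. A size_t counter can never go negative, so the old
condition leaned on static_cast<int>(i) mapping the wrapped-around value SIZE_MAX to -1,
which is implementation-defined before C++20. The sketch below, again illustration only
and not part of the patch, shows the well-defined replacement.

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t rounds = 4;

    // Fixed form: a signed counter legitimately reaches -1 and the
    // comparison i >= 0 terminates the loop. Prints "3 2 1 0".
    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
        std::printf("%d ", i);
    std::printf("\n");

    // Old form, for contrast: i is unsigned, so after 0 it wraps to
    // SIZE_MAX and i itself is never negative; termination relied on
    // static_cast<int>(i) converting SIZE_MAX to a negative value.
    // for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) { ... }
    return 0;
}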