Fix unaligned load for _mm_loaddup_pd with GCC and UBsan

pull/552/head
Jeffrey Walton 2017-12-26 12:41:04 -05:00
parent ae445c0b0f
commit 3fff9e85df
GPG Key ID: B36AB348921B1838
2 changed files with 57 additions and 21 deletions


@@ -583,6 +583,24 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
 
 inline void Swap128(__m128i& a,__m128i& b)
 {
 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
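The added block implements the trick from the gcc-help thread cited in the comment: GCC 4.9 and later honor an alignment attribute on a scalar typedef, so a dereference through a double __attribute__((__aligned__(1))) pointer is no longer assumed to be 8-byte aligned, and UBsan stops flagging the load behind _mm_loaddup_pd. On x86 the underlying movddup has no alignment requirement, so only the compiler's assumption changes, not the instruction. A minimal standalone sketch of the same idea (the helper name LoadKey128 and the word64 typedef here are illustrative, not part of the patch; assumes GCC with SSE3 enabled):

    #include <pmmintrin.h>   // SSE3: _mm_loaddup_pd
    #include <stdint.h>

    typedef uint64_t word64;  // illustrative stand-in for Crypto++'s word64

    // Alignment-1 typedef: GCC may not assume the pointee is 8-byte aligned.
    typedef double __attribute__((__aligned__(1))) double_cu;
    #define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))

    // Broadcast the 64-bit subkey at p into both lanes of an __m128i.
    inline __m128i LoadKey128(const word64* p)
    {
        return _mm_castpd_si128(_mm_loaddup_pd(CONST_DOUBLE_CAST(p)));
    }

The intermediate const void * cast, as in the macros above, also sidesteps GCC's cast-align warning.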
@@ -660,18 +678,18 @@ inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
     }
 
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+rounds-1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));
 
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         Swap128(x1, y1);
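Each _mm_loaddup_pd above reads a single 64-bit subkey and duplicates it into both lanes, so one __m128i round key covers both 64-bit halves processed in parallel. The same broadcast can be written in the integer domain without any pointer casting; a sketch for comparison (illustrative only, not what the patch uses):

    #include <emmintrin.h>   // SSE2: _mm_set1_epi64x
    #include <stdint.h>

    // Both 64-bit lanes of the result hold the subkey value k.
    inline __m128i BroadcastKey(uint64_t k)
    {
        return _mm_set1_epi64x((long long)k);
    }

Because _mm_set1_epi64x takes a value rather than a pointer, there is no aliasing or alignment question to satisfy; the patch presumably keeps _mm_loaddup_pd because it maps to a single movddup with a memory operand.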
@@ -710,13 +728,13 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
@@ -725,7 +743,7 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
@@ -764,7 +782,7 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     if (rounds & 1)
    {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -774,11 +792,11 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
     }
@@ -815,7 +833,7 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -827,13 +845,13 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);


@@ -157,7 +157,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
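This hunk, and the three that follow, rewrite the count-down loops in the SPECK decryption paths. With an unsigned size_t counter, the final --i wraps around to SIZE_MAX, and the exit test then hinges on converting that out-of-range value to int, which is implementation-defined before C++20. Counting down with a signed int keeps every intermediate value in range. A reduced sketch of the two forms (function and parameter names are illustrative; assumes rounds is at least 1 and fits in an int):

    #include <stddef.h>

    void ProcessSubkeys(const unsigned* subkeys, size_t rounds)
    {
        // Old form: after the i == 0 iteration, --i wraps to SIZE_MAX, and
        // static_cast<int>(SIZE_MAX) is implementation-defined before C++20.
        //
        //   for (size_t i = rounds-1; static_cast<int>(i) >= 0; --i)

        // New form: the signed counter simply reaches -1.
        for (int i = static_cast<int>(rounds-1); i >= 0; --i)
        {
            (void)subkeys[i];   // apply the round key here
        }
    }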
@@ -249,7 +249,7 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
@@ -458,7 +458,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
     x1 = Shuffle64(x1); y1 = Shuffle64(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
@@ -495,7 +495,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     x2 = Shuffle64(x2); y2 = Shuffle64(y2);
     x3 = Shuffle64(x3); y3 = Shuffle64(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
@@ -543,6 +543,24 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
 
 #if defined(CRYPTOPP_AVX512_ROTATE)
 template <unsigned int R>
 inline __m128i RotateLeft64(const __m128i& val)
@@ -605,7 +623,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x1 = _mm_add_epi64(x1, y1);
@@ -648,7 +666,7 @@ inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x2 = RotateRight64<8>(x2);
@@ -697,10 +715,10 @@ inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
     x1 = _mm_shuffle_epi8(x1, mask);
     y1 = _mm_shuffle_epi8(y1, mask);
 
-    for (int i = static_cast<int>(rounds-1); i >=0 ; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y1 = RotateRight64<3>(y1);
@@ -743,7 +761,7 @@ inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y2 = _mm_xor_si128(y2, x2);