Fix unaligned load for _mm_loaddup_pd with GCC and UBsan
parent ae445c0b0f
commit 3fff9e85df
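The issue: `_mm_loaddup_pd` takes a `double const*`, and the SSE code obtained one by `reinterpret_cast` from the `word64*` subkey table. GCC then assumes the pointer carries `alignof(double)` alignment, so `-fsanitize=undefined` reports a misaligned load whenever a subkey is not 8-byte aligned. The fix below casts through a `double` typedef declared `__attribute__((__aligned__(1)))`, which tells GCC the pointee may sit at any byte address (see the spinics.net link in the new comment). A minimal standalone sketch of the pattern, built with something like `g++ -msse3 -fsanitize=undefined`; the names `broadcast64`, `buf`, and `key` are illustrative, not from the patch:

#include <pmmintrin.h>  // SSE3: _mm_loaddup_pd
#include <cstdint>

// GCC/Clang: a double that may live at any byte address.
typedef double __attribute__((__aligned__(1))) double_cu_demo;

// Broadcast a 64-bit word into both lanes of an __m128i. Casting through
// double_cu_demo* drops the 8-byte alignment promise, so the sanitizer
// has nothing to flag even when p is unaligned.
static inline __m128i broadcast64(const std::uint64_t* p)
{
    return _mm_castpd_si128(
        _mm_loaddup_pd((const double_cu_demo *)(const void *)(p)));
}

int main()
{
    alignas(16) unsigned char buf[24] = {0};
    // A deliberately misaligned 64-bit word, as a key table entry might be.
    const std::uint64_t* key = reinterpret_cast<const std::uint64_t*>(buf + 4);
    const __m128i rk = broadcast64(key);
    (void)rk;
    return 0;
}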
@@ -583,6 +583,24 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
+
 inline void Swap128(__m128i& a,__m128i& b)
 {
 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
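These helpers mirror the M128_CAST/CONST_M128_CAST macros directly above them: on GCC 4.9 and later the casts go through `double_u`/`double_cu`, whose `aligned(1)` attribute removes the alignment promise; other compilers get plain `double` casts. At the call sites rewritten below, the effect is that `_mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))` no longer asserts 8-byte alignment to the sanitizer. A compilable sketch of that call-site shape, with `std::uint64_t` standing in for Crypto++'s `word64`:

#include <pmmintrin.h>  // SSE3: _mm_loaddup_pd
#include <cstdint>

typedef double __attribute__((__aligned__(1))) double_cu;
#define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))

// Broadcast the 64-bit round key at subkeys[i] into both lanes of an
// __m128i -- the shape used by every rewritten call site in this commit.
inline __m128i LoadKey(const std::uint64_t* subkeys, int i)
{
    return _mm_castpd_si128(_mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
}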
@@ -660,18 +678,18 @@ inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
     }
 
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+rounds-1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));
 
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         Swap128(x1, y1);
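A detail worth calling out: the detour through `double` is only a load trick. `movddup`, the instruction behind `_mm_loaddup_pd`, copies 64 bits into both lanes without performing floating-point arithmetic, so a key whose bits happen to spell a NaN comes through untouched. A small self-check under that assumption (standalone, not from the patch; build with -msse3):

#include <pmmintrin.h>
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    // This bit pattern is a quiet NaN when viewed as a double.
    const std::uint64_t key = 0xFFF8000000000000ULL;

    const __m128i rk = _mm_castpd_si128(
        _mm_loaddup_pd(reinterpret_cast<const double*>(&key)));

    // Both 64-bit lanes must hold the original bit pattern.
    std::uint64_t lanes[2];
    std::memcpy(lanes, &rk, sizeof(lanes));
    assert(lanes[0] == key && lanes[1] == key);
    return 0;
}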
@@ -710,13 +728,13 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
@@ -725,7 +743,7 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
@@ -764,7 +782,7 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -774,11 +792,11 @@ inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *s
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
     }
 
@@ -815,7 +833,7 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
    {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
 
         Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
         y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
@@ -827,13 +845,13 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
     {
         const __m128i rk1 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
         x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
         x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
         x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
 
         const __m128i rk2 = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
         y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
@@ -157,7 +157,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
 
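The remaining hunks apply the same treatment to the SPECK code: the NEON decrement loops (this one and the three that follow) trade a `size_t` countdown for a signed index, and the SSE hunks further below adopt CONST_DOUBLE_CAST. The old loop form relied on `i` wrapping past zero and on `static_cast<int>` of a huge unsigned value coming out negative, which is implementation-defined before C++20; the signed index expresses the countdown directly. A minimal illustration of the two forms (standalone, not from the patch):

#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t rounds = 3;

    // Old form: when i reaches 0, --i wraps to SIZE_MAX; the exit test then
    // depends on static_cast<int>(SIZE_MAX) being negative, which is
    // implementation-defined before C++20.
    for (std::size_t i = rounds-1; static_cast<int>(i) >= 0; --i)
        std::printf("old: %zu\n", i);

    // New form: a plain signed countdown; no wraparound involved.
    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
        std::printf("new: %d\n", i);

    return 0;
}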
@@ -249,7 +249,7 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
 
@@ -458,7 +458,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
 
     x1 = Shuffle64(x1); y1 = Shuffle64(y1);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
 
@@ -495,7 +495,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     x2 = Shuffle64(x2); y2 = Shuffle64(y2);
     x3 = Shuffle64(x3); y3 = Shuffle64(y3);
 
-    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
 
@@ -543,6 +543,24 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 #endif
 
+// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
+#ifndef DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_u;
+# define DOUBLE_CAST(x) ((double_u *)(void *)(x))
+# else
+# define DOUBLE_CAST(x) ((double *)(void *)(x))
+# endif
+#endif
+#ifndef CONST_DOUBLE_CAST
+# if (CRYPTOPP_GCC_VERSION >= 40900)
+typedef double __attribute__((__aligned__(1))) double_cu;
+# define CONST_DOUBLE_CAST(x) ((const double_cu *)(const void *)(x))
+# else
+# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
+# endif
+#endif
+
 #if defined(CRYPTOPP_AVX512_ROTATE)
 template <unsigned int R>
 inline __m128i RotateLeft64(const __m128i& val)
@@ -605,7 +623,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x1 = _mm_add_epi64(x1, y1);
@@ -648,7 +666,7 @@ inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i=0; i < static_cast<int>(rounds); ++i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         x1 = RotateRight64<8>(x1);
         x2 = RotateRight64<8>(x2);
@@ -697,10 +715,10 @@ inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
     x1 = _mm_shuffle_epi8(x1, mask);
     y1 = _mm_shuffle_epi8(y1, mask);
 
-    for (int i = static_cast<int>(rounds-1); i >=0 ; --i)
+    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y1 = RotateRight64<3>(y1);
@@ -743,7 +761,7 @@ inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     for (int i = static_cast<int>(rounds-1); i >= 0; --i)
     {
         const __m128i rk = _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
 
         y1 = _mm_xor_si128(y1, x1);
         y2 = _mm_xor_si128(y2, x2);