Rearrange statements and avoid intermediates

The folding of statements helps GCC eliminate some of the intermediate stores it was performing. The elimination saved about 1.0 cpb. SIMON-128 now runs at around 10 cpb, which is still well off the Simon and Speck team's figure of 3.5 cpb.
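
For context, the rearrangement folds each pair of dependent XORs into a single expression so the running value stays in a register instead of being written back between statements. Below is a minimal sketch of the before/after pattern for the NEON path; the inline SIMON128_f and the RoundPairBefore/RoundPairAfter helpers are illustrative stand-ins, not the code from this patch.

#include <arm_neon.h>

// Illustrative stand-in for the SIMON-128 round function:
// f(x) = (ROL1(x) & ROL8(x)) ^ ROL2(x), computed lane-wise on 64-bit words.
static inline uint64x2_t SIMON128_f(const uint64x2_t v)
{
    const uint64x2_t r1 = vorrq_u64(vshlq_n_u64(v, 1), vshrq_n_u64(v, 63));
    const uint64x2_t r8 = vorrq_u64(vshlq_n_u64(v, 8), vshrq_n_u64(v, 56));
    const uint64x2_t r2 = vorrq_u64(vshlq_n_u64(v, 2), vshrq_n_u64(v, 62));
    return veorq_u64(vandq_u64(r1, r8), r2);
}

// Before: four statements per round pair; GCC spilled the intermediate
// y1 and x1 values to the stack between statements.
static inline void RoundPairBefore(uint64x2_t& x1, uint64x2_t& y1,
    const uint64x2_t rk1, const uint64x2_t rk2)
{
    y1 = veorq_u64(y1, SIMON128_f(x1));
    y1 = veorq_u64(y1, rk1);
    x1 = veorq_u64(x1, SIMON128_f(y1));
    x1 = veorq_u64(x1, rk2);
}

// After: the statements are folded, so each result is produced by one
// expression and the intermediates never need a store of their own.
static inline void RoundPairAfter(uint64x2_t& x1, uint64x2_t& y1,
    const uint64x2_t rk1, const uint64x2_t rk2)
{
    y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
    x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
}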
pull/548/head
Jeffrey Walton 2017-12-01 04:11:31 -05:00
parent b7ced67892
commit 4792578f09
GPG Key ID: B36AB348921B1838
1 changed file with 88 additions and 149 deletions


@@ -134,32 +134,26 @@ inline void SIMON128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
     uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
     for (size_t i = 0; static_cast<int>(i) < (rounds & ~1)-1; i += 2)
     {
         const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
-        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y1 = veorq_u64(y1, rk1);
-        x1 = veorq_u64(x1, SIMON128_f(y1));
-        x1 = veorq_u64(x1, rk2);
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
+        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
+        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
     }
     if (rounds & 1)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y1 = veorq_u64(y1, rk);
-        const uint64x2_t t = x1; x1 = y1; y1 = t;
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
+        std::swap(x1, y1);
     }
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
     block0 = UnpackLow64<uint8x16_t>(x1, y1);
     // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
@@ -178,54 +172,36 @@ inline void SIMON128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
     uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2);
-    y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3);
-    y3 = Shuffle64(y3);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
     for (size_t i = 0; static_cast<int>(i) < (rounds & ~1) - 1; i += 2)
     {
         const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
-        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y2 = veorq_u64(y2, SIMON128_f(x2));
-        y3 = veorq_u64(y3, SIMON128_f(x3));
-        y1 = veorq_u64(y1, rk1);
-        y2 = veorq_u64(y2, rk1);
-        y3 = veorq_u64(y3, rk1);
-        x1 = veorq_u64(x1, SIMON128_f(y1));
-        x2 = veorq_u64(x2, SIMON128_f(y2));
-        x3 = veorq_u64(x3, SIMON128_f(y3));
-        x1 = veorq_u64(x1, rk2);
-        x2 = veorq_u64(x2, rk2);
-        x3 = veorq_u64(x3, rk2);
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
+        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
+        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);
+        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
+        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
+        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
+        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
     }
     if (rounds & 1)
     {
         const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y2 = veorq_u64(y2, SIMON128_f(x2));
-        y3 = veorq_u64(y3, SIMON128_f(x3));
-        y1 = veorq_u64(y1, rk);
-        y2 = veorq_u64(y2, rk);
-        y3 = veorq_u64(y3, rk);
-        const uint64x2_t t1 = x1; x1 = y1; y1 = t1;
-        const uint64x2_t t2 = x2; x2 = y2; y2 = t2;
-        const uint64x2_t t3 = x3; x3 = y3; y3 = t3;
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
+        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
+        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
+        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
     }
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2);
-    y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3);
-    y3 = Shuffle64(y3);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
     block0 = UnpackLow64<uint8x16_t>(x1, y1);
     block1 = UnpackHigh64<uint8x16_t>(x1, y1);
@@ -244,32 +220,26 @@ inline void SIMON128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
     uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
     if (rounds & 1)
     {
-        const uint64x2_t t = x1; x1 = y1; y1 = t;
         const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
-        y1 = veorq_u64(y1, rk);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
+        std::swap(x1, y1);
+        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
         rounds--;
     }
     for (size_t i = rounds-2; static_cast<int>(i) >= 0; i -= 2)
     {
         const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
-        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
-        x1 = veorq_u64(x1, SIMON128_f(y1));
-        x1 = veorq_u64(x1, rk1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y1 = veorq_u64(y1, rk2);
+        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
+        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
     }
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
     block0 = UnpackLow64<uint8x16_t>(x1, y1);
     // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
@@ -288,51 +258,36 @@ inline void SIMON128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
     uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block5, block5);
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2);
-    y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3);
-    y3 = Shuffle64(y3);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
     if (rounds & 1)
     {
-        const uint64x2_t t = x1; x1 = y1; y1 = t;
+        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
         const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);
-        y1 = veorq_u64(y1, rk);
-        y2 = veorq_u64(y2, rk);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y2 = veorq_u64(y2, SIMON128_f(x2));
+        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
+        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
         rounds--;
     }
     for (size_t i = rounds - 2; static_cast<int>(i) >= 0; i -= 2)
     {
         const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
-        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
-        x1 = veorq_u64(x1, SIMON128_f(y1));
-        x2 = veorq_u64(x2, SIMON128_f(y2));
-        x3 = veorq_u64(x3, SIMON128_f(y3));
-        x1 = veorq_u64(x1, rk1);
-        x2 = veorq_u64(x2, rk1);
-        x3 = veorq_u64(x3, rk1);
-        y1 = veorq_u64(y1, SIMON128_f(x1));
-        y2 = veorq_u64(y2, SIMON128_f(x2));
-        y3 = veorq_u64(y3, SIMON128_f(x3));
-        y1 = veorq_u64(y1, rk2);
-        y2 = veorq_u64(y2, rk2);
-        y3 = veorq_u64(y3, rk2);
+        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
+        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
+        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);
+        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
+        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
+        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
+        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
     }
-    x1 = Shuffle64(x1);
-    y1 = Shuffle64(y1);
-    x2 = Shuffle64(x2);
-    y2 = Shuffle64(y2);
-    x3 = Shuffle64(x3);
-    y3 = Shuffle64(y3);
+    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3); y3 = Shuffle64(y3);
     block0 = UnpackLow64<uint8x16_t>(x1, y1);
     block1 = UnpackHigh64<uint8x16_t>(x1, y1);
@@ -470,22 +425,29 @@ size_t SIMON128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
 CRYPTOPP_ALIGN_DATA(16)
 const word32 s_one[] = {0, 0, 0, 1<<24};
+inline void Swap128(__m128i& a,__m128i& b)
+{
+#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
+    // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
+    // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
+    vec_swap(a, b);
+#else
+    std::swap(a, b);
+#endif
+}
 template <unsigned int R>
 inline __m128i RotateLeft64(const __m128i& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
-    const __m128i a = _mm_slli_epi64(val, R);
-    const __m128i b = _mm_srli_epi64(val, 64-R);
-    return _mm_or_si128(a, b);
+    return _mm_or_si128(
+        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
 }
 template <unsigned int R>
 inline __m128i RotateRight64(const __m128i& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
-    const __m128i a = _mm_slli_epi64(val, 64-R);
-    const __m128i b = _mm_srli_epi64(val, R);
-    return _mm_or_si128(a, b);
+    return _mm_or_si128(
+        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
 }
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
@@ -527,13 +489,11 @@ inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
     {
         const __m128i rk1 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
         const __m128i rk2 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y1 = _mm_xor_si128(y1, rk1);
-        x1 = _mm_xor_si128(x1, SIMON128_f(y1));
-        x1 = _mm_xor_si128(x1, rk2);
+        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
     }
     if (rounds & 1)
@@ -541,10 +501,8 @@ inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
         const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+rounds-1)));
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y1 = _mm_xor_si128(y1, rk);
-        const __m128i t = x1; x1 = y1; y1 = t;
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
+        Swap128(x1, y1);
     }
     x1 = _mm_shuffle_epi8(x1, mask);
@@ -574,31 +532,22 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
     {
         const __m128i rk1 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
+        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
         const __m128i rk2 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y2 = _mm_xor_si128(y2, SIMON128_f(x2));
-        y1 = _mm_xor_si128(y1, rk1);
-        y2 = _mm_xor_si128(y2, rk1);
-        x1 = _mm_xor_si128(x1, SIMON128_f(y1));
-        x2 = _mm_xor_si128(x2, SIMON128_f(y2));
-        x1 = _mm_xor_si128(x1, rk2);
-        x2 = _mm_xor_si128(x2, rk2);
+        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
+        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
     }
     if (rounds & 1)
     {
         const __m128i rk = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y2 = _mm_xor_si128(y2, SIMON128_f(x2));
-        y1 = _mm_xor_si128(y1, rk);
-        y2 = _mm_xor_si128(y2, rk);
-        const __m128i t1 = x1; x1 = y1; y1 = t1;
-        const __m128i t2 = x2; x2 = y2; y2 = t2;
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
+        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
+        Swap128(x1, y1); Swap128(x2, y2);
     }
     x1 = _mm_shuffle_epi8(x1, mask);
@@ -627,12 +576,11 @@ inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     if (rounds & 1)
     {
-        const __m128i t = x1; x1 = y1; y1 = t;
         const __m128i rk = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
-        y1 = _mm_xor_si128(y1, rk);
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
+        Swap128(x1, y1);
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
         rounds--;
     }
@@ -640,13 +588,11 @@ inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     {
         const __m128i rk1 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i+1)));
+        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
         const __m128i rk2 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
-        x1 = _mm_xor_si128(x1, SIMON128_f(y1));
-        x1 = _mm_xor_si128(x1, rk1);
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y1 = _mm_xor_si128(y1, rk2);
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
     }
     x1 = _mm_shuffle_epi8(x1, mask);
@@ -674,15 +620,12 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
     if (rounds & 1)
     {
-        const __m128i t = x1; x1 = y1; y1 = t;
         const __m128i rk = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
-        y1 = _mm_xor_si128(y1, rk);
-        y2 = _mm_xor_si128(y2, rk);
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y2 = _mm_xor_si128(y2, SIMON128_f(x2));
+        Swap128(x1, y1); Swap128(x2, y2);
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
+        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
         rounds--;
     }
@@ -690,17 +633,13 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
     {
         const __m128i rk1 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
+        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
+        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
         const __m128i rk2 = _mm_castpd_si128(
             _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
-        x1 = _mm_xor_si128(x1, SIMON128_f(y1));
-        x2 = _mm_xor_si128(x2, SIMON128_f(y2));
-        x1 = _mm_xor_si128(x1, rk1);
-        x2 = _mm_xor_si128(x2, rk1);
-        y1 = _mm_xor_si128(y1, SIMON128_f(x1));
-        y2 = _mm_xor_si128(y2, SIMON128_f(x2));
-        y1 = _mm_xor_si128(y1, rk2);
-        y2 = _mm_xor_si128(y2, rk2);
+        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
+        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
     }
     x1 = _mm_shuffle_epi8(x1, mask);