Use 6x-2x-1x for Simon and Speck on IA-32

For Simon-64 and Speck-64 this effectively means 12x-4x-1x, because each XMM register carries two 64-bit blocks. That is roughly the parallelization threshold for IA-32: at any given time 10 to 13 XMM registers are in use.
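
Below is a minimal sketch of the 6x-2x-1x dispatch shape, assuming hypothetical stand-ins Process6, Process2 and Process1 for the real func6/func2 helpers and the single-block tail. blockSize is the 16-byte XMM quantum; for the 64-bit ciphers each quantum holds two cipher blocks, hence 12x-4x-1x. The real loops below also handle counter mode, xorBlocks and the in/out increments.

    #include <cstddef>

    // Hypothetical stand-ins; each consumes the stated number of 16-byte quanta.
    static void Process6(const unsigned char*, unsigned char*) { /* six XMM words of state */ }
    static void Process2(const unsigned char*, unsigned char*) { /* two XMM words of state */ }
    static void Process1(const unsigned char*, unsigned char*) { /* single-block tail */ }

    static void Dispatch_6x2x1x(const unsigned char* in, unsigned char* out,
                                size_t length, size_t blockSize)
    {
        while (length >= 6*blockSize) {   // widest path, best ILP
            Process6(in, out);
            in += 6*blockSize; out += 6*blockSize; length -= 6*blockSize;
        }
        while (length >= 2*blockSize) {   // leftover pairs
            Process2(in, out);
            in += 2*blockSize; out += 2*blockSize; length -= 2*blockSize;
        }
        while (length >= blockSize) {     // final odd blocks
            Process1(in, out);
            in += blockSize; out += blockSize; length -= blockSize;
        }
    }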

Prefer movsd, by way of _mm_load_sd and _mm_store_sd, for the single-block (8-byte) loads and stores.
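
A minimal sketch of the load side (LoadBlock64 is a hypothetical helper name; the real code performs the casts inline at the call sites). The store side is symmetric with _mm_store_sd, as shown under the next item.

    #include <emmintrin.h>

    // movsd moves exactly 8 bytes through the low lane of an XMM register,
    // which matches one SIMON-64/SPECK-64 block; the upper lane is zeroed.
    static inline __m128i LoadBlock64(const unsigned char* p)
    {
        return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double*>(p)));
    }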

Fix "error C3861: _mm_cvtsi128_si64x identifier not found".
pull/548/head
Jeffrey Walton 2017-12-06 06:18:46 -05:00
parent e9654192f2
commit 86acc8ed45
2 changed files with 444 additions and 76 deletions


@ -1073,13 +1073,12 @@ inline __m128i SIMON128_f(const __m128i& v)
_mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v))); _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
} }
inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. // be permuted to the following.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -1111,11 +1110,12 @@ inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1, inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1125,12 +1125,16 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i x2 = _mm_unpacklo_epi64(block2, block3);
__m128i y2 = _mm_unpackhi_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3);
__m128i x3 = _mm_unpacklo_epi64(block4, block5);
__m128i y3 = _mm_unpackhi_epi64(block4, block5);
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i = 0; static_cast<int>(i) < (rounds & ~1) - 1; i += 2) for (size_t i = 0; static_cast<int>(i) < (rounds & ~1) - 1; i += 2)
{ {
@ -1138,11 +1142,13 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);
const __m128i rk2 = _mm_castpd_si128( const __m128i rk2 = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
} }
if (rounds & 1) if (rounds & 1)
@ -1151,27 +1157,32 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
Swap128(x1, y1); Swap128(x2, y2); y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2); block2 = _mm_unpacklo_epi64(x2, y2);
block3 = _mm_unpackhi_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2);
block4 = _mm_unpacklo_epi64(x3, y3);
block5 = _mm_unpackhi_epi64(x3, y3);
} }
inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. // be permuted to the following.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -1204,11 +1215,12 @@ inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1, inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1218,21 +1230,26 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i x2 = _mm_unpacklo_epi64(block2, block3);
__m128i y2 = _mm_unpackhi_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3);
__m128i x3 = _mm_unpacklo_epi64(block4, block5);
__m128i y3 = _mm_unpackhi_epi64(block4, block5);
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
if (rounds & 1) if (rounds & 1)
{ {
const __m128i rk = _mm_castpd_si128( const __m128i rk = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + rounds - 1)));
Swap128(x1, y1); Swap128(x2, y2); Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2)); y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
rounds--; rounds--;
} }
@ -1242,26 +1259,33 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i + 1)));
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);
const __m128i rk2 = _mm_castpd_si128( const __m128i rk2 = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i))); _mm_loaddup_pd(reinterpret_cast<const double*>(subkeys + i)));
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2); block2 = _mm_unpacklo_epi64(x2, y2);
block3 = _mm_unpackhi_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2);
block4 = _mm_unpacklo_epi64(x3, y3);
block5 = _mm_unpackhi_epi64(x3, y3);
} }
template <typename F1, typename F4> template <typename F2, typename F6>
inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F2 func2, F6 func6,
const word64 *subKeys, size_t rounds, const byte *inBlocks, const word64 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
@ -1287,16 +1311,19 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
if (flags & BlockTransformation::BT_AllowParallel) if (flags & BlockTransformation::BT_AllowParallel)
{ {
while (length >= 4*blockSize) while (length >= 6*blockSize)
{ {
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; __m128i block0, block1, block2, block3, block4, block5;
block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
{ {
const __m128i be1 = *CONST_M128_CAST(s_one128); const __m128i be1 = *CONST_M128_CAST(s_one128);
block1 = _mm_add_epi32(block0, be1); block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1); block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1); block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); block4 = _mm_add_epi32(block3, be1);
block5 = _mm_add_epi32(block4, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
} }
else else
{ {
@ -1307,6 +1334,10 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
inBlocks += inIncrement; inBlocks += inIncrement;
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement; inBlocks += inIncrement;
block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
} }
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
@ -1321,9 +1352,13 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds)); func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{ {
@ -1335,6 +1370,10 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
_mm_storeu_si128(M128_CAST(outBlocks), block0); _mm_storeu_si128(M128_CAST(outBlocks), block0);
@ -1345,14 +1384,63 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block3); _mm_storeu_si128(M128_CAST(outBlocks), block3);
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block4);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block5);
outBlocks += outIncrement;
length -= 4*blockSize; length -= 6*blockSize;
}
while (length >= 2*blockSize)
{
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *CONST_M128_CAST(s_one128);
block1 = _mm_add_epi32(block0, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
_mm_storeu_si128(M128_CAST(outBlocks), block0);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block1);
outBlocks += outIncrement;
length -= 2*blockSize;
} }
} }
while (length >= blockSize) while (length >= blockSize)
{ {
__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); __m128i block, zero = _mm_setzero_si128();
block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
@ -1360,7 +1448,7 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++; const_cast<byte *>(inBlocks)[15]++;
func1(block, subKeys, static_cast<unsigned int>(rounds)); func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
@ -1501,8 +1589,9 @@ inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
} }
inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block3, const word32 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1519,21 +1608,30 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i = 0; static_cast<int>(i) < (rounds & ~1)-1; i += 2) for (size_t i = 0; static_cast<int>(i) < (rounds & ~1)-1; i += 2)
{ {
const __m128i rk1 = _mm_set1_epi32(subkeys[i]); const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
} }
if (rounds & 1) if (rounds & 1)
@ -1541,13 +1639,16 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
Swap128(x1, y1); Swap128(x2, y2); y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// This is roughly the SSE equivalent to ARM vzp32 // This is roughly the SSE equivalent to ARM vzp32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
@ -1555,10 +1656,13 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
block2 = _mm_unpacklo_epi32(x2, y2); block2 = _mm_unpacklo_epi32(x2, y2);
block3 = _mm_unpackhi_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2);
block4 = _mm_unpacklo_epi32(x3, y3);
block5 = _mm_unpackhi_epi32(x3, y3);
} }
inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block3, const word32 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1575,18 +1679,26 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
if (rounds & 1) if (rounds & 1)
{ {
Swap128(x1, y1); Swap128(x2, y2); Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2)); y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
rounds--; rounds--;
} }
@ -1595,16 +1707,20 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
const __m128i rk2 = _mm_set1_epi32(subkeys[i]); const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// This is roughly the SSE equivalent to ARM vzp32 // This is roughly the SSE equivalent to ARM vzp32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
@ -1612,10 +1728,12 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
block2 = _mm_unpacklo_epi32(x2, y2); block2 = _mm_unpacklo_epi32(x2, y2);
block3 = _mm_unpackhi_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2);
block4 = _mm_unpacklo_epi32(x3, y3);
block5 = _mm_unpackhi_epi32(x3, y3);
} }
template <typename F2, typename F4> template <typename F2, typename F6>
inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
const word32 *subKeys, size_t rounds, const byte *inBlocks, const word32 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
@ -1642,16 +1760,19 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
if (flags & BlockTransformation::BT_AllowParallel) if (flags & BlockTransformation::BT_AllowParallel)
{ {
while (length >= 4*xmmBlockSize) while (length >= 6*xmmBlockSize)
{ {
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; __m128i block0, block1, block2, block3, block4, block5;
block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
{ {
const __m128i be1 = *CONST_M128_CAST(s_one64); const __m128i be1 = *CONST_M128_CAST(s_one64);
block1 = _mm_add_epi32(block0, be1); block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1); block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1); block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); block4 = _mm_add_epi32(block3, be1);
block5 = _mm_add_epi32(block4, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
} }
else else
{ {
@ -1662,6 +1783,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
inBlocks += inIncrement; inBlocks += inIncrement;
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement; inBlocks += inIncrement;
block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
} }
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
@ -1676,9 +1801,13 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds)); func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{ {
@ -1690,6 +1819,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
_mm_storeu_si128(M128_CAST(outBlocks), block0); _mm_storeu_si128(M128_CAST(outBlocks), block0);
@ -1700,8 +1833,56 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block3); _mm_storeu_si128(M128_CAST(outBlocks), block3);
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block4);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block5);
outBlocks += outIncrement;
length -= 4*xmmBlockSize; length -= 6*xmmBlockSize;
}
while (length >= 2*xmmBlockSize)
{
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *CONST_M128_CAST(s_one64);
block1 = _mm_add_epi32(block0, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
_mm_storeu_si128(M128_CAST(outBlocks), block0);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block1);
outBlocks += outIncrement;
length -= 2*xmmBlockSize;
} }
} }
@ -1728,13 +1909,13 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
while (length >= blockSize) while (length >= blockSize)
{ {
__m128i block, zero = _mm_setzero_si128(); __m128i block, zero = _mm_setzero_si128();
block = _mm_xor_si128(block, _mm_castpd_si128( block = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks)))); _mm_load_sd(reinterpret_cast<const double*>(inBlocks)));
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
{ {
block = _mm_xor_si128(block, _mm_castpd_si128( block = _mm_xor_si128(block, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); _mm_load_sd(reinterpret_cast<const double*>(xorBlocks))));
} }
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
@ -1745,11 +1926,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{ {
block = _mm_xor_si128(block, _mm_castpd_si128( block = _mm_xor_si128(block, _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); _mm_load_sd(reinterpret_cast<const double*>(xorBlocks))));
} }
const word64 temp = _mm_cvtsi128_si64x(block); _mm_store_sd(reinterpret_cast<double*>(outBlocks), _mm_castsi128_pd(block));
std::memcpy(outBlocks, &temp, 8);
inBlocks += inIncrement; inBlocks += inIncrement;
outBlocks += outIncrement; outBlocks += outIncrement;
@ -1809,14 +1989,14 @@ size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rou
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Enc_Block, SIMON64_Enc_4_Blocks, return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
} }
size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Dec_Block, SIMON64_Dec_4_Blocks, return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
} }
#endif #endif
@ -1825,14 +2005,14 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou
size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Enc_Block, SIMON128_Enc_4_Blocks, return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
} }
size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Dec_Block, SIMON128_Dec_4_Blocks, return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
} }
#endif // CRYPTOPP_SSSE3_AVAILABLE #endif // CRYPTOPP_SSSE3_AVAILABLE


@ -1043,8 +1043,9 @@ inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1054,12 +1055,16 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i x2 = _mm_unpacklo_epi64(block2, block3);
__m128i y2 = _mm_unpackhi_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3);
__m128i x3 = _mm_unpacklo_epi64(block4, block5);
__m128i y3 = _mm_unpackhi_epi64(block4, block5);
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i=0; static_cast<int>(i)<rounds; ++i) for (size_t i=0; static_cast<int>(i)<rounds; ++i)
{ {
@ -1068,26 +1073,35 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
x1 = RotateRight64<8>(x1); x1 = RotateRight64<8>(x1);
x2 = RotateRight64<8>(x2); x2 = RotateRight64<8>(x2);
x3 = RotateRight64<8>(x3);
x1 = _mm_add_epi64(x1, y1); x1 = _mm_add_epi64(x1, y1);
x2 = _mm_add_epi64(x2, y2); x2 = _mm_add_epi64(x2, y2);
x3 = _mm_add_epi64(x3, y3);
x1 = _mm_xor_si128(x1, rk); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk); x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
y1 = RotateLeft64<3>(y1); y1 = RotateLeft64<3>(y1);
y2 = RotateLeft64<3>(y2); y2 = RotateLeft64<3>(y2);
y3 = RotateLeft64<3>(y3);
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2); y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2); block2 = _mm_unpacklo_epi64(x2, y2);
block3 = _mm_unpackhi_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2);
block4 = _mm_unpacklo_epi64(x3, y3);
block5 = _mm_unpackhi_epi64(x3, y3);
} }
inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1, inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
@ -1124,8 +1138,9 @@ inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word64 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1135,12 +1150,16 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i x2 = _mm_unpacklo_epi64(block2, block3);
__m128i y2 = _mm_unpackhi_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3);
__m128i x3 = _mm_unpacklo_epi64(block4, block5);
__m128i y3 = _mm_unpackhi_epi64(block4, block5);
const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
{ {
@ -1149,30 +1168,39 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2); y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
y1 = RotateRight64<3>(y1); y1 = RotateRight64<3>(y1);
y2 = RotateRight64<3>(y2); y2 = RotateRight64<3>(y2);
y3 = RotateRight64<3>(y3);
x1 = _mm_xor_si128(x1, rk); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk); x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
x1 = _mm_sub_epi64(x1, y1); x1 = _mm_sub_epi64(x1, y1);
x2 = _mm_sub_epi64(x2, y2); x2 = _mm_sub_epi64(x2, y2);
x3 = _mm_sub_epi64(x3, y3);
x1 = RotateLeft64<8>(x1); x1 = RotateLeft64<8>(x1);
x2 = RotateLeft64<8>(x2); x2 = RotateLeft64<8>(x2);
x3 = RotateLeft64<8>(x3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2); block2 = _mm_unpacklo_epi64(x2, y2);
block3 = _mm_unpackhi_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2);
block4 = _mm_unpacklo_epi64(x3, y3);
block5 = _mm_unpackhi_epi64(x3, y3);
} }
template <typename F2, typename F4> template <typename F2, typename F6>
inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F6 func6,
const word64 *subKeys, size_t rounds, const byte *inBlocks, const word64 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{ {
@ -1198,16 +1226,19 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4,
if (flags & BlockTransformation::BT_AllowParallel) if (flags & BlockTransformation::BT_AllowParallel)
{ {
while (length >= 4*blockSize) while (length >= 6*blockSize)
{ {
__m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; __m128i block0, block1, block2, block3, block4, block5;
block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter) if (flags & BlockTransformation::BT_InBlockIsCounter)
{ {
const __m128i be1 = *CONST_M128_CAST(s_one128); const __m128i be1 = *CONST_M128_CAST(s_one128);
block1 = _mm_add_epi32(block0, be1); block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1); block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1); block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); block4 = _mm_add_epi32(block3, be1);
block5 = _mm_add_epi32(block4, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
} }
else else
{ {
@ -1218,6 +1249,10 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4,
inBlocks += inIncrement; inBlocks += inIncrement;
block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement; inBlocks += inIncrement;
block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
} }
if (flags & BlockTransformation::BT_XorInput) if (flags & BlockTransformation::BT_XorInput)
@ -1232,9 +1267,13 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds)); func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{ {
@ -1246,6 +1285,10 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4,
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement; xorBlocks += xorIncrement;
block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
} }
_mm_storeu_si128(M128_CAST(outBlocks), block0); _mm_storeu_si128(M128_CAST(outBlocks), block0);
@ -1256,8 +1299,57 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4,
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block3); _mm_storeu_si128(M128_CAST(outBlocks), block3);
outBlocks += outIncrement; outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block4);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block5);
outBlocks += outIncrement;
length -= 4*blockSize; length -= 6*blockSize;
}
while (length >= 2*blockSize)
{
__m128i block0, block1;
block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *CONST_M128_CAST(s_one128);
block1 = _mm_add_epi32(block0, be1);
_mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
}
_mm_storeu_si128(M128_CAST(outBlocks), block0);
outBlocks += outIncrement;
_mm_storeu_si128(M128_CAST(outBlocks), block1);
outBlocks += outIncrement;
length -= 2*blockSize;
} }
} }
@ -1396,8 +1488,9 @@ inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
} }
inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block3, const word32 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1414,11 +1507,18 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i=0; static_cast<int>(i)<rounds; ++i) for (size_t i=0; static_cast<int>(i)<rounds; ++i)
{ {
@ -1426,20 +1526,27 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
x1 = RotateRight32<8>(x1); x1 = RotateRight32<8>(x1);
x2 = RotateRight32<8>(x2); x2 = RotateRight32<8>(x2);
x3 = RotateRight32<8>(x3);
x1 = _mm_add_epi32(x1, y1); x1 = _mm_add_epi32(x1, y1);
x2 = _mm_add_epi32(x2, y2); x2 = _mm_add_epi32(x2, y2);
x3 = _mm_add_epi32(x3, y3);
x1 = _mm_xor_si128(x1, rk); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk); x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
y1 = RotateLeft32<3>(y1); y1 = RotateLeft32<3>(y1);
y2 = RotateLeft32<3>(y2); y2 = RotateLeft32<3>(y2);
y3 = RotateLeft32<3>(y3);
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2); y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// This is roughly the SSE equivalent to ARM vzp32 // This is roughly the SSE equivalent to ARM vzp32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
@ -1447,10 +1554,13 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
block2 = _mm_unpacklo_epi32(x2, y2); block2 = _mm_unpacklo_epi32(x2, y2);
block3 = _mm_unpackhi_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2);
block4 = _mm_unpacklo_epi32(x3, y3);
block5 = _mm_unpackhi_epi32(x3, y3);
} }
inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
__m128i &block3, const word32 *subkeys, unsigned int rounds) __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
const word32 *subkeys, unsigned int rounds)
{ {
// Rearrange the data for vectorization. The incoming data was read from // Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to // a big-endian byte array. Depending on the number of blocks it needs to
@ -1467,11 +1577,18 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
__m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
__m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
const __m128 t4 = _mm_castsi128_ps(block4);
const __m128 t5 = _mm_castsi128_ps(block5);
__m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
__m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
{ {
@ -1479,20 +1596,27 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2); y2 = _mm_xor_si128(y2, x2);
y3 = _mm_xor_si128(y3, x3);
y1 = RotateRight32<3>(y1); y1 = RotateRight32<3>(y1);
y2 = RotateRight32<3>(y2); y2 = RotateRight32<3>(y2);
y3 = RotateRight32<3>(y3);
x1 = _mm_xor_si128(x1, rk); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, rk); x2 = _mm_xor_si128(x2, rk);
x3 = _mm_xor_si128(x3, rk);
x1 = _mm_sub_epi32(x1, y1); x1 = _mm_sub_epi32(x1, y1);
x2 = _mm_sub_epi32(x2, y2); x2 = _mm_sub_epi32(x2, y2);
x3 = _mm_sub_epi32(x3, y3);
x1 = RotateLeft32<8>(x1); x1 = RotateLeft32<8>(x1);
x2 = RotateLeft32<8>(x2); x2 = RotateLeft32<8>(x2);
x3 = RotateLeft32<8>(x3);
} }
x1 = _mm_shuffle_epi8(x1, mask); x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
x2 = _mm_shuffle_epi8(x2, mask); x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask); y2 = _mm_shuffle_epi8(y2, mask);
x3 = _mm_shuffle_epi8(x3, mask);
y3 = _mm_shuffle_epi8(y3, mask);
// This is roughly the SSE equivalent to ARM vzp32 // This is roughly the SSE equivalent to ARM vzp32
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
@ -1500,10 +1624,12 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
block1 = _mm_unpackhi_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1);
block2 = _mm_unpacklo_epi32(x2, y2); block2 = _mm_unpacklo_epi32(x2, y2);
block3 = _mm_unpackhi_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2);
block4 = _mm_unpacklo_epi32(x3, y3);
block5 = _mm_unpackhi_epi32(x3, y3);
} }
-template <typename F2, typename F4>
-inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
+template <typename F2, typename F6>
+inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
     const word32 *subKeys, size_t rounds, const byte *inBlocks,
     const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
@@ -1530,16 +1656,19 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
     if (flags & BlockTransformation::BT_AllowParallel)
     {
-        while (length >= 4*xmmBlockSize)
+        while (length >= 6*xmmBlockSize)
         {
-            __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3;
+            __m128i block0, block1, block2, block3, block4, block5;
+            block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
             if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                 const __m128i be1 = *CONST_M128_CAST(s_one64);
                 block1 = _mm_add_epi32(block0, be1);
                 block2 = _mm_add_epi32(block1, be1);
                 block3 = _mm_add_epi32(block2, be1);
-                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
+                block4 = _mm_add_epi32(block3, be1);
+                block5 = _mm_add_epi32(block4, be1);
+                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
             }
             else
             {
@@ -1550,6 +1679,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
                 inBlocks += inIncrement;
                 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                 inBlocks += inIncrement;
+                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
             }
             if (flags & BlockTransformation::BT_XorInput)
@@ -1564,9 +1697,13 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
                 xorBlocks += xorIncrement;
                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                 xorBlocks += xorIncrement;
+                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
             }
-            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
+            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
@@ -1578,6 +1715,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
                 xorBlocks += xorIncrement;
                 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                 xorBlocks += xorIncrement;
+                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
             }
             _mm_storeu_si128(M128_CAST(outBlocks), block0);
@@ -1588,8 +1729,56 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
             outBlocks += outIncrement;
             _mm_storeu_si128(M128_CAST(outBlocks), block3);
             outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block4);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block5);
+            outBlocks += outIncrement;
-            length -= 4*xmmBlockSize;
+            length -= 6*xmmBlockSize;
+        }
+        while (length >= 2*xmmBlockSize)
+        {
+            __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1;
+            if (flags & BlockTransformation::BT_InBlockIsCounter)
+            {
+                const __m128i be1 = *CONST_M128_CAST(s_one64);
+                block1 = _mm_add_epi32(block0, be1);
+                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
+            }
+            else
+            {
+                inBlocks += inIncrement;
+                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+            }
+            if (flags & BlockTransformation::BT_XorInput)
+            {
+                // Coverity finding, appears to be false positive. Assert the condition.
+                CRYPTOPP_ASSERT(xorBlocks);
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
+            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            {
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+            _mm_storeu_si128(M128_CAST(outBlocks), block0);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block1);
+            outBlocks += outIncrement;
+            length -= 2*xmmBlockSize;
+        }
     }
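
The control flow above implements the 6x-2x-1x scheduling: drain as many 6-register chunks as the remaining length allows, then 2-register chunks, then fall through to the single-block loop below. A minimal sketch of that pattern follows; Copy, Drive_6_2_1, and the fixed 16-byte register size are illustrative stand-ins, not the Crypto++ kernels or driver.

// Hedged sketch of the 6x-2x-1x drain order used by the driver above.
#include <cstddef>
#include <cstdio>

typedef void (*Kernel)(const unsigned char* in, unsigned char* out, size_t regs);

static void Copy(const unsigned char* in, unsigned char* out, size_t regs)
{   // stand-in kernel: a real one would run the cipher over `regs` 16-byte registers
    for (size_t i = 0; i < regs*16; ++i) out[i] = in[i];
}

static void Drive_6_2_1(Kernel k6, Kernel k2, Kernel k1,
    const unsigned char* in, unsigned char* out, size_t length)
{
    const size_t xmmBlockSize = 16;
    while (length >= 6*xmmBlockSize)   // widest path: best ILP, most registers in flight
    { k6(in, out, 6); in += 6*xmmBlockSize; out += 6*xmmBlockSize; length -= 6*xmmBlockSize; }
    while (length >= 2*xmmBlockSize)   // medium path
    { k2(in, out, 2); in += 2*xmmBlockSize; out += 2*xmmBlockSize; length -= 2*xmmBlockSize; }
    while (length >= xmmBlockSize)     // tail, one register at a time
    { k1(in, out, 1); in += xmmBlockSize; out += xmmBlockSize; length -= xmmBlockSize; }
}

int main()
{
    unsigned char in[9*16] = {1}, out[9*16] = {0};
    Drive_6_2_1(Copy, Copy, Copy, in, out, sizeof(in));  // 9 registers -> 6x + 2x + 1x
    std::printf("%d\n", out[0]);
    return 0;
}
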
@@ -1616,13 +1805,13 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
     while (length >= blockSize)
     {
         __m128i block, zero = _mm_setzero_si128();
-        block = _mm_xor_si128(block, _mm_castpd_si128(
-            _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
+        block = _mm_castpd_si128(
+            _mm_load_sd(reinterpret_cast<const double*>(inBlocks)));
         if (flags & BlockTransformation::BT_XorInput)
         {
             block = _mm_xor_si128(block, _mm_castpd_si128(
-                _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
+                _mm_load_sd(reinterpret_cast<const double*>(xorBlocks))));
         }
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1633,11 +1822,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
         if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
         {
             block = _mm_xor_si128(block, _mm_castpd_si128(
-                _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
+                _mm_load_sd(reinterpret_cast<const double*>(xorBlocks))));
         }
-        const word64 temp = _mm_cvtsi128_si64x(block);
-        std::memcpy(outBlocks, &temp, 8);
+        _mm_store_sd(reinterpret_cast<double*>(outBlocks), _mm_castsi128_pd(block));
         inBlocks += inIncrement;
         outBlocks += outIncrement;
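
The single-block path now moves each 8-byte block through the low half of an XMM register with movsd, via _mm_load_sd and _mm_store_sd, which also sidesteps _mm_cvtsi128_si64x (not available when targeting 32-bit with MSVC). Here is a standalone sketch of just that load/store round trip; the buffer names are illustrative, not Crypto++ code.

// Hedged sketch of the movsd-based 8-byte block load/store. SSE2 only,
// so it builds for 32-bit targets where _mm_cvtsi128_si64x does not exist.
#include <emmintrin.h>   // SSE2
#include <cstdio>
#include <cstring>

int main()
{
    const unsigned char inBlock[8] = {1,2,3,4,5,6,7,8};
    unsigned char outBlock[8] = {0};

    // movsd load: 8 bytes into the low 64 bits of an XMM register, upper half zeroed.
    __m128i block = _mm_castpd_si128(
        _mm_load_sd(reinterpret_cast<const double*>(inBlock)));

    // ... a real kernel would run the cipher rounds on `block` here ...

    // movsd store: write the low 64 bits back without bouncing through a GPR.
    _mm_store_sd(reinterpret_cast<double*>(outBlock), _mm_castsi128_pd(block));

    std::printf("copy %s\n", std::memcmp(inBlock, outBlock, 8) == 0 ? "ok" : "FAILED");
    return 0;
}
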
@@ -1697,14 +1885,14 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rou
 size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Enc_Block, SPECK64_Enc_4_Blocks,
+    return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Dec_Block, SPECK64_Dec_4_Blocks,
+    return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 #endif
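
These entry points only bind a 2-block and a 6-block kernel to the shared template driver. A toy illustration of that F2/F6 binding pattern follows; the names and signatures are hypothetical, not the Crypto++ template.

// Hedged sketch: selecting a 6-wide or 2-wide kernel through template parameters.
#include <cstddef>
#include <cstdio>

template <typename F2, typename F6>
size_t ProcessBlocks(F2 func2, F6 func6, size_t blocks)
{
    size_t done = 0;
    while (blocks - done >= 6) { func6(); done += 6; }   // wide path
    while (blocks - done >= 2) { func2(); done += 2; }   // narrow path
    return blocks - done;                                // tail left for the 1x path
}

static void Kernel2() { std::puts("2-block kernel"); }
static void Kernel6() { std::puts("6-block kernel"); }

int main()
{
    // 15 blocks: 6 + 6 + 2, leaving 1 for the scalar path.
    const size_t remaining = ProcessBlocks(Kernel2, Kernel6, 15);
    std::printf("remaining: %u\n", static_cast<unsigned>(remaining));
    return 0;
}
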
@@ -1713,14 +1901,14 @@ size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou
 size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_4_Blocks,
+    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
+    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 #endif  // CRYPTOPP_SSSE3_AVAILABLE