diff --git a/speck-simd.cpp b/speck-simd.cpp
new file mode 100644
index 00000000..a78c3faa
--- /dev/null
+++ b/speck-simd.cpp
@@ -0,0 +1,367 @@
+// speck-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A
+// separate source file is needed because additional CXXFLAGS are
+// required to enable the appropriate instruction sets in some build
+// configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "speck.h"
+#include "misc.h"
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <tmmintrin.h>
+#endif
+
+// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
+#if (__SUNPRO_CC >= 0x5130)
+# define MAYBE_CONST
+# define MAYBE_UNCONST_CAST(T, x) const_cast<T>(x)
+#else
+# define MAYBE_CONST const
+# define MAYBE_UNCONST_CAST(T, x) (x)
+#endif
+
+// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlFixed;
+using CryptoPP::rotrFixed;
+using CryptoPP::BlockTransformation;
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+template <unsigned int R>
+inline __m128i RotateLeft64(const __m128i& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const __m128i a(_mm_slli_epi64(val, R));
+    const __m128i b(_mm_srli_epi64(val, 64-R));
+    return _mm_or_si128(a, b);
+}
+
+template <unsigned int R>
+inline __m128i RotateRight64(const __m128i& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const __m128i a(_mm_slli_epi64(val, 64-R));
+    const __m128i b(_mm_srli_epi64(val, R));
+    return _mm_or_si128(a, b);
+}
+
+inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
+    // __m128i. We can't use packed SSE operations on that layout, so we rearrange the data
+    // to allow them. It's also easier to permute the data in SPECK128_Enc_Block than in the
+    // calling code, since SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
+    __m128i block1 = _mm_setzero_si128();
+    __m128i x1 = _mm_unpacklo_epi64(block0, block1);
+    __m128i y1 = _mm_unpackhi_epi64(block0, block1);
+
+    const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+
+    for (size_t i=0; static_cast<int>(i)<rounds; ++i)
+    {
+        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+
+        x1 = RotateRight64<8>(x1);
+        x1 = _mm_add_epi64(x1, y1);
+        x1 = _mm_xor_si128(x1, k1);
+        y1 = RotateLeft64<3>(y1);
+        y1 = _mm_xor_si128(y1, x1);
+    }
+
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+
+    block0 = _mm_unpacklo_epi64(x1, y1);
+    block1 = _mm_unpackhi_epi64(x1, y1);
+}
+
+inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
+    __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
+    // __m128i. We can't use packed SSE operations on that layout, so we rearrange the data
+    // to allow them. It's also easier to permute the data in SPECK128_Enc_4_Blocks than in
+    // the calling code, since SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
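+    // Lane layout note: after the unpacks below, each __m128i holds one 64-bit
+    // word from two different blocks (x1/y1 carry block0 and block1; x2/y2 carry
+    // block2 and block3), so every packed 64-bit operation advances two blocks at
+    // once. The shuffle mask byte-reverses each 64-bit lane to move the words
+    // between the block's byte order and the order the 64-bit arithmetic expects.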
+    __m128i x1 = _mm_unpacklo_epi64(block0, block1);
+    __m128i y1 = _mm_unpackhi_epi64(block0, block1);
+    __m128i x2 = _mm_unpacklo_epi64(block2, block3);
+    __m128i y2 = _mm_unpackhi_epi64(block2, block3);
+
+    const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+    x2 = _mm_shuffle_epi8(x2, mask);
+    y2 = _mm_shuffle_epi8(y2, mask);
+
+    for (size_t i=0; static_cast<int>(i)<rounds; ++i)
+    {
+        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+
+        x1 = RotateRight64<8>(x1);
+        x2 = RotateRight64<8>(x2);
+        x1 = _mm_add_epi64(x1, y1);
+        x2 = _mm_add_epi64(x2, y2);
+        x1 = _mm_xor_si128(x1, k1);
+        x2 = _mm_xor_si128(x2, k1);
+        y1 = RotateLeft64<3>(y1);
+        y2 = RotateLeft64<3>(y2);
+        y1 = _mm_xor_si128(y1, x1);
+        y2 = _mm_xor_si128(y2, x2);
+    }
+
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+    x2 = _mm_shuffle_epi8(x2, mask);
+    y2 = _mm_shuffle_epi8(y2, mask);
+
+    block0 = _mm_unpacklo_epi64(x1, y1);
+    block1 = _mm_unpackhi_epi64(x1, y1);
+    block2 = _mm_unpacklo_epi64(x2, y2);
+    block3 = _mm_unpackhi_epi64(x2, y2);
+}
+
+inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
+    // __m128i. We can't use packed SSE operations on that layout, so we rearrange the data
+    // to allow them. It's also easier to permute the data in SPECK128_Dec_Block than in the
+    // calling code, since SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
+    __m128i block1 = _mm_setzero_si128();
+    __m128i x1 = _mm_unpacklo_epi64(block0, block1);
+    __m128i y1 = _mm_unpackhi_epi64(block0, block1);
+
+    const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+
+        // y ^= x;
+        y1 = _mm_xor_si128(y1, x1);
+        // y = rotrFixed(y,3);
+        y1 = RotateRight64<3>(y1);
+        // x ^= k;
+        x1 = _mm_xor_si128(x1, k1);
+        // x -= y;
+        x1 = _mm_sub_epi64(x1, y1);
+        // x = rotlFixed(x,8);
+        x1 = RotateLeft64<8>(x1);
+    }
+
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+
+    block0 = _mm_unpacklo_epi64(x1, y1);
+    block1 = _mm_unpackhi_epi64(x1, y1);
+}
+
+inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
+    __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
+    // __m128i. We can't use packed SSE operations on that layout, so we rearrange the data
+    // to allow them. It's also easier to permute the data in SPECK128_Dec_4_Blocks than in
+    // the calling code, since SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy.
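+    // The loop below applies the inverse round, undoing the encryption steps in
+    // reverse order: y = RotateRight64<3>(y ^ x), then x = RotateLeft64<8>((x ^ k) - y),
+    // walking the subkeys from rounds-1 down to 0.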
+    __m128i x1 = _mm_unpacklo_epi64(block0, block1);
+    __m128i y1 = _mm_unpackhi_epi64(block0, block1);
+    __m128i x2 = _mm_unpacklo_epi64(block2, block3);
+    __m128i y2 = _mm_unpackhi_epi64(block2, block3);
+
+    const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+    x2 = _mm_shuffle_epi8(x2, mask);
+    y2 = _mm_shuffle_epi8(y2, mask);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i)));
+
+        // y ^= x;
+        y1 = _mm_xor_si128(y1, x1);
+        y2 = _mm_xor_si128(y2, x2);
+        // y = rotrFixed(y,3);
+        y1 = RotateRight64<3>(y1);
+        y2 = RotateRight64<3>(y2);
+        // x ^= k;
+        x1 = _mm_xor_si128(x1, k1);
+        x2 = _mm_xor_si128(x2, k1);
+        // x -= y;
+        x1 = _mm_sub_epi64(x1, y1);
+        x2 = _mm_sub_epi64(x2, y2);
+        // x = rotlFixed(x,8);
+        x1 = RotateLeft64<8>(x1);
+        x2 = RotateLeft64<8>(x2);
+    }
+
+    x1 = _mm_shuffle_epi8(x1, mask);
+    y1 = _mm_shuffle_epi8(y1, mask);
+    x2 = _mm_shuffle_epi8(x2, mask);
+    y2 = _mm_shuffle_epi8(y2, mask);
+
+    block0 = _mm_unpacklo_epi64(x1, y1);
+    block1 = _mm_unpackhi_epi64(x1, y1);
+    block2 = _mm_unpacklo_epi64(x2, y2);
+    block3 = _mm_unpackhi_epi64(x2, y2);
+}
+
+template <typename F1, typename F4>
+inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
+    const word64 *subKeys, size_t rounds, const byte *inBlocks,
+    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 16);
+
+    const size_t blockSize = 16;
+    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+    size_t xorIncrement = xorBlocks ? blockSize : 0;
+    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+    if (flags & BlockTransformation::BT_ReverseDirection)
+    {
+        inBlocks += length - blockSize;
+        xorBlocks += length - blockSize;
+        outBlocks += length - blockSize;
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BlockTransformation::BT_AllowParallel)
+    {
+        while (length >= 4*blockSize)
+        {
+            __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3;
+            if (flags & BlockTransformation::BT_InBlockIsCounter)
+            {
+                const __m128i be1 = *CONST_M128_CAST(s_one);
+                block1 = _mm_add_epi32(block0, be1);
+                block2 = _mm_add_epi32(block1, be1);
+                block3 = _mm_add_epi32(block2, be1);
+                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
+            }
+            else
+            {
+                inBlocks += inIncrement;
+                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+            }
+
+            if (flags & BlockTransformation::BT_XorInput)
+            {
+                // Coverity finding, appears to be false positive. Assert the condition.
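+                // BT_XorInput means the caller supplied a buffer to XOR into the
+                // input before the cipher runs, so xorBlocks must be non-NULL here.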
+                CRYPTOPP_ASSERT(xorBlocks);
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+
+            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
+
+            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            {
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+
+            _mm_storeu_si128(M128_CAST(outBlocks), block0);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block1);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block2);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block3);
+            outBlocks += outIncrement;
+
+            length -= 4*blockSize;
+        }
+    }
+
+    while (length >= blockSize)
+    {
+        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+
+        if (flags & BlockTransformation::BT_XorInput)
+            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+            const_cast<byte *>(inBlocks)[15]++;
+
+        func1(block, subKeys, static_cast<unsigned int>(rounds));
+
+        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+
+        _mm_storeu_si128(M128_CAST(outBlocks), block);
+
+        inBlocks += inIncrement;
+        outBlocks += outIncrement;
+        xorBlocks += xorIncrement;
+        length -= blockSize;
+    }
+
+    return length;
+}
+
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+ANONYMOUS_NAMESPACE_END
+
+///////////////////////////////////////////////////////////
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_4_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif
+
+NAMESPACE_END
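
For review convenience, here is a scalar sketch of the round function the SIMD paths above implement, derived from the inline comments in the decryption routines and mirrored by the encryption loops. The helper names (rotl64, rotr64, speck128_enc_round, speck128_dec_round) are hypothetical and not part of the patch; this is a cross-check aid, not the library's implementation.

    #include <cstdint>

    // 64-bit rotates, valid for 0 < r < 64 as used here (r = 3 and r = 8).
    static inline uint64_t rotl64(uint64_t v, unsigned r) { return (v << r) | (v >> (64 - r)); }
    static inline uint64_t rotr64(uint64_t v, unsigned r) { return (v >> r) | (v << (64 - r)); }

    // One SPECK-128 encryption round, matching SPECK128_Enc_Block:
    //   x = (rotr64(x,8) + y) ^ k;  y = rotl64(y,3) ^ x
    static inline void speck128_enc_round(uint64_t &x, uint64_t &y, uint64_t k)
    {
        x = (rotr64(x, 8) + y) ^ k;
        y = rotl64(y, 3) ^ x;
    }

    // The matching inverse round, as in SPECK128_Dec_Block:
    //   y = rotr64(y ^ x, 3);  x = rotl64((x ^ k) - y, 8)
    static inline void speck128_dec_round(uint64_t &x, uint64_t &y, uint64_t k)
    {
        y = rotr64(y ^ x, 3);
        x = rotl64((x ^ k) - y, 8);
    }

Applying speck128_dec_round after speck128_enc_round with the same subkey returns the original (x, y) pair, which makes a quick unit test for each 64-bit lane of the SSSE3 code.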