From 195ac2c7c9483bdfd6d06d2abc02fc6afd293db5 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 10 Dec 2017 11:09:50 -0500 Subject: [PATCH] Refactor rijndael-simd.cpp and simon.simd.cpp to use adv-simd.h --- adv-simd.h | 302 +++++++++- misc.h | 7 + rijndael-simd.cpp | 1457 +++++++++++++++++++-------------------------- simon-simd.cpp | 419 +------------ 4 files changed, 909 insertions(+), 1276 deletions(-) diff --git a/adv-simd.h b/adv-simd.h index b1b9c79c..308ff855 100644 --- a/adv-simd.h +++ b/adv-simd.h @@ -69,9 +69,9 @@ const word32 s_one32x4_2b[] = {0, 2, 0, 2}; #endif #if defined(CRYPTOPP_LITTLE_ENDIAN) -const word32 s_one128[] = {0, 0, 0, 1<<24}; +const word32 s_one32x4[] = {0, 0, 0, 1<<24}; #else -const word32 s_one128[] = {0, 0, 0, 1}; +const word32 s_one32x4[] = {0, 0, 0, 1}; #endif ANONYMOUS_NAMESPACE_END @@ -306,6 +306,145 @@ inline size_t AdvancedProcessBlocks64_NEON2x6(F2 func2, F6 func6, return length; } +template +size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, + const word32 *subKeys, size_t rounds, const byte *inBlocks, + const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlocks); + CRYPTOPP_ASSERT(outBlocks); + CRYPTOPP_ASSERT(length >= 16); + + CRYPTOPP_CONSTANT(blockSize = 16) + // CRYPTOPP_CONSTANT(neonBlockSize = 16) + + size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; + + if (flags & BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BT_AllowParallel) + { + while (length >= 6*blockSize) + { + uint64x2_t block0, block1, block2, block3, block4, block5; + if (flags & BT_InBlockIsCounter) + { + const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4)); + block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + + block1 = vaddq_u64(block0, be); + block2 = vaddq_u64(block1, be); + block3 = vaddq_u64(block2, be); + block4 = vaddq_u64(block3, be); + block5 = vaddq_u64(block4, be); + vst1q_u8(const_cast(inBlocks), + vreinterpretq_u8_u64(vaddq_u64(block5, be))); + } + else + { + block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BT_XorInput) + { + block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + } + + func6(block0, block1, 
block2, block3, block4, block5, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BT_XorInput)) + { + block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + } + + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5)); + outBlocks += outIncrement; + + length -= 6*blockSize; + } + } + + while (length >= blockSize) + { + uint64x2_t block; + block = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + + if (flags & BT_XorInput) + block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + + if (flags & BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BT_XorInput)) + block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block)); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + template size_t AdvancedProcessBlocks128_NEON2x6(F2 func2, F6 func6, const word64 *subKeys, size_t rounds, const byte *inBlocks, @@ -340,7 +479,7 @@ size_t AdvancedProcessBlocks128_NEON2x6(F2 func2, F6 func6, uint64x2_t block0, block1, block2, block3, block4, block5; if (flags & BT_InBlockIsCounter) { - const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one128)); + const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4)); block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); block1 = vaddq_u64(block0, be); @@ -422,7 +561,7 @@ size_t AdvancedProcessBlocks128_NEON2x6(F2 func2, F6 func6, uint64x2_t block0, block1; if (flags & BT_InBlockIsCounter) { - const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one128)); + const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4)); block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); block1 = vaddq_u64(block0, be); @@ -499,6 +638,15 @@ NAMESPACE_END #if defined(CRYPTOPP_SSSE3_AVAILABLE) +// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 +#if (__SUNPRO_CC >= 0x5130) +# define MAYBE_CONST +# define MAYBE_UNCONST_CAST(T, x) const_cast(x) +#else +# define MAYBE_CONST const +# define MAYBE_UNCONST_CAST(T, x) (x) +#endif + // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 #ifndef M128_CAST # define M128_CAST(x) ((__m128i *)(void *)(x)) @@ -513,12 +661,12 @@ using CryptoPP::word32; using CryptoPP::word64; CRYPTOPP_ALIGN_DATA(16) -const word32 s_one64_1b[] = {0, 0, 0, 1<<24}; +const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24}; CRYPTOPP_ALIGN_DATA(16) -const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24}; +const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24}; 
CRYPTOPP_ALIGN_DATA(16) -const word32 s_one128[] = {0, 0, 0, 1<<24}; +const word32 s_one32x4[] = {0, 0, 0, 1<<24}; ANONYMOUS_NAMESPACE_END @@ -561,18 +709,18 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6, // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. // After the dup load we have two counters in the XMM word. Then we need // to increment the low ctr by 0 and the high ctr by 1. - block0 = _mm_add_epi32(*CONST_M128_CAST(s_one64_1b), _mm_castpd_si128( + block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128( _mm_loaddup_pd(reinterpret_cast(inBlocks)))); // After initial increment of {0,1} remaining counters increment by {2,2}. - const __m128i be2 = *CONST_M128_CAST(s_one64_2b); + const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b); block1 = _mm_add_epi32(be2, block0); block2 = _mm_add_epi32(be2, block1); block3 = _mm_add_epi32(be2, block2); block4 = _mm_add_epi32(be2, block3); block5 = _mm_add_epi32(be2, block4); - // Store the next counter. + // Store the next counter. UBsan false positive; mem_addr can be unaligned. _mm_store_sd(reinterpret_cast(const_cast(inBlocks)), _mm_castsi128_pd(_mm_add_epi32(be2, block5))); } @@ -652,14 +800,14 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6, // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. // After the dup load we have two counters in the XMM word. Then we need // to increment the low ctr by 0 and the high ctr by 1. - block0 = _mm_add_epi32(*CONST_M128_CAST(s_one64_1b), _mm_castpd_si128( + block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128( _mm_loaddup_pd(reinterpret_cast(inBlocks)))); // After initial increment of {0,1} remaining counters increment by {2,2}. - const __m128i be2 = *CONST_M128_CAST(s_one64_2b); + const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b); block1 = _mm_add_epi32(be2, block0); - // Store the next counter. + // Store the next counter. UBsan false positive; mem_addr can be unaligned. _mm_store_sd(reinterpret_cast(const_cast(inBlocks)), _mm_castsi128_pd(_mm_add_epi64(be2, block1))); } @@ -723,11 +871,13 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6, { __m128i block, zero = _mm_setzero_si128(); block = _mm_castpd_si128( + // UBsan false positive; mem_addr can be unaligned. _mm_load_sd(reinterpret_cast(inBlocks))); if (flags & BT_XorInput) { block = _mm_xor_si128(block, _mm_castpd_si128( + // UBsan false positive; mem_addr can be unaligned. _mm_load_sd(reinterpret_cast(xorBlocks)))); } @@ -739,9 +889,11 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6, if (xorBlocks && !(flags & BT_XorInput)) { block = _mm_xor_si128(block, _mm_castpd_si128( + // UBsan false positive; mem_addr can be unaligned. _mm_load_sd(reinterpret_cast(xorBlocks)))); } + // UBsan false positive; mem_addr can be unaligned. 
_mm_store_sd(reinterpret_cast(outBlocks), _mm_castsi128_pd(block)); inBlocks += inIncrement; @@ -788,7 +940,7 @@ inline size_t AdvancedProcessBlocks128_SSE2x6(F2 func2, F6 func6, __m128i block0, block1, block2, block3, block4, block5; if (flags & BT_InBlockIsCounter) { - const __m128i be1 = *CONST_M128_CAST(s_one128); + const __m128i be1 = *CONST_M128_CAST(s_one32x4); block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); @@ -870,7 +1022,7 @@ inline size_t AdvancedProcessBlocks128_SSE2x6(F2 func2, F6 func6, __m128i block0, block1; if (flags & BT_InBlockIsCounter) { - const __m128i be1 = *CONST_M128_CAST(s_one128); + const __m128i be1 = *CONST_M128_CAST(s_one32x4); block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); block1 = _mm_add_epi32(block0, be1); _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); @@ -939,6 +1091,126 @@ inline size_t AdvancedProcessBlocks128_SSE2x6(F2 func2, F6 func6, return length; } +template +inline size_t AdvancedProcessBlocks128_SSE1x4(F1 func1, F4 func4, + MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks, + const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlocks); + CRYPTOPP_ASSERT(outBlocks); + CRYPTOPP_ASSERT(length >= 16); + + CRYPTOPP_CONSTANT(blockSize = 16) + // CRYPTOPP_CONSTANT(xmmBlockSize = 16) + + size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; + + if (flags & BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BT_AllowParallel) + { + while (length >= 4*blockSize) + { + __m128i block0, block1, block2, block3; + if (flags & BT_InBlockIsCounter) + { + const __m128i be1 = *CONST_M128_CAST(s_one32x4); + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + block1 = _mm_add_epi32(block0, be1); + block2 = _mm_add_epi32(block1, be1); + block3 = _mm_add_epi32(block2, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + } + else + { + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. 
+ CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block2); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block3); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + + if (flags & BT_XorInput) + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + + if (flags & BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BT_XorInput)) + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + + _mm_storeu_si128(M128_CAST(outBlocks), block); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + NAMESPACE_END #endif // CRYPTOPP_SSSE3_AVAILABLE diff --git a/misc.h b/misc.h index ce09d366..cb85e665 100644 --- a/misc.h +++ b/misc.h @@ -469,8 +469,15 @@ inline void memmove_s(void *dest, size_t sizeInBytes, const void *src, size_t co template inline void vec_swap(T& a, T& b) { + // __m128i is an unsigned long long[2], and support for swapping it was + // not added until C++11. SunCC 12.1 - 12.3 fail to consume the swap; while + // SunCC 12.4 consumes it without -std=c++11. +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) T t; t=a, a=b, b=t; +#else + std::swap(a, b); +#endif } /// \brief Memory block initializer and eraser that attempts to survive optimizations diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 810630d2..544241c4 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -23,6 +23,7 @@ #include "pch.h" #include "config.h" #include "misc.h" +#include "adv-simd.h" // We set CRYPTOPP_ARM_AES_AVAILABLE based on compiler version. // If the crypto is not available, then we have to disable it here. @@ -38,10 +39,6 @@ #endif #if (CRYPTOPP_AESNI_AVAILABLE) -// Hack... We are supposed to use . GCC 4.8, LLVM Clang 3.5 -// and Apple Clang 6.0 conflates SSE4.1 and SSE4.2. If we use -// then compile fails with "SSE4.2 instruction set not enabled". Also see -// http://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html. 
# include # include #endif @@ -66,30 +63,17 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif -// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 -#if (__SUNPRO_CC >= 0x5130) -# define MAYBE_CONST -# define MAYBE_UNCONST_CAST(T, x) const_cast(x) -#else -# define MAYBE_CONST const -# define MAYBE_UNCONST_CAST(T, x) (x) -#endif - -// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 -#define M128_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) - NAMESPACE_BEGIN(CryptoPP) #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY extern "C" { typedef void (*SigHandler)(int); - static jmp_buf s_jmpSIGILL; - static void SigIllHandler(int) - { - longjmp(s_jmpSIGILL, 1); - } + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } }; #endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY @@ -97,60 +81,60 @@ extern "C" { bool CPU_ProbeAES() { #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; + return false; #elif (CRYPTOPP_ARM_AES_AVAILABLE) # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - // AES encrypt and decrypt - uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); - uint8x16_t r1 = vaeseq_u8(data, key); - uint8x16_t r2 = vaesdq_u8(data, key); - r1 = vaesmcq_u8(r1); - r2 = vaesimcq_u8(r2); + volatile bool result = true; + __try + { + // AES encrypt and decrypt + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + r1 = vaesmcq_u8(r1); + r2 = vaesimcq_u8(r2); - result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; # else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); - uint8x16_t r1 = vaeseq_u8(data, key); - uint8x16_t r2 = vaesdq_u8(data, key); - r1 = vaesmcq_u8(r1); - r2 = vaesimcq_u8(r2); + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + r1 = vaesmcq_u8(r1); + r2 = vaesimcq_u8(r2); - // Hack... GCC optimizes away the code and returns true - result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); - } + // Hack... 
GCC optimizes away the code and returns true + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; # endif #else - return false; + return false; #endif // CRYPTOPP_ARM_AES_AVAILABLE } #endif // ARM32 or ARM64 @@ -161,296 +145,192 @@ bool CPU_ProbeAES() ANONYMOUS_NAMESPACE_BEGIN -#if defined(CRYPTOPP_LITTLE_ENDIAN) -const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t -#else -const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t -#endif - -static inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds) +static inline void ARMV8_Enc_Block(uint64x2_t &data, const word32 *subkeys, unsigned int rounds) { - CRYPTOPP_ASSERT(subkeys); - const byte *keys = reinterpret_cast(subkeys); + CRYPTOPP_ASSERT(subkeys); + const byte *keys = reinterpret_cast(subkeys); + uint8x16_t block = vreinterpretq_u8_u64(data); - // AES single round encryption - block = vaeseq_u8(block, vld1q_u8(keys+0*16)); - // AES mix columns - block = vaesmcq_u8(block); + // AES single round encryption + block = vaeseq_u8(block, vld1q_u8(keys+0*16)); + // AES mix columns + block = vaesmcq_u8(block); - for (unsigned int i=1; i(subkeys); - uint8x16_t key; + CRYPTOPP_ASSERT(subkeys); + const byte *keys = reinterpret_cast(subkeys); - for (unsigned int i=0; i(subkeys); + CRYPTOPP_ASSERT(subkeys); + const byte *keys = reinterpret_cast(subkeys); + uint8x16_t block = vreinterpretq_u8_u64(data); - // AES single round decryption - block = vaesdq_u8(block, vld1q_u8(keys+0*16)); - // AES inverse mix columns - block = vaesimcq_u8(block); + // AES single round decryption + block = vaesdq_u8(block, vld1q_u8(keys+0*16)); + // AES inverse mix columns + block = vaesimcq_u8(block); - for (unsigned int i=1; i(subkeys); + CRYPTOPP_ASSERT(subkeys); + const byte *keys = reinterpret_cast(subkeys); - uint8x16_t key; - for (unsigned int i=0; i -size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F6 func6, const word32 *subKeys, size_t rounds, - const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 16); - - const size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; - - if (flags & BlockTransformation::BT_ReverseDirection) - { - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 6*blockSize) - { - uint8x16_t block0, block1, block2, block3, block4, block5, temp; - block0 = vld1q_u8(inBlocks); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - uint32x4_t be = vld1q_u32(s_one); - block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be); - block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be); - block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be); - block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be); - block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be); - temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be); - vst1q_u8(const_cast(inBlocks), temp); - } - else - { - const int inc = static_cast(inIncrement); - block1 = vld1q_u8(inBlocks+1*inc); - block2 = vld1q_u8(inBlocks+2*inc); - block3 = vld1q_u8(inBlocks+3*inc); - block4 = vld1q_u8(inBlocks+4*inc); - block5 = vld1q_u8(inBlocks+5*inc); - inBlocks += 6*inc; - } - - if (flags & BlockTransformation::BT_XorInput) - { - const int inc = static_cast(xorIncrement); - block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc)); - block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc)); - block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc)); - block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc)); - block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc)); - block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc)); - xorBlocks += 6*inc; - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - const int inc = static_cast(xorIncrement); - block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc)); - block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc)); - block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc)); - block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc)); - block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc)); - block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc)); - xorBlocks += 6*inc; - } - - const int inc = static_cast(outIncrement); - vst1q_u8(outBlocks+0*inc, block0); - vst1q_u8(outBlocks+1*inc, block1); - vst1q_u8(outBlocks+2*inc, block2); - vst1q_u8(outBlocks+3*inc, block3); - vst1q_u8(outBlocks+4*inc, block4); - vst1q_u8(outBlocks+5*inc, block5); - - outBlocks += 6*inc; - length -= 6*blockSize; - } - } - - while (length >= blockSize) - { - uint8x16_t block = vld1q_u8(inBlocks); - - if (flags & BlockTransformation::BT_XorInput) - block = veorq_u8(block, vld1q_u8(xorBlocks)); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[15]++; - - func1(block, subKeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = veorq_u8(block, vld1q_u8(xorBlocks)); - - vst1q_u8(outBlocks, block); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - - return length; + // Final Add (bitwise Xor) + key = vld1q_u8(keys+rounds*16); + data0 = vreinterpretq_u64_u8(veorq_u8(block0, key)); + data1 = vreinterpretq_u64_u8(veorq_u8(block1, key)); + data2 = vreinterpretq_u64_u8(veorq_u8(block2, key)); + data3 = vreinterpretq_u64_u8(veorq_u8(block3, key)); + data4 = vreinterpretq_u64_u8(veorq_u8(block4, key)); + data5 = 
vreinterpretq_u64_u8(veorq_u8(block5, key)); } ANONYMOUS_NAMESPACE_END @@ -458,14 +338,14 @@ ANONYMOUS_NAMESPACE_END size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Enc_Block, ARMV8_Enc_6_Blocks, + return AdvancedProcessBlocks128_NEON1x6(ARMV8_Enc_Block, ARMV8_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Dec_Block, ARMV8_Dec_6_Blocks, + return AdvancedProcessBlocks128_NEON1x6(ARMV8_Dec_Block, ARMV8_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } @@ -477,291 +357,174 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t ro ANONYMOUS_NAMESPACE_BEGIN -CRYPTOPP_ALIGN_DATA(16) -const word32 s_one[] = {0, 0, 0, 1<<24}; - /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ CRYPTOPP_ALIGN_DATA(16) const word32 s_rconLE[] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 }; -static inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds) +static inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST word32 *subkeys, unsigned int rounds) { - block = _mm_xor_si128(block, subkeys[0]); - for (unsigned int i=1; i(subkeys); + + block = _mm_xor_si128(block, skeys[0]); + for (unsigned int i=1; i(subkeys); + + __m128i rk = skeys[0]; + block0 = _mm_xor_si128(block0, rk); + block1 = _mm_xor_si128(block1, rk); + block2 = _mm_xor_si128(block2, rk); + block3 = _mm_xor_si128(block3, rk); + for (unsigned int i=1; i(subkeys); + + block = _mm_xor_si128(block, skeys[0]); + for (unsigned int i=1; i(subkeys); -template -static inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, - MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 16); - - const size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; - MAYBE_CONST __m128i *subkeys = reinterpret_cast(subKeys); - - if (flags & BlockTransformation::BT_ReverseDirection) - { - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 4*blockSize) - { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - const __m128i be1 = *CONST_M128_CAST(s_one); - block1 = _mm_add_epi32(block0, be1); - block2 = _mm_add_epi32(block1, be1); - block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); - } - else - { - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func4(block0, block1, block2, block3, subkeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks += outIncrement; - - length -= 4*blockSize; - } - } - - while (length >= blockSize) - { - __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - - if (flags & BlockTransformation::BT_XorInput) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[15]++; - - func1(block, subkeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - - _mm_storeu_si128(M128_CAST(outBlocks), block); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - - return length; + __m128i rk = skeys[0]; + block0 = _mm_xor_si128(block0, rk); + block1 = _mm_xor_si128(block1, rk); + block2 = _mm_xor_si128(block2, rk); + block3 = _mm_xor_si128(block3, rk); + for (unsigned int i=1; i(r0, r1); /* line 2 */ - r3 
= VectorEncryptLast(r3, r4); /* line 3 */ + r3 = VectorPermute(r1, r1, r5); /* line 1 */ + r6 = VectorShiftLeft<12>(r0, r1); /* line 2 */ + r3 = VectorEncryptLast(r3, r4); /* line 3 */ - r1 = VectorXor(r1, r6); /* line 4 */ - r6 = VectorShiftLeft<12>(r0, r1); /* line 5 */ - r1 = VectorXor(r1, r6); /* line 6 */ - r6 = VectorShiftLeft<12>(r0, r1); /* line 7 */ - r1 = VectorXor(r1, r6); /* line 8 */ + r1 = VectorXor(r1, r6); /* line 4 */ + r6 = VectorShiftLeft<12>(r0, r1); /* line 5 */ + r1 = VectorXor(r1, r6); /* line 6 */ + r6 = VectorShiftLeft<12>(r0, r1); /* line 7 */ + r1 = VectorXor(r1, r6); /* line 8 */ - // Caller handles r4 (rcon) addition - // r4 = VectorAdd(r4, r4); /* line 9 */ + // Caller handles r4 (rcon) addition + // r4 = VectorAdd(r4, r4); /* line 9 */ - // r1 is ready for next round - r1 = VectorXor(r1, r3); /* line 10 */ - return r1; + // r1 is ready for next round + r1 = VectorXor(r1, r3); /* line 10 */ + return r1; } static inline uint8_t* IncrementPointerAndStore(const uint8x16_p8& r, uint8_t* p) { - VectorStore(r, (p += 16)); - return p; + VectorStore(r, (p += 16)); + return p; } static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds) { - CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); - const byte *keys = reinterpret_cast(subkeys); + CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); + const byte *keys = reinterpret_cast(subkeys); - VectorType k = VectorLoadKey(keys); - block = VectorXor(block, k); + VectorType k = VectorLoadKey(keys); + block = VectorXor(block, k); - for (size_t i=1; i(subkeys); + CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); + const byte *keys = reinterpret_cast(subkeys); - VectorType k = VectorLoadKey(keys); - block0 = VectorXor(block0, k); - block1 = VectorXor(block1, k); - block2 = VectorXor(block2, k); - block3 = VectorXor(block3, k); - block4 = VectorXor(block4, k); - block5 = VectorXor(block5, k); + VectorType k = VectorLoadKey(keys); + block0 = VectorXor(block0, k); + block1 = VectorXor(block1, k); + block2 = VectorXor(block2, k); + block3 = VectorXor(block3, k); + block4 = VectorXor(block4, k); + block5 = VectorXor(block5, k); - for (size_t i=1; i(subkeys); + CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); + const byte *keys = reinterpret_cast(subkeys); - VectorType k = VectorLoadKey(rounds*16, keys); - block = VectorXor(block, k); + VectorType k = VectorLoadKey(rounds*16, keys); + block = VectorXor(block, k); - for (size_t i=rounds-1; i>1; i-=2) - { - block = VectorDecrypt(block, VectorLoadKey( i*16, keys)); - block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys)); - } + for (size_t i=rounds-1; i>1; i-=2) + { + block = VectorDecrypt(block, VectorLoadKey( i*16, keys)); + block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys)); + } - block = VectorDecrypt(block, VectorLoadKey(16, keys)); - block = VectorDecryptLast(block, VectorLoadKey(0, keys)); + block = VectorDecrypt(block, VectorLoadKey(16, keys)); + block = VectorDecryptLast(block, VectorLoadKey(0, keys)); } static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1, VectorType &block2, VectorType &block3, VectorType &block4, VectorType &block5, const word32 *subkeys, unsigned int rounds) { - CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); - const byte *keys = reinterpret_cast(subkeys); + CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); + const byte *keys = reinterpret_cast(subkeys); - VectorType k = VectorLoadKey(rounds*16, keys); - block0 = VectorXor(block0, k); - block1 = VectorXor(block1, k); - block2 = VectorXor(block2, k); - block3 = 
VectorXor(block3, k); - block4 = VectorXor(block4, k); - block5 = VectorXor(block5, k); + VectorType k = VectorLoadKey(rounds*16, keys); + block0 = VectorXor(block0, k); + block1 = VectorXor(block1, k); + block2 = VectorXor(block2, k); + block3 = VectorXor(block3, k); + block4 = VectorXor(block4, k); + block5 = VectorXor(block5, k); - for (size_t i=rounds-1; i>0; --i) - { - k = VectorLoadKey(i*16, keys); - block0 = VectorDecrypt(block0, k); - block1 = VectorDecrypt(block1, k); - block2 = VectorDecrypt(block2, k); - block3 = VectorDecrypt(block3, k); - block4 = VectorDecrypt(block4, k); - block5 = VectorDecrypt(block5, k); - } + for (size_t i=rounds-1; i>0; --i) + { + k = VectorLoadKey(i*16, keys); + block0 = VectorDecrypt(block0, k); + block1 = VectorDecrypt(block1, k); + block2 = VectorDecrypt(block2, k); + block3 = VectorDecrypt(block3, k); + block4 = VectorDecrypt(block4, k); + block5 = VectorDecrypt(block5, k); + } - k = VectorLoadKey(0, keys); - block0 = VectorDecryptLast(block0, k); - block1 = VectorDecryptLast(block1, k); - block2 = VectorDecryptLast(block2, k); - block3 = VectorDecryptLast(block3, k); - block4 = VectorDecryptLast(block4, k); - block5 = VectorDecryptLast(block5, k); + k = VectorLoadKey(0, keys); + block0 = VectorDecryptLast(block0, k); + block1 = VectorDecryptLast(block1, k); + block2 = VectorDecryptLast(block2, k); + block3 = VectorDecryptLast(block3, k); + block4 = VectorDecryptLast(block4, k); + block5 = VectorDecryptLast(block5, k); } template size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F6 func6, const word32 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 16); + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlocks); + CRYPTOPP_ASSERT(outBlocks); + CRYPTOPP_ASSERT(length >= 16); - const size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; + const size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; - if (flags & BlockTransformation::BT_ReverseDirection) - { - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } + if (flags & BlockTransformation::BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 6*blockSize) - { + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 6*blockSize) + { #if defined(CRYPTOPP_LITTLE_ENDIAN) - const VectorType one = (VectorType)((uint64x2_p8){1,0}); + const VectorType one = (VectorType)((uint64x2_p8){1,0}); #else - const VectorType one = (VectorType)((uint64x2_p8){0,1}); + const VectorType one = (VectorType)((uint64x2_p8){0,1}); #endif - VectorType block0, block1, block2, block3, block4, block5, temp; - block0 = VectorLoad(inBlocks); + VectorType block0, block1, block2, block3, block4, block5, temp; + block0 = VectorLoad(inBlocks); - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - block1 = VectorAdd(block0, one); - block2 = VectorAdd(block1, one); - block3 = VectorAdd(block2, one); - block4 = VectorAdd(block3, one); - block5 = VectorAdd(block4, one); - temp = VectorAdd(block5, one); - VectorStore(temp, const_cast(inBlocks)); - } - else - { - const int inc = static_cast(inIncrement); - block1 = VectorLoad(1*inc, inBlocks); - block2 = VectorLoad(2*inc, inBlocks); - block3 = VectorLoad(3*inc, inBlocks); - block4 = VectorLoad(4*inc, inBlocks); - block5 = VectorLoad(5*inc, inBlocks); - inBlocks += 6*inc; - } + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + block1 = VectorAdd(block0, one); + block2 = VectorAdd(block1, one); + block3 = VectorAdd(block2, one); + block4 = VectorAdd(block3, one); + block5 = VectorAdd(block4, one); + temp = VectorAdd(block5, one); + VectorStore(temp, const_cast(inBlocks)); + } + else + { + const int inc = static_cast(inIncrement); + block1 = VectorLoad(1*inc, inBlocks); + block2 = VectorLoad(2*inc, inBlocks); + block3 = VectorLoad(3*inc, inBlocks); + block4 = VectorLoad(4*inc, inBlocks); + block5 = VectorLoad(5*inc, inBlocks); + inBlocks += 6*inc; + } - if (flags & BlockTransformation::BT_XorInput) - { - const int inc = static_cast(xorIncrement); - block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); - block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); - block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); - block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); - block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); - block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); - xorBlocks += 6*inc; - } + if (flags & BlockTransformation::BT_XorInput) + { + const int inc = static_cast(xorIncrement); + block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); + block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); + block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); + block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); + block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); + block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); + xorBlocks += 6*inc; + } - func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); + func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); - if (xorBlocks && !(flags & 
BlockTransformation::BT_XorInput)) - { - const int inc = static_cast(xorIncrement); - block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); - block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); - block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); - block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); - block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); - block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); - xorBlocks += 6*inc; - } + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + const int inc = static_cast(xorIncrement); + block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); + block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); + block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); + block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); + block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); + block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); + xorBlocks += 6*inc; + } - const int inc = static_cast(outIncrement); - VectorStore(block0, outBlocks+0*inc); - VectorStore(block1, outBlocks+1*inc); - VectorStore(block2, outBlocks+2*inc); - VectorStore(block3, outBlocks+3*inc); - VectorStore(block4, outBlocks+4*inc); - VectorStore(block5, outBlocks+5*inc); + const int inc = static_cast(outIncrement); + VectorStore(block0, outBlocks+0*inc); + VectorStore(block1, outBlocks+1*inc); + VectorStore(block2, outBlocks+2*inc); + VectorStore(block3, outBlocks+3*inc); + VectorStore(block4, outBlocks+4*inc); + VectorStore(block5, outBlocks+5*inc); - outBlocks += 6*inc; - length -= 6*blockSize; - } - } + outBlocks += 6*inc; + length -= 6*blockSize; + } + } - while (length >= blockSize) - { - VectorType block = VectorLoad(inBlocks); + while (length >= blockSize) + { + VectorType block = VectorLoad(inBlocks); - if (flags & BlockTransformation::BT_XorInput) - block = VectorXor(block, VectorLoad(xorBlocks)); + if (flags & BlockTransformation::BT_XorInput) + block = VectorXor(block, VectorLoad(xorBlocks)); - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[15]++; + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; - func1(block, subKeys, rounds); + func1(block, subKeys, rounds); - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = VectorXor(block, VectorLoad(xorBlocks)); + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = VectorXor(block, VectorLoad(xorBlocks)); - VectorStore(block, outBlocks); + VectorStore(block, outBlocks); - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } - return length; + return length; } ANONYMOUS_NAMESPACE_END @@ -1065,111 +828,111 @@ ANONYMOUS_NAMESPACE_END void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* rk, const word32* rc, const byte* Se) { - const size_t rounds = keyLen / 4 + 6; - if (keyLen == 16) - { - std::memcpy(rk, userKey, keyLen); - uint8_t* skptr = (uint8_t*)rk; + const size_t rounds = keyLen / 4 + 6; + if (keyLen == 16) + { + std::memcpy(rk, userKey, keyLen); + uint8_t* skptr = (uint8_t*)rk; - uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey(skptr); - uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]); - uint8x16_p8 r5 = (uint8x16_p8)VectorLoadKey(s_mask); + uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey(skptr); + uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]); + uint8x16_p8 r5 = 
(uint8x16_p8)VectorLoadKey(s_mask); #if defined(CRYPTOPP_LITTLE_ENDIAN) - // Only the user key requires byte reversing. - // The subkeys are stored in proper endianess. - ReverseByteArrayLE(skptr); + // Only the user key requires byte reversing. + // The subkeys are stored in proper endianess. + ReverseByteArrayLE(skptr); #endif - for (unsigned int i=0; i -inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F2 func2, F6 func6, - const word64 *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 16); - - const size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; - - if (flags & BlockTransformation::BT_ReverseDirection) - { - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 6*blockSize) - { - __m128i block0, block1, block2, block3, block4, block5; - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - const __m128i be1 = *CONST_M128_CAST(s_one128); - block1 = _mm_add_epi32(block0, be1); - block2 = _mm_add_epi32(block1, be1); - block3 = _mm_add_epi32(block2, be1); - block4 = _mm_add_epi32(block3, be1); - block5 = _mm_add_epi32(block4, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1)); - } - else - { - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. 
- CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block4); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block5); - outBlocks += outIncrement; - - length -= 6*blockSize; - } - - while (length >= 2*blockSize) - { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - const __m128i be1 = *CONST_M128_CAST(s_one128); - block1 = _mm_add_epi32(block0, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); - } - else - { - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. 
- CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks += outIncrement; - - length -= 2*blockSize; - } - } - - while (length >= blockSize) - { - __m128i block, zero = _mm_setzero_si128(); - block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - - if (flags & BlockTransformation::BT_XorInput) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[15]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - - _mm_storeu_si128(M128_CAST(outBlocks), block); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - - return length; -} - #endif // CRYPTOPP_SSSE3_AVAILABLE #if defined(CRYPTOPP_SSE41_AVAILABLE) @@ -1302,236 +1123,6 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, block5 = _mm_unpackhi_epi32(x3, y3); } -template -inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6, - const word32 *subKeys, size_t rounds, const byte *inBlocks, - const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlocks); - CRYPTOPP_ASSERT(outBlocks); - CRYPTOPP_ASSERT(length >= 8); - - // Fake block size to match XMM word - const size_t xmmBlockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - size_t xorIncrement = xorBlocks ? xmmBlockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - - if (flags & BlockTransformation::BT_ReverseDirection) - { - inBlocks += length - xmmBlockSize; - xorBlocks += length - xmmBlockSize; - outBlocks += length - xmmBlockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 6*xmmBlockSize) - { - __m128i block0, block1, block2, block3, block4, block5; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. - block0 = _mm_add_epi32(*CONST_M128_CAST(s_one64_1b), _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(inBlocks)))); - - // After initial increment of {0,1} remaining counters increment by {1,1}. 
- const __m128i be2 = *CONST_M128_CAST(s_one64_2b); - block1 = _mm_add_epi32(be2, block0); - block2 = _mm_add_epi32(be2, block1); - block3 = _mm_add_epi32(be2, block2); - block4 = _mm_add_epi32(be2, block3); - block5 = _mm_add_epi32(be2, block4); - - // Store the next counter. - _mm_store_sd(reinterpret_cast(const_cast(inBlocks)), - _mm_castsi128_pd(_mm_add_epi32(be2, block5))); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block2); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block3); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block4); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block5); - outBlocks += outIncrement; - - length -= 6*xmmBlockSize; - } - - while (length >= 2*xmmBlockSize) - { - __m128i block0, block1; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes. - // After the dup load we have two counters in the XMM word. Then we need - // to increment the low ctr by 0 and the high ctr by 1. 
- block0 = _mm_add_epi32(*CONST_M128_CAST(s_one64_1b), _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(inBlocks)))); - - // After initial increment of {0,1} remaining counters increment by {1,1}. - const __m128i be2 = *CONST_M128_CAST(s_one64_2b); - block1 = _mm_add_epi32(be2, block0); - - // Store the next counter. - _mm_store_sd(reinterpret_cast(const_cast(inBlocks)), - _mm_castsi128_pd(_mm_add_epi64(be2, block1))); - } - else - { - block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func2(block0, block1, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128_CAST(outBlocks), block1); - outBlocks += outIncrement; - - length -= 2*xmmBlockSize; - } - } - - if (length) - { - // Adjust to real block size - const size_t blockSize = 8; - if (flags & BlockTransformation::BT_ReverseDirection) - { - inIncrement += inIncrement ? blockSize : 0; - xorIncrement += xorIncrement ? blockSize : 0; - outIncrement += outIncrement ? blockSize : 0; - inBlocks -= inIncrement; - xorBlocks -= xorIncrement; - outBlocks -= outIncrement; - } - else - { - inIncrement -= inIncrement ? blockSize : 0; - xorIncrement -= xorIncrement ? blockSize : 0; - outIncrement -= outIncrement ? 
blockSize : 0; - } - - while (length >= blockSize) - { - __m128i block, zero = _mm_setzero_si128(); - block = _mm_castpd_si128( - _mm_load_sd(reinterpret_cast(inBlocks))); - - if (flags & BlockTransformation::BT_XorInput) - { - block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_load_sd(reinterpret_cast(xorBlocks)))); - } - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[7]++; - - func2(block, zero, subKeys, static_cast(rounds)); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_load_sd(reinterpret_cast(xorBlocks)))); - } - - _mm_store_sd(reinterpret_cast(outBlocks), _mm_castsi128_pd(block)); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - } - - return length; -} - #endif // CRYPTOPP_SSE41_AVAILABLE ANONYMOUS_NAMESPACE_END @@ -1580,14 +1171,14 @@ size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rou size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, + return AdvancedProcessBlocks64_SSE2x6(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, + return AdvancedProcessBlocks64_SSE2x6(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif @@ -1596,14 +1187,14 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, + return AdvancedProcessBlocks128_SSE2x6(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, + return AdvancedProcessBlocks128_SSE2x6(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif // CRYPTOPP_SSSE3_AVAILABLE
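
For illustration only, here is a minimal sketch of the caller pattern this refactor converges on: a cipher's SIMD source now supplies just a one-block and a multi-block round kernel and forwards all pointer walking, counter handling and BT_* flag logic to the shared templates in adv-simd.h. The cipher name, kernel bodies and function names below (MyCipher_*) are hypothetical placeholders and not part of the patch; only AdvancedProcessBlocks128_SSE1x4, MAYBE_CONST, MAYBE_UNCONST_CAST and the kernel signatures are taken from the code above.

    // Hypothetical wiring of a cipher against the shared SSE template (sketch only).
    // The "rounds" below are stand-in XORs so the control flow through
    // AdvancedProcessBlocks128_SSE1x4 is visible; a real cipher runs its own rounds.
    #include <emmintrin.h>
    #include "adv-simd.h"

    NAMESPACE_BEGIN(CryptoPP)

    static inline void MyCipher_Enc_Block(__m128i &block,
            MAYBE_CONST word32 *subkeys, unsigned int rounds)
    {
        const __m128i *rk = reinterpret_cast<const __m128i*>(subkeys);
        for (unsigned int i = 0; i <= rounds; ++i)
            block = _mm_xor_si128(block, rk[i]);    // placeholder round
    }

    static inline void MyCipher_Enc_4_Blocks(__m128i &block0, __m128i &block1,
            __m128i &block2, __m128i &block3,
            MAYBE_CONST word32 *subkeys, unsigned int rounds)
    {
        // The 4-wide kernel applies the same rounds to four blocks, which is
        // what the BT_AllowParallel path of the template expects from func4.
        MyCipher_Enc_Block(block0, subkeys, rounds);
        MyCipher_Enc_Block(block1, subkeys, rounds);
        MyCipher_Enc_Block(block2, subkeys, rounds);
        MyCipher_Enc_Block(block3, subkeys, rounds);
    }

    size_t MyCipher_Enc_AdvancedProcessBlocks_SSE(const word32 *subKeys, size_t rounds,
        const byte *inBlocks, const byte *xorBlocks, byte *outBlocks,
        size_t length, word32 flags)
    {
        // Everything else -- increments, reverse direction, counter blocks,
        // xor-input handling -- lives in the shared template. MAYBE_UNCONST_CAST
        // keeps the SunCC workaround localized to the call site.
        return AdvancedProcessBlocks128_SSE1x4(MyCipher_Enc_Block, MyCipher_Enc_4_Blocks,
            MAYBE_UNCONST_CAST(word32*, subKeys), rounds,
            inBlocks, xorBlocks, outBlocks, length, flags);
    }

    NAMESPACE_END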
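
The 64-bit block paths (AdvancedProcessBlocks64_SSE2x6 and the former SIMON64 routine) rely on a counter trick that is easy to miss in the diff: the 8-byte counter block is duplicated into both XMM halves with _mm_loaddup_pd, the first add of s_one32x4_1b bumps only the high half (giving counter+0 and counter+1), and each later add of s_one32x4_2b bumps both halves by 2. The standalone program below reproduces that arithmetic with locally defined constants; it is an illustration of the technique, not code from the library, and like the patch it only touches the counter's last byte (no carry handling).

    // Standalone demo of the dup-load counter increment used for 64-bit blocks.
    // Constants mirror the little-endian s_one32x4_1b / s_one32x4_2b tables above.
    #include <emmintrin.h>
    #include <pmmintrin.h>   // _mm_loaddup_pd (SSE3)
    #include <cstdio>
    #include <cstring>

    int main()
    {
        unsigned char ctr[8] = {0};   // big-endian 8-byte counter, value 0

        // Lane layout matches the {0,0,0,1<<24} and {0,2<<24,0,2<<24} word32 arrays.
        const __m128i one_1b = _mm_set_epi32(1<<24, 0, 0, 0);
        const __m128i one_2b = _mm_set_epi32(2<<24, 0, 2<<24, 0);

        // Duplicate the counter into both halves, then add {0,1} to the halves.
        __m128i block0 = _mm_add_epi32(one_1b, _mm_castpd_si128(
            _mm_loaddup_pd(reinterpret_cast<const double*>(ctr))));
        // Each subsequent block adds {2,2}.
        __m128i block1 = _mm_add_epi32(one_2b, block0);

        unsigned char out[32];
        std::memcpy(out, &block0, 16);
        std::memcpy(out + 16, &block1, 16);
        // The last byte of each 8-byte half now reads 0, 1, 2, 3.
        for (int i = 0; i < 4; ++i)
            std::printf("counter %d = %u\n", i, out[i*8 + 7]);
        return 0;
    }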