From f2a303c30b5256d37c4ed260d8631742eae1ef50 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 15 Aug 2017 12:11:17 -0400 Subject: [PATCH] Cut-over to Wei's wide AES encryption for ARMv8 Wei's AESNI routines use ARMV8_Enc_Block, ARMV8_Enc_4_Blocks, ARMV8_Dec_Block, ARMV8_Dec_4_Blocks. They increased performance for ECB, CTR and CBC mode. Formerly ECB mode was rinning at 2.3 cpb. After the cut-over ECB dropped to 1.1 cpb. --- rijndael-simd.cpp | 345 +++++++++++++++++++++++++++++++++++----------- rijndael.cpp | 44 +++--- rijndael.h | 8 +- 3 files changed, 299 insertions(+), 98 deletions(-) diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index aedd4b32..65c7a7c8 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -128,19 +128,16 @@ bool CPU_TryAES_ARMV8() struct utsname systemInfo; systemInfo.machine[0] = '\0'; uname(&systemInfo); - const char* machine = systemInfo.machine; - if (0==strcmp(machine, "iPhone6,1") || 0==strcmp(machine, "iPhone6,2") || - 0==strcmp(machine, "iPhone7,1") || 0==strcmp(machine, "iPhone7,2") || - 0==strcmp(machine, "iPad4,1") || 0==strcmp(machine, "iPad4,2") || - 0==strcmp(machine, "iPad4,3") || 0==strcmp(machine, "iPad4,4") || - 0==strcmp(machine, "iPad4,5") || 0==strcmp(machine, "iPad4,6") || - 0==strcmp(machine, "iPad4,7") || 0==strcmp(machine, "iPad4,8") || - 0==strcmp(machine, "iPad4,9") || - 0==strcmp(machine, "iPad5,3") || 0==strcmp(machine, "iPad5,4") ) - { - return true; - } + std::string machine(systemInfo.machine); + + if (machine.substr(0, 7) == "iPhone6" || machine.substr(0, 7) == "iPhone7" || + machine.substr(0, 7) == "iPhone8" || machine.substr(0, 7) == "iPhone9" || + machine.substr(0, 5) == "iPad4" || machine.substr(0, 5) == "iPad5" || + machine.substr(0, 5) == "iPad6" || machine.substr(0, 5) == "iPad7") + { + return true; + } } # endif @@ -181,104 +178,293 @@ bool CPU_TryAES_ARMV8() #endif // ARM32 or ARM64 #if (CRYPTOPP_ARM_AES_AVAILABLE) - -void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, - const word32 *subKeys, unsigned int rounds) +inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds) { - uint8x16_t data = vld1q_u8(inBlock); - const byte *keys = reinterpret_cast(subKeys); + const byte *keys = reinterpret_cast(subkeys); // Unroll the loop, profit 0.3 to 0.5 cpb. - data = vaeseq_u8(data, vld1q_u8(keys+0)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+16)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+32)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+48)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+64)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+80)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+96)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+112)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+128)); - data = vaesmcq_u8(data); + block = vaeseq_u8(block, vld1q_u8(keys+0)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+16)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+32)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+48)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+64)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+80)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+96)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+112)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+128)); + block = vaesmcq_u8(block); unsigned int i=9; for ( ; i(subKeys); + const byte *keys = reinterpret_cast(subkeys); + + unsigned int i=0; + for ( ; i(subkeys); // Unroll the loop, profit 0.3 to 0.5 cpb. - data = vaesdq_u8(data, vld1q_u8(keys+0)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+16)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+32)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+48)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+64)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+80)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+96)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+112)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+128)); - data = vaesimcq_u8(data); + block = vaesdq_u8(block, vld1q_u8(keys+0)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+16)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+32)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+48)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+64)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+80)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+96)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+112)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+128)); + block = vaesimcq_u8(block); unsigned int i=9; for ( ; i(subkeys); + + unsigned int i=0; + for ( ; i +size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subkeys, unsigned int rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; + + if (flags & BlockTransformation::BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 4*blockSize) + { + uint8x16_t block0, block1, block2, block3, temp; + block0 = vld1q_u8(inBlocks); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + uint32x4_t be = vld1q_u32(s_one); + block1 = vaddq_u8(block0, vreinterpretq_u8_u32(be)); + block2 = vaddq_u8(block1, vreinterpretq_u8_u32(be)); + block3 = vaddq_u8(block2, vreinterpretq_u8_u32(be)); + temp = vaddq_u8(block3, vreinterpretq_u8_u32(be)); + vst1q_u8(const_cast(inBlocks), temp); + } + else + { + inBlocks += inIncrement; + block1 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + block2 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + block3 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + } + + func4(block0, block1, block2, block3, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + } + + vst1q_u8(outBlocks, block0); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block1); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block2); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block3); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + uint8x16_t block = vld1q_u8(inBlocks); + + if (flags & BlockTransformation::BT_XorInput) + block = veorq_u8(block, vld1q_u8(xorBlocks)); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = veorq_u8(block, vld1q_u8(xorBlocks)); + + vst1q_u8(outBlocks, block); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + +size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Enc_Block, ARMV8_Enc_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Dec_Block, ARMV8_Dec_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + #endif // CRYPTOPP_ARM_AES_AVAILABLE #if (CRYPTOPP_AESNI_AVAILABLE) -void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds) +inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds) { block = _mm_xor_si128(block, subkeys[0]); for (unsigned int i=1; i= t0 || s1 > t0); } -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 struct Locals { word32 subkeys[4*12], workspace[8]; @@ -1071,13 +1070,24 @@ const size_t s_aliasBlockSize = 256; const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals); Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { } + +#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 + +#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +// Do nothing +Rijndael::Enc::Enc() { } #endif +#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { #if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if CRYPTOPP_ARM_AES_AVAILABLE + if (HasAES()) + return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) @@ -1132,19 +1142,21 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -#endif - -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { #if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif + +#if CRYPTOPP_ARM_AES_AVAILABLE + if (HasAES()) + return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 +#endif // CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS NAMESPACE_END diff --git a/rijndael.h b/rijndael.h index 32d69867..f6ab5bdb 100644 --- a/rijndael.h +++ b/rijndael.h @@ -16,6 +16,10 @@ # define CRYPTOPP_DISABLE_RIJNDAEL_ASM #endif +#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +# define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1 +#endif + NAMESPACE_BEGIN(CryptoPP) //! \brief Rijndael block cipher information @@ -55,7 +59,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 +#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS Enc(); size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; private: @@ -69,7 +73,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 +#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif };