diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 0c624fa8..ff46e564 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16]) #endif } +uint8x16_p8 Load8x16(int off, const uint8_t src[16]) +{ +#if defined(CRYPTOPP_XLC_VERSION) + /* http://stackoverflow.com/q/46124383/608639 */ + uint8_t* s = (uint8_t*)src; +# if defined(IS_LITTLE_ENDIAN) + return vec_xl_be(off, s); +# else + return vec_xl(off, s); +# endif +#else + /* GCC, Clang, etc */ + return (uint8x16_p8)vec_vsx_ld(off, src); +#endif +} + void Store8x16(const uint8x16_p8 src, uint8_t dest[16]) { #if defined(CRYPTOPP_XLC_VERSION) @@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16]) #endif } +uint64x2_p8 Load64x2(int off, const uint8_t src[16]) +{ +#if defined(CRYPTOPP_XLC_VERSION) + /* http://stackoverflow.com/q/46124383/608639 */ + uint8_t* s = (uint8_t*)src; +# if defined(IS_LITTLE_ENDIAN) + return (uint64x2_p8)vec_xl_be(off, s); +# else + return (uint64x2_p8)vec_xl(off, s); +# endif +#else + /* GCC, Clang, etc */ +# if defined(IS_LITTLE_ENDIAN) + const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + const uint8x16_p8 zero = {0}; + return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask); +# else + return (uint64x2_p8)vec_vsx_ld(off, src); +# endif +#endif +} + void Store64x2(const uint64x2_p8 src, uint8_t dest[16]) { #if defined(CRYPTOPP_XLC_VERSION) @@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16]) #endif } +inline VectorType VectorLoad(int off, const byte src[16]) +{ +#if defined(CRYPTOPP_XLC_VERSION) + return Load8x16(off, src); +#elif defined(CRYPTOPP_GCC_VERSION) + return Load64x2(off, src); +#endif +} + inline VectorType VectorLoadAligned(const byte vec[16]) { return (VectorType)vec_ld(0, vec); @@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key) #endif } -////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////// -void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds, - const byte *inBlock, const byte *xorBlock, byte *outBlock) +inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds) { CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); const byte *keys = reinterpret_cast(subkeys); - VectorType s = VectorLoad(inBlock); VectorType k = VectorLoadAligned(keys); + block = VectorXor(block, k); - s = VectorXor(s, k); for (size_t i=1; i(subkeys); + + VectorType k = VectorLoadAligned(keys); + block0 = VectorXor(block0, k); + block1 = VectorXor(block1, k); + block2 = VectorXor(block2, k); + block3 = VectorXor(block3, k); + + for (size_t i=1; i(subkeys); - VectorType s = VectorLoad(inBlock); VectorType k = VectorLoadAligned(rounds*16, keys); + block = VectorXor(block, k); - s = VectorXor(s, k); for (size_t i=rounds-1; i>1; i-=2) { - s = VectorDecrypt(s, VectorLoadAligned( i*16, keys)); - s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys)); + block = VectorDecrypt(block, VectorLoadAligned( i*16, keys)); + block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys)); } - s = VectorDecrypt(s, VectorLoadAligned(16, keys)); - s = VectorDecryptLast(s, VectorLoadAligned(0, keys)); - - // According to benchmarks this is a tad bit slower - // if (xorBlock) - // s = VectorXor(s, VectorLoad(xorBlock)); - - VectorType x = xorBlock ? 
VectorLoad(xorBlock) : (VectorType) {0}; - s = VectorXor(s, x); - - VectorStore(s, outBlock); + block = VectorDecrypt(block, VectorLoadAligned(16, keys)); + block = VectorDecryptLast(block, VectorLoadAligned(0, keys)); } + +inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2, + VectorType &block3, const word32 *subkeys, unsigned int rounds) +{ + CRYPTOPP_ASSERT(subkeys); + const byte *keys = reinterpret_cast<const byte*>(subkeys); + + VectorType k = VectorLoadAligned(rounds*16, keys); + block0 = VectorXor(block0, k); + block1 = VectorXor(block1, k); + block2 = VectorXor(block2, k); + block3 = VectorXor(block3, k); + + for (size_t i=rounds-1; i>0; --i) + { + k = VectorLoadAligned(i*16, keys); + block0 = VectorDecrypt(block0, k); + block1 = VectorDecrypt(block1, k); + block2 = VectorDecrypt(block2, k); + block3 = VectorDecrypt(block3, k); + } + + k = VectorLoadAligned(0, keys); + block0 = VectorDecryptLast(block0, k); + block1 = VectorDecryptLast(block1, k); + block2 = VectorDecryptLast(block2, k); + block3 = VectorDecryptLast(block3, k); +} + +template <typename F1, typename F4> +size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlocks); + CRYPTOPP_ASSERT(outBlocks); + CRYPTOPP_ASSERT(length >= 16); + + const size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; + + if (flags & BlockTransformation::BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 4*blockSize) + { + VectorType block0, block1, block2, block3, temp; + block0 = VectorLoad(inBlocks); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + { +#if defined(IS_LITTLE_ENDIAN) + const VectorType one = {1}; +#else + const VectorType one = (VectorType)(uint64x2_p8){0,1}; +#endif + block1 = VectorAdd(block0, one); + block2 = VectorAdd(block1, one); + block3 = VectorAdd(block2, one); + temp = VectorAdd(block3, one); + VectorStore(temp, const_cast<byte*>(inBlocks)); + } + else + { + //inBlocks += inIncrement; + block1 = VectorLoad(1*inIncrement, inBlocks); + //inBlocks += inIncrement; + block2 = VectorLoad(2*inIncrement, inBlocks); + //inBlocks += inIncrement; + block3 = VectorLoad(3*inIncrement, inBlocks); + //inBlocks += inIncrement; + inBlocks += 4*inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + xorBlocks += 4*xorIncrement; + } + + func4(block0, block1, block2, block3, subKeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block2 = VectorXor(block2, 
VectorLoad(2*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks)); + //xorBlocks += xorIncrement; + xorBlocks += 4*xorIncrement; + } + + // I can't get Store to run faster using indexed offsets + VectorStore(block0, outBlocks); + outBlocks += outIncrement; + VectorStore(block1, outBlocks); + outBlocks += outIncrement; + VectorStore(block2, outBlocks); + outBlocks += outIncrement; + VectorStore(block3, outBlocks); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + VectorType block = VectorLoad(inBlocks); + + if (flags & BlockTransformation::BT_XorInput) + block = VectorXor(block, VectorLoad(xorBlocks)); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast<byte*>(inBlocks)[15]++; + + func1(block, subKeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = VectorXor(block, VectorLoad(xorBlocks)); + + VectorStore(block, outBlocks); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + +size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + #endif // CRYPTOPP_POWER8_AES_AVAILABLE NAMESPACE_END diff --git a/rijndael.cpp b/rijndael.cpp index 3e016d6f..43c7fa80 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ 
-253,10 +253,10 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si #if (CRYPTOPP_POWER8_AES_AVAILABLE) extern void ByteReverseArrayLE(byte src[16]); -extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds, - const byte *inBlock, const byte *xorBlock, byte *outBlock); -extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds, - const byte *inBlock, const byte *xorBlock, byte *outBlock); +extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +extern size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); #endif void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &) @@ -408,7 +408,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #if (CRYPTOPP_POWER8_AES_AVAILABLE) if (HasAES()) { - (void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock); + (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); return; } #endif @@ -502,7 +502,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #if (CRYPTOPP_POWER8_AES_AVAILABLE) if (HasAES()) { - (void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock); + (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); return; } #endif @@ -1130,7 +1130,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { } #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 -#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64 // Do nothing Rijndael::Enc::Enc() { } 
#endif @@ -1146,6 +1146,10 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo if (HasAES()) return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif +#if CRYPTOPP_POWER8_AES_AVAILABLE + if (HasAES()) + return Rijndael_Enc_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) if (HasSSE2()) @@ -1205,11 +1209,14 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo if (HasAESNI()) return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif - #if CRYPTOPP_ARM_AES_AVAILABLE if (HasAES()) return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif +#if CRYPTOPP_POWER8_AES_AVAILABLE + if (HasAES()) + return Rijndael_Dec_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } diff --git a/rijndael.h b/rijndael.h index 483eab5a..b6cd0faf 100644 --- a/rijndael.h +++ b/rijndael.h @@ -12,7 +12,8 @@ #include "seckey.h" #include "secblock.h" -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || \ + CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64 # define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1 #endif