Add AdvancedProcessBlocks for Power8

This increases performance to about 1.6 cpb. We are about 0.5 cpb behind Botan and about 1.0 cpb behind OpenSSL. However, it beats the snot out of the C/C++ implementation, which runs at 20 to 30 cpb.
pull/507/head
Jeffrey Walton 2017-09-12 18:15:55 -04:00
parent 2ebd30d43c
commit 6899d3f8bb
3 changed files with 273 additions and 44 deletions
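
The cycles-per-byte figures above come from benchmarking. As a rough illustration of how such a number is derived from a timed loop, here is a minimal, self-contained sketch; the buffer size, iteration count, assumed clock rate, and the trivial stand-in transform are illustrative assumptions, not the library's benchmark harness. A real measurement would call the keyed AES routine instead of the stand-in.

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Trivial stand-in for the routine under test; swap in the cipher call
// (e.g. AdvancedProcessBlocks on a keyed encryptor) for a real measurement.
static void TransformBuffer(uint8_t* data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
        data[i] ^= 0x5a;
}

int main()
{
    const size_t size = 1024 * 1024;   // 1 MiB working set (assumption)
    const size_t iterations = 256;     // repetitions to smooth out timer noise
    const double cpuHz = 3.5e9;        // assumed clock rate of the test machine

    std::vector<uint8_t> buf(size, 0);

    const auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iterations; ++i)
        TransformBuffer(buf.data(), buf.size());
    const auto stop = std::chrono::steady_clock::now();

    const double seconds = std::chrono::duration<double>(stop - start).count();
    const double bytes = static_cast<double>(size) * iterations;
    const double cpb = seconds * cpuHz / bytes;   // cycles spent per byte
    std::printf("%.1f MB/s, %.2f cpb\n", bytes / seconds / 1.0e6, cpb);
    return 0;
}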


@@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16])
#endif
}
uint8x16_p8 Load8x16(int off, const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return vec_xl_be(off, s);
# else
return vec_xl(off, s);
# endif
#else
/* GCC, Clang, etc */
return (uint8x16_p8)vec_vsx_ld(off, src);
#endif
}
void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16])
#endif
}
uint64x2_p8 Load64x2(int off, const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return (uint64x2_p8)vec_xl_be(off, s);
# else
return (uint64x2_p8)vec_xl(off, s);
# endif
#else
/* GCC, Clang, etc */
# if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
# else
return (uint64x2_p8)vec_vsx_ld(off, src);
# endif
#endif
}
void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16])
#endif
}
inline VectorType VectorLoad(int off, const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(off, src);
#elif defined(CRYPTOPP_GCC_VERSION)
return Load64x2(off, src);
#endif
}
inline VectorType VectorLoadAligned(const byte vec[16])
{
return (VectorType)vec_ld(0, vec);
@@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
#endif
}
//////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
block = VectorXor(block, k);
s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
{
s = VectorEncrypt(s, VectorLoadAligned( i*16, keys));
s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
block = VectorEncrypt(block, VectorLoadAligned( i*16, keys));
block = VectorEncrypt(block, VectorLoadAligned((i+1)*16, keys));
}
s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
block = VectorEncrypt(block, VectorLoadAligned((rounds-1)*16, keys));
block = VectorEncryptLast(block, VectorLoadAligned(rounds*16, keys));
}
void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
VectorType &block3, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadAligned(keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
for (size_t i=1; i<rounds; ++i)
{
k = VectorLoadAligned(i*16, keys);
block0 = VectorEncrypt(block0, k);
block1 = VectorEncrypt(block1, k);
block2 = VectorEncrypt(block2, k);
block3 = VectorEncrypt(block3, k);
}
k = VectorLoadAligned(rounds*16, keys);
block0 = VectorEncryptLast(block0, k);
block1 = VectorEncryptLast(block1, k);
block2 = VectorEncryptLast(block2, k);
block3 = VectorEncryptLast(block3, k);
}
inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(rounds*16, keys);
block = VectorXor(block, k);
s = VectorXor(s, k);
for (size_t i=rounds-1; i>1; i-=2)
{
s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
block = VectorDecrypt(block, VectorLoadAligned( i*16, keys));
block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys));
}
s = VectorDecrypt(s, VectorLoadAligned(16, keys));
s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
block = VectorDecrypt(block, VectorLoadAligned(16, keys));
block = VectorDecryptLast(block, VectorLoadAligned(0, keys));
}
inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
VectorType &block3, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadAligned(rounds*16, keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
block3 = VectorXor(block3, k);
for (size_t i=rounds-1; i>0; --i)
{
k = VectorLoadAligned(i*16, keys);
block0 = VectorDecrypt(block0, k);
block1 = VectorDecrypt(block1, k);
block2 = VectorDecrypt(block2, k);
block3 = VectorDecrypt(block3, k);
}
k = VectorLoadAligned(0, keys);
block0 = VectorDecryptLast(block0, k);
block1 = VectorDecryptLast(block1, k);
block2 = VectorDecryptLast(block2, k);
block3 = VectorDecryptLast(block3, k);
}
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
CRYPTOPP_ASSERT(inBlocks);
CRYPTOPP_ASSERT(outBlocks);
CRYPTOPP_ASSERT(length >= 16);
const size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
VectorType block0, block1, block2, block3, temp;
block0 = VectorLoad(inBlocks);
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
#if defined(IS_LITTLE_ENDIAN)
const VectorType one = {1};
#else
const VectorType one = (VectorType)(uint64x2_p8){0,1};
#endif
block1 = VectorAdd(block0, one);
block2 = VectorAdd(block1, one);
block3 = VectorAdd(block2, one);
temp = VectorAdd(block3, one);
VectorStore(temp, const_cast<byte*>(inBlocks));
}
else
{
//inBlocks += inIncrement;
block1 = VectorLoad(1*inIncrement, inBlocks);
//inBlocks += inIncrement;
block2 = VectorLoad(2*inIncrement, inBlocks);
//inBlocks += inIncrement;
block3 = VectorLoad(3*inIncrement, inBlocks);
//inBlocks += inIncrement;
inBlocks += 4*inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
xorBlocks += 4*xorIncrement;
}
func4(block0, block1, block2, block3, subKeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
//xorBlocks += xorIncrement;
xorBlocks += 4*xorIncrement;
}
// I can't get Store to run faster using indexed offsets
VectorStore(block0, outBlocks);
outBlocks += outIncrement;
VectorStore(block1, outBlocks);
outBlocks += outIncrement;
VectorStore(block2, outBlocks);
outBlocks += outIncrement;
VectorStore(block3, outBlocks);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
VectorType block = VectorLoad(inBlocks);
if (flags & BlockTransformation::BT_XorInput)
block = VectorXor(block, VectorLoad(xorBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subKeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = VectorXor(block, VectorLoad(xorBlocks));
VectorStore(block, outBlocks);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
NAMESPACE_END
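
The new Rijndael_Enc_AdvancedProcessBlocks_POWER8 and Rijndael_Dec_AdvancedProcessBlocks_POWER8 entry points follow the AdvancedProcessBlocks contract: they consume whole 16-byte blocks according to the flags and return the number of trailing bytes left unprocessed. A rough caller-side sketch follows; the helper name and key-schedule setup are placeholders, and real callers reach this code through Rijndael::Enc::AdvancedProcessBlocks with subkeys produced by UncheckedSetKey.

// Hypothetical helper showing how the dispatcher might be driven directly.
// subKeys must point at the 16-byte aligned round-key schedule; rounds is
// 10, 12 or 14 for AES-128/192/256; length should be at least one block.
size_t EncryptWholeBlocks(const word32* subKeys, size_t rounds,
                          const byte* in, byte* out, size_t length)
{
    // Forward direction, no xor input; BT_AllowParallel enables the 4-block path.
    const word32 flags = BlockTransformation::BT_AllowParallel;
    // The return value is the tail length (fewer than 16 bytes) not processed.
    return Rijndael_Enc_AdvancedProcessBlocks_POWER8(
        subKeys, rounds, in, NULLPTR /* no xor blocks */, out, length, flags);
}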


@@ -253,10 +253,10 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]);
extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
@@ -408,7 +408,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -502,7 +502,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -1130,7 +1130,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
// Do nothing
Rijndael::Enc::Enc() { }
#endif
@@ -1146,6 +1146,10 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
if (HasAES())
return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (HasAES())
return Rijndael_Enc_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
if (HasSSE2())
@@ -1205,11 +1209,14 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
if (HasAESNI())
return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
if (HasAES())
return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (HasAES())
return Rijndael_Dec_AdvancedProcessBlocks_POWER8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}


@@ -12,7 +12,8 @@
#include "seckey.h"
#include "secblock.h"
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || \
CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64
# define CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS 1
#endif