Add Power4 Vector Load, Store, Add and Xor

pull/548/merge
Jeffrey Walton 2018-01-02 08:13:42 -05:00
parent fac3a44a84
commit d6d53f2e9d
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 114 additions and 49 deletions

View File

@ -1285,11 +1285,11 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
{ {
while (length >= 6*blockSize) while (length >= 6*blockSize)
{ {
VectorType block0, block1, block2, block3, block4, block5, temp; uint32x4_p block0, block1, block2, block3, block4, block5, temp;
block0 = VectorLoad(inBlocks);
if (flags & BT_InBlockIsCounter) if (flags & BT_InBlockIsCounter)
{ {
block0 = VectorLoad(inBlocks);
block1 = VectorAdd(block0, s_one); block1 = VectorAdd(block0, s_one);
block2 = VectorAdd(block1, s_one); block2 = VectorAdd(block1, s_one);
block3 = VectorAdd(block2, s_one); block3 = VectorAdd(block2, s_one);
@ -1300,57 +1300,74 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
} }
else else
{ {
const int inc = static_cast<int>(inIncrement); block0 = VectorLoad(inBlocks);
block1 = VectorLoad(1*inc, inBlocks); inBlocks += inIncrement;
block2 = VectorLoad(2*inc, inBlocks); block1 = VectorLoad(inBlocks);
block3 = VectorLoad(3*inc, inBlocks); inBlocks += inIncrement;
block4 = VectorLoad(4*inc, inBlocks); block2 = VectorLoad(inBlocks);
block5 = VectorLoad(5*inc, inBlocks); inBlocks += inIncrement;
inBlocks += 6*inc; block3 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block4 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block5 = VectorLoad(inBlocks);
inBlocks += inIncrement;
} }
if (flags & BT_XorInput) if (flags & BT_XorInput)
{ {
const int inc = static_cast<int>(xorIncrement); block0 = VectorXor(block0, VectorLoad(xorBlocks));
block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); block1 = VectorXor(block1, VectorLoad(xorBlocks));
block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); block2 = VectorXor(block2, VectorLoad(xorBlocks));
block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); block3 = VectorXor(block3, VectorLoad(xorBlocks));
xorBlocks += 6*inc; xorBlocks += xorIncrement;
block4 = VectorXor(block4, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
} }
func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
if (xorBlocks && !(flags & BT_XorInput)) if (xorBlocks && !(flags & BT_XorInput))
{ {
const int inc = static_cast<int>(xorIncrement); block0 = VectorXor(block0, VectorLoad(xorBlocks));
block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks)); xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks)); block1 = VectorXor(block1, VectorLoad(xorBlocks));
block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks)); xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks)); block2 = VectorXor(block2, VectorLoad(xorBlocks));
block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks)); xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks)); block3 = VectorXor(block3, VectorLoad(xorBlocks));
xorBlocks += 6*inc; xorBlocks += xorIncrement;
block4 = VectorXor(block4, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
} }
const int inc = static_cast<int>(outIncrement); VectorStore(block0, outBlocks);
VectorStore(block0, outBlocks+0*inc); outBlocks += outIncrement;
VectorStore(block1, outBlocks+1*inc); VectorStore(block1, outBlocks);
VectorStore(block2, outBlocks+2*inc); outBlocks += outIncrement;
VectorStore(block3, outBlocks+3*inc); VectorStore(block2, outBlocks);
VectorStore(block4, outBlocks+4*inc); outBlocks += outIncrement;
VectorStore(block5, outBlocks+5*inc); VectorStore(block3, outBlocks);
outBlocks += outIncrement;
VectorStore(block4, outBlocks);
outBlocks += outIncrement;
VectorStore(block5, outBlocks);
outBlocks += outIncrement;
outBlocks += 6*inc;
length -= 6*blockSize; length -= 6*blockSize;
} }
} }
while (length >= blockSize) while (length >= blockSize)
{ {
VectorType block = VectorLoad(inBlocks); uint32x4_p block = VectorLoad(inBlocks);
if (flags & BT_XorInput) if (flags & BT_XorInput)
block = VectorXor(block, VectorLoad(xorBlocks)); block = VectorXor(block, VectorLoad(xorBlocks));

View File

@ -38,11 +38,59 @@ typedef __vector unsigned int uint32x4_p;
typedef __vector unsigned long long uint64x2_p; typedef __vector unsigned long long uint64x2_p;
#endif #endif
/// \brief Default vector type
typedef uint32x4_p VectorType;
#endif // CRYPTOPP_ALTIVEC_AVAILABLE #endif // CRYPTOPP_ALTIVEC_AVAILABLE
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE)
/// \brief Load 16 bytes from a byte array into a vector
/// \param src the byte array to read
/// \returns the 16 bytes as a uint32x4_p
/// \details Two vec_ld loads, at offsets 0 and 15 from src, are merged with
///   vec_perm using the control vector produced by vec_lvsl. When
///   CRYPTOPP_BIG_ENDIAN is not defined the 16 bytes are additionally
///   reversed before being returned.
/// \note NOTE(review): this appears to be the unaligned load sequence from
///   the AltiVec PEM linked in the body, so src presumably does not need to
///   be 16-byte aligned; confirm against the manual.
inline uint32x4_p VectorLoad(const byte src[16])
{
    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
    uint8_t* const addr = (uint8_t*)src;

    const uint8x16_p shift = vec_lvsl(0, addr);
    const uint8x16_p lo = vec_ld(0, addr);
    const uint8x16_p hi = vec_ld(15, addr);
    const uint8x16_p merged = vec_perm(lo, hi, shift);

#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Permute control that selects the bytes in reverse order
    const uint8x16_p reverse = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (uint32x4_p)vec_perm(merged, merged, reverse);
#else
    return (uint32x4_p)merged;
#endif
}
/// \brief Store a vector into a 16-byte array
/// \param data the vector to write
/// \param dest the destination byte array
/// \details When CRYPTOPP_LITTLE_ENDIAN is defined the 16 bytes of data are
///   reversed with a permute before being written; otherwise data is only
///   reinterpreted as a byte vector. The bytes are then permuted with the
///   control vector from vec_lvsr(0, dest) and written out with a series of
///   vec_ste element stores (byte, halfword and word sized).
/// \note NOTE(review): this appears to be the unaligned store sequence from
///   the AltiVec PEM linked in the body, so dest presumably does not need to
///   be 16-byte aligned; confirm against the manual.
inline void VectorStore(const uint32x4_p data, byte dest[16])
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
// Permute control that selects the bytes in reverse order
const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
#else
// No byte reversal in this configuration; only reinterpret the vector as bytes
const uint8x16_p t1 = (uint8x16_p)data;
#endif
// http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
// Permute the bytes using the control vector derived from the address of dest
const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
// Element stores: byte at offsets 0 and 15, halfword at 1 and 14,
// word at 3, 4, 8 and 12
vec_ste((uint8x16_p) t2, 0, (unsigned char*) dest);
vec_ste((uint16x8_p) t2, 1, (unsigned short*)dest);
vec_ste((uint32x4_p) t2, 3, (unsigned int*) dest);
vec_ste((uint32x4_p) t2, 4, (unsigned int*) dest);
vec_ste((uint32x4_p) t2, 8, (unsigned int*) dest);
vec_ste((uint32x4_p) t2, 12, (unsigned int*) dest);
vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
}
/// \brief XOR two vectors
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns the result of vec_xor applied to vec1 and vec2
inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2)
{
    const uint32x4_p result = vec_xor(vec1, vec2);
    return result;
}
/// \brief Add two vectors
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns the result of vec_add applied to vec1 and vec2
inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2)
{
    const uint32x4_p sum = vec_add(vec1, vec2);
    return sum;
}
#endif
#if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Reverse a 16-byte array /// \brief Reverse a 16-byte array

View File

@ -596,12 +596,12 @@ IncrementPointerAndStore(const uint8x16_p& r, uint8_t* p)
return p; return p;
} }
static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds) static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
{ {
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys); const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(keys); uint32x4_p k = VectorLoadKey(keys);
block = VectorXor(block, k); block = VectorXor(block, k);
for (size_t i=1; i<rounds-1; i+=2) for (size_t i=1; i<rounds-1; i+=2)
@ -614,14 +614,14 @@ static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, un
block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys)); block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
} }
static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1, static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
VectorType &block2, VectorType &block3, VectorType &block4, uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds) uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{ {
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys); const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(keys); uint32x4_p k = VectorLoadKey(keys);
block0 = VectorXor(block0, k); block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k); block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k); block2 = VectorXor(block2, k);
@ -649,12 +649,12 @@ static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
block5 = VectorEncryptLast(block5, k); block5 = VectorEncryptLast(block5, k);
} }
static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds) static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
{ {
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys); const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(rounds*16, keys); uint32x4_p k = VectorLoadKey(rounds*16, keys);
block = VectorXor(block, k); block = VectorXor(block, k);
for (size_t i=rounds-1; i>1; i-=2) for (size_t i=rounds-1; i>1; i-=2)
@ -667,14 +667,14 @@ static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, un
block = VectorDecryptLast(block, VectorLoadKey(0, keys)); block = VectorDecryptLast(block, VectorLoadKey(0, keys));
} }
static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1, static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
VectorType &block2, VectorType &block3, VectorType &block4, uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds) uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{ {
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys); const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(rounds*16, keys); uint32x4_p k = VectorLoadKey(rounds*16, keys);
block0 = VectorXor(block0, k); block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k); block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k); block2 = VectorXor(block2, k);