Add Power4 Vector Load, Store, Add and Xor

pull/548/merge
Jeffrey Walton 2018-01-02 08:13:42 -05:00
parent fac3a44a84
commit d6d53f2e9d
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 114 additions and 49 deletions

View File

@ -1285,11 +1285,11 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
{
while (length >= 6*blockSize)
{
VectorType block0, block1, block2, block3, block4, block5, temp;
block0 = VectorLoad(inBlocks);
uint32x4_p block0, block1, block2, block3, block4, block5, temp;
if (flags & BT_InBlockIsCounter)
{
block0 = VectorLoad(inBlocks);
block1 = VectorAdd(block0, s_one);
block2 = VectorAdd(block1, s_one);
block3 = VectorAdd(block2, s_one);
@ -1300,57 +1300,74 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
}
else
{
const int inc = static_cast<int>(inIncrement);
block1 = VectorLoad(1*inc, inBlocks);
block2 = VectorLoad(2*inc, inBlocks);
block3 = VectorLoad(3*inc, inBlocks);
block4 = VectorLoad(4*inc, inBlocks);
block5 = VectorLoad(5*inc, inBlocks);
inBlocks += 6*inc;
block0 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block1 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block2 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block3 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block4 = VectorLoad(inBlocks);
inBlocks += inIncrement;
block5 = VectorLoad(inBlocks);
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
{
const int inc = static_cast<int>(xorIncrement);
block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
xorBlocks += 6*inc;
block0 = VectorXor(block0, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block4 = VectorXor(block4, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
}
func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
if (xorBlocks && !(flags & BT_XorInput))
{
const int inc = static_cast<int>(xorIncrement);
block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
xorBlocks += 6*inc;
block0 = VectorXor(block0, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block1 = VectorXor(block1, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block2 = VectorXor(block2, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block3 = VectorXor(block3, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block4 = VectorXor(block4, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
block5 = VectorXor(block5, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
}
const int inc = static_cast<int>(outIncrement);
VectorStore(block0, outBlocks+0*inc);
VectorStore(block1, outBlocks+1*inc);
VectorStore(block2, outBlocks+2*inc);
VectorStore(block3, outBlocks+3*inc);
VectorStore(block4, outBlocks+4*inc);
VectorStore(block5, outBlocks+5*inc);
VectorStore(block0, outBlocks);
outBlocks += outIncrement;
VectorStore(block1, outBlocks);
outBlocks += outIncrement;
VectorStore(block2, outBlocks);
outBlocks += outIncrement;
VectorStore(block3, outBlocks);
outBlocks += outIncrement;
VectorStore(block4, outBlocks);
outBlocks += outIncrement;
VectorStore(block5, outBlocks);
outBlocks += outIncrement;
outBlocks += 6*inc;
length -= 6*blockSize;
}
}
while (length >= blockSize)
{
VectorType block = VectorLoad(inBlocks);
uint32x4_p block = VectorLoad(inBlocks);
if (flags & BT_XorInput)
block = VectorXor(block, VectorLoad(xorBlocks));

View File

@ -38,11 +38,59 @@ typedef __vector unsigned int uint32x4_p;
typedef __vector unsigned long long uint64x2_p;
#endif
/// \brief Default vector type
typedef uint32x4_p VectorType;
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE)
/// \brief Loads a vector from a 16-byte array
/// \param src the byte array to read
/// \returns the 16 bytes read from <tt>src</tt> as a <tt>uint32x4_p</tt>
/// \details Two <tt>vec_ld</tt> loads (offsets 0 and 15) are merged with
///   <tt>vec_perm</tt> using the permute vector produced by <tt>vec_lvsl</tt>.
///   This follows the load sequence in the AltiVec PEM referenced below;
///   presumably it exists so <tt>src</tt> need not be 16-byte aligned (confirm
///   against the manual). Unless <tt>CRYPTOPP_BIG_ENDIAN</tt> is defined, the
///   16 bytes are additionally reversed before being returned.
inline uint32x4_p VectorLoad(const byte src[16])
{
    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
    uint8_t* const addr = (uint8_t*)src;

    const uint8x16_p lo = vec_ld(0, addr);
    const uint8x16_p hi = vec_ld(15, addr);
    const uint8x16_p bytes = vec_perm(lo, hi, vec_lvsl(0, addr));

#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Full 16-byte reversal: element i of the result is byte 15-i of the input
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (uint32x4_p)vec_perm(bytes, bytes, reversal);
#else
    return (uint32x4_p)bytes;
#endif
}
/// \brief Stores a vector to a 16-byte array
/// \param data the vector to write
/// \param dest the byte array receiving the 16 bytes
/// \details The byte-order decision deliberately uses the same preprocessor
///   test as VectorLoad: no reversal when <tt>CRYPTOPP_BIG_ENDIAN</tt> is
///   defined, a full 16-byte reversal otherwise. Keying the two functions on
///   different macros would let them disagree whenever neither endian macro is
///   defined, so that a load followed by a store would not reproduce the
///   original bytes.
/// \details The store itself rotates the bytes with the permute vector from
///   <tt>vec_lvsr</tt> and then writes them with a series of element stores
///   (<tt>vec_ste</tt>), following the sequence in the AltiVec PEM referenced
///   below.
inline void VectorStore(const uint32x4_p data, byte dest[16])
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p t1 = (uint8x16_p)data;
#else
    // Mirror of the reversal VectorLoad applies on the way in
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
#endif

    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
    const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));

    // Element stores of increasing then decreasing width; the order and
    // offsets are taken verbatim from the reference sequence.
    vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
    vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
    vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
    vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
    vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
}
/// \brief XOR two vectors
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns the result of <tt>vec_xor(vec1, vec2)</tt>
inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2)
{
    const uint32x4_p mixed = vec_xor(vec1, vec2);
    return mixed;
}
/// \brief Add two vectors
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \returns the result of <tt>vec_add(vec1, vec2)</tt>
inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2)
{
    const uint32x4_p sum = vec_add(vec1, vec2);
    return sum;
}
#endif
#if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Reverse a 16-byte array

View File

@ -596,12 +596,12 @@ IncrementPointerAndStore(const uint8x16_p& r, uint8_t* p)
return p;
}
static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(keys);
uint32x4_p k = VectorLoadKey(keys);
block = VectorXor(block, k);
for (size_t i=1; i<rounds-1; i+=2)
@ -614,14 +614,14 @@ static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, un
block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
}
static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(keys);
uint32x4_p k = VectorLoadKey(keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);
@ -649,12 +649,12 @@ static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
block5 = VectorEncryptLast(block5, k);
}
static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(rounds*16, keys);
uint32x4_p k = VectorLoadKey(rounds*16, keys);
block = VectorXor(block, k);
for (size_t i=rounds-1; i>1; i-=2)
@ -667,14 +667,14 @@ static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, un
block = VectorDecryptLast(block, VectorLoadKey(0, keys));
}
static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType k = VectorLoadKey(rounds*16, keys);
uint32x4_p k = VectorLoadKey(rounds*16, keys);
block0 = VectorXor(block0, k);
block1 = VectorXor(block1, k);
block2 = VectorXor(block2, k);