Add Power4 Vector Load, Store, Add and Xor
parent fac3a44a84
commit d6d53f2e9d
adv-simd.h | 85
@@ -1285,11 +1285,11 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
     {
         while (length >= 6*blockSize)
         {
-            VectorType block0, block1, block2, block3, block4, block5, temp;
-            block0 = VectorLoad(inBlocks);
+            uint32x4_p block0, block1, block2, block3, block4, block5, temp;

             if (flags & BT_InBlockIsCounter)
             {
+                block0 = VectorLoad(inBlocks);
                 block1 = VectorAdd(block0, s_one);
                 block2 = VectorAdd(block1, s_one);
                 block3 = VectorAdd(block2, s_one);
@@ -1300,57 +1300,74 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
             }
             else
             {
-                const int inc = static_cast<int>(inIncrement);
-                block1 = VectorLoad(1*inc, inBlocks);
-                block2 = VectorLoad(2*inc, inBlocks);
-                block3 = VectorLoad(3*inc, inBlocks);
-                block4 = VectorLoad(4*inc, inBlocks);
-                block5 = VectorLoad(5*inc, inBlocks);
-                inBlocks += 6*inc;
+                block0 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block1 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block2 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block3 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block4 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block5 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
             }

             if (flags & BT_XorInput)
             {
-                const int inc = static_cast<int>(xorIncrement);
-                block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
-                block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
-                block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
-                block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-                block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
-                block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
-                xorBlocks += 6*inc;
+                block0 = VectorXor(block0, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block4 = VectorXor(block4, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block5 = VectorXor(block5, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
             }

             func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

             if (xorBlocks && !(flags & BT_XorInput))
             {
-                const int inc = static_cast<int>(xorIncrement);
-                block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
-                block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
-                block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
-                block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-                block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
-                block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
-                xorBlocks += 6*inc;
+                block0 = VectorXor(block0, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block4 = VectorXor(block4, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block5 = VectorXor(block5, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
             }

-            const int inc = static_cast<int>(outIncrement);
-            VectorStore(block0, outBlocks+0*inc);
-            VectorStore(block1, outBlocks+1*inc);
-            VectorStore(block2, outBlocks+2*inc);
-            VectorStore(block3, outBlocks+3*inc);
-            VectorStore(block4, outBlocks+4*inc);
-            VectorStore(block5, outBlocks+5*inc);
+            VectorStore(block0, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block1, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block2, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block3, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block4, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block5, outBlocks);
+            outBlocks += outIncrement;

-            outBlocks += 6*inc;
             length -= 6*blockSize;
         }
     }

     while (length >= blockSize)
     {
-        VectorType block = VectorLoad(inBlocks);
+        uint32x4_p block = VectorLoad(inBlocks);

         if (flags & BT_XorInput)
             block = VectorXor(block, VectorLoad(xorBlocks));
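The adv-simd.h hunks above drop the offset-taking VectorLoad/VectorStore calls in favor of plain pointer bumps, so the loop only needs the one-argument overloads that the ALTIVEC-only (pre-POWER7) path below provides. A minimal sketch of the resulting pattern, reusing the names from AdvancedProcessBlocks128_6x1_ALTIVEC (inBlocks, inIncrement, outBlocks, outIncrement are its parameters; the transform stands in for func1/func6):

    uint32x4_p block = VectorLoad(inBlocks);   // one-argument load, no offset overload required
    inBlocks += inIncrement;                   // advance the source pointer instead of indexing i*inc
    // ... transform 'block' via func1/func6 ...
    VectorStore(block, outBlocks);             // one-argument store
    outBlocks += outIncrement;                 // advance the destination pointer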
ppc-simd.h | 54
@@ -38,11 +38,59 @@ typedef __vector unsigned int uint32x4_p;
 typedef __vector unsigned long long uint64x2_p;
 #endif

-/// \brief Default vector type
-typedef uint32x4_p VectorType;

 #endif // CRYPTOPP_ALTIVEC_AVAILABLE

+#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE)
+
+inline uint32x4_p VectorLoad(const byte src[16])
+{
+    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+    const uint8x16_p perm = vec_lvsl(0, (uint8_t*)src);
+    const uint8x16_p low = vec_ld(0, (uint8_t*)src);
+    const uint8x16_p high = vec_ld(15, (uint8_t*)src);
+    const uint8x16_p data = vec_perm(low, high, perm);
+
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)data;
+#else
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    return (uint32x4_p)vec_perm(data, data, mask);
+#endif
+}
+
+inline void VectorStore(const uint32x4_p data, byte dest[16])
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
+#else
+    const uint8x16_p t1 = (uint8x16_p)data;
+#endif
+
+    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+    const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
+    vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
+    vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
+    vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
+    vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
+    vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
+}
+
+inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2)
+{
+    return vec_xor(vec1, vec2);
+}
+
+inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2)
+{
+    return vec_add(vec1, vec2);
+}
+
+#endif
+
 #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

 /// \brief Reverse a 16-byte array
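For reference, a small hypothetical caller exercising the four new POWER4/ALTIVEC helpers added above. The buffers are made up for illustration, and the snippet assumes a build where CRYPTOPP_ALTIVEC_AVAILABLE is defined but CRYPTOPP_POWER7_AVAILABLE is not, so these overloads are the ones selected:

    // Hypothetical example data; 'byte' is Crypto++'s unsigned char typedef.
    byte in[16] = {0}, key[16] = {0}, out[16];

    uint32x4_p a = VectorLoad(in);    // lvsl/lvx/vperm sequence tolerates an unaligned 'in'
    uint32x4_p k = VectorLoad(key);
    uint32x4_p x = VectorXor(a, k);   // vec_xor, lane-wise XOR
    uint32x4_p s = VectorAdd(x, k);   // vec_add on 32-bit lanes
    VectorStore(s, out);              // lvsr/vperm plus vec_ste stores tolerate an unaligned 'out'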
@@ -596,12 +596,12 @@ IncrementPointerAndStore(const uint8x16_p& r, uint8_t* p)
     return p;
 }

-static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    VectorType k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoadKey(keys);
     block = VectorXor(block, k);

     for (size_t i=1; i<rounds-1; i+=2)
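The body of that for loop falls outside the hunk. Based on the VectorEncryptLast call shown at the top of the next hunk, a plausible reading of the elided rounds loop is sketched below; the exact body is an assumption, not part of this diff, and VectorEncrypt, VectorEncryptLast and VectorLoadKey are the library's existing POWER8 wrappers:

    for (size_t i=1; i<rounds-1; i+=2)
    {
        // Two AES rounds per iteration, each keyed from the 16-byte aligned key schedule
        block = VectorEncrypt(block, VectorLoadKey(  i*16,   keys));
        block = VectorEncrypt(block, VectorLoadKey((i+1)*16, keys));
    }
    block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));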
@@ -614,14 +614,14 @@ static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, un
     block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
 }

-static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
-            VectorType &block2, VectorType &block3, VectorType &block4,
-            VectorType &block5, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
+            uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
+            uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    VectorType k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoadKey(keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -649,12 +649,12 @@ static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
     block5 = VectorEncryptLast(block5, k);
 }

-static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    VectorType k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoadKey(rounds*16, keys);
     block = VectorXor(block, k);

     for (size_t i=rounds-1; i>1; i-=2)
@@ -667,14 +667,14 @@ static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, un
     block = VectorDecryptLast(block, VectorLoadKey(0, keys));
 }

-static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
-            VectorType &block2, VectorType &block3, VectorType &block4,
-            VectorType &block5, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
+            uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
+            uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);

-    VectorType k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoadKey(rounds*16, keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);