diff --git a/adv-simd.h b/adv-simd.h
index b3d8a847..5fa0313a 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -1285,11 +1285,11 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
     {
         while (length >= 6*blockSize)
         {
-            VectorType block0, block1, block2, block3, block4, block5, temp;
-            block0 = VectorLoad(inBlocks);
+            uint32x4_p block0, block1, block2, block3, block4, block5, temp;
 
             if (flags & BT_InBlockIsCounter)
             {
+                block0 = VectorLoad(inBlocks);
                 block1 = VectorAdd(block0, s_one);
                 block2 = VectorAdd(block1, s_one);
                 block3 = VectorAdd(block2, s_one);
@@ -1300,57 +1300,74 @@
             }
             else
             {
-                const int inc = static_cast<int>(inIncrement);
-                block1 = VectorLoad(1*inc, inBlocks);
-                block2 = VectorLoad(2*inc, inBlocks);
-                block3 = VectorLoad(3*inc, inBlocks);
-                block4 = VectorLoad(4*inc, inBlocks);
-                block5 = VectorLoad(5*inc, inBlocks);
-                inBlocks += 6*inc;
+                block0 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block1 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block2 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block3 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block4 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
+                block5 = VectorLoad(inBlocks);
+                inBlocks += inIncrement;
             }
 
             if (flags & BT_XorInput)
             {
-                const int inc = static_cast<int>(xorIncrement);
-                block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
-                block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
-                block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
-                block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-                block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
-                block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
-                xorBlocks += 6*inc;
+                block0 = VectorXor(block0, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block4 = VectorXor(block4, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block5 = VectorXor(block5, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
             }
 
             func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
 
             if (xorBlocks && !(flags & BT_XorInput))
             {
-                const int inc = static_cast<int>(xorIncrement);
-                block0 = VectorXor(block0, VectorLoad(0*inc, xorBlocks));
-                block1 = VectorXor(block1, VectorLoad(1*inc, xorBlocks));
-                block2 = VectorXor(block2, VectorLoad(2*inc, xorBlocks));
-                block3 = VectorXor(block3, VectorLoad(3*inc, xorBlocks));
-                block4 = VectorXor(block4, VectorLoad(4*inc, xorBlocks));
-                block5 = VectorXor(block5, VectorLoad(5*inc, xorBlocks));
-                xorBlocks += 6*inc;
+                block0 = VectorXor(block0, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block1 = VectorXor(block1, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block2 = VectorXor(block2, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block3 = VectorXor(block3, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block4 = VectorXor(block4, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
+                block5 = VectorXor(block5, VectorLoad(xorBlocks));
+                xorBlocks += xorIncrement;
             }
 
-            const int inc = static_cast<int>(outIncrement);
-            VectorStore(block0, outBlocks+0*inc);
-            VectorStore(block1, outBlocks+1*inc);
-            VectorStore(block2, outBlocks+2*inc);
-            VectorStore(block3, outBlocks+3*inc);
-            VectorStore(block4, outBlocks+4*inc);
-            VectorStore(block5, outBlocks+5*inc);
+            VectorStore(block0, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block1, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block2, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block3, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block4, outBlocks);
+            outBlocks += outIncrement;
+            VectorStore(block5, outBlocks);
+            outBlocks += outIncrement;
 
-            outBlocks += 6*inc;
             length -= 6*blockSize;
         }
     }
 
     while (length >= blockSize)
     {
-        VectorType block = VectorLoad(inBlocks);
+        uint32x4_p block = VectorLoad(inBlocks);
 
         if (flags & BT_XorInput)
             block = VectorXor(block, VectorLoad(xorBlocks));
diff --git a/ppc-simd.h b/ppc-simd.h
index d1f04607..9405ff72 100644
--- a/ppc-simd.h
+++ b/ppc-simd.h
@@ -38,11 +38,59 @@ typedef __vector unsigned int uint32x4_p;
 typedef __vector unsigned long long uint64x2_p;
 #endif
 
-/// \brief Default vector type
-typedef uint32x4_p VectorType;
-
 #endif  // CRYPTOPP_ALTIVEC_AVAILABLE
 
+#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE)
+
+inline uint32x4_p VectorLoad(const byte src[16])
+{
+    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+    const uint8x16_p perm = vec_lvsl(0, (uint8_t*)src);
+    const uint8x16_p low = vec_ld(0, (uint8_t*)src);
+    const uint8x16_p high = vec_ld(15, (uint8_t*)src);
+    const uint8x16_p data = vec_perm(low, high, perm);
+
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)data;
+#else
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    return (uint32x4_p)vec_perm(data, data, mask);
+#endif
+}
+
+inline void VectorStore(const uint32x4_p data, byte dest[16])
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask);
+#else
+    const uint8x16_p t1 = (uint8x16_p)data;
+#endif
+
+    // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+    const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest));
+    vec_ste((uint8x16_p) t2,  0, (unsigned char*) dest);
+    vec_ste((uint16x8_p) t2,  1, (unsigned short*)dest);
+    vec_ste((uint32x4_p) t2,  3, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2,  4, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2,  8, (unsigned int*)  dest);
+    vec_ste((uint32x4_p) t2, 12, (unsigned int*)  dest);
+    vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest);
+    vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest);
+}
+
+inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2)
+{
+    return vec_xor(vec1, vec2);
+}
+
+inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2)
+{
+    return vec_add(vec1, vec2);
+}
+
+#endif
+
 #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 
 /// \brief Reverse a 16-byte array
diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index 0c5c7c52..032899e2 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -596,12 +596,12 @@ IncrementPointerAndStore(const uint8x16_p& r, uint8_t* p)
     return p;
 }
 
-static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoadKey(keys);
     block = VectorXor(block, k);
 
     for (size_t i=1; i<rounds-1; i+=2)
@@ -614,14 +614,14 @@ static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, un
     block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
 }
 
-static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
-    VectorType &block2, VectorType &block3, VectorType &block4,
-    VectorType &block5, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
+    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
+    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType k = VectorLoadKey(keys);
+    uint32x4_p k = VectorLoadKey(keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
@@ -649,12 +649,12 @@ static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
     block5 = VectorEncryptLast(block5, k);
 }
 
-static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_Block(uint32x4_p &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoadKey(rounds*16, keys);
     block = VectorXor(block, k);
 
     for (size_t i=rounds-1; i>1; i-=2)
@@ -667,14 +667,14 @@ static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, un
     block = VectorDecryptLast(block, VectorLoadKey(0, keys));
 }
 
-static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
-    VectorType &block2, VectorType &block3, VectorType &block4,
-    VectorType &block5, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
+    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
+    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
-    VectorType k = VectorLoadKey(rounds*16, keys);
+    uint32x4_p k = VectorLoadKey(rounds*16, keys);
     block0 = VectorXor(block0, k);
     block1 = VectorXor(block1, k);
     block2 = VectorXor(block2, k);
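
The pre-POWER7 path added to ppc-simd.h above relies on the classic AltiVec unaligned-access idiom: vec_lvsl/vec_ld/vec_perm merge two aligned loads, and vec_lvsr plus a series of vec_ste element stores scatter a vector to an unaligned address without touching neighboring bytes. Below is a minimal, self-contained sketch of that idiom with a round-trip check. The helper names (LoadUnaligned, StoreUnaligned) and the test harness are illustrative only, not part of the library, and the CRYPTOPP_BIG_ENDIAN/CRYPTOPP_LITTLE_ENDIAN normalization done by the patched VectorLoad/VectorStore is omitted, so this mirrors only the big-endian code path. Build with -maltivec on a PowerPC target.

// Sketch only: same intrinsic sequence as the new VectorLoad/VectorStore,
// without the endian mask. Local names are illustrative.
#include <altivec.h>
#include <cassert>
#include <cstring>

typedef __vector unsigned char  uint8x16_p;
typedef __vector unsigned short uint16x8_p;
typedef __vector unsigned int   uint32x4_p;

static uint32x4_p LoadUnaligned(const unsigned char* src)
{
    // Two aligned loads that straddle src, merged with a permute vector
    // derived from the low four bits of the address.
    const uint8x16_p perm = vec_lvsl(0, src);
    const uint8x16_p low  = vec_ld(0, src);
    const uint8x16_p high = vec_ld(15, src);
    return (uint32x4_p)vec_perm(low, high, perm);
}

static void StoreUnaligned(const uint32x4_p data, unsigned char* dest)
{
    // Rotate the data to the destination alignment, then store the edge
    // bytes/halfwords and the interior words element by element so that
    // nothing outside dest[0..15] is written.
    const uint8x16_p t = vec_perm((uint8x16_p)data, (uint8x16_p)data, vec_lvsr(0, dest));
    vec_ste((uint8x16_p) t,  0, dest);
    vec_ste((uint16x8_p) t,  1, (unsigned short*)dest);
    vec_ste((uint32x4_p) t,  3, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t,  4, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t,  8, (unsigned int*)  dest);
    vec_ste((uint32x4_p) t, 12, (unsigned int*)  dest);
    vec_ste((uint16x8_p) t, 14, (unsigned short*)dest);
    vec_ste((uint8x16_p) t, 15, dest);
}

int main()
{
    unsigned char in[32], out[32] = {0};
    for (unsigned int i = 0; i < 32; ++i)
        in[i] = (unsigned char)i;

    // Deliberately misaligned pointers exercise the permute-based paths.
    const uint32x4_p v = LoadUnaligned(in + 1);
    StoreUnaligned(v, out + 3);

    assert(std::memcmp(in + 1, out + 3, 16) == 0);
    return 0;
}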