diff --git a/adv-simd.h b/adv-simd.h index ff7e21f3..606d9f1b 100644 --- a/adv-simd.h +++ b/adv-simd.h @@ -1849,44 +1849,44 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, if (flags & BT_InBlockIsCounter) { - block0 = VectorLoad(inBlocks); + block0 = VectorLoadBE(inBlocks); block1 = VectorAdd(block0, s_one); block2 = VectorAdd(block1, s_one); block3 = VectorAdd(block2, s_one); block4 = VectorAdd(block3, s_one); block5 = VectorAdd(block4, s_one); temp = VectorAdd(block5, s_one); - VectorStore(temp, const_cast(inBlocks)); + VectorStoreBE(temp, const_cast(inBlocks)); } else { - block0 = VectorLoad(inBlocks); + block0 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block1 = VectorLoad(inBlocks); + block1 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block2 = VectorLoad(inBlocks); + block2 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block3 = VectorLoad(inBlocks); + block3 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block4 = VectorLoad(inBlocks); + block4 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); - block5 = VectorLoad(inBlocks); + block5 = VectorLoadBE(inBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); } if (xorInput) { - block0 = VectorXor(block0, VectorLoad(xorBlocks)); + block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoad(xorBlocks)); + block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoad(xorBlocks)); + block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoad(xorBlocks)); + block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoad(xorBlocks)); + block4 = VectorXor(block4, 
VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoad(xorBlocks)); + block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } @@ -1894,31 +1894,31 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, if (xorOutput) { - block0 = VectorXor(block0, VectorLoad(xorBlocks)); + block0 = VectorXor(block0, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block1 = VectorXor(block1, VectorLoad(xorBlocks)); + block1 = VectorXor(block1, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block2 = VectorXor(block2, VectorLoad(xorBlocks)); + block2 = VectorXor(block2, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block3 = VectorXor(block3, VectorLoad(xorBlocks)); + block3 = VectorXor(block3, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block4 = VectorXor(block4, VectorLoad(xorBlocks)); + block4 = VectorXor(block4, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); - block5 = VectorXor(block5, VectorLoad(xorBlocks)); + block5 = VectorXor(block5, VectorLoadBE(xorBlocks)); xorBlocks = PtrAdd(xorBlocks, xorIncrement); } - VectorStore(block0, outBlocks); + VectorStoreBE(block0, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStore(block1, outBlocks); + VectorStoreBE(block1, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStore(block2, outBlocks); + VectorStoreBE(block2, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStore(block3, outBlocks); + VectorStoreBE(block3, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStore(block4, outBlocks); + VectorStoreBE(block4, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); - VectorStore(block5, outBlocks); + VectorStoreBE(block5, outBlocks); outBlocks = PtrAdd(outBlocks, outIncrement); length -= 6*blockSize; @@ -1927,10 +1927,10 @@ 
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, while (length >= blockSize) { - uint32x4_p block = VectorLoad(inBlocks); + uint32x4_p block = VectorLoadBE(inBlocks); if (xorInput) - block = VectorXor(block, VectorLoad(xorBlocks)); + block = VectorXor(block, VectorLoadBE(xorBlocks)); if (flags & BT_InBlockIsCounter) const_cast(inBlocks)[15]++; @@ -1938,9 +1938,9 @@ inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, func1(block, subKeys, rounds); if (xorOutput) - block = VectorXor(block, VectorLoad(xorBlocks)); + block = VectorXor(block, VectorLoadBE(xorBlocks)); - VectorStore(block, outBlocks); + VectorStoreBE(block, outBlocks); inBlocks = PtrAdd(inBlocks, inIncrement); outBlocks = PtrAdd(outBlocks, outIncrement); diff --git a/ppc-simd.h b/ppc-simd.h index 7e7ef52e..181a8252 100644 --- a/ppc-simd.h +++ b/ppc-simd.h @@ -32,11 +32,11 @@ # undef CRYPTOPP_POWER7_AVAILABLE #endif -#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO)) +#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__)) # undef CRYPTOPP_POWER8_AVAILABLE # undef CRYPTOPP_POWER8_AES_AVAILABLE -# undef CRYPTOPP_POWER8_SHA_AVAILABLE # undef CRYPTOPP_POWER8_PMULL_AVAILABLE +# undef CRYPTOPP_POWER8_SHA_AVAILABLE #endif #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) @@ -65,7 +65,7 @@ typedef __vector unsigned long long uint64x2_p; /// \tparam T vector type /// \param src the vector /// \details Reverse() endian swaps the bytes in a vector -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 template inline T Reverse(const T& src) @@ -151,33 +151,45 @@ inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2) #endif } +/// \brief Shift two vectors right +/// \tparam C shift byte count +/// \tparam T1 vector type +/// \tparam T2 vector type +/// \param vec1 the first vector +/// \param vec2 the 
second vector +/// \details VectorShiftRight() concatenates vec1 and vec2 and returns a +/// new vector after shifting the concatenation by the specified number +/// of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return +/// vector is the same type as vec1. +/// \details On big endian machines VectorShiftRight() is vec_sld(a, b, +/// c). On little endian machines VectorShiftRight() is translated to +/// vec_sld(b, a, 16-c). You should always call the function as +/// if on a big endian machine as shown below. +/// <pre>
+///    uint8x16_p r0 = {0};
+///    uint8x16_p r1 = VectorLoad(ptr);
+///    uint8x16_p r5 = VectorShiftRight<12>(r0, r1);
+/// </pre>
+/// \sa Is vec_sld +/// endian sensitive? on Stack Overflow +/// \since Crypto++ 6.0 +template +inline T1 VectorShiftRight(const T1& vec1, const T2& vec2) +{ + return VectorShiftLeft<16-C>(vec1, vec2); +} + #endif // POWER4 and above // POWER7/POWER4 load and store #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) -/// \brief Reverse a 16-byte array -/// \param src the byte array -/// \details ReverseByteArrayLE reverses a 16-byte array on a little endian -/// system. It does nothing on a big endian system. -/// \since Crypto++ 6.0 -inline void ReverseByteArrayLE(byte src[16]) -{ -#if defined(CRYPTOPP_XLC_VERSION) && defined(CRYPTOPP_LITTLE_ENDIAN) - vec_st(vec_reve(vec_ld(0, src)), 0, src); -#elif defined(CRYPTOPP_LITTLE_ENDIAN) - const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p zero = {0}; - vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src); -#endif -} - /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in big endian format from a byte array. /// VectorLoadBE will swap endianess on little endian systems. /// \note VectorLoadBE() does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 inline uint32x4_p VectorLoadBE(const uint8_t src[16]) { @@ -198,7 +210,7 @@ inline uint32x4_p VectorLoadBE(const uint8_t src[16]) /// \details Loads a vector in big endian format from a byte array. /// VectorLoadBE will swap endianess on little endian systems. /// \note VectorLoadBE does not require an aligned array. 
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16]) { @@ -215,70 +227,27 @@ inline uint32x4_p VectorLoadBE(int off, const uint8_t src[16]) /// \brief Loads a vector from a byte array /// \param src the byte array -/// \details Loads a vector in big endian format from a byte array. -/// VectorLoad will swap endianess on little endian systems. +/// \details Loads a vector in native endian format from a byte array. /// \note VectorLoad does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 inline uint32x4_p VectorLoad(const byte src[16]) { - return (uint32x4_p)VectorLoadBE(src); +#if defined(CRYPTOPP_XLC_VERSION) + return (uint32x4_p)vec_xl(0, (byte*)src); +#else + return (uint32x4_p)vec_vsx_ld(0, src); +#endif } /// \brief Loads a vector from a byte array /// \param src the byte array /// \param off offset into the src byte array -/// \details Loads a vector in big endian format from a byte array. -/// VectorLoad will swap endianess on little endian systems. +/// \details Loads a vector in native endian format from a byte array. /// \note VectorLoad does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 inline uint32x4_p VectorLoad(int off, const byte src[16]) -{ - return (uint32x4_p)VectorLoadBE(off, src); -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \details Loads a vector from a byte array. -/// VectorLoadKey does not swap endianess on little endian systems. -/// \note VectorLoadKey does not require an aligned array. 
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadKey(const byte src[16]) -{ -#if defined(CRYPTOPP_XLC_VERSION) - return (uint32x4_p)vec_xl(0, (byte*)src); -#else - return (uint32x4_p)vec_vsx_ld(0, src); -#endif -} - -/// \brief Loads a vector from a 32-bit word array -/// \param src the 32-bit word array -/// \details Loads a vector from a 32-bit word array. -/// VectorLoadKey does not swap endianess on little endian systems. -/// \note VectorLoadKey does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadKey(const word32 src[4]) -{ -#if defined(CRYPTOPP_XLC_VERSION) - return (uint32x4_p)vec_xl(0, (byte*)src); -#else - return (uint32x4_p)vec_vsx_ld(0, src); -#endif -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \param off offset into the src byte array -/// \details Loads a vector from a byte array. -/// VectorLoadKey does not swap endianess on little endian systems. -/// \note VectorLoadKey does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadKey(int off, const byte src[16]) { #if defined(CRYPTOPP_XLC_VERSION) return (uint32x4_p)vec_xl(off, (byte*)src); @@ -294,7 +263,7 @@ inline uint32x4_p VectorLoadKey(int off, const byte src[16]) /// \details Stores a vector in big endian format to a byte array. /// VectorStoreBE will swap endianess on little endian systems. /// \note VectorStoreBE does not require an aligned array. 
-/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 template inline void VectorStoreBE(const T& src, uint8_t dest[16]) @@ -318,7 +287,7 @@ inline void VectorStoreBE(const T& src, uint8_t dest[16]) /// \details Stores a vector in big endian format to a byte array. /// VectorStoreBE will swap endianess on little endian systems. /// \note VectorStoreBE does not require an aligned array. -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \sa Reverse(), VectorLoadBE(), VectorLoad() /// \since Crypto++ 6.0 template inline void VectorStoreBE(const T& src, int off, uint8_t dest[16]) @@ -338,8 +307,7 @@ inline void VectorStoreBE(const T& src, int off, uint8_t dest[16]) /// \tparam T vector type /// \param src the vector /// \param dest the byte array -/// \details Stores a vector in big endian format to a byte array. -/// VectorStore will swap endianess on little endian systems. +/// \details Stores a vector in native endian format to a byte array. /// \note VectorStore does not require an aligned array. /// \since Crypto++ 6.0 template @@ -347,13 +315,9 @@ inline void VectorStore(const T& src, byte dest[16]) { // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE. #if defined(CRYPTOPP_XLC_VERSION) - vec_xst_be((uint8x16_p)src, 0, dest); + vec_xst((uint8x16_p)src, 0, dest); #else -# if defined(CRYPTOPP_LITTLE_ENDIAN) - vec_vsx_st(Reverse((uint8x16_p)src), 0, dest); -# else vec_vsx_st((uint8x16_p)src, 0, dest); -# endif #endif } @@ -362,8 +326,7 @@ inline void VectorStore(const T& src, byte dest[16]) /// \param src the vector /// \param off offset into the dest byte array /// \param dest the byte array -/// \details Stores a vector in big endian format to a byte array. -/// VectorStore will swap endianess on little endian systems. +/// \details Stores a vector in native endian format to a byte array. 
/// \note VectorStore does not require an aligned array. /// \since Crypto++ 6.0 template @@ -371,13 +334,9 @@ inline void VectorStore(const T& src, int off, byte dest[16]) { // Do not call VectorStoreBE. It slows us down by about 0.5 cpb on LE. #if defined(CRYPTOPP_XLC_VERSION) - vec_xst_be((uint8x16_p)src, off, dest); + vec_xst((uint8x16_p)src, off, dest); #else -# if defined(CRYPTOPP_LITTLE_ENDIAN) - vec_vsx_st(Reverse((uint8x16_p)src), off, dest); -# else vec_vsx_st((uint8x16_p)src, off, dest); -# endif #endif } diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 8b98c1ce..67e82527 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -697,17 +697,17 @@ static inline void POWER8_Enc_Block(uint32x4_p &block, const word32 *subkeys, un CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); const byte *keys = reinterpret_cast(subkeys); - uint32x4_p k = VectorLoadKey(keys); + uint32x4_p k = VectorLoad(keys); block = VectorXor(block, k); for (size_t i=1; i(subkeys); - uint32x4_p k = VectorLoadKey(keys); + uint32x4_p k = VectorLoad(keys); block0 = VectorXor(block0, k); block1 = VectorXor(block1, k); block2 = VectorXor(block2, k); @@ -727,7 +727,7 @@ static inline void POWER8_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, for (size_t i=1; i(subkeys); - uint32x4_p k = VectorLoadKey(rounds*16, keys); + uint32x4_p k = VectorLoad(rounds*16, keys); block = VectorXor(block, k); for (size_t i=rounds-1; i>1; i-=2) { - block = VectorDecrypt(block, VectorLoadKey( i*16, keys)); - block = VectorDecrypt(block, VectorLoadKey((i-1)*16, keys)); + block = VectorDecrypt(block, VectorLoad( i*16, keys)); + block = VectorDecrypt(block, VectorLoad((i-1)*16, keys)); } - block = VectorDecrypt(block, VectorLoadKey(16, keys)); - block = VectorDecryptLast(block, VectorLoadKey(0, keys)); + block = VectorDecrypt(block, VectorLoad(16, keys)); + block = VectorDecryptLast(block, VectorLoad(0, keys)); } static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, @@ -770,7 +770,7 
@@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16)); const byte *keys = reinterpret_cast(subkeys); - uint32x4_p k = VectorLoadKey(rounds*16, keys); + uint32x4_p k = VectorLoad(rounds*16, keys); block0 = VectorXor(block0, k); block1 = VectorXor(block1, k); block2 = VectorXor(block2, k); @@ -780,7 +780,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, for (size_t i=rounds-1; i>0; --i) { - k = VectorLoadKey(i*16, keys); + k = VectorLoad(i*16, keys); block0 = VectorDecrypt(block0, k); block1 = VectorDecrypt(block1, k); block2 = VectorDecrypt(block2, k); @@ -789,7 +789,7 @@ static inline void POWER8_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, block5 = VectorDecrypt(block5, k); } - k = VectorLoadKey(0, keys); + k = VectorLoad(0, keys); block0 = VectorDecryptLast(block0, k); block1 = VectorDecryptLast(block1, k); block2 = VectorDecryptLast(block2, k); @@ -804,60 +804,62 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* { const size_t rounds = keyLen / 4 + 6; const word32 *rc = s_rconBE; + word32 *rkey = rk, temp; - GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen); - word32 *rk_saved = rk, temp; // unused in big-endian - CRYPTOPP_UNUSED(rk_saved); + GetUserKey(BIG_ENDIAN_ORDER, rkey, keyLen/4, userKey, keyLen); // keySize: m_key allocates 4*(rounds+1) word32's. 
const size_t keySize = 4*(rounds+1); - const word32* end = rk + keySize; + const word32* end = rkey + keySize; while (true) { - temp = rk[keyLen/4-1]; + temp = rkey[keyLen/4-1]; word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; - rk[keyLen/4] = rk[0] ^ x ^ *(rc++); - rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; - rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; - rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; + rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++); + rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4]; + rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1]; + rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2]; - if (rk + keyLen/4 + 4 == end) + if (rkey + keyLen/4 + 4 == end) break; if (keyLen == 24) { - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; + rkey[10] = rkey[ 4] ^ rkey[ 9]; + rkey[11] = rkey[ 5] ^ rkey[10]; } else if (keyLen == 32) { - temp = rk[11]; - rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; + temp = rkey[11]; + rkey[12] = rkey[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; + rkey[13] = rkey[ 5] ^ rkey[12]; + rkey[14] = rkey[ 6] ^ rkey[13]; + rkey[15] = rkey[ 7] ^ rkey[14]; } - rk += keyLen/4; + rkey += keyLen/4; } #if defined(CRYPTOPP_LITTLE_ENDIAN) - rk = rk_saved; + rkey = rk; const uint8x16_p mask = ((uint8x16_p){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}); const uint8x16_p zero = {0}; unsigned int i=0; - for (i=0; i