From 10f85d65967bbe15ad807ee214fdf4babec1a991 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Thu, 15 Nov 2018 02:11:00 -0500
Subject: [PATCH] Make Altivec vector wraps friendly to downgrades

The way the existing ppc_simd.h is written makes it hard to switch
between the old Altivec loads and stores and the new POWER7 loads and
stores. This checkin rewrites the wrappers to use __ALTIVEC__,
_ARCH_PWR7 and _ARCH_PWR8. The wrappers in this file now honor
-maltivec, -mcpu=power7 and -mcpu=power8. It allows users to compile
a source file, like chacha_simd.cpp, with a lower ISA, and things
just work for them.
---
 chacha.cpp      |  19 +-
 chacha_simd.cpp | 133 ++++-----
 ppc_simd.h      | 760 ++++++++++++++++++++++++++----------------------
 3 files changed, 468 insertions(+), 444 deletions(-)

diff --git a/chacha.cpp b/chacha.cpp
index 333a58a6..a89fa016 100644
--- a/chacha.cpp
+++ b/chacha.cpp
@@ -24,7 +24,7 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
 extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif
 
-#if (CRYPTOPP_POWER7_AVAILABLE)
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
 extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
 #endif
 
@@ -85,6 +85,11 @@ std::string ChaCha_Policy::AlgorithmProvider() const
     if (HasPower7())
         return "Power7";
     else
+#endif
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
+    if (HasAltivec())
+        return "Altivec";
+    else
 #endif
     return "C++";
 }
@@ -139,8 +144,8 @@ unsigned int ChaCha_Policy::GetAlignment() const
         return 16;
     else
 #endif
-#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower7())
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
+    if (HasAltivec())
         return 16;
     else
 #endif
@@ -164,8 +169,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
         return 4*BYTES_PER_ITERATION;
     else
 #endif
-#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower7())
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
+    if (HasAltivec())
         return 4*BYTES_PER_ITERATION;
     else
 #endif
@@ -245,8 +250,8 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
     }
 #endif
 
-#if (CRYPTOPP_POWER7_AVAILABLE)
-    if (HasPower7())
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
+    if (HasAltivec())
     {
         while (iterationCount >= 4 && MultiBlockSafe(4))
         {
diff --git a/chacha_simd.cpp b/chacha_simd.cpp
index 77e4efdd..97e78f49 100644
--- a/chacha_simd.cpp
+++ b/chacha_simd.cpp
@@ -2,7 +2,7 @@
 // Jack Lloyd and Jeffrey Walton
 //
 // This source file uses intrinsics and built-ins to gain access to
-// SSE2, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// SSE2, ARM NEON and ARMv8a, Power7 and Altivec instructions. A separate
 // source file is needed because additional CXXFLAGS are required to enable
 // the appropriate instructions sets in some build configurations.
 //
@@ -54,7 +54,7 @@
 # include <arm_acle.h>
 #endif
 
-#if defined(CRYPTOPP_POWER7_AVAILABLE)
+#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
 # include "ppc_simd.h"
 #endif
 
@@ -201,25 +201,24 @@ inline __m128i RotateLeft<16>(const __m128i val)
 
 #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
 
-// **************************** POWER7 **************************** //
+// **************************** Altivec **************************** //
 
-#if (CRYPTOPP_POWER7_AVAILABLE)
+#if (CRYPTOPP_ALTIVEC_AVAILABLE)
 
-// POWER8 is optional and runs about 0.6 cpb faster because
-// of the native 64-bit vector add. That's about 700 MB/s on
-// GCC112 from the compile farm. Use -mcpu=power8 to engage
-// POWER8. POWER7 lacks 64-bit element support, so code built
-// with -mcpu=power8 will SIGILL on POWER7 machines.
+// ChaCha_OperateKeystream_POWER7 is optimized for POWER7. However, plain
+// Altivec is also supported, through vec_ld and vec_st and through a
+// composite vec_add that performs the 64-bit element adds. vec_ld and
+// vec_st add significant overhead when memory is not aligned. Despite
+// the drawbacks, Altivec is still profitable. The numbers for ChaCha8 are:
+//
+//   PowerMac, C++, 2.0 GHz:     205 MB/s, 9.29 cpb
+//   PowerMac, Altivec, 2.0 GHz: 471 MB/s, 4.09 cpb
 
 using CryptoPP::uint8x16_p;
 using CryptoPP::uint32x4_p;
 using CryptoPP::VectorLoad;
 using CryptoPP::VectorStore;
 
-#if (_ARCH_PWR8 || _ARCH_PWR9)
-using CryptoPP::uint64x2_p;
-#endif
-
 // Permutes bytes in packed 32-bit words to little endian.
 // State is already in proper endian order. Input and
 // output must be permuted during load and save.
@@ -241,28 +240,12 @@ inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val)
 {
 #if (CRYPTOPP_BIG_ENDIAN)
     const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-    VectorStore(dest, vec_perm(val, val, mask));
+    VectorStore(vec_perm(val, val, mask), dest);
 #else
-    return VectorStore(dest, val);
+    return VectorStore(val, dest);
 #endif
 }
 
-// Rotate packed 32-bit words left by bit count
-template <unsigned int C>
-inline uint32x4_p RotateLeft(const uint32x4_p val)
-{
-    const uint32x4_p m = {C, C, C, C};
-    return vec_rl(val, m);
-}
-
-// Rotate packed 32-bit words right by bit count
-template <unsigned int C>
-inline uint32x4_p RotateRight(const uint32x4_p val)
-{
-    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
-    return vec_rl(val, m);
-}
-
 // ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
 // rotation on the 128-bit vector word:
 // * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)
@@ -296,25 +279,7 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
     return vec_perm(val, val, mask);
 }
 
-// Helper to perform 64-bit addition across two elements of 32-bit vectors
-inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
-{
-#if (_ARCH_PWR8 || _ARCH_PWR9)
-    return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
-#else
-    // The carry mask selects carries from elements 1 and 3 and sets remaining
-    // elements to 0. The mask also shifts the carried values left by 4 bytes
-    // so the carries are added to elements 0 and 2.
- const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16}; - const uint32x4_p zero = {0, 0, 0, 0}; - - uint32x4_p cy = vec_addc(a, b); - cy = vec_perm(cy, zero, cmask); - return vec_add(vec_add(a, b), cy); -#endif -} - -#endif // CRYPTOPP_POWER7_AVAILABLE +#endif // CRYPTOPP_ALTIVEC_AVAILABLE ANONYMOUS_NAMESPACE_END @@ -856,7 +821,7 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE -#if (CRYPTOPP_POWER7_AVAILABLE) +#if (CRYPTOPP_ALTIVEC_AVAILABLE) void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) { @@ -901,10 +866,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_3 = VectorXor(r2_3, r2_0); r3_3 = VectorXor(r3_3, r3_0); - r0_3 = RotateLeft<16>(r0_3); - r1_3 = RotateLeft<16>(r1_3); - r2_3 = RotateLeft<16>(r2_3); - r3_3 = RotateLeft<16>(r3_3); + r0_3 = VectorRotateLeft<16>(r0_3); + r1_3 = VectorRotateLeft<16>(r1_3); + r2_3 = VectorRotateLeft<16>(r2_3); + r3_3 = VectorRotateLeft<16>(r3_3); r0_2 = VectorAdd(r0_2, r0_3); r1_2 = VectorAdd(r1_2, r1_3); @@ -916,10 +881,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_1 = VectorXor(r2_1, r2_2); r3_1 = VectorXor(r3_1, r3_2); - r0_1 = RotateLeft<12>(r0_1); - r1_1 = RotateLeft<12>(r1_1); - r2_1 = RotateLeft<12>(r2_1); - r3_1 = RotateLeft<12>(r3_1); + r0_1 = VectorRotateLeft<12>(r0_1); + r1_1 = VectorRotateLeft<12>(r1_1); + r2_1 = VectorRotateLeft<12>(r2_1); + r3_1 = VectorRotateLeft<12>(r3_1); r0_0 = VectorAdd(r0_0, r0_1); r1_0 = VectorAdd(r1_0, r1_1); @@ -931,10 +896,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_3 = VectorXor(r2_3, r2_0); r3_3 = VectorXor(r3_3, r3_0); - r0_3 = RotateLeft<8>(r0_3); - r1_3 = RotateLeft<8>(r1_3); - r2_3 = RotateLeft<8>(r2_3); - r3_3 = RotateLeft<8>(r3_3); + r0_3 = VectorRotateLeft<8>(r0_3); + r1_3 = VectorRotateLeft<8>(r1_3); + r2_3 = VectorRotateLeft<8>(r2_3); + r3_3 = VectorRotateLeft<8>(r3_3); r0_2 = VectorAdd(r0_2, r0_3); r1_2 = VectorAdd(r1_2, r1_3); @@ -946,10 +911,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_1 = VectorXor(r2_1, r2_2); r3_1 = VectorXor(r3_1, r3_2); - r0_1 = RotateLeft<7>(r0_1); - r1_1 = RotateLeft<7>(r1_1); - r2_1 = RotateLeft<7>(r2_1); - r3_1 = RotateLeft<7>(r3_1); + r0_1 = VectorRotateLeft<7>(r0_1); + r1_1 = VectorRotateLeft<7>(r1_1); + r2_1 = VectorRotateLeft<7>(r2_1); + r3_1 = VectorRotateLeft<7>(r3_1); r0_1 = Shuffle<1>(r0_1); r0_2 = Shuffle<2>(r0_2); @@ -977,10 +942,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_3 = VectorXor(r2_3, r2_0); r3_3 = VectorXor(r3_3, r3_0); - r0_3 = RotateLeft<16>(r0_3); - r1_3 = RotateLeft<16>(r1_3); - r2_3 = RotateLeft<16>(r2_3); - r3_3 = RotateLeft<16>(r3_3); + r0_3 = VectorRotateLeft<16>(r0_3); + r1_3 = VectorRotateLeft<16>(r1_3); + r2_3 = VectorRotateLeft<16>(r2_3); + r3_3 = VectorRotateLeft<16>(r3_3); r0_2 = VectorAdd(r0_2, r0_3); r1_2 = VectorAdd(r1_2, r1_3); @@ -992,10 +957,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte r2_1 = VectorXor(r2_1, r2_2); r3_1 = VectorXor(r3_1, r3_2); - r0_1 = RotateLeft<12>(r0_1); - r1_1 = RotateLeft<12>(r1_1); - r2_1 = RotateLeft<12>(r2_1); - r3_1 = RotateLeft<12>(r3_1); + r0_1 = VectorRotateLeft<12>(r0_1); + r1_1 = VectorRotateLeft<12>(r1_1); + r2_1 = VectorRotateLeft<12>(r2_1); + r3_1 = 
VectorRotateLeft<12>(r3_1);
 
     r0_0 = VectorAdd(r0_0, r0_1);
     r1_0 = VectorAdd(r1_0, r1_1);
@@ -1007,10 +972,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
     r2_3 = VectorXor(r2_3, r2_0);
     r3_3 = VectorXor(r3_3, r3_0);
 
-    r0_3 = RotateLeft<8>(r0_3);
-    r1_3 = RotateLeft<8>(r1_3);
-    r2_3 = RotateLeft<8>(r2_3);
-    r3_3 = RotateLeft<8>(r3_3);
+    r0_3 = VectorRotateLeft<8>(r0_3);
+    r1_3 = VectorRotateLeft<8>(r1_3);
+    r2_3 = VectorRotateLeft<8>(r2_3);
+    r3_3 = VectorRotateLeft<8>(r3_3);
 
     r0_2 = VectorAdd(r0_2, r0_3);
     r1_2 = VectorAdd(r1_2, r1_3);
@@ -1022,10 +987,10 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
     r2_1 = VectorXor(r2_1, r2_2);
     r3_1 = VectorXor(r3_1, r3_2);
 
-    r0_1 = RotateLeft<7>(r0_1);
-    r1_1 = RotateLeft<7>(r1_1);
-    r2_1 = RotateLeft<7>(r2_1);
-    r3_1 = RotateLeft<7>(r3_1);
+    r0_1 = VectorRotateLeft<7>(r0_1);
+    r1_1 = VectorRotateLeft<7>(r1_1);
+    r2_1 = VectorRotateLeft<7>(r2_1);
+    r3_1 = VectorRotateLeft<7>(r3_1);
 
     r0_1 = Shuffle<3>(r0_1);
     r0_2 = Shuffle<2>(r0_2);
@@ -1120,6 +1085,6 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
     VectorStore32LE(output + 15*16, r3_3);
 }
 
-#endif // CRYPTOPP_POWER7_AVAILABLE
+#endif // CRYPTOPP_ALTIVEC_AVAILABLE
 
 NAMESPACE_END
diff --git a/ppc_simd.h b/ppc_simd.h
index 3b7bbcbc..9c5f50fb 100644
--- a/ppc_simd.h
+++ b/ppc_simd.h
@@ -13,19 +13,15 @@
 /// provide best support and don't need many of the little hacks below.
 /// \since Crypto++ 6.0
 
+// Use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8. The preprocessor macros
+// depend on compiler options like -maltivec (and not compiler versions).
+
 #ifndef CRYPTOPP_PPC_CRYPTO_H
 #define CRYPTOPP_PPC_CRYPTO_H
 
 #include "config.h"
 #include "misc.h"
 
-// We are boxed into undefining macros like CRYPTOPP_POWER8_AVAILABLE.
-// We set CRYPTOPP_POWER8_AVAILABLE based on compiler versions because
-// we needed them for the SIMD and non-SIMD files. When the SIMD file is
-// compiled it may only get -mcpu=power4 or -mcpu=power7, so the POWER7
-// or POWER8 stuff is not actually available when this header is included.
-// We also need to handle the case of -DCRYPTOPP_ALTIVEC_AVAILABLE=0.
-
 #if defined(__ALTIVEC__)
 # include <altivec.h>
 # undef vector
@@ -33,6 +29,13 @@
 # undef bool
 #endif
 
+// VectorLoad_ALTIVEC and VectorStore_ALTIVEC use deprecated built-ins and are too noisy on modern compilers.
+
+#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated"
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)
 
 // Wrap everything in this file based on CRYPTOPP_ALTIVEC_AVAILABLE
@@ -42,9 +45,10 @@ NAMESPACE_BEGIN(CryptoPP)
 typedef __vector unsigned char uint8x16_p;
 typedef __vector unsigned short uint16x8_p;
 typedef __vector unsigned int uint32x4_p;
+
 #if defined(_ARCH_PWR8)
 typedef __vector unsigned long long uint64x2_p;
-#endif // POWER8 datatypes
+#endif // _ARCH_PWR8
 
@@ -53,12 +57,323 @@ typedef __vector unsigned long long uint64x2_p;
 /// \brief Reverse a vector
 /// \tparam T vector type
 /// \param src the vector
 /// \returns vector
 /// \details Reverse() endian swaps the bytes in a vector
 /// \since Crypto++ 6.0
 template <class T>
-inline T Reverse(const T& src)
+inline T Reverse(const T src)
 {
     const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
     return (T)vec_perm(src, src, mask);
 }
 
+//////////////////////// Loads ////////////////////////
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \details Loads a vector in native endian format from a byte array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad_ALTIVEC(const byte src[16])
+{
+    if (IsAlignedOn(src, 16))
+    {
+        return (uint32x4_p)vec_ld(0, src);
+    }
+    else
+    {
+        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+        const uint8x16_p perm = vec_lvsl(0, src);
+        const uint8x16_p low = vec_ld(0, src);
+        const uint8x16_p high = vec_ld(15, src);
+        return (uint32x4_p)vec_perm(low, high, perm);
+    }
+}
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \param off offset into the src byte array
+/// \details Loads a vector in native endian format from a byte array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad_ALTIVEC(int off, const byte src[16])
+{
+    if (IsAlignedOn(src+off, 16))
+    {
+        return (uint32x4_p)vec_ld(off, src);
+    }
+    else
+    {
+        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+        const uint8x16_p perm = vec_lvsl(off, src);
+        const uint8x16_p low = vec_ld(off, src);
+        const uint8x16_p high = vec_ld(off+15, src);
+        return (uint32x4_p)vec_perm(low, high, perm);
+    }
+}
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \details Loads a vector in native endian format from a byte array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad(const byte src[16])
+{
+#if defined(_ARCH_PWR7)
+# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    return (uint32x4_p)vec_xl(0, (byte*)src);
+# else
+    return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
+# endif
+#else
+    return VectorLoad_ALTIVEC(src);
+#endif
+}
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \param off offset into the byte array
+/// \details Loads a vector in native endian format from a byte array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad(int off, const byte src[16])
+{
+#if defined(_ARCH_PWR7)
+# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    return (uint32x4_p)vec_xl(off, (byte*)src);
+# else
+    return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
+# endif
+#else
+    return VectorLoad_ALTIVEC(off, src);
+#endif
+}
+
+/// \brief Loads a vector from a word array
+/// \param src the word array
+/// \details Loads a vector in native endian format from a word array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad(const word32 src[4])
+{
+    return VectorLoad((const byte*)src);
+}
+
+/// \brief Loads a vector from a word array
+/// \param src the word array
+/// \param off offset into the src word array
+/// \details Loads a vector in native endian format from a word array.
+/// \note VectorLoad does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoad(int off, const word32 src[4])
+{
+    return VectorLoad(off, (const byte*)src);
+}
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \details Loads a vector in big endian format from a byte array.
+/// VectorLoadBE will swap all bytes on little endian systems.
+/// \note VectorLoadBE() does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoadBE(const byte src[16])
+{
+#if defined(_ARCH_PWR7)
+# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    return (uint32x4_p)vec_xl_be(0, (byte*)src);
+# else
+#  if (CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
+#  else
+    return (uint32x4_p)Reverse(vec_vsx_ld(0, (byte*)src));
+#  endif
+# endif
+#else  // _ARCH_PWR7
+# if (CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)VectorLoad((const byte*)src);
+# else
+    return (uint32x4_p)Reverse(VectorLoad((const byte*)src));
+# endif
+#endif  // _ARCH_PWR7
+}
+
+/// \brief Loads a vector from a byte array
+/// \param src the byte array
+/// \param off offset into the src byte array
+/// \details Loads a vector in big endian format from a byte array.
+/// VectorLoadBE will swap all bytes on little endian systems.
+/// \note VectorLoadBE does not require an aligned array.
+/// \since Crypto++ 6.0
+inline uint32x4_p VectorLoadBE(int off, const byte src[16])
+{
+#if defined(_ARCH_PWR7)
+# if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    return (uint32x4_p)vec_xl_be(off, (byte*)src);
+# else
+#  if (CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
+#  else
+    return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src));
+#  endif
+# endif
+#else  // _ARCH_PWR7
+# if (CRYPTOPP_BIG_ENDIAN)
+    return (uint32x4_p)VectorLoad(off, (const byte*)src);
+# else
+    return (uint32x4_p)Reverse(VectorLoad(off, (const byte*)src));
+# endif
+#endif  // _ARCH_PWR7
+}
+
+//////////////////////// Stores ////////////////////////
+
+template <class T>
+inline void VectorStore_ALTIVEC(const T data, byte dest[16])
+{
+    if (IsAlignedOn(dest, 16))
+    {
+        vec_st((uint8x16_p)data, 0, dest);
+    }
+    else
+    {
+        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
+        vec_ste((uint8x16_p) perm,  0, (unsigned char*) dest);
+        vec_ste((uint16x8_p) perm,  1, (unsigned short*)dest);
+        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  dest);
+        vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
+        vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
+    }
+}
+
+template <class T>
+inline void VectorStore_ALTIVEC(const T data, int off, byte dest[16])
+{
+    if (IsAlignedOn(dest+off, 16))
+    {
+        vec_st((uint8x16_p)data, off, dest);
+    }
+    else
+    {
+        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
+        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(off, dest));
+        vec_ste((uint8x16_p) perm, off+0,  (unsigned char*) dest);
+        vec_ste((uint16x8_p) perm, off+1,  (unsigned short*)dest);
+        vec_ste((uint32x4_p) perm, off+3,  (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm, off+4,  (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm, off+8,  (unsigned int*)  dest);
+        vec_ste((uint32x4_p) perm, off+12, (unsigned int*)  dest);
+        vec_ste((uint16x8_p) perm, off+14, (unsigned short*)dest);
+        vec_ste((uint8x16_p) perm, off+15, (unsigned char*) dest);
+    }
+}
+
+/// \brief Stores a vector to a byte array
+/// \tparam T vector type
+/// \param data the vector
+/// \param dest the byte array
+/// \details Stores a vector in native endian format to a byte array.
+/// \note VectorStore does not require an aligned array.
+/// \since Crypto++ 6.0 +template +inline void VectorStore(const T data, byte dest[16]) +{ +#if defined(_ARCH_PWR7) +# if defined(__xlc__) || defined(__xlC__) || defined(__clang__) + vec_xst((uint8x16_p)data, 0, (byte*)dest); +# else + vec_vsx_st((uint8x16_p)data, 0, (byte*)dest); +# endif +#else + return VectorStore_ALTIVEC(data, 0, dest); +#endif +} + +/// \brief Stores a vector to a byte array +/// \tparam T vector type +/// \param data the vector +/// \param off the byte offset into the array +/// \param dest the byte array +/// \details Stores a vector in native endian format to a byte array. +/// \note VectorStore does not require an aligned array. +/// \since Crypto++ 6.0 +template +inline void VectorStore(const T data, int off, byte dest[16]) +{ +#if defined(_ARCH_PWR7) +# if defined(__xlc__) || defined(__xlC__) || defined(__clang__) + vec_xst((uint8x16_p)data, off, (byte*)dest); +# else + vec_vsx_st((uint8x16_p)data, off, (byte*)dest); +# endif +#else + return VectorStore_ALTIVEC(data, off, dest); +#endif +} + +/// \brief Stores a vector to a byte array +/// \tparam T vector type +/// \param src the vector +/// \param dest the byte array +/// \details Stores a vector in big endian format to a byte array. +/// VectorStoreBE will swap all bytes on little endian systems. +/// \note VectorStoreBE does not require an aligned array. +/// \since Crypto++ 6.0 +template +inline void VectorStoreBE(const T src, byte dest[16]) +{ +#if defined(_ARCH_PWR7) +# if defined(__xlc__) || defined(__xlC__) || defined(__clang__) + vec_xst_be((uint8x16_p)src, 0, (byte*)dest); +# else +# if (CRYPTOPP_BIG_ENDIAN) + vec_vsx_st((uint8x16_p)src, 0, (byte*)dest); +# else + vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest); +# endif +# endif +#else // _ARCH_PWR7 +# if (CRYPTOPP_BIG_ENDIAN) + VectorStore((uint8x16_p)src, (byte*)dest); +# else + VectorStore((uint8x16_p)Reverse(src), (byte*)dest); +# endif +#endif // _ARCH_PWR7 +} + +/// \brief Stores a vector to a byte array +/// \tparam T vector type +/// \param src the vector +/// \param off offset into the dest byte array +/// \param dest the byte array +/// \details Stores a vector in big endian format to a byte array. +/// VectorStoreBE will swap all bytes on little endian systems. +/// \note VectorStoreBE does not require an aligned array. +/// \since Crypto++ 6.0 +template +inline void VectorStoreBE(const T src, int off, byte dest[16]) +{ +#if defined(_ARCH_PWR7) +# if defined(__xlc__) || defined(__xlC__) || defined(__clang__) + vec_xst_be((uint8x16_p)src, off, (byte*)dest); +# else +# if (CRYPTOPP_BIG_ENDIAN) + vec_vsx_st((uint8x16_p)src, off, (byte*)dest); +# else + vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest); +# endif +# endif +#else // _ARCH_PWR7 +# if (CRYPTOPP_BIG_ENDIAN) + VectorStore((uint8x16_p)src, off, (byte*)dest); +# else + VectorStore((uint8x16_p)Reverse(src), off, (byte*)dest); +# endif +#endif // _ARCH_PWR7 +} + +//////////////////////// Miscellaneous //////////////////////// + /// \brief Permutes a vector /// \tparam T vector type /// \param vec the vector @@ -69,7 +384,7 @@ inline T Reverse(const T& src) /// vector is the same type as vec. /// \since Crypto++ 6.0 template -inline T1 VectorPermute(const T1& vec, const T2& mask) +inline T1 VectorPermute(const T1 vec, const T2 mask) { return (T1)vec_perm(vec, vec, (uint8x16_p)mask); } @@ -86,7 +401,7 @@ inline T1 VectorPermute(const T1& vec, const T2& mask) /// vector is the same type as vec1. 
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
+inline T1 VectorPermute(const T1 vec1, const T1 vec2, const T2 mask)
 {
     return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask);
 }
 
@@ -101,7 +416,7 @@ inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask)
 /// vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorAnd(const T1& vec1, const T2& vec2)
+inline T1 VectorAnd(const T1 vec1, const T2 vec2)
 {
     return (T1)vec_and(vec1, (T1)vec2);
 }
 
@@ -116,7 +431,7 @@ inline T1 VectorAnd(const T1& vec1, const T2& vec2)
 /// vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorOr(const T1& vec1, const T2& vec2)
+inline T1 VectorOr(const T1 vec1, const T2 vec2)
 {
     return (T1)vec_or(vec1, (T1)vec2);
 }
 
@@ -131,7 +446,7 @@ inline T1 VectorOr(const T1& vec1, const T2& vec2)
 /// vector is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorXor(const T1& vec1, const T2& vec2)
+inline T1 VectorXor(const T1 vec1, const T2 vec2)
 {
     return (T1)vec_xor(vec1, (T1)vec2);
 }
 
@@ -147,7 +462,7 @@ inline T1 VectorXor(const T1& vec1, const T2& vec2)
 /// is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorAdd(const T1& vec1, const T2& vec2)
+inline T1 VectorAdd(const T1 vec1, const T2 vec2)
 {
     return (T1)vec_add(vec1, (T1)vec2);
 }
 
@@ -162,11 +477,37 @@ inline T1 VectorAdd(const T1& vec1, const T2& vec2)
 /// is the same type as vec1.
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorSub(const T1& vec1, const T2& vec2)
+inline T1 VectorSub(const T1 vec1, const T2 vec2)
 {
     return (T1)vec_sub(vec1, (T1)vec2);
 }
 
+/// \brief Add two vectors
+/// \param vec1 the first vector
+/// \param vec2 the second vector
+/// \returns vector
+/// \details VectorAdd64 returns a new vector from vec1 and vec2.
+/// vec1 and vec2 are added as uint64x2_p quantities. On POWER8 the
+/// native 64-bit vec_add is used; on Altivec the 64-bit add is
+/// emulated with a 32-bit add and carry propagation.
+/// \since Crypto++ 8.0
+inline uint32x4_p VectorAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
+{
+#if defined(_ARCH_PWR8)
+    return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
+#else
+    // The carry mask selects carries from elements 1 and 3 and sets remaining
+    // elements to 0. The mask also shifts the carried values left by 4 bytes
+    // so the carries are added to elements 0 and 2.
+    const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
+    const uint32x4_p zero = {0, 0, 0, 0};
+
+    uint32x4_p cy = vec_addc(vec1, vec2);
+    cy = vec_perm(cy, zero, cmask);
+    return vec_add(vec_add(vec1, vec2), cy);
+#endif
+}
+
 /// \brief Shift a vector left
 /// \tparam C shift byte count
 /// \tparam T vector type
 /// \param vec the vector
 /// \returns vector
 /// \details VectorShiftLeftOctet() returns a new vector after shifting the
 /// concatenation of the zero vector and the source vector by the specified
 /// number of bytes. The return vector is the same type as vec.
 /// \details On big endian machines VectorShiftLeftOctet() is vec_sld(a, z,
 /// c). On little endian machines VectorShiftLeftOctet() is translated to
 /// vec_sld(z, a, 16-c). You should always call the function as
 /// if on a big endian machine as shown below.
 /// <pre>
-///    uint8x16_p r1 = VectorLoad(ptr);
-///    uint8x16_p r5 = VectorShiftLeftOctet<12>(r1);
+///    uint8x16_p x = VectorLoad(ptr);
+///    uint8x16_p y = VectorShiftLeftOctet<12>(x);
 /// </pre>
 /// \sa Is vec_sld
 /// endian sensitive? on Stack Overflow
 /// \since Crypto++ 6.0
 template <unsigned int C, class T>
-inline T VectorShiftLeftOctet(const T& vec)
+inline T VectorShiftLeftOctet(const T vec)
 {
     const T zero = {0};
     if (C >= 16)
@@ -223,14 +564,14 @@ inline T VectorShiftLeftOctet(const T& vec)
 /// vec_sld(z, a, 16-c). You should always call the function as
 /// if on a big endian machine as shown below.
 /// <pre>
-///    uint8x16_p r1 = VectorLoad(ptr);
-///    uint8x16_p r5 = VectorShiftRightOctet<12>(r1);
+///    uint8x16_p x = VectorLoad(ptr);
+///    uint8x16_p y = VectorShiftRightOctet<12>(x);
 /// </pre>
/// \sa Is vec_sld /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorShiftRightOctet(const T& vec) +inline T VectorShiftRightOctet(const T vec) { const T zero = {0}; if (C >= 16) @@ -265,7 +606,7 @@ inline T VectorShiftRightOctet(const T& vec) /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorRotateLeftOctet(const T& vec) +inline T VectorRotateLeftOctet(const T vec) { enum { R = C&0xf }; #if (CRYPTOPP_BIG_ENDIAN) @@ -287,7 +628,7 @@ inline T VectorRotateLeftOctet(const T& vec) /// endian sensitive? on Stack Overflow /// \since Crypto++ 6.0 template -inline T VectorRotateRightOctet(const T& vec) +inline T VectorRotateRightOctet(const T vec) { enum { R = C&0xf }; #if (CRYPTOPP_BIG_ENDIAN) @@ -297,13 +638,37 @@ inline T VectorRotateRightOctet(const T& vec) #endif } +/// \brief Rotate a vector left +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VectorRotateLeft rotates each element in a packed vector by bit count. +template +inline uint32x4_p VectorRotateLeft(const uint32x4_p vec) +{ + const uint32x4_p m = {C, C, C, C}; + return vec_rl(vec, m); +} + +/// \brief Rotate a vector right +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VectorRotateRight rotates each element in a packed vector by bit count. +template +inline uint32x4_p VectorRotateRight(const uint32x4_p vec) +{ + const uint32x4_p m = {32-C, 32-C, 32-C, 32-C}; + return vec_rl(vec, m); +} + /// \brief Exchange high and low double words /// \tparam T vector type /// \param vec the vector /// \returns vector /// \since Crypto++ 7.0 template -inline T VectorSwapWords(const T& vec) +inline T VectorSwapWords(const T vec) { return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8); } @@ -317,7 +682,7 @@ inline T VectorSwapWords(const T& vec) /// when viewed as a big endian array. The return vector is the same type as /// the original vector and padded with 0's in the most significant bit positions. template -inline T VectorGetLow(const T& val) +inline T VectorGetLow(const T val) { //const T zero = {0}; //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 }; @@ -334,7 +699,7 @@ inline T VectorGetLow(const T& val) /// when viewed as a big endian array. The return vector is the same type as /// the original vector and padded with 0's in the most significant bit positions. 
template -inline T VectorGetHigh(const T& val) +inline T VectorGetHigh(const T val) { //const T zero = {0}; //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 }; @@ -349,7 +714,7 @@ inline T VectorGetHigh(const T& val) /// \param vec2 the second vector /// \returns true if vec1 equals vec2, false otherwise template -inline bool VectorEqual(const T1& vec1, const T2& vec2) +inline bool VectorEqual(const T1 vec1, const T2 vec2) { return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2); } @@ -361,324 +726,13 @@ inline bool VectorEqual(const T1& vec1, const T2& vec2) /// \param vec2 the second vector /// \returns true if vec1 does not equal vec2, false otherwise template -inline bool VectorNotEqual(const T1& vec1, const T2& vec2) +inline bool VectorNotEqual(const T1 vec1, const T2 vec2) { return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2); } -// POWER7/POWER4 load and store -#if defined(_ARCH_PWR7) +//////////////////////// Power8 Crypto //////////////////////// -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \details Loads a vector in big endian format from a byte array. -/// VectorLoadBE will swap endianess on little endian systems. -/// \note VectorLoadBE() does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadBE(const byte src[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return (uint32x4_p)vec_xl_be(0, (byte*)src); -#else -# if (CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)vec_vsx_ld(0, src); -# else - return (uint32x4_p)Reverse(vec_vsx_ld(0, src)); -# endif -#endif -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \param off offset into the src byte array -/// \details Loads a vector in big endian format from a byte array. -/// VectorLoadBE will swap endianess on little endian systems. -/// \note VectorLoadBE does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadBE(int off, const byte src[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return (uint32x4_p)vec_xl_be(off, (byte*)src); -#else -# if (CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)vec_vsx_ld(off, (byte*)src); -# else - return (uint32x4_p)Reverse(vec_vsx_ld(off, (byte*)src)); -# endif -#endif -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \details Loads a vector in native endian format from a byte array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(const byte src[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return (uint32x4_p)vec_xl(0, (byte*)src); -#else - return (uint32x4_p)vec_vsx_ld(0, (byte*)src); -#endif -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \param off offset into the src byte array -/// \details Loads a vector in native endian format from a byte array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(int off, const byte src[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return (uint32x4_p)vec_xl(off, (byte*)src); -#else - return (uint32x4_p)vec_vsx_ld(off, (byte*)src); -#endif -} - -/// \brief Loads a vector from a word array -/// \param src the word array -/// \details Loads a vector in native endian format from a word array. -/// \note VectorLoad does not require an aligned array. 
-/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(const word32 src[4]) -{ - return VectorLoad((const byte*)src); -} - -/// \brief Loads a vector from a word array -/// \param src the word array -/// \param off offset into the src word array -/// \details Loads a vector in native endian format from a word array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(int off, const word32 src[4]) -{ - return VectorLoad(off, (const byte*)src); -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param dest the byte array -/// \details Stores a vector in big endian format to a byte array. -/// VectorStoreBE will swap endianess on little endian systems. -/// \note VectorStoreBE does not require an aligned array. -/// \since Crypto++ 6.0 -template -inline void VectorStoreBE(const T& src, byte dest[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst_be((uint8x16_p)src, 0, (byte*)dest); -#else -# if (CRYPTOPP_BIG_ENDIAN) - vec_vsx_st((uint8x16_p)src, 0, (byte*)dest); -# else - vec_vsx_st((uint8x16_p)Reverse(src), 0, (byte*)dest); -# endif -#endif -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param off offset into the dest byte array -/// \param dest the byte array -/// \details Stores a vector in big endian format to a byte array. -/// VectorStoreBE will swap endianess on little endian systems. -/// \note VectorStoreBE does not require an aligned array. -/// \since Crypto++ 6.0 -template -inline void VectorStoreBE(const T& src, int off, byte dest[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst_be((uint8x16_p)src, off, (byte*)dest); -#else -# if (CRYPTOPP_BIG_ENDIAN) - vec_vsx_st((uint8x16_p)src, off, (byte*)dest); -# else - vec_vsx_st((uint8x16_p)Reverse(src), off, (byte*)dest); -# endif -#endif -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param dest the byte array -/// \details Stores a vector in native endian format to a byte array. -/// \note VectorStore does not require an aligned array. -/// \since Crypto++ 6.0 -template -inline void VectorStore(const T& src, byte dest[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst((uint8x16_p)src, 0, (byte*)dest); -#else - vec_vsx_st((uint8x16_p)src, 0, (byte*)dest); -#endif -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param dest the byte array -/// \details Stores a vector in native endian format to a byte array. -/// \note VectorStore does not require an aligned array. -/// \since Crypto++ 6.0 -template -inline void VectorStore(byte dest[16], const T& src) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst((uint8x16_p)src, 0, (byte*)dest); -#else - vec_vsx_st((uint8x16_p)src, 0, (byte*)dest); -#endif -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param off offset into the dest byte array -/// \param dest the byte array -/// \details Stores a vector in native endian format to a byte array. -/// \note VectorStore does not require an aligned array. 
-/// \since Crypto++ 6.0 -template -inline void VectorStore(const T& src, int off, byte dest[16]) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst((uint8x16_p)src, off, (byte*)dest); -#else - vec_vsx_st((uint8x16_p)src, off, (byte*)dest); -#endif -} - -#else // ########## Not CRYPTOPP_POWER7_AVAILABLE ########## - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \details Loads a vector in native endian format from a byte array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(const byte src[16]) -{ - if (IsAlignedOn(src, 16)) - { - return (uint32x4_p)vec_ld(0, src); - } - else - { - // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf - const uint8x16_p perm = vec_lvsl(0, src); - const uint8x16_p low = vec_ld(0, src); - const uint8x16_p high = vec_ld(15, src); - return (uint32x4_p)vec_perm(low, high, perm); - } -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \param off offset into the src byte array -/// \details Loads a vector in native endian format from a byte array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(int off, const byte src[16]) -{ - if (IsAlignedOn(src, 16)) - { - return (uint32x4_p)vec_ld(off, src); - } - else - { - // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf - const uint8x16_p perm = vec_lvsl(off, src); - const uint8x16_p low = vec_ld(off, src); - const uint8x16_p high = vec_ld(15, src); - return (uint32x4_p)vec_perm(low, high, perm); - } -} - -/// \brief Loads a vector from a word array -/// \param src the word array -/// \details Loads a vector in native endian format from a word array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(const word32 src[4]) -{ - return VectorLoad((const byte*)src); -} - -/// \brief Loads a vector from a word array -/// \param src the word array -/// \param off offset into the src word array -/// \details Loads a vector in native endian format from a word array. -/// \note VectorLoad does not require an aligned array. -/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoad(int off, const word32 src[4]) -{ - return VectorLoad(off, (const byte*)src); -} - -/// \brief Loads a vector from a byte array -/// \param src the byte array -/// \details Loads a vector in big endian format from a byte array. -/// VectorLoadBE will swap endianess on little endian systems. -/// \note VectorLoadBE() does not require an aligned array. 
-/// \since Crypto++ 6.0 -inline uint32x4_p VectorLoadBE(const byte src[16]) -{ -#if (CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)VectorLoad(src); -#else - return (uint32x4_p)Reverse(VectorLoad(src)); -#endif -} - -template -inline void VectorStore(const T& data, byte dest[16]) -{ - if (IsAlignedOn(dest, 16)) - { - vec_st((uint8x16_p)data, 0, dest); - } - else - { - // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf - uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest)); - vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest); - vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest); - vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest); - vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest); - vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest); - vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest); - vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest); - vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest); - } -} - -/// \brief Stores a vector to a byte array -/// \tparam T vector type -/// \param src the vector -/// \param dest the byte array -/// \details Stores a vector in big endian format to a byte array. -/// VectorStoreBE will swap endianess on little endian systems. -/// \note VectorStoreBE does not require an aligned array. -/// \since Crypto++ 6.0 -template -inline void VectorStoreBE(const T& src, byte dest[16]) -{ -#if (CRYPTOPP_BIG_ENDIAN) - VectorStore(src, dest); -#else - VectorStore(Reverse(src), dest); -#endif -} - -#endif // POWER4/POWER7 load and store - -// POWER8 crypto #if defined(_ARCH_PWR8) /// \brief One round of AES encryption @@ -690,7 +744,7 @@ inline void VectorStoreBE(const T& src, byte dest[16]) /// using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorEncrypt(const T1& state, const T2& key) +inline T1 VectorEncrypt(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key); @@ -710,7 +764,7 @@ inline T1 VectorEncrypt(const T1& state, const T2& key) /// of state using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorEncryptLast(const T1& state, const T2& key) +inline T1 VectorEncryptLast(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key); @@ -730,7 +784,7 @@ inline T1 VectorEncryptLast(const T1& state, const T2& key) /// using subkey key. The return vector is the same type as vec1. /// \since Crypto++ 6.0 template -inline T1 VectorDecrypt(const T1& state, const T2& key) +inline T1 VectorDecrypt(const T1 state, const T2 key) { #if defined(__xlc__) || defined(__xlC__) || defined(__clang__) return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key); @@ -750,7 +804,7 @@ inline T1 VectorDecrypt(const T1& state, const T2& key) /// of state using subkey key. The return vector is the same type as vec1. 
 /// \since Crypto++ 6.0
 template <class T1, class T2>
-inline T1 VectorDecryptLast(const T1& state, const T2& key)
+inline T1 VectorDecryptLast(const T1 state, const T2 key)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
 #else
@@ -761,10 +815,6 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
 #endif
 }
 
-#endif // POWER8 crypto
-
-#if defined(_ARCH_PWR8)
-
 /// \brief SHA256 Sigma functions
 /// \tparam func function
 /// \tparam subfunc sub-function
 /// \param vec the block to transform
 /// \returns vector
 /// \details VectorSHA256 selects the SHA256 Sigma function based on
 /// func and subfunc. The return vector is the same type as vec.
 /// \since Crypto++ 6.0
 template <int func, int subfunc, class T>
-inline T VectorSHA256(const T& vec)
+inline T VectorSHA256(const T vec)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     return (T)__vshasigmaw((uint32x4_p)vec, func, subfunc);
 #else
@@ -794,7 +844,7 @@ inline T VectorSHA256(const T& vec)
 /// func and subfunc. The return vector is the same type as vec.
 /// \since Crypto++ 6.0
 template <int func, int subfunc, class T>
-inline T VectorSHA512(const T& vec)
+inline T VectorSHA512(const T vec)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     return (T)__vshasigmad((uint64x2_p)vec, func, subfunc);
 #else
@@ -805,10 +855,14 @@ inline T VectorSHA512(const T& vec)
 #endif
 }
 
-#endif // POWER8 crypto
+#endif // _ARCH_PWR8
 
-#endif // CRYPTOPP_ALTIVEC_AVAILABLE
+#endif // __ALTIVEC__
 
 NAMESPACE_END
 
+#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
+# pragma GCC diagnostic pop
+#endif
+
 #endif // CRYPTOPP_PPC_CRYPTO_H
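
A note for reviewers: the sketch below illustrates the downgrade scenario the patch targets. It is a minimal, hypothetical test driver, not part of the patch. It assumes the patched ppc_simd.h and uses only the wrappers added above (VectorLoad, VectorAdd64, VectorRotateLeft, VectorStore); the file name and build lines are illustrative.

// test_ppc_vec.cpp -- hypothetical driver, not part of this patch.
// Build once per ISA; the source does not change:
//   g++ -maltivec     test_ppc_vec.cpp   (old Altivec loads and stores)
//   g++ -mcpu=power8  test_ppc_vec.cpp   (POWER7/POWER8 loads and stores)
#include "ppc_simd.h"
#include <cstdio>

int main()
{
    using namespace CryptoPP;

    // Deliberately misaligned source and destination exercise the
    // unaligned paths (vec_lvsl/vec_lvsr on Altivec, vec_xl on POWER7).
    byte src[17], dst[17] = {0};
    for (unsigned int i = 0; i < 17; ++i)
        src[i] = static_cast<byte>(i);

    uint32x4_p x = VectorLoad(src+1);       // unaligned load
    uint32x4_p y = VectorAdd64(x, x);       // 64-bit add, native or composite
    uint32x4_p z = VectorRotateLeft<7>(y);  // rotate each 32-bit element
    VectorStore(z, dst+1);                  // unaligned store

    std::printf("%02x %02x %02x %02x\n", dst[1], dst[2], dst[3], dst[4]);
    return 0;
}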
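
And a quick sanity check for the composite 64-bit add on the pre-POWER8 path (again hypothetical, and written for a big endian Altivec machine such as the PowerMac quoted in the numbers above). The first input forces a carry out of a low 32-bit lane, which is exactly the case the vec_addc/cmask logic in VectorAdd64 must handle:

// check_add64.cpp -- hypothetical check, not part of this patch.
// On big endian PowerPC the two 64-bit values occupy lanes {0,1} and
// {2,3}; carries out of lanes 1 and 3 must propagate into lanes 0 and 2.
#include "ppc_simd.h"
#include <cstdio>
#include <cstring>

int main()
{
    using namespace CryptoPP;

    // 0x00000000FFFFFFFF + 1 carries across the 32-bit lane boundary.
    const word64 a[2] = {0x00000000FFFFFFFFull, 0x1111111122222222ull};
    const word64 b[2] = {0x0000000000000001ull, 0x3333333344444444ull};
    const word64 expect[2] = {a[0]+b[0], a[1]+b[1]};

    const uint32x4_p va = VectorLoad((const byte*)a);
    const uint32x4_p vb = VectorLoad((const byte*)b);

    word64 r[2];
    VectorStore(VectorAdd64(va, vb), (byte*)r);

    std::printf("%s\n", std::memcmp(r, expect, 16) == 0 ? "ok" : "mismatch");
    return 0;
}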