diff --git a/gcm_simd.cpp b/gcm_simd.cpp
index 374ab8fa..083ec691 100644
--- a/gcm_simd.cpp
+++ b/gcm_simd.cpp
@@ -67,101 +67,6 @@
 // Squash MS LNK4221 and libtool warnings
 extern const char GCM_SIMD_FNAME[] = __FILE__;
 
-ANONYMOUS_NAMESPACE_BEGIN
-
-// ************************** Power8 Crypto ************************** //
-
-#if CRYPTOPP_POWER8_VMULL_AVAILABLE
-
-using CryptoPP::uint32x4_p;
-using CryptoPP::uint64x2_p;
-using CryptoPP::VecGetLow;
-using CryptoPP::VecGetHigh;
-using CryptoPP::VecRotateLeftOctet;
-
-// POWER8 GCM mode is confusing. The algorithm is reflected so
-// nearly everything we do is reversed for a little-endian system,
-// including on big-endian machines. VMULL2LE swaps dwords for a
-// little endian machine; VMULL_00LE, VMULL_01LE, VMULL_10LE and
-// VMULL_11LE are backwards and (1) read low words with
-// VecGetHigh, (2) read high words with VecGetLow, and
-// (3) yields a product that is endian swapped. The steps ensures
-// GCM parameters are presented in the correct order for the
-// algorithm on both big and little-endian systems, but it is
-// awful to try to follow the logic because it is so backwards.
-// Because functions like VMULL_NN are so backwards we can't put
-// them in ppc_simd.h. They simply don't work the way a typical
-// user expects them to work.
-
-inline uint64x2_p VMULL2LE(const uint64x2_p& val)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    return VecRotateLeftOctet<8>(val);
-#else
-    return val;
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x00)
-inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x01)
-inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'a' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (a, VecGetHigh(b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x10)
-inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'b' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x11)
-inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'b' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetLow(a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
-#endif
-}
-#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
 NAMESPACE_BEGIN(CryptoPP)
 
 // ************************* Feature Probes ************************* //
@@ -285,10 +190,10 @@ bool CPU_ProbePMULL()
                          b={0x0f,0xc0,0xc0,0xc0, 0x0c,0x0c,0x0c,0x0c,
                             0x00,0xe0,0xe0,0xe0, 0x0e,0x0e,0x0e,0x0e};
 
-        const uint64x2_p r1 = VMULL_00LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r2 = VMULL_01LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r3 = VMULL_10LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r4 = VMULL_11LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r1 = VecPolyMultiply00LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r2 = VecPolyMultiply01LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r3 = VecPolyMultiply10LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r4 = VecPolyMultiply11LE((uint64x2_p)(a), (uint64x2_p)(b));
 
         result = VecNotEqual(r1, r2) && VecNotEqual(r3, r4);
     }
@@ -671,9 +576,9 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
     const uint64x2_p m1 = {1,1}, m63 = {63,63};
 
     c1 = VecXor(c1, VecShiftRightOctet<8>(c0));
-    c1 = VecXor(c1, VMULL_10LE(c0, r));
+    c1 = VecXor(c1, VecPolyMultiply10LE(c0, r));
     c0 = VecXor(c1, VecShiftLeftOctet<8>(c0));
-    c0 = VMULL_00LE(vec_sl(c0, m1), r);
+    c0 = VecPolyMultiply00LE(vec_sl(c0, m1), r);
     c2 = VecXor(c2, c0);
     c2 = VecXor(c2, VecShiftLeftOctet<8>(c1));
     c1 = vec_sr(vec_mergeh(c1, c2), m63);
@@ -684,9 +589,9 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
 
 inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
 {
-    const uint64x2_p c0 = VMULL_00LE(x, h);
-    const uint64x2_p c1 = VecXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
-    const uint64x2_p c2 = VMULL_11LE(x, h);
+    const uint64x2_p c0 = VecPolyMultiply00LE(x, h);
+    const uint64x2_p c1 = VecXor(VecPolyMultiply01LE(x, h), VecPolyMultiply10LE(x, h));
+    const uint64x2_p c2 = VecPolyMultiply11LE(x, h);
     return GCM_Reduce_VMULL(c0, c1, c2, r);
 }
 
@@ -781,35 +686,35 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
             {
                 d1 = LoadBuffer2(data);
                 d1 = VecXor(d1, x);
-                c0 = VecXor(c0, VMULL_00LE(d1, h0));
-                c2 = VecXor(c2, VMULL_01LE(d1, h1));
+                c0 = VecXor(c0, VecPolyMultiply00LE(d1, h0));
+                c2 = VecXor(c2, VecPolyMultiply01LE(d1, h1));
                 d1 = VecXor(d1, SwapWords(d1));
-                c1 = VecXor(c1, VMULL_00LE(d1, h2));
+                c1 = VecXor(c1, VecPolyMultiply00LE(d1, h2));
                 break;
             }
 
             d1 = LoadBuffer1(data+(s-i)*16-8);
-            c0 = VecXor(c0, VMULL_01LE(d2, h0));
-            c2 = VecXor(c2, VMULL_01LE(d1, h1));
+            c0 = VecXor(c0, VecPolyMultiply01LE(d2, h0));
+            c2 = VecXor(c2, VecPolyMultiply01LE(d1, h1));
             d2 = VecXor(d2, d1);
-            c1 = VecXor(c1, VMULL_01LE(d2, h2));
+            c1 = VecXor(c1, VecPolyMultiply01LE(d2, h2));
 
             if (++i == s)
             {
                 d1 = LoadBuffer2(data);
                 d1 = VecXor(d1, x);
-                c0 = VecXor(c0, VMULL_10LE(d1, h0));
-                c2 = VecXor(c2, VMULL_11LE(d1, h1));
+                c0 = VecXor(c0, VecPolyMultiply10LE(d1, h0));
+                c2 = VecXor(c2, VecPolyMultiply11LE(d1, h1));
                 d1 = VecXor(d1, SwapWords(d1));
-                c1 = VecXor(c1, VMULL_10LE(d1, h2));
+                c1 = VecXor(c1, VecPolyMultiply10LE(d1, h2));
                 break;
             }
 
             d2 = LoadBuffer2(data+(s-i)*16-8);
-            c0 = VecXor(c0, VMULL_10LE(d1, h0));
-            c2 = VecXor(c2, VMULL_10LE(d2, h1));
+            c0 = VecXor(c0, VecPolyMultiply10LE(d1, h0));
+            c2 = VecXor(c2, VecPolyMultiply10LE(d2, h1));
             d1 = VecXor(d1, d2);
-            c1 = VecXor(c1, VMULL_10LE(d1, h2));
+            c1 = VecXor(c1, VecPolyMultiply10LE(d1, h2));
         }
         data += s*16;
         len -= s*16;
diff --git a/gf2n_simd.cpp b/gf2n_simd.cpp
index 10ab0ab9..6ac1fe3c 100644
--- a/gf2n_simd.cpp
+++ b/gf2n_simd.cpp
@@ -343,40 +343,8 @@ using CryptoPP::VecShiftLeft;
 using CryptoPP::VecShiftRight;
 using CryptoPP::VecRotateLeftOctet;
 
-inline uint64x2_p VMULL2LE(const uint64x2_p& val)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    return VecRotateLeftOctet<8>(val);
-#else
-    return val;
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x00)
-inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    const uint64x2_p z={0};
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x11)
-inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    const uint64x2_p z={0};
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecMergeLo(z, a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecMergeLo(z, a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecMergeLo(z, a), b));
-#endif
-}
+using CryptoPP::VecPolyMultiply00LE;
+using CryptoPP::VecPolyMultiply11LE;
 
 // c1c0 = a * b
 inline void
@@ -385,13 +353,13 @@ F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a,
     uint64x2_p t1, t2;
     const uint64x2_p z0={0};
 
-    c0 = VMULL_00LE(a, b);
-    c1 = VMULL_11LE(a, b);
+    c0 = VecPolyMultiply00LE(a, b);
+    c1 = VecPolyMultiply11LE(a, b);
     t1 = VecMergeLo(a, a);
     t1 = VecXor(a, t1);
     t2 = VecMergeLo(b, b);
     t2 = VecXor(b, t2);
-    t1 = VMULL_00LE(t1, t2);
+    t1 = VecPolyMultiply00LE(t1, t2);
     t1 = VecXor(c0, t1);
     t1 = VecXor(c1, t1);
     t2 = t1;
diff --git a/ppc_simd.h b/ppc_simd.h
index c7bcb066..1b1e5969 100644
--- a/ppc_simd.h
+++ b/ppc_simd.h
@@ -1345,10 +1345,12 @@ inline T VecSwapWords(const T vec)
 template <class T>
 inline T VecGetLow(const T val)
 {
-    //const T zero = {0};
-    //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
-    //return (T)vec_perm(zero, val, mask);
+#if (CRYPTOPP_BIG_ENDIAN)
+    const T zero = {0};
+    return VecMergeLo(zero, val);
+#else
     return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
+#endif
 }
 
 /// \brief Extract a dword from a vector
@@ -1365,10 +1367,12 @@ inline T VecGetLow(const T val)
 template <class T>
 inline T VecGetHigh(const T val)
 {
-    //const T zero = {0};
-    //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
-    //return (T)vec_perm(zero, val, mask);
+#if (CRYPTOPP_BIG_ENDIAN)
+    const T zero = {0};
+    return VecMergeHi(zero, val);
+#else
     return VecShiftRightOctet<8>(val);
+#endif
 }
 
 /// \brief Compare two vectors
@@ -1409,6 +1413,114 @@ inline bool VecNotEqual(const T1 vec1, const T2 vec2)
 
 #if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 
+/// \brief Polynomial multiplication helper
+/// \details VMULL2LE helps perform polynomial multiplication
+/// by presenting the results like Intel's _mm_clmulepi64_si128.
+inline uint64x2_p VMULL2LE(const uint64x2_p& val)
+{
+#if (CRYPTOPP_BIG_ENDIAN)
+    return VecRotateLeftOctet<8>(val);
+#else
+    return val;
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply00LE performs polynomial multiplication and presents
+/// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x00).
+/// The 0x00 indicates the low 64-bits of a and b
+/// are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply00LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply01LE performs polynomial multiplication and presents
+/// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x01).
+/// The 0x01 indicates the low 64-bits of a and high
+/// 64-bits of b are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply01LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (a, VecGetHigh(b)));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply10LE performs polynomial multiplication and presents
+/// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x10).
+/// The 0x10 indicates the high 64-bits of a and low
+/// 64-bits of b are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply10LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), b));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply11LE performs polynomial multiplication and presents
+/// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x11).
+/// The 0x11 indicates the high 64-bits of a and b
+/// are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply11LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetLow(a), b));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
+#endif
+}
+
 /// \brief One round of AES encryption
 /// \tparam T1 vector type
 /// \tparam T2 vector type
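Usage note (not part of the patch): the sketch below shows how the relocated helpers compose the three partial products of a 128x128 carryless multiplication, the same way GCM_Multiply_VMULL above feeds GCM_Reduce_VMULL. It assumes a POWER8 build (for example GCC with -mcpu=power8) and this revision of ppc_simd.h on the include path; the helper name PolyMultiplyPartial is illustrative only and not part of the library.

#include "ppc_simd.h"

using CryptoPP::uint64x2_p;
using CryptoPP::VecXor;
using CryptoPP::VecPolyMultiply00LE;
using CryptoPP::VecPolyMultiply01LE;
using CryptoPP::VecPolyMultiply10LE;
using CryptoPP::VecPolyMultiply11LE;

// Hypothetical helper: compute the partial products c0, c1 and c2 of a
// carryless 128x128 multiplication, combined the same way GCM_Multiply_VMULL
// does before handing them to GCM_Reduce_VMULL. Each call presents its result
// like the corresponding Intel _mm_clmulepi64_si128(a, b, imm) selector.
inline void PolyMultiplyPartial(uint64x2_p& c0, uint64x2_p& c1, uint64x2_p& c2,
                                const uint64x2_p& a, const uint64x2_p& b)
{
    c0 = VecPolyMultiply00LE(a, b);            // like imm = 0x00
    c1 = VecXor(VecPolyMultiply01LE(a, b),     // like imm = 0x01 ...
                VecPolyMultiply10LE(a, b));    // ... xor imm = 0x10
    c2 = VecPolyMultiply11LE(a, b);            // like imm = 0x11
}

By contrast, gf2n_simd.cpp pulls in only the 00LE and 11LE forms because F2N_Multiply_128x128_POWER8 builds the middle term with a Karatsuba-style fold (VecMergeLo/VecXor plus one more VecPolyMultiply00LE) instead of the two cross products.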