Add VecPolyMultiply for Intel-equivalent F2N multiplies

parent c60f30f912
commit 8fd5bb31cf

gcm_simd.cpp (137 lines changed)

@@ -67,101 +67,6 @@
 // Squash MS LNK4221 and libtool warnings
 extern const char GCM_SIMD_FNAME[] = __FILE__;
 
-ANONYMOUS_NAMESPACE_BEGIN
-
-// ************************** Power8 Crypto ************************** //
-
-#if CRYPTOPP_POWER8_VMULL_AVAILABLE
-
-using CryptoPP::uint32x4_p;
-using CryptoPP::uint64x2_p;
-using CryptoPP::VecGetLow;
-using CryptoPP::VecGetHigh;
-using CryptoPP::VecRotateLeftOctet;
-
-// POWER8 GCM mode is confusing. The algorithm is reflected so
-// nearly everything we do is reversed for a little-endian system,
-// including on big-endian machines. VMULL2LE swaps dwords for a
-// little endian machine; VMULL_00LE, VMULL_01LE, VMULL_10LE and
-// VMULL_11LE are backwards and (1) read low words with
-// VecGetHigh, (2) read high words with VecGetLow, and
-// (3) yields a product that is endian swapped. The steps ensures
-// GCM parameters are presented in the correct order for the
-// algorithm on both big and little-endian systems, but it is
-// awful to try to follow the logic because it is so backwards.
-// Because functions like VMULL_NN are so backwards we can't put
-// them in ppc_simd.h. They simply don't work the way a typical
-// user expects them to work.
-
-inline uint64x2_p VMULL2LE(const uint64x2_p& val)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    return VecRotateLeftOctet<8>(val);
-#else
-    return val;
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x00)
-inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x01)
-inline uint64x2_p VMULL_01LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetHigh(b) ensures the high dword of 'b' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'a' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (a, VecGetHigh(b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x10)
-inline uint64x2_p VMULL_10LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetHigh(a) ensures the high dword of 'a' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'b' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x11)
-inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    // Small speedup. VecGetLow(a) ensures the high dword of 'a' is 0.
-    // The 0 used in the vmull yields 0 for the high product, so the high
-    // dword of 'b' is "don't care".
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecGetLow(a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
-#endif
-}
-#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
-
-ANONYMOUS_NAMESPACE_END
-
 NAMESPACE_BEGIN(CryptoPP)
 
 // ************************* Feature Probes ************************* //
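
The comment block removed above explains how the POWER8 helpers emulate Intel's carry-less multiply while coping with the reflected data order. For orientation, here is a minimal scalar reference for the 64x64 carry-less multiply that both _mm_clmulepi64_si128 and vpmsumd build on; clmul64 is a hypothetical illustration, not code from this commit.

#include <cstdint>

// Carry-less (GF(2)[x]) multiply of two 64-bit values; the 128-bit product is
// returned in lo/hi. XOR replaces addition, so partial products never carry.
inline void clmul64(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
{
    lo = hi = 0;
    for (unsigned int i = 0; i < 64; ++i)
    {
        if ((b >> i) & 1)
        {
            lo ^= a << i;
            if (i > 0)
                hi ^= a >> (64 - i);
        }
    }
}

// Each VecPolyMultiply variant performs one such 64x64 multiply on dwords
// selected from its vector arguments, mirroring the imm8 selector of
// _mm_clmulepi64_si128.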

@@ -285,10 +190,10 @@ bool CPU_ProbePMULL()
         b={0x0f,0xc0,0xc0,0xc0, 0x0c,0x0c,0x0c,0x0c,
            0x00,0xe0,0xe0,0xe0, 0x0e,0x0e,0x0e,0x0e};
 
-        const uint64x2_p r1 = VMULL_00LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r2 = VMULL_01LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r3 = VMULL_10LE((uint64x2_p)(a), (uint64x2_p)(b));
-        const uint64x2_p r4 = VMULL_11LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r1 = VecPolyMultiply00LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r2 = VecPolyMultiply01LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r3 = VecPolyMultiply10LE((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r4 = VecPolyMultiply11LE((uint64x2_p)(a), (uint64x2_p)(b));
 
         result = VecNotEqual(r1, r2) && VecNotEqual(r3, r4);
     }
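
CPU_ProbePMULL() above simply checks at runtime that the renamed multipliers return sane, distinct products. A minimal sketch of how the probe result is typically consumed follows; it assumes the cached query is exposed as CryptoPP::HasPMULL() in cpu.h, so verify the exact name against the tree.

#include "cpu.h"
#include <iostream>

int main()
{
#if (CRYPTOPP_POWER8_VMULL_AVAILABLE)
    // Assumption: HasPMULL() caches the result of CPU_ProbePMULL() on first use.
    std::cout << "POWER8 vpmsumd usable: " << CryptoPP::HasPMULL() << std::endl;
#else
    std::cout << "Built without POWER8 carry-less multiply support" << std::endl;
#endif
    return 0;
}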

@@ -671,9 +576,9 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
     const uint64x2_p m1 = {1,1}, m63 = {63,63};
 
     c1 = VecXor(c1, VecShiftRightOctet<8>(c0));
-    c1 = VecXor(c1, VMULL_10LE(c0, r));
+    c1 = VecXor(c1, VecPolyMultiply10LE(c0, r));
     c0 = VecXor(c1, VecShiftLeftOctet<8>(c0));
-    c0 = VMULL_00LE(vec_sl(c0, m1), r);
+    c0 = VecPolyMultiply00LE(vec_sl(c0, m1), r);
     c2 = VecXor(c2, c0);
     c2 = VecXor(c2, VecShiftLeftOctet<8>(c1));
     c1 = vec_sr(vec_mergeh(c1, c2), m63);
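
GCM_Reduce_VMULL folds the widened product back into 128 bits. The algebra being implemented is the standard GHASH reduction, stated here for orientation; the bit/byte reflection hidden by the LE helpers is omitted, and the exact encoding of the constant r is an assumption.

    g(x) = x^{128} + x^7 + x^2 + x + 1
    \text{result} = \bigl(c_2\,x^{128} \oplus c_1\,x^{64} \oplus c_0\bigr) \bmod g(x)

The two VecPolyMultiply calls against r perform the multiply-by-constant steps of that folding.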

@@ -684,9 +589,9 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
 
 inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
 {
-    const uint64x2_p c0 = VMULL_00LE(x, h);
-    const uint64x2_p c1 = VecXor(VMULL_01LE(x, h), VMULL_10LE(x, h));
-    const uint64x2_p c2 = VMULL_11LE(x, h);
+    const uint64x2_p c0 = VecPolyMultiply00LE(x, h);
+    const uint64x2_p c1 = VecXor(VecPolyMultiply01LE(x, h), VecPolyMultiply10LE(x, h));
+    const uint64x2_p c2 = VecPolyMultiply11LE(x, h);
 
     return GCM_Reduce_VMULL(c0, c1, c2, r);
 }
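
GCM_Multiply_VMULL is the schoolbook form of a 128x128 carry-less multiply, expressed with the new Intel-style helpers. Splitting each operand into 64-bit halves (standard algebra, not quoted from the commit):

    (a_1 x^{64} + a_0)(b_1 x^{64} + b_0)
      = \underbrace{a_1 b_1}_{c_2}\,x^{128}
        \;\oplus\; \underbrace{(a_1 b_0 \oplus a_0 b_1)}_{c_1}\,x^{64}
        \;\oplus\; \underbrace{a_0 b_0}_{c_0}

c0 comes from VecPolyMultiply00LE, c2 from VecPolyMultiply11LE, and c1 from the XOR of the two cross products; GCM_Reduce_VMULL then reduces the result modulo the GHASH polynomial.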

@@ -781,35 +686,35 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
            {
                d1 = LoadBuffer2(data);
                d1 = VecXor(d1, x);
-               c0 = VecXor(c0, VMULL_00LE(d1, h0));
-               c2 = VecXor(c2, VMULL_01LE(d1, h1));
+               c0 = VecXor(c0, VecPolyMultiply00LE(d1, h0));
+               c2 = VecXor(c2, VecPolyMultiply01LE(d1, h1));
                d1 = VecXor(d1, SwapWords(d1));
-               c1 = VecXor(c1, VMULL_00LE(d1, h2));
+               c1 = VecXor(c1, VecPolyMultiply00LE(d1, h2));
                break;
            }
 
            d1 = LoadBuffer1(data+(s-i)*16-8);
-           c0 = VecXor(c0, VMULL_01LE(d2, h0));
-           c2 = VecXor(c2, VMULL_01LE(d1, h1));
+           c0 = VecXor(c0, VecPolyMultiply01LE(d2, h0));
+           c2 = VecXor(c2, VecPolyMultiply01LE(d1, h1));
            d2 = VecXor(d2, d1);
-           c1 = VecXor(c1, VMULL_01LE(d2, h2));
+           c1 = VecXor(c1, VecPolyMultiply01LE(d2, h2));
 
            if (++i == s)
            {
                d1 = LoadBuffer2(data);
                d1 = VecXor(d1, x);
-               c0 = VecXor(c0, VMULL_10LE(d1, h0));
-               c2 = VecXor(c2, VMULL_11LE(d1, h1));
+               c0 = VecXor(c0, VecPolyMultiply10LE(d1, h0));
+               c2 = VecXor(c2, VecPolyMultiply11LE(d1, h1));
                d1 = VecXor(d1, SwapWords(d1));
-               c1 = VecXor(c1, VMULL_10LE(d1, h2));
+               c1 = VecXor(c1, VecPolyMultiply10LE(d1, h2));
                break;
            }
 
            d2 = LoadBuffer2(data+(s-i)*16-8);
-           c0 = VecXor(c0, VMULL_10LE(d1, h0));
-           c2 = VecXor(c2, VMULL_10LE(d2, h1));
+           c0 = VecXor(c0, VecPolyMultiply10LE(d1, h0));
+           c2 = VecXor(c2, VecPolyMultiply10LE(d2, h1));
            d1 = VecXor(d1, d2);
-           c1 = VecXor(c1, VMULL_10LE(d1, h2));
+           c1 = VecXor(c1, VecPolyMultiply10LE(d1, h2));
        }
        data += s*16;
        len -= s*16;
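
The loop in GCM_AuthenticateBlocks_VMULL is the GHASH accumulation unrolled to consume up to two blocks per pass. The per-block recurrence, and the identity that lets a pair of blocks be folded with different key powers before a single reduction, are:

    X_i = (X_{i-1} \oplus D_i) \cdot H
    \bigl((X \oplus D_1) \cdot H \oplus D_2\bigr) \cdot H = (X \oplus D_1) \cdot H^2 \oplus D_2 \cdot H

Reading h0/h1/h2 as precomputed pieces of H and H^2 is an inference from this structure, not something stated in the diff.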

gf2n_simd.cpp

@@ -343,40 +343,8 @@ using CryptoPP::VecShiftLeft;
 using CryptoPP::VecShiftRight;
 using CryptoPP::VecRotateLeftOctet;
 
-inline uint64x2_p VMULL2LE(const uint64x2_p& val)
-{
-#if (CRYPTOPP_BIG_ENDIAN)
-    return VecRotateLeftOctet<8>(val);
-#else
-    return val;
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x00)
-inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    const uint64x2_p z={0};
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecMergeHi(z, a), VecMergeHi(z, b)));
-#endif
-}
-
-// _mm_clmulepi64_si128(a, b, 0x11)
-inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b)
-{
-    const uint64x2_p z={0};
-#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
-    return VMULL2LE(__vpmsumd (VecMergeLo(z, a), b));
-#elif defined(__clang__)
-    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecMergeLo(z, a), b));
-#else
-    return VMULL2LE(__builtin_crypto_vpmsumd (VecMergeLo(z, a), b));
-#endif
-}
-
+using CryptoPP::VecPolyMultiply00LE;
+using CryptoPP::VecPolyMultiply11LE;
+
 // c1c0 = a * b
 inline void

@@ -385,13 +353,13 @@ F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a,
     uint64x2_p t1, t2;
     const uint64x2_p z0={0};
 
-    c0 = VMULL_00LE(a, b);
-    c1 = VMULL_11LE(a, b);
+    c0 = VecPolyMultiply00LE(a, b);
+    c1 = VecPolyMultiply11LE(a, b);
     t1 = VecMergeLo(a, a);
     t1 = VecXor(a, t1);
     t2 = VecMergeLo(b, b);
     t2 = VecXor(b, t2);
-    t1 = VMULL_00LE(t1, t2);
+    t1 = VecPolyMultiply00LE(t1, t2);
     t1 = VecXor(c0, t1);
     t1 = VecXor(c1, t1);
     t2 = t1;
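
F2N_Multiply_128x128_POWER8 uses the Karatsuba form of the same 128x128 carry-less multiply: two end products plus one multiply of XORed halves replace the four schoolbook products. The identity matching the visible t1/t2 manipulation is:

    (a_1 x^{64} + a_0)(b_1 x^{64} + b_0)
      = c_1\,x^{128}
        \;\oplus\; \bigl[(a_0 \oplus a_1)(b_0 \oplus b_1) \oplus c_0 \oplus c_1\bigr]\,x^{64}
        \;\oplus\; c_0,
      \qquad c_0 = a_0 b_0,\; c_1 = a_1 b_1

VecMergeLo(a, a) XORed with a leaves a_0 \oplus a_1 in one lane (the other lane becomes zero), so the third VecPolyMultiply00LE supplies the middle-term product.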

ppc_simd.h (124 lines changed)

@@ -1345,10 +1345,12 @@ inline T VecSwapWords(const T vec)
 template <class T>
 inline T VecGetLow(const T val)
 {
-    //const T zero = {0};
-    //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
-    //return (T)vec_perm(zero, val, mask);
+#if (CRYPTOPP_BIG_ENDIAN)
+    const T zero = {0};
+    return VecMergeLo(zero, val);
+#else
     return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
+#endif
 }
 
 /// \brief Extract a dword from a vector

@@ -1365,10 +1367,12 @@ inline T VecGetLow(const T val)
 template <class T>
 inline T VecGetHigh(const T val)
 {
-    //const T zero = {0};
-    //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
-    //return (T)vec_perm(zero, val, mask);
+#if (CRYPTOPP_BIG_ENDIAN)
+    const T zero = {0};
+    return VecMergeHi(zero, val);
+#else
     return VecShiftRightOctet<8>(val);
+#endif
 }
 
 /// \brief Compare two vectors
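
The VecGetLow/VecGetHigh rework replaces the shift-based big-endian path with vec_merge operations, so on big-endian targets the two formulations should be interchangeable. A small self-check sketch, assuming a POWER build with ppc_simd.h on the include path; the function itself is hypothetical test code, not part of the header.

#include "ppc_simd.h"

using CryptoPP::uint64x2_p;
using CryptoPP::VecEqual;
using CryptoPP::VecMergeHi;
using CryptoPP::VecMergeLo;
using CryptoPP::VecShiftLeftOctet;
using CryptoPP::VecShiftRightOctet;

// Returns true when the merge-based bodies agree with the former shift-based
// bodies. The comparison is only meaningful on a big-endian target, which is
// the branch this hunk changes.
bool CheckGetLowHighRework(const uint64x2_p v)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint64x2_p zero = {0};
    const bool lowOK  = VecEqual(VecMergeLo(zero, v),
                                 VecShiftRightOctet<8>(VecShiftLeftOctet<8>(v)));
    const bool highOK = VecEqual(VecMergeHi(zero, v),
                                 VecShiftRightOctet<8>(v));
    return lowOK && highOK;
#else
    (void)v;      // avoid an unused-parameter warning on the LE branch
    return true;  // the little-endian path is unchanged by this hunk
#endif
}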

@@ -1409,6 +1413,114 @@ inline bool VecNotEqual(const T1 vec1, const T2 vec2)
 
 #if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
 
+/// \brief Polynomial multiplication helper
+/// \details VMULL2LE helps perform polynomial multiplication
+/// by presenting the results like Intel's <tt>_mm_clmulepi64_si128</tt>.
+inline uint64x2_p VMULL2LE(const uint64x2_p& val)
+{
+#if (CRYPTOPP_BIG_ENDIAN)
+    return VecRotateLeftOctet<8>(val);
+#else
+    return val;
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply00LE performs polynomial multiplication and presents
+/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
+/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
+/// are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply00LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b)));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply01LE performs polynomial multiplication and presents
+/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
+/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
+/// 64-bits of <tt>b</tt> are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply01LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (a, VecGetHigh(b)));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (a, VecGetHigh(b)));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (a, VecGetHigh(b)));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply10LE performs polynomial multiplication and presents
+/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
+/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
+/// 64-bits of <tt>b</tt> are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply10LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetHigh(a), b));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), b));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), b));
+#endif
+}
+
+/// \brief Polynomial multiplication
+/// \param a the first term
+/// \param b the second term
+/// \returns vector product
+/// \details VecPolyMultiply11LE performs polynomial multiplication and presents
+/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
+/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
+/// are multiplied.
+/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
+/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
+/// \par Wraps
+/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
+/// \since Crypto++ 8.0
+inline uint64x2_p VecPolyMultiply11LE(const uint64x2_p& a, const uint64x2_p& b)
+{
+#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
+    return VMULL2LE(__vpmsumd (VecGetLow(a), b));
+#elif defined(__clang__)
+    return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b));
+#else
+    return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b));
+#endif
+}
+
 /// \brief One round of AES encryption
 /// \tparam T1 vector type
 /// \tparam T2 vector type
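
With the helpers promoted into ppc_simd.h, callers outside gcm_simd.cpp can use them directly. A minimal usage sketch, assuming a POWER8 build with the crypto extensions (__CRYPTO__) enabled; the function below is illustrative, not library code.

#include "ppc_simd.h"

using CryptoPP::uint64x2_p;
using CryptoPP::VecPolyMultiply00LE;
using CryptoPP::VecPolyMultiply01LE;
using CryptoPP::VecPolyMultiply10LE;
using CryptoPP::VecPolyMultiply11LE;
using CryptoPP::VecXor;

// Full 128x128 -> 256-bit carry-less product in the Intel-style presentation.
// lo/mid/hi correspond to the c0/c1/c2 pieces used by the GCM code above.
void PolyMultiply128(const uint64x2_p a, const uint64x2_p b,
                     uint64x2_p& lo, uint64x2_p& mid, uint64x2_p& hi)
{
    lo  = VecPolyMultiply00LE(a, b);         // like _mm_clmulepi64_si128(a, b, 0x00)
    mid = VecXor(VecPolyMultiply01LE(a, b),  // like _mm_clmulepi64_si128(a, b, 0x01)
                 VecPolyMultiply10LE(a, b)); // like _mm_clmulepi64_si128(a, b, 0x10)
    hi  = VecPolyMultiply11LE(a, b);         // like _mm_clmulepi64_si128(a, b, 0x11)
}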