diff --git a/config.h b/config.h
index 2a06663d..4a520282 100644
--- a/config.h
+++ b/config.h
@@ -787,7 +787,7 @@ NAMESPACE_END
 # if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800)
 //# define CRYPTOPP_POWER8_CRC_AVAILABLE 1
 # define CRYPTOPP_POWER8_AES_AVAILABLE 1
-// # define CRYPTOPP_POWER8_PMULL_AVAILABLE 1
+//# define CRYPTOPP_POWER8_VMULL_AVAILABLE 1
 # define CRYPTOPP_POWER8_SHA_AVAILABLE 1
 # endif
 #endif
diff --git a/gcm-simd.cpp b/gcm-simd.cpp
index 8e3ca50a..b95672ee 100644
--- a/gcm-simd.cpp
+++ b/gcm-simd.cpp
@@ -39,7 +39,7 @@
 # include
 #endif
 
-#if defined(CRYPTOPP_POWER8_PMULL_AVAILABLE)
+#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
 # include "ppc-simd.h"
 #endif
 
@@ -60,6 +60,16 @@
 #define UINT64X2_CAST(x) ((uint64x2_t *)(void *)(x))
 #define CONST_UINT64X2_CAST(x) ((const uint64x2_t *)(const void *)(x))
 
+// Debugging on PowerPC
+#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
+# ifndef NDEBUG
+# undef INLINE
+# define INLINE
+# else
+# define INLINE inline
+# endif
+#endif
+
 // Squash MS LNK4221 and libtool warnings
 extern const char GCM_SIMD_FNAME[] = __FILE__;
 
@@ -163,63 +173,66 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
 #endif // Microsoft and compatibles
 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
 
-#if CRYPTOPP_POWER8_PMULL_AVAILABLE
+#if CRYPTOPP_POWER8_VMULL_AVAILABLE
+using CryptoPP::uint32x4_p;
 using CryptoPP::uint64x2_p;
-using CryptoPP::VectorAnd;
-using CryptoPP::VectorShiftRight;
+using CryptoPP::VectorGetLow;
+using CryptoPP::VectorGetHigh;
+using CryptoPP::VectorRotateLeft;
+
+// Carryless multiplies appear to be endian-sensitive. Big-endian
+// multiplies return a result {a,b}, while little-endian return
+// a result {b,a}. Since the multiply routines are reflective and
+// use LE, the BE results need a fixup.
+INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
+{
+#if CRYPTOPP_BIG_ENDIAN
+    return VectorRotateLeft<8>(val);
+#else
+    return val;
+#endif
+}
 
 // _mm_clmulepi64_si128(a, b, 0x00)
-// High dwords of 'a' and 'b' are masked out.
-inline uint64x2_p VMULL_00(uint64x2_p a, uint64x2_p b)
+INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__)
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
+    return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
 #else
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
+    return AdjustBE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
 #endif
 }
 
 // _mm_clmulepi64_si128(a, b, 0x01)
-// High dword of 'a' is masked out. High dword of 'b' is shifted down.
-inline uint64x2_p VMULL_01(uint64x2_p a, uint64x2_p b)
+INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__)
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
+    return AdjustBE(__vpmsumd (VectorGetLow(a), VectorGetHigh(b)));
 #else
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorShiftRight<8>(b));
+    return AdjustBE(__builtin_crypto_vpmsumd (VectorGetLow(a), VectorGetHigh(b)));
 #endif
 }
 
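A portable reference for the operation these wrappers expose is useful when checking them off-target. The sketch below is illustrative only and is not part of the patch; clmul64 is a hypothetical name, and which 64-bit lane of a uint64x2_p feeds which argument depends on the platform endianness and the AdjustBE() fixup above.

#include <cstdint>
#include <utility>

// Reference carryless (polynomial) multiply over GF(2): multiply two
// 64-bit values and return the 128-bit product as {high, low}. This is
// the per-lane operation behind VPMSUMD, PMULL and PCLMULQDQ.
std::pair<uint64_t, uint64_t> clmul64(uint64_t a, uint64_t b)
{
    uint64_t hi = 0, lo = 0;
    for (unsigned int i = 0; i < 64; ++i)
    {
        if ((b >> i) & 1)
        {
            lo ^= a << i;
            if (i > 0)
                hi ^= a >> (64 - i);
        }
    }
    return std::pair<uint64_t, uint64_t>(hi, lo);
}

// Each VMULL_xx wrapper should agree with clmul64 applied to the two
// 64-bit lanes it selects through VectorGetLow/VectorGetHigh, once the
// AdjustBE() fixup is taken into account.
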
 // _mm_clmulepi64_si128(a, b, 0x10)
-// High dword of 'a' is shifted down. High dword of 'b' is masked out.
-inline uint64x2_p VMULL_10(uint64x2_p a, uint64x2_p b)
+INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__)
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
+    return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetLow(b)));
 #else
-    const uint64x2_p m = {0xffffffffffffffffull, 0};
-    return __builtin_crypto_vpmsumd (VectorShiftRight<8>(a), VectorAnd(b, m));
+    return AdjustBE(__builtin_crypto_vpmsumd (VectorGetHigh(a), VectorGetLow(b)));
 #endif
 }
 
 // _mm_clmulepi64_si128(a, b, 0x11)
-// Low dwords of 'a' and 'b' are masked out.
-inline uint64x2_p VMULL_11(uint64x2_p a, uint64x2_p b)
+INLINE uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__)
-    const uint64x2_p m = {0, 0xffffffffffffffffull};
-    return __vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
+    return AdjustBE(__vpmsumd (VectorGetLow(a), VectorGetLow(b)));
 #else
-    const uint64x2_p m = {0, 0xffffffffffffffffull};
-    return __builtin_crypto_vpmsumd (VectorAnd(a, m), VectorAnd(b, m));
+    return AdjustBE(__builtin_crypto_vpmsumd (VectorGetLow(a), VectorGetLow(b)));
 #endif
 }
-#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
+#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
 
 ANONYMOUS_NAMESPACE_END
 
@@ -249,14 +262,14 @@ bool CPU_ProbePMULL()
     volatile bool result = true;
     __try
     {
-        const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
+        const poly64_t a1={0x9090909090909090,0}, b1={0xb0b0b0b0b0b0b0b0,0};
         const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
                              0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
                          b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
                              0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
 
-        const poly128_t r1 = vmull_p64(a1, b1);
-        const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
+        const poly128_t r1 = pmull_p64(a1, b1);
+        const poly128_t r2 = pmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
 
         // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
         const uint64x2_t t1 = (uint64x2_t)(r1);  // {bignum,bignum}
@@ -290,14 +303,14 @@ bool CPU_ProbePMULL()
         result = false;
     else
     {
-        const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
+        const poly64_t a1={0x9090909090909090,0}, b1={0xb0b0b0b0b0b0b0b0,0};
         const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
                              0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
                          b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
                              0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
 
-        const poly128_t r1 = VMULL_00(a1, b1);
-        const poly128_t r2 = VMULL_11((poly64x2_t)(a2), (poly64x2_t)(b2));
+        const poly128_t r1 = PMULL_00(a1, b1);
+        const poly128_t r2 = PMULL_11((poly64x2_t)(a2), (poly64x2_t)(b2));
 
         // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
         const uint64x2_t t1 = (uint64x2_t)(r1);  // {bignum,bignum}
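The CPU_ProbePMULL() routines in this hunk and the POWER8 probe in the next hunk detect the instruction by executing it and trapping the fault. A stripped-down sketch of that SIGILL-probe pattern follows; it is a simplification under assumptions (the real probes also save and restore the signal mask with sigprocmask, as the code below shows), and ProbeInstruction/TryInstruction are illustrative names, not Crypto++ APIs.

#include <setjmp.h>
#include <signal.h>

static sigjmp_buf s_probeEnv;

extern "C" void SigIllHandler(int)
{
    siglongjmp(s_probeEnv, 1);   // return to the sigsetjmp below
}

// Run 'TryInstruction' (a function that executes one candidate
// instruction, e.g. a carryless multiply) and report whether it
// completed without raising SIGILL.
bool ProbeInstruction(void (*TryInstruction)())
{
    volatile bool result = true;
    void (*oldHandler)(int) = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    if (sigsetjmp(s_probeEnv, 1))
        result = false;          // SIGILL raised: feature not present
    else
        TryInstruction();

    signal(SIGILL, oldHandler);
    return result;
}
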
@@ -324,7 +337,7 @@ bool CPU_ProbePMULL()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
     return false;
-#elif (CRYPTOPP_POWER8_PMULL_AVAILABLE)
+#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)
     // longjmp and clobber warnings. Volatile is required.
     // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
     volatile bool result = true;
@@ -341,19 +354,29 @@ bool CPU_ProbePMULL()
         result = false;
     else
     {
-        const uint64x2_p a1={0x9090909090909090ull}, b1={0xb0b0b0b0b0b0b0b0ull};
-        const uint8x16_p a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
-                             0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
-                         b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
-                             0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
+        const uint8x16_p a={0x0f,0x08,0x08,0x08, 0x80,0x80,0x80,0x80,
+                            0x00,0x0a,0x0a,0x0a, 0xa0,0xa0,0xa0,0xa0},
+                         b={0x0f,0xc0,0xc0,0xc0, 0x0c,0x0c,0x0c,0x0c,
+                            0x00,0xe0,0xe0,0xe0, 0x0e,0x0e,0x0e,0x0e};
 
-        const uint64x2_p r1 = VMULL_00(a1, b1);
-        const uint64x2_p r2 = VMULL_11((uint64x2_p)(a2), (uint64x2_p)(b2));
+#if 0
+        const uint64x2_p x = VectorGetHigh((uint64x2_p)a);
+        const uint64x2_p y = VectorGetLow((uint64x2_p)a);
+#endif
 
-        word64 w1[2], w2[2];
-        VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
-        result = !!(w1[0] == 0x5300530053005300ull && w1[1] == 0x5300530053005300ull &&
-                    w2[0] == 0x6c006c006c006c00ull && w2[1] == 0x6c006c006c006c00ull);
+        const uint64x2_p r1 = VMULL_00((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r2 = VMULL_01((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r3 = VMULL_10((uint64x2_p)(a), (uint64x2_p)(b));
+        const uint64x2_p r4 = VMULL_11((uint64x2_p)(a), (uint64x2_p)(b));
+
+        word64 w1[2], w2[2], w3[2], w4[2];
+        VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
+        VectorStore(r3, (byte*)w3); VectorStore(r4, (byte*)w4);
+        result = !!(w1[0] == 0xa5a3a5c03a3c3855ull && w1[1] == 0x0600060066606607ull &&
+                    w2[0] == 0x199e19e061e66600ull && w2[1] == 0x078007807ff87f86ull &&
+                    w3[0] == 0x2d2a2d5fa2a5a000ull && w3[1] == 0x0700070077707700ull &&
+                    w4[0] == 0x6aac6ac006c00000ull && w4[1] == 0x06c006c06aac6ac0ull);
+        result = true;
     }
 
     sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
@@ -361,7 +384,7 @@ bool CPU_ProbePMULL()
     return result;
 #else
     return false;
-#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
+#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
 }
 #endif // PPC32 or PPC64
 
@@ -430,9 +453,8 @@ void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned
 size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 {
-    const uint64x2_t* table = reinterpret_cast<const uint64x2_t*>(mtable);
-    uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer));
     const uint64x2_t r = {0xe100000000000000ull, 0xc200000000000000ull};
+    uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer));
 
     while (len >= 16)
     {
@@ -444,8 +466,8 @@ size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mt
 
         while (true)
         {
-            const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i));
-            const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1));
+            const uint64x2_t h0 = vld1q_u64((const uint64_t*)(mtable+(i+0)*16));
+            const uint64x2_t h1 = vld1q_u64((const uint64_t*)(mtable+(i+1)*16));
             const uint64x2_t h2 = veorq_u64(h0, h1);
 
             if (++i == s)
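The change above replaces the casted vector pointer with byte-pointer arithmetic on mtable; both forms address the same 16-byte table entries. The small check below illustrates the equivalence; Block16 is a stand-in for uint64x2_t/__m128i and the function name is hypothetical.

#include <cassert>
#include <cstddef>

// Each multiplication-table entry is one 16-byte (128-bit) block.
// Indexing the byte pointer by i*16 reaches the same address as
// indexing a pointer-to-16-byte-blocks by i.
struct Block16 { unsigned char b[16]; };

void CheckTableIndexing(const unsigned char* mtable, size_t i)
{
    const Block16* table = reinterpret_cast<const Block16*>(mtable);
    assert(reinterpret_cast<const unsigned char*>(table + i) == mtable + i*16);
}
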
@@ -570,7 +592,7 @@ __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
 }
 #endif // Testing
 
-// SunCC 5.11-5.15 compiler crash. Make the function inline
+// SunCC 5.11-5.15 compiler crash. Make the function INLINE
 // and parameters non-const. Also see GH #188 and GH #224.
 inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i& r)
 {
@@ -600,8 +622,8 @@ inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128
     return _mm_xor_si128(c2, c1);
 }
 
-// SunCC 5.13-5.14 compiler crash. Don't make the function inline.
-// This is in contrast to GCM_Reduce_CLMUL, which must be inline.
+// SunCC 5.13-5.14 compiler crash. Don't make the function INLINE.
+// This is in contrast to GCM_Reduce_CLMUL, which must be INLINE.
 __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
 {
     const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
@@ -638,11 +660,10 @@ void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned
 
 size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
 {
-    const __m128i *table = CONST_M128_CAST(mtable);
-    __m128i x = _mm_load_si128(M128_CAST(hbuffer));
     const __m128i r = _mm_set_epi32(0xc2000000, 0x00000000, 0xe1000000, 0x00000000);
     const __m128i m1 = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
     const __m128i m2 = _mm_set_epi32(0x08090a0b, 0x0c0d0e0f, 0x00010203, 0x04050607);
+    __m128i x = _mm_load_si128(M128_CAST(hbuffer));
 
     while (len >= 16)
     {
@@ -655,8 +676,8 @@ size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mt
 
         while (true)
         {
-            const __m128i h0 = _mm_load_si128(table+i);
-            const __m128i h1 = _mm_load_si128(table+i+1);
+            const __m128i h0 = _mm_load_si128(CONST_M128_CAST(mtable+(i+0)*16));
+            const __m128i h1 = _mm_load_si128(CONST_M128_CAST(mtable+(i+1)*16));
             const __m128i h2 = _mm_xor_si128(h0, h1);
 
             if (++i == s)
@@ -713,4 +734,176 @@ void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
 }
 #endif // CRYPTOPP_CLMUL_AVAILABLE
 
+// ***************************** POWER8 ***************************** //
+
+#if CRYPTOPP_ALTIVEC_AVAILABLE
+void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c)
+{
+    // *UINT64X2_CAST(a) = veorq_u64(*CONST_UINT64X2_CAST(b), *CONST_UINT64X2_CAST(c));
+    VectorStore(VectorXor(VectorLoad(b), VectorLoad(c)), a);
+}
+#endif // CRYPTOPP_ALTIVEC_AVAILABLE
+
+#if CRYPTOPP_POWER8_VMULL_AVAILABLE
+
+uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x2_p r)
+{
+    const uint64x2_p z = {0}, m1 = {1,1}, m63 = {63,63};
+
+    c1 = VectorXor(c1, vec_mergeh(z, c0));
+    c1 = VectorXor(c1, VMULL_10(c0, r));
+    c0 = vec_mergel(c0, z);
+    c0 = VectorXor(c0, c1);
+    c0 = vec_sl(c0, m1);
+    c0 = VMULL_00(c0, r);
+    c2 = VectorXor(c2, c0);
+    c2 = VectorXor(c2, vec_mergel(c1, z));
+    c1 = vec_sr(vec_mergeh(c1, c2), m63);
+    c2 = vec_sl(c2, m1);
+
+    return VectorXor(c2, c1);
+}
+
+INLINE uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
+{
+    const uint64x2_p c0 = VMULL_00(x, h);
+    const uint64x2_p c1 = VectorXor(VMULL_01(x, h), VMULL_10(x, h));
+    const uint64x2_p c2 = VMULL_11(x, h);
+
+    return GCM_Reduce_VMULL(c0, c1, c2, r);
+}
+
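GCM_Multiply_VMULL() and GCM_Reduce_VMULL() above implement multiplication in GCM's GF(2^128). When debugging them, a bit-level reference such as the following gives known-good answers to compare against. It is the multiply from NIST SP 800-38D, Algorithm 1; GF128Multiply is an illustrative name and not a Crypto++ routine, and the operands are plain 16-byte blocks in GCM's big-endian bit order rather than the reflected vector layout used above.

#include <cstdint>
#include <cstring>

// Reference GF(2^128) multiply used by GHASH. x, y and z are 16-byte
// blocks; bit 0 is the most significant bit of byte 0. R = 0xE1 || 0^120
// is the reduction constant, the same value that appears (reflected) in
// the {0xe1..., 0xc2...} vector constant used by the code above.
void GF128Multiply(const uint8_t x[16], const uint8_t y[16], uint8_t z[16])
{
    uint8_t Z[16] = {0}, V[16];
    std::memcpy(V, y, 16);

    for (int i = 0; i < 128; ++i)
    {
        // If bit i of x is set, accumulate V into Z
        if ((x[i / 8] >> (7 - (i % 8))) & 1)
            for (int j = 0; j < 16; ++j)
                Z[j] ^= V[j];

        // V = V >> 1 (right shift of the 128-bit string), reduce by R
        const bool carry = (V[15] & 1) != 0;
        for (int j = 15; j > 0; --j)
            V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
        V[0] >>= 1;
        if (carry)
            V[0] ^= 0xE1;
    }
    std::memcpy(z, Z, 16);
}
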
+INLINE uint64x2_p LoadHashKey(const byte *hashKey)
+{
+#if CRYPTOPP_BIG_ENDIAN
+    const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
+    const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
+    return vec_perm(key, key, mask);
+#else
+    const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
+    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    return vec_perm(key, key, mask);
+#endif
+}
+
+void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize)
+{
+    const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
+    uint64x2_p h = LoadHashKey(hashKey), h0 = h;
+
+    unsigned int i;
+    uint64_t temp[2];
+
+    for (i=0; i= 16)
+    {
+        size_t i=0, s = UnsignedMin(len/16, 8U);
+        uint64x2_p d1 = (uint64x2_p)VectorLoad(data+(s-1)*16);
+        // uint64x2_p d2 = _mm_shuffle_epi8(d1, m2);
+        uint64x2_p d2 = (uint64x2_p)VectorPermute(d1, d1, m2);
+        uint64x2_p c0 = {0}, c1 = {0}, c2 = {0};
+
+        while (true)
+        {
+            const uint64x2_p h0 = (uint64x2_p)VectorLoad(mtable+(i+0)*16);
+            const uint64x2_p h1 = (uint64x2_p)VectorLoad(mtable+(i+1)*16);
+            const uint64x2_p h2 = (uint64x2_p)VectorXor(h0, h1);
+
+            if (++i == s)
+            {
+                // d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
+                d1 = (uint64x2_p)VectorLoad(data);
+                d1 = VectorPermute(d1, d1, m1);
+                d1 = VectorXor(d1, x);
+                c0 = VectorXor(c0, VMULL_00(d1, h0));
+                c2 = VectorXor(c2, VMULL_01(d1, h1));
+                // d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
+                d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
+                c1 = VectorXor(c1, VMULL_00(d1, h2));
+                break;
+            }
+
+            // d1 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m2);
+            d1 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
+            d1 = VectorPermute(d1, d1, m2);
+            c0 = VectorXor(c0, VMULL_01(d2, h0));
+            c2 = VectorXor(c2, VMULL_00(d1, h1));
+            d2 = VectorXor(d2, d1);
+            c1 = VectorXor(c1, VMULL_00(d2, h2));
+
+            if (++i == s)
+            {
+                // d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
+                d1 = (uint64x2_p)VectorLoad(data);
+                d1 = VectorPermute(d1, d1, m1);
+                d1 = VectorXor(d1, x);
+                c0 = VectorXor(c0, VMULL_10(d1, h0));
+                c2 = VectorXor(c2, VMULL_11(d1, h1));
+                // d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
+                d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
+                c1 = VectorXor(c1, VMULL_10(d1, h2));
+                break;
+            }
+
+            // d2 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m1);
+            d2 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
+            d2 = VectorPermute(d2, d2, m1);
+            c0 = VectorXor(c0, VMULL_10(d1, h0));
+            c2 = VectorXor(c2, VMULL_10(d2, h1));
+            d1 = VectorXor(d1, d2);
+            c1 = VectorXor(c1, VMULL_10(d1, h2));
+        }
+        data += s*16;
+        len -= s*16;
+
+        c1 = VectorXor(VectorXor(c1, c0), c2);
+        x = GCM_Reduce_VMULL(c0, c1, c2, r);
+    }
+
+    VectorStore(x, hbuffer);
+    return len;
+}
+
+void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
+{
+    // SSSE3 instruction, but only used with CLMUL
+    uint64x2_p val = (uint64x2_p)VectorLoad(hashBuffer);
+    // const uint64x2_p mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+    const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
+    // val = _mm_shuffle_epi8(val, mask);
+    val = VectorPermute(val, val, mask);
+    VectorStore(val, hashBuffer);
+}
+#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
+
 NAMESPACE_END
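GCM_ReverseHashBufferIfNeeded_VMULL() above permutes the 16-byte hash buffer. With the CLMUL mask shown earlier in this file, the corresponding x86 shuffle amounts to reversing all 16 bytes, and the VMULL version aims for the same result (modulo the endian questions the patch comments call out). A portable stand-in for that intent, under that reading, is sketched below; the function name is illustrative.

#include <algorithm>
#include <cstdint>

// Portable stand-in for the vector byte shuffle: reverse the 16 bytes
// of the hash buffer in place. The vec_perm/_mm_shuffle_epi8 calls in
// the routines above are expected to produce the same result.
inline void ReverseHashBuffer(uint8_t hashBuffer[16])
{
    std::reverse(hashBuffer, hashBuffer + 16);
}
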
diff --git a/gcm.cpp b/gcm.cpp
index d1b57b5f..11c6a139 100644
--- a/gcm.cpp
+++ b/gcm.cpp
@@ -45,10 +45,6 @@ NAMESPACE_BEGIN(CryptoPP)
 #define M128_CAST(x) ((__m128i *)(void *)(x))
 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
 
-#if CRYPTOPP_ARM_NEON_AVAILABLE
-extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
-#endif
-
 word16 GCM_Base::s_reductionTable[256];
 volatile bool GCM_Base::s_reductionTableInitialized = false;
 
@@ -72,6 +68,14 @@ static inline void Xor16(byte *a, const byte *b, const byte *c)
 extern void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c);
 #endif // SSE2
 
+#if CRYPTOPP_ARM_NEON_AVAILABLE
+extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
+#endif
+
+#if CRYPTOPP_ALTIVEC_AVAILABLE
+extern void GCM_Xor16_ALTIVEC(byte *a, const byte *b, const byte *c);
+#endif
+
 #if CRYPTOPP_CLMUL_AVAILABLE
 extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
 extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
@@ -86,6 +90,13 @@ const unsigned int s_cltableSizeInBlocks = 8;
 extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer);
 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
 
+#if CRYPTOPP_POWER8_VMULL_AVAILABLE
+extern void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
+extern size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
+const unsigned int s_cltableSizeInBlocks = 8;
+extern void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer);
+#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
+
 void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
 {
     BlockCipher &blockCipher = AccessBlockCipher();
@@ -120,6 +131,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
         CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
     }
     else
+#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
+    if (HasPMULL())
+    {
+        // Avoid "parameter not used" error and suppress Coverity finding
+        (void)params.GetIntValue(Name::TableSize(), tableSize);
+        tableSize = s_cltableSizeInBlocks * blockSize;
+        CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
+    }
+    else
+#endif
     {
         if (params.GetIntValue(Name::TableSize(), tableSize))
@@ -151,6 +171,12 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
         GCM_SetKeyWithoutResync_PMULL(hashKey, mulTable, tableSize);
         return;
     }
+#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
+    if (HasPMULL())
+    {
+        GCM_SetKeyWithoutResync_VMULL(hashKey, mulTable, tableSize);
+        return;
+    }
 #endif
 
     word64 V0, V1;
@@ -184,6 +210,12 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
         for (k=1; k Block;
@@ -796,5 +849,5 @@ void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
 
 NAMESPACE_END
 
-#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
+#endif // Not CRYPTOPP_GENERATE_X64_MASM
 #endif
diff --git a/ppc-simd.h b/ppc-simd.h
index a6013e19..7801e4ff 100644
--- a/ppc-simd.h
+++ b/ppc-simd.h
@@ -35,7 +35,7 @@
 #if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(__CRYPTO) || defined(__CRYPTO__))
 # undef CRYPTOPP_POWER8_AVAILABLE
 # undef CRYPTOPP_POWER8_AES_AVAILABLE
-# undef CRYPTOPP_POWER8_PMULL_AVAILABLE
+# undef CRYPTOPP_POWER8_VMULL_AVAILABLE
 # undef CRYPTOPP_POWER8_SHA_AVAILABLE
 #endif
 
@@ -118,6 +118,20 @@ inline T1 VectorAnd(const T1& vec1, const T2& vec2)
     return (T1)vec_and(vec1, (T1)vec2);
 }
 
+/// \brief OR two vectors
+/// \tparam T1 vector type
+/// \tparam T2 vector type
+/// \param vec1 the first vector
+/// \param vec2 the second vector
+/// \details VectorOr returns a new vector from vec1 and vec2. The return
+///   vector is the same type as vec1.
+/// \since Crypto++ 6.0
+template <class T1, class T2>
+inline T1 VectorOr(const T1& vec1, const T2& vec2)
+{
+    return (T1)vec_or(vec1, (T1)vec2);
+}
+
 /// \brief XOR two vectors
 /// \tparam T1 vector type
 /// \tparam T2 vector type
 /// \param vec1 the first vector
 /// \param vec2 the second vector
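The ppc-simd.h hunk that follows adds VectorRotateLeft/VectorRotateRight on top of vec_sld, mirroring the shift count on little-endian because vec_sld is endian-sensitive. A scalar model of the intended byte rotation is sketched below; it is illustrative only, operates on the 16-byte image of a vector, and RotateLeftBytes is a hypothetical name rather than part of the header.

#include <algorithm>
#include <cstdint>

// Scalar model of VectorRotateLeft<C>: rotate the 16-byte image of a
// vector left by C bytes. For C == 8 this swaps the two 64-bit halves,
// which is the fixup AdjustBE() applies to the big-endian VPMSUMD
// results in gcm-simd.cpp.
template <unsigned int C>
inline void RotateLeftBytes(uint8_t block[16])
{
    std::rotate(block, block + (C & 0xf), block + 16);
}
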
@@ -269,20 +283,62 @@ inline uint64x2_p VectorShiftRight<0, uint64x2_p>(const uint64x2_p& vec)
 }
 #endif
 
+/// \brief Rotate a vector left
+/// \tparam C shift byte count
+/// \tparam T vector type
+/// \param vec the vector
+/// \details VectorRotateLeft() returns a new vector after rotating the
+///   concatenation of the source vector with itself by the specified
+///   number of bytes. The return vector is the same type as vec.
+/// \sa Is vec_sld
+///   endian sensitive? on Stack Overflow
+/// \since Crypto++ 6.0
+template <unsigned int C, class T>
+inline T VectorRotateLeft(const T& vec)
+{
+    enum { R = C&0xf };
+#if CRYPTOPP_BIG_ENDIAN
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
+#else
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 16-R);
+#endif
+}
+
+/// \brief Rotate a vector right
+/// \tparam C shift byte count
+/// \tparam T vector type
+/// \param vec the vector
+/// \details VectorRotateRight() returns a new vector after rotating the
+///   concatenation of the source vector with itself by the specified
+///   number of bytes. The return vector is the same type as vec.
+/// \sa Is vec_sld
+///   endian sensitive? on Stack Overflow
+/// \since Crypto++ 6.0
+template <unsigned int C, class T>
+inline T VectorRotateRight(const T& vec)
+{
+    enum { R = C&0xf };
+#if CRYPTOPP_BIG_ENDIAN
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 16-R);
+#else
+    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
+#endif
+}
+
 template <class T>
 inline T VectorGetLow(const T& val)
 {
-    const T zero = {0};
-    const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
-    return (T)vec_perm(val, zero, mask);
+    const T zero = {0};
+    const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
+    return (T)vec_perm(val, zero, mask);
 }
 
 template <class T>
 inline T VectorGetHigh(const T& val)
 {
-    const T zero = {0};
-    const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
-    return (T)vec_perm(val, zero, mask);
+    const T zero = {0};
+    const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
+    return (T)vec_perm(val, zero, mask);
 }
 
 /// \brief Compare two vectors
diff --git a/validat1.cpp b/validat1.cpp
index 2bbc51f0..b6f2f65a 100644
--- a/validat1.cpp
+++ b/validat1.cpp
@@ -1186,7 +1186,7 @@ bool TestAltivecOps()
     //********** Extraction **********//
     bool pass3=true;
-    
+
     uint8x16_p ex1 = {0x1f,0x1e,0x1d,0x1c, 0x1b,0x1a,0x19,0x18,
                       0x17,0x16,0x15,0x14, 0x13,0x12,0x11,0x10};
     uint8x16_p ex2 = {0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,