diff --git a/gcm-simd.cpp b/gcm-simd.cpp
index 2c5a7a6e..f55f18f2 100644
--- a/gcm-simd.cpp
+++ b/gcm-simd.cpp
@@ -1,5 +1,7 @@
 // gcm-simd.cpp - written and placed in the public domain by
 //                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
+//                Original x86 CLMUL by Wei Dai. ARM and POWER8
+//                PMULL and VMULL by JW, UB and MR.
 //
 // This source file uses intrinsics to gain access to SSE4.2 and
 // ARMv8a CRC-32 and CRC-32C instructions. A separate source file
@@ -60,16 +62,6 @@
 #define UINT64X2_CAST(x) ((uint64x2_t *)(void *)(x))
 #define CONST_UINT64X2_CAST(x) ((const uint64x2_t *)(const void *)(x))
 
-// Debugging on PowerPC
-#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
-# ifndef NDEBUG
-# undef INLINE
-# define INLINE
-# else
-# define INLINE inline
-# endif
-#endif
-
 // Squash MS LNK4221 and libtool warnings
 extern const char GCM_SIMD_FNAME[] = __FILE__;
 
@@ -190,7 +182,7 @@ using CryptoPP::VectorRotateLeft;
 // multiplies return a result {a,b}, while little-endian return
 // a result {b,a}. Since the multiply routines are reflective and
 // use LE the BE results need a fixup.
-INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
+inline uint64x2_p AdjustBE(const uint64x2_p& val)
 {
 #if CRYPTOPP_BIG_ENDIAN
     return VectorRotateLeft<8>(val);
@@ -200,7 +192,7 @@ INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
 }
 
 // _mm_clmulepi64_si128(a, b, 0x00)
-INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
+inline uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
 {
 #if defined(__xlc__) || defined(__xlC__)
     return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
@@ -210,7 +202,7 @@ INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
 }
 
 // _mm_clmulepi64_si128(a, b, 0x01)
-INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
+inline uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
 {
     // Small speedup. VectorGetHigh(b) ensures the high dword of 'b' is 0.
     // The 0 used in the vmull yields 0 for the high product, so the high
@@ -225,7 +217,7 @@ INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
 }
 
 // _mm_clmulepi64_si128(a, b, 0x10)
-INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
+inline uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
 {
     // Small speedup. VectorGetHigh(a) ensures the high dword of 'a' is 0.
     // The 0 used in the vmull yields 0 for the high product, so the high
@@ -240,7 +232,7 @@ INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
 }
 
 // _mm_clmulepi64_si128(a, b, 0x11)
-INLINE uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
+inline uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
 {
     // Small speedup. VectorGetLow(a) ensures the high dword of 'a' is 0.
     // The 0 used in the vmull yields 0 for the high product, so the high
@@ -623,8 +615,7 @@ inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128
     */
     c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
     c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
-    c0 = _mm_srli_si128(c0, 8);
-    c0 = _mm_xor_si128(c0, c1);
+    c0 = _mm_xor_si128(c1, _mm_srli_si128(c0, 8));
     c0 = _mm_slli_epi64(c0, 1);
     c0 = _mm_clmulepi64_si128(c0, r, 0);
     c2 = _mm_xor_si128(c2, c0);
@@ -650,9 +641,8 @@ void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned
 {
     const __m128i r = _mm_set_epi32(0xc2000000, 0x00000000, 0xe1000000, 0x00000000);
     const __m128i m = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
-    const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), m);
+    __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), m), h = h0;
 
-    __m128i h = h0;
     unsigned int i;
     for (i=0; i
-INLINE T SwapWords(const T& data)
+inline T SwapWords(const T& data)
 {
     return (T)VectorRotateLeft<8>(data);
 }
 
-INLINE uint64x2_p LoadBuffer1(const byte *dataBuffer)
+inline uint64x2_p LoadBuffer1(const byte *dataBuffer)
 {
 #if CRYPTOPP_BIG_ENDIAN
     return (uint64x2_p)VectorLoad(dataBuffer);
@@ -846,7 +833,7 @@ INLINE uint64x2_p LoadBuffer1(const byte *dataBuffer)
 #endif
 }
 
-INLINE uint64x2_p LoadBuffer2(const byte *dataBuffer)
+inline uint64x2_p LoadBuffer2(const byte *dataBuffer)
 {
 #if CRYPTOPP_BIG_ENDIAN
     return (uint64x2_p)SwapWords(VectorLoadBE(dataBuffer));
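Note on the GCM_Reduce_CLMUL hunk: the single folded statement computes the same value as the two statements it replaces, because _mm_xor_si128 is commutative and the 8-byte shift of c0 is unchanged. The snippet below is a minimal standalone sketch checking that equivalence; it is not part of the patch, and the file name, test vectors and build line are illustrative only.

// Standalone equivalence check for the GCM_Reduce_CLMUL fold (illustrative,
// not part of gcm-simd.cpp). Build with, e.g.: g++ -msse2 -O2 reduce_fold.cpp
#include <emmintrin.h>
#include <cassert>
#include <cstdio>

static bool Equal128(__m128i a, __m128i b)
{
    // All 16 byte compares must match for the vectors to be equal.
    return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF;
}

int main()
{
    // Arbitrary test vectors standing in for the c0/c1 intermediates.
    const __m128i c0 = _mm_set_epi32(0x01234567, 0x09abcdef, 0x0f1e2d3c, 0x4b5a6978);
    const __m128i c1 = _mm_set_epi32(0x5eadbeef, 0x7eedface, 0x0badf00d, 0x3afebabe);

    // Old form: shift c0 right by 8 bytes, then XOR with c1.
    __m128i oldForm = _mm_srli_si128(c0, 8);
    oldForm = _mm_xor_si128(oldForm, c1);

    // New form from the patch: the same two operations folded into one statement.
    const __m128i newForm = _mm_xor_si128(c1, _mm_srli_si128(c0, 8));

    assert(Equal128(oldForm, newForm));
    std::printf("old and new forms agree\n");
    return 0;
}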