Remove INLINE used for debugging
We needed to switch inlining off manually. GDB was not stepping into code for us. No longer neededpull/703/head
parent
23e0ee44a0
commit
94eff2cdd6
41
gcm-simd.cpp
41
gcm-simd.cpp
|
|
@ -1,5 +1,7 @@
|
|||
// gcm-simd.cpp - written and placed in the public domain by
|
||||
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
|
||||
// Original x86 CLMUL by Wei Dai. ARM and POWER8
|
||||
// PMULL and VMULL by JW, UB and MR.
|
||||
//
|
||||
// This source file uses intrinsics to gain access to SSE4.2 and
|
||||
// ARMv8a CRC-32 and CRC-32C instructions. A separate source file
|
||||
|
|
@ -60,16 +62,6 @@
|
|||
#define UINT64X2_CAST(x) ((uint64x2_t *)(void *)(x))
|
||||
#define CONST_UINT64X2_CAST(x) ((const uint64x2_t *)(const void *)(x))
|
||||
|
||||
// Debugging on PowerPC
|
||||
#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
|
||||
# ifndef NDEBUG
|
||||
# undef INLINE
|
||||
# define INLINE
|
||||
# else
|
||||
# define INLINE inline
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char GCM_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
|
|
@ -190,7 +182,7 @@ using CryptoPP::VectorRotateLeft;
|
|||
// multiplies return a result {a,b}, while little-endian return
|
||||
// a result {b,a}. Since the multiply routines are reflective and
|
||||
// use LE the BE results need a fixup.
|
||||
INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
|
||||
inline uint64x2_p AdjustBE(const uint64x2_p& val)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return VectorRotateLeft<8>(val);
|
||||
|
|
@ -200,7 +192,7 @@ INLINE uint64x2_p AdjustBE(const uint64x2_p& val)
|
|||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x00)
|
||||
INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
|
||||
inline uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
#if defined(__xlc__) || defined(__xlC__)
|
||||
return AdjustBE(__vpmsumd (VectorGetHigh(a), VectorGetHigh(b)));
|
||||
|
|
@ -210,7 +202,7 @@ INLINE uint64x2_p VMULL_00(const uint64x2_p& a, const uint64x2_p& b)
|
|||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x01)
|
||||
INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
|
||||
inline uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
// Small speedup. VectorGetHigh(b) ensures the high dword of 'b' is 0.
|
||||
// The 0 used in the vmull yields 0 for the high product, so the high
|
||||
|
|
@ -225,7 +217,7 @@ INLINE uint64x2_p VMULL_01(const uint64x2_p& a, const uint64x2_p& b)
|
|||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x10)
|
||||
INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
|
||||
inline uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
// Small speedup. VectorGetHigh(a) ensures the high dword of 'a' is 0.
|
||||
// The 0 used in the vmull yields 0 for the high product, so the high
|
||||
|
|
@ -240,7 +232,7 @@ INLINE uint64x2_p VMULL_10(const uint64x2_p& a, const uint64x2_p& b)
|
|||
}
|
||||
|
||||
// _mm_clmulepi64_si128(a, b, 0x11)
|
||||
INLINE uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
|
||||
inline uint64x2_p VMULL_11(const uint64x2_p& a, const uint64x2_p& b)
|
||||
{
|
||||
// Small speedup. VectorGetLow(a) ensures the high dword of 'a' is 0.
|
||||
// The 0 used in the vmull yields 0 for the high product, so the high
|
||||
|
|
@ -623,8 +615,7 @@ inline __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128
|
|||
*/
|
||||
c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
|
||||
c0 = _mm_srli_si128(c0, 8);
|
||||
c0 = _mm_xor_si128(c0, c1);
|
||||
c0 = _mm_xor_si128(c1, _mm_srli_si128(c0, 8));
|
||||
c0 = _mm_slli_epi64(c0, 1);
|
||||
c0 = _mm_clmulepi64_si128(c0, r, 0);
|
||||
c2 = _mm_xor_si128(c2, c0);
|
||||
|
|
@ -650,9 +641,8 @@ void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned
|
|||
{
|
||||
const __m128i r = _mm_set_epi32(0xc2000000, 0x00000000, 0xe1000000, 0x00000000);
|
||||
const __m128i m = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
|
||||
const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), m);
|
||||
__m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), m), h = h0;
|
||||
|
||||
__m128i h = h0;
|
||||
unsigned int i;
|
||||
for (i=0; i<tableSize-32; i+=32)
|
||||
{
|
||||
|
|
@ -774,7 +764,7 @@ uint64x2_p GCM_Reduce_VMULL(uint64x2_p c0, uint64x2_p c1, uint64x2_p c2, uint64x
|
|||
return VectorXor(c2, c1);
|
||||
}
|
||||
|
||||
INLINE uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
|
||||
inline uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
|
||||
{
|
||||
const uint64x2_p c0 = VMULL_00(x, h);
|
||||
const uint64x2_p c1 = VectorXor(VMULL_01(x, h), VMULL_10(x, h));
|
||||
|
|
@ -783,7 +773,7 @@ INLINE uint64x2_p GCM_Multiply_VMULL(uint64x2_p x, uint64x2_p h, uint64x2_p r)
|
|||
return GCM_Reduce_VMULL(c0, c1, c2, r);
|
||||
}
|
||||
|
||||
INLINE uint64x2_p LoadHashKey(const byte *hashKey)
|
||||
inline uint64x2_p LoadHashKey(const byte *hashKey)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
const uint64x2_p key = (uint64x2_p)VectorLoad(hashKey);
|
||||
|
|
@ -807,19 +797,16 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
|
|||
for (i=0; i<tableSize-32; i+=32)
|
||||
{
|
||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||
|
||||
VectorStore(h, (byte*)temp);
|
||||
std::memcpy(mulTable+i, temp+0, 8);
|
||||
VectorStore(h1, mulTable+i+16);
|
||||
VectorStore(h, mulTable+i+8);
|
||||
VectorStore(h1, (byte*)temp);
|
||||
std::memcpy(mulTable+i+8, temp+0, 8);
|
||||
|
||||
h = GCM_Multiply_VMULL(h1, h0, r);
|
||||
}
|
||||
|
||||
const uint64x2_p h1 = GCM_Multiply_VMULL(h, h0, r);
|
||||
|
||||
VectorStore(h, (byte*)temp);
|
||||
std::memcpy(mulTable+i, temp+0, 8);
|
||||
VectorStore(h1, mulTable+i+16);
|
||||
|
|
@ -830,12 +817,12 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
|
|||
|
||||
// Swaps high and low 64-bit words
|
||||
template <class T>
|
||||
INLINE T SwapWords(const T& data)
|
||||
inline T SwapWords(const T& data)
|
||||
{
|
||||
return (T)VectorRotateLeft<8>(data);
|
||||
}
|
||||
|
||||
INLINE uint64x2_p LoadBuffer1(const byte *dataBuffer)
|
||||
inline uint64x2_p LoadBuffer1(const byte *dataBuffer)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return (uint64x2_p)VectorLoad(dataBuffer);
|
||||
|
|
@ -846,7 +833,7 @@ INLINE uint64x2_p LoadBuffer1(const byte *dataBuffer)
|
|||
#endif
|
||||
}
|
||||
|
||||
INLINE uint64x2_p LoadBuffer2(const byte *dataBuffer)
|
||||
inline uint64x2_p LoadBuffer2(const byte *dataBuffer)
|
||||
{
|
||||
#if CRYPTOPP_BIG_ENDIAN
|
||||
return (uint64x2_p)SwapWords(VectorLoadBE(dataBuffer));
|
||||
|
|
|
|||
Loading…
Reference in New Issue