From 93d56c76c1d530368372e00b23593ebda7b6696a Mon Sep 17 00:00:00 2001 From: weidai Date: Thu, 31 Jul 2003 01:54:53 +0000 Subject: [PATCH] enable SSE2 intrinsics on GCC 3.3 or later --- GNUmakefile | 6 + integer.cpp | 1482 +++++++++++++++++++++++++-------------------------- integer.h | 32 +- 3 files changed, 738 insertions(+), 782 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 679f20ea..15fe037f 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -9,6 +9,12 @@ CXXFLAGS = -g ARFLAGS = -cr # ar needs the dash on OpenBSD RANLIB = ranlib UNAME = $(shell uname) +ISX86 = $(shell uname -m | grep -c "i.86") +GCC33ORLATER = $(shell gcc -v 2>&1 | grep -c "gcc version \(3.[3-9]\|[4-9]\)") + +ifeq ($(ISX86) $(GCC33ORLATER),1 1) +CXXFLAGS += -msse2 +endif ifeq ($(UNAME),) # for DJGPP, where uname doesn't exist CXXFLAGS += -mbnu210 diff --git a/integer.cpp b/integer.cpp index f5b5fc47..93539dd0 100644 --- a/integer.cpp +++ b/integer.cpp @@ -18,9 +18,16 @@ #include #ifdef SSE2_INTRINSICS_AVAILABLE -#include + #ifdef __GNUC__ + #include + #include + #else + #include + #endif #elif defined(_MSC_VER) && defined(_M_IX86) -#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") + #pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") +#elif defined(__GNUC__) && defined(__i386__) + #pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.") #endif NAMESPACE_BEGIN(CryptoPP) @@ -41,7 +48,11 @@ CPP_TYPENAME AllocatorBase::pointer AlignedAllocator::allocate(size_type n { #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - return (T *)_mm_malloc(sizeof(T)*n, 16); + #ifdef __GNUC__ + return (T *)memalign(16, sizeof(T)*n); + #else + return (T *)_mm_malloc(sizeof(T)*n, 16); + #endif else #endif return new T[n]; @@ -53,10 +64,14 @@ void AlignedAllocator::deallocate(void *p, size_type n) memset(p, 0, n*sizeof(T)); #ifdef SSE2_INTRINSICS_AVAILABLE if (n >= 4) - _mm_free(p); + #ifdef __GNUC__ + free(p); + #else + _mm_free(p); + #endif else #endif - delete [] p; + delete [] (T *)p; } #endif @@ -640,6 +655,13 @@ void Portable::Square2(word *R, const word *A) void Portable::Square4(word *R, const word *A) { +#ifdef _MSC_VER + // VC60 workaround: MSVC 6.0 has an optimization bug that makes + // (dword)A*B where either A or B has been cast to a dword before + // very expensive. Revisit this function when this + // bug is fixed. 
+ Multiply4(R, A, A); +#else const word *B = A; DWord p, q; word c, d, e; @@ -666,6 +688,7 @@ void Portable::Square4(word *R, const word *A) p = DWord::MultiplyAndAdd(A[3], A[3], d); R[6] = p.GetLowHalf(); R[7] = e + p.GetHighHalf(); +#endif } void Portable::Multiply8(word *R, const word *A, const word *B) @@ -834,800 +857,289 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B) #undef SaveSquAcc // CodeWarrior defines _MSC_VER -#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700) +#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__)) class PentiumOptimized : public Portable { public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); -// TODO test this with .NET #if _MSC_VER < 1300 - static inline void Square4(word *R, const word *A) - { - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. - Multiply4(R, A, A); - } -//#endif -}; - -typedef PentiumOptimized LowLevel; - -__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - push ebp - push ebx - push esi - push edi - - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B - - // now: ebx = B, ecx = C, edx = A, esi = N - - sub ecx, edx // hold the distance between C & A so we can add this to A to get C - xor eax, eax // clear eax - - sub eax, esi // eax is a negative index from end of B - lea ebx, [ebx+4*esi] // ebx is end of B - - sar eax, 1 // unit of eax is now dwords; this also clears the carry flag - jz loopend // if no dwords then nothing to do - -loopstart: - mov esi,[edx] // load lower word of A - mov ebp,[edx+4] // load higher word of A - - mov edi,[ebx+8*eax] // load lower word of B - lea edx,[edx+8] // advance A and C - - adc esi,edi // add lower words - mov edi,[ebx+8*eax+4] // load higher word of B - - adc ebp,edi // add higher words - inc eax // advance B - - mov [edx+ecx-8],esi // store lower word result - mov [edx+ecx-4],ebp // store higher word result - - jnz loopstart // loop until eax overflows and becomes zero - -loopend: - adc eax, 0 // store carry into eax (return result register) - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } -} - -__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - push ebp - push ebx - push esi - push edi - - mov esi, [esp+24] ; N - mov ebx, [esp+20] ; B - - sub ecx, edx - xor eax, eax - - sub eax, esi - lea ebx, [ebx+4*esi] - - sar eax, 1 - jz loopend - -loopstart: - mov esi,[edx] - mov ebp,[edx+4] - - mov edi,[ebx+8*eax] - lea edx,[edx+8] - - sbb esi,edi - mov edi,[ebx+8*eax+4] - - sbb ebp,edi - inc eax - - mov [edx+ecx-8],esi - mov [edx+ecx-4],ebp - - jnz loopstart - -loopend: - adc eax, 0 - pop edi - pop esi - pop ebx - pop ebp - ret 8 - } -} - -#ifdef SSE2_INTRINSICS_AVAILABLE - -static bool GetSSE2Capability() -{ - word32 b; - - __asm - { - mov eax, 1 - cpuid - mov b, edx - } - - return (b & (1 << 26)) != 0; -} - -bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; - -void DisableSSE2() -{ - g_sse2Enabled = false; -} - -static inline bool HasSSE2() -{ - if (g_sse2Enabled && !g_sse2DetectionDone) - { - g_sse2Detected = GetSSE2Capability(); - 
g_sse2DetectionDone = true; - } - return g_sse2Enabled && g_sse2Detected; -} - -class P4Optimized : public PentiumOptimized -{ -public: - static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N); - static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static inline void Square4(word *R, const word *A) - { - Multiply4(R, A, A); - } - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; - -static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) -{ - __m128i a3210 = _mm_load_si128(A); - __m128i b3210 = _mm_load_si128(B); - - __m128i sum; - - __m128i z = _mm_setzero_si128(); - __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); - C[0] = a2b2_a0b0; - - __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); - __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); - __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); - __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); - C[1] = _mm_add_epi64(a1b0, a0b1); - - __m128i a31 = _mm_srli_epi64(a3210, 32); - __m128i b31 = _mm_srli_epi64(b3210, 32); - __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); - C[6] = a3b3_a1b1; - - __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); - __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); - __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); - __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); - sum = _mm_add_epi64(a1b1, a0b2); - C[2] = _mm_add_epi64(sum, a2b0); - - __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); - __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); - __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); - __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); - __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); - __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); - __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); - __m128i sum1 = _mm_add_epi64(a3b0, a1b2); - sum = _mm_add_epi64(a2b1, a0b3); - C[3] = _mm_add_epi64(sum, sum1); - - __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); - __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); - __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); - __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); - sum = _mm_add_epi64(a2b2, a3b1); - C[4] = _mm_add_epi64(sum, a1b3); - - __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); - __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); - __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); - __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); - __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); - C[5] = _mm_add_epi64(a3b2, a2b3); -} - -void P4Optimized::Multiply4(word *C, const word *A, const word *B) -{ - __m128i temp[7]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - C[0] = w[0]; - - __m64 s1, s2; - - __m64 w1 = _m_from_int(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); 
- s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w18, w20); - s1 = _mm_add_si64(s1, s2); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w22, w26); - s1 = _mm_add_si64(s1, s2); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - C[7] = _m_to_int(s1) + w[27]; - _mm_empty(); -} - -void P4Optimized::Multiply8(word *C, const word *A, const word *B) -{ - __m128i temp[28]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - const word *z = (word *)temp+7*4*3; - const __m64 *mz = (__m64 *)z; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _m_from_int(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); - __m64 w27 = _m_from_int(w[27]); - - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - __m64 x10 = mx[5]; - __m64 x12 = mx[6]; - __m64 x14 = mx[7]; - __m64 x16 = mx[8]; - __m64 x18 = mx[9]; - __m64 x20 = mx[10]; - __m64 x22 = mx[11]; - __m64 x26 = _m_from_int(x[26]); - __m64 x27 = _m_from_int(x[27]); - - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - __m64 y10 = my[5]; - __m64 y12 = my[6]; - __m64 y14 = my[7]; - __m64 y16 = my[8]; - __m64 y18 = my[9]; - __m64 y20 = my[10]; - __m64 y22 = my[11]; - __m64 y26 = _m_from_int(y[26]); - __m64 y27 = _m_from_int(y[27]); - - __m64 z0 = _m_from_int(z[0]); - __m64 z1 = _m_from_int(z[1]); - __m64 z4 = mz[2]; - __m64 z6 = mz[3]; - __m64 z8 = mz[4]; - __m64 z10 = mz[5]; - __m64 z12 = mz[6]; - __m64 z14 = mz[7]; - __m64 z16 = mz[8]; - __m64 z18 = mz[9]; - __m64 z20 = mz[10]; - __m64 z22 = mz[11]; - __m64 z26 = _m_from_int(z[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x10, y10); - s4 = _mm_add_si64(x12, y12); - s1 = _mm_add_si64(s1, w27); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[7] = _m_to_int(s1); - s1 = 
_m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x14, y14); - s4 = _mm_add_si64(x16, y16); - s1 = _mm_add_si64(s1, z0); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[8] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x18, y18); - s4 = _mm_add_si64(x20, y20); - s1 = _mm_add_si64(s1, z1); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z4); - s1 = _mm_add_si64(s1, s3); - C[9] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x22, y22); - s4 = _mm_add_si64(x26, y26); - s1 = _mm_add_si64(s1, z6); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z8); - s1 = _mm_add_si64(s1, s3); - C[10] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x27, y27); - s1 = _mm_add_si64(s1, z10); - s1 = _mm_add_si64(s1, z12); - s1 = _mm_add_si64(s1, s3); - C[11] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(z14, z16); - s1 = _mm_add_si64(s1, s3); - C[12] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(z18, z20); - s1 = _mm_add_si64(s1, s3); - C[13] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(z22, z26); - s1 = _mm_add_si64(s1, s3); - C[14] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - C[15] = z[27] + _m_to_int(s1); - _mm_empty(); -} - -void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) -{ - __m128i temp[21]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - - P4_Mul(temp, (__m128i *)A, (__m128i *)B); - - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); - - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); - - C[0] = w[0]; - - __m64 s1, s2, s3, s4; - - __m64 w1 = _m_from_int(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _m_from_int(w[26]); - - __m64 x0 = _m_from_int(x[0]); - __m64 x1 = _m_from_int(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - - __m64 y0 = _m_from_int(y[0]); - __m64 y1 = _m_from_int(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - - s1 = _mm_add_si64(w1, w4); - C[1] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _m_to_int(s1); - s1 = _m_psrlqi(s1, 32); - - C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; - _mm_empty(); -} - -__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] 
// N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - add edi, eax - jc carry1 - - xor eax, eax - -carry1continue: - add edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - add edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - add edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - -__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) -{ - __asm - { - sub esp, 16 - xor eax, eax - mov [esp], edi - mov [esp+4], esi - mov [esp+8], ebx - mov [esp+12], ebp - - mov ebx, [esp+20] // B - mov esi, [esp+24] // N - - // now: ebx = B, ecx = C, edx = A, esi = N - - neg esi - jz loopend // if no dwords then nothing to do - - mov edi, [edx] - mov ebp, [ebx] - -loopstart: - sub edi, eax - jc carry1 - - xor eax, eax - -carry1continue: - sub edi, ebp - mov ebp, 1 - mov [ecx], edi - mov edi, [edx+4] - cmovc eax, ebp - mov ebp, [ebx+4] - lea ebx, [ebx+8] - sub edi, eax - jc carry2 - - xor eax, eax - -carry2continue: - sub edi, ebp - mov ebp, 1 - cmovc eax, ebp - mov [ecx+4], edi - add ecx, 8 - mov edi, [edx+8] - add edx, 8 - add esi, 2 - mov ebp, [ebx] - jnz loopstart - -loopend: - mov edi, [esp] - mov esi, [esp+4] - mov ebx, [esp+8] - mov ebp, [esp+12] - add esp, 16 - ret 8 - -carry1: - mov eax, 1 - jmp carry1continue - -carry2: - mov eax, 1 - jmp carry2continue - } -} - -#endif // #ifdef SSE2_INTRINSICS_AVAILABLE - -#elif defined(__GNUC__) && defined(__i386__) - -class PentiumOptimized : public Portable -{ -public: -#ifndef __pic__ // -fpic uses up a register, leaving too few for the asm code - static word Add(word *C, const word *A, const word *B, unsigned int N); - static word Subtract(word *C, const word *A, const word *B, unsigned int N); -#endif + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef __GNUC__ static void Square4(word *R, const word *A); static void Multiply4(word *C, const word *A, const word *B); static void Multiply8(word *C, const word *A, const word *B); +#endif }; typedef PentiumOptimized LowLevel; -// Add and Subtract assembly code originally contributed by Alister Lee - -#ifndef __pic__ -__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +// this may be selected at run time +class P4Optimized : public PentiumOptimized { - assert (N%2 == 0); +public: + static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); + static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); +#ifdef SSE2_INTRINSICS_AVAILABLE + static void Multiply4(word *C, const word *A, const word *B); + static void Multiply8(word *C, const word *A, const word *B); + static void Multiply8Bottom(word *C, const word *A, const word *B); + static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);} +#endif +}; - register word carry, temp; +// use some tricks to share assembly code between MSVC and GCC +#ifdef _MSC_VER + 
#define CRYPTOPP_NAKED __declspec(naked) + #define AS1(x) __asm x + #define AS2(x, y) __asm x, y + #define PentiumPrologue \ + __asm push ebp \ + __asm push ebx \ + __asm push esi \ + __asm push edi \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define PentiumEpilogue \ + __asm pop edi \ + __asm pop esi \ + __asm pop ebx \ + __asm pop ebp \ + __asm ret + #define P4Prologue \ + __asm sub esp, 16 \ + __asm mov [esp], edi \ + __asm mov [esp+4], esi \ + __asm mov [esp+8], ebx \ + __asm mov [esp+12], ebp \ + __asm mov ecx, [esp+20] \ + __asm mov edx, [esp+24] \ + __asm mov ebx, [esp+28] \ + __asm mov esi, [esp+32] + #define P4Epilogue \ + __asm mov edi, [esp] \ + __asm mov esi, [esp+4] \ + __asm mov ebx, [esp+8] \ + __asm mov ebp, [esp+12] \ + __asm add esp, 16 \ + __asm ret +#else + #define CRYPTOPP_NAKED + #define AS1(x) #x ";" + #define AS2(x, y) #x ", " #y ";" + #define PentiumPrologue \ + __asm__ \ + ( \ + ".att_syntax prefix;" \ + "mov %0, %%ecx;" \ + "mov %1, %%edx;" \ + "mov %2, %%ebx;" \ + "mov %3, %%esi;" \ + ".intel_syntax noprefix;" + #define PentiumEpilogue \ + ".att_syntax prefix;" \ + : \ + : "m" (C), "m" (A), "m" (B), "m" (N) \ + : "%ecx", "%edx", "%ebx", "%esi", "%edi" \ + ); + #define P4Prologue PentiumPrologue + #define P4Epilogue PentiumEpilogue +#endif - __asm__ __volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" +CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) +{ + PentiumPrologue - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "adc %5, %4;" - "mov 4(%1,%0,8), %5;" - "adc %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - "1:;" - "adc $0, %0;" - "pop %%ebp;" + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendAdd) // if no dwords then nothing to do - return carry; + AS1(loopstartAdd:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A + + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C + + AS2( adc esi,edi) // add lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B + + AS2( adc ebp,edi) // add higher words + AS1( inc eax) // advance B + + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result + + AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero + + AS1(loopendAdd:) + AS2( adc eax, 0) // store carry into eax (return result register) + + PentiumEpilogue } -__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) { - assert (N%2 == 0); + PentiumPrologue - register word carry, temp; + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C + AS2( xor eax, eax) // clear eax - __asm__ 
__volatile__( - "push %%ebp;" - "sub %3, %2;" - "xor %0, %0;" - "sub %4, %0;" - "lea (%1,%4,4), %1;" - "sar $1, %0;" - "jz 1f;" + AS2( sub eax, esi) // eax is a negative index from end of B + AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - "0:;" - "mov 0(%3), %4;" - "mov 4(%3), %%ebp;" - "mov (%1,%0,8), %5;" - "lea 8(%3), %3;" - "sbb %5, %4;" - "mov 4(%1,%0,8), %5;" - "sbb %5, %%ebp;" - "inc %0;" - "mov %4, -8(%3, %2);" - "mov %%ebp, -4(%3, %2);" - "jnz 0b;" + AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag + AS1( jz loopendSub) // if no dwords then nothing to do - "1:;" - "adc $0, %0;" - "pop %%ebp;" + AS1(loopstartSub:) + AS2( mov esi,[edx]) // load lower word of A + AS2( mov ebp,[edx+4]) // load higher word of A - : "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp) - : : "cc", "memory"); + AS2( mov edi,[ebx+8*eax]) // load lower word of B + AS2( lea edx,[edx+8]) // advance A and C - return carry; + AS2( sbb esi,edi) // subtract lower words + AS2( mov edi,[ebx+8*eax+4]) // load higher word of B + + AS2( sbb ebp,edi) // subtract higher words + AS1( inc eax) // advance B + + AS2( mov [edx+ecx-8],esi) // store lower word result + AS2( mov [edx+ecx-4],ebp) // store higher word result + + AS1( jnz loopstartSub) // loop until eax overflows and becomes zero + + AS1(loopendSub:) + AS2( adc eax, 0) // store carry into eax (return result register) + + PentiumEpilogue } -#endif // __pic__ +CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) +{ + P4Prologue + + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz loopendAddP4) // if no dwords then nothing to do + + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) + + AS1(loopstartAddP4:) + AS2( add edi, eax) + AS1( jc carry1AddP4) + + AS2( xor eax, eax) + + AS1(carry1continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( add edi, eax) + AS1( jc carry2AddP4) + + AS2( xor eax, eax) + + AS1(carry2continueAddP4:) + AS2( add edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartAddP4) + AS1( jmp loopendAddP4) + + AS1(carry1AddP4:) + AS2( mov eax, 1) + AS1( jmp carry1continueAddP4) + + AS1(carry2AddP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueAddP4) + + AS1(loopendAddP4:) + + P4Epilogue +} + +CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) +{ + P4Prologue + + // now: ebx = B, ecx = C, edx = A, esi = N + AS2( xor eax, eax) + AS1( neg esi) + AS1( jz loopendSubP4) // if no dwords then nothing to do + + AS2( mov edi, [edx]) + AS2( mov ebp, [ebx]) + + AS1(loopstartSubP4:) + AS2( sub edi, eax) + AS1( jc carry1SubP4) + + AS2( xor eax, eax) + + AS1(carry1continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( mov [ecx], edi) + AS2( mov edi, [edx+4]) + AS2( cmovc eax, ebp) + AS2( mov ebp, [ebx+4]) + AS2( lea ebx, [ebx+8]) + AS2( sub edi, eax) + AS1( jc carry2SubP4) + + AS2( xor eax, eax) + + AS1(carry2continueSubP4:) + AS2( sub edi, ebp) + AS2( mov ebp, 1) + AS2( cmovc eax, ebp) + AS2( mov [ecx+4], edi) + AS2( add ecx, 8) + AS2( mov edi, [edx+8]) + AS2( add edx, 8) + AS2( add esi, 2) + AS2( mov ebp, [ebx]) + AS1( jnz loopstartSubP4) + AS1( jmp loopendSubP4) + + AS1(carry1SubP4:) + AS2( mov eax, 1) + AS1( jmp 
carry1continueSubP4) + + AS1(carry2SubP4:) + AS2( mov eax, 1) + AS1( jmp carry2continueSubP4) + + AS1(loopendSubP4:) + + P4Epilogue +} + +#if __GNUC__ // Comba square and multiply assembly code originally contributed by Leonard Janke +// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code. #define SqrStartup \ "push %%ebp\n\t" \ @@ -1926,12 +1438,442 @@ void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) ); } -#else // no processor specific code at this layer +#endif // __GNUC__ + +#else // not x86 - no processor specific code at this layer typedef Portable LowLevel; #endif +bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true; + +void DisableSSE2() +{ + g_sse2Enabled = false; +} + +#ifdef SSE2_INTRINSICS_AVAILABLE + +static bool GetSSE2Capability() +{ + word32 b; + +#ifdef __GNUC__ + __asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx"); +#else + __asm + { + mov eax, 1 + cpuid + mov b, edx + } +#endif + + return (b & (1 << 26)) != 0; +} + +static inline bool HasSSE2() +{ + if (g_sse2Enabled && !g_sse2DetectionDone) + { + g_sse2Detected = GetSSE2Capability(); + g_sse2DetectionDone = true; + } + return g_sse2Enabled && g_sse2Detected; +} + +#ifdef __GNUC__ +#define __fastcall +#endif + +static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) +{ + __m128i a3210 = _mm_load_si128(A); + __m128i b3210 = _mm_load_si128(B); + + __m128i sum; + + __m128i z = _mm_setzero_si128(); + __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); + C[0] = a2b2_a0b0; + + __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); + __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); + __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); + __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); + C[1] = _mm_add_epi64(a1b0, a0b1); + + __m128i a31 = _mm_srli_epi64(a3210, 32); + __m128i b31 = _mm_srli_epi64(b3210, 32); + __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); + C[6] = a3b3_a1b1; + + __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); + __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); + __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); + __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); + __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); + sum = _mm_add_epi64(a1b1, a0b2); + C[2] = _mm_add_epi64(sum, a2b0); + + __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); + __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); + __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); + __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); + __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); + __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); + __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); + __m128i sum1 = _mm_add_epi64(a3b0, a1b2); + sum = _mm_add_epi64(a2b1, a0b3); + C[3] = _mm_add_epi64(sum, sum1); + + __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); + __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); + __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); + __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); + sum = _mm_add_epi64(a2b2, a3b1); + C[4] = _mm_add_epi64(sum, a1b3); + + __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); + __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); + __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); + __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); + __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); + C[5] = 
_mm_add_epi64(a3b2, a2b3); +} + +void P4Optimized::Multiply4(word *C, const word *A, const word *B) +{ + __m128i temp[7]; + const word *w = (word *)temp; + const __m64 *mw = (__m64 *)w; + + P4_Mul(temp, (__m128i *)A, (__m128i *)B); + + C[0] = w[0]; + + __m64 s1, s2; + + __m64 w1 = _mm_cvtsi32_si64(w[1]); + __m64 w4 = mw[2]; + __m64 w6 = mw[3]; + __m64 w8 = mw[4]; + __m64 w10 = mw[5]; + __m64 w12 = mw[6]; + __m64 w14 = mw[7]; + __m64 w16 = mw[8]; + __m64 w18 = mw[9]; + __m64 w20 = mw[10]; + __m64 w22 = mw[11]; + __m64 w26 = _mm_cvtsi32_si64(w[26]); + + s1 = _mm_add_si64(w1, w4); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w6, w8); + s1 = _mm_add_si64(s1, s2); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w10, w12); + s1 = _mm_add_si64(s1, s2); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s2); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w18, w20); + s1 = _mm_add_si64(s1, s2); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w22, w26); + s1 = _mm_add_si64(s1, s2); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + C[7] = _mm_cvtsi64_si32(s1) + w[27]; + _mm_empty(); +} + +void P4Optimized::Multiply8(word *C, const word *A, const word *B) +{ + __m128i temp[28]; + const word *w = (word *)temp; + const __m64 *mw = (__m64 *)w; + const word *x = (word *)temp+7*4; + const __m64 *mx = (__m64 *)x; + const word *y = (word *)temp+7*4*2; + const __m64 *my = (__m64 *)y; + const word *z = (word *)temp+7*4*3; + const __m64 *mz = (__m64 *)z; + + P4_Mul(temp, (__m128i *)A, (__m128i *)B); + + P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); + + P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + + P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); + + C[0] = w[0]; + + __m64 s1, s2, s3, s4; + + __m64 w1 = _mm_cvtsi32_si64(w[1]); + __m64 w4 = mw[2]; + __m64 w6 = mw[3]; + __m64 w8 = mw[4]; + __m64 w10 = mw[5]; + __m64 w12 = mw[6]; + __m64 w14 = mw[7]; + __m64 w16 = mw[8]; + __m64 w18 = mw[9]; + __m64 w20 = mw[10]; + __m64 w22 = mw[11]; + __m64 w26 = _mm_cvtsi32_si64(w[26]); + __m64 w27 = _mm_cvtsi32_si64(w[27]); + + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); + __m64 x4 = mx[2]; + __m64 x6 = mx[3]; + __m64 x8 = mx[4]; + __m64 x10 = mx[5]; + __m64 x12 = mx[6]; + __m64 x14 = mx[7]; + __m64 x16 = mx[8]; + __m64 x18 = mx[9]; + __m64 x20 = mx[10]; + __m64 x22 = mx[11]; + __m64 x26 = _mm_cvtsi32_si64(x[26]); + __m64 x27 = _mm_cvtsi32_si64(x[27]); + + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); + __m64 y4 = my[2]; + __m64 y6 = my[3]; + __m64 y8 = my[4]; + __m64 y10 = my[5]; + __m64 y12 = my[6]; + __m64 y14 = my[7]; + __m64 y16 = my[8]; + __m64 y18 = my[9]; + __m64 y20 = my[10]; + __m64 y22 = my[11]; + __m64 y26 = _mm_cvtsi32_si64(y[26]); + __m64 y27 = _mm_cvtsi32_si64(y[27]); + + __m64 z0 = _mm_cvtsi32_si64(z[0]); + __m64 z1 = _mm_cvtsi32_si64(z[1]); + __m64 z4 = mz[2]; + __m64 z6 = mz[3]; + __m64 z8 = mz[4]; + __m64 z10 = mz[5]; + __m64 z12 = mz[6]; + __m64 z14 = mz[7]; + __m64 z16 = mz[8]; + __m64 z18 = mz[9]; + __m64 z20 = mz[10]; + __m64 z22 = mz[11]; + __m64 z26 = _mm_cvtsi32_si64(z[26]); + + s1 = _mm_add_si64(w1, w4); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w6, w8); + s1 = _mm_add_si64(s1, s2); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w10, w12); + s1 = 
_mm_add_si64(s1, s2); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x0, y0); + s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s3); + s1 = _mm_add_si64(s1, s2); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x1, y1); + s4 = _mm_add_si64(x4, y4); + s1 = _mm_add_si64(s1, w18); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w20); + s1 = _mm_add_si64(s1, s3); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x6, y6); + s4 = _mm_add_si64(x8, y8); + s1 = _mm_add_si64(s1, w22); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w26); + s1 = _mm_add_si64(s1, s3); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x10, y10); + s4 = _mm_add_si64(x12, y12); + s1 = _mm_add_si64(s1, w27); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, s3); + C[7] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x14, y14); + s4 = _mm_add_si64(x16, y16); + s1 = _mm_add_si64(s1, z0); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, s3); + C[8] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x18, y18); + s4 = _mm_add_si64(x20, y20); + s1 = _mm_add_si64(s1, z1); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, z4); + s1 = _mm_add_si64(s1, s3); + C[9] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x22, y22); + s4 = _mm_add_si64(x26, y26); + s1 = _mm_add_si64(s1, z6); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, z8); + s1 = _mm_add_si64(s1, s3); + C[10] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x27, y27); + s1 = _mm_add_si64(s1, z10); + s1 = _mm_add_si64(s1, z12); + s1 = _mm_add_si64(s1, s3); + C[11] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(z14, z16); + s1 = _mm_add_si64(s1, s3); + C[12] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(z18, z20); + s1 = _mm_add_si64(s1, s3); + C[13] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(z22, z26); + s1 = _mm_add_si64(s1, s3); + C[14] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + C[15] = z[27] + _mm_cvtsi64_si32(s1); + _mm_empty(); +} + +void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) +{ + __m128i temp[21]; + const word *w = (word *)temp; + const __m64 *mw = (__m64 *)w; + const word *x = (word *)temp+7*4; + const __m64 *mx = (__m64 *)x; + const word *y = (word *)temp+7*4*2; + const __m64 *my = (__m64 *)y; + + P4_Mul(temp, (__m128i *)A, (__m128i *)B); + + P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); + + P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + + C[0] = w[0]; + + __m64 s1, s2, s3, s4; + + __m64 w1 = _mm_cvtsi32_si64(w[1]); + __m64 w4 = mw[2]; + __m64 w6 = mw[3]; + __m64 w8 = mw[4]; + __m64 w10 = mw[5]; + __m64 w12 = mw[6]; + __m64 w14 = mw[7]; + __m64 w16 = mw[8]; + __m64 w18 = mw[9]; + __m64 w20 = mw[10]; + __m64 w22 = mw[11]; + __m64 w26 = _mm_cvtsi32_si64(w[26]); + + __m64 x0 = _mm_cvtsi32_si64(x[0]); + __m64 x1 = _mm_cvtsi32_si64(x[1]); + __m64 x4 = mx[2]; + __m64 x6 = mx[3]; + __m64 x8 = mx[4]; + + __m64 y0 = _mm_cvtsi32_si64(y[0]); + __m64 y1 = _mm_cvtsi32_si64(y[1]); + __m64 y4 = my[2]; + __m64 y6 = my[3]; + __m64 y8 = my[4]; + + s1 = _mm_add_si64(w1, w4); + C[1] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = _mm_add_si64(w6, w8); + s1 = _mm_add_si64(s1, s2); + C[2] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s2 = 
_mm_add_si64(w10, w12); + s1 = _mm_add_si64(s1, s2); + C[3] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x0, y0); + s2 = _mm_add_si64(w14, w16); + s1 = _mm_add_si64(s1, s3); + s1 = _mm_add_si64(s1, s2); + C[4] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x1, y1); + s4 = _mm_add_si64(x4, y4); + s1 = _mm_add_si64(s1, w18); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w20); + s1 = _mm_add_si64(s1, s3); + C[5] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + s3 = _mm_add_si64(x6, y6); + s4 = _mm_add_si64(x8, y8); + s1 = _mm_add_si64(s1, w22); + s3 = _mm_add_si64(s3, s4); + s1 = _mm_add_si64(s1, w26); + s1 = _mm_add_si64(s1, s3); + C[6] = _mm_cvtsi64_si32(s1); + s1 = _mm_srli_si64(s1, 32); + + C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; + _mm_empty(); +} + +#endif // #ifdef SSE2_INTRINSICS_AVAILABLE + // ******************************************************** #define A0 A diff --git a/integer.h b/integer.h index 72a962e3..9006278a 100644 --- a/integer.h +++ b/integer.h @@ -10,21 +10,28 @@ #include #ifdef _M_IX86 -# if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 500)) || (defined(__ICL) && (__ICL >= 500)) -# define SSE2_INTRINSICS_AVAILABLE -# elif defined(_MSC_VER) + #if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 500)) || (defined(__ICL) && (__ICL >= 500)) + #define SSE2_INTRINSICS_AVAILABLE + #elif defined(_MSC_VER) // _mm_free seems to be the only way to tell if the Processor Pack is installed or not -# include -# if defined(_mm_free) -# define SSE2_INTRINSICS_AVAILABLE -# endif -# endif + #include + #if defined(_mm_free) + #define SSE2_INTRINSICS_AVAILABLE + #endif + #endif +#endif + +// SSE2 intrinsics work in GCC 3.3 or later +#if defined(__SSE2__) && (__GNUC_MAJOR__ > 3 || __GNUC_MINOR__ > 2) + #define SSE2_INTRINSICS_AVAILABLE #endif NAMESPACE_BEGIN(CryptoPP) #if defined(SSE2_INTRINSICS_AVAILABLE) || defined(_MSC_VER) - + // Defined this class for MSVC even if processor pack is not installed, + // so that the library can be compiled with processor pack, and calling app + // compiled without it. template class AlignedAllocator : public AllocatorBase { @@ -38,15 +45,16 @@ NAMESPACE_BEGIN(CryptoPP) return StandardReallocate(*this, p, oldSize, newSize, preserve); } }; -template class CRYPTOPP_DLL AlignedAllocator; - typedef SecBlock > SecAlignedWordBlock; -void CRYPTOPP_DLL DisableSSE2(); + template class CRYPTOPP_DLL AlignedAllocator; + typedef SecBlock > SecAlignedWordBlock; #else typedef SecWordBlock SecAlignedWordBlock; #endif +void CRYPTOPP_DLL DisableSSE2(); + //! multiple precision integer and basic arithmetics /*! This class can represent positive and negative integers with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)).
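
The sketch below (not part of the patch) illustrates the runtime SSE2-detection pattern that this change introduces in integer.cpp: CPUID is probed once, the result is cached in a pair of flags, and DisableSSE2() lets the calling application force the non-SSE2 code path. HasSSE2() and DisableSSE2() mirror the names used in the patch; ProbeSSE2() and the s_* variables are illustrative stand-ins, and the inline assembly assumes a 32-bit x86 target built with GCC (ebx is saved around CPUID so the sketch also works under -fPIC).

#include <stdint.h>

static bool s_detectionDone = false;   // has CPUID been probed yet?
static bool s_hasSSE2 = false;         // cached CPUID result
static bool s_sse2Enabled = true;      // cleared by DisableSSE2()

// Execute CPUID leaf 1 and test feature bit 26 (SSE2) in EDX.
static bool ProbeSSE2()
{
    uint32_t eaxOut, edxFeatures;
    __asm__ __volatile__(
        "pushl %%ebx\n\t"   // cpuid clobbers ebx; preserve it for PIC builds
        "cpuid\n\t"
        "popl %%ebx"
        : "=a" (eaxOut), "=d" (edxFeatures)
        : "0" (1)           // eax = 1 selects the feature-flags leaf
        : "ecx", "cc");
    return (edxFeatures & (1u << 26)) != 0;
}

void DisableSSE2()
{
    s_sse2Enabled = false;
}

bool HasSSE2()
{
    if (s_sse2Enabled && !s_detectionDone)
    {
        s_hasSSE2 = ProbeSSE2();
        s_detectionDone = true;
    }
    return s_sse2Enabled && s_hasSSE2;
}

With a check like this available, a caller can pick an implementation at run time, for example:

    word carry = HasSSE2() ? P4Optimized::Add(C, A, B, N)
                           : PentiumOptimized::Add(C, A, B, N);

The exact dispatch points in integer.cpp are outside the hunks shown here; the line above is only a usage illustration of the HasSSE2()/DisableSSE2() pair under the assumption that both implementation classes are compiled in.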