diff --git a/integer.cpp b/integer.cpp
index 0c5018ee..515643ed 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -14,30 +14,20 @@
 #include "algparam.h"
 #include "pubkey.h"	// for P1363_KDF2
 #include "sha.h"
+#include "cpu.h"
 
 #include <iostream>
 
-#ifdef _M_X64
-#include <intrin.h>
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	#include <intrin.h>
 #endif
 
-#ifdef SSE2_INTRINSICS_AVAILABLE
-	#ifdef __GNUC__
-		#include <xmmintrin.h>
-		#include <signal.h>
-		#include <setjmp.h>
-		#ifdef CRYPTOPP_MEMALIGN_AVAILABLE
-			#include <malloc.h>
-		#else
-			#include <stdlib.h>
-		#endif
-	#else
-		#include <emmintrin.h>
-	#endif
-#elif defined(_MSC_VER) && defined(_M_IX86)
-	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
-#elif defined(__GNUC__) && defined(__i386__)
-	#warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
+#ifdef __DECCXX
+	#include <c_asm.h>
+#endif
+
+#ifdef CRYPTOPP_MSVC6_NO_PP
+	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
 #endif
 
 NAMESPACE_BEGIN(CryptoPP)
 
@@ -50,67 +40,7 @@ bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const v
 	return true;
 }
 
-#ifdef SSE2_INTRINSICS_AVAILABLE
-template <class T>
-CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(size_type n, const void *)
-{
-	CheckSize(n);
-	if (n == 0)
-		return NULL;
-	if (n >= 4)
-	{
-		void *p;
-		#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
-		while (!(p = _mm_malloc(sizeof(T)*n, 16)))
-		#elif defined(CRYPTOPP_MEMALIGN_AVAILABLE)
-		while (!(p = memalign(16, sizeof(T)*n)))
-		#elif defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16)
-		while (!(p = malloc(sizeof(T)*n)))
-		#else
-		while (!(p = (byte *)malloc(sizeof(T)*n + 8)))	// assume malloc alignment is at least 8
-		#endif
-			CallNewHandler();
-
-		#ifdef CRYPTOPP_NO_ALIGNED_ALLOC
-		assert(m_pBlock == NULL);
-		m_pBlock = p;
-		if (!IsAlignedOn(p, 16))
-		{
-			assert(IsAlignedOn(p, 8));
-			p = (byte *)p + 8;
-		}
-		#endif
-
-		assert(IsAlignedOn(p, 16));
-		return (T*)p;
-	}
-	return new T[n];
-}
-
-template <class T>
-void AlignedAllocator<T>::deallocate(void *p, size_type n)
-{
-	memset(p, 0, n*sizeof(T));
-	if (n >= 4)
-	{
-		#ifdef CRYPTOPP_MM_MALLOC_AVAILABLE
-		_mm_free(p);
-		#elif defined(CRYPTOPP_NO_ALIGNED_ALLOC)
-		assert(m_pBlock == p || (byte *)m_pBlock+8 == p);
-		free(m_pBlock);
-		m_pBlock = NULL;
-		#else
-		free(p);
-		#endif
-	}
-	else
-		delete [] (T *)p;
-}
-
-template class CRYPTOPP_DLL AlignedAllocator<word>;
-#endif
-
-static int Compare(const word *A, const word *B, size_t N)
+inline static int Compare(const word *A, const word *B, size_t N)
 {
 	while (N--)
 		if (A[N] > B[N])
@@ -121,7 +51,7 @@ static int Compare(const word *A, const word *B, size_t N)
 	return 0;
 }
 
-static int Increment(word *A, size_t N, word B=1)
+inline static int Increment(word *A, size_t N, word B=1)
 {
 	assert(N);
 	word t = A[0];
@@ -134,7 +64,7 @@ static int Increment(word *A, size_t N, word B=1)
 	return 1;
 }
 
-static int Decrement(word *A, size_t N, word B=1)
+inline static int Decrement(word *A, size_t N, word B=1)
 {
 	assert(N);
 	word t = A[0];
@@ -169,6 +99,45 @@ static word AtomicInverseModPower2(word A)
 
 // ********************************************************
 
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+	#define Declare2Words(x) dword x;
+	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
+		#define MultiplyWords(p, a, b) p = __emulu(a, b);
+	#else
+		#define MultiplyWords(p, a, b) p = (dword)a*b;
+	#endif
+	#define AssignWord(a, b) a = b;
+	#define Add2WordsBy1(a, b, c) a = b + c;
+	#define Acc2WordsBy1(a, b) a += b;
+	#define Acc2WordsBy2(a, b) a += b;
+	#define LowWord(a) (word)a
+	#define HighWord(a) (word)(a>>WORD_BITS)
+	#define Double2Words(a) a += a;
+	#define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
+	#define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
+	#define GetCarry(u) HighWord(u)
+	#define GetBorrow(u) word(u>>(WORD_BITS*2-1))
+#else
+	#define Declare2Words(x) word x##0, x##1;
+	#define AssignWord(a, b) a##0 = b; a##1 = 0;
+	#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
+	#define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
+	#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
+	#define LowWord(a) a##0
+	#define HighWord(a) a##1
+	#ifdef _MSC_VER
+		#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
+		#define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
+	#elif defined(__DECCXX)
+		#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
+		#define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
+	#endif
+	#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
+	#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
+	#define GetCarry(u) u##1
+	#define GetBorrow(u) u##1
+#endif
+
 class DWord
 {
 public:
@@ -198,25 +167,8 @@ public:
 		DWord r;
 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
 			r.m_whole = (dword)a * b;
-		#elif defined(__alpha__)
-			r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b));
-		#elif defined(__ia64__)
-			r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b));
-		#elif defined(_ARCH_PPC64)
-			r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc");
-		#elif defined(__x86_64__)
-			__asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc");
-		#elif defined(__mips64)
-			__asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b));
-		#elif defined(_M_X64)
-			r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
-		#elif defined(_M_IX86)
-			// for testing
-			word64 t = (word64)a * b;
-			r.m_halfs.high = ((word32 *)(&t))[1];
-			r.m_halfs.low = (word32)t;
 		#else
-			#error can not implement DWord
+			r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
 		#endif
 		return r;
 	}
@@ -457,1529 +409,1449 @@ inline word DWord::operator%(word a)
 
 // ********************************************************
 
-class Portable
-{
-public:
-	static int Add(word *C, const word *A, const word *B, size_t N);
-	static int Subtract(word *C, const word *A, const word *B, size_t N);
-
-	static inline void Multiply2(word *C, const word *A, const word *B);
-	static inline word Multiply2Add(word *C, const word *A, const word *B);
-	static void Multiply4(word *C, const word *A, const word *B);
-	static void Multiply8(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyRecursionLimit() {return 8;}
-
-	static inline void Multiply2Bottom(word *C, const word *A, const word *B);
-	static void Multiply4Bottom(word *C, const word *A, const word *B);
-	static void Multiply8Bottom(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyBottomRecursionLimit() {return 8;}
-
-	static void Square2(word *R, const word *A);
-	static void Square4(word *R, const word *A);
-	static void Square8(word *R, const word *A) {assert(false);}
-	static inline unsigned int SquareRecursionLimit() {return 4;}
-};
-
-int Portable::Add(word *C, const word *A, const word *B, size_t N)
-{
-	assert
(N%2 == 0); - - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = DWord(A[i]) + B[i] + u.GetHighHalf(); - C[i] = u.GetLowHalf(); - u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf(); - C[i+1] = u.GetLowHalf(); - } - return int(u.GetHighHalf()); -} - -int Portable::Subtract(word *C, const word *A, const word *B, size_t N) -{ - assert (N%2 == 0); - - DWord u(0, 0); - for (unsigned int i = 0; i < N; i+=2) - { - u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow(); - C[i] = u.GetLowHalf(); - u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow(); - C[i+1] = u.GetLowHalf(); - } - return int(0-u.GetHighHalf()); -} - -void Portable::Multiply2(word *C, const word *A, const word *B) -{ -/* - word s; - dword d; - - if (A1 >= A0) - if (B0 >= B1) - { - s = 0; - d = (dword)(A1-A0)*(B0-B1); - } - else - { - s = (A1-A0); - d = (dword)s*(word)(B0-B1); - } - else - if (B0 > B1) - { - s = (B0-B1); - d = (word)(A1-A0)*(dword)s; - } - else - { - s = 0; - d = (dword)(A0-A1)*(B1-B0); - } -*/ - // this segment is the branchless equivalent of above - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - C[0] = A0B0.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf(); - C[1] = t.GetLowHalf(); - - t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s; - C[2] = t.GetLowHalf(); - C[3] = t.GetHighHalf(); -} - -inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B) -{ - DWord t = DWord::Multiply(A[0], B[0]); - C[0] = t.GetLowHalf(); - C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0]; -} - -word Portable::Multiply2Add(word *C, const word *A, const word *B) -{ - word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; - unsigned int ai = A[1] < A[0]; - unsigned int bi = B[0] < B[1]; - unsigned int di = ai & bi; - DWord d = DWord::Multiply(D[di], D[di+2]); - D[1] = D[3] = 0; - unsigned int si = ai + !bi; - word s = D[si]; - - DWord A0B0 = DWord::Multiply(A[0], B[0]); - DWord t = A0B0 + C[0]; - C[0] = t.GetLowHalf(); - - DWord A1B1 = DWord::Multiply(A[1], B[1]); - t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1]; - C[1] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2]; - C[2] = t.GetLowHalf(); - - t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3]; - C[3] = t.GetLowHalf(); - return t.GetHighHalf(); -} - -#define MulAcc(x, y) \ - p = DWord::MultiplyAndAdd(A[x], B[y], c); \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveMulAcc(s, x, y) \ - R[s] = c; \ - p = DWord::MultiplyAndAdd(A[x], B[y], d); \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); - -#define SquAcc(x, y) \ - q = DWord::Multiply(A[x], A[y]); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -#define SaveSquAcc(s, x, y) \ - R[s] = c; \ - q = DWord::Multiply(A[x], A[y]); \ - 
p = q + d; \ - c = p.GetLowHalf(); \ - p = (DWord) e + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e = p.GetHighHalf(); \ - p = q + c; \ - c = p.GetLowHalf(); \ - p = (DWord) d + p.GetHighHalf(); \ - d = p.GetLowHalf(); \ - e += p.GetHighHalf(); - -void Portable::Multiply4(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 3, 1); - MulAcc(2, 2); - MulAcc(1, 3); - - SaveMulAcc(4, 2, 3); - MulAcc(3, 2); - - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], B[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); -} - -void Portable::Square2(word *R, const word *A) -{ - DWord p, q; - word c, d, e; - - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - SquAcc(0, 1); - - R[1] = c; - p = DWord::MultiplyAndAdd(A[1], A[1], d); - R[2] = p.GetLowHalf(); - R[3] = e + p.GetHighHalf(); -} - -void Portable::Square4(word *R, const word *A) -{ -#ifdef _MSC_VER - // VC60 workaround: MSVC 6.0 has an optimization bug that makes - // (dword)A*B where either A or B has been cast to a dword before - // very expensive. Revisit this function when this - // bug is fixed. - Multiply4(R, A, A); -#else - const word *B = A; - DWord p, q; - word c, d, e; - - p = DWord::Multiply(A[0], A[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - SquAcc(0, 1); - - SaveSquAcc(1, 2, 0); - MulAcc(1, 1); - - SaveSquAcc(2, 0, 3); - SquAcc(1, 2); - - SaveSquAcc(3, 3, 1); - MulAcc(2, 2); - - SaveSquAcc(4, 2, 3); - - R[5] = c; - p = DWord::MultiplyAndAdd(A[3], A[3], d); - R[6] = p.GetLowHalf(); - R[7] = e + p.GetHighHalf(); -#endif -} - -void Portable::Multiply8(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); - - SaveMulAcc(6, 0, 7); - MulAcc(1, 6); - MulAcc(2, 5); - MulAcc(3, 4); - MulAcc(4, 3); - MulAcc(5, 2); - MulAcc(6, 1); - MulAcc(7, 0); - - SaveMulAcc(7, 1, 7); - MulAcc(2, 6); - MulAcc(3, 5); - MulAcc(4, 4); - MulAcc(5, 3); - MulAcc(6, 2); - MulAcc(7, 1); - - SaveMulAcc(8, 2, 7); - MulAcc(3, 6); - MulAcc(4, 5); - MulAcc(5, 4); - MulAcc(6, 3); - MulAcc(7, 2); - - SaveMulAcc(9, 3, 7); - MulAcc(4, 6); - MulAcc(5, 5); - MulAcc(6, 4); - MulAcc(7, 3); - - SaveMulAcc(10, 4, 7); - MulAcc(5, 6); - MulAcc(6, 5); - MulAcc(7, 4); - - SaveMulAcc(11, 5, 7); - MulAcc(6, 6); - MulAcc(7, 5); - - SaveMulAcc(12, 6, 7); - MulAcc(7, 6); - - R[13] = c; - p = DWord::MultiplyAndAdd(A[7], B[7], d); - R[14] = p.GetLowHalf(); - R[15] = e + p.GetHighHalf(); -} - -void Portable::Multiply4Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 
0); - MulAcc(1, 1); - MulAcc(0, 2); - - R[2] = c; - R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0]; -} - -void Portable::Multiply8Bottom(word *R, const word *A, const word *B) -{ - DWord p; - word c, d, e; - - p = DWord::Multiply(A[0], B[0]); - R[0] = p.GetLowHalf(); - c = p.GetHighHalf(); - d = e = 0; - - MulAcc(0, 1); - MulAcc(1, 0); - - SaveMulAcc(1, 2, 0); - MulAcc(1, 1); - MulAcc(0, 2); - - SaveMulAcc(2, 0, 3); - MulAcc(1, 2); - MulAcc(2, 1); - MulAcc(3, 0); - - SaveMulAcc(3, 0, 4); - MulAcc(1, 3); - MulAcc(2, 2); - MulAcc(3, 1); - MulAcc(4, 0); - - SaveMulAcc(4, 0, 5); - MulAcc(1, 4); - MulAcc(2, 3); - MulAcc(3, 2); - MulAcc(4, 1); - MulAcc(5, 0); - - SaveMulAcc(5, 0, 6); - MulAcc(1, 5); - MulAcc(2, 4); - MulAcc(3, 3); - MulAcc(4, 2); - MulAcc(5, 1); - MulAcc(6, 0); - - R[6] = c; - R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] + - A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0]; -} - -#undef MulAcc -#undef SaveMulAcc -#undef SquAcc -#undef SaveSquAcc - -#ifdef CRYPTOPP_X86ASM_AVAILABLE - -// ************** x86 feature detection *************** - -static bool s_sse2Enabled = true; - -static void CpuId(word32 input, word32 *output) -{ -#ifdef __GNUC__ - __asm__ - ( - // save ebx in case -fPIC is being used - "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" - : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) - : "a" (input) - ); -#else - __asm - { - mov eax, input - cpuid - mov edi, output - mov [edi], eax - mov [edi+4], ebx - mov [edi+8], ecx - mov [edi+12], edx - } -#endif -} - -#ifdef SSE2_INTRINSICS_AVAILABLE -#ifndef _MSC_VER -static jmp_buf s_env; -static void SigIllHandler(int) -{ - longjmp(s_env, 1); -} -#endif - -static bool HasSSE2() -{ - if (!s_sse2Enabled) - return false; - - word32 cpuid[4]; - CpuId(1, cpuid); - if ((cpuid[3] & (1 << 26)) == 0) - return false; - -#ifdef _MSC_VER - __try - { - __asm xorpd xmm0, xmm0 // executing SSE2 instruction - } - __except (1) - { - return false; - } - return true; -#else - typedef void (*SigHandler)(int); - - SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - bool result = true; - if (setjmp(s_env)) - result = false; - else - __asm __volatile ("xorps %xmm0, %xmm0"); - - signal(SIGILL, oldHandler); - return result; -#endif -} -#endif - -static bool IsP4() -{ - word32 cpuid[4]; - - CpuId(0, cpuid); - std::swap(cpuid[2], cpuid[3]); - if (memcmp(cpuid+1, "GenuineIntel", 12) != 0) - return false; - - CpuId(1, cpuid); - return ((cpuid[0] >> 8) & 0xf) == 0xf; -} - -// ************** Pentium/P4 optimizations *************** - -class PentiumOptimized : public Portable -{ -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -}; - -class P4Optimized -{ -public: - static int Add(word *C, const word *A, const word *B, size_t N); - static int Subtract(word *C, const word *A, const word *B, size_t N); -#ifdef SSE2_INTRINSICS_AVAILABLE - static void Multiply4(word *C, const word *A, const word *B); - static void Multiply8(word *C, const word *A, const word *B); - static void Multiply8Bottom(word *C, const word *A, const word *B); -#endif -}; - -typedef int (* PAddSub)(word *C, const word *A, const word *B, size_t N); -typedef void (* PMul)(word *C, 
const word *A, const word *B); - -static PAddSub s_pAdd, s_pSub; -#ifdef SSE2_INTRINSICS_AVAILABLE -static PMul s_pMul4, s_pMul8, s_pMul8B; -#endif - -static void SetPentiumFunctionPointers() -{ - if (IsP4()) - { - s_pAdd = &P4Optimized::Add; - s_pSub = &P4Optimized::Subtract; - } - else - { - s_pAdd = &PentiumOptimized::Add; - s_pSub = &PentiumOptimized::Subtract; - } - -#ifdef SSE2_INTRINSICS_AVAILABLE - if (HasSSE2()) - { - s_pMul4 = &P4Optimized::Multiply4; - s_pMul8 = &P4Optimized::Multiply8; - s_pMul8B = &P4Optimized::Multiply8Bottom; - } - else - { - s_pMul4 = &PentiumOptimized::Multiply4; - s_pMul8 = &PentiumOptimized::Multiply8; - s_pMul8B = &PentiumOptimized::Multiply8Bottom; - } -#endif -} - -void DisableSSE2() -{ - s_sse2Enabled = false; - SetPentiumFunctionPointers(); -} - -class LowLevel : public PentiumOptimized -{ -public: - inline static int Add(word *C, const word *A, const word *B, size_t N) - {return s_pAdd(C, A, B, N);} - inline static int Subtract(word *C, const word *A, const word *B, size_t N) - {return s_pSub(C, A, B, N);} - inline static void Square4(word *R, const word *A) - {Multiply4(R, A, A);} -#ifdef SSE2_INTRINSICS_AVAILABLE - inline static void Multiply4(word *C, const word *A, const word *B) - {s_pMul4(C, A, B);} - inline static void Multiply8(word *C, const word *A, const word *B) - {s_pMul8(C, A, B);} - inline static void Multiply8Bottom(word *C, const word *A, const word *B) - {s_pMul8B(C, A, B);} -#endif -}; - // use some tricks to share assembly code between MSVC and GCC -#ifdef _MSC_VER - #define CRYPTOPP_NAKED __declspec(naked) - #define AS1(x) __asm x - #define AS2(x, y) __asm x, y - #define AddPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+20] \ - __asm mov edx, [esp+24] \ - __asm mov ebx, [esp+28] \ - __asm mov esi, [esp+32] - #define AddEpilogue \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret - #define MulPrologue \ - __asm push ebp \ - __asm push ebx \ - __asm push esi \ - __asm push edi \ - __asm mov ecx, [esp+28] \ - __asm mov esi, [esp+24] \ - __asm push [esp+20] - #define MulEpilogue \ - __asm add esp, 4 \ - __asm pop edi \ - __asm pop esi \ - __asm pop ebx \ - __asm pop ebp \ - __asm ret -#else +#if defined(__GNUC__) #define CRYPTOPP_NAKED - #define AS1(x) #x ";" - #define AS2(x, y) #x ", " #y ";" #define AddPrologue \ __asm__ __volatile__ \ ( \ "push %%ebx;" /* save this manually, in case of -fPIC */ \ "mov %2, %%ebx;" \ - ".intel_syntax noprefix;" \ - "push ebp;" + ".intel_syntax noprefix;" #define AddEpilogue \ - "pop ebp;" \ ".att_syntax prefix;" \ "pop %%ebx;" \ : \ - : "c" (C), "d" (A), "m" (B), "S" (N) \ - : "%edi", "memory", "cc" \ + : "d" (C), "a" (A), "m" (B), "c" (N) \ + : "%esi", "memory", "cc" \ ); #define MulPrologue \ __asm__ __volatile__ \ ( \ - "push %%ebx;" /* save this manually, in case of -fPIC */ \ - "push %%ebp;" \ - "push %0;" \ - ".intel_syntax noprefix;" + ".intel_syntax noprefix;" \ + AS1( push ebx) \ + AS2( mov ebx, edx) #define MulEpilogue \ - "add esp, 4;" \ - "pop ebp;" \ - "pop ebx;" \ + AS1( pop ebx) \ ".att_syntax prefix;" \ : \ - : "rm" (Z), "S" (X), "c" (Y) \ - : "%eax", "%edx", "%edi", "memory", "cc" \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \ + : "%esi", "memory", "cc" \ + ); + #define SquPrologue MulPrologue + #define SquEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A) \ + : "%esi", "%edi", "memory", "cc" \ + ); + #define TopPrologue 
MulPrologue + #define TopEpilogue \ + AS1( pop ebx) \ + ".att_syntax prefix;" \ + : \ + : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \ + : "memory", "cc" \ ); -#endif - -CRYPTOPP_NAKED int PentiumOptimized::Add(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendAdd) // if no dwords then nothing to do - - AS1(loopstartAdd:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( adc esi,edi) // add lower words - AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - - AS2( adc ebp,edi) // add higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result - - AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero - - AS1(loopendAdd:) - AS2( adc eax, 0) // store carry into eax (return result register) - - AddEpilogue -} - -CRYPTOPP_NAKED int PentiumOptimized::Subtract(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C - AS2( xor eax, eax) // clear eax - - AS2( sub eax, esi) // eax is a negative index from end of B - AS2( lea ebx, [ebx+4*esi]) // ebx is end of B - - AS2( sar eax, 1) // unit of eax is now dwords; this also clears the carry flag - AS1( jz loopendSub) // if no dwords then nothing to do - - AS1(loopstartSub:) - AS2( mov esi,[edx]) // load lower word of A - AS2( mov ebp,[edx+4]) // load higher word of A - - AS2( mov edi,[ebx+8*eax]) // load lower word of B - AS2( lea edx,[edx+8]) // advance A and C - - AS2( sbb esi,edi) // subtract lower words - AS2( mov edi,[ebx+8*eax+4]) // load higher word of B - - AS2( sbb ebp,edi) // subtract higher words - AS1( inc eax) // advance B - - AS2( mov [edx+ecx-8],esi) // store lower word result - AS2( mov [edx+ecx-4],ebp) // store higher word result - - AS1( jnz loopstartSub) // loop until eax overflows and becomes zero - - AS1(loopendSub:) - AS2( adc eax, 0) // store carry into eax (return result register) - - AddEpilogue -} - -// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them. 
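// The comment above is the key to the P4Optimized routines that follow: they
// keep the running carry in a general-purpose register (selected with cmovc)
// instead of chaining adc/sbb through the flags register. A minimal portable
// C++ sketch of the same branchless-carry idea -- a hypothetical helper, not
// code from the diff; "word" is the library's unsigned machine-word type:
inline int AddWithoutCarryFlag(word *C, const word *A, const word *B, size_t N)
{
	word carry = 0;
	for (size_t i = 0; i < N; i++)
	{
		word s = A[i] + carry;
		carry = (s < carry);	// carry out of adding the old carry (sum wrapped)
		s += B[i];
		carry += (s < B[i]);	// carry out of adding B[i]; at most one can be set
		C[i] = s;
	}
	return (int)carry;			// 0 or 1, matching the assembly's return value
}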
- -CRYPTOPP_NAKED int P4Optimized::Add(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendAddP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov ebp, [ebx]) - AS1( jmp carry1AddP4) - - AS1(loopstartAddP4:) - AS2( mov edi, [edx+8]) - AS2( add ecx, 8) - AS2( add edx, 8) - AS2( mov ebp, [ebx]) - AS2( add edi, eax) - AS1( jc carry1AddP4) - AS2( xor eax, eax) - - AS1(carry1AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( add edi, eax) - AS1( jc carry2AddP4) - AS2( xor eax, eax) - - AS1(carry2AddP4:) - AS2( add edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartAddP4) - - AS1(loopendAddP4:) - - AddEpilogue -} - -CRYPTOPP_NAKED int P4Optimized::Subtract(word *C, const word *A, const word *B, size_t N) -{ - AddPrologue - - // now: ebx = B, ecx = C, edx = A, esi = N - AS2( xor eax, eax) - AS1( neg esi) - AS1( jz loopendSubP4) // if no dwords then nothing to do - - AS2( mov edi, [edx]) - AS2( mov ebp, [ebx]) - AS1( jmp carry1SubP4) - - AS1(loopstartSubP4:) - AS2( mov edi, [edx+8]) - AS2( add edx, 8) - AS2( add ecx, 8) - AS2( mov ebp, [ebx]) - AS2( sub edi, eax) - AS1( jc carry1SubP4) - AS2( xor eax, eax) - - AS1(carry1SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( mov [ecx], edi) - AS2( mov edi, [edx+4]) - AS2( cmovc eax, ebp) - AS2( mov ebp, [ebx+4]) - AS2( add ebx, 8) - AS2( sub edi, eax) - AS1( jc carry2SubP4) - AS2( xor eax, eax) - - AS1(carry2SubP4:) - AS2( sub edi, ebp) - AS2( mov ebp, 1) - AS2( cmovc eax, ebp) - AS2( mov [ecx+4], edi) - AS2( add esi, 2) - AS1( jnz loopstartSubP4) - - AS1(loopendSubP4:) - - AddEpilogue -} - -// multiply assembly code originally contributed by Leonard Janke - -#define MulStartup \ - AS2(xor ebp, ebp) \ - AS2(xor edi, edi) \ - AS2(xor ebx, ebx) - -#define MulShiftCarry \ - AS2(mov ebp, edx) \ - AS2(mov edi, ebx) \ - AS2(xor ebx, ebx) - -#define MulAccumulateBottom(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS2(imul eax, dword ptr [esi+4*i]) \ - AS2(add ebp, eax) - -#define MulAccumulate(i,j) \ - AS2(mov eax, [ecx+4*j]) \ - AS1(mul dword ptr [esi+4*i]) \ - AS2(add ebp, eax) \ - AS2(adc edi, edx) \ - AS2(adc bl, bh) - -#define MulStoreDigit(i) \ - AS2(mov edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*i], ebp) - -#define MulLastDiagonal(digits) \ - AS2(mov eax, [ecx+4*(digits-1)]) \ - AS1(mul dword ptr [esi+4*(digits-1)]) \ - AS2(add ebp, eax) \ - AS2(adc edx, edi) \ - AS2(mov edi, [esp]) \ - AS2(mov [edi+4*(2*digits-2)], ebp) \ - AS2(mov [edi+4*(2*digits-1)], edx) - -CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(3,2) - MulAccumulate(2,3) - MulStoreDigit(5) - MulShiftCarry - - MulLastDiagonal(4) - MulEpilogue -} - -CRYPTOPP_NAKED void 
PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulate(7,0) - MulAccumulate(6,1) - MulAccumulate(5,2) - MulAccumulate(4,3) - MulAccumulate(3,4) - MulAccumulate(2,5) - MulAccumulate(1,6) - MulAccumulate(0,7) - MulStoreDigit(7) - MulShiftCarry - - MulAccumulate(7,1) - MulAccumulate(6,2) - MulAccumulate(5,3) - MulAccumulate(4,4) - MulAccumulate(3,5) - MulAccumulate(2,6) - MulAccumulate(1,7) - MulStoreDigit(8) - MulShiftCarry - - MulAccumulate(7,2) - MulAccumulate(6,3) - MulAccumulate(5,4) - MulAccumulate(4,5) - MulAccumulate(3,6) - MulAccumulate(2,7) - MulStoreDigit(9) - MulShiftCarry - - MulAccumulate(7,3) - MulAccumulate(6,4) - MulAccumulate(5,5) - MulAccumulate(4,6) - MulAccumulate(3,7) - MulStoreDigit(10) - MulShiftCarry - - MulAccumulate(7,4) - MulAccumulate(6,5) - MulAccumulate(5,6) - MulAccumulate(4,7) - MulStoreDigit(11) - MulShiftCarry - - MulAccumulate(7,5) - MulAccumulate(6,6) - MulAccumulate(5,7) - MulStoreDigit(12) - MulShiftCarry - - MulAccumulate(7,6) - MulAccumulate(6,7) - MulStoreDigit(13) - MulShiftCarry - - MulLastDiagonal(8) - MulEpilogue -} - -CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y) -{ - MulPrologue - // now: [esp] = Z, esi = X, ecx = Y - MulStartup - MulAccumulate(0,0) - MulStoreDigit(0) - MulShiftCarry - - MulAccumulate(1,0) - MulAccumulate(0,1) - MulStoreDigit(1) - MulShiftCarry - - MulAccumulate(2,0) - MulAccumulate(1,1) - MulAccumulate(0,2) - MulStoreDigit(2) - MulShiftCarry - - MulAccumulate(3,0) - MulAccumulate(2,1) - MulAccumulate(1,2) - MulAccumulate(0,3) - MulStoreDigit(3) - MulShiftCarry - - MulAccumulate(4,0) - MulAccumulate(3,1) - MulAccumulate(2,2) - MulAccumulate(1,3) - MulAccumulate(0,4) - MulStoreDigit(4) - MulShiftCarry - - MulAccumulate(5,0) - MulAccumulate(4,1) - MulAccumulate(3,2) - MulAccumulate(2,3) - MulAccumulate(1,4) - MulAccumulate(0,5) - MulStoreDigit(5) - MulShiftCarry - - MulAccumulate(6,0) - MulAccumulate(5,1) - MulAccumulate(4,2) - MulAccumulate(3,3) - MulAccumulate(2,4) - MulAccumulate(1,5) - MulAccumulate(0,6) - MulStoreDigit(6) - MulShiftCarry - - MulAccumulateBottom(7,0) - MulAccumulateBottom(6,1) - MulAccumulateBottom(5,2) - MulAccumulateBottom(4,3) - MulAccumulateBottom(3,4) - MulAccumulateBottom(2,5) - MulAccumulateBottom(1,6) - MulAccumulateBottom(0,7) - MulStoreDigit(7) - MulEpilogue -} - -#undef AS1 -#undef AS2 - -#else // not x86 - no processor specific code at this layer - -typedef Portable LowLevel; - -#endif - -#ifdef SSE2_INTRINSICS_AVAILABLE - -#ifdef __GNUC__ -#define CRYPTOPP_FASTCALL #else -#define CRYPTOPP_FASTCALL __fastcall + #define 
CRYPTOPP_NAKED __declspec(naked) + #define AddPrologue \ + __asm push ebx \ + __asm push esi \ + __asm mov eax, [esp+12] \ + __asm mov ebx, [esp+16] + #define AddEpilogue \ + __asm pop esi \ + __asm pop ebx \ + __asm ret 8 + #define SquPrologue \ + AS2( mov eax, A) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define SquEpilogue + #define MulPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( lea ebx, s_maskLow16) + #define MulEpilogue + #define TopPrologue \ + AS2( mov eax, A) \ + AS2( mov edi, B) \ + AS2( mov ecx, C) \ + AS2( mov esi, L) \ + AS2( lea ebx, s_maskLow16) + #define TopEpilogue #endif -static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B) +#if defined(_MSC_VER) && defined(_M_X64) +extern "C" { +int Baseline_Add(size_t N, word *C, const word *A, const word *B); +int Baseline_Sub(size_t N, word *C, const word *A, const word *B); +} +#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) { - __m128i a3210 = _mm_load_si128(A); - __m128i b3210 = _mm_load_si128(B); + AddPrologue - __m128i sum; + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) - __m128i z = _mm_setzero_si128(); - __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); - C[0] = a2b2_a0b0; + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) - __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); - __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); - __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); - __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); - C[1] = _mm_add_epi64(a1b0, a0b1); + ASL(0) + ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( adc esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( adc esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( adc esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( adc esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) - __m128i a31 = _mm_srli_epi64(a3210, 32); - __m128i b31 = _mm_srli_epi64(b3210, 32); - __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); - C[6] = a3b3_a1b1; + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) - __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); - __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); - __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); - __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); - __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); - sum = _mm_add_epi64(a1b1, a0b2); - C[2] = _mm_add_epi64(sum, a2b0); + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // store carry into eax (return result register) - __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); - __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); - __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); - __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); - __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); - __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); - __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); - __m128i sum1 = _mm_add_epi64(a3b0, a1b2); - sum = _mm_add_epi64(a2b1, a0b3); - C[3] = 
_mm_add_epi64(sum, sum1); - - __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); - __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); - __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); - __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); - sum = _mm_add_epi64(a2b2, a3b1); - C[4] = _mm_add_epi64(sum, a1b3); - - __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); - __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); - __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); - __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); - __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); - C[5] = _mm_add_epi64(a3b2, a2b3); + AddEpilogue } -void P4Optimized::Multiply4(word *C, const word *A, const word *B) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) { - __m128i temp[7]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; + AddPrologue - P4_Mul(temp, (__m128i *)A, (__m128i *)B); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) - C[0] = w[0]; + AS1( neg ecx) // ecx is negative index + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) - __m64 s1, s2; + ASL(0) + ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero + AS2( mov esi,[eax+4*ecx]) + AS2( sbb esi,[ebx+4*ecx]) + AS2( mov [edx+4*ecx],esi) + AS2( mov esi,[eax+4*ecx+4]) + AS2( sbb esi,[ebx+4*ecx+4]) + AS2( mov [edx+4*ecx+4],esi) + ASL(1) + AS2( mov esi,[eax+4*ecx+8]) + AS2( sbb esi,[ebx+4*ecx+8]) + AS2( mov [edx+4*ecx+8],esi) + AS2( mov esi,[eax+4*ecx+12]) + AS2( sbb esi,[ebx+4*ecx+12]) + AS2( mov [edx+4*ecx+12],esi) - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); + AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 + ASJ( jmp, 0, b) - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); + ASL(2) + AS2( mov eax, 0) + AS1( setc al) // store carry into eax (return result register) - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w18, w20); - s1 = _mm_add_si64(s1, s2); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w22, w26); - s1 = _mm_add_si64(s1, s2); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[7] = _mm_cvtsi64_si32(s1) + w[27]; - _mm_empty(); + AddEpilogue } -void P4Optimized::Multiply8(word *C, const word *A, const word *B) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) { - __m128i temp[28]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; - const word *z = (word *)temp+7*4*3; - const __m64 *mz = (__m64 *)z; + AddPrologue - P4_Mul(temp, (__m128i *)A, (__m128i *)B); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + 
AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx], mm2) + AS2( psrlq mm2, 32) - P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); + AS2( movd mm0, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 32) - C[0] = w[0]; + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+8], mm2) + AS2( psrlq mm2, 32) - __m64 s1, s2, s3, s4; + AS2( movd mm0, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( paddq mm0, mm1) + AS2( paddq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+12], mm2) + AS2( psrlq mm2, 32) - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); - __m64 w27 = _mm_cvtsi32_si64(w[27]); + AS2( add ecx, 4) + ASJ( jnz, 0, b) - __m64 x0 = _mm_cvtsi32_si64(x[0]); - __m64 x1 = _mm_cvtsi32_si64(x[1]); - __m64 x4 = mx[2]; - __m64 x6 = mx[3]; - __m64 x8 = mx[4]; - __m64 x10 = mx[5]; - __m64 x12 = mx[6]; - __m64 x14 = mx[7]; - __m64 x16 = mx[8]; - __m64 x18 = mx[9]; - __m64 x20 = mx[10]; - __m64 x22 = mx[11]; - __m64 x26 = _mm_cvtsi32_si64(x[26]); - __m64 x27 = _mm_cvtsi32_si64(x[27]); + ASL(2) + AS2( movd eax, mm2) + AS1( emms) - __m64 y0 = _mm_cvtsi32_si64(y[0]); - __m64 y1 = _mm_cvtsi32_si64(y[1]); - __m64 y4 = my[2]; - __m64 y6 = my[3]; - __m64 y8 = my[4]; - __m64 y10 = my[5]; - __m64 y12 = my[6]; - __m64 y14 = my[7]; - __m64 y16 = my[8]; - __m64 y18 = my[9]; - __m64 y20 = my[10]; - __m64 y22 = my[11]; - __m64 y26 = _mm_cvtsi32_si64(y[26]); - __m64 y27 = _mm_cvtsi32_si64(y[27]); - - __m64 z0 = _mm_cvtsi32_si64(z[0]); - __m64 z1 = _mm_cvtsi32_si64(z[1]); - __m64 z4 = mz[2]; - __m64 z6 = mz[3]; - __m64 z8 = mz[4]; - __m64 z10 = mz[5]; - __m64 z12 = mz[6]; - __m64 z14 = mz[7]; - __m64 z16 = mz[8]; - __m64 z18 = mz[9]; - __m64 z20 = mz[10]; - __m64 z22 = mz[11]; - __m64 z26 = _mm_cvtsi32_si64(z[26]); - - s1 = _mm_add_si64(w1, w4); - C[1] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w6, w8); - s1 = _mm_add_si64(s1, s2); - C[2] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s2 = _mm_add_si64(w10, w12); - s1 = _mm_add_si64(s1, s2); - C[3] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x0, y0); - s2 = _mm_add_si64(w14, w16); - s1 = _mm_add_si64(s1, s3); - s1 = _mm_add_si64(s1, s2); - C[4] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x1, y1); - s4 = _mm_add_si64(x4, y4); - s1 = _mm_add_si64(s1, w18); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, w20); - s1 = _mm_add_si64(s1, s3); - C[5] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x6, y6); - s4 = _mm_add_si64(x8, y8); - s1 = _mm_add_si64(s1, w22); - s3 = _mm_add_si64(s3, s4); - s1 = 
_mm_add_si64(s1, w26); - s1 = _mm_add_si64(s1, s3); - C[6] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x10, y10); - s4 = _mm_add_si64(x12, y12); - s1 = _mm_add_si64(s1, w27); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[7] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x14, y14); - s4 = _mm_add_si64(x16, y16); - s1 = _mm_add_si64(s1, z0); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, s3); - C[8] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x18, y18); - s4 = _mm_add_si64(x20, y20); - s1 = _mm_add_si64(s1, z1); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z4); - s1 = _mm_add_si64(s1, s3); - C[9] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x22, y22); - s4 = _mm_add_si64(x26, y26); - s1 = _mm_add_si64(s1, z6); - s3 = _mm_add_si64(s3, s4); - s1 = _mm_add_si64(s1, z8); - s1 = _mm_add_si64(s1, s3); - C[10] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(x27, y27); - s1 = _mm_add_si64(s1, z10); - s1 = _mm_add_si64(s1, z12); - s1 = _mm_add_si64(s1, s3); - C[11] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z14, z16); - s1 = _mm_add_si64(s1, s3); - C[12] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z18, z20); - s1 = _mm_add_si64(s1, s3); - C[13] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - s3 = _mm_add_si64(z22, z26); - s1 = _mm_add_si64(s1, s3); - C[14] = _mm_cvtsi64_si32(s1); - s1 = _mm_srli_si64(s1, 32); - - C[15] = z[27] + _mm_cvtsi64_si32(s1); - _mm_empty(); + AddEpilogue } -void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) +CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) { - __m128i temp[21]; - const word *w = (word *)temp; - const __m64 *mw = (__m64 *)w; - const word *x = (word *)temp+7*4; - const __m64 *mx = (__m64 *)x; - const word *y = (word *)temp+7*4*2; - const __m64 *my = (__m64 *)y; + AddPrologue - P4_Mul(temp, (__m128i *)A, (__m128i *)B); + // now: eax = A, ebx = B, edx = C, ecx = N + AS2( lea eax, [eax+4*ecx]) + AS2( lea ebx, [ebx+4*ecx]) + AS2( lea edx, [edx+4*ecx]) - P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); + AS1( neg ecx) // ecx is negative index + AS2( pxor mm2, mm2) + ASJ( jz, 2, f) + AS2( test ecx, 2) // this clears carry flag + ASJ( jz, 0, f) + AS2( sub ecx, 2) + ASJ( jmp, 1, f) - P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); + ASL(0) + AS2( movd mm0, DWORD PTR [eax+4*ecx]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx], mm0) + AS2( psrlq mm0, 63) - C[0] = w[0]; + AS2( movd mm2, DWORD PTR [eax+4*ecx+4]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+4]) + AS2( psubq mm2, mm1) + AS2( psubq mm2, mm0) + AS2( movd DWORD PTR [edx+4*ecx+4], mm2) + AS2( psrlq mm2, 63) - __m64 s1, s2, s3, s4; + ASL(1) + AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+8]) + AS2( psubq mm0, mm1) + AS2( psubq mm0, mm2) + AS2( movd DWORD PTR [edx+4*ecx+8], mm0) + AS2( psrlq mm0, 63) - __m64 w1 = _mm_cvtsi32_si64(w[1]); - __m64 w4 = mw[2]; - __m64 w6 = mw[3]; - __m64 w8 = mw[4]; - __m64 w10 = mw[5]; - __m64 w12 = mw[6]; - __m64 w14 = mw[7]; - __m64 w16 = mw[8]; - __m64 w18 = mw[9]; - __m64 w20 = mw[10]; - __m64 w22 = mw[11]; - __m64 w26 = _mm_cvtsi32_si64(w[26]); + AS2( movd mm2, DWORD PTR [eax+4*ecx+12]) + AS2( movd mm1, DWORD PTR [ebx+4*ecx+12]) + AS2( psubq mm2, mm1) + AS2( psubq 
mm2, mm0)
+	AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
+	AS2( psrlq mm2, 63)
-	__m64 x0 = _mm_cvtsi32_si64(x[0]);
-	__m64 x1 = _mm_cvtsi32_si64(x[1]);
-	__m64 x4 = mx[2];
-	__m64 x6 = mx[3];
-	__m64 x8 = mx[4];
+	AS2( add ecx, 4)
+	ASJ( jnz, 0, b)
-	__m64 y0 = _mm_cvtsi32_si64(y[0]);
-	__m64 y1 = _mm_cvtsi32_si64(y[1]);
-	__m64 y4 = my[2];
-	__m64 y6 = my[3];
-	__m64 y8 = my[4];
+	ASL(2)
+	AS2( movd eax, mm2)
+	AS1( emms)
-	s1 = _mm_add_si64(w1, w4);
-	C[1] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
+	AddEpilogue
+}
+#else
+int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
+{
+	assert (N%2 == 0);
-	s2 = _mm_add_si64(w6, w8);
-	s1 = _mm_add_si64(s1, s2);
-	C[2] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s2 = _mm_add_si64(w10, w12);
-	s1 = _mm_add_si64(s1, s2);
-	C[3] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x0, y0);
-	s2 = _mm_add_si64(w14, w16);
-	s1 = _mm_add_si64(s1, s3);
-	s1 = _mm_add_si64(s1, s2);
-	C[4] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x1, y1);
-	s4 = _mm_add_si64(x4, y4);
-	s1 = _mm_add_si64(s1, w18);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w20);
-	s1 = _mm_add_si64(s1, s3);
-	C[5] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	s3 = _mm_add_si64(x6, y6);
-	s4 = _mm_add_si64(x8, y8);
-	s1 = _mm_add_si64(s1, w22);
-	s3 = _mm_add_si64(s3, s4);
-	s1 = _mm_add_si64(s1, w26);
-	s1 = _mm_add_si64(s1, s3);
-	C[6] = _mm_cvtsi64_si32(s1);
-	s1 = _mm_srli_si64(s1, 32);
-
-	C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
-	_mm_empty();
+	Declare2Words(u);
+	for (size_t i=0; i<N; i+=2)
+	{
+		AddWithCarry(u, A[i], B[i]);
+		C[i] = LowWord(u);
+		AddWithCarry(u, A[i+1], B[i+1]);
+		C[i+1] = LowWord(u);
+	}
+	return int(GetCarry(u));
+}
+
+int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
+{
+	assert (N%2 == 0);
+
+	Declare2Words(u);
+	for (size_t i=0; i<N; i+=2)
+	{
+		SubtractWithBorrow(u, A[i], B[i]);
+		C[i] = LowWord(u);
+		SubtractWithBorrow(u, A[i+1], B[i+1]);
+		C[i+1] = LowWord(u);
+	}
+	return int(GetBorrow(u));
+}
+#endif
 
 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
 {
 	assert(N>=2 && N%2==0);
 
-	if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
-		LowLevel::Multiply8(R, A, B);
-	else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
-		LowLevel::Multiply4(R, A, B);
-	else if (N==2)
-		LowLevel::Multiply2(R, A, B);
+	if (N <= s_recursionLimit)
+		s_pMul[N/4](R, A, B);
 	else
 	{
 		const size_t N2 = N/2;
-		int carry;
-		int aComp = Compare(A0, A1, N2);
-		int bComp = Compare(B0, B1, N2);
+		size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
+		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
-		switch (2*aComp + aComp + bComp)
-		{
-		case -4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R0, N2);
-			carry = -1;
-			break;
-		case -2:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 2:
-			LowLevel::Subtract(R0, A0, A1, N2);
-			LowLevel::Subtract(R1, B1, B0, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R1, N2);
-			carry = -1;
-			break;
-		default:
-			SetWords(T0, 0, N);
-			carry = 0;
-		}
+		size_t BN2 = Compare(B0, B1, N2) > 0 ? 
0 : N2; + Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); - RecursiveMultiply(R0, T2, A0, B0, N2); RecursiveMultiply(R2, T2, A1, B1, N2); + RecursiveMultiply(T0, T2, R0, R1, N2); + RecursiveMultiply(R0, T2, A0, B0, N2); // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 - carry += LowLevel::Add(T0, T0, R0, N); - carry += LowLevel::Add(T0, T0, R2, N); - carry += LowLevel::Add(R1, R1, T0, N); + int c2 = Add(R2, R2, R1, N2); + int c3 = c2; + c2 += Add(R1, R2, R0, N2); + c3 += Add(R2, R2, R3, N2); - assert (carry >= 0 && carry <= 2); - Increment(R3, N2, carry); + if (AN2 == BN2) + c3 -= Subtract(R1, R1, T0, N); + else + c3 += Add(R1, R1, T0, N); + + c3 += Increment(R2, N2, c2); + assert (c3 >= 0 && c3 <= 2); + Increment(R3, N2, c3); } } @@ -2072,12 +1917,9 @@ void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) void RecursiveSquare(word *R, word *T, const word *A, size_t N) { assert(N && N%2==0); - if (LowLevel::SquareRecursionLimit() >= 8 && N==8) - LowLevel::Square8(R, A); - if (LowLevel::SquareRecursionLimit() >= 4 && N==4) - LowLevel::Square4(R, A); - else if (N==2) - LowLevel::Square2(R, A); + + if (N <= s_recursionLimit) + s_pSqu[N/4](R, A); else { const size_t N2 = N/2; @@ -2086,35 +1928,32 @@ void RecursiveSquare(word *R, word *T, const word *A, size_t N) RecursiveSquare(R2, T2, A1, N2); RecursiveMultiply(T0, T2, A0, A1, N2); - int carry = LowLevel::Add(R1, R1, T0, N); - carry += LowLevel::Add(R1, R1, T0, N); + int carry = Add(R1, R1, T0, N); + carry += Add(R1, R1, T0, N); Increment(R3, N2, carry); } } // R[N] - bottom half of A*B -// T[N] - temporary work space +// T[3*N/2] - temporary work space // A[N] - multiplier // B[N] - multiplicant void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8) - LowLevel::Multiply8Bottom(R, A, B); - else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4) - LowLevel::Multiply4Bottom(R, A, B); - else if (N==2) - LowLevel::Multiply2Bottom(R, A, B); + + if (N <= s_recursionLimit) + s_pBot[N/4](R, A, B); else { const size_t N2 = N/2; RecursiveMultiply(R, T, A0, B0, N2); RecursiveMultiplyBottom(T0, T1, A1, B0, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); RecursiveMultiplyBottom(T0, T1, A0, B1, N2); - LowLevel::Add(R1, R1, T0, N2); + Add(R1, R1, T0, N2); } } @@ -2124,90 +1963,63 @@ void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, siz // A[N] --- multiplier // B[N] --- multiplicant -void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) +void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) { assert(N>=2 && N%2==0); - if (N==4) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2() && ((N>=8) & (N<=32))) + s_pTop[N/16](R, A, B, L[N-1]); + else +#endif + if (N<=4) { - LowLevel::Multiply4(T, A, B); - memcpy(R, T+4, 4*WORD_SIZE); - } - else if (N==2) - { - LowLevel::Multiply2(T, A, B); - memcpy(R, T+2, 2*WORD_SIZE); + s_pMul[N/4](T, A, B); + memcpy(R, T+N, N*WORD_SIZE); } else { const size_t N2 = N/2; - int carry; - int aComp = Compare(A0, A1, N2); - int bComp = Compare(B0, B1, N2); + size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; + Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); - switch (2*aComp + aComp + bComp) + size_t BN2 = Compare(B0, B1, N2) > 0 ? 
0 : N2;
+		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
+
+		RecursiveMultiply(T0, T2, R0, R1, N2);
+		RecursiveMultiply(R0, T2, A1, B1, N2);
+
+		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
+
+		int t, c3;
+		int c2 = Subtract(T2, L+N2, L, N2);
+
+		if (AN2 == BN2)
 		{
-		case -4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R0, N2);
-			carry = -1;
-			break;
-		case -2:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 2:
-			LowLevel::Subtract(R0, A0, A1, N2);
-			LowLevel::Subtract(R1, B1, B0, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			carry = 0;
-			break;
-		case 4:
-			LowLevel::Subtract(R0, A1, A0, N2);
-			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply(T0, T2, R0, R1, N2);
-			LowLevel::Subtract(T1, T1, R1, N2);
-			carry = -1;
-			break;
-		default:
-			SetWords(T0, 0, N);
-			carry = 0;
+			c2 -= Add(T2, T2, T0, N2);
+			t = (Compare(T2, R0, N2) == -1);
+			c3 = t - Subtract(T2, T2, T1, N2);
+		}
+		else
+		{
+			c2 += Subtract(T2, T2, T0, N2);
+			t = (Compare(T2, R0, N2) == -1);
+			c3 = t + Add(T2, T2, T1, N2);
 		}
 
-		RecursiveMultiply(T2, R0, A1, B1, N2);
+		c2 += t;
+		if (c2 >= 0)
+			c3 += Increment(T2, N2, c2);
+		else
+			c3 -= Decrement(T2, N2, -c2);
+		c3 += Add(R0, T2, R1, N2);
 
-		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
-
-		int c2 = LowLevel::Subtract(R0, L+N2, L, N2);
-		c2 += LowLevel::Subtract(R0, R0, T0, N2);
-		int t = (Compare(R0, T2, N2) == -1);
-
-		carry += t;
-		carry += Increment(R0, N2, c2+t);
-		carry += LowLevel::Add(R0, R0, T1, N2);
-		carry += LowLevel::Add(R0, R0, T3, N2);
-		assert (carry >= 0 && carry <= 2);
-
-		CopyWords(R1, T3, N2);
-		Increment(R1, N2, carry);
+		assert (c3 >= 0 && c3 <= 2);
+		Increment(R1, N2, c3);
 	}
 }
 
-inline int Add(word *C, const word *A, const word *B, size_t N)
-{
-	return LowLevel::Add(C, A, B, N);
-}
-
-inline int Subtract(word *C, const word *A, const word *B, size_t N)
-{
-	return LowLevel::Subtract(C, A, B, N);
-}
-
 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
 {
 	RecursiveMultiply(R, T, A, B, N);
@@ -2223,23 +2035,6 @@ inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_
 	RecursiveMultiplyBottom(R, T, A, B, N);
 }
 
-inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
-{
-	RecursiveMultiplyTop(R, T, L, A, B, N);
-}
-
-static word LinearMultiply(word *C, const word *A, word B, size_t N)
-{
-	word carry=0;
-	for(unsigned i=0; i<N; i++)
-	{
-		DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
-		C[i] = p.GetLowHalf();
-		carry = p.GetHighHalf();
-	}
-	return carry;
-}
 
 	P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
 }
@@ -3657,7 +3500,7 @@ void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
 	product.reg.CleanNew(RoundupSize(aSize+bSize));
 	product.sign = Integer::POSITIVE;
 
-	SecAlignedWordBlock workspace(aSize + bSize);
+	IntegerSecBlock workspace(aSize + bSize);
 	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
 }
 
@@ -3723,7 +3566,7 @@ void PositiveDivide(Integer &remainder, Integer &quotient,
 	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
 	quotient.sign = Integer::POSITIVE;
 
-	SecAlignedWordBlock T(aSize+2*bSize+4);
+	IntegerSecBlock T(aSize+3*(bSize+2));
 	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
 }
diff --git a/integer.h b/integer.h
index 547e3778..4e93c3a1 100644
--- a/integer.h
+++ b/integer.h
@@ -11,44 +11,13 @@
 NAMESPACE_BEGIN(CryptoPP)
 
-#if defined(SSE2_INTRINSICS_AVAILABLE)
-	template <class T>
-	class AlignedAllocator : public AllocatorBase<T>
-	{
-	public:
-		CRYPTOPP_INHERIT_ALLOCATOR_TYPES
-
-		pointer allocate(size_type n, const void *);
-		void deallocate(void *p, size_type n);
-		pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
-		{
-			return StandardReallocate(*this, p, oldSize, newSize, preserve);
-		}
-
-	#if !(defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) || defined(CRYPTOPP_MEMALIGN_AVAILABLE) || defined(CRYPTOPP_MM_MALLOC_AVAILABLE))
-	#define CRYPTOPP_NO_ALIGNED_ALLOC
-		AlignedAllocator() : m_pBlock(NULL) {}
-	protected:
-		void *m_pBlock;
-	#endif
-	};
-
-	#ifdef CRYPTOPP_IMPORTS
-	CRYPTOPP_DLL_TEMPLATE_CLASS AlignedAllocator<word>;
-	#endif
-
-	typedef SecBlock<word, AlignedAllocator<word> > SecAlignedWordBlock;
-#else
-	typedef SecWordBlock SecAlignedWordBlock;
-#endif
-
-void CRYPTOPP_DLL CRYPTOPP_API DisableSSE2();
-
 struct InitializeInteger	// used to initialize static variables
 {
 	InitializeInteger();
 };
 
+typedef SecBlock<word, AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86> > IntegerSecBlock;
+
 //! multiple precision integer and basic arithmetics
 /*! This class can represent positive and negative integers
 	with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)).
@@ -406,7 +375,7 @@ private:
 	friend void PositiveMultiply(Integer &product, const Integer &a, const Integer &b);
 	friend void PositiveDivide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor);
 
-	SecAlignedWordBlock reg;
+	IntegerSecBlock reg;
 	Sign sign;
 };
 
diff --git a/rijndael.cpp b/rijndael.cpp
index 2a1a19ef..4a8572f2 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished.
 #include "rijndael.h"
 #include "misc.h"
-
-#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE
-#pragma message("Don't know how to align data on L1 cache boundary. 
Defense against AES timing attack may be affected.") -#endif +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c for (i = 1; i < m_rounds; i++) { rk += 4; rk[0] = - Td0[Se[GETBYTE(rk[0], 3)]] ^ - Td1[Se[GETBYTE(rk[0], 2)]] ^ - Td2[Se[GETBYTE(rk[0], 1)]] ^ - Td3[Se[GETBYTE(rk[0], 0)]]; + Td[0*256+Se[GETBYTE(rk[0], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[0], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[0], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[0], 0)]]; rk[1] = - Td0[Se[GETBYTE(rk[1], 3)]] ^ - Td1[Se[GETBYTE(rk[1], 2)]] ^ - Td2[Se[GETBYTE(rk[1], 1)]] ^ - Td3[Se[GETBYTE(rk[1], 0)]]; + Td[0*256+Se[GETBYTE(rk[1], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[1], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[1], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[1], 0)]]; rk[2] = - Td0[Se[GETBYTE(rk[2], 3)]] ^ - Td1[Se[GETBYTE(rk[2], 2)]] ^ - Td2[Se[GETBYTE(rk[2], 1)]] ^ - Td3[Se[GETBYTE(rk[2], 0)]]; + Td[0*256+Se[GETBYTE(rk[2], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[2], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[2], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[2], 0)]]; rk[3] = - Td0[Se[GETBYTE(rk[3], 3)]] ^ - Td1[Se[GETBYTE(rk[3], 2)]] ^ - Td2[Se[GETBYTE(rk[3], 1)]] ^ - Td3[Se[GETBYTE(rk[3], 0)]]; + Td[0*256+Se[GETBYTE(rk[3], 3)]] ^ + Td[1*256+Se[GETBYTE(rk[3], 2)]] ^ + Td[2*256+Se[GETBYTE(rk[3], 1)]] ^ + Td[3*256+Se[GETBYTE(rk[3], 0)]]; } } @@ -148,15 +145,245 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); } -const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4; -#ifdef IS_BIG_ENDIAN -const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0; -#else -const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3; -#endif +#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#ifdef CRYPTOPP_X86_ASM_AVAILABLE + if (HasMMX()) + { + const word32 *k = m_key; + const word32 *kLoopEnd = k + m_rounds*4; +#ifdef __GNUC__ + word32 t0, t1, t2, t3; + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS1( push ebp) + AS2( mov ebp, eax) + AS2( movd mm5, ecx) +#else + AS2( mov edx, g_cacheLineSize) + AS2( mov edi, inBlock) + AS2( mov esi, k) + AS2( movd mm5, kLoopEnd) + AS1( push ebp) + AS2( lea ebp, Te) +#endif + AS2( mov eax, [esi+0*4]) // s0 + AS2( xor eax, [edi+0*4]) + AS2( movd mm0, eax) + AS2( mov ebx, [esi+1*4]) + AS2( xor ebx, [edi+1*4]) + AS2( movd mm1, ebx) + AS2( and ebx, eax) + AS2( mov eax, [esi+2*4]) + AS2( xor eax, [edi+2*4]) + AS2( movd mm2, eax) + AS2( and ebx, eax) + AS2( mov ecx, [esi+3*4]) + AS2( xor ecx, [edi+3*4]) + AS2( and ebx, ecx) + + // read Te0 into L1 cache. 
this code could be simplified by using lfence, but that is an SSE2 instruction
+	AS2(	and		ebx, 0)
+	AS2(	mov		edi, ebx)	// make index depend on previous loads to simulate lfence
+	ASL(2)
+	AS2(	and		ebx, [ebp+edi])
+	AS2(	add		edi, edx)
+	AS2(	and		ebx, [ebp+edi])
+	AS2(	add		edi, edx)
+	AS2(	and		ebx, [ebp+edi])
+	AS2(	add		edi, edx)
+	AS2(	and		ebx, [ebp+edi])
+	AS2(	add		edi, edx)
+	AS2(	cmp		edi, 1024)
+	ASJ(	jl,		2, b)
+	AS2(	and		ebx, [ebp+1020])
+	AS2(	movd	mm6, ebx)
+	AS2(	pxor	mm2, mm6)
+	AS2(	pxor	mm1, mm6)
+	AS2(	pxor	mm0, mm6)
+	AS2(	xor		ecx, ebx)
+
+	AS2(	mov		edi, [esi+4*4])		// t0
+	AS2(	mov		eax, [esi+5*4])
+	AS2(	mov		ebx, [esi+6*4])
+	AS2(	mov		edx, [esi+7*4])
+	AS2(	add		esi, 8*4)
+	AS2(	movd	mm4, esi)
+
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(a, [ebp+3*1024+4*esi])
+
+#define s0		xor edi
+#define s1		xor eax
+#define s2		xor ebx
+#define s3		xor ecx
+#define t0		xor edi
+#define t1		xor eax
+#define t2		xor ebx
+#define t3		xor edx
+
+	QUARTER_ROUND(c, t0, t1, t2, t3)
+	AS2(	movd	ecx, mm2)
+	QUARTER_ROUND(c, t3, t0, t1, t2)
+	AS2(	movd	ecx, mm1)
+	QUARTER_ROUND(c, t2, t3, t0, t1)
+	AS2(	movd	ecx, mm0)
+	QUARTER_ROUND(c, t1, t2, t3, t0)
+	AS2(	movd	mm2, ebx)
+	AS2(	movd	mm1, eax)
+	AS2(	movd	mm0, edi)
+#undef QUARTER_ROUND
+
+	AS2(	movd	esi, mm4)
+
+	ASL(0)
+	AS2(	mov		edi, [esi+0*4])
+	AS2(	mov		eax, [esi+1*4])
+	AS2(	mov		ebx, [esi+2*4])
+	AS2(	mov		ecx, [esi+3*4])
+
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(d, [ebp+0*1024+4*esi])
+
+	QUARTER_ROUND(d, s0, s1, s2, s3)
+	AS2(	movd	edx, mm2)
+	QUARTER_ROUND(d, s3, s0, s1, s2)
+	AS2(	movd	edx, mm1)
+	QUARTER_ROUND(d, s2, s3, s0, s1)
+	AS2(	movd	edx, mm0)
+	QUARTER_ROUND(d, s1, s2, s3, s0)
+	AS2(	movd	esi, mm4)
+	AS2(	movd	mm2, ebx)
+	AS2(	movd	mm1, eax)
+	AS2(	movd	mm0, edi)
+
+	AS2(	mov		edi, [esi+4*4])
+	AS2(	mov		eax, [esi+5*4])
+	AS2(	mov		ebx, [esi+6*4])
+	AS2(	mov		edx, [esi+7*4])
+
+	QUARTER_ROUND(c, t0, t1, t2, t3)
+	AS2(	movd	ecx, mm2)
+	QUARTER_ROUND(c, t3, t0, t1, t2)
+	AS2(	movd	ecx, mm1)
+	QUARTER_ROUND(c, t2, t3, t0, t1)
+	AS2(	movd	ecx, mm0)
+	QUARTER_ROUND(c, t1, t2, t3, t0)
+	AS2(	movd	mm2, ebx)
+	AS2(	movd	mm1, eax)
+	AS2(	movd	mm0, edi)
+
+	AS2(	movd	esi, mm4)
+	AS2(	movd	edi, mm5)
+	AS2(	add		esi, 8*4)
+	AS2(	movd	mm4, esi)
+	AS2(	cmp		edi, esi)
+	ASJ(	jne,	0, b)
+
+#undef QUARTER_ROUND
+#undef s0
+#undef s1
+#undef s2
+#undef s3
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+
+	AS2(	mov		eax, [edi+0*4])
+	AS2(	mov		ecx, [edi+1*4])
+	AS2(	mov		esi, [edi+2*4])
+	AS2(	mov		edi, [edi+3*4])
+
+#define QUARTER_ROUND(a, b, c, d)	\
+	AS2(	movzx	ebx, dl)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 3*8)\
+	AS2(	xor		a, ebx)\
+	AS2(	movzx	ebx, dh)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 2*8)\
+	AS2(	xor		b, ebx)\
+	AS2(	shr		edx, 16)\
+	AS2(	movzx	ebx, dl)\
+	AS2(	shr		edx, 8)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 1*8)\
+	AS2(	xor		c, ebx)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	xor		d, ebx)
+
+	QUARTER_ROUND(eax, ecx, esi, edi)
+	AS2(	movd	edx, mm2)
+	QUARTER_ROUND(edi, eax, ecx, esi)
+	AS2(	movd	edx, mm1)
+	QUARTER_ROUND(esi, edi, eax, ecx)
+	AS2(	movd	edx, mm0)
+	
QUARTER_ROUND(ecx, esi, edi, eax) + +#undef QUARTER_ROUND + + AS1( pop ebp) + AS1( emms) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) + : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) + : "memory", "cc" + ); + + if (xorBlock) + { + t0 ^= ((const word32 *)xorBlock)[0]; + t1 ^= ((const word32 *)xorBlock)[1]; + t2 ^= ((const word32 *)xorBlock)[2]; + t3 ^= ((const word32 *)xorBlock)[3]; + } + ((word32 *)outBlock)[0] = t0; + ((word32 *)outBlock)[1] = t1; + ((word32 *)outBlock)[2] = t2; + ((word32 *)outBlock)[3] = t3; +#else + AS2( mov ebx, xorBlock) + AS2( test ebx, ebx) + ASJ( jz, 1, f) + AS2( xor eax, [ebx+0*4]) + AS2( xor ecx, [ebx+1*4]) + AS2( xor esi, [ebx+2*4]) + AS2( xor edi, [ebx+3*4]) + ASL(1) + AS2( mov ebx, outBlock) + AS2( mov [ebx+0*4], eax) + AS2( mov [ebx+1*4], ecx) + AS2( mov [ebx+2*4], esi) + AS2( mov [ebx+3*4], edi) +#endif + } + else +#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE + { word32 s0, s1, s2, s3, t0, t1, t2, t3; const word32 *rk = m_key; @@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + d ^= Te[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Te[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Te[t], 24); +#endif + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Te0[GETBYTE(t0, 3)] ^ - Te1[GETBYTE(t1, 2)] ^ - Te2[GETBYTE(t2, 1)] ^ - Te3[GETBYTE(t3, 0)] ^ - rk[0]; - s1 = - Te0[GETBYTE(t1, 3)] ^ - Te1[GETBYTE(t2, 2)] ^ - Te2[GETBYTE(t3, 1)] ^ - Te3[GETBYTE(t0, 0)] ^ - rk[1]; - s2 = - Te0[GETBYTE(t2, 3)] ^ - Te1[GETBYTE(t3, 2)] ^ - Te2[GETBYTE(t0, 1)] ^ - Te3[GETBYTE(t1, 0)] ^ - rk[2]; - s3 = - Te0[GETBYTE(t3, 3)] ^ - Te1[GETBYTE(t0, 2)] ^ - Te2[GETBYTE(t1, 1)] ^ - Te3[GETBYTE(t2, 0)] ^ - rk[3]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Te[3*256+byte(t)]; t >>= 8;\ + b ^= Te[2*256+byte(t)]; t >>= 8;\ + c ^= Te[1*256+byte(t)]; t >>= 8;\ + d ^= Te[t]; - t0 = - Te0[GETBYTE(s0, 3)] ^ - Te1[GETBYTE(s1, 2)] ^ - Te2[GETBYTE(s2, 1)] ^ - Te3[GETBYTE(s3, 0)] ^ - rk[4]; - t1 = - Te0[GETBYTE(s1, 3)] ^ - Te1[GETBYTE(s2, 2)] ^ - Te2[GETBYTE(s3, 1)] ^ - Te3[GETBYTE(s0, 0)] ^ - rk[5]; - t2 = - Te0[GETBYTE(s2, 3)] ^ - Te1[GETBYTE(s3, 2)] ^ - Te2[GETBYTE(s0, 1)] ^ - Te3[GETBYTE(s1, 0)] ^ - rk[6]; - t3 = - Te0[GETBYTE(s3, 3)] ^ - Te1[GETBYTE(s0, 2)] ^ - Te2[GETBYTE(s1, 1)] ^ - Te3[GETBYTE(s2, 0)] ^ - rk[7]; + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s0, s1, s2, s3) + QUARTER_ROUND(t2, s3, s0, s1, s2) + QUARTER_ROUND(t1, s2, s3, s0, s1) + QUARTER_ROUND(t0, s1, s2, s3, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t0, t1, t2, t3) + QUARTER_ROUND(s2, t3, t0, t1, t2) + QUARTER_ROUND(s1, t2, t3, t0, t1) + QUARTER_ROUND(s0, t1, t2, t3, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. 
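// ********************************************************
// [Editor's note -- sketch, not part of the patch] The countermeasure the
// comments above refer to works like this: before any key-dependent table
// lookup, read one word from every cache line of the table region so the
// whole region is already in L1, and fold the reads into a value that is
// provably zero (x & 0 == 0, cf. "and ebx, 0" in the MMX path) but is then
// OR'ed into the live state so the loads cannot be discarded. A scalar
// sketch, using this library's word32/byte typedefs; PreloadTable is a
// hypothetical name:
static word32 PreloadTable(const word32 *table, size_t regionBytes, int cacheLineSize)
{
	word32 u = 0;	// stays 0 throughout
	for (size_t i = 0; i < regionBytes; i += cacheLineSize)
		u &= *(const word32 *)((const byte *)table + i);
	return u;	// every cache line touched, result data-independent
}
// usage sketch:  s0 |= PreloadTable(Te, 1024, GetCacheLineSize());
// ********************************************************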
see comments at top for more details u = 0; - for (i=0; i>= 8;\ + tempBlock[b] = Se[byte(t)]; t >>= 8;\ + tempBlock[c] = Se[byte(t)]; t >>= 8;\ + tempBlock[d] = Se[t]; + + QUARTER_ROUND(t2, 15, 2, 5, 8) + QUARTER_ROUND(t1, 11, 14, 1, 4) + QUARTER_ROUND(t0, 7, 10, 13, 0) + QUARTER_ROUND(t3, 3, 6, 9, 12) +#undef QUARTER_ROUND if (xbw) { @@ -299,12 +493,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock obw[2] = tbw[2] ^ rk[2]; obw[3] = tbw[3] ^ rk[3]; } + } } void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { word32 s0, s1, s2, s3, t0, t1, t2, t3; - const word32 *rk = m_key; + const word32 *rk = m_key; s0 = ((const word32 *)inBlock)[0] ^ rk[0]; s1 = ((const word32 *)inBlock)[1] ^ rk[1]; @@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock rk += 8; // timing attack countermeasure. see comments at top for more details + const int cacheLineSize = GetCacheLineSize(); unsigned int i; word32 u = 0; - for (i=0; i>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + d ^= Td[t]; +#else +#define QUARTER_ROUND(t, a, b, c, d) \ + d ^= Td[byte(t)]; t >>= 8;\ + c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\ + b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\ + a ^= rotrFixed(Td[t], 24); +#endif + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND // Nr - 2 full rounds: unsigned int r = m_rounds/2 - 1; do { - s0 = - Td0[GETBYTE(t0, 3)] ^ - Td1[GETBYTE(t3, 2)] ^ - Td2[GETBYTE(t2, 1)] ^ - Td3[GETBYTE(t1, 0)] ^ - rk[0]; - s1 = - Td0[GETBYTE(t1, 3)] ^ - Td1[GETBYTE(t0, 2)] ^ - Td2[GETBYTE(t3, 1)] ^ - Td3[GETBYTE(t2, 0)] ^ - rk[1]; - s2 = - Td0[GETBYTE(t2, 3)] ^ - Td1[GETBYTE(t1, 2)] ^ - Td2[GETBYTE(t0, 1)] ^ - Td3[GETBYTE(t3, 0)] ^ - rk[2]; - s3 = - Td0[GETBYTE(t3, 3)] ^ - Td1[GETBYTE(t2, 2)] ^ - Td2[GETBYTE(t1, 1)] ^ - Td3[GETBYTE(t0, 0)] ^ - rk[3]; +#define QUARTER_ROUND(t, a, b, c, d) \ + a ^= Td[3*256+byte(t)]; t >>= 8;\ + b ^= Td[2*256+byte(t)]; t >>= 8;\ + c ^= Td[1*256+byte(t)]; t >>= 8;\ + d ^= Td[t]; - t0 = - Td0[GETBYTE(s0, 3)] ^ - Td1[GETBYTE(s3, 2)] ^ - Td2[GETBYTE(s2, 1)] ^ - Td3[GETBYTE(s1, 0)] ^ - rk[4]; - t1 = - Td0[GETBYTE(s1, 3)] ^ - Td1[GETBYTE(s0, 2)] ^ - Td2[GETBYTE(s3, 1)] ^ - Td3[GETBYTE(s2, 0)] ^ - rk[5]; - t2 = - Td0[GETBYTE(s2, 3)] ^ - Td1[GETBYTE(s1, 2)] ^ - Td2[GETBYTE(s0, 1)] ^ - Td3[GETBYTE(s3, 0)] ^ - rk[6]; - t3 = - Td0[GETBYTE(s3, 3)] ^ - Td1[GETBYTE(s2, 2)] ^ - Td2[GETBYTE(s1, 1)] ^ - Td3[GETBYTE(s0, 0)] ^ - rk[7]; + s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; + + QUARTER_ROUND(t3, s2, s1, s0, s3) + QUARTER_ROUND(t2, s1, s0, s3, s2) + QUARTER_ROUND(t1, s0, s3, s2, s1) + QUARTER_ROUND(t0, s3, s2, s1, s0) + + t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; + + QUARTER_ROUND(s3, t2, t1, t0, t3) + QUARTER_ROUND(s2, t1, t0, t3, t2) + QUARTER_ROUND(s1, t0, t3, t2, t1) + QUARTER_ROUND(s0, t3, t2, t1, t0) +#undef QUARTER_ROUND rk += 8; } while (--r); // timing attack countermeasure. 
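// ********************************************************
// [Editor's note -- restatement, not part of the patch] The single-table
// QUARTER_ROUND variants above exploit the fact that the four classic AES
// tables are byte-wise rotations of one another: one 256-entry stripe of
// Te (or Td) plus rotrFixed yields the same four values at a quarter of
// the cache footprint, which appears to be why only the first 1 KB is
// preloaded ("read Te0 into L1 cache"). Written out as a function for one
// column:
static inline void QuarterRoundSketch(word32 t, word32 &a, word32 &b,
                                      word32 &c, word32 &d, const word32 *Te)
{
	d ^= Te[t & 0xff]; t >>= 8;	// Te[byte(t)]
	c ^= rotrFixed(Te[t & 0xff], 8); t >>= 8;
	b ^= rotrFixed(Te[t & 0xff], 16); t >>= 8;
	a ^= rotrFixed(Te[t], 24);
}
// The main rounds instead index the merged Te[4*256]/Td[4*256] directly
// (Te[i*256+x]) and so avoid the rotates.
// ********************************************************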
see comments at top for more details u = 0; - for (i=0; i>= 8;\ + tempBlock[b] = Sd[byte(t)]; t >>= 8;\ + tempBlock[c] = Sd[byte(t)]; t >>= 8;\ + tempBlock[d] = Sd[t]; + + QUARTER_ROUND(t2, 7, 2, 13, 8) + QUARTER_ROUND(t1, 3, 14, 9, 4) + QUARTER_ROUND(t0, 15, 10, 5, 0) + QUARTER_ROUND(t3, 11, 6, 1, 12) +#undef QUARTER_ROUND if (xbw) { diff --git a/rijndael.h b/rijndael.h index a035da4c..a068d637 100644 --- a/rijndael.h +++ b/rijndael.h @@ -25,16 +25,10 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat protected: // VS2005 workaround: have to put these on seperate lines, or error C2487 is triggered in DLL build - CRYPTOPP_L1_CACHE_ALIGN(static const byte Se[256]); - CRYPTOPP_L1_CACHE_ALIGN(static const byte Sd[256]); - CRYPTOPP_L1_CACHE_ALIGN(static const word32 Te0[256]); - static const word32 Te1[256]; - static const word32 Te2[256]; - static const word32 Te3[256]; - CRYPTOPP_L1_CACHE_ALIGN(static const word32 Td0[256]); - static const word32 Td1[256]; - static const word32 Td2[256]; - static const word32 Td3[256]; + static const byte Se[256]; + static const byte Sd[256]; + static const word32 Te[4*256]; + static const word32 Td[4*256]; static const word32 rcon[]; @@ -52,6 +46,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; + void ProcessAndXorBlock_Old(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; }; public: diff --git a/sha.cpp b/sha.cpp index 5355995a..127d1f99 100644 --- a/sha.cpp +++ b/sha.cpp @@ -9,6 +9,7 @@ #include "sha.h" #include "misc.h" +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -74,27 +75,43 @@ void SHA1::Transform(word32 *state, const word32 *data) state[2] += c; state[3] += d; state[4] += e; - /* Wipe variables */ - a = b = c = d = e = 0; - memset(W, 0, sizeof(W)); } // end of Steve Reid's code // ************************************************************* +void SHA224::InitState(HashWordType *state) +{ + static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4}; + memcpy(state, s, sizeof(s)); +} + void SHA256::InitState(HashWordType *state) { - state[0] = 0x6a09e667; - state[1] = 0xbb67ae85; - state[2] = 0x3c6ef372; - state[3] = 0xa54ff53a; - state[4] = 0x510e527f; - state[5] = 0x9b05688c; - state[6] = 0x1f83d9ab; - state[7] = 0x5be0cd19; + static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; + memcpy(state, s, sizeof(s)); } +static const word32 SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15])) #define Ch(x,y,z) 
(z^(x&(y^z))) @@ -109,7 +126,7 @@ void SHA256::InitState(HashWordType *state) #define g(i) T[(6-i)&7] #define h(i) T[(7-i)&7] -#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\ +#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\ d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) // for SHA256 @@ -141,98 +158,114 @@ void SHA256::Transform(word32 *state, const word32 *data) state[5] += f(0); state[6] += g(0); state[7] += h(0); - /* Wipe variables */ - memset(W, 0, sizeof(W)); - memset(T, 0, sizeof(T)); } +/* +// smaller but slower +void SHA256_Transform(word32 *state, const word32 *data) +{ + word32 T[20]; + word32 W[32]; + unsigned int i = 0, j = 0; + word32 *t = T+8; + + memcpy(t, state, 8*4); + word32 e = t[4], a = t[0]; + + do + { + word32 w = data[j]; + W[j] = w; + w += K[j]; + w += t[7]; + w += S1(e); + w += Ch(e, t[5], t[6]); + e = t[3] + w; + t[3] = t[3+8] = e; + w += S0(t[0]); + a = w + Maj(a, t[1], t[2]); + t[-1] = t[7] = a; + --t; + ++j; + if (j%8 == 0) + t += 8; + } while (j<16); + + do + { + i = j&0xf; + word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7]; + W[i+16] = W[i] = w; + w += K[j]; + w += t[7]; + w += S1(e); + w += Ch(e, t[5], t[6]); + e = t[3] + w; + t[3] = t[3+8] = e; + w += S0(t[0]); + a = w + Maj(a, t[1], t[2]); + t[-1] = t[7] = a; + + w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7]; + W[(i+1)+16] = W[(i+1)] = w; + w += K[j+1]; + w += (t-1)[7]; + w += S1(e); + w += Ch(e, (t-1)[5], (t-1)[6]); + e = (t-1)[3] + w; + (t-1)[3] = (t-1)[3+8] = e; + w += S0((t-1)[0]); + a = w + Maj(a, (t-1)[1], (t-1)[2]); + (t-1)[-1] = (t-1)[7] = a; + + t-=2; + j+=2; + if (j%8 == 0) + t += 8; + } while (j<64); + + state[0] += a; + state[1] += t[1]; + state[2] += t[2]; + state[3] += t[3]; + state[4] += e; + state[5] += t[5]; + state[6] += t[6]; + state[7] += t[7]; +} +*/ + #undef S0 #undef S1 #undef s0 #undef s1 - -const word32 SHA256::K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -void SHA224::InitState(HashWordType *state) -{ - state[0] = 0xc1059ed8; - state[1] = 0x367cd507; - state[2] = 0x3070dd17; - state[3] = 0xf70e5939; - state[4] = 0xffc00b31; - state[5] = 0x68581511; - state[6] = 0x64f98fa7; - state[7] = 0xbefa4fa4; -} +#undef R // ************************************************************* #ifdef WORD64_AVAILABLE +void SHA384::InitState(HashWordType *state) +{ + static const word64 s[8] = { + W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507), + W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939), + W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511), + W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)}; + memcpy(state, s, sizeof(s)); +} + void SHA512::InitState(HashWordType *state) { - state[0] = W64LIT(0x6a09e667f3bcc908); - 
state[1] = W64LIT(0xbb67ae8584caa73b); - state[2] = W64LIT(0x3c6ef372fe94f82b); - state[3] = W64LIT(0xa54ff53a5f1d36f1); - state[4] = W64LIT(0x510e527fade682d1); - state[5] = W64LIT(0x9b05688c2b3e6c1f); - state[6] = W64LIT(0x1f83d9abfb41bd6b); - state[7] = W64LIT(0x5be0cd19137e2179); + static const word64 s[8] = { + W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b), + W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1), + W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f), + W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)}; + memcpy(state, s, sizeof(s)); } -// for SHA512 -#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) -#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41)) -#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7)) -#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6)) - -void SHA512::Transform(word64 *state, const word64 *data) -{ - word64 W[16]; - word64 T[8]; - /* Copy context->state[] to working vars */ - memcpy(T, state, sizeof(T)); - /* 80 operations, partially loop unrolled */ - for (unsigned int j=0; j<80; j+=16) - { - R( 0); R( 1); R( 2); R( 3); - R( 4); R( 5); R( 6); R( 7); - R( 8); R( 9); R(10); R(11); - R(12); R(13); R(14); R(15); - } - /* Add the working vars back into context.state[] */ - state[0] += a(0); - state[1] += b(0); - state[2] += c(0); - state[3] += d(0); - state[4] += e(0); - state[5] += f(0); - state[6] += g(0); - state[7] += h(0); - /* Wipe variables */ - memset(W, 0, sizeof(W)); - memset(T, 0, sizeof(T)); -} - -const word64 SHA512::K[80] = { +CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), @@ -275,16 +308,231 @@ const word64 SHA512::K[80] = { W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) }; -void SHA384::InitState(HashWordType *state) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version +static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) { - state[0] = W64LIT(0xcbbb9d5dc1059ed8); - state[1] = W64LIT(0x629a292a367cd507); - state[2] = W64LIT(0x9159015a3070dd17); - state[3] = W64LIT(0x152fecd8f70e5939); - state[4] = W64LIT(0x67332667ffc00b31); - state[5] = W64LIT(0x8eb44a8768581511); - state[6] = W64LIT(0xdb0c2e0d64f98fa7); - state[7] = W64LIT(0x47b5481dbefa4fa4); +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS2( mov ebx, eax) +#else + AS2( lea ebx, SHA512_K) +#endif + + AS2( mov eax, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state + AS1( push eax) + AS2( xor eax, eax) + AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. 
will decrement pointer each round to avoid copying + AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8 + + AS2( movq mm4, [ecx+0*8]) + AS2( movq [edi+0*8], mm4) + AS2( movq mm0, [ecx+1*8]) + AS2( movq [edi+1*8], mm0) + AS2( movq mm0, [ecx+2*8]) + AS2( movq [edi+2*8], mm0) + AS2( movq mm0, [ecx+3*8]) + AS2( movq [edi+3*8], mm0) + AS2( movq mm5, [ecx+4*8]) + AS2( movq [edi+4*8], mm5) + AS2( movq mm0, [ecx+5*8]) + AS2( movq [edi+5*8], mm0) + AS2( movq mm0, [ecx+6*8]) + AS2( movq [edi+6*8], mm0) + AS2( movq mm0, [ecx+7*8]) + AS2( movq [edi+7*8], mm0) + ASJ( jmp, 0, f) + +#define SSE2_S0_S1(r, a, b, c) \ + AS2( movq mm6, r)\ + AS2( psrlq r, a)\ + AS2( movq mm7, r)\ + AS2( psllq mm6, 64-c)\ + AS2( pxor mm7, mm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor mm7, r)\ + AS2( psllq mm6, c-b)\ + AS2( pxor mm7, mm6)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, mm7)\ + AS2( psllq mm6, b-a)\ + AS2( pxor r, mm6) + +#define SSE2_s0(r, a, b, c) \ + AS2( movdqa xmm6, r)\ + AS2( psrlq r, a)\ + AS2( movdqa xmm7, r)\ + AS2( psllq xmm6, 64-c)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor xmm7, r)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, xmm7)\ + AS2( psllq xmm6, c-a)\ + AS2( pxor r, xmm6) + +#define SSE2_s1(r, a, b, c) \ + AS2( movdqa xmm6, r)\ + AS2( psrlq r, a)\ + AS2( movdqa xmm7, r)\ + AS2( psllq xmm6, 64-c)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, b-a)\ + AS2( pxor xmm7, r)\ + AS2( psllq xmm6, c-b)\ + AS2( pxor xmm7, xmm6)\ + AS2( psrlq r, c-b)\ + AS2( pxor r, xmm7) + + ASL(SHA512_Round) + // k + w is in mm0, a is in mm4, e is in mm5 + AS2( paddq mm0, [edi+7*8]) // h + AS2( movq mm2, [edi+5*8]) // f + AS2( movq mm3, [edi+6*8]) // g + AS2( pxor mm2, mm3) + AS2( pand mm2, mm5) + SSE2_S0_S1(mm5,14,18,41) + AS2( pxor mm2, mm3) + AS2( paddq mm0, mm2) // h += Ch(e,f,g) + AS2( paddq mm5, mm0) // h += S1(e) + AS2( movq mm2, [edi+1*8]) // b + AS2( movq mm1, mm2) + AS2( por mm2, mm4) + AS2( pand mm2, [edi+2*8]) // c + AS2( pand mm1, mm4) + AS2( por mm1, mm2) + AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c) + AS2( paddq mm5, [edi+3*8]) // e = d + h + AS2( movq [edi+3*8], mm5) + AS2( movq [edi+11*8], mm5) + SSE2_S0_S1(mm4,28,34,39) // S0(a) + AS2( paddq mm4, mm1) // a = temp + S0(a) + AS2( movq [edi-8], mm4) + AS2( movq [edi+7*8], mm4) + AS1( ret) + + // first 16 rounds + ASL(0) + AS2( movq mm0, [edx+eax*8]) + AS2( movq [esi+eax*8], mm0) + AS2( movq [esi+eax*8+16*8], mm0) + AS2( paddq mm0, [ebx+eax*8]) + ASC( call, SHA512_Round) + AS1( inc eax) + AS2( sub edi, 8) + AS2( test eax, 7) + ASJ( jnz, 0, b) + AS2( add edi, 8*8) + AS2( cmp eax, 16) + ASJ( jne, 0, b) + + // rest of the rounds + AS2( movdqu xmm0, [esi+(16-2)*8]) + ASL(1) + // data expansion, W[i-2] already in xmm0 + AS2( movdqu xmm3, [esi]) + AS2( paddq xmm3, [esi+(16-7)*8]) + AS2( movdqa xmm2, [esi+(16-15)*8]) + SSE2_s1(xmm0, 6, 19, 61) + AS2( paddq xmm0, xmm3) + SSE2_s0(xmm2, 1, 7, 8) + AS2( paddq xmm0, xmm2) + AS2( movdq2q mm0, xmm0) + AS2( movhlps xmm1, xmm0) + AS2( paddq mm0, [ebx+eax*8]) + AS2( movlps [esi], xmm0) + AS2( movlps [esi+8], xmm1) + AS2( movlps [esi+8*16], xmm0) + AS2( movlps [esi+8*17], xmm1) + // 2 rounds + ASC( call, SHA512_Round) + AS2( sub edi, 8) + AS2( movdq2q mm0, xmm1) + AS2( paddq mm0, [ebx+eax*8+8]) + ASC( call, SHA512_Round) + // update indices and loop + AS2( add esi, 16) + AS2( add eax, 2) + AS2( sub edi, 8) + AS2( test eax, 7) + ASJ( jnz, 1, b) + // do housekeeping every 8 rounds + AS2( mov esi, 0xf) + AS2( and esi, eax) + AS2( lea esi, [esp+4+20*8+8+esi*8]) + AS2( add edi, 8*8) + AS2( cmp eax, 80) + ASJ( jne, 1, b) + 
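// ********************************************************
// [Editor's note -- scalar sketch, not part of the patch] What one
// SHA512_Round above computes. kPlusW corresponds to mm0 (K[i] + W[i] on
// entry); the helpers mirror this file's S1/Ch/Maj/S0 macros, and the asm
// comments ("h += Ch(e,f,g)", "e = d + h", "a = temp + S0(a)") map onto:
static inline word64 BigSigma0(word64 x) {return rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39);}
static inline word64 BigSigma1(word64 x) {return rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41);}
static inline word64 ChF(word64 x, word64 y, word64 z) {return z ^ (x & (y ^ z));}
static inline word64 MajF(word64 x, word64 y, word64 z) {return (x & y) | (z & (x | y));}
static inline void SHA512RoundSketch(word64 s[8], word64 kPlusW)	// s = {a,b,c,d,e,f,g,h}
{
	word64 T1 = s[7] + BigSigma1(s[4]) + ChF(s[4], s[5], s[6]) + kPlusW;
	word64 T2 = BigSigma0(s[0]) + MajF(s[0], s[1], s[2]);
	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
}
// The asm avoids this eight-way shuffle by keeping the state in a sliding
// window and decrementing the edi pointer each round instead.
// ********************************************************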
+#define SSE2_CombineState(i) \ + AS2( movq mm0, [edi+i*8])\ + AS2( paddq mm0, [ecx+i*8])\ + AS2( movq [ecx+i*8], mm0) + + SSE2_CombineState(0) + SSE2_CombineState(1) + SSE2_CombineState(2) + SSE2_CombineState(3) + SSE2_CombineState(4) + SSE2_CombineState(5) + SSE2_CombineState(6) + SSE2_CombineState(7) + + AS1( pop esp) + AS1( emms) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (SHA512_K), "c" (state), "d" (data) + : "%esi", "%edi", "memory", "cc" + ); +#endif +} +#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + +void SHA512::Transform(word64 *state, const word64 *data) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + return SHA512_SSE2_Transform(state, data); +#endif + +#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) +#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41)) +#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7)) +#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6)) + +#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\ + d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) + + word64 W[16]; + word64 T[8]; + /* Copy context->state[] to working vars */ + memcpy(T, state, sizeof(T)); + /* 80 operations, partially loop unrolled */ + for (unsigned int j=0; j<80; j+=16) + { + R( 0); R( 1); R( 2); R( 3); + R( 4); R( 5); R( 6); R( 7); + R( 8); R( 9); R(10); R(11); + R(12); R(13); R(14); R(15); + } + /* Add the working vars back into context.state[] */ + state[0] += a(0); + state[1] += b(0); + state[2] += c(0); + state[3] += d(0); + state[4] += e(0); + state[5] += f(0); + state[6] += g(0); + state[7] += h(0); } #endif diff --git a/sha.h b/sha.h index 69b02ff7..40eb6df6 100644 --- a/sha.h +++ b/sha.h @@ -23,9 +23,6 @@ public: static void CRYPTOPP_API InitState(HashWordType *state); static void CRYPTOPP_API Transform(word32 *digest, const word32 *data); static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";} - -protected: - static const word32 K[64]; }; //! implements the SHA-224 standard @@ -46,9 +43,6 @@ public: static void CRYPTOPP_API InitState(HashWordType *state); static void CRYPTOPP_API Transform(word64 *digest, const word64 *data); static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";} - -protected: - static const word64 K[80]; }; //! 
implements the SHA-384 standard diff --git a/tiger.cpp b/tiger.cpp index b69e975a..332de2c6 100644 --- a/tiger.cpp +++ b/tiger.cpp @@ -3,6 +3,7 @@ #include "pch.h" #include "tiger.h" #include "misc.h" +#include "cpu.h" #ifdef WORD64_AVAILABLE @@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) m_data[7] = GetBitCountLo(); - Transform(m_digest, m_data); - CorrectEndianess(m_digest, m_digest, DigestSize()); - memcpy(hash, m_digest, size); + Transform(m_state, m_data); + CorrectEndianess(m_state, m_state, DigestSize()); + memcpy(hash, m_state, size); Restart(); // reinit for next use } +void Tiger::Transform (word64 *digest, const word64 *X) +{ +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + if (HasSSE2()) + { +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) +#else + AS2( mov eax, digest) + AS2( mov esi, X) + AS2( lea edx, [table]) +#endif + AS2( movq mm0, [eax]) + AS2( movq mm1, [eax+1*8]) + AS2( movq mm5, mm1) + AS2( movq mm2, [eax+2*8]) + AS2( movq mm7, [edx+4*2048+0*8]) + AS2( movq mm6, [edx+4*2048+1*8]) + AS2( mov ecx, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 8*8) + AS1( push ecx) + +#define SSE2_round(a,b,c,x,mul) \ + AS2( pxor c, [x])\ + AS2( movd ecx, c)\ + AS2( movzx edi, cl)\ + AS2( movq mm3, [edx+0*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( movq mm4, [edx+3*2048+edi*8])\ + AS2( shr ecx, 16)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+1*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+2*2048+edi*8])\ + AS3( pextrw ecx, c, 2)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+2*2048+edi*8])\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+1*2048+edi*8])\ + AS3( pextrw ecx, c, 3)\ + AS2( movzx edi, cl)\ + AS2( pxor mm3, [edx+3*2048+edi*8])\ + AS2( psubq a, mm3)\ + AS2( movzx edi, ch)\ + AS2( pxor mm4, [edx+0*2048+edi*8])\ + AS2( paddq b, mm4)\ + SSE2_mul_##mul(b) + +#define SSE2_mul_5(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 2)\ + AS2( paddq b, mm3) + +#define SSE2_mul_7(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( psubq b, mm3) + +#define SSE2_mul_9(b) \ + AS2( movq mm3, b)\ + AS2( psllq b, 3)\ + AS2( paddq b, mm3) + +#define label2_5 1 +#define label2_7 2 +#define label2_9 3 + +#define SSE2_pass(A,B,C,mul,X) \ + AS2( xor ebx, ebx)\ + ASL(mul)\ + SSE2_round(A,B,C,X+0*8+ebx,mul)\ + SSE2_round(B,C,A,X+1*8+ebx,mul)\ + AS2( cmp ebx, 6*8)\ + ASJ( je, label2_##mul, f)\ + SSE2_round(C,A,B,X+2*8+ebx,mul)\ + AS2( add ebx, 3*8)\ + ASJ( jmp, mul, b)\ + ASL(label2_##mul) + +#define SSE2_key_schedule(Y,X) \ + AS2( movq mm3, [X+7*8])\ + AS2( pxor mm3, mm6)\ + AS2( movq mm4, [X+0*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm4, [X+1*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+1*8], mm4)\ + AS2( paddq mm4, [X+2*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+2*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+3*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm4, [X+4*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+4*8], mm4)\ + AS2( paddq mm4, [X+5*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+5*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [X+6*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [X+7*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+7*8], mm4)\ + AS2( paddq mm4, [Y+0*8])\ + AS2( pxor mm3, mm7)\ + AS2( psllq mm3, 19)\ + AS2( movq [Y+0*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+1*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+1*8], mm4)\ + AS2( pxor mm4, [Y+2*8])\ + AS2( movq mm3, mm4)\ + AS2( movq [Y+2*8], 
mm4)\ + AS2( paddq mm4, [Y+3*8])\ + AS2( pxor mm3, mm7)\ + AS2( psrlq mm3, 23)\ + AS2( movq [Y+3*8], mm4)\ + AS2( pxor mm3, mm4)\ + AS2( movq mm4, [Y+4*8])\ + AS2( psubq mm4, mm3)\ + AS2( movq [Y+4*8], mm4)\ + AS2( pxor mm4, [Y+5*8])\ + AS2( movq [Y+5*8], mm4)\ + AS2( paddq mm4, [Y+6*8])\ + AS2( movq [Y+6*8], mm4)\ + AS2( pxor mm4, [edx+4*2048+2*8])\ + AS2( movq mm3, [Y+7*8])\ + AS2( psubq mm3, mm4)\ + AS2( movq [Y+7*8], mm3) + + SSE2_pass(mm0, mm1, mm2, 5, esi) + SSE2_key_schedule(esp+4, esi) + SSE2_pass(mm2, mm0, mm1, 7, esp+4) + SSE2_key_schedule(esp+4, esp+4) + SSE2_pass(mm1, mm2, mm0, 9, esp+4) + + AS2( pxor mm0, [eax+0*8]) + AS2( movq [eax+0*8], mm0) + AS2( psubq mm1, mm5) + AS2( movq [eax+1*8], mm1) + AS2( paddq mm2, [eax+2*8]) + AS2( movq [eax+2*8], mm2) + + AS1( pop esp) + AS1( emms) +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (digest), "S" (X), "d" (table) + : "%ecx", "%edi", "memory", "cc" + ); +#endif + } + else +#endif + { + word64 a = digest[0]; + word64 b = digest[1]; + word64 c = digest[2]; + word64 Y[8]; + #define t1 (table) #define t2 (table+256) #define t3 (table+256*2) @@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \ b *= mul -#define pass(a,b,c,mul,X) \ - round(a,b,c,X[0],mul); \ - round(b,c,a,X[1],mul); \ - round(c,a,b,X[2],mul); \ - round(a,b,c,X[3],mul); \ - round(b,c,a,X[4],mul); \ - round(c,a,b,X[5],mul); \ - round(a,b,c,X[6],mul); \ - round(b,c,a,X[7],mul) +#define pass(a,b,c,mul,X) {\ + int i=0;\ + while (true)\ + {\ + round(a,b,c,X[i+0],mul); \ + round(b,c,a,X[i+1],mul); \ + if (i==6)\ + break;\ + round(c,a,b,X[i+2],mul); \ + i+=3;\ + }} #define key_schedule(Y,X) \ Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \ @@ -70,24 +247,16 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) Y[6] += Y[5]; \ Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF) -void Tiger::Transform (word64 *digest, const word64 *X) -{ - word64 a = digest[0]; - word64 b = digest[1]; - word64 c = digest[2]; - word64 Y[8]; + pass(a,b,c,5,X); + key_schedule(Y,X); + pass(c,a,b,7,Y); + key_schedule(Y,Y); + pass(b,c,a,9,Y); - pass(a,b,c,5,X); - key_schedule(Y,X); - pass(c,a,b,7,Y); - key_schedule(Y,Y); - pass(b,c,a,9,Y); - - digest[0] = a ^ digest[0]; - digest[1] = b - digest[1]; - digest[2] = c + digest[2]; - - memset(Y, 0, sizeof(Y)); + digest[0] = a ^ digest[0]; + digest[1] = b - digest[1]; + digest[2] = c + digest[2]; + } } NAMESPACE_END diff --git a/tiger.h b/tiger.h index 66d1da2a..42bf1614 100644 --- a/tiger.h +++ b/tiger.h @@ -9,7 +9,7 @@ NAMESPACE_BEGIN(CryptoPP) -/// Tiger +/// Tiger class Tiger : public IteratedHashWithStaticTransform { public: @@ -19,7 +19,7 @@ public: static const char * StaticAlgorithmName() {return "Tiger";} protected: - static const word64 table[4*256]; + static const word64 table[4*256+3]; }; NAMESPACE_END diff --git a/whrlpool.cpp b/whrlpool.cpp index 989281a3..da19d7ff 100644 --- a/whrlpool.cpp +++ b/whrlpool.cpp @@ -1,7 +1,7 @@ -// Whrlpool.cpp - modified by Kevin Springle from +// whrlpool.cpp - originally modified by Kevin Springle from // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c. 
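// ********************************************************
// [Editor's note on the tiger.cpp change above -- sketch, not part of the
// patch] The rewritten pass() loop applies the standard Tiger round eight
// times per pass, with mul = 5, 7 and 9 on the three passes. Spelled out
// (t1..t4 are the quarter-table macros from tiger.cpp, with t4 assumed to
// be table+256*3; the a -= half of the round is taken from the Tiger
// reference code, only the b += half is visible in the hunk above):
static inline void TigerRoundSketch(word64 &a, word64 &b, word64 &c,
                                    word64 x, word64 mul)
{
	c ^= x;
	a -= t1[GETBYTE(c,0)] ^ t2[GETBYTE(c,2)] ^ t3[GETBYTE(c,4)] ^ t4[GETBYTE(c,6)];
	b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)];
	b *= mul;
}
// The three extra entries behind tiger.h's new table[4*256+3] hold the
// key-schedule constants the MMX path fetches via the [edx+4*2048+i*8]
// operands above, i.e. table[1024..1026].
// ********************************************************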
+// Updated to Whirlpool version 3.0, optimized and MMX version added by Wei Dai // Any modifications are placed in the public domain -// Updated to Whirlpool version 3.0 by Wei Dai // This is the original introductory comment: @@ -69,6 +69,7 @@ #include "whrlpool.h" #include "misc.h" +#include "cpu.h" NAMESPACE_BEGIN(CryptoPP) @@ -94,9 +95,9 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size) m_data[m_data.size()-2] = GetBitCountHi(); m_data[m_data.size()-1] = GetBitCountLo(); - Transform(m_digest, m_data); - CorrectEndianess(m_digest, m_digest, DigestSize()); - memcpy(hash, m_digest, size); + Transform(m_state, m_data); + CorrectEndianess(m_state, m_state, DigestSize()); + memcpy(hash, m_state, size); Restart(); // reinit for next use } @@ -113,7 +114,7 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size) * employed). */ -static const word64 C0[256] = { +CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTION_ALIGN16 = { W64LIT(0x18186018c07830d8), W64LIT(0x23238c2305af4626), W64LIT(0xc6c63fc67ef991b8), W64LIT(0xe8e887e8136fcdfb), W64LIT(0x878726874ca113cb), W64LIT(0xb8b8dab8a9626d11), W64LIT(0x0101040108050209), W64LIT(0x4f4f214f426e9e0d), W64LIT(0x3636d836adee6c9b), W64LIT(0xa6a6a2a6590451ff), W64LIT(0xd2d26fd2debdb90c), W64LIT(0xf5f5f3f5fb06f70e), @@ -177,11 +178,9 @@ static const word64 C0[256] = { W64LIT(0x16165816b04e2ca6), W64LIT(0x3a3ae83acdd274f7), W64LIT(0x6969b9696fd0d206), W64LIT(0x09092409482d1241), W64LIT(0x7070dd70a7ade0d7), W64LIT(0xb6b6e2b6d954716f), W64LIT(0xd0d067d0ceb7bd1e), W64LIT(0xeded93ed3b7ec7d6), W64LIT(0xcccc17cc2edb85e2), W64LIT(0x424215422a578468), W64LIT(0x98985a98b4c22d2c), W64LIT(0xa4a4aaa4490e55ed), - W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2), -}; + W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2), -static const word64 C1[256] = { - W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd), + W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd), W64LIT(0xcb878726874ca113), W64LIT(0x11b8b8dab8a9626d), W64LIT(0x0901010401080502), W64LIT(0x0d4f4f214f426e9e), W64LIT(0x9b3636d836adee6c), W64LIT(0xffa6a6a2a6590451), W64LIT(0x0cd2d26fd2debdb9), W64LIT(0x0ef5f5f3f5fb06f7), W64LIT(0x967979f979ef80f2), W64LIT(0x306f6fa16f5fcede), W64LIT(0x6d91917e91fcef3f), W64LIT(0xf852525552aa07a4), @@ -245,10 +244,8 @@ static const word64 C1[256] = { W64LIT(0xd77070dd70a7ade0), W64LIT(0x6fb6b6e2b6d95471), W64LIT(0x1ed0d067d0ceb7bd), W64LIT(0xd6eded93ed3b7ec7), W64LIT(0xe2cccc17cc2edb85), W64LIT(0x68424215422a5784), W64LIT(0x2c98985a98b4c22d), W64LIT(0xeda4a4aaa4490e55), W64LIT(0x752828a0285d8850), W64LIT(0x865c5c6d5cda31b8), W64LIT(0x6bf8f8c7f8933fed), W64LIT(0xc28686228644a411), -}; -static const word64 C2[256] = { - W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f), + W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f), W64LIT(0x13cb878726874ca1), W64LIT(0x6d11b8b8dab8a962), W64LIT(0x0209010104010805), W64LIT(0x9e0d4f4f214f426e), W64LIT(0x6c9b3636d836adee), W64LIT(0x51ffa6a6a2a65904), W64LIT(0xb90cd2d26fd2debd), W64LIT(0xf70ef5f5f3f5fb06), W64LIT(0xf2967979f979ef80), W64LIT(0xde306f6fa16f5fce), W64LIT(0x3f6d91917e91fcef), W64LIT(0xa4f852525552aa07), @@ -312,10 +309,8 @@ static const 
word64 C2[256] = { W64LIT(0xe0d77070dd70a7ad), W64LIT(0x716fb6b6e2b6d954), W64LIT(0xbd1ed0d067d0ceb7), W64LIT(0xc7d6eded93ed3b7e), W64LIT(0x85e2cccc17cc2edb), W64LIT(0x8468424215422a57), W64LIT(0x2d2c98985a98b4c2), W64LIT(0x55eda4a4aaa4490e), W64LIT(0x50752828a0285d88), W64LIT(0xb8865c5c6d5cda31), W64LIT(0xed6bf8f8c7f8933f), W64LIT(0x11c28686228644a4), -}; -static const word64 C3[256] = { - W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813), + W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813), W64LIT(0xa113cb878726874c), W64LIT(0x626d11b8b8dab8a9), W64LIT(0x0502090101040108), W64LIT(0x6e9e0d4f4f214f42), W64LIT(0xee6c9b3636d836ad), W64LIT(0x0451ffa6a6a2a659), W64LIT(0xbdb90cd2d26fd2de), W64LIT(0x06f70ef5f5f3f5fb), W64LIT(0x80f2967979f979ef), W64LIT(0xcede306f6fa16f5f), W64LIT(0xef3f6d91917e91fc), W64LIT(0x07a4f852525552aa), @@ -379,9 +374,7 @@ static const word64 C3[256] = { W64LIT(0xade0d77070dd70a7), W64LIT(0x54716fb6b6e2b6d9), W64LIT(0xb7bd1ed0d067d0ce), W64LIT(0x7ec7d6eded93ed3b), W64LIT(0xdb85e2cccc17cc2e), W64LIT(0x578468424215422a), W64LIT(0xc22d2c98985a98b4), W64LIT(0x0e55eda4a4aaa449), W64LIT(0x8850752828a0285d), W64LIT(0x31b8865c5c6d5cda), W64LIT(0x3fed6bf8f8c7f893), W64LIT(0xa411c28686228644), -}; -static const word64 rc[R] = { W64LIT(0x1823c6e887b8014f), W64LIT(0x36a6d2f5796f9152), W64LIT(0x60bc9b8ea30c7b35), @@ -397,55 +390,292 @@ static const word64 rc[R] = { // Whirlpool basic transformation. Transforms state based on block. void Whirlpool::Transform(word64 *digest, const word64 *block) { +#ifdef CRYPTOPP_X86_ASM_AVAILABLE + if (HasMMX()) + { + // MMX version has the same structure as C version below +#ifdef __GNUC__ + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + AS1( push ebx) + AS2( mov ebx, eax) +#else + AS2( lea ebx, [Whirlpool_C]) + AS2( mov ecx, digest) + AS2( mov edx, block) +#endif + AS2( mov eax, esp) + AS2( and esp, 0xfffffff0) + AS2( sub esp, 16*8) + AS1( push eax) + AS2( xor esi, esi) + ASL(0) + AS2( movq mm0, [ecx+8*esi]) + AS2( movq [esp+4+8*esi], mm0) // k + AS2( pxor mm0, [edx+8*esi]) + AS2( movq [esp+4+64+8*esi], mm0) // s + AS2( movq [ecx+8*esi], mm0) + AS1( inc esi) + AS2( cmp esi, 8) + ASJ( jne, 0, b) + + AS2( xor esi, esi) + ASL(1) + +#define KSL0(a, b) AS2(movq mm##a, b) +#define KSL1(a, b) AS2(pxor mm##a, b) + +#define KSL(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+8*i])\ + AS2(movzx edi, al)\ + KSL##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSL##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSL##op(c, [ebx+1*2048+8*edi])\ + KSL##op(d, [ebx+0*2048+8*eax]) + +#define KSH0(a, b) \ + ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ + AS2(pxor mm##a, b) +#define KSH1(a, b) \ + AS2(pxor mm##a, b) +#define KSH2(a, b) \ + AS2(pxor mm##a, b)\ + AS2(movq [esp+4+8*a], mm##a) + +#define KSH(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\ + AS2(movzx edi, al)\ + KSH##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSH##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSH##op(c, [ebx+1*2048+8*edi])\ + KSH##op(d, [ebx+0*2048+8*eax]) + +#define TSL(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+64+8*i])\ + AS2(movzx edi, al)\ + KSL##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + KSL##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + KSL##op(c, [ebx+1*2048+8*edi])\ + KSL##op(d, 
[ebx+0*2048+8*eax]) + +#define TSH0(a, b) \ + ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ + AS2(pxor mm##a, [esp+4+8*a])\ + AS2(pxor mm##a, b) +#define TSH1(a, b) \ + AS2(pxor mm##a, b) +#define TSH2(a, b) \ + AS2(pxor mm##a, b)\ + AS2(movq [esp+4+64+8*a], mm##a) +#define TSH3(a, b) \ + AS2(pxor mm##a, b)\ + AS2(pxor mm##a, [ecx+8*a])\ + AS2(movq [ecx+8*a], mm##a) + +#define TSH(op, i, a, b, c, d) \ + AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\ + AS2(movzx edi, al)\ + TSH##op(a, [ebx+3*2048+8*edi])\ + AS2(movzx edi, ah)\ + TSH##op(b, [ebx+2*2048+8*edi])\ + AS2(shr eax, 16)\ + AS2(movzx edi, al)\ + AS2(shr eax, 8)\ + TSH##op(c, [ebx+1*2048+8*edi])\ + TSH##op(d, [ebx+0*2048+8*eax]) + + KSL(0, 4, 3, 2, 1, 0) + KSL(0, 0, 7, 6, 5, 4) + KSL(1, 1, 0, 7, 6, 5) + KSL(1, 2, 1, 0, 7, 6) + KSL(1, 3, 2, 1, 0, 7) + KSL(1, 5, 4, 3, 2, 1) + KSL(1, 6, 5, 4, 3, 2) + KSL(1, 7, 6, 5, 4, 3) + KSH(0, 0, 7, 6, 5, 4) + KSH(0, 4, 3, 2, 1, 0) + KSH(1, 1, 0, 7, 6, 5) + KSH(1, 2, 1, 0, 7, 6) + KSH(1, 5, 4, 3, 2, 1) + KSH(1, 6, 5, 4, 3, 2) + KSH(2, 3, 2, 1, 0, 7) + KSH(2, 7, 6, 5, 4, 3) + + AS2( pxor mm0, [ebx + 8*1024 + esi*8]) + AS2( movq [esp+4], mm0) + + TSL(0, 4, 3, 2, 1, 0) + TSL(0, 0, 7, 6, 5, 4) + TSL(1, 1, 0, 7, 6, 5) + TSL(1, 2, 1, 0, 7, 6) + TSL(1, 3, 2, 1, 0, 7) + TSL(1, 5, 4, 3, 2, 1) + TSL(1, 6, 5, 4, 3, 2) + TSL(1, 7, 6, 5, 4, 3) + TSH(0, 0, 7, 6, 5, 4) + TSH(0, 4, 3, 2, 1, 0) + TSH(1, 1, 0, 7, 6, 5) + TSH(1, 2, 1, 0, 7, 6) + TSH(1, 5, 4, 3, 2, 1) + TSH(1, 6, 5, 4, 3, 2) + + AS1( inc esi) + AS2( cmp esi, 10) + ASJ( je, 2, f) + + TSH(2, 3, 2, 1, 0, 7) + TSH(2, 7, 6, 5, 4, 3) + + ASJ( jmp, 1, b) + ASL(2) + + TSH(3, 3, 2, 1, 0, 7) + TSH(3, 7, 6, 5, 4, 3) + +#undef KSL +#undef KSH +#undef TSL +#undef TSH + + AS1( emms) + AS1( pop esp) + +#ifdef __GNUC__ + AS1( pop ebx) + ".att_syntax prefix;" + : + : "a" (Whirlpool_C), "c" (digest), "d" (block) + : "%esi", "%edi", "memory", "cc" + ); +#endif + } + else +#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE + { word64 s[8]; // the cipher state word64 k[8]; // the round key // Compute and apply K^0 to the cipher state // Also apply part of the Miyaguchi-Preneel compression function - digest[0] = s[0] = block[0] ^ (k[0] = digest[0]); - digest[1] = s[1] = block[1] ^ (k[1] = digest[1]); - digest[2] = s[2] = block[2] ^ (k[2] = digest[2]); - digest[3] = s[3] = block[3] ^ (k[3] = digest[3]); - digest[4] = s[4] = block[4] ^ (k[4] = digest[4]); - digest[5] = s[5] = block[5] ^ (k[5] = digest[5]); - digest[6] = s[6] = block[6] ^ (k[6] = digest[6]); - digest[7] = s[7] = block[7] ^ (k[7] = digest[7]); + for (int i=0; i<8; i++) + digest[i] = s[i] = block[i] ^ (k[i] = digest[i]); + +#define KSL(op, i, a, b, c, d) \ + t = (word32)k[i];\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0); + +#define KSH(op, i, a, b, c, d) \ + t = (word32)(k[(i+4)%8]>>32);\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32));\ + if (op==2) k[a] = w##a;\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : rotrFixed(w##b, 32));\ + if (op==2) k[b] = w##b;\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : rotrFixed(w##c, 32));\ + if (op==2) k[c] = w##c;\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? 
w##d : rotrFixed(w##d, 32));\ + if (op==2) k[d] = w##d;\ + +#define TSL(op, i, a, b, c, d) \ + t = (word32)s[i];\ + w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\ + t >>= 8;\ + w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\ + t >>= 8;\ + w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\ + t >>= 8;\ + w##d = Whirlpool_C[0*256 + t] ^ (op ? w##d : 0); + +#define TSH_OP(op, a, b) \ + w##a = Whirlpool_C[b*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32) ^ k[a]);\ + if (op==2) s[a] = w##a;\ + if (op==3) digest[a] ^= w##a;\ + +#define TSH(op, i, a, b, c, d) \ + t = (word32)(s[(i+4)%8]>>32);\ + TSH_OP(op, a, 3);\ + t >>= 8;\ + TSH_OP(op, b, 2);\ + t >>= 8;\ + TSH_OP(op, c, 1);\ + t >>= 8;\ + TSH_OP(op, d, 0);\ // Iterate over all rounds: - for (int r = 0; r < R; r++) + int r=0; + while (true) { word64 w0, w1, w2, w3, w4, w5, w6, w7; // temporary storage - word64 t; + word32 t; - // Compute K^r from K^{r-1}: -#define K(i,j) GETBYTE(k[(i+j+1)%8], j) -#define KS(i) \ - t = C0[K(i,3)] ^ C1[K(i,2)] ^ C2[K(i,1)] ^ C3[K(i,0)]; \ - w##i = rotrFixed(t, 32) ^ C0[K(i,7)] ^ C1[K(i,6)] ^ C2[K(i,5)] ^ C3[K(i,4)]; + KSL(0, 4, 3, 2, 1, 0) + KSL(0, 0, 7, 6, 5, 4) + KSL(1, 1, 0, 7, 6, 5) + KSL(1, 2, 1, 0, 7, 6) + KSL(1, 3, 2, 1, 0, 7) + KSL(1, 5, 4, 3, 2, 1) + KSL(1, 6, 5, 4, 3, 2) + KSL(1, 7, 6, 5, 4, 3) + KSH(0, 0, 7, 6, 5, 4) + KSH(0, 4, 3, 2, 1, 0) + KSH(1, 1, 0, 7, 6, 5) + KSH(1, 2, 1, 0, 7, 6) + KSH(1, 5, 4, 3, 2, 1) + KSH(1, 6, 5, 4, 3, 2) + KSH(2, 3, 2, 1, 0, 7) + KSH(2, 7, 6, 5, 4, 3) - KS(0); KS(1); KS(2); KS(3); KS(4); KS(5); KS(6); KS(7); - k[0] = w0 ^ rc[r]; - k[1] = w1; k[2] = w2; k[3] = w3; k[4] = w4; k[5] = w5; k[6] = w6; k[7] = w7; + k[0] ^= Whirlpool_C[1024+r]; - // Apply the r-th round transformation: -#define S(i,j) GETBYTE(s[(i+j+1)%8], j) -#define TS(i) \ - t = C0[S(i,3)] ^ C1[S(i,2)] ^ C2[S(i,1)] ^ C3[S(i,0)]; \ - w##i = rotrFixed(t, 32) ^ C0[S(i,7)] ^ C1[S(i,6)] ^ C2[S(i,5)] ^ C3[S(i,4)] ^ k[i]; + TSL(0, 4, 3, 2, 1, 0) + TSL(0, 0, 7, 6, 5, 4) + TSL(1, 1, 0, 7, 6, 5) + TSL(1, 2, 1, 0, 7, 6) + TSL(1, 3, 2, 1, 0, 7) + TSL(1, 5, 4, 3, 2, 1) + TSL(1, 6, 5, 4, 3, 2) + TSL(1, 7, 6, 5, 4, 3) + TSH(0, 0, 7, 6, 5, 4) + TSH(0, 4, 3, 2, 1, 0) + TSH(1, 1, 0, 7, 6, 5) + TSH(1, 2, 1, 0, 7, 6) + TSH(1, 5, 4, 3, 2, 1) + TSH(1, 6, 5, 4, 3, 2) - TS(0); TS(1); TS(2); TS(3); TS(4); TS(5); TS(6); TS(7); - s[0] = w0; s[1] = w1; s[2] = w2; s[3] = w3; s[4] = w4; s[5] = w5; s[6] = w6; s[7] = w7; + if (++r < R) + { + TSH(2, 3, 2, 1, 0, 7) + TSH(2, 7, 6, 5, 4, 3) + } + else + { + TSH(3, 3, 2, 1, 0, 7) + TSH(3, 7, 6, 5, 4, 3) + break; + } + } } - - // Apply the rest of the Miyaguchi-Preneel compression function: - digest[0] ^= s[0]; - digest[1] ^= s[1]; - digest[2] ^= s[2]; - digest[3] ^= s[3]; - digest[4] ^= s[4]; - digest[5] ^= s[5]; - digest[6] ^= s[6]; - digest[7] ^= s[7]; } NAMESPACE_END diff --git a/whrlpool.h b/whrlpool.h index c6971f08..298850ab 100644 --- a/whrlpool.h +++ b/whrlpool.h @@ -9,8 +9,7 @@ NAMESPACE_BEGIN(CryptoPP) -//! Whirlpool -/*! 512 Bit Hash */ +//! Whirlpool class Whirlpool : public IteratedHashWithStaticTransform { public:
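// ********************************************************
// [Editor's note -- restatement, not part of the patch] The four Whirlpool
// tables C0..C3 survive as the 256-entry quarters of the merged
// Whirlpool_C, and the old rc[] round constants are appended at
// Whirlpool_C[1024..1024+R-1] (hence k[0] ^= Whirlpool_C[1024+r] above).
// One output lane of the key schedule, in the notation of the removed
// code, where K(i,j) = GETBYTE(k[(i+j+1)%8], j):
static inline word64 KeyLaneSketch(const word64 k[8], int i)
{
	word64 t = Whirlpool_C[0*256 + K(i,3)] ^ Whirlpool_C[1*256 + K(i,2)]
	         ^ Whirlpool_C[2*256 + K(i,1)] ^ Whirlpool_C[3*256 + K(i,0)];
	return rotrFixed(t, 32)
	     ^ Whirlpool_C[0*256 + K(i,7)] ^ Whirlpool_C[1*256 + K(i,6)]
	     ^ Whirlpool_C[2*256 + K(i,5)] ^ Whirlpool_C[3*256 + K(i,4)];
}
// The new KSL/KSH (and TSL/TSH) macro pairs compute the same eight lanes
// 32 bits at a time, so the MMX path and the portable path can share one
// table layout.
// ********************************************************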