unify GCC and MSVC multiplication code
parent
005b94f755
commit
3d354a8bf2
708
integer.cpp
708
integer.cpp
|
|
@ -27,7 +27,7 @@
|
||||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||||
#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
|
#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
|
||||||
#elif defined(__GNUC__) && defined(__i386__)
|
#elif defined(__GNUC__) && defined(__i386__)
|
||||||
#pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.")
|
#warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
@ -859,22 +859,69 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
|
||||||
// CodeWarrior defines _MSC_VER
|
// CodeWarrior defines _MSC_VER
|
||||||
#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__))
|
#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__))
|
||||||
|
|
||||||
|
// ************** x86 feature detection ***************
|
||||||
|
|
||||||
|
static bool s_sse2Enabled = true;
|
||||||
|
|
||||||
|
static void CpuId(word32 input, word32 *output)
|
||||||
|
{
|
||||||
|
#ifdef __GNUC__
|
||||||
|
__asm__
|
||||||
|
(
|
||||||
|
"cpuid"
|
||||||
|
: "=a" (output[0]), "=b" (output[1]), "=c" (output[2]), "=d" (output[3])
|
||||||
|
: "a" (input)
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
__asm
|
||||||
|
{
|
||||||
|
mov eax, input
|
||||||
|
cpuid
|
||||||
|
mov edi, output
|
||||||
|
mov [edi], eax
|
||||||
|
mov [edi+4], ebx
|
||||||
|
mov [edi+8], ecx
|
||||||
|
mov [edi+12], edx
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool HasSSE2()
|
||||||
|
{
|
||||||
|
if (!s_sse2Enabled)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
word32 cpuid[4];
|
||||||
|
CpuId(1, cpuid);
|
||||||
|
return (cpuid[3] & (1 << 26)) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool IsP4()
|
||||||
|
{
|
||||||
|
word32 cpuid[4];
|
||||||
|
|
||||||
|
CpuId(0, cpuid);
|
||||||
|
std::swap(cpuid[2], cpuid[3]);
|
||||||
|
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
CpuId(1, cpuid);
|
||||||
|
return ((cpuid[0] >> 8) & 0xf) == 0xf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ************** Pentium/P4 optimizations ***************
|
||||||
|
|
||||||
class PentiumOptimized : public Portable
|
class PentiumOptimized : public Portable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
|
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
|
||||||
static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
|
static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
|
||||||
#ifdef __GNUC__
|
static void CRYPTOPP_CDECL Multiply4(word *C, const word *A, const word *B);
|
||||||
static void Square4(word *R, const word *A);
|
static void CRYPTOPP_CDECL Multiply8(word *C, const word *A, const word *B);
|
||||||
static void Multiply4(word *C, const word *A, const word *B);
|
static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
|
||||||
static void Multiply8(word *C, const word *A, const word *B);
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef PentiumOptimized LowLevel;
|
class P4Optimized
|
||||||
|
|
||||||
// this may be selected at run time
|
|
||||||
class P4Optimized : public PentiumOptimized
|
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
|
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
|
||||||
|
|
@ -883,7 +930,70 @@ public:
|
||||||
static void Multiply4(word *C, const word *A, const word *B);
|
static void Multiply4(word *C, const word *A, const word *B);
|
||||||
static void Multiply8(word *C, const word *A, const word *B);
|
static void Multiply8(word *C, const word *A, const word *B);
|
||||||
static void Multiply8Bottom(word *C, const word *A, const word *B);
|
static void Multiply8Bottom(word *C, const word *A, const word *B);
|
||||||
static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);}
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
|
||||||
|
typedef void (* PMul)(word *C, const word *A, const word *B);
|
||||||
|
|
||||||
|
static PAddSub s_pAdd, s_pSub;
|
||||||
|
#ifdef SSE2_INTRINSICS_AVAILABLE
|
||||||
|
static PMul s_pMul4, s_pMul8, s_pMul8B;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void SetPentiumFunctionPointers()
|
||||||
|
{
|
||||||
|
if (IsP4())
|
||||||
|
{
|
||||||
|
s_pAdd = &P4Optimized::Add;
|
||||||
|
s_pSub = &P4Optimized::Subtract;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
s_pAdd = &PentiumOptimized::Add;
|
||||||
|
s_pSub = &PentiumOptimized::Subtract;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef SSE2_INTRINSICS_AVAILABLE
|
||||||
|
if (HasSSE2())
|
||||||
|
{
|
||||||
|
s_pMul4 = &P4Optimized::Multiply4;
|
||||||
|
s_pMul8 = &P4Optimized::Multiply8;
|
||||||
|
s_pMul8B = &P4Optimized::Multiply8Bottom;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
s_pMul4 = &PentiumOptimized::Multiply4;
|
||||||
|
s_pMul8 = &PentiumOptimized::Multiply8;
|
||||||
|
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
|
||||||
|
|
||||||
|
void DisableSSE2()
|
||||||
|
{
|
||||||
|
s_sse2Enabled = false;
|
||||||
|
SetPentiumFunctionPointers();
|
||||||
|
}
|
||||||
|
|
||||||
|
class LowLevel : public PentiumOptimized
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
inline static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N)
|
||||||
|
{return s_pAdd(C, A, B, N);}
|
||||||
|
inline static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N)
|
||||||
|
{return s_pSub(C, A, B, N);}
|
||||||
|
inline static void Square4(word *R, const word *A)
|
||||||
|
{Multiply4(R, A, A);}
|
||||||
|
#ifdef SSE2_INTRINSICS_AVAILABLE
|
||||||
|
inline static void Multiply4(word *C, const word *A, const word *B)
|
||||||
|
{s_pMul4(C, A, B);}
|
||||||
|
inline static void Multiply8(word *C, const word *A, const word *B)
|
||||||
|
{s_pMul8(C, A, B);}
|
||||||
|
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
|
||||||
|
{s_pMul8B(C, A, B);}
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -892,7 +1002,7 @@ public:
|
||||||
#define CRYPTOPP_NAKED __declspec(naked)
|
#define CRYPTOPP_NAKED __declspec(naked)
|
||||||
#define AS1(x) __asm x
|
#define AS1(x) __asm x
|
||||||
#define AS2(x, y) __asm x, y
|
#define AS2(x, y) __asm x, y
|
||||||
#define PentiumPrologue \
|
#define AddPrologue \
|
||||||
__asm push ebp \
|
__asm push ebp \
|
||||||
__asm push ebx \
|
__asm push ebx \
|
||||||
__asm push esi \
|
__asm push esi \
|
||||||
|
|
@ -901,55 +1011,67 @@ public:
|
||||||
__asm mov edx, [esp+24] \
|
__asm mov edx, [esp+24] \
|
||||||
__asm mov ebx, [esp+28] \
|
__asm mov ebx, [esp+28] \
|
||||||
__asm mov esi, [esp+32]
|
__asm mov esi, [esp+32]
|
||||||
#define PentiumEpilogue \
|
#define AddEpilogue \
|
||||||
__asm pop edi \
|
__asm pop edi \
|
||||||
__asm pop esi \
|
__asm pop esi \
|
||||||
__asm pop ebx \
|
__asm pop ebx \
|
||||||
__asm pop ebp \
|
__asm pop ebp \
|
||||||
__asm ret
|
__asm ret
|
||||||
#define P4Prologue \
|
#define MulPrologue \
|
||||||
__asm sub esp, 16 \
|
__asm push ebp \
|
||||||
__asm mov [esp], edi \
|
__asm push ebx \
|
||||||
__asm mov [esp+4], esi \
|
__asm push esi \
|
||||||
__asm mov [esp+8], ebx \
|
__asm push edi \
|
||||||
__asm mov [esp+12], ebp \
|
__asm mov ecx, [esp+28] \
|
||||||
__asm mov ecx, [esp+20] \
|
__asm mov esi, [esp+24] \
|
||||||
__asm mov edx, [esp+24] \
|
__asm push [esp+20]
|
||||||
__asm mov ebx, [esp+28] \
|
#define MulEpilogue \
|
||||||
__asm mov esi, [esp+32]
|
__asm add esp, 4 \
|
||||||
#define P4Epilogue \
|
__asm pop edi \
|
||||||
__asm mov edi, [esp] \
|
__asm pop esi \
|
||||||
__asm mov esi, [esp+4] \
|
__asm pop ebx \
|
||||||
__asm mov ebx, [esp+8] \
|
__asm pop ebp \
|
||||||
__asm mov ebp, [esp+12] \
|
|
||||||
__asm add esp, 16 \
|
|
||||||
__asm ret
|
__asm ret
|
||||||
#else
|
#else
|
||||||
#define CRYPTOPP_NAKED
|
#define CRYPTOPP_NAKED
|
||||||
#define AS1(x) #x ";"
|
#define AS1(x) #x ";"
|
||||||
#define AS2(x, y) #x ", " #y ";"
|
#define AS2(x, y) #x ", " #y ";"
|
||||||
#define PentiumPrologue \
|
#define AddPrologue \
|
||||||
__asm__ \
|
__asm__ __volatile__ \
|
||||||
( \
|
( \
|
||||||
".att_syntax prefix;" \
|
"push %%ebx;" /* save this manually, in case of -fPIC */ \
|
||||||
"mov %0, %%ecx;" \
|
|
||||||
"mov %1, %%edx;" \
|
|
||||||
"mov %2, %%ebx;" \
|
"mov %2, %%ebx;" \
|
||||||
"mov %3, %%esi;" \
|
".intel_syntax noprefix;" \
|
||||||
|
"push ebp;"
|
||||||
|
#define AddEpilogue \
|
||||||
|
"pop ebp;" \
|
||||||
|
".att_syntax prefix;" \
|
||||||
|
"pop %%ebx;" \
|
||||||
|
: \
|
||||||
|
: "c" (C), "d" (A), "m" (B), "S" (N) \
|
||||||
|
: "%edi", "memory", "cc" \
|
||||||
|
);
|
||||||
|
#define MulPrologue \
|
||||||
|
__asm__ __volatile__ \
|
||||||
|
( \
|
||||||
|
"push %%ebx;" /* save this manually, in case of -fPIC */ \
|
||||||
|
"push %%ebp;" \
|
||||||
|
"push %0;" \
|
||||||
".intel_syntax noprefix;"
|
".intel_syntax noprefix;"
|
||||||
#define PentiumEpilogue \
|
#define MulEpilogue \
|
||||||
|
"add esp, 4;" \
|
||||||
|
"pop ebp;" \
|
||||||
|
"pop ebx;" \
|
||||||
".att_syntax prefix;" \
|
".att_syntax prefix;" \
|
||||||
: \
|
: \
|
||||||
: "m" (C), "m" (A), "m" (B), "m" (N) \
|
: "rm" (Z), "S" (X), "c" (Y) \
|
||||||
: "%ecx", "%edx", "%ebx", "%esi", "%edi" \
|
: "%eax", "%edx", "%edi", "memory", "cc" \
|
||||||
);
|
);
|
||||||
#define P4Prologue PentiumPrologue
|
|
||||||
#define P4Epilogue PentiumEpilogue
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
|
CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
PentiumPrologue
|
AddPrologue
|
||||||
|
|
||||||
// now: ebx = B, ecx = C, edx = A, esi = N
|
// now: ebx = B, ecx = C, edx = A, esi = N
|
||||||
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
|
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
|
||||||
|
|
@ -982,12 +1104,12 @@ CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B,
|
||||||
AS1(loopendAdd:)
|
AS1(loopendAdd:)
|
||||||
AS2( adc eax, 0) // store carry into eax (return result register)
|
AS2( adc eax, 0) // store carry into eax (return result register)
|
||||||
|
|
||||||
PentiumEpilogue
|
AddEpilogue
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
|
CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
PentiumPrologue
|
AddPrologue
|
||||||
|
|
||||||
// now: ebx = B, ecx = C, edx = A, esi = N
|
// now: ebx = B, ecx = C, edx = A, esi = N
|
||||||
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
|
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
|
||||||
|
|
@ -1020,12 +1142,14 @@ CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const wor
|
||||||
AS1(loopendSub:)
|
AS1(loopendSub:)
|
||||||
AS2( adc eax, 0) // store carry into eax (return result register)
|
AS2( adc eax, 0) // store carry into eax (return result register)
|
||||||
|
|
||||||
PentiumEpilogue
|
AddEpilogue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
|
||||||
|
|
||||||
CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
|
CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
P4Prologue
|
AddPrologue
|
||||||
|
|
||||||
// now: ebx = B, ecx = C, edx = A, esi = N
|
// now: ebx = B, ecx = C, edx = A, esi = N
|
||||||
AS2( xor eax, eax)
|
AS2( xor eax, eax)
|
||||||
|
|
@ -1077,12 +1201,12 @@ CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsi
|
||||||
|
|
||||||
AS1(loopendAddP4:)
|
AS1(loopendAddP4:)
|
||||||
|
|
||||||
P4Epilogue
|
AddEpilogue
|
||||||
}
|
}
|
||||||
|
|
||||||
CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
|
CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
P4Prologue
|
AddPrologue
|
||||||
|
|
||||||
// now: ebx = B, ecx = C, edx = A, esi = N
|
// now: ebx = B, ecx = C, edx = A, esi = N
|
||||||
AS2( xor eax, eax)
|
AS2( xor eax, eax)
|
||||||
|
|
@ -1134,147 +1258,51 @@ CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B,
|
||||||
|
|
||||||
AS1(loopendSubP4:)
|
AS1(loopendSubP4:)
|
||||||
|
|
||||||
P4Epilogue
|
AddEpilogue
|
||||||
}
|
}
|
||||||
|
|
||||||
#if __GNUC__
|
// multiply assembly code originally contributed by Leonard Janke
|
||||||
// Comba square and multiply assembly code originally contributed by Leonard Janke
|
|
||||||
// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code.
|
|
||||||
|
|
||||||
#define SqrStartup \
|
|
||||||
"push %%ebp\n\t" \
|
|
||||||
"push %%esi\n\t" \
|
|
||||||
"push %%ebx\n\t" \
|
|
||||||
"xor %%ebp, %%ebp\n\t" \
|
|
||||||
"xor %%ebx, %%ebx\n\t" \
|
|
||||||
"xor %%ecx, %%ecx\n\t"
|
|
||||||
|
|
||||||
#define SqrShiftCarry \
|
|
||||||
"mov %%ebx, %%ebp\n\t" \
|
|
||||||
"mov %%ecx, %%ebx\n\t" \
|
|
||||||
"xor %%ecx, %%ecx\n\t"
|
|
||||||
|
|
||||||
#define SqrAccumulate(i,j) \
|
|
||||||
"mov 4*"#j"(%%esi), %%eax\n\t" \
|
|
||||||
"mull 4*"#i"(%%esi)\n\t" \
|
|
||||||
"add %%eax, %%ebp\n\t" \
|
|
||||||
"adc %%edx, %%ebx\n\t" \
|
|
||||||
"adc %%ch, %%cl\n\t" \
|
|
||||||
"add %%eax, %%ebp\n\t" \
|
|
||||||
"adc %%edx, %%ebx\n\t" \
|
|
||||||
"adc %%ch, %%cl\n\t"
|
|
||||||
|
|
||||||
#define SqrAccumulateCentre(i) \
|
|
||||||
"mov 4*"#i"(%%esi), %%eax\n\t" \
|
|
||||||
"mull 4*"#i"(%%esi)\n\t" \
|
|
||||||
"add %%eax, %%ebp\n\t" \
|
|
||||||
"adc %%edx, %%ebx\n\t" \
|
|
||||||
"adc %%ch, %%cl\n\t"
|
|
||||||
|
|
||||||
#define SqrStoreDigit(X) \
|
|
||||||
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
|
|
||||||
|
|
||||||
#define SqrLastDiagonal(digits) \
|
|
||||||
"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
|
|
||||||
"mull 4*("#digits"-1)(%%esi)\n\t" \
|
|
||||||
"add %%eax, %%ebp\n\t" \
|
|
||||||
"adc %%edx, %%ebx\n\t" \
|
|
||||||
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
|
|
||||||
"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
|
|
||||||
|
|
||||||
#define SqrCleanup \
|
|
||||||
"pop %%ebx\n\t" \
|
|
||||||
"pop %%esi\n\t" \
|
|
||||||
"pop %%ebp\n\t"
|
|
||||||
|
|
||||||
void PentiumOptimized::Square4(word* Y, const word* X)
|
|
||||||
{
|
|
||||||
__asm__ __volatile__(
|
|
||||||
SqrStartup
|
|
||||||
|
|
||||||
SqrAccumulateCentre(0)
|
|
||||||
SqrStoreDigit(0)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrAccumulate(1,0)
|
|
||||||
SqrStoreDigit(1)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrAccumulate(2,0)
|
|
||||||
SqrAccumulateCentre(1)
|
|
||||||
SqrStoreDigit(2)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrAccumulate(3,0)
|
|
||||||
SqrAccumulate(2,1)
|
|
||||||
SqrStoreDigit(3)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrAccumulate(3,1)
|
|
||||||
SqrAccumulateCentre(2)
|
|
||||||
SqrStoreDigit(4)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrAccumulate(3,2)
|
|
||||||
SqrStoreDigit(5)
|
|
||||||
SqrShiftCarry
|
|
||||||
|
|
||||||
SqrLastDiagonal(4)
|
|
||||||
|
|
||||||
SqrCleanup
|
|
||||||
|
|
||||||
:
|
|
||||||
: "D" (Y), "S" (X)
|
|
||||||
: "eax", "ecx", "edx", "ebp", "memory"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define MulStartup \
|
#define MulStartup \
|
||||||
"push %%ebp\n\t" \
|
AS2(xor ebp, ebp) \
|
||||||
"push %%esi\n\t" \
|
AS2(xor edi, edi) \
|
||||||
"push %%ebx\n\t" \
|
AS2(xor ebx, ebx)
|
||||||
"push %%edi\n\t" \
|
|
||||||
"mov %%eax, %%ebx \n\t" \
|
|
||||||
"xor %%ebp, %%ebp\n\t" \
|
|
||||||
"xor %%edi, %%edi\n\t" \
|
|
||||||
"xor %%ecx, %%ecx\n\t"
|
|
||||||
|
|
||||||
#define MulShiftCarry \
|
#define MulShiftCarry \
|
||||||
"mov %%edx, %%ebp\n\t" \
|
AS2(mov ebp, edx) \
|
||||||
"mov %%ecx, %%edi\n\t" \
|
AS2(mov edi, ebx) \
|
||||||
"xor %%ecx, %%ecx\n\t"
|
AS2(xor ebx, ebx)
|
||||||
|
|
||||||
|
#define MulAccumulateBottom(i,j) \
|
||||||
|
AS2(mov eax, [ecx+4*j]) \
|
||||||
|
AS2(imul eax, dword ptr [esi+4*i]) \
|
||||||
|
AS2(add ebp, eax)
|
||||||
|
|
||||||
#define MulAccumulate(i,j) \
|
#define MulAccumulate(i,j) \
|
||||||
"mov 4*"#j"(%%ebx), %%eax\n\t" \
|
AS2(mov eax, [ecx+4*j]) \
|
||||||
"mull 4*"#i"(%%esi)\n\t" \
|
AS1(mul dword ptr [esi+4*i]) \
|
||||||
"add %%eax, %%ebp\n\t" \
|
AS2(add ebp, eax) \
|
||||||
"adc %%edx, %%edi\n\t" \
|
AS2(adc edi, edx) \
|
||||||
"adc %%ch, %%cl\n\t"
|
AS2(adc bl, bh)
|
||||||
|
|
||||||
#define MulStoreDigit(X) \
|
#define MulStoreDigit(i) \
|
||||||
"mov %%edi, %%edx \n\t" \
|
AS2(mov edx, edi) \
|
||||||
"mov (%%esp), %%edi \n\t" \
|
AS2(mov edi, [esp]) \
|
||||||
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
|
AS2(mov [edi+4*i], ebp)
|
||||||
"mov %%edi, (%%esp)\n\t"
|
|
||||||
|
|
||||||
#define MulLastDiagonal(digits) \
|
#define MulLastDiagonal(digits) \
|
||||||
"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
|
AS2(mov eax, [ecx+4*(digits-1)]) \
|
||||||
"mull 4*("#digits"-1)(%%esi)\n\t" \
|
AS1(mul dword ptr [esi+4*(digits-1)]) \
|
||||||
"add %%eax, %%ebp\n\t" \
|
AS2(add ebp, eax) \
|
||||||
"adc %%edi, %%edx\n\t" \
|
AS2(adc edx, edi) \
|
||||||
"mov (%%esp), %%edi\n\t" \
|
AS2(mov edi, [esp]) \
|
||||||
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
|
AS2(mov [edi+4*(2*digits-2)], ebp) \
|
||||||
"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
|
AS2(mov [edi+4*(2*digits-1)], edx)
|
||||||
|
|
||||||
#define MulCleanup \
|
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
|
||||||
"pop %%edi\n\t" \
|
|
||||||
"pop %%ebx\n\t" \
|
|
||||||
"pop %%esi\n\t" \
|
|
||||||
"pop %%ebp\n\t"
|
|
||||||
|
|
||||||
void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
|
|
||||||
{
|
{
|
||||||
__asm__ __volatile__(
|
MulPrologue
|
||||||
|
// now: [esp] = Z, esi = X, ecx = Y
|
||||||
MulStartup
|
MulStartup
|
||||||
MulAccumulate(0,0)
|
MulAccumulate(0,0)
|
||||||
MulStoreDigit(0)
|
MulStoreDigit(0)
|
||||||
|
|
@ -1310,18 +1338,13 @@ void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
|
||||||
MulShiftCarry
|
MulShiftCarry
|
||||||
|
|
||||||
MulLastDiagonal(4)
|
MulLastDiagonal(4)
|
||||||
|
MulEpilogue
|
||||||
MulCleanup
|
|
||||||
|
|
||||||
:
|
|
||||||
: "D" (Z), "S" (X), "a" (Y)
|
|
||||||
: "%ecx", "%edx", "memory"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
|
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
|
||||||
{
|
{
|
||||||
__asm__ __volatile__(
|
MulPrologue
|
||||||
|
// now: [esp] = Z, esi = X, ecx = Y
|
||||||
MulStartup
|
MulStartup
|
||||||
MulAccumulate(0,0)
|
MulAccumulate(0,0)
|
||||||
MulStoreDigit(0)
|
MulStoreDigit(0)
|
||||||
|
|
@ -1429,16 +1452,77 @@ void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
|
||||||
MulShiftCarry
|
MulShiftCarry
|
||||||
|
|
||||||
MulLastDiagonal(8)
|
MulLastDiagonal(8)
|
||||||
|
MulEpilogue
|
||||||
MulCleanup
|
|
||||||
|
|
||||||
:
|
|
||||||
: "D" (Z), "S" (X), "a" (Y)
|
|
||||||
: "%ecx", "%edx", "memory"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __GNUC__
|
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
|
||||||
|
{
|
||||||
|
MulPrologue
|
||||||
|
// now: [esp] = Z, esi = X, ecx = Y
|
||||||
|
MulStartup
|
||||||
|
MulAccumulate(0,0)
|
||||||
|
MulStoreDigit(0)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(1,0)
|
||||||
|
MulAccumulate(0,1)
|
||||||
|
MulStoreDigit(1)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(2,0)
|
||||||
|
MulAccumulate(1,1)
|
||||||
|
MulAccumulate(0,2)
|
||||||
|
MulStoreDigit(2)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(3,0)
|
||||||
|
MulAccumulate(2,1)
|
||||||
|
MulAccumulate(1,2)
|
||||||
|
MulAccumulate(0,3)
|
||||||
|
MulStoreDigit(3)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(4,0)
|
||||||
|
MulAccumulate(3,1)
|
||||||
|
MulAccumulate(2,2)
|
||||||
|
MulAccumulate(1,3)
|
||||||
|
MulAccumulate(0,4)
|
||||||
|
MulStoreDigit(4)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(5,0)
|
||||||
|
MulAccumulate(4,1)
|
||||||
|
MulAccumulate(3,2)
|
||||||
|
MulAccumulate(2,3)
|
||||||
|
MulAccumulate(1,4)
|
||||||
|
MulAccumulate(0,5)
|
||||||
|
MulStoreDigit(5)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulate(6,0)
|
||||||
|
MulAccumulate(5,1)
|
||||||
|
MulAccumulate(4,2)
|
||||||
|
MulAccumulate(3,3)
|
||||||
|
MulAccumulate(2,4)
|
||||||
|
MulAccumulate(1,5)
|
||||||
|
MulAccumulate(0,6)
|
||||||
|
MulStoreDigit(6)
|
||||||
|
MulShiftCarry
|
||||||
|
|
||||||
|
MulAccumulateBottom(7,0)
|
||||||
|
MulAccumulateBottom(6,1)
|
||||||
|
MulAccumulateBottom(5,2)
|
||||||
|
MulAccumulateBottom(4,3)
|
||||||
|
MulAccumulateBottom(3,4)
|
||||||
|
MulAccumulateBottom(2,5)
|
||||||
|
MulAccumulateBottom(1,6)
|
||||||
|
MulAccumulateBottom(0,7)
|
||||||
|
MulStoreDigit(7)
|
||||||
|
MulEpilogue
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef AS1
|
||||||
|
#undef AS2
|
||||||
|
|
||||||
#else // not x86 - no processor specific code at this layer
|
#else // not x86 - no processor specific code at this layer
|
||||||
|
|
||||||
|
|
@ -1446,43 +1530,8 @@ typedef Portable LowLevel;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
|
|
||||||
|
|
||||||
void DisableSSE2()
|
|
||||||
{
|
|
||||||
g_sse2Enabled = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef SSE2_INTRINSICS_AVAILABLE
|
#ifdef SSE2_INTRINSICS_AVAILABLE
|
||||||
|
|
||||||
static bool GetSSE2Capability()
|
|
||||||
{
|
|
||||||
word32 b;
|
|
||||||
|
|
||||||
#ifdef __GNUC__
|
|
||||||
__asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx");
|
|
||||||
#else
|
|
||||||
__asm
|
|
||||||
{
|
|
||||||
mov eax, 1
|
|
||||||
cpuid
|
|
||||||
mov b, edx
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return (b & (1 << 26)) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool HasSSE2()
|
|
||||||
{
|
|
||||||
if (g_sse2Enabled && !g_sse2DetectionDone)
|
|
||||||
{
|
|
||||||
g_sse2Detected = GetSSE2Capability();
|
|
||||||
g_sse2DetectionDone = true;
|
|
||||||
}
|
|
||||||
return g_sse2Enabled && g_sse2Detected;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
#define __fastcall
|
#define __fastcall
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -1891,34 +1940,23 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
|
||||||
#define R2 (R+N)
|
#define R2 (R+N)
|
||||||
#define R3 (R+N+N2)
|
#define R3 (R+N+N2)
|
||||||
|
|
||||||
//VC60 workaround: compiler bug triggered without the extra dummy parameters
|
|
||||||
|
|
||||||
// R[2*N] - result = A*B
|
// R[2*N] - result = A*B
|
||||||
// T[2*N] - temporary work space
|
// T[2*N] - temporary work space
|
||||||
// A[N] --- multiplier
|
// A[N] --- multiplier
|
||||||
// B[N] --- multiplicand
|
// B[N] --- multiplicand
|
||||||
|
|
||||||
template <class P>
|
void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
|
||||||
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
|
|
||||||
{
|
{
|
||||||
assert(N>=2 && N%2==0);
|
assert(N>=2 && N%2==0);
|
||||||
|
|
||||||
if (P::MultiplyRecursionLimit() >= 8 && N==8)
|
if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
|
||||||
P::Multiply8(R, A, B);
|
LowLevel::Multiply8(R, A, B);
|
||||||
else if (P::MultiplyRecursionLimit() >= 4 && N==4)
|
else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
|
||||||
P::Multiply4(R, A, B);
|
LowLevel::Multiply4(R, A, B);
|
||||||
else if (N==2)
|
else if (N==2)
|
||||||
P::Multiply2(R, A, B);
|
LowLevel::Multiply2(R, A, B);
|
||||||
else
|
else
|
||||||
DoRecursiveMultiply<P>(R, T, A, B, N, NULL); // VC60 workaround: needs this NULL
|
{
|
||||||
}
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
|
|
||||||
{
|
|
||||||
const unsigned int N2 = N/2;
|
const unsigned int N2 = N/2;
|
||||||
int carry;
|
int carry;
|
||||||
|
|
||||||
|
|
@ -1928,29 +1966,29 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
|
||||||
switch (2*aComp + aComp + bComp)
|
switch (2*aComp + aComp + bComp)
|
||||||
{
|
{
|
||||||
case -4:
|
case -4:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
P::Subtract(T1, T1, R0, N2);
|
LowLevel::Subtract(T1, T1, R0, N2);
|
||||||
carry = -1;
|
carry = -1;
|
||||||
break;
|
break;
|
||||||
case -2:
|
case -2:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
carry = 0;
|
carry = 0;
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
P::Subtract(R0, A0, A1, N2);
|
LowLevel::Subtract(R0, A0, A1, N2);
|
||||||
P::Subtract(R1, B1, B0, N2);
|
LowLevel::Subtract(R1, B1, B0, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
carry = 0;
|
carry = 0;
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
P::Subtract(T1, T1, R1, N2);
|
LowLevel::Subtract(T1, T1, R1, N2);
|
||||||
carry = -1;
|
carry = -1;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
@ -1958,52 +1996,45 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
|
||||||
carry = 0;
|
carry = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
RecursiveMultiply<P>(R0, T2, A0, B0, N2);
|
RecursiveMultiply(R0, T2, A0, B0, N2);
|
||||||
RecursiveMultiply<P>(R2, T2, A1, B1, N2);
|
RecursiveMultiply(R2, T2, A1, B1, N2);
|
||||||
|
|
||||||
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
|
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
|
||||||
|
|
||||||
carry += P::Add(T0, T0, R0, N);
|
carry += LowLevel::Add(T0, T0, R0, N);
|
||||||
carry += P::Add(T0, T0, R2, N);
|
carry += LowLevel::Add(T0, T0, R2, N);
|
||||||
carry += P::Add(R1, R1, T0, N);
|
carry += LowLevel::Add(R1, R1, T0, N);
|
||||||
|
|
||||||
assert (carry >= 0 && carry <= 2);
|
assert (carry >= 0 && carry <= 2);
|
||||||
Increment(R3, N2, carry);
|
Increment(R3, N2, carry);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// R[2*N] - result = A*A
|
// R[2*N] - result = A*A
|
||||||
// T[2*N] - temporary work space
|
// T[2*N] - temporary work space
|
||||||
// A[N] --- number to be squared
|
// A[N] --- number to be squared
|
||||||
|
|
||||||
template <class P>
|
void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
|
||||||
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
|
|
||||||
{
|
{
|
||||||
assert(N && N%2==0);
|
assert(N && N%2==0);
|
||||||
if (P::SquareRecursionLimit() >= 8 && N==8)
|
if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
|
||||||
P::Square8(R, A);
|
LowLevel::Square8(R, A);
|
||||||
if (P::SquareRecursionLimit() >= 4 && N==4)
|
if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
|
||||||
P::Square4(R, A);
|
LowLevel::Square4(R, A);
|
||||||
else if (N==2)
|
else if (N==2)
|
||||||
P::Square2(R, A);
|
LowLevel::Square2(R, A);
|
||||||
else
|
else
|
||||||
DoRecursiveSquare<P>(R, T, A, N, NULL); // VC60 workaround: needs this NULL
|
{
|
||||||
}
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
|
|
||||||
{
|
|
||||||
const unsigned int N2 = N/2;
|
const unsigned int N2 = N/2;
|
||||||
|
|
||||||
RecursiveSquare<P>(R0, T2, A0, N2);
|
RecursiveSquare(R0, T2, A0, N2);
|
||||||
RecursiveSquare<P>(R2, T2, A1, N2);
|
RecursiveSquare(R2, T2, A1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, A0, A1, N2);
|
RecursiveMultiply(T0, T2, A0, A1, N2);
|
||||||
|
|
||||||
word carry = P::Add(R1, R1, T0, N);
|
word carry = LowLevel::Add(R1, R1, T0, N);
|
||||||
carry += P::Add(R1, R1, T0, N);
|
carry += LowLevel::Add(R1, R1, T0, N);
|
||||||
Increment(R3, N2, carry);
|
Increment(R3, N2, carry);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// R[N] - bottom half of A*B
|
// R[N] - bottom half of A*B
|
||||||
|
|
@ -2011,33 +2042,25 @@ void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P
|
||||||
// A[N] - multiplier
|
// A[N] - multiplier
|
||||||
// B[N] - multiplicand
|
// B[N] - multiplicand
|
||||||
|
|
||||||
template <class P>
|
void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
|
||||||
void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
|
|
||||||
{
|
{
|
||||||
assert(N>=2 && N%2==0);
|
assert(N>=2 && N%2==0);
|
||||||
if (P::MultiplyBottomRecursionLimit() >= 8 && N==8)
|
if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
|
||||||
P::Multiply8Bottom(R, A, B);
|
LowLevel::Multiply8Bottom(R, A, B);
|
||||||
else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4)
|
else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
|
||||||
P::Multiply4Bottom(R, A, B);
|
LowLevel::Multiply4Bottom(R, A, B);
|
||||||
else if (N==2)
|
else if (N==2)
|
||||||
P::Multiply2Bottom(R, A, B);
|
LowLevel::Multiply2Bottom(R, A, B);
|
||||||
else
|
else
|
||||||
DoRecursiveMultiplyBottom<P>(R, T, A, B, N, NULL);
|
{
|
||||||
}
|
|
||||||
|
|
||||||
template <class P>
|
|
||||||
void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
|
|
||||||
{
|
|
||||||
const unsigned int N2 = N/2;
|
const unsigned int N2 = N/2;
|
||||||
|
|
||||||
RecursiveMultiply<P>(R, T, A0, B0, N2);
|
RecursiveMultiply(R, T, A0, B0, N2);
|
||||||
RecursiveMultiplyBottom<P>(T0, T1, A1, B0, N2);
|
RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
|
||||||
P::Add(R1, R1, T0, N2);
|
LowLevel::Add(R1, R1, T0, N2);
|
||||||
RecursiveMultiplyBottom<P>(T0, T1, A0, B1, N2);
|
RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
|
||||||
P::Add(R1, R1, T0, N2);
|
LowLevel::Add(R1, R1, T0, N2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// R[N] --- upper half of A*B
|
// R[N] --- upper half of A*B
|
||||||
|
|
@ -2046,19 +2069,18 @@ void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, u
|
||||||
// A[N] --- multiplier
|
// A[N] --- multiplier
|
||||||
// B[N] --- multiplicant
|
// B[N] --- multiplicant
|
||||||
|
|
||||||
template <class P>
|
void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
|
||||||
void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
|
|
||||||
{
|
{
|
||||||
assert(N>=2 && N%2==0);
|
assert(N>=2 && N%2==0);
|
||||||
|
|
||||||
if (N==4)
|
if (N==4)
|
||||||
{
|
{
|
||||||
P::Multiply4(T, A, B);
|
LowLevel::Multiply4(T, A, B);
|
||||||
memcpy(R, T+4, 4*WORD_SIZE);
|
memcpy(R, T+4, 4*WORD_SIZE);
|
||||||
}
|
}
|
||||||
else if (N==2)
|
else if (N==2)
|
||||||
{
|
{
|
||||||
P::Multiply2(T, A, B);
|
LowLevel::Multiply2(T, A, B);
|
||||||
memcpy(R, T+2, 2*WORD_SIZE);
|
memcpy(R, T+2, 2*WORD_SIZE);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
@ -2072,29 +2094,29 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
|
||||||
switch (2*aComp + aComp + bComp)
|
switch (2*aComp + aComp + bComp)
|
||||||
{
|
{
|
||||||
case -4:
|
case -4:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
P::Subtract(T1, T1, R0, N2);
|
LowLevel::Subtract(T1, T1, R0, N2);
|
||||||
carry = -1;
|
carry = -1;
|
||||||
break;
|
break;
|
||||||
case -2:
|
case -2:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
carry = 0;
|
carry = 0;
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
P::Subtract(R0, A0, A1, N2);
|
LowLevel::Subtract(R0, A0, A1, N2);
|
||||||
P::Subtract(R1, B1, B0, N2);
|
LowLevel::Subtract(R1, B1, B0, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
carry = 0;
|
carry = 0;
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
P::Subtract(R0, A1, A0, N2);
|
LowLevel::Subtract(R0, A1, A0, N2);
|
||||||
P::Subtract(R1, B0, B1, N2);
|
LowLevel::Subtract(R1, B0, B1, N2);
|
||||||
RecursiveMultiply<P>(T0, T2, R0, R1, N2);
|
RecursiveMultiply(T0, T2, R0, R1, N2);
|
||||||
P::Subtract(T1, T1, R1, N2);
|
LowLevel::Subtract(T1, T1, R1, N2);
|
||||||
carry = -1;
|
carry = -1;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
@ -2102,18 +2124,18 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
|
||||||
carry = 0;
|
carry = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
RecursiveMultiply<P>(T2, R0, A1, B1, N2);
|
RecursiveMultiply(T2, R0, A1, B1, N2);
|
||||||
|
|
||||||
// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
|
// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
|
||||||
|
|
||||||
word c2 = P::Subtract(R0, L+N2, L, N2);
|
word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
|
||||||
c2 += P::Subtract(R0, R0, T0, N2);
|
c2 += LowLevel::Subtract(R0, R0, T0, N2);
|
||||||
word t = (Compare(R0, T2, N2) == -1);
|
word t = (Compare(R0, T2, N2) == -1);
|
||||||
|
|
||||||
carry += t;
|
carry += t;
|
||||||
carry += Increment(R0, N2, c2+t);
|
carry += Increment(R0, N2, c2+t);
|
||||||
carry += P::Add(R0, R0, T1, N2);
|
carry += LowLevel::Add(R0, R0, T1, N2);
|
||||||
carry += P::Add(R0, R0, T3, N2);
|
carry += LowLevel::Add(R0, R0, T3, N2);
|
||||||
assert (carry >= 0 && carry <= 2);
|
assert (carry >= 0 && carry <= 2);
|
||||||
|
|
||||||
CopyWords(R1, T3, N2);
|
CopyWords(R1, T3, N2);
|
||||||
|
|
@ -2133,42 +2155,22 @@ inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
|
||||||
|
|
||||||
inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
|
inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
#ifdef SSE2_INTRINSICS_AVAILABLE
|
RecursiveMultiply(R, T, A, B, N);
|
||||||
if (HasSSE2())
|
|
||||||
RecursiveMultiply<P4Optimized>(R, T, A, B, N);
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
RecursiveMultiply<LowLevel>(R, T, A, B, N);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Square(word *R, word *T, const word *A, unsigned int N)
|
inline void Square(word *R, word *T, const word *A, unsigned int N)
|
||||||
{
|
{
|
||||||
#ifdef SSE2_INTRINSICS_AVAILABLE
|
RecursiveSquare(R, T, A, N);
|
||||||
if (HasSSE2())
|
|
||||||
RecursiveSquare<P4Optimized>(R, T, A, N);
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
RecursiveSquare<LowLevel>(R, T, A, N);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
|
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
#ifdef SSE2_INTRINSICS_AVAILABLE
|
RecursiveMultiplyBottom(R, T, A, B, N);
|
||||||
if (HasSSE2())
|
|
||||||
RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
|
inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
|
||||||
{
|
{
|
||||||
#ifdef SSE2_INTRINSICS_AVAILABLE
|
RecursiveMultiplyTop(R, T, L, A, B, N);
|
||||||
if (HasSSE2())
|
|
||||||
RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
|
static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue