unify GCC and MSVC multiplication code

pull/2/head
weidai 2003-08-01 03:20:16 +00:00
parent 005b94f755
commit 3d354a8bf2
1 changed files with 510 additions and 508 deletions

View File

@ -27,7 +27,7 @@
#elif defined(_MSC_VER) && defined(_M_IX86) #elif defined(_MSC_VER) && defined(_M_IX86)
#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.") #pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
#elif defined(__GNUC__) && defined(__i386__) #elif defined(__GNUC__) && defined(__i386__)
#pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.") #warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
#endif #endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
@ -859,22 +859,69 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
// CodeWarrior defines _MSC_VER // CodeWarrior defines _MSC_VER
#if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__)) #if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__))
// ************** x86 feature detection ***************
static bool s_sse2Enabled = true;
static void CpuId(word32 input, word32 *output)
{
#ifdef __GNUC__
__asm__
(
"cpuid"
: "=a" (output[0]), "=b" (output[1]), "=c" (output[2]), "=d" (output[3])
: "a" (input)
);
#else
__asm
{
mov eax, input
cpuid
mov edi, output
mov [edi], eax
mov [edi+4], ebx
mov [edi+8], ecx
mov [edi+12], edx
}
#endif
}
static bool HasSSE2()
{
if (!s_sse2Enabled)
return false;
word32 cpuid[4];
CpuId(1, cpuid);
return (cpuid[3] & (1 << 26)) != 0;
}
static bool IsP4()
{
word32 cpuid[4];
CpuId(0, cpuid);
std::swap(cpuid[2], cpuid[3]);
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
return false;
CpuId(1, cpuid);
return ((cpuid[0] >> 8) & 0xf) == 0xf;
}
// ************** Pentium/P4 optimizations ***************
class PentiumOptimized : public Portable class PentiumOptimized : public Portable
{ {
public: public:
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N); static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
#ifdef __GNUC__ static void CRYPTOPP_CDECL Multiply4(word *C, const word *A, const word *B);
static void Square4(word *R, const word *A); static void CRYPTOPP_CDECL Multiply8(word *C, const word *A, const word *B);
static void Multiply4(word *C, const word *A, const word *B); static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
static void Multiply8(word *C, const word *A, const word *B);
#endif
}; };
typedef PentiumOptimized LowLevel; class P4Optimized
// this may be selected at run time
class P4Optimized : public PentiumOptimized
{ {
public: public:
static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N); static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
@ -883,7 +930,70 @@ public:
static void Multiply4(word *C, const word *A, const word *B); static void Multiply4(word *C, const word *A, const word *B);
static void Multiply8(word *C, const word *A, const word *B); static void Multiply8(word *C, const word *A, const word *B);
static void Multiply8Bottom(word *C, const word *A, const word *B); static void Multiply8Bottom(word *C, const word *A, const word *B);
static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);} #endif
};
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
typedef void (* PMul)(word *C, const word *A, const word *B);
static PAddSub s_pAdd, s_pSub;
#ifdef SSE2_INTRINSICS_AVAILABLE
static PMul s_pMul4, s_pMul8, s_pMul8B;
#endif
static void SetPentiumFunctionPointers()
{
if (IsP4())
{
s_pAdd = &P4Optimized::Add;
s_pSub = &P4Optimized::Subtract;
}
else
{
s_pAdd = &PentiumOptimized::Add;
s_pSub = &PentiumOptimized::Subtract;
}
#ifdef SSE2_INTRINSICS_AVAILABLE
if (HasSSE2())
{
s_pMul4 = &P4Optimized::Multiply4;
s_pMul8 = &P4Optimized::Multiply8;
s_pMul8B = &P4Optimized::Multiply8Bottom;
}
else
{
s_pMul4 = &PentiumOptimized::Multiply4;
s_pMul8 = &PentiumOptimized::Multiply8;
s_pMul8B = &PentiumOptimized::Multiply8Bottom;
}
#endif
}
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
void DisableSSE2()
{
s_sse2Enabled = false;
SetPentiumFunctionPointers();
}
class LowLevel : public PentiumOptimized
{
public:
inline static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N)
{return s_pAdd(C, A, B, N);}
inline static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N)
{return s_pSub(C, A, B, N);}
inline static void Square4(word *R, const word *A)
{Multiply4(R, A, A);}
#ifdef SSE2_INTRINSICS_AVAILABLE
inline static void Multiply4(word *C, const word *A, const word *B)
{s_pMul4(C, A, B);}
inline static void Multiply8(word *C, const word *A, const word *B)
{s_pMul8(C, A, B);}
inline static void Multiply8Bottom(word *C, const word *A, const word *B)
{s_pMul8B(C, A, B);}
#endif #endif
}; };
@ -892,7 +1002,7 @@ public:
#define CRYPTOPP_NAKED __declspec(naked) #define CRYPTOPP_NAKED __declspec(naked)
#define AS1(x) __asm x #define AS1(x) __asm x
#define AS2(x, y) __asm x, y #define AS2(x, y) __asm x, y
#define PentiumPrologue \ #define AddPrologue \
__asm push ebp \ __asm push ebp \
__asm push ebx \ __asm push ebx \
__asm push esi \ __asm push esi \
@ -901,55 +1011,67 @@ public:
__asm mov edx, [esp+24] \ __asm mov edx, [esp+24] \
__asm mov ebx, [esp+28] \ __asm mov ebx, [esp+28] \
__asm mov esi, [esp+32] __asm mov esi, [esp+32]
#define PentiumEpilogue \ #define AddEpilogue \
__asm pop edi \ __asm pop edi \
__asm pop esi \ __asm pop esi \
__asm pop ebx \ __asm pop ebx \
__asm pop ebp \ __asm pop ebp \
__asm ret __asm ret
#define P4Prologue \ #define MulPrologue \
__asm sub esp, 16 \ __asm push ebp \
__asm mov [esp], edi \ __asm push ebx \
__asm mov [esp+4], esi \ __asm push esi \
__asm mov [esp+8], ebx \ __asm push edi \
__asm mov [esp+12], ebp \ __asm mov ecx, [esp+28] \
__asm mov ecx, [esp+20] \ __asm mov esi, [esp+24] \
__asm mov edx, [esp+24] \ __asm push [esp+20]
__asm mov ebx, [esp+28] \ #define MulEpilogue \
__asm mov esi, [esp+32] __asm add esp, 4 \
#define P4Epilogue \ __asm pop edi \
__asm mov edi, [esp] \ __asm pop esi \
__asm mov esi, [esp+4] \ __asm pop ebx \
__asm mov ebx, [esp+8] \ __asm pop ebp \
__asm mov ebp, [esp+12] \
__asm add esp, 16 \
__asm ret __asm ret
#else #else
#define CRYPTOPP_NAKED #define CRYPTOPP_NAKED
#define AS1(x) #x ";" #define AS1(x) #x ";"
#define AS2(x, y) #x ", " #y ";" #define AS2(x, y) #x ", " #y ";"
#define PentiumPrologue \ #define AddPrologue \
__asm__ \ __asm__ __volatile__ \
( \ ( \
".att_syntax prefix;" \ "push %%ebx;" /* save this manually, in case of -fPIC */ \
"mov %0, %%ecx;" \
"mov %1, %%edx;" \
"mov %2, %%ebx;" \ "mov %2, %%ebx;" \
"mov %3, %%esi;" \ ".intel_syntax noprefix;" \
"push ebp;"
#define AddEpilogue \
"pop ebp;" \
".att_syntax prefix;" \
"pop %%ebx;" \
: \
: "c" (C), "d" (A), "m" (B), "S" (N) \
: "%edi", "memory", "cc" \
);
#define MulPrologue \
__asm__ __volatile__ \
( \
"push %%ebx;" /* save this manually, in case of -fPIC */ \
"push %%ebp;" \
"push %0;" \
".intel_syntax noprefix;" ".intel_syntax noprefix;"
#define PentiumEpilogue \ #define MulEpilogue \
"add esp, 4;" \
"pop ebp;" \
"pop ebx;" \
".att_syntax prefix;" \ ".att_syntax prefix;" \
: \ : \
: "m" (C), "m" (A), "m" (B), "m" (N) \ : "rm" (Z), "S" (X), "c" (Y) \
: "%ecx", "%edx", "%ebx", "%esi", "%edi" \ : "%eax", "%edx", "%edi", "memory", "cc" \
); );
#define P4Prologue PentiumPrologue
#define P4Epilogue PentiumEpilogue
#endif #endif
CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N) CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{ {
PentiumPrologue AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N // now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
@ -982,12 +1104,12 @@ CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B,
AS1(loopendAdd:) AS1(loopendAdd:)
AS2( adc eax, 0) // store carry into eax (return result register) AS2( adc eax, 0) // store carry into eax (return result register)
PentiumEpilogue AddEpilogue
} }
CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N) CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{ {
PentiumPrologue AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N // now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C AS2( sub ecx, edx) // hold the distance between C & A so we can add this to A to get C
@ -1020,12 +1142,14 @@ CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const wor
AS1(loopendSub:) AS1(loopendSub:)
AS2( adc eax, 0) // store carry into eax (return result register) AS2( adc eax, 0) // store carry into eax (return result register)
PentiumEpilogue AddEpilogue
} }
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N) CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
{ {
P4Prologue AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N // now: ebx = B, ecx = C, edx = A, esi = N
AS2( xor eax, eax) AS2( xor eax, eax)
@ -1077,12 +1201,12 @@ CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsi
AS1(loopendAddP4:) AS1(loopendAddP4:)
P4Epilogue AddEpilogue
} }
CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N) CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{ {
P4Prologue AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N // now: ebx = B, ecx = C, edx = A, esi = N
AS2( xor eax, eax) AS2( xor eax, eax)
@ -1134,147 +1258,51 @@ CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B,
AS1(loopendSubP4:) AS1(loopendSubP4:)
P4Epilogue AddEpilogue
} }
#if __GNUC__ // multiply assembly code originally contributed by Leonard Janke
// Comba square and multiply assembly code originally contributed by Leonard Janke
// These are not needed with MSVC, which does a good job of optimizing the C++ multiply code.
#define SqrStartup \
"push %%ebp\n\t" \
"push %%esi\n\t" \
"push %%ebx\n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%ebx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
#define SqrShiftCarry \
"mov %%ebx, %%ebp\n\t" \
"mov %%ecx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
#define SqrAccumulate(i,j) \
"mov 4*"#j"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
#define SqrAccumulateCentre(i) \
"mov 4*"#i"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
#define SqrStoreDigit(X) \
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
#define SqrLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
"mull 4*("#digits"-1)(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
#define SqrCleanup \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
void PentiumOptimized::Square4(word* Y, const word* X)
{
__asm__ __volatile__(
SqrStartup
SqrAccumulateCentre(0)
SqrStoreDigit(0)
SqrShiftCarry
SqrAccumulate(1,0)
SqrStoreDigit(1)
SqrShiftCarry
SqrAccumulate(2,0)
SqrAccumulateCentre(1)
SqrStoreDigit(2)
SqrShiftCarry
SqrAccumulate(3,0)
SqrAccumulate(2,1)
SqrStoreDigit(3)
SqrShiftCarry
SqrAccumulate(3,1)
SqrAccumulateCentre(2)
SqrStoreDigit(4)
SqrShiftCarry
SqrAccumulate(3,2)
SqrStoreDigit(5)
SqrShiftCarry
SqrLastDiagonal(4)
SqrCleanup
:
: "D" (Y), "S" (X)
: "eax", "ecx", "edx", "ebp", "memory"
);
}
#define MulStartup \ #define MulStartup \
"push %%ebp\n\t" \ AS2(xor ebp, ebp) \
"push %%esi\n\t" \ AS2(xor edi, edi) \
"push %%ebx\n\t" \ AS2(xor ebx, ebx)
"push %%edi\n\t" \
"mov %%eax, %%ebx \n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%edi, %%edi\n\t" \
"xor %%ecx, %%ecx\n\t"
#define MulShiftCarry \ #define MulShiftCarry \
"mov %%edx, %%ebp\n\t" \ AS2(mov ebp, edx) \
"mov %%ecx, %%edi\n\t" \ AS2(mov edi, ebx) \
"xor %%ecx, %%ecx\n\t" AS2(xor ebx, ebx)
#define MulAccumulateBottom(i,j) \
AS2(mov eax, [ecx+4*j]) \
AS2(imul eax, dword ptr [esi+4*i]) \
AS2(add ebp, eax)
#define MulAccumulate(i,j) \ #define MulAccumulate(i,j) \
"mov 4*"#j"(%%ebx), %%eax\n\t" \ AS2(mov eax, [ecx+4*j]) \
"mull 4*"#i"(%%esi)\n\t" \ AS1(mul dword ptr [esi+4*i]) \
"add %%eax, %%ebp\n\t" \ AS2(add ebp, eax) \
"adc %%edx, %%edi\n\t" \ AS2(adc edi, edx) \
"adc %%ch, %%cl\n\t" AS2(adc bl, bh)
#define MulStoreDigit(X) \ #define MulStoreDigit(i) \
"mov %%edi, %%edx \n\t" \ AS2(mov edx, edi) \
"mov (%%esp), %%edi \n\t" \ AS2(mov edi, [esp]) \
"mov %%ebp, 4*"#X"(%%edi)\n\t" \ AS2(mov [edi+4*i], ebp)
"mov %%edi, (%%esp)\n\t"
#define MulLastDiagonal(digits) \ #define MulLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \ AS2(mov eax, [ecx+4*(digits-1)]) \
"mull 4*("#digits"-1)(%%esi)\n\t" \ AS1(mul dword ptr [esi+4*(digits-1)]) \
"add %%eax, %%ebp\n\t" \ AS2(add ebp, eax) \
"adc %%edi, %%edx\n\t" \ AS2(adc edx, edi) \
"mov (%%esp), %%edi\n\t" \ AS2(mov edi, [esp]) \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \ AS2(mov [edi+4*(2*digits-2)], ebp) \
"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" AS2(mov [edi+4*(2*digits-1)], edx)
#define MulCleanup \ CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
"pop %%edi\n\t" \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
{ {
__asm__ __volatile__( MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup MulStartup
MulAccumulate(0,0) MulAccumulate(0,0)
MulStoreDigit(0) MulStoreDigit(0)
@ -1310,18 +1338,13 @@ void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
MulShiftCarry MulShiftCarry
MulLastDiagonal(4) MulLastDiagonal(4)
MulEpilogue
MulCleanup
:
: "D" (Z), "S" (X), "a" (Y)
: "%ecx", "%edx", "memory"
);
} }
void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y) CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
{ {
__asm__ __volatile__( MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup MulStartup
MulAccumulate(0,0) MulAccumulate(0,0)
MulStoreDigit(0) MulStoreDigit(0)
@ -1429,16 +1452,77 @@ void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
MulShiftCarry MulShiftCarry
MulLastDiagonal(8) MulLastDiagonal(8)
MulEpilogue
MulCleanup
:
: "D" (Z), "S" (X), "a" (Y)
: "%ecx", "%edx", "memory"
);
} }
#endif // __GNUC__ CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
{
MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup
MulAccumulate(0,0)
MulStoreDigit(0)
MulShiftCarry
MulAccumulate(1,0)
MulAccumulate(0,1)
MulStoreDigit(1)
MulShiftCarry
MulAccumulate(2,0)
MulAccumulate(1,1)
MulAccumulate(0,2)
MulStoreDigit(2)
MulShiftCarry
MulAccumulate(3,0)
MulAccumulate(2,1)
MulAccumulate(1,2)
MulAccumulate(0,3)
MulStoreDigit(3)
MulShiftCarry
MulAccumulate(4,0)
MulAccumulate(3,1)
MulAccumulate(2,2)
MulAccumulate(1,3)
MulAccumulate(0,4)
MulStoreDigit(4)
MulShiftCarry
MulAccumulate(5,0)
MulAccumulate(4,1)
MulAccumulate(3,2)
MulAccumulate(2,3)
MulAccumulate(1,4)
MulAccumulate(0,5)
MulStoreDigit(5)
MulShiftCarry
MulAccumulate(6,0)
MulAccumulate(5,1)
MulAccumulate(4,2)
MulAccumulate(3,3)
MulAccumulate(2,4)
MulAccumulate(1,5)
MulAccumulate(0,6)
MulStoreDigit(6)
MulShiftCarry
MulAccumulateBottom(7,0)
MulAccumulateBottom(6,1)
MulAccumulateBottom(5,2)
MulAccumulateBottom(4,3)
MulAccumulateBottom(3,4)
MulAccumulateBottom(2,5)
MulAccumulateBottom(1,6)
MulAccumulateBottom(0,7)
MulStoreDigit(7)
MulEpilogue
}
#undef AS1
#undef AS2
#else // not x86 - no processor specific code at this layer #else // not x86 - no processor specific code at this layer
@ -1446,43 +1530,8 @@ typedef Portable LowLevel;
#endif #endif
bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
void DisableSSE2()
{
g_sse2Enabled = false;
}
#ifdef SSE2_INTRINSICS_AVAILABLE #ifdef SSE2_INTRINSICS_AVAILABLE
static bool GetSSE2Capability()
{
word32 b;
#ifdef __GNUC__
__asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx");
#else
__asm
{
mov eax, 1
cpuid
mov b, edx
}
#endif
return (b & (1 << 26)) != 0;
}
static inline bool HasSSE2()
{
if (g_sse2Enabled && !g_sse2DetectionDone)
{
g_sse2Detected = GetSSE2Capability();
g_sse2DetectionDone = true;
}
return g_sse2Enabled && g_sse2Detected;
}
#ifdef __GNUC__ #ifdef __GNUC__
#define __fastcall #define __fastcall
#endif #endif
@ -1891,34 +1940,23 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
#define R2 (R+N) #define R2 (R+N)
#define R3 (R+N+N2) #define R3 (R+N+N2)
//VC60 workaround: compiler bug triggered without the extra dummy parameters
// R[2*N] - result = A*B // R[2*N] - result = A*B
// T[2*N] - temporary work space // T[2*N] - temporary work space
// A[N] --- multiplier // A[N] --- multiplier
// B[N] --- multiplicant // B[N] --- multiplicant
template <class P> void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
template <class P>
inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
{ {
assert(N>=2 && N%2==0); assert(N>=2 && N%2==0);
if (P::MultiplyRecursionLimit() >= 8 && N==8) if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
P::Multiply8(R, A, B); LowLevel::Multiply8(R, A, B);
else if (P::MultiplyRecursionLimit() >= 4 && N==4) else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
P::Multiply4(R, A, B); LowLevel::Multiply4(R, A, B);
else if (N==2) else if (N==2)
P::Multiply2(R, A, B); LowLevel::Multiply2(R, A, B);
else else
DoRecursiveMultiply<P>(R, T, A, B, N, NULL); // VC60 workaround: needs this NULL {
}
template <class P>
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
{
const unsigned int N2 = N/2; const unsigned int N2 = N/2;
int carry; int carry;
@ -1928,29 +1966,29 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
switch (2*aComp + aComp + bComp) switch (2*aComp + aComp + bComp)
{ {
case -4: case -4:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
P::Subtract(T1, T1, R0, N2); LowLevel::Subtract(T1, T1, R0, N2);
carry = -1; carry = -1;
break; break;
case -2: case -2:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
carry = 0; carry = 0;
break; break;
case 2: case 2:
P::Subtract(R0, A0, A1, N2); LowLevel::Subtract(R0, A0, A1, N2);
P::Subtract(R1, B1, B0, N2); LowLevel::Subtract(R1, B1, B0, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
carry = 0; carry = 0;
break; break;
case 4: case 4:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
P::Subtract(T1, T1, R1, N2); LowLevel::Subtract(T1, T1, R1, N2);
carry = -1; carry = -1;
break; break;
default: default:
@ -1958,52 +1996,45 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
carry = 0; carry = 0;
} }
RecursiveMultiply<P>(R0, T2, A0, B0, N2); RecursiveMultiply(R0, T2, A0, B0, N2);
RecursiveMultiply<P>(R2, T2, A1, B1, N2); RecursiveMultiply(R2, T2, A1, B1, N2);
// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
carry += P::Add(T0, T0, R0, N); carry += LowLevel::Add(T0, T0, R0, N);
carry += P::Add(T0, T0, R2, N); carry += LowLevel::Add(T0, T0, R2, N);
carry += P::Add(R1, R1, T0, N); carry += LowLevel::Add(R1, R1, T0, N);
assert (carry >= 0 && carry <= 2); assert (carry >= 0 && carry <= 2);
Increment(R3, N2, carry); Increment(R3, N2, carry);
}
} }
// R[2*N] - result = A*A // R[2*N] - result = A*A
// T[2*N] - temporary work space // T[2*N] - temporary work space
// A[N] --- number to be squared // A[N] --- number to be squared
template <class P> void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
template <class P>
inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
{ {
assert(N && N%2==0); assert(N && N%2==0);
if (P::SquareRecursionLimit() >= 8 && N==8) if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
P::Square8(R, A); LowLevel::Square8(R, A);
if (P::SquareRecursionLimit() >= 4 && N==4) if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
P::Square4(R, A); LowLevel::Square4(R, A);
else if (N==2) else if (N==2)
P::Square2(R, A); LowLevel::Square2(R, A);
else else
DoRecursiveSquare<P>(R, T, A, N, NULL); // VC60 workaround: needs this NULL {
}
template <class P>
void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
{
const unsigned int N2 = N/2; const unsigned int N2 = N/2;
RecursiveSquare<P>(R0, T2, A0, N2); RecursiveSquare(R0, T2, A0, N2);
RecursiveSquare<P>(R2, T2, A1, N2); RecursiveSquare(R2, T2, A1, N2);
RecursiveMultiply<P>(T0, T2, A0, A1, N2); RecursiveMultiply(T0, T2, A0, A1, N2);
word carry = P::Add(R1, R1, T0, N); word carry = LowLevel::Add(R1, R1, T0, N);
carry += P::Add(R1, R1, T0, N); carry += LowLevel::Add(R1, R1, T0, N);
Increment(R3, N2, carry); Increment(R3, N2, carry);
}
} }
// R[N] - bottom half of A*B // R[N] - bottom half of A*B
@ -2011,33 +2042,25 @@ void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P
// A[N] - multiplier // A[N] - multiplier
// B[N] - multiplicant // B[N] - multiplicant
template <class P> void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
template <class P>
inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
{ {
assert(N>=2 && N%2==0); assert(N>=2 && N%2==0);
if (P::MultiplyBottomRecursionLimit() >= 8 && N==8) if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
P::Multiply8Bottom(R, A, B); LowLevel::Multiply8Bottom(R, A, B);
else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4) else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
P::Multiply4Bottom(R, A, B); LowLevel::Multiply4Bottom(R, A, B);
else if (N==2) else if (N==2)
P::Multiply2Bottom(R, A, B); LowLevel::Multiply2Bottom(R, A, B);
else else
DoRecursiveMultiplyBottom<P>(R, T, A, B, N, NULL); {
}
template <class P>
void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
{
const unsigned int N2 = N/2; const unsigned int N2 = N/2;
RecursiveMultiply<P>(R, T, A0, B0, N2); RecursiveMultiply(R, T, A0, B0, N2);
RecursiveMultiplyBottom<P>(T0, T1, A1, B0, N2); RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
P::Add(R1, R1, T0, N2); LowLevel::Add(R1, R1, T0, N2);
RecursiveMultiplyBottom<P>(T0, T1, A0, B1, N2); RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
P::Add(R1, R1, T0, N2); LowLevel::Add(R1, R1, T0, N2);
}
} }
// R[N] --- upper half of A*B // R[N] --- upper half of A*B
@ -2046,19 +2069,18 @@ void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, u
// A[N] --- multiplier // A[N] --- multiplier
// B[N] --- multiplicant // B[N] --- multiplicant
template <class P> void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
{ {
assert(N>=2 && N%2==0); assert(N>=2 && N%2==0);
if (N==4) if (N==4)
{ {
P::Multiply4(T, A, B); LowLevel::Multiply4(T, A, B);
memcpy(R, T+4, 4*WORD_SIZE); memcpy(R, T+4, 4*WORD_SIZE);
} }
else if (N==2) else if (N==2)
{ {
P::Multiply2(T, A, B); LowLevel::Multiply2(T, A, B);
memcpy(R, T+2, 2*WORD_SIZE); memcpy(R, T+2, 2*WORD_SIZE);
} }
else else
@ -2072,29 +2094,29 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
switch (2*aComp + aComp + bComp) switch (2*aComp + aComp + bComp)
{ {
case -4: case -4:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
P::Subtract(T1, T1, R0, N2); LowLevel::Subtract(T1, T1, R0, N2);
carry = -1; carry = -1;
break; break;
case -2: case -2:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
carry = 0; carry = 0;
break; break;
case 2: case 2:
P::Subtract(R0, A0, A1, N2); LowLevel::Subtract(R0, A0, A1, N2);
P::Subtract(R1, B1, B0, N2); LowLevel::Subtract(R1, B1, B0, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
carry = 0; carry = 0;
break; break;
case 4: case 4:
P::Subtract(R0, A1, A0, N2); LowLevel::Subtract(R0, A1, A0, N2);
P::Subtract(R1, B0, B1, N2); LowLevel::Subtract(R1, B0, B1, N2);
RecursiveMultiply<P>(T0, T2, R0, R1, N2); RecursiveMultiply(T0, T2, R0, R1, N2);
P::Subtract(T1, T1, R1, N2); LowLevel::Subtract(T1, T1, R1, N2);
carry = -1; carry = -1;
break; break;
default: default:
@ -2102,18 +2124,18 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
carry = 0; carry = 0;
} }
RecursiveMultiply<P>(T2, R0, A1, B1, N2); RecursiveMultiply(T2, R0, A1, B1, N2);
// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1 // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
word c2 = P::Subtract(R0, L+N2, L, N2); word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
c2 += P::Subtract(R0, R0, T0, N2); c2 += LowLevel::Subtract(R0, R0, T0, N2);
word t = (Compare(R0, T2, N2) == -1); word t = (Compare(R0, T2, N2) == -1);
carry += t; carry += t;
carry += Increment(R0, N2, c2+t); carry += Increment(R0, N2, c2+t);
carry += P::Add(R0, R0, T1, N2); carry += LowLevel::Add(R0, R0, T1, N2);
carry += P::Add(R0, R0, T3, N2); carry += LowLevel::Add(R0, R0, T3, N2);
assert (carry >= 0 && carry <= 2); assert (carry >= 0 && carry <= 2);
CopyWords(R1, T3, N2); CopyWords(R1, T3, N2);
@ -2133,42 +2155,22 @@ inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N) inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
{ {
#ifdef SSE2_INTRINSICS_AVAILABLE RecursiveMultiply(R, T, A, B, N);
if (HasSSE2())
RecursiveMultiply<P4Optimized>(R, T, A, B, N);
else
#endif
RecursiveMultiply<LowLevel>(R, T, A, B, N);
} }
inline void Square(word *R, word *T, const word *A, unsigned int N) inline void Square(word *R, word *T, const word *A, unsigned int N)
{ {
#ifdef SSE2_INTRINSICS_AVAILABLE RecursiveSquare(R, T, A, N);
if (HasSSE2())
RecursiveSquare<P4Optimized>(R, T, A, N);
else
#endif
RecursiveSquare<LowLevel>(R, T, A, N);
} }
inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N) inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
{ {
#ifdef SSE2_INTRINSICS_AVAILABLE RecursiveMultiplyBottom(R, T, A, B, N);
if (HasSSE2())
RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
else
#endif
RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
} }
inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N) inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
{ {
#ifdef SSE2_INTRINSICS_AVAILABLE RecursiveMultiplyTop(R, T, L, A, B, N);
if (HasSSE2())
RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
else
#endif
RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
} }
static word LinearMultiply(word *C, const word *A, word B, unsigned int N) static word LinearMultiply(word *C, const word *A, word B, unsigned int N)