unify GCC and MSVC multiplication code

2003-08-01 03:20:16 +00:00 · 2003-08-01 03:20:16 +00:00 · 3d354a8bf2
parent 005b94f755
commit 3d354a8bf2
1 changed files with 510 additions and 508 deletions
--- a/integer.cpp
+++ b/integer.cpp
@ -27,7 +27,7 @@
 #elif defined(_MSC_VER) && defined(_M_IX86)
 	#pragma message("You do no seem to have the Visual C++ Processor Pack installed, so use of SSE2 intrinsics will be disabled.")
 #elif defined(__GNUC__) && defined(__i386__)
-	#pragma message("You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled.")
+	#warning "You do not have GCC 3.3 or later, or did not specify -msse2 compiler option, so use of SSE2 intrinsics will be disabled."
 #endif
 NAMESPACE_BEGIN(CryptoPP)
@ -859,22 +859,69 @@ void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
 // CodeWarrior defines _MSC_VER
 #if (defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86)) || (defined(__GNUC__) && defined(__i386__))
 // ************** x86 feature detection ***************
 static bool s_sse2Enabled = true;
 static void CpuId(word32 input, word32 *output)
 {
 #ifdef __GNUC__
 	__asm__
 	(
 		"cpuid"
 		: "=a" (output[0]), "=b" (output[1]), "=c" (output[2]), "=d" (output[3])
 		: "a" (input)
 	);
 #else
 	__asm
 	{
 		mov eax, input
 		cpuid
 		mov edi, output
 		mov [edi], eax
 		mov [edi+4], ebx
 		mov [edi+8], ecx
 		mov [edi+12], edx
 	}
 #endif
 }
 static bool HasSSE2()
 {
 	if (!s_sse2Enabled)
 		return false;
 	word32 cpuid[4];
 	CpuId(1, cpuid);
 	return (cpuid[3] & (1 << 26)) != 0;
 }
 static bool IsP4()
 {
 	word32 cpuid[4];
 	CpuId(0, cpuid);
 	std::swap(cpuid[2], cpuid[3]);
 	if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
 		return false;
 	CpuId(1, cpuid);
 	return ((cpuid[0] >> 8) & 0xf) == 0xf;
 }
 // ************** Pentium/P4 optimizations ***************
 class PentiumOptimized : public Portable
 {
 public:
 	static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
 	static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
-#ifdef __GNUC__
+	static void CRYPTOPP_CDECL Multiply4(word *C, const word *A, const word *B);
-	static void Square4(word *R, const word *A);
+	static void CRYPTOPP_CDECL Multiply8(word *C, const word *A, const word *B);
-	static void Multiply4(word *C, const word *A, const word *B);
+	static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
 	static void Multiply8(word *C, const word *A, const word *B);
 #endif
 };
-typedef PentiumOptimized LowLevel;
+class P4Optimized
 // this may be selected at run time
 class P4Optimized : public PentiumOptimized
 {
 public:
 	static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
@ -883,7 +930,70 @@ public:
 	static void Multiply4(word *C, const word *A, const word *B);
 	static void Multiply8(word *C, const word *A, const word *B);
 	static void Multiply8Bottom(word *C, const word *A, const word *B);
-	static inline void Square4(word *R, const word *A) {Multiply4(R, A, A);}
+#endif
 };
 typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
 typedef void (* PMul)(word *C, const word *A, const word *B);
 static PAddSub s_pAdd, s_pSub;
 #ifdef SSE2_INTRINSICS_AVAILABLE
 static PMul s_pMul4, s_pMul8, s_pMul8B;
 #endif
 static void SetPentiumFunctionPointers()
 {
 	if (IsP4())
 	{
 		s_pAdd = &P4Optimized::Add;
 		s_pSub = &P4Optimized::Subtract;
 	}
 	else
 	{
 		s_pAdd = &PentiumOptimized::Add;
 		s_pSub = &PentiumOptimized::Subtract;
 	}
 #ifdef SSE2_INTRINSICS_AVAILABLE
 	if (HasSSE2())
 	{
 		s_pMul4 = &P4Optimized::Multiply4;
 		s_pMul8 = &P4Optimized::Multiply8;
 		s_pMul8B = &P4Optimized::Multiply8Bottom;
 	}
 	else
 	{
 		s_pMul4 = &PentiumOptimized::Multiply4;
 		s_pMul8 = &PentiumOptimized::Multiply8;
 		s_pMul8B = &PentiumOptimized::Multiply8Bottom;
 	}
 #endif
 }
 static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
 void DisableSSE2()
 {
 	s_sse2Enabled = false;
 	SetPentiumFunctionPointers();
 }
 class LowLevel : public PentiumOptimized
 {
 public:
 	inline static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N)
 		{return s_pAdd(C, A, B, N);}
 	inline static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N)
 		{return s_pSub(C, A, B, N);}
 	inline static void Square4(word *R, const word *A)
 		{Multiply4(R, A, A);}
 #ifdef SSE2_INTRINSICS_AVAILABLE
 	inline static void Multiply4(word *C, const word *A, const word *B)
 		{s_pMul4(C, A, B);}
 	inline static void Multiply8(word *C, const word *A, const word *B)
 		{s_pMul8(C, A, B);}
 	inline static void Multiply8Bottom(word *C, const word *A, const word *B)
 		{s_pMul8B(C, A, B);}
 #endif
 };
@ -892,7 +1002,7 @@ public:
 	#define CRYPTOPP_NAKED __declspec(naked)
 	#define AS1(x) __asm x
 	#define AS2(x, y) __asm x, y
-	#define PentiumPrologue \
+	#define AddPrologue \
 		__asm	push ebp \
 		__asm	push ebx \
 		__asm	push esi \
@ -901,55 +1011,67 @@ public:
 		__asm	mov		edx, [esp+24] \
 		__asm	mov		ebx, [esp+28] \
 		__asm	mov		esi, [esp+32]
-	#define PentiumEpilogue \
+	#define AddEpilogue \
 		__asm	pop edi \
 		__asm	pop esi \
 		__asm	pop ebx \
 		__asm	pop ebp \
 		__asm	ret
-	#define P4Prologue \
+	#define MulPrologue \
-		__asm	sub		esp, 16 \
+		__asm	push ebp \
-		__asm	mov		[esp], edi \
+		__asm	push ebx \
-		__asm	mov		[esp+4], esi \
+		__asm	push esi \
-		__asm	mov		[esp+8], ebx \
+		__asm	push edi \
-		__asm	mov		[esp+12], ebp \
+		__asm	mov ecx, [esp+28] \
-		__asm	mov		ecx, [esp+20] \
+		__asm	mov esi, [esp+24] \
-		__asm	mov		edx, [esp+24] \
+		__asm	push [esp+20]
-		__asm	mov		ebx, [esp+28] \
+	#define MulEpilogue \
-		__asm	mov		esi, [esp+32]
+		__asm	add esp, 4 \
-	#define P4Epilogue \
+		__asm	pop edi \
-		__asm	mov		edi, [esp] \
+		__asm	pop esi \
-		__asm	mov		esi, [esp+4] \
+		__asm	pop ebx \
-		__asm	mov		ebx, [esp+8] \
+		__asm	pop ebp \
 		__asm	mov		ebp, [esp+12] \
 		__asm	add		esp, 16 \
 		__asm	ret
 #else
 	#define CRYPTOPP_NAKED
 	#define AS1(x) #x ";"
 	#define AS2(x, y) #x ", " #y ";"
-	#define PentiumPrologue \
+	#define AddPrologue \
-		__asm__ \
+		__asm__ __volatile__ \
 		( \
-			".att_syntax prefix;" \
+			"push %%ebx;"	/* save this manually, in case of -fPIC */ \
 			"mov %0, %%ecx;" \
 			"mov %1, %%edx;" \
 			"mov %2, %%ebx;" \
-			"mov %3, %%esi;" \
+			".intel_syntax noprefix;" \
 			"push ebp;"
 	#define AddEpilogue \
 			"pop ebp;" \
 			".att_syntax prefix;" \
 			"pop %%ebx;" \
 					: \
 					: "c" (C), "d" (A), "m" (B), "S" (N) \
 					: "%edi", "memory", "cc" \
 		);
 	#define MulPrologue \
 		__asm__ __volatile__ \
 		( \
 			"push %%ebx;"	/* save this manually, in case of -fPIC */ \
 			"push %%ebp;" \
 			"push %0;" \
 			".intel_syntax noprefix;"
-	#define PentiumEpilogue \
+	#define MulEpilogue \
 			"add esp, 4;" \
 			"pop ebp;" \
 			"pop ebx;" \
 			".att_syntax prefix;" \
 			: \
-					: "m" (C), "m" (A), "m" (B), "m" (N) \
+			: "rm" (Z), "S" (X), "c" (Y) \
-					: "%ecx", "%edx", "%ebx", "%esi", "%edi" \
+			: "%eax", "%edx", "%edi", "memory", "cc" \
 		);
 	#define P4Prologue PentiumPrologue
 	#define P4Epilogue PentiumEpilogue
 #endif
 CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
 {
-	PentiumPrologue
+	AddPrologue
 	// now: ebx = B, ecx = C, edx = A, esi = N
 	AS2(	sub ecx, edx)	// hold the distance between C & A so we can add this to A to get C
@ -982,12 +1104,12 @@ CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B,
 	AS1(loopendAdd:)
 	AS2(	adc eax, 0)		// store carry into eax (return result register)
-	PentiumEpilogue
+	AddEpilogue
 }
 CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
 {
-	PentiumPrologue
+	AddPrologue
 	// now: ebx = B, ecx = C, edx = A, esi = N
 	AS2(	sub ecx, edx)	// hold the distance between C & A so we can add this to A to get C
@ -1020,12 +1142,14 @@ CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const wor
 	AS1(loopendSub:)
 	AS2(	adc eax, 0)		// store carry into eax (return result register)
-	PentiumEpilogue
+	AddEpilogue
 }
 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
 CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
 {
-	P4Prologue
+	AddPrologue
 	// now: ebx = B, ecx = C, edx = A, esi = N
 	AS2(	xor		eax, eax)
@ -1077,12 +1201,12 @@ CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsi
 	AS1(loopendAddP4:)
-	P4Epilogue
+	AddEpilogue
 }
 CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
 {
-	P4Prologue
+	AddPrologue
 	// now: ebx = B, ecx = C, edx = A, esi = N
 	AS2(	xor		eax, eax)
@ -1134,147 +1258,51 @@ CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B,
 	AS1(loopendSubP4:)
-	P4Epilogue
+	AddEpilogue
 }
-#if __GNUC__
+// multiply assembly code originally contributed by Leonard Janke
 // Comba square and multiply assembly code originally contributed by Leonard Janke
 // These are not needed with MSVC, which does a good job of optimizing the C++ multiply code.
 #define SqrStartup \
  "push %%ebp\n\t" \
  "push %%esi\n\t" \
  "push %%ebx\n\t" \
  "xor %%ebp, %%ebp\n\t" \
  "xor %%ebx, %%ebx\n\t" \
  "xor %%ecx, %%ecx\n\t" 
 #define SqrShiftCarry \
  "mov %%ebx, %%ebp\n\t" \
  "mov %%ecx, %%ebx\n\t" \
  "xor %%ecx, %%ecx\n\t"
 #define SqrAccumulate(i,j) \
  "mov 4*"#j"(%%esi), %%eax\n\t" \
  "mull 4*"#i"(%%esi)\n\t" \
  "add %%eax, %%ebp\n\t" \
  "adc %%edx, %%ebx\n\t" \
  "adc %%ch, %%cl\n\t" \
  "add %%eax, %%ebp\n\t" \
  "adc %%edx, %%ebx\n\t" \
  "adc %%ch, %%cl\n\t"
 #define SqrAccumulateCentre(i) \
  "mov 4*"#i"(%%esi), %%eax\n\t" \
  "mull 4*"#i"(%%esi)\n\t" \
  "add %%eax, %%ebp\n\t" \
  "adc %%edx, %%ebx\n\t" \
  "adc %%ch, %%cl\n\t" 
 #define SqrStoreDigit(X)  \
  "mov %%ebp, 4*"#X"(%%edi)\n\t" \
 #define SqrLastDiagonal(digits) \
  "mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
  "mull 4*("#digits"-1)(%%esi)\n\t" \
  "add %%eax, %%ebp\n\t" \
  "adc %%edx, %%ebx\n\t" \
  "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
  "mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t" 
 #define SqrCleanup \
  "pop %%ebx\n\t" \
  "pop %%esi\n\t" \
  "pop %%ebp\n\t" 
 void PentiumOptimized::Square4(word* Y, const word* X)
 {
 	__asm__ __volatile__(
 		SqrStartup
 		SqrAccumulateCentre(0)
 		SqrStoreDigit(0)
 		SqrShiftCarry
 		SqrAccumulate(1,0)
 		SqrStoreDigit(1)
 		SqrShiftCarry
 		SqrAccumulate(2,0)
 		SqrAccumulateCentre(1)
 		SqrStoreDigit(2)
 		SqrShiftCarry
 		SqrAccumulate(3,0)
 		SqrAccumulate(2,1)
 		SqrStoreDigit(3)
 		SqrShiftCarry
 		SqrAccumulate(3,1)
 		SqrAccumulateCentre(2)
 		SqrStoreDigit(4)
 		SqrShiftCarry
 		SqrAccumulate(3,2)
 		SqrStoreDigit(5)
 		SqrShiftCarry
 		SqrLastDiagonal(4)
 		SqrCleanup
 		:
 		: "D" (Y), "S" (X)
 		: "eax",  "ecx", "edx", "ebp",   "memory"
 	);
 }
 #define MulStartup \
-  "push %%ebp\n\t" \
+	AS2(xor ebp, ebp) \
-  "push %%esi\n\t" \
+	AS2(xor edi, edi) \
-  "push %%ebx\n\t" \
+	AS2(xor ebx, ebx) 
  "push %%edi\n\t" \
  "mov %%eax, %%ebx \n\t" \
  "xor %%ebp, %%ebp\n\t" \
  "xor %%edi, %%edi\n\t" \
  "xor %%ecx, %%ecx\n\t" 
 #define MulShiftCarry \
-  "mov %%edx, %%ebp\n\t" \
+	AS2(mov ebp, edx) \
-  "mov %%ecx, %%edi\n\t" \
+	AS2(mov edi, ebx) \
-  "xor %%ecx, %%ecx\n\t"
+	AS2(xor ebx, ebx)
 #define MulAccumulateBottom(i,j) \
 	AS2(mov eax, [ecx+4*j]) \
 	AS2(imul eax, dword ptr [esi+4*i]) \
 	AS2(add ebp, eax)
 #define MulAccumulate(i,j) \
-  "mov 4*"#j"(%%ebx), %%eax\n\t" \
+	AS2(mov eax, [ecx+4*j]) \
-  "mull 4*"#i"(%%esi)\n\t" \
+	AS1(mul dword ptr [esi+4*i]) \
-  "add %%eax, %%ebp\n\t" \
+	AS2(add ebp, eax) \
-  "adc %%edx, %%edi\n\t" \
+	AS2(adc edi, edx) \
-  "adc %%ch, %%cl\n\t"
+	AS2(adc bl, bh)
-#define MulStoreDigit(X)  \
+#define MulStoreDigit(i)  \
-  "mov %%edi, %%edx \n\t" \
+	AS2(mov edx, edi) \
-  "mov (%%esp), %%edi \n\t" \
+	AS2(mov edi, [esp]) \
-  "mov %%ebp, 4*"#X"(%%edi)\n\t" \
+	AS2(mov [edi+4*i], ebp)
  "mov %%edi, (%%esp)\n\t" 
 #define MulLastDiagonal(digits) \
-  "mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
+	AS2(mov eax, [ecx+4*(digits-1)]) \
-  "mull 4*("#digits"-1)(%%esi)\n\t" \
+	AS1(mul dword ptr [esi+4*(digits-1)]) \
-  "add %%eax, %%ebp\n\t" \
+	AS2(add ebp, eax) \
-  "adc %%edi, %%edx\n\t" \
+	AS2(adc edx, edi) \
-  "mov (%%esp), %%edi\n\t" \
+	AS2(mov edi, [esp]) \
-  "mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
+	AS2(mov [edi+4*(2*digits-2)], ebp) \
-  "mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t" 
+	AS2(mov [edi+4*(2*digits-1)], edx)
-#define MulCleanup \
+CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
  "pop %%edi\n\t" \
  "pop %%ebx\n\t" \
  "pop %%esi\n\t" \
  "pop %%ebp\n\t" 
 void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
 {
-	__asm__ __volatile__(
+	MulPrologue
 	// now: [esp] = Z, esi = X, ecx = Y
 	MulStartup
 	MulAccumulate(0,0)
 	MulStoreDigit(0)
@ -1310,18 +1338,13 @@ void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
 	MulShiftCarry
 	MulLastDiagonal(4)
-
+	MulEpilogue
 		MulCleanup
 		: 
 		: "D" (Z), "S" (X), "a" (Y)
 		: "%ecx", "%edx",  "memory"
 	);
 }
-void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
+CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
 {
-	__asm__ __volatile__(
+	MulPrologue
 	// now: [esp] = Z, esi = X, ecx = Y
 	MulStartup
 	MulAccumulate(0,0)
 	MulStoreDigit(0)
@ -1429,16 +1452,77 @@ void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
 	MulShiftCarry
 	MulLastDiagonal(8)
-
+	MulEpilogue
 		MulCleanup
 		: 
 		: "D" (Z), "S" (X), "a" (Y)
 		: "%ecx", "%edx",  "memory"
 	);
 }
-#endif	// __GNUC__
+CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
 {
 	MulPrologue
 	// now: [esp] = Z, esi = X, ecx = Y
 	MulStartup
 	MulAccumulate(0,0)
 	MulStoreDigit(0)
 	MulShiftCarry
 	MulAccumulate(1,0)
 	MulAccumulate(0,1)
 	MulStoreDigit(1)
 	MulShiftCarry
 	MulAccumulate(2,0)
 	MulAccumulate(1,1)
 	MulAccumulate(0,2)
 	MulStoreDigit(2)
 	MulShiftCarry
 	MulAccumulate(3,0)
 	MulAccumulate(2,1)
 	MulAccumulate(1,2)
 	MulAccumulate(0,3)
 	MulStoreDigit(3)
 	MulShiftCarry
 	MulAccumulate(4,0)
 	MulAccumulate(3,1)
 	MulAccumulate(2,2)
 	MulAccumulate(1,3)
 	MulAccumulate(0,4)
 	MulStoreDigit(4)
 	MulShiftCarry
 	MulAccumulate(5,0)
 	MulAccumulate(4,1)
 	MulAccumulate(3,2)
 	MulAccumulate(2,3)
 	MulAccumulate(1,4)
 	MulAccumulate(0,5)
 	MulStoreDigit(5)
 	MulShiftCarry
 	MulAccumulate(6,0)
 	MulAccumulate(5,1)
 	MulAccumulate(4,2)
 	MulAccumulate(3,3)
 	MulAccumulate(2,4)
 	MulAccumulate(1,5)
 	MulAccumulate(0,6)
 	MulStoreDigit(6)
 	MulShiftCarry
 	MulAccumulateBottom(7,0)
 	MulAccumulateBottom(6,1)
 	MulAccumulateBottom(5,2)
 	MulAccumulateBottom(4,3)
 	MulAccumulateBottom(3,4)
 	MulAccumulateBottom(2,5)
 	MulAccumulateBottom(1,6)
 	MulAccumulateBottom(0,7)
 	MulStoreDigit(7)
 	MulEpilogue
 }
 #undef AS1
 #undef AS2
 #else	// not x86 - no processor specific code at this layer
@ -1446,43 +1530,8 @@ typedef Portable LowLevel;
 #endif
 bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
 void DisableSSE2()
 {
 	g_sse2Enabled = false;
 }
 #ifdef SSE2_INTRINSICS_AVAILABLE
 static bool GetSSE2Capability()
 {
 	word32 b;
 #ifdef __GNUC__
 	__asm__("mov $1, %%eax; cpuid; mov %%edx, %0" : "=rm" (b) : : "%eax", "%edx");
 #else
 	__asm
 	{
 		mov		eax, 1
 		cpuid
 		mov		b, edx
 	}
 #endif
 	return (b & (1 << 26)) != 0;
 }
 static inline bool HasSSE2()
 {
 	if (g_sse2Enabled && !g_sse2DetectionDone)
 	{
 		g_sse2Detected = GetSSE2Capability();
 		g_sse2DetectionDone = true;
 	}
 	return g_sse2Enabled && g_sse2Detected;
 }
 #ifdef __GNUC__
 #define __fastcall
 #endif
@ -1891,34 +1940,23 @@ void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
 #define R2		(R+N)
 #define R3		(R+N+N2)
 //VC60 workaround: compiler bug triggered without the extra dummy parameters
 // R[2*N] - result = A*B
 // T[2*N] - temporary work space
 // A[N] --- multiplier
 // B[N] --- multiplicant
-template <class P>
+void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
 void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
 template <class P>
 inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
 {
 	assert(N>=2 && N%2==0);
-	if (P::MultiplyRecursionLimit() >= 8 && N==8)
+	if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
-		P::Multiply8(R, A, B);
+		LowLevel::Multiply8(R, A, B);
-	else if (P::MultiplyRecursionLimit() >= 4 && N==4)
+	else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
-		P::Multiply4(R, A, B);
+		LowLevel::Multiply4(R, A, B);
 	else if (N==2)
-		P::Multiply2(R, A, B);
+		LowLevel::Multiply2(R, A, B);
 	else
-		DoRecursiveMultiply<P>(R, T, A, B, N, NULL);	// VC60 workaround: needs this NULL
+	{
 }
 template <class P>
 void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
 {
 		const unsigned int N2 = N/2;
 		int carry;
@ -1928,29 +1966,29 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
 		switch (2*aComp + aComp + bComp)
 		{
 		case -4:
-		P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-		P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-		RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
-		P::Subtract(T1, T1, R0, N2);
+			LowLevel::Subtract(T1, T1, R0, N2);
 			carry = -1;
 			break;
 		case -2:
-		P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-		P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-		RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
 			carry = 0;
 			break;
 		case 2:
-		P::Subtract(R0, A0, A1, N2);
+			LowLevel::Subtract(R0, A0, A1, N2);
-		P::Subtract(R1, B1, B0, N2);
+			LowLevel::Subtract(R1, B1, B0, N2);
-		RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
 			carry = 0;
 			break;
 		case 4:
-		P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-		P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-		RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
-		P::Subtract(T1, T1, R1, N2);
+			LowLevel::Subtract(T1, T1, R1, N2);
 			carry = -1;
 			break;
 		default:
@ -1958,52 +1996,45 @@ void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigne
 			carry = 0;
 		}
-	RecursiveMultiply<P>(R0, T2, A0, B0, N2);
+		RecursiveMultiply(R0, T2, A0, B0, N2);
-	RecursiveMultiply<P>(R2, T2, A1, B1, N2);
+		RecursiveMultiply(R2, T2, A1, B1, N2);
 		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
-	carry += P::Add(T0, T0, R0, N);
+		carry += LowLevel::Add(T0, T0, R0, N);
-	carry += P::Add(T0, T0, R2, N);
+		carry += LowLevel::Add(T0, T0, R2, N);
-	carry += P::Add(R1, R1, T0, N);
+		carry += LowLevel::Add(R1, R1, T0, N);
 		assert (carry >= 0 && carry <= 2);
 		Increment(R3, N2, carry);
 	}
 }
 // R[2*N] - result = A*A
 // T[2*N] - temporary work space
 // A[N] --- number to be squared
-template <class P>
+void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
 void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL);
 template <class P>
 inline void RecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy=NULL)
 {
 	assert(N && N%2==0);
-	if (P::SquareRecursionLimit() >= 8 && N==8)
+	if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
-		P::Square8(R, A);
+		LowLevel::Square8(R, A);
-	if (P::SquareRecursionLimit() >= 4 && N==4)
+	if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
-		P::Square4(R, A);
+		LowLevel::Square4(R, A);
 	else if (N==2)
-		P::Square2(R, A);
+		LowLevel::Square2(R, A);
 	else
-		DoRecursiveSquare<P>(R, T, A, N, NULL);	// VC60 workaround: needs this NULL
+	{
 }
 template <class P>
 void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P *dummy)
 {
 		const unsigned int N2 = N/2;
-	RecursiveSquare<P>(R0, T2, A0, N2);
+		RecursiveSquare(R0, T2, A0, N2);
-	RecursiveSquare<P>(R2, T2, A1, N2);
+		RecursiveSquare(R2, T2, A1, N2);
-	RecursiveMultiply<P>(T0, T2, A0, A1, N2);
+		RecursiveMultiply(T0, T2, A0, A1, N2);
-	word carry = P::Add(R1, R1, T0, N);
+		word carry = LowLevel::Add(R1, R1, T0, N);
-	carry += P::Add(R1, R1, T0, N);
+		carry += LowLevel::Add(R1, R1, T0, N);
 		Increment(R3, N2, carry);
 	}
 }
 // R[N] - bottom half of A*B
@ -2011,33 +2042,25 @@ void DoRecursiveSquare(word *R, word *T, const word *A, unsigned int N, const P
 // A[N] - multiplier
 // B[N] - multiplicant
-template <class P>
+void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
 void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
 template <class P>
 inline void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
 {
 	assert(N>=2 && N%2==0);
-	if (P::MultiplyBottomRecursionLimit() >= 8 && N==8)
+	if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
-		P::Multiply8Bottom(R, A, B);
+		LowLevel::Multiply8Bottom(R, A, B);
-	else if (P::MultiplyBottomRecursionLimit() >= 4 && N==4)
+	else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
-		P::Multiply4Bottom(R, A, B);
+		LowLevel::Multiply4Bottom(R, A, B);
 	else if (N==2)
-		P::Multiply2Bottom(R, A, B);
+		LowLevel::Multiply2Bottom(R, A, B);
 	else
-		DoRecursiveMultiplyBottom<P>(R, T, A, B, N, NULL);
+	{
 }
 template <class P>
 void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy)
 {
 		const unsigned int N2 = N/2;
-	RecursiveMultiply<P>(R, T, A0, B0, N2);
+		RecursiveMultiply(R, T, A0, B0, N2);
-	RecursiveMultiplyBottom<P>(T0, T1, A1, B0, N2);
+		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
-	P::Add(R1, R1, T0, N2);
+		LowLevel::Add(R1, R1, T0, N2);
-	RecursiveMultiplyBottom<P>(T0, T1, A0, B1, N2);
+		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
-	P::Add(R1, R1, T0, N2);
+		LowLevel::Add(R1, R1, T0, N2);
 	}
 }
 // R[N] --- upper half of A*B
@ -2046,19 +2069,18 @@ void DoRecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, u
 // A[N] --- multiplier
 // B[N] --- multiplicant
-template <class P>
+void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
 {
 	assert(N>=2 && N%2==0);
 	if (N==4)
 	{
-		P::Multiply4(T, A, B);
+		LowLevel::Multiply4(T, A, B);
 		memcpy(R, T+4, 4*WORD_SIZE);
 	}
 	else if (N==2)
 	{
-		P::Multiply2(T, A, B);
+		LowLevel::Multiply2(T, A, B);
 		memcpy(R, T+2, 2*WORD_SIZE);
 	}
 	else
@ -2072,29 +2094,29 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
 		switch (2*aComp + aComp + bComp)
 		{
 		case -4:
-			P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-			P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
-			P::Subtract(T1, T1, R0, N2);
+			LowLevel::Subtract(T1, T1, R0, N2);
 			carry = -1;
 			break;
 		case -2:
-			P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-			P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
 			carry = 0;
 			break;
 		case 2:
-			P::Subtract(R0, A0, A1, N2);
+			LowLevel::Subtract(R0, A0, A1, N2);
-			P::Subtract(R1, B1, B0, N2);
+			LowLevel::Subtract(R1, B1, B0, N2);
-			RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
 			carry = 0;
 			break;
 		case 4:
-			P::Subtract(R0, A1, A0, N2);
+			LowLevel::Subtract(R0, A1, A0, N2);
-			P::Subtract(R1, B0, B1, N2);
+			LowLevel::Subtract(R1, B0, B1, N2);
-			RecursiveMultiply<P>(T0, T2, R0, R1, N2);
+			RecursiveMultiply(T0, T2, R0, R1, N2);
-			P::Subtract(T1, T1, R1, N2);
+			LowLevel::Subtract(T1, T1, R1, N2);
 			carry = -1;
 			break;
 		default:
@ -2102,18 +2124,18 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
 			carry = 0;
 		}
-		RecursiveMultiply<P>(T2, R0, A1, B1, N2);
+		RecursiveMultiply(T2, R0, A1, B1, N2);
 		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
-		word c2 = P::Subtract(R0, L+N2, L, N2);
+		word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
-		c2 += P::Subtract(R0, R0, T0, N2);
+		c2 += LowLevel::Subtract(R0, R0, T0, N2);
 		word t = (Compare(R0, T2, N2) == -1);
 		carry += t;
 		carry += Increment(R0, N2, c2+t);
-		carry += P::Add(R0, R0, T1, N2);
+		carry += LowLevel::Add(R0, R0, T1, N2);
-		carry += P::Add(R0, R0, T3, N2);
+		carry += LowLevel::Add(R0, R0, T3, N2);
 		assert (carry >= 0 && carry <= 2);
 		CopyWords(R1, T3, N2);
@ -2133,42 +2155,22 @@ inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
 inline void Multiply(word *R, word *T, const word *A, const word *B, unsigned int N)
 {
-#ifdef SSE2_INTRINSICS_AVAILABLE
+	RecursiveMultiply(R, T, A, B, N);
 	if (HasSSE2())
 		RecursiveMultiply<P4Optimized>(R, T, A, B, N);
 	else
 #endif
 		RecursiveMultiply<LowLevel>(R, T, A, B, N);
 }
 inline void Square(word *R, word *T, const word *A, unsigned int N)
 {
-#ifdef SSE2_INTRINSICS_AVAILABLE
+	RecursiveSquare(R, T, A, N);
 	if (HasSSE2())
 		RecursiveSquare<P4Optimized>(R, T, A, N);
 	else
 #endif
 		RecursiveSquare<LowLevel>(R, T, A, N);
 }
 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
 {
-#ifdef SSE2_INTRINSICS_AVAILABLE
+	RecursiveMultiplyBottom(R, T, A, B, N);
 	if (HasSSE2())
 		RecursiveMultiplyBottom<P4Optimized>(R, T, A, B, N);
 	else
 #endif
 		RecursiveMultiplyBottom<LowLevel>(R, T, A, B, N);
 }
 inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
 {
-#ifdef SSE2_INTRINSICS_AVAILABLE
+	RecursiveMultiplyTop(R, T, L, A, B, N);
 	if (HasSSE2())
 		RecursiveMultiplyTop<P4Optimized>(R, T, L, A, B, N);
 	else
 #endif
 		RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
 }
 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)