From d2510f30c75b341dcbc45432a4bd38c0513f2616 Mon Sep 17 00:00:00 2001
From: weidai <weidai11@users.noreply.github.com>
Date: Fri, 4 May 2007 15:24:09 +0000
Subject: [PATCH] Fix compilation for x64, DLL, and VC 6 builds

---
 camellia.cpp  |   2 +-
 cpu.cpp       |  29 ++--
 cpu.h         |  33 ++++-
 datatest.cpp  |  13 +-
 integer.cpp   | 389 +++++++++++++++++++++++++++++++++++---------------
 panama.cpp    | 137 +++++++++---------
 rijndael.cpp  | 252 ++++++++++++++++++++------------
 secblock.h    |  27 ++--
 sha.cpp       |  21 ++-
 smartptr.h    |  14 +-
 sosemanuk.cpp | 267 +++++++++++++++++-----------------
 tiger.cpp     |   9 +-
 whrlpool.cpp  | 101 ++++++-------
 x64masm.asm   |  69 +++------
 14 files changed, 810 insertions(+), 553 deletions(-)
diff --git a/camellia.cpp b/camellia.cpp
index 0bca33a1..cdd7906c 100644
--- a/camellia.cpp
+++ b/camellia.cpp
@@ -228,7 +228,7 @@ void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBloc
 
 	SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1))
 	SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3))
-	for (unsigned int i = m_rounds-1; i > 0; --i)
+	for (i = m_rounds-1; i > 0; --i)
 	{
 		DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
 		DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3))
diff --git a/cpu.cpp b/cpu.cpp
index a4922504..c42dd8bc 100755
--- a/cpu.cpp
+++ b/cpu.cpp
@@ -1,8 +1,10 @@
 // cpu.cpp - written and placed in the public domain by Wei Dai
 
 #include "pch.h"
-#include "cpu.h"
 
+#ifndef CRYPTOPP_IMPORTS
+
+#include "cpu.h"
 #include "misc.h"
 #include <algorithm>
 
@@ -11,10 +13,15 @@
 #include <setjmp.h>
 #endif
 
+#ifdef CRYPTOPP_MSVC6PP_OR_LATER
+#include <emmintrin.h>
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)
 
 #ifdef CRYPTOPP_X86_ASM_AVAILABLE
 
+#ifndef _MSC_VER
 typedef void (*SigHandler)(int);
 
 static jmp_buf s_jmpNoCPUID;
@@ -22,6 +29,7 @@ static void SigIllHandlerCPUID(int)
 {
 	longjmp(s_jmpNoCPUID, 1);
 }
+#endif
 
 bool CpuId(word32 input, word32 *output)
 {
@@ -57,7 +65,11 @@ bool CpuId(word32 input, word32 *output)
 		__asm__
 		(
 			// save ebx in case -fPIC is being used
+#if CRYPTOPP_BOOL_X86
 			"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
+#else
+			"pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
+#endif
 			: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
 			: "a" (input)
 		);
@@ -84,22 +96,19 @@ bool CpuId(word32 input, word32 *output)
 	return true;
 }
 
-inline bool TrySSE2()
-{
-	return true;
-}
-
 #endif
 
 #ifdef CRYPTOPP_CPUID_AVAILABLE
 
 static bool TrySSE2()
 {
-#ifdef _MSC_VER
+#if CRYPTOPP_BOOL_X64
+	return true;
+#elif defined(_MSC_VER)
     __try
 	{
 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-        __asm por xmm0, xmm0        // executing SSE2 instruction
+        AS2(por xmm0, xmm0)        // executing SSE2 instruction
 #elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
 		__mm128i x = _mm_setzero_si128();
 		return _mm_cvtsi128_si32(x) == 0;
@@ -137,7 +146,7 @@ static bool TrySSE2()
 
 bool g_x86DetectionDone = false;
 bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false;
-int g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
+word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
 
 void DetectX86Features()
 {
@@ -170,3 +179,5 @@ void DetectX86Features()
 #endif
 
 NAMESPACE_END
+
+#endif
diff --git a/cpu.h b/cpu.h
index 6a212345..6eae4896 100755
--- a/cpu.h
+++ b/cpu.h
@@ -3,6 +3,10 @@
 
 #include "config.h"
 
+#ifdef CRYPTOPP_MSVC6PP_OR_LATER
+	#include <emmintrin.h>
+#endif
+
 NAMESPACE_BEGIN(CryptoPP)
 
 #if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
@@ -10,12 +14,15 @@ NAMESPACE_BEGIN(CryptoPP)
 #define CRYPTOPP_CPUID_AVAILABLE
 
 // these should not be used directly
-extern bool g_x86DetectionDone;
-extern bool g_hasSSE2, g_hasMMX, g_hasSSSE3, g_isP4;
-extern int g_cacheLineSize;
-void DetectX86Features();
+extern CRYPTOPP_DLL bool g_x86DetectionDone;
+extern CRYPTOPP_DLL bool g_hasSSE2;
+extern CRYPTOPP_DLL bool g_hasMMX;
+extern CRYPTOPP_DLL bool g_hasSSSE3;
+extern CRYPTOPP_DLL bool g_isP4;
+extern CRYPTOPP_DLL word32 g_cacheLineSize;
+CRYPTOPP_DLL void DetectX86Features();
 
-bool CpuId(word32 input, word32 *output);
+CRYPTOPP_DLL bool CpuId(word32 input, word32 *output);
 
 #if CRYPTOPP_BOOL_X64
 inline bool HasSSE2()	{return true;}
@@ -94,6 +101,7 @@ inline bool HasMMX()	{return false;}
 	#define ASL(x) GNU_ASL(x)
 	#define ASJ(x, y, z) GNU_ASJ(x, y, z)
 	#define ASC(x, y) #x " " #y ";"
+	#define CRYPTOPP_NAKED
 #else
 	#define AS1(x) __asm {x}
 	#define AS2(x, y) __asm {x, y}
@@ -102,11 +110,26 @@ inline bool HasMMX()	{return false;}
 	#define ASL(x) __asm {label##x:}
 	#define ASJ(x, y, z) __asm {x label##y}
 	#define ASC(x, y) __asm {x label##y}
+	#define CRYPTOPP_NAKED __declspec(naked)
 #endif
 
 // GNU assembler doesn't seem to have mod operator
 #define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
 
+#if CRYPTOPP_BOOL_X86	// x86 build: assembly helpers use 4-byte words and e-prefixed registers
+	#define WORD_SZ 4	// word size in bytes
+	#define WORD_REG(x)	e##x	// WORD_REG(cx) -> ecx
+	#define WORD_PTR DWORD PTR	// memory operand size specifier matching WORD_SZ
+	#define AS_PUSH(x) AS1(push e##x)	// AS_PUSH(bx) -> push ebx
+	#define AS_POP(x) AS1(pop e##x)	// AS_POP(bx) -> pop ebx
+#elif CRYPTOPP_BOOL_X64	// x64 build: same helper names, 8-byte words and r-prefixed registers
+	#define WORD_SZ 8	// word size in bytes
+	#define WORD_REG(x)	r##x	// WORD_REG(cx) -> rcx
+	#define WORD_PTR QWORD PTR	// memory operand size specifier matching WORD_SZ
+	#define AS_PUSH(x) AS1(pushq r##x)	// AS_PUSH(bx) -> pushq rbx
+	#define AS_POP(x) AS1(popq r##x)	// AS_POP(bx) -> popq rbx
+#endif	// none of these helpers is defined when neither CRYPTOPP_BOOL_X86 nor CRYPTOPP_BOOL_X64 is true
+
 NAMESPACE_END
 
 #endif
diff --git a/datatest.cpp b/datatest.cpp
index 4a326093..950e4f90 100644
--- a/datatest.cpp
+++ b/datatest.cpp
@@ -5,14 +5,14 @@
 #include "randpool.h"
 #include "files.h"
 #include "trunhash.h"
+#include "queue.h"
+#include "validate.h"
 #include <iostream>
 #include <memory>
 
 USING_NAMESPACE(CryptoPP)
 USING_NAMESPACE(std)
 
-RandomPool & GlobalRNG();
-
 typedef std::map<std::string, std::string> TestData;
 
 class TestFailure : public Exception
@@ -67,7 +67,7 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
 			s1 = s1.substr(s1.find(' ')+1);
 		}
 		
-		s2.clear();
+		s2 = ""; // MSVC 6 doesn't have clear();
 
 		if (s1[0] == '\"')
 		{
@@ -85,8 +85,13 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
 			s1 = s1.substr(STDMIN(s1.find(' '), s1.length()));
 		}
 
+		ByteQueue q;
 		while (repeat--)
-			target.Put((const byte *)s2.data(), s2.size());
+		{
+			q.Put((const byte *)s2.data(), s2.size());
+			if (q.MaxRetrievable() > 4*1024 || repeat == 0)
+				q.TransferTo(target);
+		}
 	}
 }
 
diff --git a/integer.cpp b/integer.cpp
index 64f3cea0..a8e78818 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -18,7 +18,7 @@
 
 #include <iostream>
 
-#if defined(_MSC_VER) && _MSC_VER >= 1400
+#if _MSC_VER >= 1400
 	#include <intrin.h>
 #endif
 
@@ -30,6 +30,8 @@
 	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
 #endif
 
+#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
+
 NAMESPACE_BEGIN(CryptoPP)
 
 bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
@@ -99,7 +101,36 @@ static word AtomicInverseModPower2(word A)
 
 // ********************************************************
 
-#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64
+	#define Declare2Words(x)			word x##0, x##1;
+	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
+	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
+	#define LowWord(a)					a##0
+	#define HighWord(a)					a##1
+	#ifdef _MSC_VER
+		#define MultiplyWords(p, a, b)	p##0 = _umul128(a, b, &p##1);
+		#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
+	#elif defined(__DECCXX)
+		#define MultiplyWords(p, a, b)	p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
+	#elif CRYPTOPP_BOOL_X64
+		#define MultiplyWords(p, a, b)	asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc");
+		#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
+		#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
+		#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
+		#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
+		#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
+	#endif
+	#ifndef Double3Words
+		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
+	#endif
+	#ifndef Acc2WordsBy2
+		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
+	#endif
+	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
+	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
+	#define GetCarry(u)					u##1
+	#define GetBorrow(u)				u##1
+#else
 	#define Declare2Words(x)			dword x;
 	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
 		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
@@ -108,34 +139,23 @@ static word AtomicInverseModPower2(word A)
 	#endif
 	#define AssignWord(a, b)			a = b;
 	#define Add2WordsBy1(a, b, c)		a = b + c;
-	#define Acc2WordsBy1(a, b)			a += b;
 	#define Acc2WordsBy2(a, b)			a += b;
-	#define LowWord(a)					(word)a
-	#define HighWord(a)					(word)(a>>WORD_BITS)
-	#define Double2Words(a)				a += a;
+	#define LowWord(a)					word(a)
+	#define HighWord(a)					word(a>>WORD_BITS)
+	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
 	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
 	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
 	#define GetCarry(u)					HighWord(u)
 	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
-#else
-	#define Declare2Words(x)			word x##0, x##1;
-	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
-	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
+#endif
+#ifndef MulAcc
+	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
+#endif
+#ifndef Acc2WordsBy1
 	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
-	#define Acc2WordsBy2(a, b)			a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
-	#define LowWord(a)					a##0
-	#define HighWord(a)					a##1
-	#ifdef _MSC_VER
-		#define MultiplyWords(p, a, b)		p##0 = _umul128(a, b, &p##1);
-		#define Double2Words(a)				a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
-	#elif defined(__DECCXX)
-		#define MultiplyWords(p, a, b)		p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
-		#define Double2Words(a)				a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
-	#endif
-	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
-	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
-	#define GetCarry(u)					u##1
-	#define GetBorrow(u)				u##1
+#endif
+#ifndef Acc3WordsBy2
+	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
 #endif
 
 class DWord
@@ -411,9 +431,8 @@ inline word DWord::operator%(word a)
 
 // use some tricks to share assembly code between MSVC and GCC
 #if defined(__GNUC__)
-	#define CRYPTOPP_NAKED
 	#define AddPrologue \
-		word32 result;	\
+		word result;	\
 		__asm__ __volatile__ \
 		( \
 			".intel_syntax noprefix;"
@@ -454,7 +473,6 @@ inline word DWord::operator%(word a)
 			: "memory", "cc" \
 		);
 #else
-	#define CRYPTOPP_NAKED __declspec(naked)
 	#define AddPrologue \
 		__asm	push edi \
 		__asm	push esi \
@@ -464,33 +482,107 @@ inline word DWord::operator%(word a)
 		__asm	pop esi \
 		__asm	pop edi \
 		__asm	ret 8
+#if _MSC_VER < 1300
+	#define SaveEBX		__asm push ebx
+	#define RestoreEBX	__asm pop ebx
+#else
+	#define SaveEBX
+	#define RestoreEBX
+#endif
 	#define SquPrologue					\
 		AS2(	mov		eax, A)			\
 		AS2(	mov		ecx, C)			\
+		SaveEBX							\
 		AS2(	lea		ebx, s_maskLow16)
-	#define SquEpilogue
 	#define MulPrologue					\
 		AS2(	mov		eax, A)			\
 		AS2(	mov		edi, B)			\
 		AS2(	mov		ecx, C)			\
+		SaveEBX							\
 		AS2(	lea		ebx, s_maskLow16)
-	#define MulEpilogue
 	#define TopPrologue					\
 		AS2(	mov		eax, A)			\
 		AS2(	mov		edi, B)			\
 		AS2(	mov		ecx, C)			\
 		AS2(	mov		esi, L)			\
+		SaveEBX							\
 		AS2(	lea		ebx, s_maskLow16)
-	#define TopEpilogue
+	#define SquEpilogue		RestoreEBX
+	#define MulEpilogue		RestoreEBX
+	#define TopEpilogue		RestoreEBX
 #endif
 
-#if defined(_MSC_VER) && defined(_M_X64)
+#ifdef CRYPTOPP_X64_MASM_AVAILABLE
 extern "C" {
-int Baseline_Add(size_t N, word *C, const word *A, const word *B);
-int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
+word Baseline_Add(size_t N, word *C, const word *A, const word *B);
+word Baseline_Sub(size_t N, word *C, const word *A, const word *B);
 }
-#elif defined(CRYPTOPP_X86_ASM_AVAILABLE)
-CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
+#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
+word Baseline_Add(size_t N, word *C, const word *A, const word *B)	// x64 GCC path: C[i] = A[i] + B[i] for i in [0, N); returns the carry out of the last word
+{	// the loop consumes two words per pass, so N must be even (the portable Baseline_Add asserts N%2 == 0)
+	word result;
+	__asm__ __volatile__
+	(
+	".intel_syntax;"
+	AS1(	neg		%1)	// %1 = -N; the pointers are passed as X+N, so [X+8*%1] starts at X[0]
+	ASJ(	jz,		1, f)	// N == 0: skip straight to the carry read-out
+	AS2(	mov		%0,[%3+8*%1])
+	AS2(	add		%0,[%4+8*%1])	// first word uses add (no carry in); every later word uses adc
+	AS2(	mov		[%2+8*%1],%0)
+	ASL(0)
+	AS2(	mov		%0,[%3+8*%1+8])
+	AS2(	adc		%0,[%4+8*%1+8])
+	AS2(	mov		[%2+8*%1+8],%0)
+	AS2(	lea		%1,[%1+2])	// advance the index with lea so the carry flag survives
+	ASJ(	jrcxz,	1, f)	// exit test on the counter register; does not touch the flags
+	AS2(	mov		%0,[%3+8*%1])
+	AS2(	adc		%0,[%4+8*%1])
+	AS2(	mov		[%2+8*%1],%0)
+	ASJ(	jmp,	0, b)
+	ASL(1)
+	AS2(	mov		%0, 0)	// mov leaves the flags alone
+	AS2(	adc		%0, %0)	// result = 0 + 0 + CF, i.e. the final carry
+	".att_syntax;"
+	: "=&r" (result), "+c" (N)	// the asm rewrites %1 (neg/lea), so N must be a read-write operand, not input-only
+	: "r" (C+N), "r" (A+N), "r" (B+N)
+	: "memory", "cc"
+	);
+	return result;
+}
+
+word Baseline_Sub(size_t N, word *C, const word *A, const word *B)	// x64 GCC path: C[i] = A[i] - B[i] for i in [0, N); returns the borrow out of the last word
+{	// the loop consumes two words per pass, so N must be even (the portable Baseline_Sub asserts N%2 == 0)
+	word result;
+	__asm__ __volatile__
+	(
+	".intel_syntax;"
+	AS1(	neg		%1)	// %1 = -N; the pointers are passed as X+N, so [X+8*%1] starts at X[0]
+	ASJ(	jz,		1, f)	// N == 0: skip straight to the borrow read-out
+	AS2(	mov		%0,[%3+8*%1])
+	AS2(	sub		%0,[%4+8*%1])	// first word uses sub (no borrow in); every later word uses sbb
+	AS2(	mov		[%2+8*%1],%0)
+	ASL(0)
+	AS2(	mov		%0,[%3+8*%1+8])
+	AS2(	sbb		%0,[%4+8*%1+8])
+	AS2(	mov		[%2+8*%1+8],%0)
+	AS2(	lea		%1,[%1+2])	// advance the index with lea so the borrow (CF) survives
+	ASJ(	jrcxz,	1, f)	// exit test on the counter register; does not touch the flags
+	AS2(	mov		%0,[%3+8*%1])
+	AS2(	sbb		%0,[%4+8*%1])
+	AS2(	mov		[%2+8*%1],%0)
+	ASJ(	jmp,	0, b)
+	ASL(1)
+	AS2(	mov		%0, 0)	// mov leaves the flags alone
+	AS2(	adc		%0, %0)	// result = 0 + 0 + CF; after sbb the carry flag holds the borrow
+	".att_syntax;"
+	: "=&r" (result), "+c" (N)	// the asm rewrites %1 (neg/lea), so N must be a read-write operand, not input-only
+	: "r" (C+N), "r" (A+N), "r" (B+N)
+	: "memory", "cc"
+	);
+	return result;
+}
+#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
+CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
 {
 	AddPrologue
 
@@ -531,7 +623,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word
 	AddEpilogue
 }
 
-CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
+CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
 {
 	AddPrologue
 
@@ -572,8 +664,8 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word
 	AddEpilogue
 }
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
+#if CRYPTOPP_INTEGER_SSE2
+CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
 {
 	AddPrologue
 
@@ -629,7 +721,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A,
 
 	AddEpilogue
 }
-CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
+CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
 {
 	AddPrologue
 
@@ -687,7 +779,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A,
 }
 #endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
 #else
-int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
+word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
 {
 	assert (N%2 == 0);
 
@@ -703,7 +795,7 @@ int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word
 	return int(GetCarry(u));
 }
 
-int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
+word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
 {
 	assert (N%2 == 0);
 
@@ -737,7 +829,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 #define Mul_2 \
 	Mul_Begin(2) \
 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
-	Mul_End(2)
+	Mul_End(1, 1)
 
 #define Mul_4 \
 	Mul_Begin(4) \
@@ -746,7 +838,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
 	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
 	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
-	Mul_End(4)
+	Mul_End(5, 3)
 
 #define Mul_8 \
 	Mul_Begin(8) \
@@ -763,7 +855,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
 	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
 	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
-	Mul_End(8)
+	Mul_End(13, 7)
 
 #define Mul_16 \
 	Mul_Begin(16) \
@@ -796,7 +888,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
 	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
 	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
-	Mul_End(16)
+	Mul_End(29, 15)
 
 #define Squ_2 \
 	Squ_Begin(2) \
@@ -900,6 +992,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
 	Bot_End(16)
 
+#if 0
 #define Mul_Begin(n)				\
 	Declare2Words(p)				\
 	Declare2Words(c)				\
@@ -938,9 +1031,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 
 #define Bot_End(n)		\
 	R[n-1] = e;
-
-/*
-// this is slower on MSVC 2005 Win32
+#else
 #define Mul_Begin(n)				\
 	Declare2Words(p)				\
 	word c;	\
@@ -950,25 +1041,20 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 	AssignWord(d, HighWord(p))
 
 #define Mul_Acc(i, j)				\
-	MultiplyWords(p, A[i], B[j])	\
-	Acc2WordsBy1(p, c)		\
-	c = LowWord(p);	\
-	Acc2WordsBy1(d, HighWord(p))
+	MulAcc(c, d, A[i], B[j])
 
 #define Mul_SaveAcc(k, i, j) 		\
 	R[k] = c;				\
-	MultiplyWords(p, A[i], B[j])	\
-	Acc2WordsBy1(p, LowWord(d))		\
-	c = LowWord(p);	\
+	c = LowWord(d);	\
 	AssignWord(d, HighWord(d))	\
-	Acc2WordsBy1(d, HighWord(p))
+	MulAcc(c, d, A[i], B[j])
 
-#define Mul_End(n)					\
-	R[2*n-3] = c;			\
-	MultiplyWords(p, A[n-1], B[n-1])\
-	Acc2WordsBy2(d, p)				\
-	R[2*n-2] = LowWord(d);			\
-	R[2*n-1] = HighWord(d);
+#define Mul_End(k, i)					\
+	R[k] = c;			\
+	MultiplyWords(p, A[i], B[i])	\
+	Acc2WordsBy2(p, d)				\
+	R[k+1] = LowWord(p);			\
+	R[k+2] = HighWord(p);
 
 #define Bot_SaveAcc(k, i, j)		\
 	R[k] = c;				\
@@ -980,52 +1066,45 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
 
 #define Bot_End(n)		\
 	R[n-1] = c;
-*/
+#endif
 
 #define Squ_Begin(n)				\
 	Declare2Words(p)				\
-	Declare2Words(c)				\
+	word c;				\
 	Declare2Words(d)				\
 	Declare2Words(e)				\
 	MultiplyWords(p, A[0], A[0])	\
 	R[0] = LowWord(p);				\
 	AssignWord(e, HighWord(p))		\
 	MultiplyWords(p, A[0], A[1])	\
-	AssignWord(c, LowWord(p))		\
+	c = LowWord(p);		\
 	AssignWord(d, HighWord(p))		\
 	Squ_NonDiag						\
 
 #define Squ_NonDiag				\
-	Double2Words(c)				\
-	Double2Words(d)				\
+	Double3Words(c, d)
 
 #define Squ_SaveAcc(k, i, j) 		\
-	Acc2WordsBy2(c, e)				\
-	R[k] = LowWord(c);				\
-	Add2WordsBy1(e, d, HighWord(c))	\
+	Acc3WordsBy2(c, d, e)			\
+	R[k] = c;				\
 	MultiplyWords(p, A[i], A[j])	\
-	AssignWord(c, LowWord(p))		\
+	c = LowWord(p);		\
 	AssignWord(d, HighWord(p))		\
 
 #define Squ_Acc(i, j)				\
-	MultiplyWords(p, A[i], A[j])	\
-	Acc2WordsBy1(c, LowWord(p))		\
-	Acc2WordsBy1(d, HighWord(p))
+	MulAcc(c, d, A[i], A[j])
 
 #define Squ_Diag(i)					\
 	Squ_NonDiag						\
-	MultiplyWords(p, A[i], A[i])	\
-	Acc2WordsBy1(c, LowWord(p))		\
-	Acc2WordsBy1(d, HighWord(p))	\
+	MulAcc(c, d, A[i], A[i])
 
 #define Squ_End(n)					\
-	Acc2WordsBy2(c, e)				\
-	R[2*n-3] = LowWord(c);			\
-	Acc2WordsBy1(d, HighWord(c))	\
+	Acc3WordsBy2(c, d, e)			\
+	R[2*n-3] = c;			\
 	MultiplyWords(p, A[n-1], A[n-1])\
-	Acc2WordsBy2(d, p)				\
-	R[2*n-2] = LowWord(d);			\
-	R[2*n-1] = HighWord(d);
+	Acc2WordsBy2(p, e)				\
+	R[2*n-2] = LowWord(p);			\
+	R[2*n-1] = HighWord(p);
 
 void Baseline_Multiply2(word *R, const word *A, const word *B)
 {
@@ -1072,7 +1151,62 @@ void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
 	Bot_8
 }
 
-/*
+#define Top_Begin(n)				\
+	Declare2Words(p)				\
+	word c;	\
+	Declare2Words(d)				\
+	MultiplyWords(p, A[0], B[n-2]);\
+	AssignWord(d, HighWord(p));	// Top_Begin: declares p, c, d and seeds d with the high word of A[0]*B[n-2]; c stays unset until a Top_SaveAcc* assigns it
+
+#define Top_Acc(i, j)	\
+	MultiplyWords(p, A[i], B[j]);\
+	Acc2WordsBy1(d, HighWord(p));	// Top_Acc: adds only the high word of A[i]*B[j] into d; the low word is dropped
+
+#define Top_SaveAcc0(i, j) 		\
+	c = LowWord(d);	\
+	AssignWord(d, HighWord(d))	\
+	MulAcc(c, d, A[i], B[j])	// Top_SaveAcc0: shifts the accumulator down one word (c = low of d, d = high of d) without storing to R, then accumulates A[i]*B[j]
+
+#define Top_SaveAcc1(i, j) 		\
+	c = L<c; \
+	Acc2WordsBy1(d, c);	\
+	c = LowWord(d);	\
+	AssignWord(d, HighWord(d))	\
+	MulAcc(c, d, A[i], B[j])	// Top_SaveAcc1: like Top_SaveAcc0, but first adds (L<c) to d as a carry; L is the enclosing function's parameter
+
+void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)	// upper two words of the 2x2-word product; L is not used here
+{
+	word product[4];	// full four-word result of A*B
+	Baseline_Multiply2(product, A, B);
+	R[1] = product[3];	// keep only the high half
+	R[0] = product[2];
+}
+
+void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)	// writes R[0..3] from 4-word operands A and B; presumably the upper half of A*B, as in Baseline_MultiplyTop2 - confirm
+{	// L feeds the carry computed in Top_SaveAcc1 (c = L<c); MultiplyTop passes L[N-1] for this argument
+	Top_Begin(4)	// declares p, c, d and seeds d from the high word of A[0]*B[2]
+	Top_Acc(1, 1) Top_Acc(2, 0)  \
+	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
+	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
+	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
+	Mul_End(1, 3)	// stores R[1], then R[2] and R[3] from A[3]*B[3] plus the accumulator
+}
+
+void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)	// writes R[0..7] from 8-word operands A and B; presumably the upper half of A*B, as in Baseline_MultiplyTop2 - confirm
+{	// L feeds the carry computed in Top_SaveAcc1 (c = L<c); MultiplyTop passes L[N-1] for this argument
+	Top_Begin(8)	// declares p, c, d and seeds d from the high word of A[0]*B[6]
+	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
+	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
+	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
+	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
+	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
+	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
+	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
+	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
+	Mul_End(5, 7)	// stores R[5], then R[6] and R[7] from A[7]*B[7] plus the accumulator
+}
+
+#if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
 void Baseline_Multiply16(word *R, const word *A, const word *B)
 {
 	Mul_16
@@ -1087,16 +1221,40 @@ void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
 {
 	Bot_16
 }
-*/
+
+void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)	// writes R[0..15] from 16-word operands A and B; presumably the upper half of A*B, as in Baseline_MultiplyTop2 - confirm
+{	// L feeds the carry computed in Top_SaveAcc1 (c = L<c); MultiplyTop passes L[N-1] for this argument
+	Top_Begin(16)	// declares p, c, d and seeds d from the high word of A[0]*B[14]
+	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
+	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
+	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
+	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
+	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
+	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
+	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
+	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
+	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
+	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
+	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
+	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
+	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
+	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
+	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
+	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
+	Mul_End(13, 15)	// stores R[13], then R[14] and R[15] from A[15]*B[15] plus the accumulator
+}
+#endif
 
 // ********************************************************
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_INTEGER_SSE2
 
 CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
 
 #undef Mul_Begin
 #undef Mul_Acc
+#undef Top_Begin
+#undef Top_Acc
 #undef Squ_Acc
 #undef Squ_NonDiag
 #undef Squ_Diag
@@ -1760,33 +1918,35 @@ void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
 	Top_End(8)
 }
 
-#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#endif	// #if CRYPTOPP_INTEGER_SSE2
 
 // ********************************************************
 
-typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
+typedef word (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
 typedef void (* PMul)(word *C, const word *A, const word *B);
 typedef void (* PSqu)(word *C, const word *A);
 typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_INTEGER_SSE2
 static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
-static PMulTop s_pTop[3];
 static size_t s_recursionLimit = 8;
 #else
-static const size_t s_recursionLimit = 8;
+static const size_t s_recursionLimit = 16;
 #endif
 
 static PMul s_pMul[9], s_pBot[9];
 static PSqu s_pSqu[9];
+static PMulTop s_pTop[9];
 
 static void SetFunctionPointers()
 {
 	s_pMul[0] = &Baseline_Multiply2;
 	s_pBot[0] = &Baseline_MultiplyBottom2;
 	s_pSqu[0] = &Baseline_Square2;
+	s_pTop[0] = &Baseline_MultiplyTop2;
+	s_pTop[1] = &Baseline_MultiplyTop4;
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_INTEGER_SSE2
 	if (HasSSE2())
 	{
 		if (IsP4())
@@ -1812,39 +1972,45 @@ static void SetFunctionPointers()
 		s_pSqu[4] = &SSE2_Square16;
 		s_pSqu[8] = &SSE2_Square32;
 
-		s_pTop[0] = &SSE2_MultiplyTop8;
-		s_pTop[1] = &SSE2_MultiplyTop16;
-		s_pTop[2] = &SSE2_MultiplyTop32;
+		s_pTop[2] = &SSE2_MultiplyTop8;
+		s_pTop[4] = &SSE2_MultiplyTop16;
+		s_pTop[8] = &SSE2_MultiplyTop32;
 	}
 	else
 #endif
 	{
 		s_pMul[1] = &Baseline_Multiply4;
 		s_pMul[2] = &Baseline_Multiply8;
-//		s_pMul[4] = &Baseline_Multiply16;
 
 		s_pBot[1] = &Baseline_MultiplyBottom4;
 		s_pBot[2] = &Baseline_MultiplyBottom8;
-//		s_pBot[4] = &Baseline_MultiplyBottom16;
 
 		s_pSqu[1] = &Baseline_Square4;
 		s_pSqu[2] = &Baseline_Square8;
-//		s_pSqu[4] = &Baseline_Square16;
+
+		s_pTop[2] = &Baseline_MultiplyTop8;
+
+#if	!CRYPTOPP_INTEGER_SSE2
+		s_pMul[4] = &Baseline_Multiply16;
+		s_pBot[4] = &Baseline_MultiplyBottom16;
+		s_pSqu[4] = &Baseline_Square16;
+		s_pTop[4] = &Baseline_MultiplyTop16;
+#endif
 	}
 }
 
-inline int Add(word *C, const word *A, const word *B, size_t N)
+inline word Add(word *C, const word *A, const word *B, size_t N)
 {
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_INTEGER_SSE2
 	return s_pAdd(N, C, A, B);
 #else
 	return Baseline_Add(N, C, A, B);
 #endif
 }
 
-inline int Subtract(word *C, const word *A, const word *B, size_t N)
+inline word Subtract(word *C, const word *A, const word *B, size_t N)
 {
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_INTEGER_SSE2
 	return s_pSub(N, C, A, B);
 #else
 	return Baseline_Sub(N, C, A, B);
@@ -1969,16 +2135,8 @@ void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B,
 {
 	assert(N>=2 && N%2==0);
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-	if (HasSSE2() && ((N>=8) & (N<=32)))
-		s_pTop[N/16](R, A, B, L[N-1]);
-	else
-#endif
-	if (N<=4)
-	{
-		s_pMul[N/4](T, A, B);
-		memcpy(R, T+N, N*WORD_SIZE);
-	}
+	if (N <= s_recursionLimit)
+		s_pTop[N/4](R, A, B, L[N-1]);
 	else
 	{
 		const size_t N2 = N/2;
@@ -3076,13 +3234,6 @@ public:
 		memcpy(m_counterAndSeed + 4, seed, seedSize);
 	}
 
-	byte GenerateByte()
-	{
-		byte b;
-		GenerateBlock(&b, 1);
-		return b;
-	}
-
 	void GenerateBlock(byte *output, size_t size)
 	{
 		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
diff --git a/panama.cpp b/panama.cpp
index 89a5aeaa..a60e1670 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -26,31 +26,31 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
 	__asm__ __volatile__
 	(
 		".intel_syntax noprefix;"
-	AS1(	push	ebx)
+	AS_PUSH(		bx)
 #else
-	AS2(	mov		ecx, count)
-	AS2(	mov		esi, state)
-	AS2(	mov		edi, z)
-	AS2(	mov		edx, y)
+	AS2(	mov		WORD_REG(cx), count)
+	AS2(	mov		WORD_REG(si), state)
+	AS2(	mov		WORD_REG(di), z)
+	AS2(	mov		WORD_REG(dx), y)
 #endif
-	AS2(	shl		ecx, 5)
+	AS2(	shl		WORD_REG(cx), 5)
 	ASJ(	jz,		5, f)
-	AS2(	mov		ebx, [esi+4*17])
-	AS2(	add		ecx, ebx)
+	AS2(	mov		ebx, [WORD_REG(si)+4*17])
+	AS2(	add		WORD_REG(cx), WORD_REG(bx))
 
-	AS1(	push	ebp)
-	AS1(	push	ecx)
+	AS_PUSH(		bp)
+	AS_PUSH(		cx)
 
-	AS2(	movdqa	xmm0, [esi+0*16])
-	AS2(	movdqa	xmm1, [esi+1*16])
-	AS2(	movdqa	xmm2, [esi+2*16])
-	AS2(	movdqa	xmm3, [esi+3*16])
-	AS2(	mov		eax, [esi+4*16])
+	AS2(	movdqa	xmm0, [WORD_REG(si)+0*16])
+	AS2(	movdqa	xmm1, [WORD_REG(si)+1*16])
+	AS2(	movdqa	xmm2, [WORD_REG(si)+2*16])
+	AS2(	movdqa	xmm3, [WORD_REG(si)+3*16])
+	AS2(	mov		eax, [WORD_REG(si)+4*16])
 
 	ASL(4)
 	// gamma and pi
 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
-	AS2(	test	ebx, 1)
+	AS2(	test	WORD_REG(bx), 1)
 	ASJ(	jnz,	6, f)
 #endif
 	AS2(	movdqa	xmm6, xmm2)
@@ -81,7 +81,7 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
 #define pi(i)	\
 	AS2(	movd	ecx, xmm7)\
 	AS2(	rol		ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
-	AS2(	mov		[esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
+	AS2(	mov		[WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
 
 #define pi4(x, y, z, a, b, c, d)	\
 	AS2(	pcmpeqb	xmm7, xmm7)\
@@ -110,65 +110,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
 	AS2(	punpckhdq	xmm2, xmm0)		// 11 12 15 16
 
 	// keystream
-	AS2(	test	edi, edi)
+	AS2(	test	WORD_REG(di), WORD_REG(di))
 	ASJ(	jz,		0, f)
 	AS2(	movdqa	xmm6, xmm4)
 	AS2(	punpcklqdq	xmm4, xmm2)
 	AS2(	punpckhqdq	xmm6, xmm2)
-	AS2(	test	edx, 0xf)
+	AS2(	test	WORD_REG(dx), 0xf)
 	ASJ(	jnz,	2, f)
-	AS2(	test	edx, edx)
+	AS2(	test	WORD_REG(dx), WORD_REG(dx))
 	ASJ(	jz,		1, f)
-	AS2(	pxor	xmm4, [edx])
-	AS2(	pxor	xmm6, [edx+16])
-	AS2(	add		edx, 32)
+	AS2(	pxor	xmm4, [WORD_REG(dx)])
+	AS2(	pxor	xmm6, [WORD_REG(dx)+16])
+	AS2(	add		WORD_REG(dx), 32)
 	ASJ(	jmp,	1, f)
 	ASL(2)
-	AS2(	movdqu	xmm0, [edx])
-	AS2(	movdqu	xmm2, [edx+16])
+	AS2(	movdqu	xmm0, [WORD_REG(dx)])
+	AS2(	movdqu	xmm2, [WORD_REG(dx)+16])
 	AS2(	pxor	xmm4, xmm0)
 	AS2(	pxor	xmm6, xmm2)
-	AS2(	add		edx, 32)
+	AS2(	add		WORD_REG(dx), 32)
 	ASL(1)
-	AS2(	test	edi, 0xf)
+	AS2(	test	WORD_REG(di), 0xf)
 	ASJ(	jnz,	3, f)
-	AS2(	movdqa	[edi], xmm4)
-	AS2(	movdqa	[edi+16], xmm6)
-	AS2(	add		edi, 32)
+	AS2(	movdqa	[WORD_REG(di)], xmm4)
+	AS2(	movdqa	[WORD_REG(di)+16], xmm6)
+	AS2(	add		WORD_REG(di), 32)
 	ASJ(	jmp,	0, f)
 	ASL(3)
-	AS2(	movdqu	[edi], xmm4)
-	AS2(	movdqu	[edi+16], xmm6)
-	AS2(	add		edi, 32)
+	AS2(	movdqu	[WORD_REG(di)], xmm4)
+	AS2(	movdqu	[WORD_REG(di)+16], xmm6)
+	AS2(	add		WORD_REG(di), 32)
 	ASL(0)
 
 	// buffer update
-	AS2(	lea		ecx, [ebx + 32])
-	AS2(	and		ecx, 31*32)
-	AS2(	lea		ebp, [ebx + (32-24)*32])
-	AS2(	and		ebp, 31*32)
+	AS2(	lea		WORD_REG(cx), [WORD_REG(bx) + 32])
+	AS2(	and		WORD_REG(cx), 31*32)
+	AS2(	lea		WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
+	AS2(	and		WORD_REG(bp), 31*32)
 
-	AS2(	movdqa	xmm0, [esi+20*4+ecx+0*8])
+	AS2(	movdqa	xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
 	AS2(	pxor	xmm3, xmm0)
 	ASS(	pshufd	xmm0, xmm0, 2, 3, 0, 1)
-	AS2(	movdqa	[esi+20*4+ecx+0*8], xmm3)
-	AS2(	pxor	xmm0, [esi+20*4+ebp+2*8])
-	AS2(	movdqa	[esi+20*4+ebp+2*8], xmm0)
+	AS2(	movdqa	[WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
+	AS2(	pxor	xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
+	AS2(	movdqa	[WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
 
-	AS2(	movdqa	xmm4, [esi+20*4+ecx+2*8])
+	AS2(	movdqa	xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
 	AS2(	pxor	xmm1, xmm4)
-	AS2(	movdqa	[esi+20*4+ecx+2*8], xmm1)
-	AS2(	pxor	xmm4, [esi+20*4+ebp+0*8])
-	AS2(	movdqa	[esi+20*4+ebp+0*8], xmm4)
+	AS2(	movdqa	[WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
+	AS2(	pxor	xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
+	AS2(	movdqa	[WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
 
 	// theta
-	AS2(	movdqa	xmm3, [esi+3*16])
-	AS2(	movdqa	xmm2, [esi+2*16])
-	AS2(	movdqa	xmm1, [esi+1*16])
-	AS2(	movdqa	xmm0, [esi+0*16])
+	AS2(	movdqa	xmm3, [WORD_REG(si)+3*16])
+	AS2(	movdqa	xmm2, [WORD_REG(si)+2*16])
+	AS2(	movdqa	xmm1, [WORD_REG(si)+1*16])
+	AS2(	movdqa	xmm0, [WORD_REG(si)+0*16])
 
 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
-	AS2(	test	ebx, 1)
+	AS2(	test	WORD_REG(bx), 1)
 	ASJ(	jnz,	8, f)
 #endif
 	AS2(	movd	xmm6, eax)
@@ -214,21 +214,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
 	AS2(	pxor	xmm0, xmm4)
 
 	// sigma
-	AS2(	lea		ecx, [ebx + (32-4)*32])
-	AS2(	and		ecx, 31*32)
-	AS2(	lea		ebp, [ebx + 16*32])
-	AS2(	and		ebp, 31*32)
+	AS2(	lea		WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
+	AS2(	and		WORD_REG(cx), 31*32)
+	AS2(	lea		WORD_REG(bp), [WORD_REG(bx) + 16*32])
+	AS2(	and		WORD_REG(bp), 31*32)
 
-	AS2(	movdqa	xmm4, [esi+20*4+ecx+0*16])
-	AS2(	movdqa	xmm5, [esi+20*4+ebp+0*16])
+	AS2(	movdqa	xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
+	AS2(	movdqa	xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
 	AS2(	movdqa	xmm6, xmm4)
 	AS2(	punpcklqdq	xmm4, xmm5)
 	AS2(	punpckhqdq	xmm6, xmm5)
 	AS2(	pxor	xmm3, xmm4)
 	AS2(	pxor	xmm2, xmm6)
 
-	AS2(	movdqa	xmm4, [esi+20*4+ecx+1*16])
-	AS2(	movdqa	xmm5, [esi+20*4+ebp+1*16])
+	AS2(	movdqa	xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
+	AS2(	movdqa	xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
 	AS2(	movdqa	xmm6, xmm4)
 	AS2(	punpcklqdq	xmm4, xmm5)
 	AS2(	punpckhqdq	xmm6, xmm5)
@@ -236,23 +236,22 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
 	AS2(	pxor	xmm0, xmm6)
 
 	// loop
-	AS2(	add		ebx, 32)
-	AS2(	cmp		ebx, [esp])
+	AS2(	add		WORD_REG(bx), 32)
+	AS2(	cmp		WORD_REG(bx), [WORD_REG(sp)])
 	ASJ(	jne,	4, b)
 
 	// save state
-	AS2(	mov		ebp, [esp+4])
-	AS2(	add		esp, 8)
-	AS2(	mov		[esi+4*17], ebx)
-	AS2(	mov		[esi+4*16], eax)
-	AS2(	movdqa	[esi+3*16], xmm3)
-	AS2(	movdqa	[esi+2*16], xmm2)
-	AS2(	movdqa	[esi+1*16], xmm1)
-	AS2(	movdqa	[esi+0*16], xmm0)
+	AS2(	add		WORD_REG(sp), WORD_SZ)
+	AS_POP(			bp)
+	AS2(	mov		[WORD_REG(si)+4*16], eax)
+	AS2(	movdqa	[WORD_REG(si)+3*16], xmm3)
+	AS2(	movdqa	[WORD_REG(si)+2*16], xmm2)
+	AS2(	movdqa	[WORD_REG(si)+1*16], xmm1)
+	AS2(	movdqa	[WORD_REG(si)+0*16], xmm0)
 	ASL(5)
 
 #ifdef __GNUC__
-	AS1(	pop		ebx)
+	AS_POP(			bx)
 	".att_syntax prefix;"
 		:
 		: "c" (count), "S" (state), "D" (z), "d" (y)
diff --git a/rijndael.cpp b/rijndael.cpp
index 4a8572f2..ac4f7699 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
-#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
 	if (HasMMX())
 	{
 		const word32 *k = m_key;
 		const word32 *kLoopEnd = k + m_rounds*4;
+		#if CRYPTOPP_BOOL_X64
+			#define K_REG			r8
+			#define K_END_REG		r9
+			#define SAVE_K
+			#define RESTORE_K
+			#define RESTORE_K_END
+			#define SAVE_0(x)		AS2(mov	r10d, x)
+			#define SAVE_1(x)		AS2(mov	r11d, x)
+			#define SAVE_2(x)		AS2(mov	r12d, x)
+			#define RESTORE_0(x)	AS2(mov	x, r10d)
+			#define RESTORE_1(x)	AS2(mov	x, r11d)
+			#define RESTORE_2(x)	AS2(mov	x, r12d)
+		#else
+			#define K_REG			esi
+			#define K_END_REG		edi
+			#define SAVE_K			AS2(movd	mm4, esi)
+			#define RESTORE_K		AS2(movd	esi, mm4)
+			#define RESTORE_K_END	AS2(movd	edi, mm5)
+			#define SAVE_0(x)		AS2(movd	mm0, x)
+			#define SAVE_1(x)		AS2(movd	mm1, x)
+			#define SAVE_2(x)		AS2(movd	mm2, x)
+			#define RESTORE_0(x)	AS2(movd	x, mm0)
+			#define RESTORE_1(x)	AS2(movd	x, mm1)
+			#define RESTORE_2(x)	AS2(movd	x, mm2)
+		#endif
 #ifdef __GNUC__
 		word32 t0, t1, t2, t3;
 		__asm__ __volatile__
 		(
 		".intel_syntax noprefix;"
-		AS1(	push	ebx)
-		AS1(	push	ebp)
-		AS2(	mov		ebp, eax)
+		AS_PUSH(		bx)
+		AS_PUSH(		bp)
+		AS2(	mov		WORD_REG(bp), WORD_REG(ax))
+	#if CRYPTOPP_BOOL_X64
+		// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
+		AS1(	pushq	K_REG)
+		AS1(	pushq	K_END_REG)
+		AS1(	pushq	r10)
+		AS1(	pushq	r11)
+		AS1(	pushq	r12)
+		AS2(	mov		K_REG, rsi)
+		AS2(	mov		K_END_REG, rcx)
+	#else
 		AS2(	movd	mm5, ecx)
+	#endif
 #else
+	#if _MSC_VER < 1300
+		const word32 *t = Te;
+		AS2(	mov		eax, t)
+	#endif
 		AS2(	mov		edx, g_cacheLineSize)
-		AS2(	mov		edi, inBlock)
-		AS2(	mov		esi, k)
+		AS2(	mov		WORD_REG(di), inBlock)
+		AS2(	mov		K_REG, k)
 		AS2(	movd	mm5, kLoopEnd)
-		AS1(	push	ebp)
+	#if _MSC_VER < 1300
+		AS_PUSH(		bx)
+		AS_PUSH(		bp)
+		AS2(	mov		ebp, eax)
+	#else
+		AS_PUSH(		bp)
 		AS2(	lea		ebp, Te)
+	#endif
 #endif
-		AS2(	mov		eax, [esi+0*4])	// s0
-		AS2(	xor		eax, [edi+0*4])
-		AS2(	movd	mm0, eax)
-		AS2(	mov		ebx, [esi+1*4])
-		AS2(	xor		ebx, [edi+1*4])
-		AS2(	movd	mm1, ebx)
+		AS2(	mov		eax, [K_REG+0*4])	// s0
+		AS2(	xor		eax, [WORD_REG(di)+0*4])
+		SAVE_0(eax)
+		AS2(	mov		ebx, [K_REG+1*4])
+		AS2(	xor		ebx, [WORD_REG(di)+1*4])
+		SAVE_1(ebx)
 		AS2(	and		ebx, eax)
-		AS2(	mov		eax, [esi+2*4])
-		AS2(	xor		eax, [edi+2*4])
-		AS2(	movd	mm2, eax)
+		AS2(	mov		eax, [K_REG+2*4])
+		AS2(	xor		eax, [WORD_REG(di)+2*4])
+		SAVE_2(eax)
 		AS2(	and		ebx, eax)
-		AS2(	mov		ecx, [esi+3*4])
-		AS2(	xor		ecx, [edi+3*4])
+		AS2(	mov		ecx, [K_REG+3*4])
+		AS2(	xor		ecx, [WORD_REG(di)+3*4])
 		AS2(	and		ebx, ecx)
 
 		// read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction
 		AS2(	and		ebx, 0)
 		AS2(	mov		edi, ebx)	// make index depend on previous loads to simulate lfence
 		ASL(2)
-		AS2(	and		ebx, [ebp+edi])
+		AS2(	and		ebx, [WORD_REG(bp)+WORD_REG(di)])
 		AS2(	add		edi, edx)
-		AS2(	and		ebx, [ebp+edi])
+		AS2(	and		ebx, [WORD_REG(bp)+WORD_REG(di)])
 		AS2(	add		edi, edx)
-		AS2(	and		ebx, [ebp+edi])
+		AS2(	and		ebx, [WORD_REG(bp)+WORD_REG(di)])
 		AS2(	add		edi, edx)
-		AS2(	and		ebx, [ebp+edi])
+		AS2(	and		ebx, [WORD_REG(bp)+WORD_REG(di)])
 		AS2(	add		edi, edx)
 		AS2(	cmp		edi, 1024)
 		ASJ(	jl,		2, b)
-		AS2(	and		ebx, [ebp+1020])
+		AS2(	and		ebx, [WORD_REG(bp)+1020])
+#if CRYPTOPP_BOOL_X64
+		AS2(	xor		r10d, ebx)
+		AS2(	xor		r11d, ebx)
+		AS2(	xor		r12d, ebx)
+#else
 		AS2(	movd	mm6, ebx)
 		AS2(	pxor	mm2, mm6)
 		AS2(	pxor	mm1, mm6)
 		AS2(	pxor	mm0, mm6)
+#endif
 		AS2(	xor		ecx, ebx)
 
-		AS2(	mov		edi, [esi+4*4])	// t0
-		AS2(	mov		eax, [esi+5*4])
-		AS2(	mov		ebx, [esi+6*4])
-		AS2(	mov		edx, [esi+7*4])
-		AS2(	add		esi, 8*4)
-		AS2(	movd	mm4, esi)
+		AS2(	mov		edi, [K_REG+4*4])	// t0
+		AS2(	mov		eax, [K_REG+5*4])
+		AS2(	mov		ebx, [K_REG+6*4])
+		AS2(	mov		edx, [K_REG+7*4])
+		AS2(	add		K_REG, 8*4)
+		SAVE_K
 
 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(a, [ebp+3*1024+4*esi])
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
 
 #define s0		xor edi
 #define s1		xor eax
@@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #define t3		xor edx
 
 		QUARTER_ROUND(c, t0, t1, t2, t3)
-		AS2(	movd	ecx, mm2)
+		RESTORE_2(ecx)
 		QUARTER_ROUND(c, t3, t0, t1, t2)
-		AS2(	movd	ecx, mm1)
+		RESTORE_1(ecx)
 		QUARTER_ROUND(c, t2, t3, t0, t1)
-		AS2(	movd	ecx, mm0)
+		RESTORE_0(ecx)
 		QUARTER_ROUND(c, t1, t2, t3, t0)
-		AS2(	movd	mm2, ebx)
-		AS2(	movd	mm1, eax)
-		AS2(	movd	mm0, edi)
+		SAVE_2(ebx)
+		SAVE_1(eax)
+		SAVE_0(edi)
 #undef QUARTER_ROUND
 
-		AS2(	movd	esi, mm4)
+		RESTORE_K
 
 		ASL(0)
-		AS2(	mov		edi, [esi+0*4])
-		AS2(	mov		eax, [esi+1*4])
-		AS2(	mov		ebx, [esi+2*4])
-		AS2(	mov		ecx, [esi+3*4])
+		AS2(	mov		edi, [K_REG+0*4])
+		AS2(	mov		eax, [K_REG+1*4])
+		AS2(	mov		ebx, [K_REG+2*4])
+		AS2(	mov		ecx, [K_REG+3*4])
 
 #define QUARTER_ROUND(t, a, b, c, d)	\
 	AS2(movzx esi, t##l)\
-	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
 	AS2(shr e##t##x, 16)\
 	AS2(movzx esi, t##l)\
-	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
 	AS2(movzx esi, t##h)\
-	AS2(d, [ebp+0*1024+4*esi])
+	AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
 
 		QUARTER_ROUND(d, s0, s1, s2, s3)
-		AS2(	movd	edx, mm2)
+		RESTORE_2(edx)
 		QUARTER_ROUND(d, s3, s0, s1, s2)
-		AS2(	movd	edx, mm1)
+		RESTORE_1(edx)
 		QUARTER_ROUND(d, s2, s3, s0, s1)
-		AS2(	movd	edx, mm0)
+		RESTORE_0(edx)
 		QUARTER_ROUND(d, s1, s2, s3, s0)
-		AS2(	movd	esi, mm4)
-		AS2(	movd	mm2, ebx)
-		AS2(	movd	mm1, eax)
-		AS2(	movd	mm0, edi)
+		RESTORE_K
+		SAVE_2(ebx)
+		SAVE_1(eax)
+		SAVE_0(edi)
 
-		AS2(	mov		edi, [esi+4*4])
-		AS2(	mov		eax, [esi+5*4])
-		AS2(	mov		ebx, [esi+6*4])
-		AS2(	mov		edx, [esi+7*4])
+		AS2(	mov		edi, [K_REG+4*4])
+		AS2(	mov		eax, [K_REG+5*4])
+		AS2(	mov		ebx, [K_REG+6*4])
+		AS2(	mov		edx, [K_REG+7*4])
 
 		QUARTER_ROUND(c, t0, t1, t2, t3)
-		AS2(	movd	ecx, mm2)
+		RESTORE_2(ecx)
 		QUARTER_ROUND(c, t3, t0, t1, t2)
-		AS2(	movd	ecx, mm1)
+		RESTORE_1(ecx)
 		QUARTER_ROUND(c, t2, t3, t0, t1)
-		AS2(	movd	ecx, mm0)
+		RESTORE_0(ecx)
 		QUARTER_ROUND(c, t1, t2, t3, t0)
-		AS2(	movd	mm2, ebx)
-		AS2(	movd	mm1, eax)
-		AS2(	movd	mm0, edi)
+		SAVE_2(ebx)
+		SAVE_1(eax)
+		SAVE_0(edi)
 
-		AS2(	movd	esi, mm4)
-		AS2(	movd	edi, mm5)
-		AS2(	add		esi, 8*4)
-		AS2(	movd	mm4, esi)
-		AS2(	cmp		edi, esi)
+		RESTORE_K
+		RESTORE_K_END
+		AS2(	add		K_REG, 8*4)
+		SAVE_K
+		AS2(	cmp		K_END_REG, K_REG)
 		ASJ(	jne,	0, b)
 
 #undef QUARTER_ROUND
@@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 #undef t2
 #undef t3
 
-		AS2(	mov		eax, [edi+0*4])
-		AS2(	mov		ecx, [edi+1*4])
-		AS2(	mov		esi, [edi+2*4])
-		AS2(	mov		edi, [edi+3*4])
+		AS2(	mov		eax, [K_END_REG+0*4])
+		AS2(	mov		ecx, [K_END_REG+1*4])
+		AS2(	mov		esi, [K_END_REG+2*4])
+		AS2(	mov		edi, [K_END_REG+3*4])
 
 #define QUARTER_ROUND(a, b, c, d)	\
 	AS2(	movzx	ebx, dl)\
-	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx	ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl		ebx, 3*8)\
 	AS2(	xor		a, ebx)\
 	AS2(	movzx	ebx, dh)\
-	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx	ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl		ebx, 2*8)\
 	AS2(	xor		b, ebx)\
 	AS2(	shr		edx, 16)\
 	AS2(	movzx	ebx, dl)\
 	AS2(	shr		edx, 8)\
-	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	movzx	ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
 	AS2(	shl		ebx, 1*8)\
 	AS2(	xor		c, ebx)\
-	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	movzx	ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
 	AS2(	xor		d, ebx)
 
 		QUARTER_ROUND(eax, ecx, esi, edi)
-		AS2(	movd	edx, mm2)
+		RESTORE_2(edx)
 		QUARTER_ROUND(edi, eax, ecx, esi)
-		AS2(	movd	edx, mm1)
+		RESTORE_1(edx)
 		QUARTER_ROUND(esi, edi, eax, ecx)
-		AS2(	movd	edx, mm0)
+		RESTORE_0(edx)
 		QUARTER_ROUND(ecx, esi, edi, eax)
 
 #undef QUARTER_ROUND
 
-		AS1(	pop		ebp)
-		AS1(	emms)
+#if CRYPTOPP_BOOL_X64
+		AS1(popq	r12)
+		AS1(popq	r11)
+		AS1(popq	r10)
+		AS1(popq	K_END_REG)
+		AS1(popq	K_REG)
+#else
+		AS1(emms)
+#endif
+		AS_POP(		bp)
 
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
+		AS_POP(		bx)
+#endif
 #ifdef __GNUC__
-		AS1(	pop		ebx)
 		".att_syntax prefix;"
 			: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
 			: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
@@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 		((word32 *)outBlock)[2] = t2;
 		((word32 *)outBlock)[3] = t3;
 #else
-		AS2(	mov		ebx, xorBlock)
-		AS2(	test	ebx, ebx)
+		AS2(	mov		WORD_REG(bx), xorBlock)
+		AS2(	test	WORD_REG(bx), WORD_REG(bx))
 		ASJ(	jz,		1, f)
-		AS2(	xor		eax, [ebx+0*4])
-		AS2(	xor		ecx, [ebx+1*4])
-		AS2(	xor		esi, [ebx+2*4])
-		AS2(	xor		edi, [ebx+3*4])
+		AS2(	xor		eax, [WORD_REG(bx)+0*4])
+		AS2(	xor		ecx, [WORD_REG(bx)+1*4])
+		AS2(	xor		esi, [WORD_REG(bx)+2*4])
+		AS2(	xor		edi, [WORD_REG(bx)+3*4])
 		ASL(1)
-		AS2(	mov		ebx, outBlock)
-		AS2(	mov		[ebx+0*4], eax)
-		AS2(	mov		[ebx+1*4], ecx)
-		AS2(	mov		[ebx+2*4], esi)
-		AS2(	mov		[ebx+3*4], edi)
+		AS2(	mov		WORD_REG(bx), outBlock)
+		AS2(	mov		[WORD_REG(bx)+0*4], eax)
+		AS2(	mov		[WORD_REG(bx)+1*4], ecx)
+		AS2(	mov		[WORD_REG(bx)+2*4], esi)
+		AS2(	mov		[WORD_REG(bx)+3*4], edi)
 #endif
 	}
 	else
diff --git a/secblock.h b/secblock.h
index cdc67c10..0bc53243 100644
--- a/secblock.h
+++ b/secblock.h
@@ -130,10 +130,13 @@ public:
 		#endif
 
 			assert(IsAlignedOn(p, 16));
-			return (T*)p;
+			return (pointer)p;
 		}
 
-		return new T[n];
+		pointer p;
+		while (!(p = (pointer)malloc(sizeof(T)*n)))
+			CallNewHandler();
+		return p;
 	}
 
 	void deallocate(void *p, size_type n)
@@ -153,7 +156,7 @@ public:
 			return;
 		}
 
-		delete [] (T *)p;
+		free(p);
 	}
 
 	pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
@@ -164,13 +167,19 @@ public:
 	// VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a
 	// template class member called rebind".
     template <class U> struct rebind { typedef AllocatorWithCleanup<U, T_Align16> other; };
+#if _MSC_VER >= 1500
+	AllocatorWithCleanup() {}
+	template <class U, bool A> AllocatorWithCleanup(const AllocatorWithCleanup<U, A> &) {}
+#endif
 };
 
 CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<byte>;
 CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word16>;
 CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word32>;
 CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word64>;
-CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86>;	// for Integer
+#if CRYPTOPP_BOOL_X86
+CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, true>;	// for Integer
+#endif
 
 template <class T>
 class NullAllocator : public AllocatorBase<T>
@@ -260,7 +269,7 @@ public:
 	size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);}
 
 private:
-	T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(unsigned int)m_array)%16) : m_array;}
+	T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(size_t)m_array)%16) : m_array;}
 
 	CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S];
 	A m_fallbackAllocator;
@@ -466,10 +475,10 @@ public:
 	explicit SecBlockWithHint(size_t size) : SecBlock<T, A>(size) {}
 };
 
-template<class T, class U>
-inline bool operator==(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (true);}
-template<class T, class U>
-inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (false);}
+template<class T, bool A, class U, bool B>
+inline bool operator==(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (true);}
+template<class T, bool A, class U, bool B>
+inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (false);}
 
 NAMESPACE_END
 
diff --git a/sha.cpp b/sha.cpp
index 127d1f99..78a850e9 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -308,9 +308,9 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN1
 	W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
 };
 
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
 // put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
-static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
+CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
 {
 #ifdef __GNUC__
 	__asm__ __volatile__
@@ -319,6 +319,9 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
 	AS1(	push	ebx)
 	AS2(	mov		ebx, eax)
 #else
+	AS1(	push	ebx)
+	AS1(	push	esi)
+	AS1(	push	edi)
 	AS2(	lea		ebx, SHA512_K)
 #endif
 
@@ -486,22 +489,30 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
 	AS1(	pop		esp)
 	AS1(	emms)
 
-#ifdef __GNUC__
+#if defined(__GNUC__)
 	AS1(	pop		ebx)
 	".att_syntax prefix;"
 		:
 		: "a" (SHA512_K), "c" (state), "d" (data)
 		: "%esi", "%edi", "memory", "cc"
 	);
+#else
+	AS1(	pop		edi)
+	AS1(	pop		esi)
+	AS1(	pop		ebx)
+	AS1(	ret)
 #endif
 }
 #endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
 
 void SHA512::Transform(word64 *state, const word64 *data)
 {
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
 	if (HasSSE2())
-		return SHA512_SSE2_Transform(state, data);
+	{
+		SHA512_SSE2_Transform(state, data);
+		return;
+	}
 #endif
 
 #define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
diff --git a/smartptr.h b/smartptr.h
index f5630012..6b4040e9 100644
--- a/smartptr.h
+++ b/smartptr.h
@@ -189,21 +189,21 @@ template <class T> counted_ptr<T> & counted_ptr<T>::operator=(const counted_ptr<
 template <class T> class vector_member_ptrs
 {
 public:
-	vector_member_ptrs(unsigned int size=0)
+	vector_member_ptrs(size_t size=0)
 		: m_size(size), m_ptr(new member_ptr<T>[size]) {}
 	~vector_member_ptrs()
 		{delete [] this->m_ptr;}
 
-	member_ptr<T>& operator[](unsigned int index)
+	member_ptr<T>& operator[](size_t index)
 		{assert(index<this->m_size); return this->m_ptr[index];}
-	const member_ptr<T>& operator[](unsigned int index) const
+	const member_ptr<T>& operator[](size_t index) const
 		{assert(index<this->m_size); return this->m_ptr[index];}
 
-	unsigned int size() const {return this->m_size;}
-	void resize(unsigned int newSize)
+	size_t size() const {return this->m_size;}
+	void resize(size_t newSize)
 	{
 		member_ptr<T> *newPtr = new member_ptr<T>[newSize];
-		for (unsigned int i=0; i<this->m_size && i<newSize; i++)
+		for (size_t i=0; i<this->m_size && i<newSize; i++)
 			newPtr[i].reset(this->m_ptr[i].release());
 		delete [] this->m_ptr;
 		this->m_size = newSize;
@@ -214,7 +214,7 @@ private:
 	vector_member_ptrs(const vector_member_ptrs<T> &c);	// copy not allowed
 	void operator=(const vector_member_ptrs<T> &x);		// assignment not allowed
 
-	unsigned int m_size;
+	size_t m_size;
 	member_ptr<T> *m_ptr;
 };
 
diff --git a/sosemanuk.cpp b/sosemanuk.cpp
index 816cb981..c86b8773 100755
--- a/sosemanuk.cpp
+++ b/sosemanuk.cpp
@@ -68,6 +68,10 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
 	m_state[1] = b;
 	m_state[2] = e;
 	m_state[3] = d;
+
+#define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
+	m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
+	m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
 }
 
 static word32 s_mulTables[512] = {
@@ -282,10 +286,8 @@ unsigned int SosemanukPolicy::GetAlignment() const
 	else
 #endif
 		return 1;
-#endif
 }
 
-#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
 {
 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@@ -316,54 +318,54 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 		__asm__ __volatile__
 		(
 		".intel_syntax noprefix;"
-		AS1(	push	ebx)
+		AS_PUSH(		bx)
 #else
 		word32 *state = m_state;
-		AS2(	mov		eax, state)
-		AS2(	mov		edi, output)
-		AS2(	mov		edx, input)
-		AS2(	mov		ecx, iterationCount)
+		AS2(	mov		WORD_REG(ax), state)
+		AS2(	mov		WORD_REG(di), output)
+		AS2(	mov		WORD_REG(dx), input)
+		AS2(	mov		WORD_REG(cx), iterationCount)
 #endif
 
-#define SSE2_output			DWORD PTR [esp+1*4]
-#define SSE2_input			DWORD PTR [esp+2*4]
-#define SSE2_wordsLeft		DWORD PTR [esp+3*4]
-#define SSE2_ediEnd			DWORD PTR [esp+4*4]
-#define SSE2_pMulTables		DWORD PTR [esp+5*4]
-#define SSE2_state			DWORD PTR [esp+6*4]
-#define SSE2_wordsLeft2		DWORD PTR [esp+7*4]
-#define SSE2_stateCopy		esp + 8*4
+#define SSE2_output			WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
+#define SSE2_input			WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
+#define SSE2_wordsLeft		WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
+#define SSE2_diEnd			WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
+#define SSE2_pMulTables		WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
+#define SSE2_state			WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
+#define SSE2_wordsLeft2		WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
+#define SSE2_stateCopy		WORD_REG(sp) + 8*WORD_SZ
 #define	SSE2_uvStart		SSE2_stateCopy + 12*4
 
-		AS1(	push	ebp)
-		AS2(	mov		ebx, esp)
-		AS2(	and		esp, 0xfffffff0)
-		AS2(	sub		esp, 80*4*2+12*4+8*4)	// 80 v's, 80 u's, 12 state, 8 locals
-		AS2(	mov		[esp], ebx)
-		AS2(	mov		SSE2_output, edi)
-		AS2(	mov		SSE2_input, edx)
-		AS2(	mov		SSE2_state, eax)
+		AS_PUSH(		bp)
+		AS2(	mov		WORD_REG(bx), WORD_REG(sp))
+		AS2(	and		WORD_REG(sp), -16)
+		AS2(	sub		WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ)	// 80 v's, 80 u's, 12 state, 8 locals
+		AS2(	mov		[WORD_REG(sp)], WORD_REG(bx))
+		AS2(	mov		SSE2_output, WORD_REG(di))
+		AS2(	mov		SSE2_input, WORD_REG(dx))
+		AS2(	mov		SSE2_state, WORD_REG(ax))
 #ifndef _MSC_VER
-		AS2(	mov		SSE2_pMulTables, esi)
+		AS2(	mov		SSE2_pMulTables, WORD_REG(si))
 #endif
-		AS2(	lea		ecx, [4*ecx+ecx])
-		AS2(	lea		esi, [4*ecx])
-		AS2(	mov		SSE2_wordsLeft, esi)
-		AS2(	movdqa	xmm0, [eax+0*16])		// copy state to stack to save a register
+		AS2(	lea		WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
+		AS2(	lea		WORD_REG(si), [4*WORD_REG(cx)])
+		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
+		AS2(	movdqa	xmm0, [WORD_REG(ax)+0*16])		// copy state to stack to save a register
 		AS2(	movdqa	[SSE2_stateCopy+0*16], xmm0)
-		AS2(	movdqa	xmm0, [eax+1*16])
+		AS2(	movdqa	xmm0, [WORD_REG(ax)+1*16])
 		AS2(	movdqa	[SSE2_stateCopy+1*16], xmm0)
-		AS2(	movq	xmm0, QWORD PTR [eax+2*16])
+		AS2(	movq	xmm0, QWORD PTR [WORD_REG(ax)+2*16])
 		AS2(	movq	QWORD PTR [SSE2_stateCopy+2*16], xmm0)
 		AS2(	psrlq	xmm0, 32)
 		AS2(	movd	ebx, xmm0)				// s(9)
-		AS2(	mov		ecx, [eax+10*4])
-		AS2(	mov		edx, [eax+11*4])
+		AS2(	mov		ecx, [WORD_REG(ax)+10*4])
+		AS2(	mov		edx, [WORD_REG(ax)+11*4])
 		AS2(	pcmpeqb	xmm7, xmm7)				// all ones
 
 #define s(i)	SSE2_stateCopy + ASM_MOD(i,10)*4
-#define u(j)	edi + (ASM_MOD(j,4)*20 + (j/4)) * 4
-#define v(j)	edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
+#define u(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
+#define v(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
 
 #define r10 ecx
 #define r11 edx
@@ -371,42 +373,42 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 #define r21 ecx
 
 #define SSE2_STEP(i, j)	\
-	AS2(	mov		eax, [s(i+3)])\
-	AS2(	mov		ebp, 1)\
-	AS2(	and		ebp, r1##j)\
-	AS1(	neg		ebp)\
-	AS2(	and		ebp, [s(i+8)])\
-	AS2(	xor		ebp, [s(i+1)])\
-	AS2(	add		r2##j, ebp)\
-	AS2(	movzx	ebp, al)\
-	AS2(	shr		eax, 8)\
-	AS2(	xor		eax, [esi+1024+ebp*4])\
-	AS2(	lea		ebp, [ebx + r2##j])\
-	AS2(	xor		ebx, eax)\
-	AS2(	imul	r1##j, 0x54655307)\
 	AS2(	mov		eax, [s(i+0)])\
 	AS2(	mov		[v(i)], eax)\
 	AS2(	rol		eax, 8)\
-	AS2(	xor		ebx, eax)\
-	AS2(	movzx	eax, al)\
-	AS2(	rol		r1##j, 7)\
-	AS2(	xor		ebx, [esi+eax*4])\
+	AS2(	lea		ebp, [ebx + r2##j])\
 	AS2(	xor		ebp, r1##j)\
 	AS2(	mov		[u(i)], ebp)\
+	AS2(	mov		ebp, 1)\
+	AS2(	and		ebp, r2##j)\
+	AS1(	neg		ebp)\
+	AS2(	and		ebp, ebx)\
+	AS2(	xor		ebx, eax)\
+	AS2(	movzx	eax, al)\
+	AS2(	xor		ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
+	AS2(	mov		eax, [s(i+3)])\
+	AS2(	xor		ebp, [s(i+2)])\
+	AS2(	add		r1##j, ebp)\
+	AS2(	movzx	ebp, al)\
+	AS2(	shr		eax, 8)\
+	AS2(	xor		ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
+	AS2(	xor		ebx, eax)\
+	AS2(	imul	r2##j, 0x54655307)\
+	AS2(	rol		r2##j, 7)\
 	AS2(	mov		[s(i+0)], ebx)\
 
 		ASL(2)	// outer loop, each iteration of this processes 80 words
-		AS2(	lea		edi, [SSE2_uvStart])	// start of v and u
-		AS2(	mov		eax, 80)
-		AS2(	cmp		esi, 80)
-		AS2(	cmovg	esi, eax)
-		AS2(	mov		SSE2_wordsLeft2, esi)
-		AS2(	lea		esi, [edi+esi])		// use to first inner loop
-		AS2(	mov		SSE2_ediEnd, esi)
+		AS2(	lea		WORD_REG(di), [SSE2_uvStart])	// start of v and u
+		AS2(	mov		WORD_REG(ax), 80)
+		AS2(	cmp		WORD_REG(si), 80)
+		AS2(	cmovg	WORD_REG(si), WORD_REG(ax))
+		AS2(	mov		SSE2_wordsLeft2, WORD_REG(si))
+		AS2(	lea		WORD_REG(si), [WORD_REG(di)+WORD_REG(si)])		// use to end first inner loop
+		AS2(	mov		SSE2_diEnd, WORD_REG(si))
 #ifdef _MSC_VER
-		AS2(	lea		esi, s_mulTables)
+		AS2(	lea		WORD_REG(si), s_mulTables)
 #else
-		AS2(	mov		esi, SSE2_pMulTables)
+		AS2(	mov		WORD_REG(si), SSE2_pMulTables)
 #endif
 
 		ASL(0)	// first inner loop, 20 words each, 4 iterations
@@ -431,20 +433,20 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 		SSE2_STEP(18, 0)
 		SSE2_STEP(19, 1)
 		// loop
-		AS2(	add		edi, 5*4)
-		AS2(	cmp		edi, SSE2_ediEnd)
+		AS2(	add		WORD_REG(di), 5*4)
+		AS2(	cmp		WORD_REG(di), SSE2_diEnd)
 		ASJ(	jne,	0, b)
 
-		AS2(	mov		eax, SSE2_input)
-		AS2(	mov		ebp, SSE2_output)
-		AS2(	lea		edi, [SSE2_uvStart])		// start of v and u
-		AS2(	mov		esi, SSE2_wordsLeft2)
+		AS2(	mov		WORD_REG(ax), SSE2_input)
+		AS2(	mov		WORD_REG(bp), SSE2_output)
+		AS2(	lea		WORD_REG(di), [SSE2_uvStart])		// start of v and u
+		AS2(	mov		WORD_REG(si), SSE2_wordsLeft2)
 
 		ASL(1)	// second inner loop, 16 words each, 5 iterations
-		AS2(	movdqa	xmm0, [edi+0*20*4])
-		AS2(	movdqa	xmm1, [edi+1*20*4])
-		AS2(	movdqa	xmm2, [edi+2*20*4])
-		AS2(	movdqa	xmm3, [edi+3*20*4])
+		AS2(	movdqa	xmm0, [WORD_REG(di)+0*20*4])
+		AS2(	movdqa	xmm2, [WORD_REG(di)+2*20*4])
+		AS2(	movdqa	xmm3, [WORD_REG(di)+3*20*4])
+		AS2(	movdqa	xmm1, [WORD_REG(di)+1*20*4])
 		// S2
 		AS2(	movdqa	xmm4, xmm0)
 		AS2(	pand	xmm0, xmm2)
@@ -463,13 +465,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
  		AS2(	pxor	xmm1, xmm4)
 		AS2(	pxor	xmm4, xmm7)
 		// xor with v
-		AS2(	pxor	xmm2, [edi+80*4])
-		AS2(	pxor	xmm3, [edi+80*5])
-		AS2(	pxor	xmm1, [edi+80*6])
-		AS2(	pxor	xmm4, [edi+80*7])
+		AS2(	pxor	xmm2, [WORD_REG(di)+80*4])
+		AS2(	pxor	xmm3, [WORD_REG(di)+80*5])
+		AS2(	pxor	xmm1, [WORD_REG(di)+80*6])
+		AS2(	pxor	xmm4, [WORD_REG(di)+80*7])
 		// exit loop early if less than 16 words left to output
 		// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
-		AS2(	cmp		esi, 16)
+		AS2(	cmp		WORD_REG(si), 16)
 		ASJ(	jl,		4, f)
 		// unpack
 		AS2(	movdqa		xmm6, xmm2)
@@ -485,75 +487,75 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 		AS2(	punpcklqdq	xmm6, xmm5)
 		AS2(	punpckhqdq	xmm3, xmm5)
 		// output keystream
-		AS2(	test	eax, eax)
+		AS2(	test	WORD_REG(ax), WORD_REG(ax))
 		ASJ(	jz,		3, f)
 		AS2(	test	eax, 0xf)
 		ASJ(	jnz,	7, f)
-		AS2(	pxor	xmm2, [eax+0*16])
-		AS2(	pxor	xmm0, [eax+1*16])
-		AS2(	pxor	xmm6, [eax+2*16])
-		AS2(	pxor	xmm3, [eax+3*16])
-		AS2(	add		eax, 4*16)
+		AS2(	pxor	xmm2, [WORD_REG(ax)+0*16])
+		AS2(	pxor	xmm0, [WORD_REG(ax)+1*16])
+		AS2(	pxor	xmm6, [WORD_REG(ax)+2*16])
+		AS2(	pxor	xmm3, [WORD_REG(ax)+3*16])
+		AS2(	add		WORD_REG(ax), 4*16)
 		ASJ(	jmp,	3, f)
 		ASL(7)
-		AS2(	movdqu	xmm1, [eax+0*16])
+		AS2(	movdqu	xmm1, [WORD_REG(ax)+0*16])
 		AS2(	pxor	xmm2, xmm1)
-		AS2(	movdqu	xmm1, [eax+1*16])
+		AS2(	movdqu	xmm1, [WORD_REG(ax)+1*16])
 		AS2(	pxor	xmm0, xmm1)
-		AS2(	movdqu	xmm1, [eax+2*16])
+		AS2(	movdqu	xmm1, [WORD_REG(ax)+2*16])
 		AS2(	pxor	xmm6, xmm1)
-		AS2(	movdqu	xmm1, [eax+3*16])
+		AS2(	movdqu	xmm1, [WORD_REG(ax)+3*16])
 		AS2(	pxor	xmm3, xmm1)
-		AS2(	add		eax, 4*16)
+		AS2(	add		WORD_REG(ax), 4*16)
 		ASL(3)
 		AS2(	test	ebp, 0xf)
 		ASJ(	jnz,	8, f)
-		AS2(	movdqa	[ebp+0*16], xmm2)
-		AS2(	movdqa	[ebp+1*16], xmm0)
-		AS2(	movdqa	[ebp+2*16], xmm6)
-		AS2(	movdqa	[ebp+3*16], xmm3)
+		AS2(	movdqa	[WORD_REG(bp)+0*16], xmm2)
+		AS2(	movdqa	[WORD_REG(bp)+1*16], xmm0)
+		AS2(	movdqa	[WORD_REG(bp)+2*16], xmm6)
+		AS2(	movdqa	[WORD_REG(bp)+3*16], xmm3)
 		ASJ(	jmp,	9, f)
 		ASL(8)
-		AS2(	movdqu	[ebp+0*16], xmm2)
-		AS2(	movdqu	[ebp+1*16], xmm0)
-		AS2(	movdqu	[ebp+2*16], xmm6)
-		AS2(	movdqu	[ebp+3*16], xmm3)
+		AS2(	movdqu	[WORD_REG(bp)+0*16], xmm2)
+		AS2(	movdqu	[WORD_REG(bp)+1*16], xmm0)
+		AS2(	movdqu	[WORD_REG(bp)+2*16], xmm6)
+		AS2(	movdqu	[WORD_REG(bp)+3*16], xmm3)
 		ASL(9)
 		// loop
-		AS2(	add		edi, 4*4)
-		AS2(	add		ebp, 4*16)
-		AS2(	sub		esi, 16)
+		AS2(	add		WORD_REG(di), 4*4)
+		AS2(	add		WORD_REG(bp), 4*16)
+		AS2(	sub		WORD_REG(si), 16)
 		ASJ(	jnz,	1, b)
 
 		// outer loop
-		AS2(	mov		esi, SSE2_wordsLeft)
-		AS2(	sub		esi, 80)
+		AS2(	mov		WORD_REG(si), SSE2_wordsLeft)
+		AS2(	sub		WORD_REG(si), 80)
 		ASJ(	jz,		6, f)
-		AS2(	mov		SSE2_wordsLeft, esi)
-		AS2(	mov		SSE2_input, eax)
-		AS2(	mov		SSE2_output, ebp)
+		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
+		AS2(	mov		SSE2_input, WORD_REG(ax))
+		AS2(	mov		SSE2_output, WORD_REG(bp))
 		ASJ(	jmp,	2, b)
 
 		ASL(4)	// final output of less than 16 words
-		AS2(	test	eax, eax)
+		AS2(	test	WORD_REG(ax), WORD_REG(ax))
 		ASJ(	jz,		5, f)
-		AS2(	movd	xmm0, [eax+0*4])
+		AS2(	movd	xmm0, [WORD_REG(ax)+0*4])
 		AS2(	pxor	xmm2, xmm0)
-		AS2(	movd	xmm0, [eax+1*4])
+		AS2(	movd	xmm0, [WORD_REG(ax)+1*4])
 		AS2(	pxor	xmm3, xmm0)
-		AS2(	movd	xmm0, [eax+2*4])
+		AS2(	movd	xmm0, [WORD_REG(ax)+2*4])
 		AS2(	pxor	xmm1, xmm0)
-		AS2(	movd	xmm0, [eax+3*4])
+		AS2(	movd	xmm0, [WORD_REG(ax)+3*4])
 		AS2(	pxor	xmm4, xmm0)
-		AS2(	add		eax, 16)
+		AS2(	add		WORD_REG(ax), 16)
 		ASL(5)
-		AS2(	movd	[ebp+0*4], xmm2)
-		AS2(	movd	[ebp+1*4], xmm3)
-		AS2(	movd	[ebp+2*4], xmm1)
-		AS2(	movd	[ebp+3*4], xmm4)
-		AS2(	sub		esi, 4)
+		AS2(	movd	[WORD_REG(bp)+0*4], xmm2)
+		AS2(	movd	[WORD_REG(bp)+1*4], xmm3)
+		AS2(	movd	[WORD_REG(bp)+2*4], xmm1)
+		AS2(	movd	[WORD_REG(bp)+3*4], xmm4)
+		AS2(	sub		WORD_REG(si), 4)
 		ASJ(	jz,		6, f)
-		AS2(	add		ebp, 16)
+		AS2(	add		WORD_REG(bp), 16)
 		AS2(	psrldq	xmm2, 4)
 		AS2(	psrldq	xmm3, 4)
 		AS2(	psrldq	xmm1, 4)
@@ -561,26 +563,26 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 		ASJ(	jmp,	4, b)
 
 		ASL(6)	// save state
-		AS2(	mov		ebx, SSE2_state)
+		AS2(	mov		WORD_REG(bx), SSE2_state)
 		AS2(	movdqa	xmm0, [SSE2_stateCopy+0*16])
-		AS2(	movdqa	[ebx+0*16], xmm0)
+		AS2(	movdqa	[WORD_REG(bx)+0*16], xmm0)
 		AS2(	movdqa	xmm0, [SSE2_stateCopy+1*16])
-		AS2(	movdqa	[ebx+1*16], xmm0)
+		AS2(	movdqa	[WORD_REG(bx)+1*16], xmm0)
 		AS2(	movq	xmm0, QWORD PTR [SSE2_stateCopy+2*16])
-		AS2(	movq	QWORD PTR [ebx+2*16], xmm0)
-		AS2(	mov		[ebx+10*4], ecx)
-		AS2(	mov		[ebx+11*4], edx)
+		AS2(	movq	QWORD PTR [WORD_REG(bx)+2*16], xmm0)
+		AS2(	mov		[WORD_REG(bx)+10*4], ecx)
+		AS2(	mov		[WORD_REG(bx)+11*4], edx)
 
-		AS1(	pop		esp)
-		AS1(	pop		ebp)
+		AS_POP(			sp)
+		AS_POP(			bp)
 
 #ifdef __GNUC__
-	AS1(	pop		ebx)
-	".att_syntax prefix;"
-		:
-		: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
-		: "memory", "cc"
-	);
+		AS_POP(			bx)
+		".att_syntax prefix;"
+			:
+			: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
+			: "memory", "cc"
+		);
 #endif
 	}
 	else
@@ -593,17 +595,16 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
 #endif
 
 #define DIV_A(x)    (((x) >> 8) ^ s_mulTables[256 + byte(x)])
-#define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
 
 #define r1(i) ((i%2) ? reg2 : reg1)
 #define r2(i) ((i%2) ? reg1 : reg2)
 
 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u)	\
-		r2(x0) += XMUX(r1(x0), s##x1, s##x8);\
-		r1(x0) = rotlFixed(r1(x0) * 0x54655307, 7);\
-		v = s##x0;\
 		u = (s##x9 + r2(x0)) ^ r1(x0);\
-		s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;
+		v = s##x0;\
+		s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
+		r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
+		r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
 
 #define SOSEMANUK_OUTPUT(x)	\
 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
diff --git a/tiger.cpp b/tiger.cpp
index 332de2c6..87ec74f4 100644
--- a/tiger.cpp
+++ b/tiger.cpp
@@ -34,7 +34,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 
 void Tiger::Transform (word64 *digest, const word64 *X)
 {
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
 	if (HasSSE2())
 	{
 #ifdef __GNUC__
@@ -43,9 +43,14 @@ void Tiger::Transform (word64 *digest, const word64 *X)
 		".intel_syntax noprefix;"
 		AS1(	push	ebx)
 #else
+	#if _MSC_VER < 1300
+		const word64 *t = table;
+		AS2(	mov		edx, t)
+	#else
+		AS2(	lea		edx, [table])
+	#endif
 		AS2(	mov		eax, digest)
 		AS2(	mov		esi, X)
-		AS2(	lea		edx, [table])
 #endif
 		AS2(	movq	mm0, [eax])
 		AS2(	movq	mm1, [eax+1*8])
diff --git a/whrlpool.cpp b/whrlpool.cpp
index da19d7ff..20e721e8 100644
--- a/whrlpool.cpp
+++ b/whrlpool.cpp
@@ -390,7 +390,7 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTIO
 // Whirlpool basic transformation. Transforms state based on block.
 void Whirlpool::Transform(word64 *digest, const word64 *block)
 {
-#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
 	if (HasMMX())
 	{
 		// MMX version has the same structure as C version below
@@ -398,26 +398,29 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 	__asm__ __volatile__
 	(
 		".intel_syntax noprefix;"
-		AS1(	push	ebx)
-		AS2(	mov		ebx, eax)
+		AS_PUSH(		bx)
+		AS2(	mov		WORD_REG(bx), WORD_REG(ax))
 #else
-		AS2(	lea		ebx, [Whirlpool_C])
-		AS2(	mov		ecx, digest)
-		AS2(	mov		edx, block)
+	#if _MSC_VER < 1300
+		AS_PUSH(		bx)
+	#endif
+		AS2(	lea		WORD_REG(bx), [Whirlpool_C])
+		AS2(	mov		WORD_REG(cx), digest)
+		AS2(	mov		WORD_REG(dx), block)
 #endif
-		AS2(	mov		eax, esp)
-		AS2(	and		esp, 0xfffffff0)
-		AS2(	sub		esp, 16*8)
-		AS1(	push	eax)
+		AS2(	mov		WORD_REG(ax), WORD_REG(sp))
+		AS2(	and		WORD_REG(sp), -16)
+		AS2(	sub		WORD_REG(sp), 16*8)
+		AS_PUSH(		ax)
 		AS2(	xor		esi, esi)
 		ASL(0)
-		AS2(	movq	mm0, [ecx+8*esi])
-		AS2(	movq	[esp+4+8*esi], mm0)		// k
-		AS2(	pxor	mm0, [edx+8*esi])
-		AS2(	movq	[esp+4+64+8*esi], mm0)	// s
-		AS2(	movq	[ecx+8*esi], mm0)
-		AS1(	inc		esi)
-		AS2(	cmp		esi, 8)
+		AS2(	movq	mm0, [WORD_REG(cx)+8*WORD_REG(si)])
+		AS2(	movq	[WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0)		// k
+		AS2(	pxor	mm0, [WORD_REG(dx)+8*WORD_REG(si)])
+		AS2(	movq	[WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0)	// s
+		AS2(	movq	[WORD_REG(cx)+8*WORD_REG(si)], mm0)
+		AS1(	inc		WORD_REG(si))
+		AS2(	cmp		WORD_REG(si), 8)
 		ASJ(	jne,	0, b)
 
 		AS2(	xor		esi, esi)
@@ -427,16 +430,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 #define KSL1(a, b)	AS2(pxor	mm##a, b)
 
 #define KSL(op, i, a, b, c, d)	\
-	AS2(mov		eax, [esp+4+8*i])\
+	AS2(mov		eax, [WORD_REG(sp)+WORD_SZ+8*i])\
 	AS2(movzx	edi, al)\
-	KSL##op(a, [ebx+3*2048+8*edi])\
+	KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
 	AS2(movzx	edi, ah)\
-	KSL##op(b, [ebx+2*2048+8*edi])\
+	KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
 	AS2(shr		eax, 16)\
 	AS2(movzx	edi, al)\
 	AS2(shr		eax, 8)\
-	KSL##op(c, [ebx+1*2048+8*edi])\
-	KSL##op(d, [ebx+0*2048+8*eax])
+	KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
+	KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
 
 #define KSH0(a, b)	\
 	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
@@ -445,57 +448,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 	AS2(pxor	mm##a, b)
 #define KSH2(a, b)	\
 	AS2(pxor	mm##a, b)\
-	AS2(movq	[esp+4+8*a], mm##a)
+	AS2(movq	[WORD_REG(sp)+WORD_SZ+8*a], mm##a)
 
 #define KSH(op, i, a, b, c, d)	\
-	AS2(mov		eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(mov		eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\
 	AS2(movzx	edi, al)\
-	KSH##op(a, [ebx+3*2048+8*edi])\
+	KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
 	AS2(movzx	edi, ah)\
-	KSH##op(b, [ebx+2*2048+8*edi])\
+	KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
 	AS2(shr		eax, 16)\
 	AS2(movzx	edi, al)\
 	AS2(shr		eax, 8)\
-	KSH##op(c, [ebx+1*2048+8*edi])\
-	KSH##op(d, [ebx+0*2048+8*eax])
+	KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
+	KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
 
 #define TSL(op, i, a, b, c, d)	\
-	AS2(mov		eax, [esp+4+64+8*i])\
+	AS2(mov		eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\
 	AS2(movzx	edi, al)\
-	KSL##op(a, [ebx+3*2048+8*edi])\
+	KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
 	AS2(movzx	edi, ah)\
-	KSL##op(b, [ebx+2*2048+8*edi])\
+	KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
 	AS2(shr		eax, 16)\
 	AS2(movzx	edi, al)\
 	AS2(shr		eax, 8)\
-	KSL##op(c, [ebx+1*2048+8*edi])\
-	KSL##op(d, [ebx+0*2048+8*eax])
+	KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
+	KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
 
 #define TSH0(a, b)	\
 	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
-	AS2(pxor	mm##a, [esp+4+8*a])\
+	AS2(pxor	mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\
 	AS2(pxor	mm##a, b)
 #define TSH1(a, b)	\
 	AS2(pxor	mm##a, b)
 #define TSH2(a, b)	\
 	AS2(pxor	mm##a, b)\
-	AS2(movq	[esp+4+64+8*a], mm##a)
+	AS2(movq	[WORD_REG(sp)+WORD_SZ+64+8*a], mm##a)
 #define TSH3(a, b)	\
 	AS2(pxor	mm##a, b)\
-	AS2(pxor	mm##a, [ecx+8*a])\
-	AS2(movq	[ecx+8*a], mm##a)
+	AS2(pxor	mm##a, [WORD_REG(cx)+8*a])\
+	AS2(movq	[WORD_REG(cx)+8*a], mm##a)
 
 #define TSH(op, i, a, b, c, d)	\
-	AS2(mov		eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(mov		eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\
 	AS2(movzx	edi, al)\
-	TSH##op(a, [ebx+3*2048+8*edi])\
+	TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
 	AS2(movzx	edi, ah)\
-	TSH##op(b, [ebx+2*2048+8*edi])\
+	TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
 	AS2(shr		eax, 16)\
 	AS2(movzx	edi, al)\
 	AS2(shr		eax, 8)\
-	TSH##op(c, [ebx+1*2048+8*edi])\
-	TSH##op(d, [ebx+0*2048+8*eax])
+	TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
+	TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
 
 		KSL(0, 4, 3, 2, 1, 0)
 		KSL(0, 0, 7, 6, 5, 4)
@@ -514,8 +517,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 		KSH(2, 3, 2, 1, 0, 7)
 		KSH(2, 7, 6, 5, 4, 3)
 
-		AS2(	pxor	mm0, [ebx + 8*1024 + esi*8])
-		AS2(	movq	[esp+4], mm0)
+		AS2(	pxor	mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8])
+		AS2(	movq	[WORD_REG(sp)+WORD_SZ], mm0)
 
 		TSL(0, 4, 3, 2, 1, 0)
 		TSL(0, 0, 7, 6, 5, 4)
@@ -532,8 +535,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 		TSH(1, 5, 4, 3, 2, 1)
 		TSH(1, 6, 5, 4, 3, 2)
 
-		AS1(	inc		esi)
-		AS2(	cmp		esi, 10)
+		AS1(	inc		WORD_REG(si))
+		AS2(	cmp		WORD_REG(si), 10)
 		ASJ(	je,		2, f)
 
 		TSH(2, 3, 2, 1, 0, 7)
@@ -550,11 +553,13 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
 #undef TSL
 #undef TSH
 
+		AS_POP(			sp)
 		AS1(	emms)
-		AS1(	pop		esp)
 
+#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
+		AS_POP(			bx)
+#endif
 #ifdef __GNUC__
-		AS1(	pop		ebx)
 		".att_syntax prefix;"
 			:
 			: "a" (Whirlpool_C), "c" (digest), "d" (block)
diff --git a/x64masm.asm b/x64masm.asm
index 4102c6a7..76676a77 100755
--- a/x64masm.asm
+++ b/x64masm.asm
@@ -1,80 +1,55 @@
 PUBLIC Baseline_Add
 PUBLIC Baseline_Sub
 .CODE
-       ALIGN     8
+    ALIGN   8
 Baseline_Add	PROC
-
 	lea		rdx, [rdx+8*rcx]
 	lea		r8, [r8+8*rcx]
 	lea		r9, [r9+8*rcx]
-
 	neg		rcx					; rcx is negative index
-	test	rcx, 2				; this clears carry flag
-	jz		$0@Baseline_Add
-	sub		rcx, 2
-	jmp		$1@Baseline_Add
-
-$0@Baseline_Add:
-	jrcxz	$2@Baseline_Add		; loop until rcx overflows and becomes zero
+	jz		$1@Baseline_Add
 	mov		rax,[r8+8*rcx]
-	adc		rax,[r9+8*rcx]
+	add		rax,[r9+8*rcx]
 	mov		[rdx+8*rcx],rax
+$0@Baseline_Add:
 	mov		rax,[r8+8*rcx+8]
 	adc		rax,[r9+8*rcx+8]
 	mov		[rdx+8*rcx+8],rax
-$1@Baseline_Add:
-	mov		rax,[r8+8*rcx+16]
-	adc		rax,[r9+8*rcx+16]
-	mov		[rdx+8*rcx+16],rax
-	mov		rax,[r8+8*rcx+24]
-	adc		rax,[r9+8*rcx+24]
-	mov		[rdx+8*rcx+24],rax
-
-	lea		rcx,[rcx+4]			; advance index, avoid inc which causes slowdown on Intel Core 2
+	lea		rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
+	jrcxz	$1@Baseline_Add		; exit the loop once the negative index rcx has counted up to zero
+	mov		rax,[r8+8*rcx]
+	adc		rax,[r9+8*rcx]
+	mov		[rdx+8*rcx],rax
 	jmp		$0@Baseline_Add
-
-$2@Baseline_Add:
+$1@Baseline_Add:
 	mov		rax, 0
-	setc	al					; store carry into rax (return result register)
-
+	adc		rax, rax			; store carry into rax (return result register)
 	ret
 Baseline_Add ENDP
 
-       ALIGN     8
+    ALIGN   8
 Baseline_Sub	PROC
-
 	lea		rdx, [rdx+8*rcx]
 	lea		r8, [r8+8*rcx]
 	lea		r9, [r9+8*rcx]
-
 	neg		rcx					; rcx is negative index
-	test	rcx, 2				; this clears carry flag
-	jz		$0@Baseline_Sub
-	sub		rcx, 2
-	jmp		$1@Baseline_Sub
-
-$0@Baseline_Sub:
-	jrcxz	$2@Baseline_Sub		; loop until rcx overflows and becomes zero
+	jz		$1@Baseline_Sub
 	mov		rax,[r8+8*rcx]
-	sbb		rax,[r9+8*rcx]
+	sub		rax,[r9+8*rcx]
 	mov		[rdx+8*rcx],rax
+$0@Baseline_Sub:
 	mov		rax,[r8+8*rcx+8]
 	sbb		rax,[r9+8*rcx+8]
 	mov		[rdx+8*rcx+8],rax
-$1@Baseline_Sub:
-	mov		rax,[r8+8*rcx+16]
-	sbb		rax,[r9+8*rcx+16]
-	mov		[rdx+8*rcx+16],rax
-	mov		rax,[r8+8*rcx+24]
-	sbb		rax,[r9+8*rcx+24]
-	mov		[rdx+8*rcx+24],rax
-
-	lea		rcx,[rcx+4]			; advance index, avoid inc which causes slowdown on Intel Core 2
+	lea		rcx,[rcx+2]			; advance index, avoid inc which causes slowdown on Intel Core 2
+	jrcxz	$1@Baseline_Sub		; exit the loop once the negative index rcx has counted up to zero
+	mov		rax,[r8+8*rcx]
+	sbb		rax,[r9+8*rcx]
+	mov		[rdx+8*rcx],rax
 	jmp		$0@Baseline_Sub
-
-$2@Baseline_Sub:
+$1@Baseline_Sub:
 	mov		rax, 0
-	setc	al					; store carry into rax (return result register)
+	adc		rax, rax			; store borrow (carry flag) into rax (return result register)
 
 	ret
 Baseline_Sub ENDP