From d2510f30c75b341dcbc45432a4bd38c0513f2616 Mon Sep 17 00:00:00 2001 From: weidai Date: Fri, 4 May 2007 15:24:09 +0000 Subject: [PATCH] fix compile for x64, DLL and VC 6 --- camellia.cpp | 2 +- cpu.cpp | 29 ++-- cpu.h | 33 ++++- datatest.cpp | 13 +- integer.cpp | 389 +++++++++++++++++++++++++++++++++++--------------- panama.cpp | 137 +++++++++--------- rijndael.cpp | 252 ++++++++++++++++++++------------ secblock.h | 27 ++-- sha.cpp | 21 ++- smartptr.h | 14 +- sosemanuk.cpp | 267 +++++++++++++++++----------------- tiger.cpp | 9 +- whrlpool.cpp | 101 ++++++------- x64masm.asm | 69 +++------ 14 files changed, 810 insertions(+), 553 deletions(-) diff --git a/camellia.cpp b/camellia.cpp index 0bca33a1..cdd7906c 100644 --- a/camellia.cpp +++ b/camellia.cpp @@ -228,7 +228,7 @@ void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBloc SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1)) SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3)) - for (unsigned int i = m_rounds-1; i > 0; --i) + for (i = m_rounds-1; i > 0; --i) { DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3)) DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3)) diff --git a/cpu.cpp b/cpu.cpp index a4922504..c42dd8bc 100755 --- a/cpu.cpp +++ b/cpu.cpp @@ -1,8 +1,10 @@ // cpu.cpp - written and placed in the public domain by Wei Dai #include "pch.h" -#include "cpu.h" +#ifndef CRYPTOPP_IMPORTS + +#include "cpu.h" #include "misc.h" #include @@ -11,10 +13,15 @@ #include #endif +#ifdef CRYPTOPP_MSVC6PP_OR_LATER +#include +#endif + NAMESPACE_BEGIN(CryptoPP) #ifdef CRYPTOPP_X86_ASM_AVAILABLE +#ifndef _MSC_VER typedef void (*SigHandler)(int); static jmp_buf s_jmpNoCPUID; @@ -22,6 +29,7 @@ static void SigIllHandlerCPUID(int) { longjmp(s_jmpNoCPUID, 1); } +#endif bool CpuId(word32 input, word32 *output) { @@ -57,7 +65,11 @@ bool CpuId(word32 input, word32 *output) __asm__ ( // save ebx in case -fPIC is being used +#if CRYPTOPP_BOOL_X86 "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" +#else + "pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx" +#endif : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) : "a" (input) ); @@ -84,22 +96,19 @@ bool CpuId(word32 input, word32 *output) return true; } -inline bool TrySSE2() -{ - return true; -} - #endif #ifdef CRYPTOPP_CPUID_AVAILABLE static bool TrySSE2() { -#ifdef _MSC_VER +#if CRYPTOPP_BOOL_X64 + return true; +#elif defined(_MSC_VER) __try { #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - __asm por xmm0, xmm0 // executing SSE2 instruction + AS2(por xmm0, xmm0) // executing SSE2 instruction #elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE __mm128i x = _mm_setzero_si128(); return _mm_cvtsi128_si32(x) == 0; @@ -137,7 +146,7 @@ static bool TrySSE2() bool g_x86DetectionDone = false; bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false; -int g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; +word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; void DetectX86Features() { @@ -170,3 +179,5 @@ void DetectX86Features() #endif NAMESPACE_END + +#endif diff --git a/cpu.h b/cpu.h index 6a212345..6eae4896 100755 --- a/cpu.h +++ b/cpu.h @@ -3,6 +3,10 @@ #include "config.h" +#ifdef CRYPTOPP_MSVC6PP_OR_LATER + #include +#endif + NAMESPACE_BEGIN(CryptoPP) #if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64) @@ -10,12 +14,15 @@ NAMESPACE_BEGIN(CryptoPP) #define CRYPTOPP_CPUID_AVAILABLE // these should not be used directly -extern bool g_x86DetectionDone; -extern bool g_hasSSE2, g_hasMMX, g_hasSSSE3, g_isP4; -extern int g_cacheLineSize; -void DetectX86Features(); +extern CRYPTOPP_DLL bool g_x86DetectionDone; +extern CRYPTOPP_DLL bool g_hasSSE2; +extern CRYPTOPP_DLL bool g_hasMMX; +extern CRYPTOPP_DLL bool g_hasSSSE3; +extern CRYPTOPP_DLL bool g_isP4; +extern CRYPTOPP_DLL word32 g_cacheLineSize; +CRYPTOPP_DLL void DetectX86Features(); -bool CpuId(word32 input, word32 *output); +CRYPTOPP_DLL bool CpuId(word32 input, word32 *output); #if CRYPTOPP_BOOL_X64 inline bool HasSSE2() {return true;} @@ -94,6 +101,7 @@ inline bool HasMMX() {return false;} #define ASL(x) GNU_ASL(x) #define ASJ(x, y, z) GNU_ASJ(x, y, z) #define ASC(x, y) #x " " #y ";" + #define CRYPTOPP_NAKED #else #define AS1(x) __asm {x} #define AS2(x, y) __asm {x, y} @@ -102,11 +110,26 @@ inline bool HasMMX() {return false;} #define ASL(x) __asm {label##x:} #define ASJ(x, y, z) __asm {x label##y} #define ASC(x, y) __asm {x label##y} + #define CRYPTOPP_NAKED __declspec(naked) #endif // GNU assembler doesn't seem to have mod operator #define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) +#if CRYPTOPP_BOOL_X86 + #define WORD_SZ 4 + #define WORD_REG(x) e##x + #define WORD_PTR DWORD PTR + #define AS_PUSH(x) AS1(push e##x) + #define AS_POP(x) AS1(pop e##x) +#elif CRYPTOPP_BOOL_X64 + #define WORD_SZ 8 + #define WORD_REG(x) r##x + #define WORD_PTR QWORD PTR + #define AS_PUSH(x) AS1(pushq r##x) + #define AS_POP(x) AS1(popq r##x) +#endif + NAMESPACE_END #endif diff --git a/datatest.cpp b/datatest.cpp index 4a326093..950e4f90 100644 --- a/datatest.cpp +++ b/datatest.cpp @@ -5,14 +5,14 @@ #include "randpool.h" #include "files.h" #include "trunhash.h" +#include "queue.h" +#include "validate.h" #include #include USING_NAMESPACE(CryptoPP) USING_NAMESPACE(std) -RandomPool & GlobalRNG(); - typedef std::map TestData; class TestFailure : public Exception @@ -67,7 +67,7 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo s1 = s1.substr(s1.find(' ')+1); } - s2.clear(); + s2 = ""; // MSVC 6 doesn't have clear(); if (s1[0] == '\"') { @@ -85,8 +85,13 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo s1 = s1.substr(STDMIN(s1.find(' '), s1.length())); } + ByteQueue q; while (repeat--) - target.Put((const byte *)s2.data(), s2.size()); + { + q.Put((const byte *)s2.data(), s2.size()); + if (q.MaxRetrievable() > 4*1024 || repeat == 0) + q.TransferTo(target); + } } } diff --git a/integer.cpp b/integer.cpp index 64f3cea0..a8e78818 100644 --- a/integer.cpp +++ b/integer.cpp @@ -18,7 +18,7 @@ #include -#if defined(_MSC_VER) && _MSC_VER >= 1400 +#if _MSC_VER >= 1400 #include #endif @@ -30,6 +30,8 @@ #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.") #endif +#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86) + NAMESPACE_BEGIN(CryptoPP) bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt) @@ -99,7 +101,36 @@ static word AtomicInverseModPower2(word A) // ******************************************************** -#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE +#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64 + #define Declare2Words(x) word x##0, x##1; + #define AssignWord(a, b) a##0 = b; a##1 = 0; + #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); + #define LowWord(a) a##0 + #define HighWord(a) a##1 + #ifdef _MSC_VER + #define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1); + #define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2; + #elif defined(__DECCXX) + #define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b); + #elif CRYPTOPP_BOOL_X64 + #define MultiplyWords(p, a, b) asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc"); + #define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc"); + #define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc"); + #define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc"); + #define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc"); + #define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc"); + #endif + #ifndef Double3Words + #define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2; + #endif + #ifndef Acc2WordsBy2 + #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; + #endif + #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (ta) + (u##0>t);} + #define GetCarry(u) u##1 + #define GetBorrow(u) u##1 +#else #define Declare2Words(x) dword x; #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) #define MultiplyWords(p, a, b) p = __emulu(a, b); @@ -108,34 +139,23 @@ static word AtomicInverseModPower2(word A) #endif #define AssignWord(a, b) a = b; #define Add2WordsBy1(a, b, c) a = b + c; - #define Acc2WordsBy1(a, b) a += b; #define Acc2WordsBy2(a, b) a += b; - #define LowWord(a) (word)a - #define HighWord(a) (word)(a>>WORD_BITS) - #define Double2Words(a) a += a; + #define LowWord(a) word(a) + #define HighWord(a) word(a>>WORD_BITS) + #define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2; #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u); #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u); #define GetCarry(u) HighWord(u) #define GetBorrow(u) word(u>>(WORD_BITS*2-1)) -#else - #define Declare2Words(x) word x##0, x##1; - #define AssignWord(a, b) a##0 = b; a##1 = 0; - #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); +#endif +#ifndef MulAcc + #define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p)); +#endif +#ifndef Acc2WordsBy1 #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b) - #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; - #define LowWord(a) a##0 - #define HighWord(a) a##1 - #ifdef _MSC_VER - #define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1); - #define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0; - #elif defined(__DECCXX) - #define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b); - #define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0; - #endif - #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (ta) + (u##0>t);} - #define GetCarry(u) u##1 - #define GetBorrow(u) u##1 +#endif +#ifndef Acc3WordsBy2 + #define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e)); #endif class DWord @@ -411,9 +431,8 @@ inline word DWord::operator%(word a) // use some tricks to share assembly code between MSVC and GCC #if defined(__GNUC__) - #define CRYPTOPP_NAKED #define AddPrologue \ - word32 result; \ + word result; \ __asm__ __volatile__ \ ( \ ".intel_syntax noprefix;" @@ -454,7 +473,6 @@ inline word DWord::operator%(word a) : "memory", "cc" \ ); #else - #define CRYPTOPP_NAKED __declspec(naked) #define AddPrologue \ __asm push edi \ __asm push esi \ @@ -464,33 +482,107 @@ inline word DWord::operator%(word a) __asm pop esi \ __asm pop edi \ __asm ret 8 +#if _MSC_VER < 1300 + #define SaveEBX __asm push ebx + #define RestoreEBX __asm pop ebx +#else + #define SaveEBX + #define RestoreEBX +#endif #define SquPrologue \ AS2( mov eax, A) \ AS2( mov ecx, C) \ + SaveEBX \ AS2( lea ebx, s_maskLow16) - #define SquEpilogue #define MulPrologue \ AS2( mov eax, A) \ AS2( mov edi, B) \ AS2( mov ecx, C) \ + SaveEBX \ AS2( lea ebx, s_maskLow16) - #define MulEpilogue #define TopPrologue \ AS2( mov eax, A) \ AS2( mov edi, B) \ AS2( mov ecx, C) \ AS2( mov esi, L) \ + SaveEBX \ AS2( lea ebx, s_maskLow16) - #define TopEpilogue + #define SquEpilogue RestoreEBX + #define MulEpilogue RestoreEBX + #define TopEpilogue RestoreEBX #endif -#if defined(_MSC_VER) && defined(_M_X64) +#ifdef CRYPTOPP_X64_MASM_AVAILABLE extern "C" { -int Baseline_Add(size_t N, word *C, const word *A, const word *B); -int Baseline_Sub(size_t N, word *C, const word *A, const word *B); +word Baseline_Add(size_t N, word *C, const word *A, const word *B); +word Baseline_Sub(size_t N, word *C, const word *A, const word *B); } -#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) -CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) +#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) +word Baseline_Add(size_t N, word *C, const word *A, const word *B) +{ + word result; + __asm__ __volatile__ + ( + ".intel_syntax;" + AS1( neg %1) + ASJ( jz, 1, f) + AS2( mov %0,[%3+8*%1]) + AS2( add %0,[%4+8*%1]) + AS2( mov [%2+8*%1],%0) + ASL(0) + AS2( mov %0,[%3+8*%1+8]) + AS2( adc %0,[%4+8*%1+8]) + AS2( mov [%2+8*%1+8],%0) + AS2( lea %1,[%1+2]) + ASJ( jrcxz, 1, f) + AS2( mov %0,[%3+8*%1]) + AS2( adc %0,[%4+8*%1]) + AS2( mov [%2+8*%1],%0) + ASJ( jmp, 0, b) + ASL(1) + AS2( mov %0, 0) + AS2( adc %0, %0) + ".att_syntax;" + : "=&r" (result) + : "c" (N), "r" (C+N), "r" (A+N), "r" (B+N) + : "memory", "cc" + ); + return result; +} + +word Baseline_Sub(size_t N, word *C, const word *A, const word *B) +{ + word result; + __asm__ __volatile__ + ( + ".intel_syntax;" + AS1( neg %1) + ASJ( jz, 1, f) + AS2( mov %0,[%3+8*%1]) + AS2( sub %0,[%4+8*%1]) + AS2( mov [%2+8*%1],%0) + ASL(0) + AS2( mov %0,[%3+8*%1+8]) + AS2( sbb %0,[%4+8*%1+8]) + AS2( mov [%2+8*%1+8],%0) + AS2( lea %1,[%1+2]) + ASJ( jrcxz, 1, f) + AS2( mov %0,[%3+8*%1]) + AS2( sbb %0,[%4+8*%1]) + AS2( mov [%2+8*%1],%0) + ASJ( jmp, 0, b) + ASL(1) + AS2( mov %0, 0) + AS2( adc %0, %0) + ".att_syntax;" + : "=&r" (result) + : "c" (N), "r" (C+N), "r" (A+N), "r" (B+N) + : "memory", "cc" + ); + return result; +} +#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86 +CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) { AddPrologue @@ -531,7 +623,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word AddEpilogue } -CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) +CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) { AddPrologue @@ -572,8 +664,8 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word AddEpilogue } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE -CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) +#if CRYPTOPP_INTEGER_SSE2 +CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) { AddPrologue @@ -629,7 +721,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, AddEpilogue } -CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) +CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) { AddPrologue @@ -687,7 +779,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, } #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #else -int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) +word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) { assert (N%2 == 0); @@ -703,7 +795,7 @@ int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word return int(GetCarry(u)); } -int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) +word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) { assert (N%2 == 0); @@ -737,7 +829,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) #define Mul_2 \ Mul_Begin(2) \ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ - Mul_End(2) + Mul_End(1, 1) #define Mul_4 \ Mul_Begin(4) \ @@ -746,7 +838,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \ - Mul_End(4) + Mul_End(5, 3) #define Mul_8 \ Mul_Begin(8) \ @@ -763,7 +855,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \ - Mul_End(8) + Mul_End(13, 7) #define Mul_16 \ Mul_Begin(16) \ @@ -796,7 +888,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \ - Mul_End(16) + Mul_End(29, 15) #define Squ_2 \ Squ_Begin(2) \ @@ -900,6 +992,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \ Bot_End(16) +#if 0 #define Mul_Begin(n) \ Declare2Words(p) \ Declare2Words(c) \ @@ -938,9 +1031,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) #define Bot_End(n) \ R[n-1] = e; - -/* -// this is slower on MSVC 2005 Win32 +#else #define Mul_Begin(n) \ Declare2Words(p) \ word c; \ @@ -950,25 +1041,20 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) AssignWord(d, HighWord(p)) #define Mul_Acc(i, j) \ - MultiplyWords(p, A[i], B[j]) \ - Acc2WordsBy1(p, c) \ - c = LowWord(p); \ - Acc2WordsBy1(d, HighWord(p)) + MulAcc(c, d, A[i], B[j]) #define Mul_SaveAcc(k, i, j) \ R[k] = c; \ - MultiplyWords(p, A[i], B[j]) \ - Acc2WordsBy1(p, LowWord(d)) \ - c = LowWord(p); \ + c = LowWord(d); \ AssignWord(d, HighWord(d)) \ - Acc2WordsBy1(d, HighWord(p)) + MulAcc(c, d, A[i], B[j]) -#define Mul_End(n) \ - R[2*n-3] = c; \ - MultiplyWords(p, A[n-1], B[n-1])\ - Acc2WordsBy2(d, p) \ - R[2*n-2] = LowWord(d); \ - R[2*n-1] = HighWord(d); +#define Mul_End(k, i) \ + R[k] = c; \ + MultiplyWords(p, A[i], B[i]) \ + Acc2WordsBy2(p, d) \ + R[k+1] = LowWord(p); \ + R[k+2] = HighWord(p); #define Bot_SaveAcc(k, i, j) \ R[k] = c; \ @@ -980,52 +1066,45 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N) #define Bot_End(n) \ R[n-1] = c; -*/ +#endif #define Squ_Begin(n) \ Declare2Words(p) \ - Declare2Words(c) \ + word c; \ Declare2Words(d) \ Declare2Words(e) \ MultiplyWords(p, A[0], A[0]) \ R[0] = LowWord(p); \ AssignWord(e, HighWord(p)) \ MultiplyWords(p, A[0], A[1]) \ - AssignWord(c, LowWord(p)) \ + c = LowWord(p); \ AssignWord(d, HighWord(p)) \ Squ_NonDiag \ #define Squ_NonDiag \ - Double2Words(c) \ - Double2Words(d) \ + Double3Words(c, d) #define Squ_SaveAcc(k, i, j) \ - Acc2WordsBy2(c, e) \ - R[k] = LowWord(c); \ - Add2WordsBy1(e, d, HighWord(c)) \ + Acc3WordsBy2(c, d, e) \ + R[k] = c; \ MultiplyWords(p, A[i], A[j]) \ - AssignWord(c, LowWord(p)) \ + c = LowWord(p); \ AssignWord(d, HighWord(p)) \ #define Squ_Acc(i, j) \ - MultiplyWords(p, A[i], A[j]) \ - Acc2WordsBy1(c, LowWord(p)) \ - Acc2WordsBy1(d, HighWord(p)) + MulAcc(c, d, A[i], A[j]) #define Squ_Diag(i) \ Squ_NonDiag \ - MultiplyWords(p, A[i], A[i]) \ - Acc2WordsBy1(c, LowWord(p)) \ - Acc2WordsBy1(d, HighWord(p)) \ + MulAcc(c, d, A[i], A[i]) #define Squ_End(n) \ - Acc2WordsBy2(c, e) \ - R[2*n-3] = LowWord(c); \ - Acc2WordsBy1(d, HighWord(c)) \ + Acc3WordsBy2(c, d, e) \ + R[2*n-3] = c; \ MultiplyWords(p, A[n-1], A[n-1])\ - Acc2WordsBy2(d, p) \ - R[2*n-2] = LowWord(d); \ - R[2*n-1] = HighWord(d); + Acc2WordsBy2(p, e) \ + R[2*n-2] = LowWord(p); \ + R[2*n-1] = HighWord(p); void Baseline_Multiply2(word *R, const word *A, const word *B) { @@ -1072,7 +1151,62 @@ void Baseline_MultiplyBottom8(word *R, const word *A, const word *B) Bot_8 } -/* +#define Top_Begin(n) \ + Declare2Words(p) \ + word c; \ + Declare2Words(d) \ + MultiplyWords(p, A[0], B[n-2]);\ + AssignWord(d, HighWord(p)); + +#define Top_Acc(i, j) \ + MultiplyWords(p, A[i], B[j]);\ + Acc2WordsBy1(d, HighWord(p)); + +#define Top_SaveAcc0(i, j) \ + c = LowWord(d); \ + AssignWord(d, HighWord(d)) \ + MulAcc(c, d, A[i], B[j]) + +#define Top_SaveAcc1(i, j) \ + c = L=2 && N%2==0); -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - if (HasSSE2() && ((N>=8) & (N<=32))) - s_pTop[N/16](R, A, B, L[N-1]); - else -#endif - if (N<=4) - { - s_pMul[N/4](T, A, B); - memcpy(R, T+N, N*WORD_SIZE); - } + if (N <= s_recursionLimit) + s_pTop[N/4](R, A, B, L[N-1]); else { const size_t N2 = N/2; @@ -3076,13 +3234,6 @@ public: memcpy(m_counterAndSeed + 4, seed, seedSize); } - byte GenerateByte() - { - byte b; - GenerateBlock(&b, 1); - return b; - } - void GenerateBlock(byte *output, size_t size) { PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); diff --git a/panama.cpp b/panama.cpp index 89a5aeaa..a60e1670 100644 --- a/panama.cpp +++ b/panama.cpp @@ -26,31 +26,31 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS1( push ebx) + AS_PUSH( bx) #else - AS2( mov ecx, count) - AS2( mov esi, state) - AS2( mov edi, z) - AS2( mov edx, y) + AS2( mov WORD_REG(cx), count) + AS2( mov WORD_REG(si), state) + AS2( mov WORD_REG(di), z) + AS2( mov WORD_REG(dx), y) #endif - AS2( shl ecx, 5) + AS2( shl WORD_REG(cx), 5) ASJ( jz, 5, f) - AS2( mov ebx, [esi+4*17]) - AS2( add ecx, ebx) + AS2( mov ebx, [WORD_REG(si)+4*17]) + AS2( add WORD_REG(cx), WORD_REG(bx)) - AS1( push ebp) - AS1( push ecx) + AS_PUSH( bp) + AS_PUSH( cx) - AS2( movdqa xmm0, [esi+0*16]) - AS2( movdqa xmm1, [esi+1*16]) - AS2( movdqa xmm2, [esi+2*16]) - AS2( movdqa xmm3, [esi+3*16]) - AS2( mov eax, [esi+4*16]) + AS2( movdqa xmm0, [WORD_REG(si)+0*16]) + AS2( movdqa xmm1, [WORD_REG(si)+1*16]) + AS2( movdqa xmm2, [WORD_REG(si)+2*16]) + AS2( movdqa xmm3, [WORD_REG(si)+3*16]) + AS2( mov eax, [WORD_REG(si)+4*16]) ASL(4) // gamma and pi #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test ebx, 1) + AS2( test WORD_REG(bx), 1) ASJ( jnz, 6, f) #endif AS2( movdqa xmm6, xmm2) @@ -81,7 +81,7 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) #define pi(i) \ AS2( movd ecx, xmm7)\ AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ - AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) + AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) #define pi4(x, y, z, a, b, c, d) \ AS2( pcmpeqb xmm7, xmm7)\ @@ -110,65 +110,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 // keystream - AS2( test edi, edi) + AS2( test WORD_REG(di), WORD_REG(di)) ASJ( jz, 0, f) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm2) AS2( punpckhqdq xmm6, xmm2) - AS2( test edx, 0xf) + AS2( test WORD_REG(dx), 0xf) ASJ( jnz, 2, f) - AS2( test edx, edx) + AS2( test WORD_REG(dx), WORD_REG(dx)) ASJ( jz, 1, f) - AS2( pxor xmm4, [edx]) - AS2( pxor xmm6, [edx+16]) - AS2( add edx, 32) + AS2( pxor xmm4, [WORD_REG(dx)]) + AS2( pxor xmm6, [WORD_REG(dx)+16]) + AS2( add WORD_REG(dx), 32) ASJ( jmp, 1, f) ASL(2) - AS2( movdqu xmm0, [edx]) - AS2( movdqu xmm2, [edx+16]) + AS2( movdqu xmm0, [WORD_REG(dx)]) + AS2( movdqu xmm2, [WORD_REG(dx)+16]) AS2( pxor xmm4, xmm0) AS2( pxor xmm6, xmm2) - AS2( add edx, 32) + AS2( add WORD_REG(dx), 32) ASL(1) - AS2( test edi, 0xf) + AS2( test WORD_REG(di), 0xf) ASJ( jnz, 3, f) - AS2( movdqa [edi], xmm4) - AS2( movdqa [edi+16], xmm6) - AS2( add edi, 32) + AS2( movdqa [WORD_REG(di)], xmm4) + AS2( movdqa [WORD_REG(di)+16], xmm6) + AS2( add WORD_REG(di), 32) ASJ( jmp, 0, f) ASL(3) - AS2( movdqu [edi], xmm4) - AS2( movdqu [edi+16], xmm6) - AS2( add edi, 32) + AS2( movdqu [WORD_REG(di)], xmm4) + AS2( movdqu [WORD_REG(di)+16], xmm6) + AS2( add WORD_REG(di), 32) ASL(0) // buffer update - AS2( lea ecx, [ebx + 32]) - AS2( and ecx, 31*32) - AS2( lea ebp, [ebx + (32-24)*32]) - AS2( and ebp, 31*32) + AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32]) + AS2( and WORD_REG(cx), 31*32) + AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32]) + AS2( and WORD_REG(bp), 31*32) - AS2( movdqa xmm0, [esi+20*4+ecx+0*8]) + AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8]) AS2( pxor xmm3, xmm0) ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) - AS2( movdqa [esi+20*4+ecx+0*8], xmm3) - AS2( pxor xmm0, [esi+20*4+ebp+2*8]) - AS2( movdqa [esi+20*4+ebp+2*8], xmm0) + AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3) + AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8]) + AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0) - AS2( movdqa xmm4, [esi+20*4+ecx+2*8]) + AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8]) AS2( pxor xmm1, xmm4) - AS2( movdqa [esi+20*4+ecx+2*8], xmm1) - AS2( pxor xmm4, [esi+20*4+ebp+0*8]) - AS2( movdqa [esi+20*4+ebp+0*8], xmm4) + AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1) + AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8]) + AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4) // theta - AS2( movdqa xmm3, [esi+3*16]) - AS2( movdqa xmm2, [esi+2*16]) - AS2( movdqa xmm1, [esi+1*16]) - AS2( movdqa xmm0, [esi+0*16]) + AS2( movdqa xmm3, [WORD_REG(si)+3*16]) + AS2( movdqa xmm2, [WORD_REG(si)+2*16]) + AS2( movdqa xmm1, [WORD_REG(si)+1*16]) + AS2( movdqa xmm0, [WORD_REG(si)+0*16]) #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE - AS2( test ebx, 1) + AS2( test WORD_REG(bx), 1) ASJ( jnz, 8, f) #endif AS2( movd xmm6, eax) @@ -214,21 +214,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm4) // sigma - AS2( lea ecx, [ebx + (32-4)*32]) - AS2( and ecx, 31*32) - AS2( lea ebp, [ebx + 16*32]) - AS2( and ebp, 31*32) + AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32]) + AS2( and WORD_REG(cx), 31*32) + AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32]) + AS2( and WORD_REG(bp), 31*32) - AS2( movdqa xmm4, [esi+20*4+ecx+0*16]) - AS2( movdqa xmm5, [esi+20*4+ebp+0*16]) + AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16]) + AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) AS2( pxor xmm3, xmm4) AS2( pxor xmm2, xmm6) - AS2( movdqa xmm4, [esi+20*4+ecx+1*16]) - AS2( movdqa xmm5, [esi+20*4+ebp+1*16]) + AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16]) + AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16]) AS2( movdqa xmm6, xmm4) AS2( punpcklqdq xmm4, xmm5) AS2( punpckhqdq xmm6, xmm5) @@ -236,23 +236,22 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) AS2( pxor xmm0, xmm6) // loop - AS2( add ebx, 32) - AS2( cmp ebx, [esp]) + AS2( add WORD_REG(bx), 32) + AS2( cmp WORD_REG(bx), [WORD_REG(sp)]) ASJ( jne, 4, b) // save state - AS2( mov ebp, [esp+4]) - AS2( add esp, 8) - AS2( mov [esi+4*17], ebx) - AS2( mov [esi+4*16], eax) - AS2( movdqa [esi+3*16], xmm3) - AS2( movdqa [esi+2*16], xmm2) - AS2( movdqa [esi+1*16], xmm1) - AS2( movdqa [esi+0*16], xmm0) + AS2( add WORD_REG(sp), WORD_SZ) + AS_POP( bp) + AS2( mov [WORD_REG(si)+4*16], eax) + AS2( movdqa [WORD_REG(si)+3*16], xmm3) + AS2( movdqa [WORD_REG(si)+2*16], xmm2) + AS2( movdqa [WORD_REG(si)+1*16], xmm1) + AS2( movdqa [WORD_REG(si)+0*16], xmm0) ASL(5) #ifdef __GNUC__ - AS1( pop ebx) + AS_POP( bx) ".att_syntax prefix;" : : "c" (count), "S" (state), "D" (z), "d" (y) diff --git a/rijndael.cpp b/rijndael.cpp index 4a8572f2..ac4f7699 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#ifdef CRYPTOPP_X86_ASM_AVAILABLE +#if defined(CRYPTOPP_X86_ASM_AVAILABLE) if (HasMMX()) { const word32 *k = m_key; const word32 *kLoopEnd = k + m_rounds*4; + #if CRYPTOPP_BOOL_X64 + #define K_REG r8 + #define K_END_REG r9 + #define SAVE_K + #define RESTORE_K + #define RESTORE_K_END + #define SAVE_0(x) AS2(mov r10d, x) + #define SAVE_1(x) AS2(mov r11d, x) + #define SAVE_2(x) AS2(mov r12d, x) + #define RESTORE_0(x) AS2(mov x, r10d) + #define RESTORE_1(x) AS2(mov x, r11d) + #define RESTORE_2(x) AS2(mov x, r12d) + #else + #define K_REG esi + #define K_END_REG edi + #define SAVE_K AS2(movd mm4, esi) + #define RESTORE_K AS2(movd esi, mm4) + #define RESTORE_K_END AS2(movd edi, mm5) + #define SAVE_0(x) AS2(movd mm0, x) + #define SAVE_1(x) AS2(movd mm1, x) + #define SAVE_2(x) AS2(movd mm2, x) + #define RESTORE_0(x) AS2(movd x, mm0) + #define RESTORE_1(x) AS2(movd x, mm1) + #define RESTORE_2(x) AS2(movd x, mm2) + #endif #ifdef __GNUC__ word32 t0, t1, t2, t3; __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS1( push ebx) - AS1( push ebp) - AS2( mov ebp, eax) + AS_PUSH( bx) + AS_PUSH( bp) + AS2( mov WORD_REG(bp), WORD_REG(ax)) + #if CRYPTOPP_BOOL_X64 + // save these manually. clobber list doesn't seem to work as of GCC 4.1.0 + AS1( pushq K_REG) + AS1( pushq K_END_REG) + AS1( pushq r10) + AS1( pushq r11) + AS1( pushq r12) + AS2( mov K_REG, rsi) + AS2( mov K_END_REG, rcx) + #else AS2( movd mm5, ecx) + #endif #else + #if _MSC_VER < 1300 + const word32 *t = Te; + AS2( mov eax, t) + #endif AS2( mov edx, g_cacheLineSize) - AS2( mov edi, inBlock) - AS2( mov esi, k) + AS2( mov WORD_REG(di), inBlock) + AS2( mov K_REG, k) AS2( movd mm5, kLoopEnd) - AS1( push ebp) + #if _MSC_VER < 1300 + AS_PUSH( bx) + AS_PUSH( bp) + AS2( mov ebp, eax) + #else + AS_PUSH( bp) AS2( lea ebp, Te) + #endif #endif - AS2( mov eax, [esi+0*4]) // s0 - AS2( xor eax, [edi+0*4]) - AS2( movd mm0, eax) - AS2( mov ebx, [esi+1*4]) - AS2( xor ebx, [edi+1*4]) - AS2( movd mm1, ebx) + AS2( mov eax, [K_REG+0*4]) // s0 + AS2( xor eax, [WORD_REG(di)+0*4]) + SAVE_0(eax) + AS2( mov ebx, [K_REG+1*4]) + AS2( xor ebx, [WORD_REG(di)+1*4]) + SAVE_1(ebx) AS2( and ebx, eax) - AS2( mov eax, [esi+2*4]) - AS2( xor eax, [edi+2*4]) - AS2( movd mm2, eax) + AS2( mov eax, [K_REG+2*4]) + AS2( xor eax, [WORD_REG(di)+2*4]) + SAVE_2(eax) AS2( and ebx, eax) - AS2( mov ecx, [esi+3*4]) - AS2( xor ecx, [edi+3*4]) + AS2( mov ecx, [K_REG+3*4]) + AS2( xor ecx, [WORD_REG(di)+3*4]) AS2( and ebx, ecx) // read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction AS2( and ebx, 0) AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence ASL(2) - AS2( and ebx, [ebp+edi]) + AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [ebp+edi]) + AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [ebp+edi]) + AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( add edi, edx) - AS2( and ebx, [ebp+edi]) + AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( add edi, edx) AS2( cmp edi, 1024) ASJ( jl, 2, b) - AS2( and ebx, [ebp+1020]) + AS2( and ebx, [WORD_REG(bp)+1020]) +#if CRYPTOPP_BOOL_X64 + AS2( xor r10d, ebx) + AS2( xor r11d, ebx) + AS2( xor r12d, ebx) +#else AS2( movd mm6, ebx) AS2( pxor mm2, mm6) AS2( pxor mm1, mm6) AS2( pxor mm0, mm6) +#endif AS2( xor ecx, ebx) - AS2( mov edi, [esi+4*4]) // t0 - AS2( mov eax, [esi+5*4]) - AS2( mov ebx, [esi+6*4]) - AS2( mov edx, [esi+7*4]) - AS2( add esi, 8*4) - AS2( movd mm4, esi) + AS2( mov edi, [K_REG+4*4]) // t0 + AS2( mov eax, [K_REG+5*4]) + AS2( mov ebx, [K_REG+6*4]) + AS2( mov edx, [K_REG+7*4]) + AS2( add K_REG, 8*4) + SAVE_K #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(d, [ebp+0*1024+4*esi])\ + AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(c, [ebp+1*1024+4*esi])\ + AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(b, [ebp+2*1024+4*esi])\ + AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(a, [ebp+3*1024+4*esi]) + AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)]) #define s0 xor edi #define s1 xor eax @@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #define t3 xor edx QUARTER_ROUND(c, t0, t1, t2, t3) - AS2( movd ecx, mm2) + RESTORE_2(ecx) QUARTER_ROUND(c, t3, t0, t1, t2) - AS2( movd ecx, mm1) + RESTORE_1(ecx) QUARTER_ROUND(c, t2, t3, t0, t1) - AS2( movd ecx, mm0) + RESTORE_0(ecx) QUARTER_ROUND(c, t1, t2, t3, t0) - AS2( movd mm2, ebx) - AS2( movd mm1, eax) - AS2( movd mm0, edi) + SAVE_2(ebx) + SAVE_1(eax) + SAVE_0(edi) #undef QUARTER_ROUND - AS2( movd esi, mm4) + RESTORE_K ASL(0) - AS2( mov edi, [esi+0*4]) - AS2( mov eax, [esi+1*4]) - AS2( mov ebx, [esi+2*4]) - AS2( mov ecx, [esi+3*4]) + AS2( mov edi, [K_REG+0*4]) + AS2( mov eax, [K_REG+1*4]) + AS2( mov ebx, [K_REG+2*4]) + AS2( mov ecx, [K_REG+3*4]) #define QUARTER_ROUND(t, a, b, c, d) \ AS2(movzx esi, t##l)\ - AS2(a, [ebp+3*1024+4*esi])\ + AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(b, [ebp+2*1024+4*esi])\ + AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ AS2(shr e##t##x, 16)\ AS2(movzx esi, t##l)\ - AS2(c, [ebp+1*1024+4*esi])\ + AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ AS2(movzx esi, t##h)\ - AS2(d, [ebp+0*1024+4*esi]) + AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)]) QUARTER_ROUND(d, s0, s1, s2, s3) - AS2( movd edx, mm2) + RESTORE_2(edx) QUARTER_ROUND(d, s3, s0, s1, s2) - AS2( movd edx, mm1) + RESTORE_1(edx) QUARTER_ROUND(d, s2, s3, s0, s1) - AS2( movd edx, mm0) + RESTORE_0(edx) QUARTER_ROUND(d, s1, s2, s3, s0) - AS2( movd esi, mm4) - AS2( movd mm2, ebx) - AS2( movd mm1, eax) - AS2( movd mm0, edi) + RESTORE_K + SAVE_2(ebx) + SAVE_1(eax) + SAVE_0(edi) - AS2( mov edi, [esi+4*4]) - AS2( mov eax, [esi+5*4]) - AS2( mov ebx, [esi+6*4]) - AS2( mov edx, [esi+7*4]) + AS2( mov edi, [K_REG+4*4]) + AS2( mov eax, [K_REG+5*4]) + AS2( mov ebx, [K_REG+6*4]) + AS2( mov edx, [K_REG+7*4]) QUARTER_ROUND(c, t0, t1, t2, t3) - AS2( movd ecx, mm2) + RESTORE_2(ecx) QUARTER_ROUND(c, t3, t0, t1, t2) - AS2( movd ecx, mm1) + RESTORE_1(ecx) QUARTER_ROUND(c, t2, t3, t0, t1) - AS2( movd ecx, mm0) + RESTORE_0(ecx) QUARTER_ROUND(c, t1, t2, t3, t0) - AS2( movd mm2, ebx) - AS2( movd mm1, eax) - AS2( movd mm0, edi) + SAVE_2(ebx) + SAVE_1(eax) + SAVE_0(edi) - AS2( movd esi, mm4) - AS2( movd edi, mm5) - AS2( add esi, 8*4) - AS2( movd mm4, esi) - AS2( cmp edi, esi) + RESTORE_K + RESTORE_K_END + AS2( add K_REG, 8*4) + SAVE_K + AS2( cmp K_END_REG, K_REG) ASJ( jne, 0, b) #undef QUARTER_ROUND @@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock #undef t2 #undef t3 - AS2( mov eax, [edi+0*4]) - AS2( mov ecx, [edi+1*4]) - AS2( mov esi, [edi+2*4]) - AS2( mov edi, [edi+3*4]) + AS2( mov eax, [K_END_REG+0*4]) + AS2( mov ecx, [K_END_REG+1*4]) + AS2( mov esi, [K_END_REG+2*4]) + AS2( mov edi, [K_END_REG+3*4]) #define QUARTER_ROUND(a, b, c, d) \ AS2( movzx ebx, dl)\ - AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( shl ebx, 3*8)\ AS2( xor a, ebx)\ AS2( movzx ebx, dh)\ - AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( shl ebx, 2*8)\ AS2( xor b, ebx)\ AS2( shr edx, 16)\ AS2( movzx ebx, dl)\ AS2( shr edx, 8)\ - AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ + AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( shl ebx, 1*8)\ AS2( xor c, ebx)\ - AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\ + AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\ AS2( xor d, ebx) QUARTER_ROUND(eax, ecx, esi, edi) - AS2( movd edx, mm2) + RESTORE_2(edx) QUARTER_ROUND(edi, eax, ecx, esi) - AS2( movd edx, mm1) + RESTORE_1(edx) QUARTER_ROUND(esi, edi, eax, ecx) - AS2( movd edx, mm0) + RESTORE_0(edx) QUARTER_ROUND(ecx, esi, edi, eax) #undef QUARTER_ROUND - AS1( pop ebp) - AS1( emms) +#if CRYPTOPP_BOOL_X64 + AS1(popq r12) + AS1(popq r11) + AS1(popq r10) + AS1(popq K_END_REG) + AS1(popq K_REG) +#else + AS1(emms) +#endif + AS_POP( bp) +#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) + AS_POP( bx) +#endif #ifdef __GNUC__ - AS1( pop ebx) ".att_syntax prefix;" : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) @@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock ((word32 *)outBlock)[2] = t2; ((word32 *)outBlock)[3] = t3; #else - AS2( mov ebx, xorBlock) - AS2( test ebx, ebx) + AS2( mov WORD_REG(bx), xorBlock) + AS2( test WORD_REG(bx), WORD_REG(bx)) ASJ( jz, 1, f) - AS2( xor eax, [ebx+0*4]) - AS2( xor ecx, [ebx+1*4]) - AS2( xor esi, [ebx+2*4]) - AS2( xor edi, [ebx+3*4]) + AS2( xor eax, [WORD_REG(bx)+0*4]) + AS2( xor ecx, [WORD_REG(bx)+1*4]) + AS2( xor esi, [WORD_REG(bx)+2*4]) + AS2( xor edi, [WORD_REG(bx)+3*4]) ASL(1) - AS2( mov ebx, outBlock) - AS2( mov [ebx+0*4], eax) - AS2( mov [ebx+1*4], ecx) - AS2( mov [ebx+2*4], esi) - AS2( mov [ebx+3*4], edi) + AS2( mov WORD_REG(bx), outBlock) + AS2( mov [WORD_REG(bx)+0*4], eax) + AS2( mov [WORD_REG(bx)+1*4], ecx) + AS2( mov [WORD_REG(bx)+2*4], esi) + AS2( mov [WORD_REG(bx)+3*4], edi) #endif } else diff --git a/secblock.h b/secblock.h index cdc67c10..0bc53243 100644 --- a/secblock.h +++ b/secblock.h @@ -130,10 +130,13 @@ public: #endif assert(IsAlignedOn(p, 16)); - return (T*)p; + return (pointer)p; } - return new T[n]; + pointer p; + while (!(p = (pointer)malloc(sizeof(T)*n))) + CallNewHandler(); + return p; } void deallocate(void *p, size_type n) @@ -153,7 +156,7 @@ public: return; } - delete [] (T *)p; + free(p); } pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve) @@ -164,13 +167,19 @@ public: // VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a // template class member called rebind". template struct rebind { typedef AllocatorWithCleanup other; }; +#if _MSC_VER >= 1500 + AllocatorWithCleanup() {} + template AllocatorWithCleanup(const AllocatorWithCleanup &) {} +#endif }; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; -CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; // for Integer +#if CRYPTOPP_BOOL_X86 +CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup; // for Integer +#endif template class NullAllocator : public AllocatorBase @@ -260,7 +269,7 @@ public: size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);} private: - T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(unsigned int)m_array)%16) : m_array;} + T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(size_t)m_array)%16) : m_array;} CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S]; A m_fallbackAllocator; @@ -466,10 +475,10 @@ public: explicit SecBlockWithHint(size_t size) : SecBlock(size) {} }; -template -inline bool operator==(const CryptoPP::AllocatorWithCleanup&, const CryptoPP::AllocatorWithCleanup&) {return (true);} -template -inline bool operator!=(const CryptoPP::AllocatorWithCleanup&, const CryptoPP::AllocatorWithCleanup&) {return (false);} +template +inline bool operator==(const CryptoPP::AllocatorWithCleanup&, const CryptoPP::AllocatorWithCleanup&) {return (true);} +template +inline bool operator!=(const CryptoPP::AllocatorWithCleanup&, const CryptoPP::AllocatorWithCleanup&) {return (false);} NAMESPACE_END diff --git a/sha.cpp b/sha.cpp index 127d1f99..78a850e9 100644 --- a/sha.cpp +++ b/sha.cpp @@ -308,9 +308,9 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN1 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) }; -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86 // put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version -static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) +CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) { #ifdef __GNUC__ __asm__ __volatile__ @@ -319,6 +319,9 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 AS1( push ebx) AS2( mov ebx, eax) #else + AS1( push ebx) + AS1( push esi) + AS1( push edi) AS2( lea ebx, SHA512_K) #endif @@ -486,22 +489,30 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 AS1( pop esp) AS1( emms) -#ifdef __GNUC__ +#if defined(__GNUC__) AS1( pop ebx) ".att_syntax prefix;" : : "a" (SHA512_K), "c" (state), "d" (data) : "%esi", "%edi", "memory", "cc" ); +#else + AS1( pop edi) + AS1( pop esi) + AS1( pop ebx) + AS1( ret) #endif } #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE void SHA512::Transform(word64 *state, const word64 *data) { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86 if (HasSSE2()) - return SHA512_SSE2_Transform(state, data); + { + SHA512_SSE2_Transform(state, data); + return; + } #endif #define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) diff --git a/smartptr.h b/smartptr.h index f5630012..6b4040e9 100644 --- a/smartptr.h +++ b/smartptr.h @@ -189,21 +189,21 @@ template counted_ptr & counted_ptr::operator=(const counted_ptr< template class vector_member_ptrs { public: - vector_member_ptrs(unsigned int size=0) + vector_member_ptrs(size_t size=0) : m_size(size), m_ptr(new member_ptr[size]) {} ~vector_member_ptrs() {delete [] this->m_ptr;} - member_ptr& operator[](unsigned int index) + member_ptr& operator[](size_t index) {assert(indexm_size); return this->m_ptr[index];} - const member_ptr& operator[](unsigned int index) const + const member_ptr& operator[](size_t index) const {assert(indexm_size); return this->m_ptr[index];} - unsigned int size() const {return this->m_size;} - void resize(unsigned int newSize) + size_t size() const {return this->m_size;} + void resize(size_t newSize) { member_ptr *newPtr = new member_ptr[newSize]; - for (unsigned int i=0; im_size && im_size && im_ptr[i].release()); delete [] this->m_ptr; this->m_size = newSize; @@ -214,7 +214,7 @@ private: vector_member_ptrs(const vector_member_ptrs &c); // copy not allowed void operator=(const vector_member_ptrs &x); // assignment not allowed - unsigned int m_size; + size_t m_size; member_ptr *m_ptr; }; diff --git a/sosemanuk.cpp b/sosemanuk.cpp index 816cb981..c86b8773 100755 --- a/sosemanuk.cpp +++ b/sosemanuk.cpp @@ -68,6 +68,10 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv) m_state[1] = b; m_state[2] = e; m_state[3] = d; + +#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1)))) + m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]); + m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7); } static word32 s_mulTables[512] = { @@ -282,10 +286,8 @@ unsigned int SosemanukPolicy::GetAlignment() const else #endif return 1; -#endif } -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 unsigned int SosemanukPolicy::GetOptimalBlockSize() const { #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE @@ -316,54 +318,54 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS1( push ebx) + AS_PUSH( bx) #else word32 *state = m_state; - AS2( mov eax, state) - AS2( mov edi, output) - AS2( mov edx, input) - AS2( mov ecx, iterationCount) + AS2( mov WORD_REG(ax), state) + AS2( mov WORD_REG(di), output) + AS2( mov WORD_REG(dx), input) + AS2( mov WORD_REG(cx), iterationCount) #endif -#define SSE2_output DWORD PTR [esp+1*4] -#define SSE2_input DWORD PTR [esp+2*4] -#define SSE2_wordsLeft DWORD PTR [esp+3*4] -#define SSE2_ediEnd DWORD PTR [esp+4*4] -#define SSE2_pMulTables DWORD PTR [esp+5*4] -#define SSE2_state DWORD PTR [esp+6*4] -#define SSE2_wordsLeft2 DWORD PTR [esp+7*4] -#define SSE2_stateCopy esp + 8*4 +#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ] +#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ] +#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ] +#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ] +#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ] +#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ] +#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ] +#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ #define SSE2_uvStart SSE2_stateCopy + 12*4 - AS1( push ebp) - AS2( mov ebx, esp) - AS2( and esp, 0xfffffff0) - AS2( sub esp, 80*4*2+12*4+8*4) // 80 v's, 80 u's, 12 state, 8 locals - AS2( mov [esp], ebx) - AS2( mov SSE2_output, edi) - AS2( mov SSE2_input, edx) - AS2( mov SSE2_state, eax) + AS_PUSH( bp) + AS2( mov WORD_REG(bx), WORD_REG(sp)) + AS2( and WORD_REG(sp), -16) + AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals + AS2( mov [WORD_REG(sp)], WORD_REG(bx)) + AS2( mov SSE2_output, WORD_REG(di)) + AS2( mov SSE2_input, WORD_REG(dx)) + AS2( mov SSE2_state, WORD_REG(ax)) #ifndef _MSC_VER - AS2( mov SSE2_pMulTables, esi) + AS2( mov SSE2_pMulTables, WORD_REG(si)) #endif - AS2( lea ecx, [4*ecx+ecx]) - AS2( lea esi, [4*ecx]) - AS2( mov SSE2_wordsLeft, esi) - AS2( movdqa xmm0, [eax+0*16]) // copy state to stack to save a register + AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)]) + AS2( lea WORD_REG(si), [4*WORD_REG(cx)]) + AS2( mov SSE2_wordsLeft, WORD_REG(si)) + AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register AS2( movdqa [SSE2_stateCopy+0*16], xmm0) - AS2( movdqa xmm0, [eax+1*16]) + AS2( movdqa xmm0, [WORD_REG(ax)+1*16]) AS2( movdqa [SSE2_stateCopy+1*16], xmm0) - AS2( movq xmm0, QWORD PTR [eax+2*16]) + AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16]) AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0) AS2( psrlq xmm0, 32) AS2( movd ebx, xmm0) // s(9) - AS2( mov ecx, [eax+10*4]) - AS2( mov edx, [eax+11*4]) + AS2( mov ecx, [WORD_REG(ax)+10*4]) + AS2( mov edx, [WORD_REG(ax)+11*4]) AS2( pcmpeqb xmm7, xmm7) // all ones #define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4 -#define u(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 -#define v(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 +#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 +#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 #define r10 ecx #define r11 edx @@ -371,42 +373,42 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #define r21 ecx #define SSE2_STEP(i, j) \ - AS2( mov eax, [s(i+3)])\ - AS2( mov ebp, 1)\ - AS2( and ebp, r1##j)\ - AS1( neg ebp)\ - AS2( and ebp, [s(i+8)])\ - AS2( xor ebp, [s(i+1)])\ - AS2( add r2##j, ebp)\ - AS2( movzx ebp, al)\ - AS2( shr eax, 8)\ - AS2( xor eax, [esi+1024+ebp*4])\ - AS2( lea ebp, [ebx + r2##j])\ - AS2( xor ebx, eax)\ - AS2( imul r1##j, 0x54655307)\ AS2( mov eax, [s(i+0)])\ AS2( mov [v(i)], eax)\ AS2( rol eax, 8)\ - AS2( xor ebx, eax)\ - AS2( movzx eax, al)\ - AS2( rol r1##j, 7)\ - AS2( xor ebx, [esi+eax*4])\ + AS2( lea ebp, [ebx + r2##j])\ AS2( xor ebp, r1##j)\ AS2( mov [u(i)], ebp)\ + AS2( mov ebp, 1)\ + AS2( and ebp, r2##j)\ + AS1( neg ebp)\ + AS2( and ebp, ebx)\ + AS2( xor ebx, eax)\ + AS2( movzx eax, al)\ + AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\ + AS2( mov eax, [s(i+3)])\ + AS2( xor ebp, [s(i+2)])\ + AS2( add r1##j, ebp)\ + AS2( movzx ebp, al)\ + AS2( shr eax, 8)\ + AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\ + AS2( xor ebx, eax)\ + AS2( imul r2##j, 0x54655307)\ + AS2( rol r2##j, 7)\ AS2( mov [s(i+0)], ebx)\ ASL(2) // outer loop, each iteration of this processes 80 words - AS2( lea edi, [SSE2_uvStart]) // start of v and u - AS2( mov eax, 80) - AS2( cmp esi, 80) - AS2( cmovg esi, eax) - AS2( mov SSE2_wordsLeft2, esi) - AS2( lea esi, [edi+esi]) // use to first inner loop - AS2( mov SSE2_ediEnd, esi) + AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u + AS2( mov WORD_REG(ax), 80) + AS2( cmp WORD_REG(si), 80) + AS2( cmovg WORD_REG(si), WORD_REG(ax)) + AS2( mov SSE2_wordsLeft2, WORD_REG(si)) + AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop + AS2( mov SSE2_diEnd, WORD_REG(si)) #ifdef _MSC_VER - AS2( lea esi, s_mulTables) + AS2( lea WORD_REG(si), s_mulTables) #else - AS2( mov esi, SSE2_pMulTables) + AS2( mov WORD_REG(si), SSE2_pMulTables) #endif ASL(0) // first inner loop, 20 words each, 4 iterations @@ -431,20 +433,20 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu SSE2_STEP(18, 0) SSE2_STEP(19, 1) // loop - AS2( add edi, 5*4) - AS2( cmp edi, SSE2_ediEnd) + AS2( add WORD_REG(di), 5*4) + AS2( cmp WORD_REG(di), SSE2_diEnd) ASJ( jne, 0, b) - AS2( mov eax, SSE2_input) - AS2( mov ebp, SSE2_output) - AS2( lea edi, [SSE2_uvStart]) // start of v and u - AS2( mov esi, SSE2_wordsLeft2) + AS2( mov WORD_REG(ax), SSE2_input) + AS2( mov WORD_REG(bp), SSE2_output) + AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u + AS2( mov WORD_REG(si), SSE2_wordsLeft2) ASL(1) // second inner loop, 16 words each, 5 iterations - AS2( movdqa xmm0, [edi+0*20*4]) - AS2( movdqa xmm1, [edi+1*20*4]) - AS2( movdqa xmm2, [edi+2*20*4]) - AS2( movdqa xmm3, [edi+3*20*4]) + AS2( movdqa xmm0, [WORD_REG(di)+0*20*4]) + AS2( movdqa xmm2, [WORD_REG(di)+2*20*4]) + AS2( movdqa xmm3, [WORD_REG(di)+3*20*4]) + AS2( movdqa xmm1, [WORD_REG(di)+1*20*4]) // S2 AS2( movdqa xmm4, xmm0) AS2( pand xmm0, xmm2) @@ -463,13 +465,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( pxor xmm1, xmm4) AS2( pxor xmm4, xmm7) // xor with v - AS2( pxor xmm2, [edi+80*4]) - AS2( pxor xmm3, [edi+80*5]) - AS2( pxor xmm1, [edi+80*6]) - AS2( pxor xmm4, [edi+80*7]) + AS2( pxor xmm2, [WORD_REG(di)+80*4]) + AS2( pxor xmm3, [WORD_REG(di)+80*5]) + AS2( pxor xmm1, [WORD_REG(di)+80*6]) + AS2( pxor xmm4, [WORD_REG(di)+80*7]) // exit loop early if less than 16 words left to output // this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop - AS2( cmp esi, 16) + AS2( cmp WORD_REG(si), 16) ASJ( jl, 4, f) // unpack AS2( movdqa xmm6, xmm2) @@ -485,75 +487,75 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu AS2( punpcklqdq xmm6, xmm5) AS2( punpckhqdq xmm3, xmm5) // output keystream - AS2( test eax, eax) + AS2( test WORD_REG(ax), WORD_REG(ax)) ASJ( jz, 3, f) AS2( test eax, 0xf) ASJ( jnz, 7, f) - AS2( pxor xmm2, [eax+0*16]) - AS2( pxor xmm0, [eax+1*16]) - AS2( pxor xmm6, [eax+2*16]) - AS2( pxor xmm3, [eax+3*16]) - AS2( add eax, 4*16) + AS2( pxor xmm2, [WORD_REG(ax)+0*16]) + AS2( pxor xmm0, [WORD_REG(ax)+1*16]) + AS2( pxor xmm6, [WORD_REG(ax)+2*16]) + AS2( pxor xmm3, [WORD_REG(ax)+3*16]) + AS2( add WORD_REG(ax), 4*16) ASJ( jmp, 3, f) ASL(7) - AS2( movdqu xmm1, [eax+0*16]) + AS2( movdqu xmm1, [WORD_REG(ax)+0*16]) AS2( pxor xmm2, xmm1) - AS2( movdqu xmm1, [eax+1*16]) + AS2( movdqu xmm1, [WORD_REG(ax)+1*16]) AS2( pxor xmm0, xmm1) - AS2( movdqu xmm1, [eax+2*16]) + AS2( movdqu xmm1, [WORD_REG(ax)+2*16]) AS2( pxor xmm6, xmm1) - AS2( movdqu xmm1, [eax+3*16]) + AS2( movdqu xmm1, [WORD_REG(ax)+3*16]) AS2( pxor xmm3, xmm1) - AS2( add eax, 4*16) + AS2( add WORD_REG(ax), 4*16) ASL(3) AS2( test ebp, 0xf) ASJ( jnz, 8, f) - AS2( movdqa [ebp+0*16], xmm2) - AS2( movdqa [ebp+1*16], xmm0) - AS2( movdqa [ebp+2*16], xmm6) - AS2( movdqa [ebp+3*16], xmm3) + AS2( movdqa [WORD_REG(bp)+0*16], xmm2) + AS2( movdqa [WORD_REG(bp)+1*16], xmm0) + AS2( movdqa [WORD_REG(bp)+2*16], xmm6) + AS2( movdqa [WORD_REG(bp)+3*16], xmm3) ASJ( jmp, 9, f) ASL(8) - AS2( movdqu [ebp+0*16], xmm2) - AS2( movdqu [ebp+1*16], xmm0) - AS2( movdqu [ebp+2*16], xmm6) - AS2( movdqu [ebp+3*16], xmm3) + AS2( movdqu [WORD_REG(bp)+0*16], xmm2) + AS2( movdqu [WORD_REG(bp)+1*16], xmm0) + AS2( movdqu [WORD_REG(bp)+2*16], xmm6) + AS2( movdqu [WORD_REG(bp)+3*16], xmm3) ASL(9) // loop - AS2( add edi, 4*4) - AS2( add ebp, 4*16) - AS2( sub esi, 16) + AS2( add WORD_REG(di), 4*4) + AS2( add WORD_REG(bp), 4*16) + AS2( sub WORD_REG(si), 16) ASJ( jnz, 1, b) // outer loop - AS2( mov esi, SSE2_wordsLeft) - AS2( sub esi, 80) + AS2( mov WORD_REG(si), SSE2_wordsLeft) + AS2( sub WORD_REG(si), 80) ASJ( jz, 6, f) - AS2( mov SSE2_wordsLeft, esi) - AS2( mov SSE2_input, eax) - AS2( mov SSE2_output, ebp) + AS2( mov SSE2_wordsLeft, WORD_REG(si)) + AS2( mov SSE2_input, WORD_REG(ax)) + AS2( mov SSE2_output, WORD_REG(bp)) ASJ( jmp, 2, b) ASL(4) // final output of less than 16 words - AS2( test eax, eax) + AS2( test WORD_REG(ax), WORD_REG(ax)) ASJ( jz, 5, f) - AS2( movd xmm0, [eax+0*4]) + AS2( movd xmm0, [WORD_REG(ax)+0*4]) AS2( pxor xmm2, xmm0) - AS2( movd xmm0, [eax+1*4]) + AS2( movd xmm0, [WORD_REG(ax)+1*4]) AS2( pxor xmm3, xmm0) - AS2( movd xmm0, [eax+2*4]) + AS2( movd xmm0, [WORD_REG(ax)+2*4]) AS2( pxor xmm1, xmm0) - AS2( movd xmm0, [eax+3*4]) + AS2( movd xmm0, [WORD_REG(ax)+3*4]) AS2( pxor xmm4, xmm0) - AS2( add eax, 16) + AS2( add WORD_REG(ax), 16) ASL(5) - AS2( movd [ebp+0*4], xmm2) - AS2( movd [ebp+1*4], xmm3) - AS2( movd [ebp+2*4], xmm1) - AS2( movd [ebp+3*4], xmm4) - AS2( sub esi, 4) + AS2( movd [WORD_REG(bp)+0*4], xmm2) + AS2( movd [WORD_REG(bp)+1*4], xmm3) + AS2( movd [WORD_REG(bp)+2*4], xmm1) + AS2( movd [WORD_REG(bp)+3*4], xmm4) + AS2( sub WORD_REG(si), 4) ASJ( jz, 6, f) - AS2( add ebp, 16) + AS2( add WORD_REG(bp), 16) AS2( psrldq xmm2, 4) AS2( psrldq xmm3, 4) AS2( psrldq xmm1, 4) @@ -561,26 +563,26 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu ASJ( jmp, 4, b) ASL(6) // save state - AS2( mov ebx, SSE2_state) + AS2( mov WORD_REG(bx), SSE2_state) AS2( movdqa xmm0, [SSE2_stateCopy+0*16]) - AS2( movdqa [ebx+0*16], xmm0) + AS2( movdqa [WORD_REG(bx)+0*16], xmm0) AS2( movdqa xmm0, [SSE2_stateCopy+1*16]) - AS2( movdqa [ebx+1*16], xmm0) + AS2( movdqa [WORD_REG(bx)+1*16], xmm0) AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16]) - AS2( movq QWORD PTR [ebx+2*16], xmm0) - AS2( mov [ebx+10*4], ecx) - AS2( mov [ebx+11*4], edx) + AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0) + AS2( mov [WORD_REG(bx)+10*4], ecx) + AS2( mov [WORD_REG(bx)+11*4], edx) - AS1( pop esp) - AS1( pop ebp) + AS_POP( sp) + AS_POP( bp) #ifdef __GNUC__ - AS1( pop ebx) - ".att_syntax prefix;" - : - : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) - : "memory", "cc" - ); + AS_POP( bx) + ".att_syntax prefix;" + : + : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) + : "memory", "cc" + ); #endif } else @@ -593,17 +595,16 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu #endif #define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)]) -#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1)))) #define r1(i) ((i%2) ? reg2 : reg1) #define r2(i) ((i%2) ? reg1 : reg2) #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \ - r2(x0) += XMUX(r1(x0), s##x1, s##x8);\ - r1(x0) = rotlFixed(r1(x0) * 0x54655307, 7);\ - v = s##x0;\ u = (s##x9 + r2(x0)) ^ r1(x0);\ - s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9; + v = s##x0;\ + s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\ + r1(x0) += XMUX(r2(x0), s##x2, s##x9);\ + r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\ #define SOSEMANUK_OUTPUT(x) \ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\ diff --git a/tiger.cpp b/tiger.cpp index 332de2c6..87ec74f4 100644 --- a/tiger.cpp +++ b/tiger.cpp @@ -34,7 +34,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size) void Tiger::Transform (word64 *digest, const word64 *X) { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86 if (HasSSE2()) { #ifdef __GNUC__ @@ -43,9 +43,14 @@ void Tiger::Transform (word64 *digest, const word64 *X) ".intel_syntax noprefix;" AS1( push ebx) #else + #if _MSC_VER < 1300 + const word64 *t = table; + AS2( mov edx, t) + #else + AS2( lea edx, [table]) + #endif AS2( mov eax, digest) AS2( mov esi, X) - AS2( lea edx, [table]) #endif AS2( movq mm0, [eax]) AS2( movq mm1, [eax+1*8]) diff --git a/whrlpool.cpp b/whrlpool.cpp index da19d7ff..20e721e8 100644 --- a/whrlpool.cpp +++ b/whrlpool.cpp @@ -390,7 +390,7 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTIO // Whirlpool basic transformation. Transforms state based on block. void Whirlpool::Transform(word64 *digest, const word64 *block) { -#ifdef CRYPTOPP_X86_ASM_AVAILABLE +#if defined(CRYPTOPP_X86_ASM_AVAILABLE) if (HasMMX()) { // MMX version has the same structure as C version below @@ -398,26 +398,29 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) __asm__ __volatile__ ( ".intel_syntax noprefix;" - AS1( push ebx) - AS2( mov ebx, eax) + AS_PUSH( bx) + AS2( mov WORD_REG(bx), WORD_REG(ax)) #else - AS2( lea ebx, [Whirlpool_C]) - AS2( mov ecx, digest) - AS2( mov edx, block) + #if _MSC_VER < 1300 + AS_PUSH( bx) + #endif + AS2( lea WORD_REG(bx), [Whirlpool_C]) + AS2( mov WORD_REG(cx), digest) + AS2( mov WORD_REG(dx), block) #endif - AS2( mov eax, esp) - AS2( and esp, 0xfffffff0) - AS2( sub esp, 16*8) - AS1( push eax) + AS2( mov WORD_REG(ax), WORD_REG(sp)) + AS2( and WORD_REG(sp), -16) + AS2( sub WORD_REG(sp), 16*8) + AS_PUSH( ax) AS2( xor esi, esi) ASL(0) - AS2( movq mm0, [ecx+8*esi]) - AS2( movq [esp+4+8*esi], mm0) // k - AS2( pxor mm0, [edx+8*esi]) - AS2( movq [esp+4+64+8*esi], mm0) // s - AS2( movq [ecx+8*esi], mm0) - AS1( inc esi) - AS2( cmp esi, 8) + AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)]) + AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k + AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)]) + AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s + AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0) + AS1( inc WORD_REG(si)) + AS2( cmp WORD_REG(si), 8) ASJ( jne, 0, b) AS2( xor esi, esi) @@ -427,16 +430,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #define KSL1(a, b) AS2(pxor mm##a, b) #define KSL(op, i, a, b, c, d) \ - AS2(mov eax, [esp+4+8*i])\ + AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [ebx+3*2048+8*edi])\ + KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [ebx+2*2048+8*edi])\ + KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [ebx+1*2048+8*edi])\ - KSL##op(d, [ebx+0*2048+8*eax]) + KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) #define KSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ @@ -445,57 +448,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) AS2(pxor mm##a, b) #define KSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [esp+4+8*a], mm##a) + AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a) #define KSH(op, i, a, b, c, d) \ - AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - KSH##op(a, [ebx+3*2048+8*edi])\ + KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSH##op(b, [ebx+2*2048+8*edi])\ + KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSH##op(c, [ebx+1*2048+8*edi])\ - KSH##op(d, [ebx+0*2048+8*eax]) + KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ + KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) #define TSL(op, i, a, b, c, d) \ - AS2(mov eax, [esp+4+64+8*i])\ + AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\ AS2(movzx edi, al)\ - KSL##op(a, [ebx+3*2048+8*edi])\ + KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - KSL##op(b, [ebx+2*2048+8*edi])\ + KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - KSL##op(c, [ebx+1*2048+8*edi])\ - KSL##op(d, [ebx+0*2048+8*eax]) + KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ + KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) #define TSH0(a, b) \ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ - AS2(pxor mm##a, [esp+4+8*a])\ + AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\ AS2(pxor mm##a, b) #define TSH1(a, b) \ AS2(pxor mm##a, b) #define TSH2(a, b) \ AS2(pxor mm##a, b)\ - AS2(movq [esp+4+64+8*a], mm##a) + AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a) #define TSH3(a, b) \ AS2(pxor mm##a, b)\ - AS2(pxor mm##a, [ecx+8*a])\ - AS2(movq [ecx+8*a], mm##a) + AS2(pxor mm##a, [WORD_REG(cx)+8*a])\ + AS2(movq [WORD_REG(cx)+8*a], mm##a) #define TSH(op, i, a, b, c, d) \ - AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\ + AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\ AS2(movzx edi, al)\ - TSH##op(a, [ebx+3*2048+8*edi])\ + TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ AS2(movzx edi, ah)\ - TSH##op(b, [ebx+2*2048+8*edi])\ + TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ AS2(shr eax, 16)\ AS2(movzx edi, al)\ AS2(shr eax, 8)\ - TSH##op(c, [ebx+1*2048+8*edi])\ - TSH##op(d, [ebx+0*2048+8*eax]) + TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ + TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) KSL(0, 4, 3, 2, 1, 0) KSL(0, 0, 7, 6, 5, 4) @@ -514,8 +517,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) KSH(2, 3, 2, 1, 0, 7) KSH(2, 7, 6, 5, 4, 3) - AS2( pxor mm0, [ebx + 8*1024 + esi*8]) - AS2( movq [esp+4], mm0) + AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8]) + AS2( movq [WORD_REG(sp)+WORD_SZ], mm0) TSL(0, 4, 3, 2, 1, 0) TSL(0, 0, 7, 6, 5, 4) @@ -532,8 +535,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) TSH(1, 5, 4, 3, 2, 1) TSH(1, 6, 5, 4, 3, 2) - AS1( inc esi) - AS2( cmp esi, 10) + AS1( inc WORD_REG(si)) + AS2( cmp WORD_REG(si), 10) ASJ( je, 2, f) TSH(2, 3, 2, 1, 0, 7) @@ -550,11 +553,13 @@ void Whirlpool::Transform(word64 *digest, const word64 *block) #undef TSL #undef TSH + AS_POP( sp) AS1( emms) - AS1( pop esp) +#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) + AS_POP( bx) +#endif #ifdef __GNUC__ - AS1( pop ebx) ".att_syntax prefix;" : : "a" (Whirlpool_C), "c" (digest), "d" (block) diff --git a/x64masm.asm b/x64masm.asm index 4102c6a7..76676a77 100755 --- a/x64masm.asm +++ b/x64masm.asm @@ -1,80 +1,55 @@ PUBLIC Baseline_Add PUBLIC Baseline_Sub .CODE - ALIGN 8 + ALIGN 8 Baseline_Add PROC - lea rdx, [rdx+8*rcx] lea r8, [r8+8*rcx] lea r9, [r9+8*rcx] - neg rcx ; rcx is negative index - test rcx, 2 ; this clears carry flag - jz $0@Baseline_Add - sub rcx, 2 - jmp $1@Baseline_Add - -$0@Baseline_Add: - jrcxz $2@Baseline_Add ; loop until rcx overflows and becomes zero + jz $1@Baseline_Add mov rax,[r8+8*rcx] - adc rax,[r9+8*rcx] + add rax,[r9+8*rcx] mov [rdx+8*rcx],rax +$0@Baseline_Add: mov rax,[r8+8*rcx+8] adc rax,[r9+8*rcx+8] mov [rdx+8*rcx+8],rax -$1@Baseline_Add: - mov rax,[r8+8*rcx+16] - adc rax,[r9+8*rcx+16] - mov [rdx+8*rcx+16],rax - mov rax,[r8+8*rcx+24] - adc rax,[r9+8*rcx+24] - mov [rdx+8*rcx+24],rax - - lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2 + lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2 + jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero + mov rax,[r8+8*rcx] + adc rax,[r9+8*rcx] + mov [rdx+8*rcx],rax jmp $0@Baseline_Add - -$2@Baseline_Add: +$1@Baseline_Add: mov rax, 0 - setc al ; store carry into rax (return result register) - + adc rax, rax ; store carry into rax (return result register) ret Baseline_Add ENDP - ALIGN 8 + ALIGN 8 Baseline_Sub PROC - lea rdx, [rdx+8*rcx] lea r8, [r8+8*rcx] lea r9, [r9+8*rcx] - neg rcx ; rcx is negative index - test rcx, 2 ; this clears carry flag - jz $0@Baseline_Sub - sub rcx, 2 - jmp $1@Baseline_Sub - -$0@Baseline_Sub: - jrcxz $2@Baseline_Sub ; loop until rcx overflows and becomes zero + jz $1@Baseline_Sub mov rax,[r8+8*rcx] - sbb rax,[r9+8*rcx] + sub rax,[r9+8*rcx] mov [rdx+8*rcx],rax +$0@Baseline_Sub: mov rax,[r8+8*rcx+8] sbb rax,[r9+8*rcx+8] mov [rdx+8*rcx+8],rax -$1@Baseline_Sub: - mov rax,[r8+8*rcx+16] - sbb rax,[r9+8*rcx+16] - mov [rdx+8*rcx+16],rax - mov rax,[r8+8*rcx+24] - sbb rax,[r9+8*rcx+24] - mov [rdx+8*rcx+24],rax - - lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2 + lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2 + jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero + mov rax,[r8+8*rcx] + sbb rax,[r9+8*rcx] + mov [rdx+8*rcx],rax jmp $0@Baseline_Sub - -$2@Baseline_Sub: +$1@Baseline_Sub: mov rax, 0 - setc al ; store carry into rax (return result register) + adc rax, rax ; store carry into rax (return result register) ret Baseline_Sub ENDP