fix compile for x64, DLL and VC 6

pull/2/head
weidai 2007-05-04 15:24:09 +00:00
parent 460c2d6c6a
commit d2510f30c7
14 changed files with 810 additions and 553 deletions

View File

@ -228,7 +228,7 @@ void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBloc
SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1)) SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1))
SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3)) SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3))
for (unsigned int i = m_rounds-1; i > 0; --i) for (i = m_rounds-1; i > 0; --i)
{ {
DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3)) DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3)) DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3))

29
cpu.cpp
View File

@ -1,8 +1,10 @@
// cpu.cpp - written and placed in the public domain by Wei Dai // cpu.cpp - written and placed in the public domain by Wei Dai
#include "pch.h" #include "pch.h"
#include "cpu.h"
#ifndef CRYPTOPP_IMPORTS
#include "cpu.h"
#include "misc.h" #include "misc.h"
#include <algorithm> #include <algorithm>
@ -11,10 +13,15 @@
#include <setjmp.h> #include <setjmp.h>
#endif #endif
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_X86_ASM_AVAILABLE #ifdef CRYPTOPP_X86_ASM_AVAILABLE
#ifndef _MSC_VER
typedef void (*SigHandler)(int); typedef void (*SigHandler)(int);
static jmp_buf s_jmpNoCPUID; static jmp_buf s_jmpNoCPUID;
@ -22,6 +29,7 @@ static void SigIllHandlerCPUID(int)
{ {
longjmp(s_jmpNoCPUID, 1); longjmp(s_jmpNoCPUID, 1);
} }
#endif
bool CpuId(word32 input, word32 *output) bool CpuId(word32 input, word32 *output)
{ {
@ -57,7 +65,11 @@ bool CpuId(word32 input, word32 *output)
__asm__ __asm__
( (
// save ebx in case -fPIC is being used // save ebx in case -fPIC is being used
#if CRYPTOPP_BOOL_X86
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
#else
"pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
#endif
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
: "a" (input) : "a" (input)
); );
@ -84,22 +96,19 @@ bool CpuId(word32 input, word32 *output)
return true; return true;
} }
inline bool TrySSE2()
{
return true;
}
#endif #endif
#ifdef CRYPTOPP_CPUID_AVAILABLE #ifdef CRYPTOPP_CPUID_AVAILABLE
static bool TrySSE2() static bool TrySSE2()
{ {
#ifdef _MSC_VER #if CRYPTOPP_BOOL_X64
return true;
#elif defined(_MSC_VER)
__try __try
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
__asm por xmm0, xmm0 // executing SSE2 instruction AS2(por xmm0, xmm0) // executing SSE2 instruction
#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE #elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
__mm128i x = _mm_setzero_si128(); __mm128i x = _mm_setzero_si128();
return _mm_cvtsi128_si32(x) == 0; return _mm_cvtsi128_si32(x) == 0;
@ -137,7 +146,7 @@ static bool TrySSE2()
bool g_x86DetectionDone = false; bool g_x86DetectionDone = false;
bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false; bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false;
int g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
void DetectX86Features() void DetectX86Features()
{ {
@ -170,3 +179,5 @@ void DetectX86Features()
#endif #endif
NAMESPACE_END NAMESPACE_END
#endif

33
cpu.h
View File

@ -3,6 +3,10 @@
#include "config.h" #include "config.h"
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64) #if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
@ -10,12 +14,15 @@ NAMESPACE_BEGIN(CryptoPP)
#define CRYPTOPP_CPUID_AVAILABLE #define CRYPTOPP_CPUID_AVAILABLE
// these should not be used directly // these should not be used directly
extern bool g_x86DetectionDone; extern CRYPTOPP_DLL bool g_x86DetectionDone;
extern bool g_hasSSE2, g_hasMMX, g_hasSSSE3, g_isP4; extern CRYPTOPP_DLL bool g_hasSSE2;
extern int g_cacheLineSize; extern CRYPTOPP_DLL bool g_hasMMX;
void DetectX86Features(); extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_isP4;
extern CRYPTOPP_DLL word32 g_cacheLineSize;
CRYPTOPP_DLL void DetectX86Features();
bool CpuId(word32 input, word32 *output); CRYPTOPP_DLL bool CpuId(word32 input, word32 *output);
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
inline bool HasSSE2() {return true;} inline bool HasSSE2() {return true;}
@ -94,6 +101,7 @@ inline bool HasMMX() {return false;}
#define ASL(x) GNU_ASL(x) #define ASL(x) GNU_ASL(x)
#define ASJ(x, y, z) GNU_ASJ(x, y, z) #define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";" #define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#else #else
#define AS1(x) __asm {x} #define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y} #define AS2(x, y) __asm {x, y}
@ -102,11 +110,26 @@ inline bool HasMMX() {return false;}
#define ASL(x) __asm {label##x:} #define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y} #define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y} #define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#endif #endif
// GNU assembler doesn't seem to have mod operator // GNU assembler doesn't seem to have mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) #define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
// Native-word abstractions so one inline-assembly listing can serve both
// 32-bit and 64-bit x86 builds:
//   WORD_SZ     - size of a native word in bytes (4 or 8)
//   WORD_REG(x) - pastes the width prefix onto a register stem (ax -> eax / rax)
//   WORD_PTR    - memory-operand size keyword matching the native word
//   AS_PUSH(x) / AS_POP(x) - push / pop the native-width register named by stem x
// Nothing is defined when neither CRYPTOPP_BOOL_X86 nor CRYPTOPP_BOOL_X64 is set.
#if CRYPTOPP_BOOL_X86
#define WORD_SZ 4
#define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR
#define AS_PUSH(x) AS1(push e##x)
#define AS_POP(x) AS1(pop e##x)
#elif CRYPTOPP_BOOL_X64
#define WORD_SZ 8
#define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR
// the 64-bit variants spell the mnemonic with an explicit q suffix
#define AS_PUSH(x) AS1(pushq r##x)
#define AS_POP(x) AS1(popq r##x)
#endif
NAMESPACE_END NAMESPACE_END
#endif #endif

View File

@ -5,14 +5,14 @@
#include "randpool.h" #include "randpool.h"
#include "files.h" #include "files.h"
#include "trunhash.h" #include "trunhash.h"
#include "queue.h"
#include "validate.h"
#include <iostream> #include <iostream>
#include <memory> #include <memory>
USING_NAMESPACE(CryptoPP) USING_NAMESPACE(CryptoPP)
USING_NAMESPACE(std) USING_NAMESPACE(std)
RandomPool & GlobalRNG();
typedef std::map<std::string, std::string> TestData; typedef std::map<std::string, std::string> TestData;
class TestFailure : public Exception class TestFailure : public Exception
@ -67,7 +67,7 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
s1 = s1.substr(s1.find(' ')+1); s1 = s1.substr(s1.find(' ')+1);
} }
s2.clear(); s2 = ""; // MSVC 6 doesn't have clear();
if (s1[0] == '\"') if (s1[0] == '\"')
{ {
@ -85,8 +85,13 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
s1 = s1.substr(STDMIN(s1.find(' '), s1.length())); s1 = s1.substr(STDMIN(s1.find(' '), s1.length()));
} }
ByteQueue q;
while (repeat--) while (repeat--)
target.Put((const byte *)s2.data(), s2.size()); {
q.Put((const byte *)s2.data(), s2.size());
if (q.MaxRetrievable() > 4*1024 || repeat == 0)
q.TransferTo(target);
}
} }
} }

View File

@ -18,7 +18,7 @@
#include <iostream> #include <iostream>
#if defined(_MSC_VER) && _MSC_VER >= 1400 #if _MSC_VER >= 1400
#include <intrin.h> #include <intrin.h>
#endif #endif
@ -30,6 +30,8 @@
#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.") #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
#endif #endif
#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt) bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
@ -99,7 +101,36 @@ static word AtomicInverseModPower2(word A)
// ******************************************************** // ********************************************************
#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE #if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64
#define Declare2Words(x) word x##0, x##1;
#define AssignWord(a, b) a##0 = b; a##1 = 0;
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
#define LowWord(a) a##0
#define HighWord(a) a##1
#ifdef _MSC_VER
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
#define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
#elif defined(__DECCXX)
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
#elif CRYPTOPP_BOOL_X64
#define MultiplyWords(p, a, b) asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc");
#define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
#define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
#define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
#define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
#define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
#endif
#ifndef Double3Words
#define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
#endif
#ifndef Acc2WordsBy2
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
#endif
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
#define GetCarry(u) u##1
#define GetBorrow(u) u##1
#else
#define Declare2Words(x) dword x; #define Declare2Words(x) dword x;
#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
#define MultiplyWords(p, a, b) p = __emulu(a, b); #define MultiplyWords(p, a, b) p = __emulu(a, b);
@ -108,34 +139,23 @@ static word AtomicInverseModPower2(word A)
#endif #endif
#define AssignWord(a, b) a = b; #define AssignWord(a, b) a = b;
#define Add2WordsBy1(a, b, c) a = b + c; #define Add2WordsBy1(a, b, c) a = b + c;
#define Acc2WordsBy1(a, b) a += b;
#define Acc2WordsBy2(a, b) a += b; #define Acc2WordsBy2(a, b) a += b;
#define LowWord(a) (word)a #define LowWord(a) word(a)
#define HighWord(a) (word)(a>>WORD_BITS) #define HighWord(a) word(a>>WORD_BITS)
#define Double2Words(a) a += a; #define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
#define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u); #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
#define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u); #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
#define GetCarry(u) HighWord(u) #define GetCarry(u) HighWord(u)
#define GetBorrow(u) word(u>>(WORD_BITS*2-1)) #define GetBorrow(u) word(u>>(WORD_BITS*2-1))
#else #endif
#define Declare2Words(x) word x##0, x##1; #ifndef MulAcc
#define AssignWord(a, b) a##0 = b; a##1 = 0; #define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); #endif
#ifndef Acc2WordsBy1
#define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b) #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; #endif
#define LowWord(a) a##0 #ifndef Acc3WordsBy2
#define HighWord(a) a##1 #define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
#ifdef _MSC_VER
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
#define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
#elif defined(__DECCXX)
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
#define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
#endif
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
#define GetCarry(u) u##1
#define GetBorrow(u) u##1
#endif #endif
class DWord class DWord
@ -411,9 +431,8 @@ inline word DWord::operator%(word a)
// use some tricks to share assembly code between MSVC and GCC // use some tricks to share assembly code between MSVC and GCC
#if defined(__GNUC__) #if defined(__GNUC__)
#define CRYPTOPP_NAKED
#define AddPrologue \ #define AddPrologue \
word32 result; \ word result; \
__asm__ __volatile__ \ __asm__ __volatile__ \
( \ ( \
".intel_syntax noprefix;" ".intel_syntax noprefix;"
@ -454,7 +473,6 @@ inline word DWord::operator%(word a)
: "memory", "cc" \ : "memory", "cc" \
); );
#else #else
#define CRYPTOPP_NAKED __declspec(naked)
#define AddPrologue \ #define AddPrologue \
__asm push edi \ __asm push edi \
__asm push esi \ __asm push esi \
@ -464,33 +482,107 @@ inline word DWord::operator%(word a)
__asm pop esi \ __asm pop esi \
__asm pop edi \ __asm pop edi \
__asm ret 8 __asm ret 8
#if _MSC_VER < 1300
#define SaveEBX __asm push ebx
#define RestoreEBX __asm pop ebx
#else
#define SaveEBX
#define RestoreEBX
#endif
#define SquPrologue \ #define SquPrologue \
AS2( mov eax, A) \ AS2( mov eax, A) \
AS2( mov ecx, C) \ AS2( mov ecx, C) \
SaveEBX \
AS2( lea ebx, s_maskLow16) AS2( lea ebx, s_maskLow16)
#define SquEpilogue
#define MulPrologue \ #define MulPrologue \
AS2( mov eax, A) \ AS2( mov eax, A) \
AS2( mov edi, B) \ AS2( mov edi, B) \
AS2( mov ecx, C) \ AS2( mov ecx, C) \
SaveEBX \
AS2( lea ebx, s_maskLow16) AS2( lea ebx, s_maskLow16)
#define MulEpilogue
#define TopPrologue \ #define TopPrologue \
AS2( mov eax, A) \ AS2( mov eax, A) \
AS2( mov edi, B) \ AS2( mov edi, B) \
AS2( mov ecx, C) \ AS2( mov ecx, C) \
AS2( mov esi, L) \ AS2( mov esi, L) \
SaveEBX \
AS2( lea ebx, s_maskLow16) AS2( lea ebx, s_maskLow16)
#define TopEpilogue #define SquEpilogue RestoreEBX
#define MulEpilogue RestoreEBX
#define TopEpilogue RestoreEBX
#endif #endif
#if defined(_MSC_VER) && defined(_M_X64) #ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" { extern "C" {
int Baseline_Add(size_t N, word *C, const word *A, const word *B); word Baseline_Add(size_t N, word *C, const word *A, const word *B);
int Baseline_Sub(size_t N, word *C, const word *A, const word *B); word Baseline_Sub(size_t N, word *C, const word *A, const word *B);
} }
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) #elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
// x64 GNU inline-assembly adder: C[i] = A[i] + B[i] with carry propagation over
// N 64-bit words; returns the final carry (0 or 1).
// The three pointers are passed pre-advanced by N (C+N, A+N, B+N) and the index
// in rcx is negated, so it counts upward from -N to 0.
// Two words are handled per trip through the loop, so N is expected to be even
// (the portable Baseline_Add in this file asserts N%2 == 0).
// NOTE(review): N is bound as an input-only operand ("c"), yet the asm modifies
// it (neg %1, lea %1,[%1+2]). GCC requires an operand that is written to be
// declared read-write, i.e. listed among the outputs as "+c" (N).
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) word Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
word result;
__asm__ __volatile__
(
".intel_syntax;"
// index = -N; nothing to do when N == 0
AS1( neg %1)
ASJ( jz, 1, f)
// first word uses add, which establishes the carry chain
AS2( mov %0,[%3+8*%1])
AS2( add %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASL(0)
AS2( mov %0,[%3+8*%1+8])
AS2( adc %0,[%4+8*%1+8])
AS2( mov [%2+8*%1+8],%0)
// lea and jrcxz advance and test the index without disturbing the carry flag
AS2( lea %1,[%1+2])
ASJ( jrcxz, 1, f)
AS2( mov %0,[%3+8*%1])
AS2( adc %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASJ( jmp, 0, b)
ASL(1)
// materialize the carry flag as the return value (mov leaves flags intact)
AS2( mov %0, 0)
AS2( adc %0, %0)
".att_syntax;"
: "=&r" (result)
: "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
: "memory", "cc"
);
return result;
}
// x64 GNU inline-assembly subtractor: C[i] = A[i] - B[i] with borrow propagation
// over N 64-bit words; returns the final borrow (0 or 1).
//
// The pointers are passed pre-advanced by N (C+N, A+N, B+N) and the index held
// in rcx is negated so that it counts upward from -N to 0. Two words are
// processed per trip through the loop, so N is expected to be even.
//
// Operands: %0 = result/scratch, %1 = index (rcx, required by jrcxz),
//           %2 = C+N, %3 = A+N, %4 = B+N.
word Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	word result;
	__asm__ __volatile__
	(
	".intel_syntax;"
	// index = -N; skip everything when N == 0
	AS1( neg %1)
	ASJ( jz, 1, f)
	// first word uses sub, which establishes the borrow chain
	AS2( mov %0,[%3+8*%1])
	AS2( sub %0,[%4+8*%1])
	AS2( mov [%2+8*%1],%0)
	ASL(0)
	AS2( mov %0,[%3+8*%1+8])
	AS2( sbb %0,[%4+8*%1+8])
	AS2( mov [%2+8*%1+8],%0)
	// lea and jrcxz advance and test the index without disturbing the carry flag
	AS2( lea %1,[%1+2])
	ASJ( jrcxz, 1, f)
	AS2( mov %0,[%3+8*%1])
	AS2( sbb %0,[%4+8*%1])
	AS2( mov [%2+8*%1],%0)
	ASJ( jmp, 0, b)
	ASL(1)
	// materialize the borrow (carry flag) as the return value; mov leaves flags intact
	AS2( mov %0, 0)
	AS2( adc %0, %0)
	".att_syntax;"
	// N is rewritten by neg/lea above, so it must be a read-write operand ("+c");
	// declaring it input-only would let the compiler assume rcx still holds N afterwards.
	: "=&r" (result), "+c" (N)
	: "r" (C+N), "r" (A+N), "r" (B+N)
	: "memory", "cc"
	);
	return result;
}
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{ {
AddPrologue AddPrologue
@ -531,7 +623,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word
AddEpilogue AddEpilogue
} }
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{ {
AddPrologue AddPrologue
@ -572,8 +664,8 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word
AddEpilogue AddEpilogue
} }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
{ {
AddPrologue AddPrologue
@ -629,7 +721,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A,
AddEpilogue AddEpilogue
} }
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
{ {
AddPrologue AddPrologue
@ -687,7 +779,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A,
} }
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#else #else
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{ {
assert (N%2 == 0); assert (N%2 == 0);
@ -703,7 +795,7 @@ int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word
return int(GetCarry(u)); return int(GetCarry(u));
} }
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{ {
assert (N%2 == 0); assert (N%2 == 0);
@ -737,7 +829,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Mul_2 \ #define Mul_2 \
Mul_Begin(2) \ Mul_Begin(2) \
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
Mul_End(2) Mul_End(1, 1)
#define Mul_4 \ #define Mul_4 \
Mul_Begin(4) \ Mul_Begin(4) \
@ -746,7 +838,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \ Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
Mul_End(4) Mul_End(5, 3)
#define Mul_8 \ #define Mul_8 \
Mul_Begin(8) \ Mul_Begin(8) \
@ -763,7 +855,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \ Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
Mul_End(8) Mul_End(13, 7)
#define Mul_16 \ #define Mul_16 \
Mul_Begin(16) \ Mul_Begin(16) \
@ -796,7 +888,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \ Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
Mul_End(16) Mul_End(29, 15)
#define Squ_2 \ #define Squ_2 \
Squ_Begin(2) \ Squ_Begin(2) \
@ -900,6 +992,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \ Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
Bot_End(16) Bot_End(16)
#if 0
#define Mul_Begin(n) \ #define Mul_Begin(n) \
Declare2Words(p) \ Declare2Words(p) \
Declare2Words(c) \ Declare2Words(c) \
@ -938,9 +1031,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Bot_End(n) \ #define Bot_End(n) \
R[n-1] = e; R[n-1] = e;
#else
/*
// this is slower on MSVC 2005 Win32
#define Mul_Begin(n) \ #define Mul_Begin(n) \
Declare2Words(p) \ Declare2Words(p) \
word c; \ word c; \
@ -950,25 +1041,20 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
AssignWord(d, HighWord(p)) AssignWord(d, HighWord(p))
#define Mul_Acc(i, j) \ #define Mul_Acc(i, j) \
MultiplyWords(p, A[i], B[j]) \ MulAcc(c, d, A[i], B[j])
Acc2WordsBy1(p, c) \
c = LowWord(p); \
Acc2WordsBy1(d, HighWord(p))
#define Mul_SaveAcc(k, i, j) \ #define Mul_SaveAcc(k, i, j) \
R[k] = c; \ R[k] = c; \
MultiplyWords(p, A[i], B[j]) \ c = LowWord(d); \
Acc2WordsBy1(p, LowWord(d)) \
c = LowWord(p); \
AssignWord(d, HighWord(d)) \ AssignWord(d, HighWord(d)) \
Acc2WordsBy1(d, HighWord(p)) MulAcc(c, d, A[i], B[j])
#define Mul_End(n) \ #define Mul_End(k, i) \
R[2*n-3] = c; \ R[k] = c; \
MultiplyWords(p, A[n-1], B[n-1])\ MultiplyWords(p, A[i], B[i]) \
Acc2WordsBy2(d, p) \ Acc2WordsBy2(p, d) \
R[2*n-2] = LowWord(d); \ R[k+1] = LowWord(p); \
R[2*n-1] = HighWord(d); R[k+2] = HighWord(p);
#define Bot_SaveAcc(k, i, j) \ #define Bot_SaveAcc(k, i, j) \
R[k] = c; \ R[k] = c; \
@ -980,52 +1066,45 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Bot_End(n) \ #define Bot_End(n) \
R[n-1] = c; R[n-1] = c;
*/ #endif
#define Squ_Begin(n) \ #define Squ_Begin(n) \
Declare2Words(p) \ Declare2Words(p) \
Declare2Words(c) \ word c; \
Declare2Words(d) \ Declare2Words(d) \
Declare2Words(e) \ Declare2Words(e) \
MultiplyWords(p, A[0], A[0]) \ MultiplyWords(p, A[0], A[0]) \
R[0] = LowWord(p); \ R[0] = LowWord(p); \
AssignWord(e, HighWord(p)) \ AssignWord(e, HighWord(p)) \
MultiplyWords(p, A[0], A[1]) \ MultiplyWords(p, A[0], A[1]) \
AssignWord(c, LowWord(p)) \ c = LowWord(p); \
AssignWord(d, HighWord(p)) \ AssignWord(d, HighWord(p)) \
Squ_NonDiag \ Squ_NonDiag \
#define Squ_NonDiag \ #define Squ_NonDiag \
Double2Words(c) \ Double3Words(c, d)
Double2Words(d) \
#define Squ_SaveAcc(k, i, j) \ #define Squ_SaveAcc(k, i, j) \
Acc2WordsBy2(c, e) \ Acc3WordsBy2(c, d, e) \
R[k] = LowWord(c); \ R[k] = c; \
Add2WordsBy1(e, d, HighWord(c)) \
MultiplyWords(p, A[i], A[j]) \ MultiplyWords(p, A[i], A[j]) \
AssignWord(c, LowWord(p)) \ c = LowWord(p); \
AssignWord(d, HighWord(p)) \ AssignWord(d, HighWord(p)) \
#define Squ_Acc(i, j) \ #define Squ_Acc(i, j) \
MultiplyWords(p, A[i], A[j]) \ MulAcc(c, d, A[i], A[j])
Acc2WordsBy1(c, LowWord(p)) \
Acc2WordsBy1(d, HighWord(p))
#define Squ_Diag(i) \ #define Squ_Diag(i) \
Squ_NonDiag \ Squ_NonDiag \
MultiplyWords(p, A[i], A[i]) \ MulAcc(c, d, A[i], A[i])
Acc2WordsBy1(c, LowWord(p)) \
Acc2WordsBy1(d, HighWord(p)) \
#define Squ_End(n) \ #define Squ_End(n) \
Acc2WordsBy2(c, e) \ Acc3WordsBy2(c, d, e) \
R[2*n-3] = LowWord(c); \ R[2*n-3] = c; \
Acc2WordsBy1(d, HighWord(c)) \
MultiplyWords(p, A[n-1], A[n-1])\ MultiplyWords(p, A[n-1], A[n-1])\
Acc2WordsBy2(d, p) \ Acc2WordsBy2(p, e) \
R[2*n-2] = LowWord(d); \ R[2*n-2] = LowWord(p); \
R[2*n-1] = HighWord(d); R[2*n-1] = HighWord(p);
void Baseline_Multiply2(word *R, const word *A, const word *B) void Baseline_Multiply2(word *R, const word *A, const word *B)
{ {
@ -1072,7 +1151,62 @@ void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
Bot_8 Bot_8
} }
/* #define Top_Begin(n) \
Declare2Words(p) \
word c; \
Declare2Words(d) \
MultiplyWords(p, A[0], B[n-2]);\
AssignWord(d, HighWord(p));
// Multiplies A[i] by B[j] and adds only the HIGH word of that product into the
// two-word accumulator d; the low word of the product is discarded.
#define Top_Acc(i, j) \
MultiplyWords(p, A[i], B[j]);\
Acc2WordsBy1(d, HighWord(p));
// Shifts the accumulator down by one word without storing anything to R
// (c takes the low word of d, d keeps only its former high word), then
// accumulates A[i]*B[j] into the c/d pair via MulAcc.
#define Top_SaveAcc0(i, j) \
c = LowWord(d); \
AssignWord(d, HighWord(d)) \
MulAcc(c, d, A[i], B[j])
// Replaces c with the 0/1 result of comparing L against the current c, adds that
// bit into d, then shifts the accumulator down one word (as Top_SaveAcc0 does)
// and accumulates A[i]*B[j]. L is a name taken from the enclosing function.
// NOTE(review): presumably L is the caller's known value of the product word
// whose partial sum is in c, so L<c signals a carry lost from the lower words
// that were never computed - confirm against the caller.
#define Top_SaveAcc1(i, j) \
c = L<c; \
Acc2WordsBy1(d, c); \
c = LowWord(d); \
AssignWord(d, HighWord(d)) \
MulAcc(c, d, A[i], B[j])
// Upper half of a 2-word by 2-word product: forms the full 4-word product in a
// scratch buffer and copies its two most significant words into R[0..1].
// L is accepted only to match the common "top multiply" signature; it is not used.
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
{
	word product[4];
	Baseline_Multiply2(product, A, B);

	for (unsigned int k = 0; k < 2; ++k)
		R[k] = product[k + 2];
}
// Produces four result words R[0..3] for the 4-word operands A and B without
// forming the low half of the product. Column i+j == 2 contributes only the
// high words of its partial products (Top_Begin/Top_Acc), column 3 is summed in
// full, and the remaining columns are stored to R through Mul_SaveAcc/Mul_End.
// NOTE(review): L is consumed by Top_SaveAcc1 as a carry hint; presumably it is
// the already-known product word just below the top half - confirm with caller.
// Do not place // comments on the lines ending in '\': the continuation would
// splice the following line into the comment.
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
{
Top_Begin(4)
Top_Acc(1, 1) Top_Acc(2, 0) \
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
Mul_End(1, 3)
}
// Produces eight result words R[0..7] for the 8-word operands A and B without
// forming the low half of the product. Column i+j == 6 contributes only the
// high words of its partial products (Top_Begin/Top_Acc), column 7 is summed in
// full, and the remaining columns are stored to R through Mul_SaveAcc/Mul_End.
// NOTE(review): L is consumed by Top_SaveAcc1 as a carry hint; presumably it is
// the already-known product word just below the top half - confirm with caller.
// Do not place // comments on the lines ending in '\': the continuation would
// splice the following line into the comment.
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
{
Top_Begin(8)
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
Mul_End(5, 7)
}
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
void Baseline_Multiply16(word *R, const word *A, const word *B) void Baseline_Multiply16(word *R, const word *A, const word *B)
{ {
Mul_16 Mul_16
@ -1087,16 +1221,40 @@ void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
{ {
Bot_16 Bot_16
} }
*/
// Produces sixteen result words R[0..15] for the 16-word operands A and B
// without forming the low half of the product. Column i+j == 14 contributes only
// the high words of its partial products (Top_Begin/Top_Acc), column 15 is
// summed in full, and the remaining columns are stored to R through
// Mul_SaveAcc/Mul_End.
// NOTE(review): L is consumed by Top_SaveAcc1 as a carry hint; presumably it is
// the already-known product word just below the top half - confirm with caller.
// Do not place // comments on the lines ending in '\': the continuation would
// splice the following line into the comment.
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
{
Top_Begin(16)
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
Mul_End(13, 15)
}
#endif
// ******************************************************** // ********************************************************
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff}; CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
#undef Mul_Begin #undef Mul_Begin
#undef Mul_Acc #undef Mul_Acc
#undef Top_Begin
#undef Top_Acc
#undef Squ_Acc #undef Squ_Acc
#undef Squ_NonDiag #undef Squ_NonDiag
#undef Squ_Diag #undef Squ_Diag
@ -1760,33 +1918,35 @@ void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
Top_End(8) Top_End(8)
} }
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif // #if CRYPTOPP_INTEGER_SSE2
// ******************************************************** // ********************************************************
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B); typedef word (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
typedef void (* PMul)(word *C, const word *A, const word *B); typedef void (* PMul)(word *C, const word *A, const word *B);
typedef void (* PSqu)(word *C, const word *A); typedef void (* PSqu)(word *C, const word *A);
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L); typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub; static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
static PMulTop s_pTop[3];
static size_t s_recursionLimit = 8; static size_t s_recursionLimit = 8;
#else #else
static const size_t s_recursionLimit = 8; static const size_t s_recursionLimit = 16;
#endif #endif
static PMul s_pMul[9], s_pBot[9]; static PMul s_pMul[9], s_pBot[9];
static PSqu s_pSqu[9]; static PSqu s_pSqu[9];
static PMulTop s_pTop[9];
static void SetFunctionPointers() static void SetFunctionPointers()
{ {
s_pMul[0] = &Baseline_Multiply2; s_pMul[0] = &Baseline_Multiply2;
s_pBot[0] = &Baseline_MultiplyBottom2; s_pBot[0] = &Baseline_MultiplyBottom2;
s_pSqu[0] = &Baseline_Square2; s_pSqu[0] = &Baseline_Square2;
s_pTop[0] = &Baseline_MultiplyTop2;
s_pTop[1] = &Baseline_MultiplyTop4;
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
if (HasSSE2()) if (HasSSE2())
{ {
if (IsP4()) if (IsP4())
@ -1812,39 +1972,45 @@ static void SetFunctionPointers()
s_pSqu[4] = &SSE2_Square16; s_pSqu[4] = &SSE2_Square16;
s_pSqu[8] = &SSE2_Square32; s_pSqu[8] = &SSE2_Square32;
s_pTop[0] = &SSE2_MultiplyTop8; s_pTop[2] = &SSE2_MultiplyTop8;
s_pTop[1] = &SSE2_MultiplyTop16; s_pTop[4] = &SSE2_MultiplyTop16;
s_pTop[2] = &SSE2_MultiplyTop32; s_pTop[8] = &SSE2_MultiplyTop32;
} }
else else
#endif #endif
{ {
s_pMul[1] = &Baseline_Multiply4; s_pMul[1] = &Baseline_Multiply4;
s_pMul[2] = &Baseline_Multiply8; s_pMul[2] = &Baseline_Multiply8;
// s_pMul[4] = &Baseline_Multiply16;
s_pBot[1] = &Baseline_MultiplyBottom4; s_pBot[1] = &Baseline_MultiplyBottom4;
s_pBot[2] = &Baseline_MultiplyBottom8; s_pBot[2] = &Baseline_MultiplyBottom8;
// s_pBot[4] = &Baseline_MultiplyBottom16;
s_pSqu[1] = &Baseline_Square4; s_pSqu[1] = &Baseline_Square4;
s_pSqu[2] = &Baseline_Square8; s_pSqu[2] = &Baseline_Square8;
// s_pSqu[4] = &Baseline_Square16;
s_pTop[2] = &Baseline_MultiplyTop8;
#if !CRYPTOPP_INTEGER_SSE2
s_pMul[4] = &Baseline_Multiply16;
s_pBot[4] = &Baseline_MultiplyBottom16;
s_pSqu[4] = &Baseline_Square16;
s_pTop[4] = &Baseline_MultiplyTop16;
#endif
} }
} }
inline int Add(word *C, const word *A, const word *B, size_t N) inline word Add(word *C, const word *A, const word *B, size_t N)
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
return s_pAdd(N, C, A, B); return s_pAdd(N, C, A, B);
#else #else
return Baseline_Add(N, C, A, B); return Baseline_Add(N, C, A, B);
#endif #endif
} }
inline int Subtract(word *C, const word *A, const word *B, size_t N) inline word Subtract(word *C, const word *A, const word *B, size_t N)
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_INTEGER_SSE2
return s_pSub(N, C, A, B); return s_pSub(N, C, A, B);
#else #else
return Baseline_Sub(N, C, A, B); return Baseline_Sub(N, C, A, B);
@ -1969,16 +2135,8 @@ void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B,
{ {
assert(N>=2 && N%2==0); assert(N>=2 && N%2==0);
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (N <= s_recursionLimit)
if (HasSSE2() && ((N>=8) & (N<=32))) s_pTop[N/4](R, A, B, L[N-1]);
s_pTop[N/16](R, A, B, L[N-1]);
else
#endif
if (N<=4)
{
s_pMul[N/4](T, A, B);
memcpy(R, T+N, N*WORD_SIZE);
}
else else
{ {
const size_t N2 = N/2; const size_t N2 = N/2;
@ -3076,13 +3234,6 @@ public:
memcpy(m_counterAndSeed + 4, seed, seedSize); memcpy(m_counterAndSeed + 4, seed, seedSize);
} }
byte GenerateByte()
{
byte b;
GenerateBlock(&b, 1);
return b;
}
void GenerateBlock(byte *output, size_t size) void GenerateBlock(byte *output, size_t size)
{ {
PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);

View File

@ -26,31 +26,31 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS1( push ebx) AS_PUSH( bx)
#else #else
AS2( mov ecx, count) AS2( mov WORD_REG(cx), count)
AS2( mov esi, state) AS2( mov WORD_REG(si), state)
AS2( mov edi, z) AS2( mov WORD_REG(di), z)
AS2( mov edx, y) AS2( mov WORD_REG(dx), y)
#endif #endif
AS2( shl ecx, 5) AS2( shl WORD_REG(cx), 5)
ASJ( jz, 5, f) ASJ( jz, 5, f)
AS2( mov ebx, [esi+4*17]) AS2( mov ebx, [WORD_REG(si)+4*17])
AS2( add ecx, ebx) AS2( add WORD_REG(cx), WORD_REG(bx))
AS1( push ebp) AS_PUSH( bp)
AS1( push ecx) AS_PUSH( cx)
AS2( movdqa xmm0, [esi+0*16]) AS2( movdqa xmm0, [WORD_REG(si)+0*16])
AS2( movdqa xmm1, [esi+1*16]) AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm2, [esi+2*16]) AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm3, [esi+3*16]) AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( mov eax, [esi+4*16]) AS2( mov eax, [WORD_REG(si)+4*16])
ASL(4) ASL(4)
// gamma and pi // gamma and pi
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1) AS2( test WORD_REG(bx), 1)
ASJ( jnz, 6, f) ASJ( jnz, 6, f)
#endif #endif
AS2( movdqa xmm6, xmm2) AS2( movdqa xmm6, xmm2)
@ -81,7 +81,7 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
#define pi(i) \ #define pi(i) \
AS2( movd ecx, xmm7)\ AS2( movd ecx, xmm7)\
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
#define pi4(x, y, z, a, b, c, d) \ #define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\ AS2( pcmpeqb xmm7, xmm7)\
@ -110,65 +110,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
// keystream // keystream
AS2( test edi, edi) AS2( test WORD_REG(di), WORD_REG(di))
ASJ( jz, 0, f) ASJ( jz, 0, f)
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2) AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2) AS2( punpckhqdq xmm6, xmm2)
AS2( test edx, 0xf) AS2( test WORD_REG(dx), 0xf)
ASJ( jnz, 2, f) ASJ( jnz, 2, f)
AS2( test edx, edx) AS2( test WORD_REG(dx), WORD_REG(dx))
ASJ( jz, 1, f) ASJ( jz, 1, f)
AS2( pxor xmm4, [edx]) AS2( pxor xmm4, [WORD_REG(dx)])
AS2( pxor xmm6, [edx+16]) AS2( pxor xmm6, [WORD_REG(dx)+16])
AS2( add edx, 32) AS2( add WORD_REG(dx), 32)
ASJ( jmp, 1, f) ASJ( jmp, 1, f)
ASL(2) ASL(2)
AS2( movdqu xmm0, [edx]) AS2( movdqu xmm0, [WORD_REG(dx)])
AS2( movdqu xmm2, [edx+16]) AS2( movdqu xmm2, [WORD_REG(dx)+16])
AS2( pxor xmm4, xmm0) AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2) AS2( pxor xmm6, xmm2)
AS2( add edx, 32) AS2( add WORD_REG(dx), 32)
ASL(1) ASL(1)
AS2( test edi, 0xf) AS2( test WORD_REG(di), 0xf)
ASJ( jnz, 3, f) ASJ( jnz, 3, f)
AS2( movdqa [edi], xmm4) AS2( movdqa [WORD_REG(di)], xmm4)
AS2( movdqa [edi+16], xmm6) AS2( movdqa [WORD_REG(di)+16], xmm6)
AS2( add edi, 32) AS2( add WORD_REG(di), 32)
ASJ( jmp, 0, f) ASJ( jmp, 0, f)
ASL(3) ASL(3)
AS2( movdqu [edi], xmm4) AS2( movdqu [WORD_REG(di)], xmm4)
AS2( movdqu [edi+16], xmm6) AS2( movdqu [WORD_REG(di)+16], xmm6)
AS2( add edi, 32) AS2( add WORD_REG(di), 32)
ASL(0) ASL(0)
// buffer update // buffer update
AS2( lea ecx, [ebx + 32]) AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
AS2( and ecx, 31*32) AS2( and WORD_REG(cx), 31*32)
AS2( lea ebp, [ebx + (32-24)*32]) AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
AS2( and ebp, 31*32) AS2( and WORD_REG(bp), 31*32)
AS2( movdqa xmm0, [esi+20*4+ecx+0*8]) AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
AS2( pxor xmm3, xmm0) AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [esi+20*4+ecx+0*8], xmm3) AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
AS2( pxor xmm0, [esi+20*4+ebp+2*8]) AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
AS2( movdqa [esi+20*4+ebp+2*8], xmm0) AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
AS2( movdqa xmm4, [esi+20*4+ecx+2*8]) AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
AS2( pxor xmm1, xmm4) AS2( pxor xmm1, xmm4)
AS2( movdqa [esi+20*4+ecx+2*8], xmm1) AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
AS2( pxor xmm4, [esi+20*4+ebp+0*8]) AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
AS2( movdqa [esi+20*4+ebp+0*8], xmm4) AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
// theta // theta
AS2( movdqa xmm3, [esi+3*16]) AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( movdqa xmm2, [esi+2*16]) AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm1, [esi+1*16]) AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm0, [esi+0*16]) AS2( movdqa xmm0, [WORD_REG(si)+0*16])
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1) AS2( test WORD_REG(bx), 1)
ASJ( jnz, 8, f) ASJ( jnz, 8, f)
#endif #endif
AS2( movd xmm6, eax) AS2( movd xmm6, eax)
@ -214,21 +214,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm4) AS2( pxor xmm0, xmm4)
// sigma // sigma
AS2( lea ecx, [ebx + (32-4)*32]) AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
AS2( and ecx, 31*32) AS2( and WORD_REG(cx), 31*32)
AS2( lea ebp, [ebx + 16*32]) AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
AS2( and ebp, 31*32) AS2( and WORD_REG(bp), 31*32)
AS2( movdqa xmm4, [esi+20*4+ecx+0*16]) AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
AS2( movdqa xmm5, [esi+20*4+ebp+0*16]) AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5) AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5) AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4) AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6) AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [esi+20*4+ecx+1*16]) AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
AS2( movdqa xmm5, [esi+20*4+ebp+1*16]) AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5) AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5) AS2( punpckhqdq xmm6, xmm5)
@ -236,23 +236,22 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm6) AS2( pxor xmm0, xmm6)
// loop // loop
AS2( add ebx, 32) AS2( add WORD_REG(bx), 32)
AS2( cmp ebx, [esp]) AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
ASJ( jne, 4, b) ASJ( jne, 4, b)
// save state // save state
AS2( mov ebp, [esp+4]) AS2( add WORD_REG(sp), WORD_SZ)
AS2( add esp, 8) AS_POP( bp)
AS2( mov [esi+4*17], ebx) AS2( mov [WORD_REG(si)+4*16], eax)
AS2( mov [esi+4*16], eax) AS2( movdqa [WORD_REG(si)+3*16], xmm3)
AS2( movdqa [esi+3*16], xmm3) AS2( movdqa [WORD_REG(si)+2*16], xmm2)
AS2( movdqa [esi+2*16], xmm2) AS2( movdqa [WORD_REG(si)+1*16], xmm1)
AS2( movdqa [esi+1*16], xmm1) AS2( movdqa [WORD_REG(si)+0*16], xmm0)
AS2( movdqa [esi+0*16], xmm0)
ASL(5) ASL(5)
#ifdef __GNUC__ #ifdef __GNUC__
AS1( pop ebx) AS_POP( bx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "c" (count), "S" (state), "D" (z), "d" (y) : "c" (count), "S" (state), "D" (z), "d" (y)

View File

@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{ {
#ifdef CRYPTOPP_X86_ASM_AVAILABLE #if defined(CRYPTOPP_X86_ASM_AVAILABLE)
if (HasMMX()) if (HasMMX())
{ {
const word32 *k = m_key; const word32 *k = m_key;
const word32 *kLoopEnd = k + m_rounds*4; const word32 *kLoopEnd = k + m_rounds*4;
#if CRYPTOPP_BOOL_X64
#define K_REG r8
#define K_END_REG r9
#define SAVE_K
#define RESTORE_K
#define RESTORE_K_END
#define SAVE_0(x) AS2(mov r10d, x)
#define SAVE_1(x) AS2(mov r11d, x)
#define SAVE_2(x) AS2(mov r12d, x)
#define RESTORE_0(x) AS2(mov x, r10d)
#define RESTORE_1(x) AS2(mov x, r11d)
#define RESTORE_2(x) AS2(mov x, r12d)
#else
#define K_REG esi
#define K_END_REG edi
#define SAVE_K AS2(movd mm4, esi)
#define RESTORE_K AS2(movd esi, mm4)
#define RESTORE_K_END AS2(movd edi, mm5)
#define SAVE_0(x) AS2(movd mm0, x)
#define SAVE_1(x) AS2(movd mm1, x)
#define SAVE_2(x) AS2(movd mm2, x)
#define RESTORE_0(x) AS2(movd x, mm0)
#define RESTORE_1(x) AS2(movd x, mm1)
#define RESTORE_2(x) AS2(movd x, mm2)
#endif
#ifdef __GNUC__ #ifdef __GNUC__
word32 t0, t1, t2, t3; word32 t0, t1, t2, t3;
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS1( push ebx) AS_PUSH( bx)
AS1( push ebp) AS_PUSH( bp)
AS2( mov ebp, eax) AS2( mov WORD_REG(bp), WORD_REG(ax))
#if CRYPTOPP_BOOL_X64
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
AS1( pushq K_REG)
AS1( pushq K_END_REG)
AS1( pushq r10)
AS1( pushq r11)
AS1( pushq r12)
AS2( mov K_REG, rsi)
AS2( mov K_END_REG, rcx)
#else
AS2( movd mm5, ecx) AS2( movd mm5, ecx)
#endif
#else #else
#if _MSC_VER < 1300
const word32 *t = Te;
AS2( mov eax, t)
#endif
AS2( mov edx, g_cacheLineSize) AS2( mov edx, g_cacheLineSize)
AS2( mov edi, inBlock) AS2( mov WORD_REG(di), inBlock)
AS2( mov esi, k) AS2( mov K_REG, k)
AS2( movd mm5, kLoopEnd) AS2( movd mm5, kLoopEnd)
AS1( push ebp) #if _MSC_VER < 1300
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov ebp, eax)
#else
AS_PUSH( bp)
AS2( lea ebp, Te) AS2( lea ebp, Te)
#endif
#endif #endif
AS2( mov eax, [esi+0*4]) // s0 AS2( mov eax, [K_REG+0*4]) // s0
AS2( xor eax, [edi+0*4]) AS2( xor eax, [WORD_REG(di)+0*4])
AS2( movd mm0, eax) SAVE_0(eax)
AS2( mov ebx, [esi+1*4]) AS2( mov ebx, [K_REG+1*4])
AS2( xor ebx, [edi+1*4]) AS2( xor ebx, [WORD_REG(di)+1*4])
AS2( movd mm1, ebx) SAVE_1(ebx)
AS2( and ebx, eax) AS2( and ebx, eax)
AS2( mov eax, [esi+2*4]) AS2( mov eax, [K_REG+2*4])
AS2( xor eax, [edi+2*4]) AS2( xor eax, [WORD_REG(di)+2*4])
AS2( movd mm2, eax) SAVE_2(eax)
AS2( and ebx, eax) AS2( and ebx, eax)
AS2( mov ecx, [esi+3*4]) AS2( mov ecx, [K_REG+3*4])
AS2( xor ecx, [edi+3*4]) AS2( xor ecx, [WORD_REG(di)+3*4])
AS2( and ebx, ecx) AS2( and ebx, ecx)
// read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction // read Te0 into L1 cache. this code could be simplifed by using lfence, but that is an SSE2 instruction
AS2( and ebx, 0) AS2( and ebx, 0)
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
ASL(2) ASL(2)
AS2( and ebx, [ebp+edi]) AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [ebp+edi]) AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [ebp+edi]) AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [ebp+edi]) AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( cmp edi, 1024) AS2( cmp edi, 1024)
ASJ( jl, 2, b) ASJ( jl, 2, b)
AS2( and ebx, [ebp+1020]) AS2( and ebx, [WORD_REG(bp)+1020])
#if CRYPTOPP_BOOL_X64
AS2( xor r10d, ebx)
AS2( xor r11d, ebx)
AS2( xor r12d, ebx)
#else
AS2( movd mm6, ebx) AS2( movd mm6, ebx)
AS2( pxor mm2, mm6) AS2( pxor mm2, mm6)
AS2( pxor mm1, mm6) AS2( pxor mm1, mm6)
AS2( pxor mm0, mm6) AS2( pxor mm0, mm6)
#endif
AS2( xor ecx, ebx) AS2( xor ecx, ebx)
AS2( mov edi, [esi+4*4]) // t0 AS2( mov edi, [K_REG+4*4]) // t0
AS2( mov eax, [esi+5*4]) AS2( mov eax, [K_REG+5*4])
AS2( mov ebx, [esi+6*4]) AS2( mov ebx, [K_REG+6*4])
AS2( mov edx, [esi+7*4]) AS2( mov edx, [K_REG+7*4])
AS2( add esi, 8*4) AS2( add K_REG, 8*4)
AS2( movd mm4, esi) SAVE_K
#define QUARTER_ROUND(t, a, b, c, d) \ #define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(d, [ebp+0*1024+4*esi])\ AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(c, [ebp+1*1024+4*esi])\ AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\ AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(b, [ebp+2*1024+4*esi])\ AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(a, [ebp+3*1024+4*esi]) AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
#define s0 xor edi #define s0 xor edi
#define s1 xor eax #define s1 xor eax
@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define t3 xor edx #define t3 xor edx
QUARTER_ROUND(c, t0, t1, t2, t3) QUARTER_ROUND(c, t0, t1, t2, t3)
AS2( movd ecx, mm2) RESTORE_2(ecx)
QUARTER_ROUND(c, t3, t0, t1, t2) QUARTER_ROUND(c, t3, t0, t1, t2)
AS2( movd ecx, mm1) RESTORE_1(ecx)
QUARTER_ROUND(c, t2, t3, t0, t1) QUARTER_ROUND(c, t2, t3, t0, t1)
AS2( movd ecx, mm0) RESTORE_0(ecx)
QUARTER_ROUND(c, t1, t2, t3, t0) QUARTER_ROUND(c, t1, t2, t3, t0)
AS2( movd mm2, ebx) SAVE_2(ebx)
AS2( movd mm1, eax) SAVE_1(eax)
AS2( movd mm0, edi) SAVE_0(edi)
#undef QUARTER_ROUND #undef QUARTER_ROUND
AS2( movd esi, mm4) RESTORE_K
ASL(0) ASL(0)
AS2( mov edi, [esi+0*4]) AS2( mov edi, [K_REG+0*4])
AS2( mov eax, [esi+1*4]) AS2( mov eax, [K_REG+1*4])
AS2( mov ebx, [esi+2*4]) AS2( mov ebx, [K_REG+2*4])
AS2( mov ecx, [esi+3*4]) AS2( mov ecx, [K_REG+3*4])
#define QUARTER_ROUND(t, a, b, c, d) \ #define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(a, [ebp+3*1024+4*esi])\ AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(b, [ebp+2*1024+4*esi])\ AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\ AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(c, [ebp+1*1024+4*esi])\ AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(d, [ebp+0*1024+4*esi]) AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
QUARTER_ROUND(d, s0, s1, s2, s3) QUARTER_ROUND(d, s0, s1, s2, s3)
AS2( movd edx, mm2) RESTORE_2(edx)
QUARTER_ROUND(d, s3, s0, s1, s2) QUARTER_ROUND(d, s3, s0, s1, s2)
AS2( movd edx, mm1) RESTORE_1(edx)
QUARTER_ROUND(d, s2, s3, s0, s1) QUARTER_ROUND(d, s2, s3, s0, s1)
AS2( movd edx, mm0) RESTORE_0(edx)
QUARTER_ROUND(d, s1, s2, s3, s0) QUARTER_ROUND(d, s1, s2, s3, s0)
AS2( movd esi, mm4) RESTORE_K
AS2( movd mm2, ebx) SAVE_2(ebx)
AS2( movd mm1, eax) SAVE_1(eax)
AS2( movd mm0, edi) SAVE_0(edi)
AS2( mov edi, [esi+4*4]) AS2( mov edi, [K_REG+4*4])
AS2( mov eax, [esi+5*4]) AS2( mov eax, [K_REG+5*4])
AS2( mov ebx, [esi+6*4]) AS2( mov ebx, [K_REG+6*4])
AS2( mov edx, [esi+7*4]) AS2( mov edx, [K_REG+7*4])
QUARTER_ROUND(c, t0, t1, t2, t3) QUARTER_ROUND(c, t0, t1, t2, t3)
AS2( movd ecx, mm2) RESTORE_2(ecx)
QUARTER_ROUND(c, t3, t0, t1, t2) QUARTER_ROUND(c, t3, t0, t1, t2)
AS2( movd ecx, mm1) RESTORE_1(ecx)
QUARTER_ROUND(c, t2, t3, t0, t1) QUARTER_ROUND(c, t2, t3, t0, t1)
AS2( movd ecx, mm0) RESTORE_0(ecx)
QUARTER_ROUND(c, t1, t2, t3, t0) QUARTER_ROUND(c, t1, t2, t3, t0)
AS2( movd mm2, ebx) SAVE_2(ebx)
AS2( movd mm1, eax) SAVE_1(eax)
AS2( movd mm0, edi) SAVE_0(edi)
AS2( movd esi, mm4) RESTORE_K
AS2( movd edi, mm5) RESTORE_K_END
AS2( add esi, 8*4) AS2( add K_REG, 8*4)
AS2( movd mm4, esi) SAVE_K
AS2( cmp edi, esi) AS2( cmp K_END_REG, K_REG)
ASJ( jne, 0, b) ASJ( jne, 0, b)
#undef QUARTER_ROUND #undef QUARTER_ROUND
@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#undef t2 #undef t2
#undef t3 #undef t3
AS2( mov eax, [edi+0*4]) AS2( mov eax, [K_END_REG+0*4])
AS2( mov ecx, [edi+1*4]) AS2( mov ecx, [K_END_REG+1*4])
AS2( mov esi, [edi+2*4]) AS2( mov esi, [K_END_REG+2*4])
AS2( mov edi, [edi+3*4]) AS2( mov edi, [K_END_REG+3*4])
#define QUARTER_ROUND(a, b, c, d) \ #define QUARTER_ROUND(a, b, c, d) \
AS2( movzx ebx, dl)\ AS2( movzx ebx, dl)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 3*8)\ AS2( shl ebx, 3*8)\
AS2( xor a, ebx)\ AS2( xor a, ebx)\
AS2( movzx ebx, dh)\ AS2( movzx ebx, dh)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 2*8)\ AS2( shl ebx, 2*8)\
AS2( xor b, ebx)\ AS2( xor b, ebx)\
AS2( shr edx, 16)\ AS2( shr edx, 16)\
AS2( movzx ebx, dl)\ AS2( movzx ebx, dl)\
AS2( shr edx, 8)\ AS2( shr edx, 8)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\ AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 1*8)\ AS2( shl ebx, 1*8)\
AS2( xor c, ebx)\ AS2( xor c, ebx)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\ AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
AS2( xor d, ebx) AS2( xor d, ebx)
QUARTER_ROUND(eax, ecx, esi, edi) QUARTER_ROUND(eax, ecx, esi, edi)
AS2( movd edx, mm2) RESTORE_2(edx)
QUARTER_ROUND(edi, eax, ecx, esi) QUARTER_ROUND(edi, eax, ecx, esi)
AS2( movd edx, mm1) RESTORE_1(edx)
QUARTER_ROUND(esi, edi, eax, ecx) QUARTER_ROUND(esi, edi, eax, ecx)
AS2( movd edx, mm0) RESTORE_0(edx)
QUARTER_ROUND(ecx, esi, edi, eax) QUARTER_ROUND(ecx, esi, edi, eax)
#undef QUARTER_ROUND #undef QUARTER_ROUND
AS1( pop ebp) #if CRYPTOPP_BOOL_X64
AS1( emms) AS1(popq r12)
AS1(popq r11)
AS1(popq r10)
AS1(popq K_END_REG)
AS1(popq K_REG)
#else
AS1(emms)
#endif
AS_POP( bp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
#endif
#ifdef __GNUC__ #ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;" ".att_syntax prefix;"
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
((word32 *)outBlock)[2] = t2; ((word32 *)outBlock)[2] = t2;
((word32 *)outBlock)[3] = t3; ((word32 *)outBlock)[3] = t3;
#else #else
AS2( mov ebx, xorBlock) AS2( mov WORD_REG(bx), xorBlock)
AS2( test ebx, ebx) AS2( test WORD_REG(bx), WORD_REG(bx))
ASJ( jz, 1, f) ASJ( jz, 1, f)
AS2( xor eax, [ebx+0*4]) AS2( xor eax, [WORD_REG(bx)+0*4])
AS2( xor ecx, [ebx+1*4]) AS2( xor ecx, [WORD_REG(bx)+1*4])
AS2( xor esi, [ebx+2*4]) AS2( xor esi, [WORD_REG(bx)+2*4])
AS2( xor edi, [ebx+3*4]) AS2( xor edi, [WORD_REG(bx)+3*4])
ASL(1) ASL(1)
AS2( mov ebx, outBlock) AS2( mov WORD_REG(bx), outBlock)
AS2( mov [ebx+0*4], eax) AS2( mov [WORD_REG(bx)+0*4], eax)
AS2( mov [ebx+1*4], ecx) AS2( mov [WORD_REG(bx)+1*4], ecx)
AS2( mov [ebx+2*4], esi) AS2( mov [WORD_REG(bx)+2*4], esi)
AS2( mov [ebx+3*4], edi) AS2( mov [WORD_REG(bx)+3*4], edi)
#endif #endif
} }
else else

View File

@ -130,10 +130,13 @@ public:
#endif #endif
assert(IsAlignedOn(p, 16)); assert(IsAlignedOn(p, 16));
return (T*)p; return (pointer)p;
} }
return new T[n]; pointer p;
while (!(p = (pointer)malloc(sizeof(T)*n)))
CallNewHandler();
return p;
} }
void deallocate(void *p, size_type n) void deallocate(void *p, size_type n)
@ -153,7 +156,7 @@ public:
return; return;
} }
delete [] (T *)p; free(p);
} }
pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve) pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
@ -164,13 +167,19 @@ public:
// VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a // VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a
// template class member called rebind". // template class member called rebind".
template <class U> struct rebind { typedef AllocatorWithCleanup<U, T_Align16> other; }; template <class U> struct rebind { typedef AllocatorWithCleanup<U, T_Align16> other; };
#if _MSC_VER >= 1500
AllocatorWithCleanup() {}
template <class U, bool A> AllocatorWithCleanup(const AllocatorWithCleanup<U, A> &) {}
#endif
}; };
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<byte>; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<byte>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word16>; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word16>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word32>; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word32>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word64>; CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word64>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86>; // for Integer #if CRYPTOPP_BOOL_X86
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, true>; // for Integer
#endif
template <class T> template <class T>
class NullAllocator : public AllocatorBase<T> class NullAllocator : public AllocatorBase<T>
@ -260,7 +269,7 @@ public:
size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);} size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);}
private: private:
T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(unsigned int)m_array)%16) : m_array;} T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(size_t)m_array)%16) : m_array;}
CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S]; CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S];
A m_fallbackAllocator; A m_fallbackAllocator;
@ -466,10 +475,10 @@ public:
explicit SecBlockWithHint(size_t size) : SecBlock<T, A>(size) {} explicit SecBlockWithHint(size_t size) : SecBlock<T, A>(size) {}
}; };
template<class T, class U> template<class T, bool A, class U, bool B>
inline bool operator==(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (true);} inline bool operator==(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (true);}
template<class T, class U> template<class T, bool A, class U, bool B>
inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (false);} inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (false);}
NAMESPACE_END NAMESPACE_END

21
sha.cpp
View File

@ -308,9 +308,9 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN1
W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
}; };
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version // put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
{ {
#ifdef __GNUC__ #ifdef __GNUC__
__asm__ __volatile__ __asm__ __volatile__
@ -319,6 +319,9 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
AS1( push ebx) AS1( push ebx)
AS2( mov ebx, eax) AS2( mov ebx, eax)
#else #else
AS1( push ebx)
AS1( push esi)
AS1( push edi)
AS2( lea ebx, SHA512_K) AS2( lea ebx, SHA512_K)
#endif #endif
@ -486,22 +489,30 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
AS1( pop esp) AS1( pop esp)
AS1( emms) AS1( emms)
#ifdef __GNUC__ #if defined(__GNUC__)
AS1( pop ebx) AS1( pop ebx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "a" (SHA512_K), "c" (state), "d" (data) : "a" (SHA512_K), "c" (state), "d" (data)
: "%esi", "%edi", "memory", "cc" : "%esi", "%edi", "memory", "cc"
); );
#else
AS1( pop edi)
AS1( pop esi)
AS1( pop ebx)
AS1( ret)
#endif #endif
} }
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
void SHA512::Transform(word64 *state, const word64 *data) void SHA512::Transform(word64 *state, const word64 *data)
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
if (HasSSE2()) if (HasSSE2())
return SHA512_SSE2_Transform(state, data); {
SHA512_SSE2_Transform(state, data);
return;
}
#endif #endif
#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39)) #define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))

View File

@ -189,21 +189,21 @@ template <class T> counted_ptr<T> & counted_ptr<T>::operator=(const counted_ptr<
template <class T> class vector_member_ptrs template <class T> class vector_member_ptrs
{ {
public: public:
vector_member_ptrs(unsigned int size=0) vector_member_ptrs(size_t size=0)
: m_size(size), m_ptr(new member_ptr<T>[size]) {} : m_size(size), m_ptr(new member_ptr<T>[size]) {}
~vector_member_ptrs() ~vector_member_ptrs()
{delete [] this->m_ptr;} {delete [] this->m_ptr;}
member_ptr<T>& operator[](unsigned int index) member_ptr<T>& operator[](size_t index)
{assert(index<this->m_size); return this->m_ptr[index];} {assert(index<this->m_size); return this->m_ptr[index];}
const member_ptr<T>& operator[](unsigned int index) const const member_ptr<T>& operator[](size_t index) const
{assert(index<this->m_size); return this->m_ptr[index];} {assert(index<this->m_size); return this->m_ptr[index];}
unsigned int size() const {return this->m_size;} size_t size() const {return this->m_size;}
void resize(unsigned int newSize) void resize(size_t newSize)
{ {
member_ptr<T> *newPtr = new member_ptr<T>[newSize]; member_ptr<T> *newPtr = new member_ptr<T>[newSize];
for (unsigned int i=0; i<this->m_size && i<newSize; i++) for (size_t i=0; i<this->m_size && i<newSize; i++)
newPtr[i].reset(this->m_ptr[i].release()); newPtr[i].reset(this->m_ptr[i].release());
delete [] this->m_ptr; delete [] this->m_ptr;
this->m_size = newSize; this->m_size = newSize;
@ -214,7 +214,7 @@ private:
vector_member_ptrs(const vector_member_ptrs<T> &c); // copy not allowed vector_member_ptrs(const vector_member_ptrs<T> &c); // copy not allowed
void operator=(const vector_member_ptrs<T> &x); // assignment not allowed void operator=(const vector_member_ptrs<T> &x); // assignment not allowed
unsigned int m_size; size_t m_size;
member_ptr<T> *m_ptr; member_ptr<T> *m_ptr;
}; };

View File

@ -68,6 +68,10 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
m_state[1] = b; m_state[1] = b;
m_state[2] = e; m_state[2] = e;
m_state[3] = d; m_state[3] = d;
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
} }
static word32 s_mulTables[512] = { static word32 s_mulTables[512] = {
@ -282,10 +286,8 @@ unsigned int SosemanukPolicy::GetAlignment() const
else else
#endif #endif
return 1; return 1;
#endif
} }
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int SosemanukPolicy::GetOptimalBlockSize() const unsigned int SosemanukPolicy::GetOptimalBlockSize() const
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@ -316,54 +318,54 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS1( push ebx) AS_PUSH( bx)
#else #else
word32 *state = m_state; word32 *state = m_state;
AS2( mov eax, state) AS2( mov WORD_REG(ax), state)
AS2( mov edi, output) AS2( mov WORD_REG(di), output)
AS2( mov edx, input) AS2( mov WORD_REG(dx), input)
AS2( mov ecx, iterationCount) AS2( mov WORD_REG(cx), iterationCount)
#endif #endif
#define SSE2_output DWORD PTR [esp+1*4] #define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
#define SSE2_input DWORD PTR [esp+2*4] #define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
#define SSE2_wordsLeft DWORD PTR [esp+3*4] #define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
#define SSE2_ediEnd DWORD PTR [esp+4*4] #define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
#define SSE2_pMulTables DWORD PTR [esp+5*4] #define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
#define SSE2_state DWORD PTR [esp+6*4] #define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
#define SSE2_wordsLeft2 DWORD PTR [esp+7*4] #define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
#define SSE2_stateCopy esp + 8*4 #define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
#define SSE2_uvStart SSE2_stateCopy + 12*4 #define SSE2_uvStart SSE2_stateCopy + 12*4
AS1( push ebp) AS_PUSH( bp)
AS2( mov ebx, esp) AS2( mov WORD_REG(bx), WORD_REG(sp))
AS2( and esp, 0xfffffff0) AS2( and WORD_REG(sp), -16)
AS2( sub esp, 80*4*2+12*4+8*4) // 80 v's, 80 u's, 12 state, 8 locals AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [esp], ebx) AS2( mov [WORD_REG(sp)], WORD_REG(bx))
AS2( mov SSE2_output, edi) AS2( mov SSE2_output, WORD_REG(di))
AS2( mov SSE2_input, edx) AS2( mov SSE2_input, WORD_REG(dx))
AS2( mov SSE2_state, eax) AS2( mov SSE2_state, WORD_REG(ax))
#ifndef _MSC_VER #ifndef _MSC_VER
AS2( mov SSE2_pMulTables, esi) AS2( mov SSE2_pMulTables, WORD_REG(si))
#endif #endif
AS2( lea ecx, [4*ecx+ecx]) AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
AS2( lea esi, [4*ecx]) AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
AS2( mov SSE2_wordsLeft, esi) AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( movdqa xmm0, [eax+0*16]) // copy state to stack to save a register AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
AS2( movdqa [SSE2_stateCopy+0*16], xmm0) AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
AS2( movdqa xmm0, [eax+1*16]) AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
AS2( movdqa [SSE2_stateCopy+1*16], xmm0) AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [eax+2*16]) AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0) AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
AS2( psrlq xmm0, 32) AS2( psrlq xmm0, 32)
AS2( movd ebx, xmm0) // s(9) AS2( movd ebx, xmm0) // s(9)
AS2( mov ecx, [eax+10*4]) AS2( mov ecx, [WORD_REG(ax)+10*4])
AS2( mov edx, [eax+11*4]) AS2( mov edx, [WORD_REG(ax)+11*4])
AS2( pcmpeqb xmm7, xmm7) // all ones AS2( pcmpeqb xmm7, xmm7) // all ones
#define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4 #define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4
#define u(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 #define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
#define v(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 #define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
#define r10 ecx #define r10 ecx
#define r11 edx #define r11 edx
@ -371,42 +373,42 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#define r21 ecx #define r21 ecx
#define SSE2_STEP(i, j) \ #define SSE2_STEP(i, j) \
AS2( mov eax, [s(i+3)])\
AS2( mov ebp, 1)\
AS2( and ebp, r1##j)\
AS1( neg ebp)\
AS2( and ebp, [s(i+8)])\
AS2( xor ebp, [s(i+1)])\
AS2( add r2##j, ebp)\
AS2( movzx ebp, al)\
AS2( shr eax, 8)\
AS2( xor eax, [esi+1024+ebp*4])\
AS2( lea ebp, [ebx + r2##j])\
AS2( xor ebx, eax)\
AS2( imul r1##j, 0x54655307)\
AS2( mov eax, [s(i+0)])\ AS2( mov eax, [s(i+0)])\
AS2( mov [v(i)], eax)\ AS2( mov [v(i)], eax)\
AS2( rol eax, 8)\ AS2( rol eax, 8)\
AS2( xor ebx, eax)\ AS2( lea ebp, [ebx + r2##j])\
AS2( movzx eax, al)\
AS2( rol r1##j, 7)\
AS2( xor ebx, [esi+eax*4])\
AS2( xor ebp, r1##j)\ AS2( xor ebp, r1##j)\
AS2( mov [u(i)], ebp)\ AS2( mov [u(i)], ebp)\
AS2( mov ebp, 1)\
AS2( and ebp, r2##j)\
AS1( neg ebp)\
AS2( and ebp, ebx)\
AS2( xor ebx, eax)\
AS2( movzx eax, al)\
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
AS2( mov eax, [s(i+3)])\
AS2( xor ebp, [s(i+2)])\
AS2( add r1##j, ebp)\
AS2( movzx ebp, al)\
AS2( shr eax, 8)\
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
AS2( xor ebx, eax)\
AS2( imul r2##j, 0x54655307)\
AS2( rol r2##j, 7)\
AS2( mov [s(i+0)], ebx)\ AS2( mov [s(i+0)], ebx)\
ASL(2) // outer loop, each iteration of this processes 80 words ASL(2) // outer loop, each iteration of this processes 80 words
AS2( lea edi, [SSE2_uvStart]) // start of v and u AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov eax, 80) AS2( mov WORD_REG(ax), 80)
AS2( cmp esi, 80) AS2( cmp WORD_REG(si), 80)
AS2( cmovg esi, eax) AS2( cmovg WORD_REG(si), WORD_REG(ax))
AS2( mov SSE2_wordsLeft2, esi) AS2( mov SSE2_wordsLeft2, WORD_REG(si))
AS2( lea esi, [edi+esi]) // use to first inner loop AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
AS2( mov SSE2_ediEnd, esi) AS2( mov SSE2_diEnd, WORD_REG(si))
#ifdef _MSC_VER #ifdef _MSC_VER
AS2( lea esi, s_mulTables) AS2( lea WORD_REG(si), s_mulTables)
#else #else
AS2( mov esi, SSE2_pMulTables) AS2( mov WORD_REG(si), SSE2_pMulTables)
#endif #endif
ASL(0) // first inner loop, 20 words each, 4 iterations ASL(0) // first inner loop, 20 words each, 4 iterations
@ -431,20 +433,20 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
SSE2_STEP(18, 0) SSE2_STEP(18, 0)
SSE2_STEP(19, 1) SSE2_STEP(19, 1)
// loop // loop
AS2( add edi, 5*4) AS2( add WORD_REG(di), 5*4)
AS2( cmp edi, SSE2_ediEnd) AS2( cmp WORD_REG(di), SSE2_diEnd)
ASJ( jne, 0, b) ASJ( jne, 0, b)
AS2( mov eax, SSE2_input) AS2( mov WORD_REG(ax), SSE2_input)
AS2( mov ebp, SSE2_output) AS2( mov WORD_REG(bp), SSE2_output)
AS2( lea edi, [SSE2_uvStart]) // start of v and u AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov esi, SSE2_wordsLeft2) AS2( mov WORD_REG(si), SSE2_wordsLeft2)
ASL(1) // second inner loop, 16 words each, 5 iterations ASL(1) // second inner loop, 16 words each, 5 iterations
AS2( movdqa xmm0, [edi+0*20*4]) AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
AS2( movdqa xmm1, [edi+1*20*4]) AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
AS2( movdqa xmm2, [edi+2*20*4]) AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
AS2( movdqa xmm3, [edi+3*20*4]) AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
// S2 // S2
AS2( movdqa xmm4, xmm0) AS2( movdqa xmm4, xmm0)
AS2( pand xmm0, xmm2) AS2( pand xmm0, xmm2)
@ -463,13 +465,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( pxor xmm1, xmm4) AS2( pxor xmm1, xmm4)
AS2( pxor xmm4, xmm7) AS2( pxor xmm4, xmm7)
// xor with v // xor with v
AS2( pxor xmm2, [edi+80*4]) AS2( pxor xmm2, [WORD_REG(di)+80*4])
AS2( pxor xmm3, [edi+80*5]) AS2( pxor xmm3, [WORD_REG(di)+80*5])
AS2( pxor xmm1, [edi+80*6]) AS2( pxor xmm1, [WORD_REG(di)+80*6])
AS2( pxor xmm4, [edi+80*7]) AS2( pxor xmm4, [WORD_REG(di)+80*7])
// exit loop early if less than 16 words left to output // exit loop early if less than 16 words left to output
// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop // this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
AS2( cmp esi, 16) AS2( cmp WORD_REG(si), 16)
ASJ( jl, 4, f) ASJ( jl, 4, f)
// unpack // unpack
AS2( movdqa xmm6, xmm2) AS2( movdqa xmm6, xmm2)
@ -485,75 +487,75 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( punpcklqdq xmm6, xmm5) AS2( punpcklqdq xmm6, xmm5)
AS2( punpckhqdq xmm3, xmm5) AS2( punpckhqdq xmm3, xmm5)
// output keystream // output keystream
AS2( test eax, eax) AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 3, f) ASJ( jz, 3, f)
AS2( test eax, 0xf) AS2( test eax, 0xf)
ASJ( jnz, 7, f) ASJ( jnz, 7, f)
AS2( pxor xmm2, [eax+0*16]) AS2( pxor xmm2, [WORD_REG(ax)+0*16])
AS2( pxor xmm0, [eax+1*16]) AS2( pxor xmm0, [WORD_REG(ax)+1*16])
AS2( pxor xmm6, [eax+2*16]) AS2( pxor xmm6, [WORD_REG(ax)+2*16])
AS2( pxor xmm3, [eax+3*16]) AS2( pxor xmm3, [WORD_REG(ax)+3*16])
AS2( add eax, 4*16) AS2( add WORD_REG(ax), 4*16)
ASJ( jmp, 3, f) ASJ( jmp, 3, f)
ASL(7) ASL(7)
AS2( movdqu xmm1, [eax+0*16]) AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
AS2( pxor xmm2, xmm1) AS2( pxor xmm2, xmm1)
AS2( movdqu xmm1, [eax+1*16]) AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
AS2( pxor xmm0, xmm1) AS2( pxor xmm0, xmm1)
AS2( movdqu xmm1, [eax+2*16]) AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
AS2( pxor xmm6, xmm1) AS2( pxor xmm6, xmm1)
AS2( movdqu xmm1, [eax+3*16]) AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
AS2( pxor xmm3, xmm1) AS2( pxor xmm3, xmm1)
AS2( add eax, 4*16) AS2( add WORD_REG(ax), 4*16)
ASL(3) ASL(3)
AS2( test ebp, 0xf) AS2( test ebp, 0xf)
ASJ( jnz, 8, f) ASJ( jnz, 8, f)
AS2( movdqa [ebp+0*16], xmm2) AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
AS2( movdqa [ebp+1*16], xmm0) AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
AS2( movdqa [ebp+2*16], xmm6) AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
AS2( movdqa [ebp+3*16], xmm3) AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
ASJ( jmp, 9, f) ASJ( jmp, 9, f)
ASL(8) ASL(8)
AS2( movdqu [ebp+0*16], xmm2) AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
AS2( movdqu [ebp+1*16], xmm0) AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
AS2( movdqu [ebp+2*16], xmm6) AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
AS2( movdqu [ebp+3*16], xmm3) AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
ASL(9) ASL(9)
// loop // loop
AS2( add edi, 4*4) AS2( add WORD_REG(di), 4*4)
AS2( add ebp, 4*16) AS2( add WORD_REG(bp), 4*16)
AS2( sub esi, 16) AS2( sub WORD_REG(si), 16)
ASJ( jnz, 1, b) ASJ( jnz, 1, b)
// outer loop // outer loop
AS2( mov esi, SSE2_wordsLeft) AS2( mov WORD_REG(si), SSE2_wordsLeft)
AS2( sub esi, 80) AS2( sub WORD_REG(si), 80)
ASJ( jz, 6, f) ASJ( jz, 6, f)
AS2( mov SSE2_wordsLeft, esi) AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( mov SSE2_input, eax) AS2( mov SSE2_input, WORD_REG(ax))
AS2( mov SSE2_output, ebp) AS2( mov SSE2_output, WORD_REG(bp))
ASJ( jmp, 2, b) ASJ( jmp, 2, b)
ASL(4) // final output of less than 16 words ASL(4) // final output of less than 16 words
AS2( test eax, eax) AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 5, f) ASJ( jz, 5, f)
AS2( movd xmm0, [eax+0*4]) AS2( movd xmm0, [WORD_REG(ax)+0*4])
AS2( pxor xmm2, xmm0) AS2( pxor xmm2, xmm0)
AS2( movd xmm0, [eax+1*4]) AS2( movd xmm0, [WORD_REG(ax)+1*4])
AS2( pxor xmm3, xmm0) AS2( pxor xmm3, xmm0)
AS2( movd xmm0, [eax+2*4]) AS2( movd xmm0, [WORD_REG(ax)+2*4])
AS2( pxor xmm1, xmm0) AS2( pxor xmm1, xmm0)
AS2( movd xmm0, [eax+3*4]) AS2( movd xmm0, [WORD_REG(ax)+3*4])
AS2( pxor xmm4, xmm0) AS2( pxor xmm4, xmm0)
AS2( add eax, 16) AS2( add WORD_REG(ax), 16)
ASL(5) ASL(5)
AS2( movd [ebp+0*4], xmm2) AS2( movd [WORD_REG(bp)+0*4], xmm2)
AS2( movd [ebp+1*4], xmm3) AS2( movd [WORD_REG(bp)+1*4], xmm3)
AS2( movd [ebp+2*4], xmm1) AS2( movd [WORD_REG(bp)+2*4], xmm1)
AS2( movd [ebp+3*4], xmm4) AS2( movd [WORD_REG(bp)+3*4], xmm4)
AS2( sub esi, 4) AS2( sub WORD_REG(si), 4)
ASJ( jz, 6, f) ASJ( jz, 6, f)
AS2( add ebp, 16) AS2( add WORD_REG(bp), 16)
AS2( psrldq xmm2, 4) AS2( psrldq xmm2, 4)
AS2( psrldq xmm3, 4) AS2( psrldq xmm3, 4)
AS2( psrldq xmm1, 4) AS2( psrldq xmm1, 4)
@ -561,21 +563,21 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jmp, 4, b) ASJ( jmp, 4, b)
ASL(6) // save state ASL(6) // save state
AS2( mov ebx, SSE2_state) AS2( mov WORD_REG(bx), SSE2_state)
AS2( movdqa xmm0, [SSE2_stateCopy+0*16]) AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
AS2( movdqa [ebx+0*16], xmm0) AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
AS2( movdqa xmm0, [SSE2_stateCopy+1*16]) AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
AS2( movdqa [ebx+1*16], xmm0) AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16]) AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
AS2( movq QWORD PTR [ebx+2*16], xmm0) AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
AS2( mov [ebx+10*4], ecx) AS2( mov [WORD_REG(bx)+10*4], ecx)
AS2( mov [ebx+11*4], edx) AS2( mov [WORD_REG(bx)+11*4], edx)
AS1( pop esp) AS_POP( sp)
AS1( pop ebp) AS_POP( bp)
#ifdef __GNUC__ #ifdef __GNUC__
AS1( pop ebx) AS_POP( bx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
@ -593,17 +595,16 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#endif #endif
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)]) #define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
#define r1(i) ((i%2) ? reg2 : reg1) #define r1(i) ((i%2) ? reg2 : reg1)
#define r2(i) ((i%2) ? reg1 : reg2) #define r2(i) ((i%2) ? reg1 : reg2)
#define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \ #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \
r2(x0) += XMUX(r1(x0), s##x1, s##x8);\
r1(x0) = rotlFixed(r1(x0) * 0x54655307, 7);\
v = s##x0;\
u = (s##x9 + r2(x0)) ^ r1(x0);\ u = (s##x9 + r2(x0)) ^ r1(x0);\
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9; v = s##x0;\
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
#define SOSEMANUK_OUTPUT(x) \ #define SOSEMANUK_OUTPUT(x) \
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\

View File

@ -34,7 +34,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
void Tiger::Transform (word64 *digest, const word64 *X) void Tiger::Transform (word64 *digest, const word64 *X)
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
if (HasSSE2()) if (HasSSE2())
{ {
#ifdef __GNUC__ #ifdef __GNUC__
@ -43,9 +43,14 @@ void Tiger::Transform (word64 *digest, const word64 *X)
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS1( push ebx) AS1( push ebx)
#else #else
#if _MSC_VER < 1300
const word64 *t = table;
AS2( mov edx, t)
#else
AS2( lea edx, [table])
#endif
AS2( mov eax, digest) AS2( mov eax, digest)
AS2( mov esi, X) AS2( mov esi, X)
AS2( lea edx, [table])
#endif #endif
AS2( movq mm0, [eax]) AS2( movq mm0, [eax])
AS2( movq mm1, [eax+1*8]) AS2( movq mm1, [eax+1*8])

View File

@ -390,7 +390,7 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTIO
// Whirlpool basic transformation. Transforms state based on block. // Whirlpool basic transformation. Transforms state based on block.
void Whirlpool::Transform(word64 *digest, const word64 *block) void Whirlpool::Transform(word64 *digest, const word64 *block)
{ {
#ifdef CRYPTOPP_X86_ASM_AVAILABLE #if defined(CRYPTOPP_X86_ASM_AVAILABLE)
if (HasMMX()) if (HasMMX())
{ {
// MMX version has the same structure as C version below // MMX version has the same structure as C version below
@ -398,26 +398,29 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS1( push ebx) AS_PUSH( bx)
AS2( mov ebx, eax) AS2( mov WORD_REG(bx), WORD_REG(ax))
#else #else
AS2( lea ebx, [Whirlpool_C]) #if _MSC_VER < 1300
AS2( mov ecx, digest) AS_PUSH( bx)
AS2( mov edx, block) #endif
AS2( lea WORD_REG(bx), [Whirlpool_C])
AS2( mov WORD_REG(cx), digest)
AS2( mov WORD_REG(dx), block)
#endif #endif
AS2( mov eax, esp) AS2( mov WORD_REG(ax), WORD_REG(sp))
AS2( and esp, 0xfffffff0) AS2( and WORD_REG(sp), -16)
AS2( sub esp, 16*8) AS2( sub WORD_REG(sp), 16*8)
AS1( push eax) AS_PUSH( ax)
AS2( xor esi, esi) AS2( xor esi, esi)
ASL(0) ASL(0)
AS2( movq mm0, [ecx+8*esi]) AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)])
AS2( movq [esp+4+8*esi], mm0) // k AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k
AS2( pxor mm0, [edx+8*esi]) AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)])
AS2( movq [esp+4+64+8*esi], mm0) // s AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s
AS2( movq [ecx+8*esi], mm0) AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0)
AS1( inc esi) AS1( inc WORD_REG(si))
AS2( cmp esi, 8) AS2( cmp WORD_REG(si), 8)
ASJ( jne, 0, b) ASJ( jne, 0, b)
AS2( xor esi, esi) AS2( xor esi, esi)
@ -427,16 +430,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#define KSL1(a, b) AS2(pxor mm##a, b) #define KSL1(a, b) AS2(pxor mm##a, b)
#define KSL(op, i, a, b, c, d) \ #define KSL(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+8*i])\ AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSL##op(a, [ebx+3*2048+8*edi])\ KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSL##op(b, [ebx+2*2048+8*edi])\ KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSL##op(c, [ebx+1*2048+8*edi])\ KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [ebx+0*2048+8*eax]) KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define KSH0(a, b) \ #define KSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
@ -445,57 +448,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define KSH2(a, b) \ #define KSH2(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(movq [esp+4+8*a], mm##a) AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a)
#define KSH(op, i, a, b, c, d) \ #define KSH(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\ AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSH##op(a, [ebx+3*2048+8*edi])\ KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSH##op(b, [ebx+2*2048+8*edi])\ KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSH##op(c, [ebx+1*2048+8*edi])\ KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSH##op(d, [ebx+0*2048+8*eax]) KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define TSL(op, i, a, b, c, d) \ #define TSL(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+64+8*i])\ AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSL##op(a, [ebx+3*2048+8*edi])\ KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSL##op(b, [ebx+2*2048+8*edi])\ KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSL##op(c, [ebx+1*2048+8*edi])\ KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [ebx+0*2048+8*eax]) KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define TSH0(a, b) \ #define TSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
AS2(pxor mm##a, [esp+4+8*a])\ AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define TSH1(a, b) \ #define TSH1(a, b) \
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define TSH2(a, b) \ #define TSH2(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(movq [esp+4+64+8*a], mm##a) AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a)
#define TSH3(a, b) \ #define TSH3(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(pxor mm##a, [ecx+8*a])\ AS2(pxor mm##a, [WORD_REG(cx)+8*a])\
AS2(movq [ecx+8*a], mm##a) AS2(movq [WORD_REG(cx)+8*a], mm##a)
#define TSH(op, i, a, b, c, d) \ #define TSH(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\ AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
TSH##op(a, [ebx+3*2048+8*edi])\ TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
TSH##op(b, [ebx+2*2048+8*edi])\ TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
TSH##op(c, [ebx+1*2048+8*edi])\ TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
TSH##op(d, [ebx+0*2048+8*eax]) TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
KSL(0, 4, 3, 2, 1, 0) KSL(0, 4, 3, 2, 1, 0)
KSL(0, 0, 7, 6, 5, 4) KSL(0, 0, 7, 6, 5, 4)
@ -514,8 +517,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
KSH(2, 3, 2, 1, 0, 7) KSH(2, 3, 2, 1, 0, 7)
KSH(2, 7, 6, 5, 4, 3) KSH(2, 7, 6, 5, 4, 3)
AS2( pxor mm0, [ebx + 8*1024 + esi*8]) AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8])
AS2( movq [esp+4], mm0) AS2( movq [WORD_REG(sp)+WORD_SZ], mm0)
TSL(0, 4, 3, 2, 1, 0) TSL(0, 4, 3, 2, 1, 0)
TSL(0, 0, 7, 6, 5, 4) TSL(0, 0, 7, 6, 5, 4)
@ -532,8 +535,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
TSH(1, 5, 4, 3, 2, 1) TSH(1, 5, 4, 3, 2, 1)
TSH(1, 6, 5, 4, 3, 2) TSH(1, 6, 5, 4, 3, 2)
AS1( inc esi) AS1( inc WORD_REG(si))
AS2( cmp esi, 10) AS2( cmp WORD_REG(si), 10)
ASJ( je, 2, f) ASJ( je, 2, f)
TSH(2, 3, 2, 1, 0, 7) TSH(2, 3, 2, 1, 0, 7)
@ -550,11 +553,13 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#undef TSL #undef TSL
#undef TSH #undef TSH
AS_POP( sp)
AS1( emms) AS1( emms)
AS1( pop esp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
#endif
#ifdef __GNUC__ #ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "a" (Whirlpool_C), "c" (digest), "d" (block) : "a" (Whirlpool_C), "c" (digest), "d" (block)

View File

@ -3,78 +3,53 @@ PUBLIC Baseline_Sub
.CODE .CODE
ALIGN 8 ALIGN 8
Baseline_Add PROC Baseline_Add PROC
lea rdx, [rdx+8*rcx] lea rdx, [rdx+8*rcx]
lea r8, [r8+8*rcx] lea r8, [r8+8*rcx]
lea r9, [r9+8*rcx] lea r9, [r9+8*rcx]
neg rcx ; rcx is negative index neg rcx ; rcx is negative index
test rcx, 2 ; this clears carry flag jz $1@Baseline_Add
jz $0@Baseline_Add
sub rcx, 2
jmp $1@Baseline_Add
$0@Baseline_Add:
jrcxz $2@Baseline_Add ; loop until rcx overflows and becomes zero
mov rax,[r8+8*rcx] mov rax,[r8+8*rcx]
adc rax,[r9+8*rcx] add rax,[r9+8*rcx]
mov [rdx+8*rcx],rax mov [rdx+8*rcx],rax
$0@Baseline_Add:
mov rax,[r8+8*rcx+8] mov rax,[r8+8*rcx+8]
adc rax,[r9+8*rcx+8] adc rax,[r9+8*rcx+8]
mov [rdx+8*rcx+8],rax mov [rdx+8*rcx+8],rax
$1@Baseline_Add: lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
mov rax,[r8+8*rcx+16] jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero
adc rax,[r9+8*rcx+16] mov rax,[r8+8*rcx]
mov [rdx+8*rcx+16],rax adc rax,[r9+8*rcx]
mov rax,[r8+8*rcx+24] mov [rdx+8*rcx],rax
adc rax,[r9+8*rcx+24]
mov [rdx+8*rcx+24],rax
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
jmp $0@Baseline_Add jmp $0@Baseline_Add
$1@Baseline_Add:
$2@Baseline_Add:
mov rax, 0 mov rax, 0
setc al ; store carry into rax (return result register) adc rax, rax ; store carry into rax (return result register)
ret ret
Baseline_Add ENDP Baseline_Add ENDP
ALIGN 8 ALIGN 8
Baseline_Sub PROC Baseline_Sub PROC
lea rdx, [rdx+8*rcx] lea rdx, [rdx+8*rcx]
lea r8, [r8+8*rcx] lea r8, [r8+8*rcx]
lea r9, [r9+8*rcx] lea r9, [r9+8*rcx]
neg rcx ; rcx is negative index neg rcx ; rcx is negative index
test rcx, 2 ; this clears carry flag jz $1@Baseline_Sub
jz $0@Baseline_Sub
sub rcx, 2
jmp $1@Baseline_Sub
$0@Baseline_Sub:
jrcxz $2@Baseline_Sub ; loop until rcx overflows and becomes zero
mov rax,[r8+8*rcx] mov rax,[r8+8*rcx]
sbb rax,[r9+8*rcx] sub rax,[r9+8*rcx]
mov [rdx+8*rcx],rax mov [rdx+8*rcx],rax
$0@Baseline_Sub:
mov rax,[r8+8*rcx+8] mov rax,[r8+8*rcx+8]
sbb rax,[r9+8*rcx+8] sbb rax,[r9+8*rcx+8]
mov [rdx+8*rcx+8],rax mov [rdx+8*rcx+8],rax
$1@Baseline_Sub: lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
mov rax,[r8+8*rcx+16] jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero
sbb rax,[r9+8*rcx+16] mov rax,[r8+8*rcx]
mov [rdx+8*rcx+16],rax sbb rax,[r9+8*rcx]
mov rax,[r8+8*rcx+24] mov [rdx+8*rcx],rax
sbb rax,[r9+8*rcx+24]
mov [rdx+8*rcx+24],rax
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
jmp $0@Baseline_Sub jmp $0@Baseline_Sub
$1@Baseline_Sub:
$2@Baseline_Sub:
mov rax, 0 mov rax, 0
setc al ; store carry into rax (return result register) adc rax, rax ; store carry into rax (return result register)
ret ret
Baseline_Sub ENDP Baseline_Sub ENDP