fix compile for x64, DLL and VC 6

pull/2/head
weidai 2007-05-04 15:24:09 +00:00
parent 460c2d6c6a
commit d2510f30c7
14 changed files with 810 additions and 553 deletions

@ -228,7 +228,7 @@ void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBloc
SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1))
SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3))
for (unsigned int i = m_rounds-1; i > 0; --i)
for (i = m_rounds-1; i > 0; --i)
{
DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3))
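The loop-counter change above is presumably a VC 6 accommodation: under the old, pre-standard for-scope rules a counter declared in an earlier loop is still visible here, so a second declaration fails, while the loop still needs some declaration to exist. A minimal stand-alone illustration of the pattern (hypothetical code, not from camellia.cpp):

void two_loops(unsigned int rounds)
{
	unsigned int i;                  // one declaration, visible to both loops
	for (i = 0; i < rounds; ++i) {}  // older MSVC leaks i past this loop anyway
	for (i = rounds; i > 0; --i) {}  // so the second loop simply reuses it
}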

cpu.cpp

@ -1,8 +1,10 @@
// cpu.cpp - written and placed in the public domain by Wei Dai
#include "pch.h"
#include "cpu.h"
#ifndef CRYPTOPP_IMPORTS
#include "cpu.h"
#include "misc.h"
#include <algorithm>
@ -11,10 +13,15 @@
#include <setjmp.h>
#endif
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
#ifndef _MSC_VER
typedef void (*SigHandler)(int);
static jmp_buf s_jmpNoCPUID;
@ -22,6 +29,7 @@ static void SigIllHandlerCPUID(int)
{
longjmp(s_jmpNoCPUID, 1);
}
#endif
bool CpuId(word32 input, word32 *output)
{
@ -57,7 +65,11 @@ bool CpuId(word32 input, word32 *output)
__asm__
(
// save ebx in case -fPIC is being used
#if CRYPTOPP_BOOL_X86
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
#else
"pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
#endif
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
: "a" (input)
);
@ -84,22 +96,19 @@ bool CpuId(word32 input, word32 *output)
return true;
}
inline bool TrySSE2()
{
return true;
}
#endif
#ifdef CRYPTOPP_CPUID_AVAILABLE
static bool TrySSE2()
{
#ifdef _MSC_VER
#if CRYPTOPP_BOOL_X64
return true;
#elif defined(_MSC_VER)
__try
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
__asm por xmm0, xmm0 // executing SSE2 instruction
AS2(por xmm0, xmm0) // executing SSE2 instruction
#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
__m128i x = _mm_setzero_si128();
return _mm_cvtsi128_si32(x) == 0;
@ -137,7 +146,7 @@ static bool TrySSE2()
bool g_x86DetectionDone = false;
bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false;
int g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
void DetectX86Features()
{
@ -170,3 +179,5 @@ void DetectX86Features()
#endif
NAMESPACE_END
#endif
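For reference, the pattern the new CpuId asm follows, as a stand-alone sketch (GCC inline asm, x86/x64 only): cpuid clobbers ebx/rbx, which is the PIC base register under -fPIC, so the register is saved around the instruction and its value handed back through edi instead.

#include <stdint.h>

static void cpuid_sketch(uint32_t leaf, uint32_t out[4])
{
#if defined(__x86_64__)
	__asm__ ("pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
		: "=a" (out[0]), "=D" (out[1]), "=c" (out[2]), "=d" (out[3])
		: "a" (leaf));
#else
	__asm__ ("push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
		: "=a" (out[0]), "=D" (out[1]), "=c" (out[2]), "=d" (out[3])
		: "a" (leaf));
#endif
}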

cpu.h

@ -3,6 +3,10 @@
#include "config.h"
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
@ -10,12 +14,15 @@ NAMESPACE_BEGIN(CryptoPP)
#define CRYPTOPP_CPUID_AVAILABLE
// these should not be used directly
extern bool g_x86DetectionDone;
extern bool g_hasSSE2, g_hasMMX, g_hasSSSE3, g_isP4;
extern int g_cacheLineSize;
void DetectX86Features();
extern CRYPTOPP_DLL bool g_x86DetectionDone;
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_isP4;
extern CRYPTOPP_DLL word32 g_cacheLineSize;
CRYPTOPP_DLL void DetectX86Features();
bool CpuId(word32 input, word32 *output);
CRYPTOPP_DLL bool CpuId(word32 input, word32 *output);
#if CRYPTOPP_BOOL_X64
inline bool HasSSE2() {return true;}
@ -94,6 +101,7 @@ inline bool HasMMX() {return false;}
#define ASL(x) GNU_ASL(x)
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#else
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
@ -102,11 +110,26 @@ inline bool HasMMX() {return false;}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#endif
// GNU assembler doesn't seem to have mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
#if CRYPTOPP_BOOL_X86
#define WORD_SZ 4
#define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR
#define AS_PUSH(x) AS1(push e##x)
#define AS_POP(x) AS1(pop e##x)
#elif CRYPTOPP_BOOL_X64
#define WORD_SZ 8
#define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR
#define AS_PUSH(x) AS1(pushq r##x)
#define AS_POP(x) AS1(popq r##x)
#endif
NAMESPACE_END
#endif
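A toy demonstration (not Crypto++ code) of the idea behind the new WORD_SZ/WORD_REG/AS_PUSH macros: one spelling of the assembly selects 32-bit registers and 4-byte words on x86, 64-bit registers and 8-byte words on x64. The real macros token-paste bare identifiers for the assembler; this sketch stringizes them so it can be compiled and printed.

#include <cstdio>

#define DEMO_X64 1
#if DEMO_X64
	#define DEMO_WORD_REG(x) "r" #x
	#define DEMO_WORD_SZ 8
#else
	#define DEMO_WORD_REG(x) "e" #x
	#define DEMO_WORD_SZ 4
#endif

int main()
{
	std::printf("%s, %d-byte words\n", DEMO_WORD_REG(si), DEMO_WORD_SZ); // "rsi, 8-byte words"
	return 0;
}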

@ -5,14 +5,14 @@
#include "randpool.h"
#include "files.h"
#include "trunhash.h"
#include "queue.h"
#include "validate.h"
#include <iostream>
#include <memory>
USING_NAMESPACE(CryptoPP)
USING_NAMESPACE(std)
RandomPool & GlobalRNG();
typedef std::map<std::string, std::string> TestData;
class TestFailure : public Exception
@ -67,7 +67,7 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
s1 = s1.substr(s1.find(' ')+1);
}
s2.clear();
s2 = ""; // MSVC 6 doesn't have clear();
if (s1[0] == '\"')
{
@ -85,8 +85,13 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
s1 = s1.substr(STDMIN(s1.find(' '), s1.length()));
}
ByteQueue q;
while (repeat--)
target.Put((const byte *)s2.data(), s2.size());
{
q.Put((const byte *)s2.data(), s2.size());
if (q.MaxRetrievable() > 4*1024 || repeat == 0)
q.TransferTo(target);
}
}
}
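The rewritten repeat loop above batches the decoded test vector in a ByteQueue and flushes it to the target in roughly 4 KB chunks instead of issuing one Put per repetition. The same shape in a stand-alone sketch, with std::string standing in for the queue and the sink:

#include <string>

void PutRepeated(std::string &target, const std::string &s2, unsigned int repeat)
{
	std::string q;                              // stand-in for the ByteQueue
	while (repeat--)
	{
		q += s2;
		if (q.size() > 4*1024 || repeat == 0)   // flush in ~4 KB chunks, and at the end
		{
			target += q;
			q.clear();
		}
	}
}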

@ -18,7 +18,7 @@
#include <iostream>
#if defined(_MSC_VER) && _MSC_VER >= 1400
#if _MSC_VER >= 1400
#include <intrin.h>
#endif
@ -30,6 +30,8 @@
#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
#endif
#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
NAMESPACE_BEGIN(CryptoPP)
bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
@ -99,7 +101,36 @@ static word AtomicInverseModPower2(word A)
// ********************************************************
#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64
#define Declare2Words(x) word x##0, x##1;
#define AssignWord(a, b) a##0 = b; a##1 = 0;
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
#define LowWord(a) a##0
#define HighWord(a) a##1
#ifdef _MSC_VER
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
#define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
#elif defined(__DECCXX)
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
#elif CRYPTOPP_BOOL_X64
#define MultiplyWords(p, a, b) asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc");
#define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
#define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
#define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
#define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
#define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
#endif
#ifndef Double3Words
#define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
#endif
#ifndef Acc2WordsBy2
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
#endif
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
#define GetCarry(u) u##1
#define GetBorrow(u) u##1
#else
#define Declare2Words(x) dword x;
#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
#define MultiplyWords(p, a, b) p = __emulu(a, b);
@ -108,34 +139,23 @@ static word AtomicInverseModPower2(word A)
#endif
#define AssignWord(a, b) a = b;
#define Add2WordsBy1(a, b, c) a = b + c;
#define Acc2WordsBy1(a, b) a += b;
#define Acc2WordsBy2(a, b) a += b;
#define LowWord(a) (word)a
#define HighWord(a) (word)(a>>WORD_BITS)
#define Double2Words(a) a += a;
#define LowWord(a) word(a)
#define HighWord(a) word(a>>WORD_BITS)
#define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
#define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
#define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
#define GetCarry(u) HighWord(u)
#define GetBorrow(u) word(u>>(WORD_BITS*2-1))
#else
#define Declare2Words(x) word x##0, x##1;
#define AssignWord(a, b) a##0 = b; a##1 = 0;
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
#endif
#ifndef MulAcc
#define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
#endif
#ifndef Acc2WordsBy1
#define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
#define LowWord(a) a##0
#define HighWord(a) a##1
#ifdef _MSC_VER
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
#define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
#elif defined(__DECCXX)
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
#define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
#endif
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
#define GetCarry(u) u##1
#define GetBorrow(u) u##1
#endif
#ifndef Acc3WordsBy2
#define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
#endif
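The block above gives the x64 and DLL builds a double word emulated as two single words (Declare2Words expands to x##0/x##1) when no native dword type is usable. A minimal self-checking sketch of that representation and the Acc2WordsBy1-style carry handling; uint32_t words are used here only so the product can be verified against uint64_t:

#include <stdint.h>
#include <cassert>

typedef uint32_t word;

struct DoubleWord { word lo, hi; };             // Declare2Words(x): x##0, x##1

DoubleWord MultiplyWords(word a, word b)        // stand-in for _umul128 / mulq
{
	uint64_t p = (uint64_t)a * b;
	DoubleWord r = { (word)p, (word)(p >> 32) };
	return r;
}

void Acc2WordsBy1(DoubleWord &a, word b)        // a += b, carry into the high word
{
	a.lo += b;
	a.hi += (a.lo < b);
}

int main()
{
	DoubleWord p = MultiplyWords(0xFFFFFFFF, 0xFFFFFFFF);
	Acc2WordsBy1(p, 0xFFFFFFFF);
	// (2^32-1)^2 + (2^32-1) = 2^64 - 2^32, i.e. hi = 0xFFFFFFFF, lo = 0
	assert(p.lo == 0 && p.hi == 0xFFFFFFFF);
	return 0;
}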
class DWord
@ -411,9 +431,8 @@ inline word DWord::operator%(word a)
// use some tricks to share assembly code between MSVC and GCC
#if defined(__GNUC__)
#define CRYPTOPP_NAKED
#define AddPrologue \
word32 result; \
word result; \
__asm__ __volatile__ \
( \
".intel_syntax noprefix;"
@ -454,7 +473,6 @@ inline word DWord::operator%(word a)
: "memory", "cc" \
);
#else
#define CRYPTOPP_NAKED __declspec(naked)
#define AddPrologue \
__asm push edi \
__asm push esi \
@ -464,33 +482,107 @@ inline word DWord::operator%(word a)
__asm pop esi \
__asm pop edi \
__asm ret 8
#if _MSC_VER < 1300
#define SaveEBX __asm push ebx
#define RestoreEBX __asm pop ebx
#else
#define SaveEBX
#define RestoreEBX
#endif
#define SquPrologue \
AS2( mov eax, A) \
AS2( mov ecx, C) \
SaveEBX \
AS2( lea ebx, s_maskLow16)
#define SquEpilogue
#define MulPrologue \
AS2( mov eax, A) \
AS2( mov edi, B) \
AS2( mov ecx, C) \
SaveEBX \
AS2( lea ebx, s_maskLow16)
#define MulEpilogue
#define TopPrologue \
AS2( mov eax, A) \
AS2( mov edi, B) \
AS2( mov ecx, C) \
AS2( mov esi, L) \
SaveEBX \
AS2( lea ebx, s_maskLow16)
#define TopEpilogue
#define SquEpilogue RestoreEBX
#define MulEpilogue RestoreEBX
#define TopEpilogue RestoreEBX
#endif
#if defined(_MSC_VER) && defined(_M_X64)
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
word Baseline_Add(size_t N, word *C, const word *A, const word *B);
word Baseline_Sub(size_t N, word *C, const word *A, const word *B);
}
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE)
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
word Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
word result;
__asm__ __volatile__
(
".intel_syntax;"
AS1( neg %1)
ASJ( jz, 1, f)
AS2( mov %0,[%3+8*%1])
AS2( add %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASL(0)
AS2( mov %0,[%3+8*%1+8])
AS2( adc %0,[%4+8*%1+8])
AS2( mov [%2+8*%1+8],%0)
AS2( lea %1,[%1+2])
ASJ( jrcxz, 1, f)
AS2( mov %0,[%3+8*%1])
AS2( adc %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASJ( jmp, 0, b)
ASL(1)
AS2( mov %0, 0)
AS2( adc %0, %0)
".att_syntax;"
: "=&r" (result)
: "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
: "memory", "cc"
);
return result;
}
word Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
word result;
__asm__ __volatile__
(
".intel_syntax;"
AS1( neg %1)
ASJ( jz, 1, f)
AS2( mov %0,[%3+8*%1])
AS2( sub %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASL(0)
AS2( mov %0,[%3+8*%1+8])
AS2( sbb %0,[%4+8*%1+8])
AS2( mov [%2+8*%1+8],%0)
AS2( lea %1,[%1+2])
ASJ( jrcxz, 1, f)
AS2( mov %0,[%3+8*%1])
AS2( sbb %0,[%4+8*%1])
AS2( mov [%2+8*%1],%0)
ASJ( jmp, 0, b)
ASL(1)
AS2( mov %0, 0)
AS2( adc %0, %0)
".att_syntax;"
: "=&r" (result)
: "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
: "memory", "cc"
);
return result;
}
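What the new GCC x64 Baseline_Add/Baseline_Sub asm computes, as a portable reference: C = A +/- B over N words, returning the final carry or borrow as a word (the closing "mov %0, 0; adc %0, %0" is just flag extraction). The asm is unrolled two words per iteration; this sketch does one word at a time, and its word typedef is a stand-in, not the library's.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t word;

word Baseline_Add_Ref(size_t N, word *C, const word *A, const word *B)
{
	word carry = 0;
	for (size_t i = 0; i < N; i++)
	{
		word s = A[i] + carry;
		word c = (s < carry);        // carry out of A[i] + carry
		C[i] = s + B[i];
		carry = c | (C[i] < s);      // carry out of the second addition
	}
	return carry;
}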
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
AddPrologue
@ -531,7 +623,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word
AddEpilogue
}
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
AddPrologue
@ -572,8 +664,8 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word
AddEpilogue
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
#if CRYPTOPP_INTEGER_SSE2
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
{
AddPrologue
@ -629,7 +721,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A,
AddEpilogue
}
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
{
AddPrologue
@ -687,7 +779,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A,
}
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#else
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
assert (N%2 == 0);
@ -703,7 +795,7 @@ int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word
return int(GetCarry(u));
}
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
assert (N%2 == 0);
@ -737,7 +829,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Mul_2 \
Mul_Begin(2) \
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
Mul_End(2)
Mul_End(1, 1)
#define Mul_4 \
Mul_Begin(4) \
@ -746,7 +838,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
Mul_End(4)
Mul_End(5, 3)
#define Mul_8 \
Mul_Begin(8) \
@ -763,7 +855,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
Mul_End(8)
Mul_End(13, 7)
#define Mul_16 \
Mul_Begin(16) \
@ -796,7 +888,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
Mul_End(16)
Mul_End(29, 15)
#define Squ_2 \
Squ_Begin(2) \
@ -900,6 +992,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
Bot_End(16)
#if 0
#define Mul_Begin(n) \
Declare2Words(p) \
Declare2Words(c) \
@ -938,9 +1031,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Bot_End(n) \
R[n-1] = e;
/*
// this is slower on MSVC 2005 Win32
#else
#define Mul_Begin(n) \
Declare2Words(p) \
word c; \
@ -950,25 +1041,20 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
AssignWord(d, HighWord(p))
#define Mul_Acc(i, j) \
MultiplyWords(p, A[i], B[j]) \
Acc2WordsBy1(p, c) \
c = LowWord(p); \
Acc2WordsBy1(d, HighWord(p))
MulAcc(c, d, A[i], B[j])
#define Mul_SaveAcc(k, i, j) \
R[k] = c; \
MultiplyWords(p, A[i], B[j]) \
Acc2WordsBy1(p, LowWord(d)) \
c = LowWord(p); \
c = LowWord(d); \
AssignWord(d, HighWord(d)) \
Acc2WordsBy1(d, HighWord(p))
MulAcc(c, d, A[i], B[j])
#define Mul_End(n) \
R[2*n-3] = c; \
MultiplyWords(p, A[n-1], B[n-1])\
Acc2WordsBy2(d, p) \
R[2*n-2] = LowWord(d); \
R[2*n-1] = HighWord(d);
#define Mul_End(k, i) \
R[k] = c; \
MultiplyWords(p, A[i], B[i]) \
Acc2WordsBy2(p, d) \
R[k+1] = LowWord(p); \
R[k+2] = HighWord(p);
#define Bot_SaveAcc(k, i, j) \
R[k] = c; \
@ -980,52 +1066,45 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
#define Bot_End(n) \
R[n-1] = c;
*/
#endif
#define Squ_Begin(n) \
Declare2Words(p) \
Declare2Words(c) \
word c; \
Declare2Words(d) \
Declare2Words(e) \
MultiplyWords(p, A[0], A[0]) \
R[0] = LowWord(p); \
AssignWord(e, HighWord(p)) \
MultiplyWords(p, A[0], A[1]) \
AssignWord(c, LowWord(p)) \
c = LowWord(p); \
AssignWord(d, HighWord(p)) \
Squ_NonDiag \
#define Squ_NonDiag \
Double2Words(c) \
Double2Words(d) \
Double3Words(c, d)
#define Squ_SaveAcc(k, i, j) \
Acc2WordsBy2(c, e) \
R[k] = LowWord(c); \
Add2WordsBy1(e, d, HighWord(c)) \
Acc3WordsBy2(c, d, e) \
R[k] = c; \
MultiplyWords(p, A[i], A[j]) \
AssignWord(c, LowWord(p)) \
c = LowWord(p); \
AssignWord(d, HighWord(p)) \
#define Squ_Acc(i, j) \
MultiplyWords(p, A[i], A[j]) \
Acc2WordsBy1(c, LowWord(p)) \
Acc2WordsBy1(d, HighWord(p))
MulAcc(c, d, A[i], A[j])
#define Squ_Diag(i) \
Squ_NonDiag \
MultiplyWords(p, A[i], A[i]) \
Acc2WordsBy1(c, LowWord(p)) \
Acc2WordsBy1(d, HighWord(p)) \
MulAcc(c, d, A[i], A[i])
#define Squ_End(n) \
Acc2WordsBy2(c, e) \
R[2*n-3] = LowWord(c); \
Acc2WordsBy1(d, HighWord(c)) \
Acc3WordsBy2(c, d, e) \
R[2*n-3] = c; \
MultiplyWords(p, A[n-1], A[n-1])\
Acc2WordsBy2(d, p) \
R[2*n-2] = LowWord(d); \
R[2*n-1] = HighWord(d);
Acc2WordsBy2(p, e) \
R[2*n-2] = LowWord(p); \
R[2*n-1] = HighWord(p);
void Baseline_Multiply2(word *R, const word *A, const word *B)
{
@ -1072,7 +1151,62 @@ void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
Bot_8
}
/*
#define Top_Begin(n) \
Declare2Words(p) \
word c; \
Declare2Words(d) \
MultiplyWords(p, A[0], B[n-2]);\
AssignWord(d, HighWord(p));
#define Top_Acc(i, j) \
MultiplyWords(p, A[i], B[j]);\
Acc2WordsBy1(d, HighWord(p));
#define Top_SaveAcc0(i, j) \
c = LowWord(d); \
AssignWord(d, HighWord(d)) \
MulAcc(c, d, A[i], B[j])
#define Top_SaveAcc1(i, j) \
c = L<c; \
Acc2WordsBy1(d, c); \
c = LowWord(d); \
AssignWord(d, HighWord(d)) \
MulAcc(c, d, A[i], B[j])
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
{
word T[4];
Baseline_Multiply2(T, A, B);
R[0] = T[2];
R[1] = T[3];
}
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
{
Top_Begin(4)
Top_Acc(1, 1) Top_Acc(2, 0) \
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
Mul_End(1, 3)
}
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
{
Top_Begin(8)
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
Mul_End(5, 7)
}
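The new Top_* macros and Baseline_MultiplyTopN functions produce only the upper n words of an n-by-n-word product; L is the word at position n-1 of the full product's low half, supplied by the caller so the carry into the top half can be reconstructed without computing the discarded words. A naive reference for the intended result (schoolbook multiply, then keep the top half):

#include <stdint.h>
#include <stddef.h>
#include <vector>

typedef uint32_t word;
typedef uint64_t dword;

void MultiplyTop_Ref(word *R, const word *A, const word *B, size_t n)
{
	std::vector<word> T(2*n, 0);
	for (size_t i = 0; i < n; i++)
	{
		dword carry = 0;
		for (size_t j = 0; j < n; j++)
		{
			dword t = (dword)A[i]*B[j] + T[i+j] + carry;
			T[i+j] = (word)t;
			carry = t >> 32;
		}
		T[i+n] = (word)carry;
	}
	for (size_t i = 0; i < n; i++)
		R[i] = T[n+i];      // upper half only; the real code never forms the low words it discards
}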
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
void Baseline_Multiply16(word *R, const word *A, const word *B)
{
Mul_16
@ -1087,16 +1221,40 @@ void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
{
Bot_16
}
*/
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
{
Top_Begin(16)
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
Mul_End(13, 15)
}
#endif
// ********************************************************
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_INTEGER_SSE2
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
#undef Mul_Begin
#undef Mul_Acc
#undef Top_Begin
#undef Top_Acc
#undef Squ_Acc
#undef Squ_NonDiag
#undef Squ_Diag
@ -1760,33 +1918,35 @@ void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
Top_End(8)
}
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#endif // #if CRYPTOPP_INTEGER_SSE2
// ********************************************************
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
typedef word (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
typedef void (* PMul)(word *C, const word *A, const word *B);
typedef void (* PSqu)(word *C, const word *A);
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_INTEGER_SSE2
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
static PMulTop s_pTop[3];
static size_t s_recursionLimit = 8;
#else
static const size_t s_recursionLimit = 8;
static const size_t s_recursionLimit = 16;
#endif
static PMul s_pMul[9], s_pBot[9];
static PSqu s_pSqu[9];
static PMulTop s_pTop[9];
static void SetFunctionPointers()
{
s_pMul[0] = &Baseline_Multiply2;
s_pBot[0] = &Baseline_MultiplyBottom2;
s_pSqu[0] = &Baseline_Square2;
s_pTop[0] = &Baseline_MultiplyTop2;
s_pTop[1] = &Baseline_MultiplyTop4;
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_INTEGER_SSE2
if (HasSSE2())
{
if (IsP4())
@ -1812,39 +1972,45 @@ static void SetFunctionPointers()
s_pSqu[4] = &SSE2_Square16;
s_pSqu[8] = &SSE2_Square32;
s_pTop[0] = &SSE2_MultiplyTop8;
s_pTop[1] = &SSE2_MultiplyTop16;
s_pTop[2] = &SSE2_MultiplyTop32;
s_pTop[2] = &SSE2_MultiplyTop8;
s_pTop[4] = &SSE2_MultiplyTop16;
s_pTop[8] = &SSE2_MultiplyTop32;
}
else
#endif
{
s_pMul[1] = &Baseline_Multiply4;
s_pMul[2] = &Baseline_Multiply8;
// s_pMul[4] = &Baseline_Multiply16;
s_pBot[1] = &Baseline_MultiplyBottom4;
s_pBot[2] = &Baseline_MultiplyBottom8;
// s_pBot[4] = &Baseline_MultiplyBottom16;
s_pSqu[1] = &Baseline_Square4;
s_pSqu[2] = &Baseline_Square8;
// s_pSqu[4] = &Baseline_Square16;
s_pTop[2] = &Baseline_MultiplyTop8;
#if !CRYPTOPP_INTEGER_SSE2
s_pMul[4] = &Baseline_Multiply16;
s_pBot[4] = &Baseline_MultiplyBottom16;
s_pSqu[4] = &Baseline_Square16;
s_pTop[4] = &Baseline_MultiplyTop16;
#endif
}
}
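With s_pTop widened to nine entries, all four dispatch tables now share one convention: index by operand size in words divided by 4, so the 2-, 4-, 8-, 16- and 32-word routines sit at slots 0, 1, 2, 4 and 8, and MultiplyTop below can simply call s_pTop[N/4]. A trivial check of that mapping:

#include <cassert>
#include <cstddef>

inline std::size_t TableIndex(std::size_t words) { return words / 4; }

int main()
{
	assert(TableIndex(2) == 0 && TableIndex(4) == 1 && TableIndex(8) == 2);
	assert(TableIndex(16) == 4 && TableIndex(32) == 8);
	return 0;
}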
inline int Add(word *C, const word *A, const word *B, size_t N)
inline word Add(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_INTEGER_SSE2
return s_pAdd(N, C, A, B);
#else
return Baseline_Add(N, C, A, B);
#endif
}
inline int Subtract(word *C, const word *A, const word *B, size_t N)
inline word Subtract(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_INTEGER_SSE2
return s_pSub(N, C, A, B);
#else
return Baseline_Sub(N, C, A, B);
@ -1969,16 +2135,8 @@ void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B,
{
assert(N>=2 && N%2==0);
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2() && ((N>=8) & (N<=32)))
s_pTop[N/16](R, A, B, L[N-1]);
else
#endif
if (N<=4)
{
s_pMul[N/4](T, A, B);
memcpy(R, T+N, N*WORD_SIZE);
}
if (N <= s_recursionLimit)
s_pTop[N/4](R, A, B, L[N-1]);
else
{
const size_t N2 = N/2;
@ -3076,13 +3234,6 @@ public:
memcpy(m_counterAndSeed + 4, seed, seedSize);
}
byte GenerateByte()
{
byte b;
GenerateBlock(&b, 1);
return b;
}
void GenerateBlock(byte *output, size_t size)
{
PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);

@ -26,31 +26,31 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS1( push ebx)
AS_PUSH( bx)
#else
AS2( mov ecx, count)
AS2( mov esi, state)
AS2( mov edi, z)
AS2( mov edx, y)
AS2( mov WORD_REG(cx), count)
AS2( mov WORD_REG(si), state)
AS2( mov WORD_REG(di), z)
AS2( mov WORD_REG(dx), y)
#endif
AS2( shl ecx, 5)
AS2( shl WORD_REG(cx), 5)
ASJ( jz, 5, f)
AS2( mov ebx, [esi+4*17])
AS2( add ecx, ebx)
AS2( mov ebx, [WORD_REG(si)+4*17])
AS2( add WORD_REG(cx), WORD_REG(bx))
AS1( push ebp)
AS1( push ecx)
AS_PUSH( bp)
AS_PUSH( cx)
AS2( movdqa xmm0, [esi+0*16])
AS2( movdqa xmm1, [esi+1*16])
AS2( movdqa xmm2, [esi+2*16])
AS2( movdqa xmm3, [esi+3*16])
AS2( mov eax, [esi+4*16])
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( mov eax, [WORD_REG(si)+4*16])
ASL(4)
// gamma and pi
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1)
AS2( test WORD_REG(bx), 1)
ASJ( jnz, 6, f)
#endif
AS2( movdqa xmm6, xmm2)
@ -81,7 +81,7 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
#define pi(i) \
AS2( movd ecx, xmm7)\
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
#define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\
@ -110,65 +110,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
// keystream
AS2( test edi, edi)
AS2( test WORD_REG(di), WORD_REG(di))
ASJ( jz, 0, f)
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2)
AS2( test edx, 0xf)
AS2( test WORD_REG(dx), 0xf)
ASJ( jnz, 2, f)
AS2( test edx, edx)
AS2( test WORD_REG(dx), WORD_REG(dx))
ASJ( jz, 1, f)
AS2( pxor xmm4, [edx])
AS2( pxor xmm6, [edx+16])
AS2( add edx, 32)
AS2( pxor xmm4, [WORD_REG(dx)])
AS2( pxor xmm6, [WORD_REG(dx)+16])
AS2( add WORD_REG(dx), 32)
ASJ( jmp, 1, f)
ASL(2)
AS2( movdqu xmm0, [edx])
AS2( movdqu xmm2, [edx+16])
AS2( movdqu xmm0, [WORD_REG(dx)])
AS2( movdqu xmm2, [WORD_REG(dx)+16])
AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2)
AS2( add edx, 32)
AS2( add WORD_REG(dx), 32)
ASL(1)
AS2( test edi, 0xf)
AS2( test WORD_REG(di), 0xf)
ASJ( jnz, 3, f)
AS2( movdqa [edi], xmm4)
AS2( movdqa [edi+16], xmm6)
AS2( add edi, 32)
AS2( movdqa [WORD_REG(di)], xmm4)
AS2( movdqa [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
ASJ( jmp, 0, f)
ASL(3)
AS2( movdqu [edi], xmm4)
AS2( movdqu [edi+16], xmm6)
AS2( add edi, 32)
AS2( movdqu [WORD_REG(di)], xmm4)
AS2( movdqu [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
ASL(0)
// buffer update
AS2( lea ecx, [ebx + 32])
AS2( and ecx, 31*32)
AS2( lea ebp, [ebx + (32-24)*32])
AS2( and ebp, 31*32)
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
AS2( and WORD_REG(bp), 31*32)
AS2( movdqa xmm0, [esi+20*4+ecx+0*8])
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [esi+20*4+ecx+0*8], xmm3)
AS2( pxor xmm0, [esi+20*4+ebp+2*8])
AS2( movdqa [esi+20*4+ebp+2*8], xmm0)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
AS2( movdqa xmm4, [esi+20*4+ecx+2*8])
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
AS2( pxor xmm1, xmm4)
AS2( movdqa [esi+20*4+ecx+2*8], xmm1)
AS2( pxor xmm4, [esi+20*4+ebp+0*8])
AS2( movdqa [esi+20*4+ebp+0*8], xmm4)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
// theta
AS2( movdqa xmm3, [esi+3*16])
AS2( movdqa xmm2, [esi+2*16])
AS2( movdqa xmm1, [esi+1*16])
AS2( movdqa xmm0, [esi+0*16])
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test ebx, 1)
AS2( test WORD_REG(bx), 1)
ASJ( jnz, 8, f)
#endif
AS2( movd xmm6, eax)
@ -214,21 +214,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm4)
// sigma
AS2( lea ecx, [ebx + (32-4)*32])
AS2( and ecx, 31*32)
AS2( lea ebp, [ebx + 16*32])
AS2( and ebp, 31*32)
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
AS2( and WORD_REG(bp), 31*32)
AS2( movdqa xmm4, [esi+20*4+ecx+0*16])
AS2( movdqa xmm5, [esi+20*4+ebp+0*16])
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [esi+20*4+ecx+1*16])
AS2( movdqa xmm5, [esi+20*4+ebp+1*16])
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
@ -236,23 +236,22 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm6)
// loop
AS2( add ebx, 32)
AS2( cmp ebx, [esp])
AS2( add WORD_REG(bx), 32)
AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
ASJ( jne, 4, b)
// save state
AS2( mov ebp, [esp+4])
AS2( add esp, 8)
AS2( mov [esi+4*17], ebx)
AS2( mov [esi+4*16], eax)
AS2( movdqa [esi+3*16], xmm3)
AS2( movdqa [esi+2*16], xmm2)
AS2( movdqa [esi+1*16], xmm1)
AS2( movdqa [esi+0*16], xmm0)
AS2( add WORD_REG(sp), WORD_SZ)
AS_POP( bp)
AS2( mov [WORD_REG(si)+4*16], eax)
AS2( movdqa [WORD_REG(si)+3*16], xmm3)
AS2( movdqa [WORD_REG(si)+2*16], xmm2)
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
AS2( movdqa [WORD_REG(si)+0*16], xmm0)
ASL(5)
#ifdef __GNUC__
AS1( pop ebx)
AS_POP( bx)
".att_syntax prefix;"
:
: "c" (count), "S" (state), "D" (z), "d" (y)

@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
if (HasMMX())
{
const word32 *k = m_key;
const word32 *kLoopEnd = k + m_rounds*4;
#if CRYPTOPP_BOOL_X64
#define K_REG r8
#define K_END_REG r9
#define SAVE_K
#define RESTORE_K
#define RESTORE_K_END
#define SAVE_0(x) AS2(mov r10d, x)
#define SAVE_1(x) AS2(mov r11d, x)
#define SAVE_2(x) AS2(mov r12d, x)
#define RESTORE_0(x) AS2(mov x, r10d)
#define RESTORE_1(x) AS2(mov x, r11d)
#define RESTORE_2(x) AS2(mov x, r12d)
#else
#define K_REG esi
#define K_END_REG edi
#define SAVE_K AS2(movd mm4, esi)
#define RESTORE_K AS2(movd esi, mm4)
#define RESTORE_K_END AS2(movd edi, mm5)
#define SAVE_0(x) AS2(movd mm0, x)
#define SAVE_1(x) AS2(movd mm1, x)
#define SAVE_2(x) AS2(movd mm2, x)
#define RESTORE_0(x) AS2(movd x, mm0)
#define RESTORE_1(x) AS2(movd x, mm1)
#define RESTORE_2(x) AS2(movd x, mm2)
#endif
#ifdef __GNUC__
word32 t0, t1, t2, t3;
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS1( push ebx)
AS1( push ebp)
AS2( mov ebp, eax)
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov WORD_REG(bp), WORD_REG(ax))
#if CRYPTOPP_BOOL_X64
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
AS1( pushq K_REG)
AS1( pushq K_END_REG)
AS1( pushq r10)
AS1( pushq r11)
AS1( pushq r12)
AS2( mov K_REG, rsi)
AS2( mov K_END_REG, rcx)
#else
AS2( movd mm5, ecx)
#endif
#else
#if _MSC_VER < 1300
const word32 *t = Te;
AS2( mov eax, t)
#endif
AS2( mov edx, g_cacheLineSize)
AS2( mov edi, inBlock)
AS2( mov esi, k)
AS2( mov WORD_REG(di), inBlock)
AS2( mov K_REG, k)
AS2( movd mm5, kLoopEnd)
AS1( push ebp)
#if _MSC_VER < 1300
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov ebp, eax)
#else
AS_PUSH( bp)
AS2( lea ebp, Te)
#endif
#endif
AS2( mov eax, [esi+0*4]) // s0
AS2( xor eax, [edi+0*4])
AS2( movd mm0, eax)
AS2( mov ebx, [esi+1*4])
AS2( xor ebx, [edi+1*4])
AS2( movd mm1, ebx)
AS2( mov eax, [K_REG+0*4]) // s0
AS2( xor eax, [WORD_REG(di)+0*4])
SAVE_0(eax)
AS2( mov ebx, [K_REG+1*4])
AS2( xor ebx, [WORD_REG(di)+1*4])
SAVE_1(ebx)
AS2( and ebx, eax)
AS2( mov eax, [esi+2*4])
AS2( xor eax, [edi+2*4])
AS2( movd mm2, eax)
AS2( mov eax, [K_REG+2*4])
AS2( xor eax, [WORD_REG(di)+2*4])
SAVE_2(eax)
AS2( and ebx, eax)
AS2( mov ecx, [esi+3*4])
AS2( xor ecx, [edi+3*4])
AS2( mov ecx, [K_REG+3*4])
AS2( xor ecx, [WORD_REG(di)+3*4])
AS2( and ebx, ecx)
// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
AS2( and ebx, 0)
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
ASL(2)
AS2( and ebx, [ebp+edi])
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [ebp+edi])
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [ebp+edi])
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [ebp+edi])
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( add edi, edx)
AS2( cmp edi, 1024)
ASJ( jl, 2, b)
AS2( and ebx, [ebp+1020])
AS2( and ebx, [WORD_REG(bp)+1020])
#if CRYPTOPP_BOOL_X64
AS2( xor r10d, ebx)
AS2( xor r11d, ebx)
AS2( xor r12d, ebx)
#else
AS2( movd mm6, ebx)
AS2( pxor mm2, mm6)
AS2( pxor mm1, mm6)
AS2( pxor mm0, mm6)
#endif
AS2( xor ecx, ebx)
AS2( mov edi, [esi+4*4]) // t0
AS2( mov eax, [esi+5*4])
AS2( mov ebx, [esi+6*4])
AS2( mov edx, [esi+7*4])
AS2( add esi, 8*4)
AS2( movd mm4, esi)
AS2( mov edi, [K_REG+4*4]) // t0
AS2( mov eax, [K_REG+5*4])
AS2( mov ebx, [K_REG+6*4])
AS2( mov edx, [K_REG+7*4])
AS2( add K_REG, 8*4)
SAVE_K
#define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\
AS2(d, [ebp+0*1024+4*esi])\
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(c, [ebp+1*1024+4*esi])\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\
AS2(b, [ebp+2*1024+4*esi])\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(a, [ebp+3*1024+4*esi])
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
#define s0 xor edi
#define s1 xor eax
@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define t3 xor edx
QUARTER_ROUND(c, t0, t1, t2, t3)
AS2( movd ecx, mm2)
RESTORE_2(ecx)
QUARTER_ROUND(c, t3, t0, t1, t2)
AS2( movd ecx, mm1)
RESTORE_1(ecx)
QUARTER_ROUND(c, t2, t3, t0, t1)
AS2( movd ecx, mm0)
RESTORE_0(ecx)
QUARTER_ROUND(c, t1, t2, t3, t0)
AS2( movd mm2, ebx)
AS2( movd mm1, eax)
AS2( movd mm0, edi)
SAVE_2(ebx)
SAVE_1(eax)
SAVE_0(edi)
#undef QUARTER_ROUND
AS2( movd esi, mm4)
RESTORE_K
ASL(0)
AS2( mov edi, [esi+0*4])
AS2( mov eax, [esi+1*4])
AS2( mov ebx, [esi+2*4])
AS2( mov ecx, [esi+3*4])
AS2( mov edi, [K_REG+0*4])
AS2( mov eax, [K_REG+1*4])
AS2( mov ebx, [K_REG+2*4])
AS2( mov ecx, [K_REG+3*4])
#define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\
AS2(a, [ebp+3*1024+4*esi])\
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(b, [ebp+2*1024+4*esi])\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\
AS2(c, [ebp+1*1024+4*esi])\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(d, [ebp+0*1024+4*esi])
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
QUARTER_ROUND(d, s0, s1, s2, s3)
AS2( movd edx, mm2)
RESTORE_2(edx)
QUARTER_ROUND(d, s3, s0, s1, s2)
AS2( movd edx, mm1)
RESTORE_1(edx)
QUARTER_ROUND(d, s2, s3, s0, s1)
AS2( movd edx, mm0)
RESTORE_0(edx)
QUARTER_ROUND(d, s1, s2, s3, s0)
AS2( movd esi, mm4)
AS2( movd mm2, ebx)
AS2( movd mm1, eax)
AS2( movd mm0, edi)
RESTORE_K
SAVE_2(ebx)
SAVE_1(eax)
SAVE_0(edi)
AS2( mov edi, [esi+4*4])
AS2( mov eax, [esi+5*4])
AS2( mov ebx, [esi+6*4])
AS2( mov edx, [esi+7*4])
AS2( mov edi, [K_REG+4*4])
AS2( mov eax, [K_REG+5*4])
AS2( mov ebx, [K_REG+6*4])
AS2( mov edx, [K_REG+7*4])
QUARTER_ROUND(c, t0, t1, t2, t3)
AS2( movd ecx, mm2)
RESTORE_2(ecx)
QUARTER_ROUND(c, t3, t0, t1, t2)
AS2( movd ecx, mm1)
RESTORE_1(ecx)
QUARTER_ROUND(c, t2, t3, t0, t1)
AS2( movd ecx, mm0)
RESTORE_0(ecx)
QUARTER_ROUND(c, t1, t2, t3, t0)
AS2( movd mm2, ebx)
AS2( movd mm1, eax)
AS2( movd mm0, edi)
SAVE_2(ebx)
SAVE_1(eax)
SAVE_0(edi)
AS2( movd esi, mm4)
AS2( movd edi, mm5)
AS2( add esi, 8*4)
AS2( movd mm4, esi)
AS2( cmp edi, esi)
RESTORE_K
RESTORE_K_END
AS2( add K_REG, 8*4)
SAVE_K
AS2( cmp K_END_REG, K_REG)
ASJ( jne, 0, b)
#undef QUARTER_ROUND
@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#undef t2
#undef t3
AS2( mov eax, [edi+0*4])
AS2( mov ecx, [edi+1*4])
AS2( mov esi, [edi+2*4])
AS2( mov edi, [edi+3*4])
AS2( mov eax, [K_END_REG+0*4])
AS2( mov ecx, [K_END_REG+1*4])
AS2( mov esi, [K_END_REG+2*4])
AS2( mov edi, [K_END_REG+3*4])
#define QUARTER_ROUND(a, b, c, d) \
AS2( movzx ebx, dl)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 3*8)\
AS2( xor a, ebx)\
AS2( movzx ebx, dh)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 2*8)\
AS2( xor b, ebx)\
AS2( shr edx, 16)\
AS2( movzx ebx, dl)\
AS2( shr edx, 8)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( shl ebx, 1*8)\
AS2( xor c, ebx)\
AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
AS2( xor d, ebx)
QUARTER_ROUND(eax, ecx, esi, edi)
AS2( movd edx, mm2)
RESTORE_2(edx)
QUARTER_ROUND(edi, eax, ecx, esi)
AS2( movd edx, mm1)
RESTORE_1(edx)
QUARTER_ROUND(esi, edi, eax, ecx)
AS2( movd edx, mm0)
RESTORE_0(edx)
QUARTER_ROUND(ecx, esi, edi, eax)
#undef QUARTER_ROUND
AS1( pop ebp)
AS1( emms)
#if CRYPTOPP_BOOL_X64
AS1(popq r12)
AS1(popq r11)
AS1(popq r10)
AS1(popq K_END_REG)
AS1(popq K_REG)
#else
AS1(emms)
#endif
AS_POP( bp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
#endif
#ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;"
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
((word32 *)outBlock)[2] = t2;
((word32 *)outBlock)[3] = t3;
#else
AS2( mov ebx, xorBlock)
AS2( test ebx, ebx)
AS2( mov WORD_REG(bx), xorBlock)
AS2( test WORD_REG(bx), WORD_REG(bx))
ASJ( jz, 1, f)
AS2( xor eax, [ebx+0*4])
AS2( xor ecx, [ebx+1*4])
AS2( xor esi, [ebx+2*4])
AS2( xor edi, [ebx+3*4])
AS2( xor eax, [WORD_REG(bx)+0*4])
AS2( xor ecx, [WORD_REG(bx)+1*4])
AS2( xor esi, [WORD_REG(bx)+2*4])
AS2( xor edi, [WORD_REG(bx)+3*4])
ASL(1)
AS2( mov ebx, outBlock)
AS2( mov [ebx+0*4], eax)
AS2( mov [ebx+1*4], ecx)
AS2( mov [ebx+2*4], esi)
AS2( mov [ebx+3*4], edi)
AS2( mov WORD_REG(bx), outBlock)
AS2( mov [WORD_REG(bx)+0*4], eax)
AS2( mov [WORD_REG(bx)+1*4], ecx)
AS2( mov [WORD_REG(bx)+2*4], esi)
AS2( mov [WORD_REG(bx)+3*4], edi)
#endif
}
else

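The rijndael.cpp loop commented "read Te0 into L1 cache" above touches the whole 1 KB table one cache line at a time before any key-dependent lookup, so lookup latency no longer depends on the secret index; the AND-with-zero keeps a data dependency on the freshly loaded state words in place of an lfence. A simplified portable sketch of the same countermeasure (the caller should fold the returned zero into its state, as the asm does, so the loads are not optimized away):

#include <stddef.h>

static unsigned int PrefetchTable(const unsigned char *table, size_t tableSize,
                                  size_t cacheLineSize, unsigned int dependency)
{
	unsigned int sink = dependency & 0;       // always 0, but depends on the prior loads
	for (size_t i = 0; i < tableSize; i += cacheLineSize)
		sink &= table[i];                     // one read per cache line; stays 0
	return sink;                              // XOR this 0 into the state afterwards
}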
@ -130,10 +130,13 @@ public:
#endif
assert(IsAlignedOn(p, 16));
return (T*)p;
return (pointer)p;
}
return new T[n];
pointer p;
while (!(p = (pointer)malloc(sizeof(T)*n)))
CallNewHandler();
return p;
}
void deallocate(void *p, size_type n)
@ -153,7 +156,7 @@ public:
return;
}
delete [] (T *)p;
free(p);
}
pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
@ -164,13 +167,19 @@ public:
// VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a
// template class member called rebind".
template <class U> struct rebind { typedef AllocatorWithCleanup<U, T_Align16> other; };
#if _MSC_VER >= 1500
AllocatorWithCleanup() {}
template <class U, bool A> AllocatorWithCleanup(const AllocatorWithCleanup<U, A> &) {}
#endif
};
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<byte>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word16>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word32>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word64>;
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86>; // for Integer
#if CRYPTOPP_BOOL_X86
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, true>; // for Integer
#endif
template <class T>
class NullAllocator : public AllocatorBase<T>
@ -260,7 +269,7 @@ public:
size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);}
private:
T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(unsigned int)m_array)%16) : m_array;}
T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(size_t)m_array)%16) : m_array;}
CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S];
A m_fallbackAllocator;
@ -466,10 +475,10 @@ public:
explicit SecBlockWithHint(size_t size) : SecBlock<T, A>(size) {}
};
template<class T, class U>
inline bool operator==(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (true);}
template<class T, class U>
inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (false);}
template<class T, bool A, class U, bool B>
inline bool operator==(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (true);}
template<class T, bool A, class U, bool B>
inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (false);}
NAMESPACE_END
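The allocate/deallocate change above moves unaligned SecBlock storage from new[]/delete[] to malloc/free, with the malloc retried through the installed new-handler so out-of-memory behaviour still matches operator new. The library's own CallNewHandler is called there; a stand-alone sketch of the pattern (this CallNewHandler body is an assumption for illustration, not the library's):

#include <cstdlib>
#include <new>

static void CallNewHandler()
{
	std::new_handler h = std::set_new_handler(0);   // read the current handler
	std::set_new_handler(h);                        // and put it back
	if (h)
		h();                                        // give it a chance to free memory
	else
		throw std::bad_alloc();
}

void *MallocWithRetry(std::size_t n)
{
	void *p;
	while (!(p = std::malloc(n)))
		CallNewHandler();
	return p;                                       // release with std::free, not delete[]
}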

sha.cpp

@ -308,9 +308,9 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN1
W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
{
#ifdef __GNUC__
__asm__ __volatile__
@ -319,6 +319,9 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
AS1( push ebx)
AS2( mov ebx, eax)
#else
AS1( push ebx)
AS1( push esi)
AS1( push edi)
AS2( lea ebx, SHA512_K)
#endif
@ -486,22 +489,30 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
AS1( pop esp)
AS1( emms)
#ifdef __GNUC__
#if defined(__GNUC__)
AS1( pop ebx)
".att_syntax prefix;"
:
: "a" (SHA512_K), "c" (state), "d" (data)
: "%esi", "%edi", "memory", "cc"
);
#else
AS1( pop edi)
AS1( pop esi)
AS1( pop ebx)
AS1( ret)
#endif
}
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
void SHA512::Transform(word64 *state, const word64 *data)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
if (HasSSE2())
return SHA512_SSE2_Transform(state, data);
{
SHA512_SSE2_Transform(state, data);
return;
}
#endif
#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
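SHA512_SSE2_Transform is now declared CRYPTOPP_NAKED, so MSVC emits no prologue or epilogue for it; that is why the MSVC path above gains explicit pushes of ebx/esi/edi, a matching pop sequence, and an explicit ret. A tiny unrelated illustration of the naked/fastcall contract (MSVC, x86 only; hypothetical function, not from sha.cpp):

#if defined(_MSC_VER) && defined(_M_IX86)
__declspec(naked) int __fastcall AddOne(int x)
{
	__asm
	{
		push ebx            // naked: callee-saved registers are our responsibility
		mov  eax, ecx       // __fastcall: the first argument arrives in ecx
		mov  ebx, 1
		add  eax, ebx       // return value goes back in eax
		pop  ebx
		ret                 // naked: we must return explicitly
	}
}
#endif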

@ -189,21 +189,21 @@ template <class T> counted_ptr<T> & counted_ptr<T>::operator=(const counted_ptr<
template <class T> class vector_member_ptrs
{
public:
vector_member_ptrs(unsigned int size=0)
vector_member_ptrs(size_t size=0)
: m_size(size), m_ptr(new member_ptr<T>[size]) {}
~vector_member_ptrs()
{delete [] this->m_ptr;}
member_ptr<T>& operator[](unsigned int index)
member_ptr<T>& operator[](size_t index)
{assert(index<this->m_size); return this->m_ptr[index];}
const member_ptr<T>& operator[](unsigned int index) const
const member_ptr<T>& operator[](size_t index) const
{assert(index<this->m_size); return this->m_ptr[index];}
unsigned int size() const {return this->m_size;}
void resize(unsigned int newSize)
size_t size() const {return this->m_size;}
void resize(size_t newSize)
{
member_ptr<T> *newPtr = new member_ptr<T>[newSize];
for (unsigned int i=0; i<this->m_size && i<newSize; i++)
for (size_t i=0; i<this->m_size && i<newSize; i++)
newPtr[i].reset(this->m_ptr[i].release());
delete [] this->m_ptr;
this->m_size = newSize;
@ -214,7 +214,7 @@ private:
vector_member_ptrs(const vector_member_ptrs<T> &c); // copy not allowed
void operator=(const vector_member_ptrs<T> &x); // assignment not allowed
unsigned int m_size;
size_t m_size;
member_ptr<T> *m_ptr;
};

@ -68,6 +68,10 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
m_state[1] = b;
m_state[2] = e;
m_state[3] = d;
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
}
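XMUX(c, x, y) is a branch-free conditional mix used by the Sosemanuk LFSR update: when the low bit of c is 0 the mask 0 - (c & 1) is all zeros and the result is x; when it is 1 the mask is all ones and the result is x ^ y. A self-checking restatement:

#include <stdint.h>
#include <cassert>

static uint32_t XMUX_ref(uint32_t c, uint32_t x, uint32_t y)
{
	uint32_t mask = 0 - (c & 1);               // 0x00000000 or 0xFFFFFFFF
	return x ^ (y & mask);
}

int main()
{
	assert(XMUX_ref(2, 0x1234, 0xFF00) == 0x1234);             // even c: x unchanged
	assert(XMUX_ref(3, 0x1234, 0xFF00) == (0x1234 ^ 0xFF00));  // odd c: x ^ y
	return 0;
}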
static word32 s_mulTables[512] = {
@ -282,10 +286,8 @@ unsigned int SosemanukPolicy::GetAlignment() const
else
#endif
return 1;
#endif
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int SosemanukPolicy::GetOptimalBlockSize() const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@ -316,54 +318,54 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS1( push ebx)
AS_PUSH( bx)
#else
word32 *state = m_state;
AS2( mov eax, state)
AS2( mov edi, output)
AS2( mov edx, input)
AS2( mov ecx, iterationCount)
AS2( mov WORD_REG(ax), state)
AS2( mov WORD_REG(di), output)
AS2( mov WORD_REG(dx), input)
AS2( mov WORD_REG(cx), iterationCount)
#endif
#define SSE2_output DWORD PTR [esp+1*4]
#define SSE2_input DWORD PTR [esp+2*4]
#define SSE2_wordsLeft DWORD PTR [esp+3*4]
#define SSE2_ediEnd DWORD PTR [esp+4*4]
#define SSE2_pMulTables DWORD PTR [esp+5*4]
#define SSE2_state DWORD PTR [esp+6*4]
#define SSE2_wordsLeft2 DWORD PTR [esp+7*4]
#define SSE2_stateCopy esp + 8*4
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
#define SSE2_uvStart SSE2_stateCopy + 12*4
AS1( push ebp)
AS2( mov ebx, esp)
AS2( and esp, 0xfffffff0)
AS2( sub esp, 80*4*2+12*4+8*4) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [esp], ebx)
AS2( mov SSE2_output, edi)
AS2( mov SSE2_input, edx)
AS2( mov SSE2_state, eax)
AS_PUSH( bp)
AS2( mov WORD_REG(bx), WORD_REG(sp))
AS2( and WORD_REG(sp), -16)
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [WORD_REG(sp)], WORD_REG(bx))
AS2( mov SSE2_output, WORD_REG(di))
AS2( mov SSE2_input, WORD_REG(dx))
AS2( mov SSE2_state, WORD_REG(ax))
#ifndef _MSC_VER
AS2( mov SSE2_pMulTables, esi)
AS2( mov SSE2_pMulTables, WORD_REG(si))
#endif
AS2( lea ecx, [4*ecx+ecx])
AS2( lea esi, [4*ecx])
AS2( mov SSE2_wordsLeft, esi)
AS2( movdqa xmm0, [eax+0*16]) // copy state to stack to save a register
AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
AS2( movdqa xmm0, [eax+1*16])
AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [eax+2*16])
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
AS2( psrlq xmm0, 32)
AS2( movd ebx, xmm0) // s(9)
AS2( mov ecx, [eax+10*4])
AS2( mov edx, [eax+11*4])
AS2( mov ecx, [WORD_REG(ax)+10*4])
AS2( mov edx, [WORD_REG(ax)+11*4])
AS2( pcmpeqb xmm7, xmm7) // all ones
#define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4
#define u(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4
#define v(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
#define r10 ecx
#define r11 edx
@ -371,42 +373,42 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#define r21 ecx
#define SSE2_STEP(i, j) \
AS2( mov eax, [s(i+3)])\
AS2( mov ebp, 1)\
AS2( and ebp, r1##j)\
AS1( neg ebp)\
AS2( and ebp, [s(i+8)])\
AS2( xor ebp, [s(i+1)])\
AS2( add r2##j, ebp)\
AS2( movzx ebp, al)\
AS2( shr eax, 8)\
AS2( xor eax, [esi+1024+ebp*4])\
AS2( lea ebp, [ebx + r2##j])\
AS2( xor ebx, eax)\
AS2( imul r1##j, 0x54655307)\
AS2( mov eax, [s(i+0)])\
AS2( mov [v(i)], eax)\
AS2( rol eax, 8)\
AS2( xor ebx, eax)\
AS2( movzx eax, al)\
AS2( rol r1##j, 7)\
AS2( xor ebx, [esi+eax*4])\
AS2( lea ebp, [ebx + r2##j])\
AS2( xor ebp, r1##j)\
AS2( mov [u(i)], ebp)\
AS2( mov ebp, 1)\
AS2( and ebp, r2##j)\
AS1( neg ebp)\
AS2( and ebp, ebx)\
AS2( xor ebx, eax)\
AS2( movzx eax, al)\
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
AS2( mov eax, [s(i+3)])\
AS2( xor ebp, [s(i+2)])\
AS2( add r1##j, ebp)\
AS2( movzx ebp, al)\
AS2( shr eax, 8)\
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
AS2( xor ebx, eax)\
AS2( imul r2##j, 0x54655307)\
AS2( rol r2##j, 7)\
AS2( mov [s(i+0)], ebx)\
ASL(2) // outer loop, each iteration of this processes 80 words
AS2( lea edi, [SSE2_uvStart]) // start of v and u
AS2( mov eax, 80)
AS2( cmp esi, 80)
AS2( cmovg esi, eax)
AS2( mov SSE2_wordsLeft2, esi)
AS2( lea esi, [edi+esi]) // use to first inner loop
AS2( mov SSE2_ediEnd, esi)
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov WORD_REG(ax), 80)
AS2( cmp WORD_REG(si), 80)
AS2( cmovg WORD_REG(si), WORD_REG(ax))
AS2( mov SSE2_wordsLeft2, WORD_REG(si))
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
AS2( mov SSE2_diEnd, WORD_REG(si))
#ifdef _MSC_VER
AS2( lea esi, s_mulTables)
AS2( lea WORD_REG(si), s_mulTables)
#else
AS2( mov esi, SSE2_pMulTables)
AS2( mov WORD_REG(si), SSE2_pMulTables)
#endif
ASL(0) // first inner loop, 20 words each, 4 iterations
@ -431,20 +433,20 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
SSE2_STEP(18, 0)
SSE2_STEP(19, 1)
// loop
AS2( add edi, 5*4)
AS2( cmp edi, SSE2_ediEnd)
AS2( add WORD_REG(di), 5*4)
AS2( cmp WORD_REG(di), SSE2_diEnd)
ASJ( jne, 0, b)
AS2( mov eax, SSE2_input)
AS2( mov ebp, SSE2_output)
AS2( lea edi, [SSE2_uvStart]) // start of v and u
AS2( mov esi, SSE2_wordsLeft2)
AS2( mov WORD_REG(ax), SSE2_input)
AS2( mov WORD_REG(bp), SSE2_output)
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
ASL(1) // second inner loop, 16 words each, 5 iterations
AS2( movdqa xmm0, [edi+0*20*4])
AS2( movdqa xmm1, [edi+1*20*4])
AS2( movdqa xmm2, [edi+2*20*4])
AS2( movdqa xmm3, [edi+3*20*4])
AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
// S2
AS2( movdqa xmm4, xmm0)
AS2( pand xmm0, xmm2)
@ -463,13 +465,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( pxor xmm1, xmm4)
AS2( pxor xmm4, xmm7)
// xor with v
AS2( pxor xmm2, [edi+80*4])
AS2( pxor xmm3, [edi+80*5])
AS2( pxor xmm1, [edi+80*6])
AS2( pxor xmm4, [edi+80*7])
AS2( pxor xmm2, [WORD_REG(di)+80*4])
AS2( pxor xmm3, [WORD_REG(di)+80*5])
AS2( pxor xmm1, [WORD_REG(di)+80*6])
AS2( pxor xmm4, [WORD_REG(di)+80*7])
// exit loop early if less than 16 words left to output
// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
AS2( cmp esi, 16)
AS2( cmp WORD_REG(si), 16)
ASJ( jl, 4, f)
// unpack
AS2( movdqa xmm6, xmm2)
@ -485,75 +487,75 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( punpcklqdq xmm6, xmm5)
AS2( punpckhqdq xmm3, xmm5)
// output keystream
AS2( test eax, eax)
AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 3, f)
AS2( test eax, 0xf)
ASJ( jnz, 7, f)
AS2( pxor xmm2, [eax+0*16])
AS2( pxor xmm0, [eax+1*16])
AS2( pxor xmm6, [eax+2*16])
AS2( pxor xmm3, [eax+3*16])
AS2( add eax, 4*16)
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
AS2( add WORD_REG(ax), 4*16)
ASJ( jmp, 3, f)
ASL(7)
AS2( movdqu xmm1, [eax+0*16])
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
AS2( pxor xmm2, xmm1)
AS2( movdqu xmm1, [eax+1*16])
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
AS2( pxor xmm0, xmm1)
AS2( movdqu xmm1, [eax+2*16])
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
AS2( pxor xmm6, xmm1)
AS2( movdqu xmm1, [eax+3*16])
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
AS2( pxor xmm3, xmm1)
AS2( add eax, 4*16)
AS2( add WORD_REG(ax), 4*16)
ASL(3)
AS2( test ebp, 0xf)
ASJ( jnz, 8, f)
AS2( movdqa [ebp+0*16], xmm2)
AS2( movdqa [ebp+1*16], xmm0)
AS2( movdqa [ebp+2*16], xmm6)
AS2( movdqa [ebp+3*16], xmm3)
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
ASJ( jmp, 9, f)
ASL(8)
AS2( movdqu [ebp+0*16], xmm2)
AS2( movdqu [ebp+1*16], xmm0)
AS2( movdqu [ebp+2*16], xmm6)
AS2( movdqu [ebp+3*16], xmm3)
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
ASL(9)
// loop
AS2( add edi, 4*4)
AS2( add ebp, 4*16)
AS2( sub esi, 16)
AS2( add WORD_REG(di), 4*4)
AS2( add WORD_REG(bp), 4*16)
AS2( sub WORD_REG(si), 16)
ASJ( jnz, 1, b)
// outer loop
AS2( mov esi, SSE2_wordsLeft)
AS2( sub esi, 80)
AS2( mov WORD_REG(si), SSE2_wordsLeft)
AS2( sub WORD_REG(si), 80)
ASJ( jz, 6, f)
AS2( mov SSE2_wordsLeft, esi)
AS2( mov SSE2_input, eax)
AS2( mov SSE2_output, ebp)
AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( mov SSE2_input, WORD_REG(ax))
AS2( mov SSE2_output, WORD_REG(bp))
ASJ( jmp, 2, b)
ASL(4) // final output of less than 16 words
AS2( test eax, eax)
AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 5, f)
AS2( movd xmm0, [eax+0*4])
AS2( movd xmm0, [WORD_REG(ax)+0*4])
AS2( pxor xmm2, xmm0)
AS2( movd xmm0, [eax+1*4])
AS2( movd xmm0, [WORD_REG(ax)+1*4])
AS2( pxor xmm3, xmm0)
AS2( movd xmm0, [eax+2*4])
AS2( movd xmm0, [WORD_REG(ax)+2*4])
AS2( pxor xmm1, xmm0)
AS2( movd xmm0, [eax+3*4])
AS2( movd xmm0, [WORD_REG(ax)+3*4])
AS2( pxor xmm4, xmm0)
AS2( add eax, 16)
AS2( add WORD_REG(ax), 16)
ASL(5)
AS2( movd [ebp+0*4], xmm2)
AS2( movd [ebp+1*4], xmm3)
AS2( movd [ebp+2*4], xmm1)
AS2( movd [ebp+3*4], xmm4)
AS2( sub esi, 4)
AS2( movd [WORD_REG(bp)+0*4], xmm2)
AS2( movd [WORD_REG(bp)+1*4], xmm3)
AS2( movd [WORD_REG(bp)+2*4], xmm1)
AS2( movd [WORD_REG(bp)+3*4], xmm4)
AS2( sub WORD_REG(si), 4)
ASJ( jz, 6, f)
AS2( add ebp, 16)
AS2( add WORD_REG(bp), 16)
AS2( psrldq xmm2, 4)
AS2( psrldq xmm3, 4)
AS2( psrldq xmm1, 4)
@ -561,26 +563,26 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jmp, 4, b)
ASL(6) // save state
AS2( mov ebx, SSE2_state)
AS2( mov WORD_REG(bx), SSE2_state)
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
AS2( movdqa [ebx+0*16], xmm0)
AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
AS2( movdqa [ebx+1*16], xmm0)
AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
AS2( movq QWORD PTR [ebx+2*16], xmm0)
AS2( mov [ebx+10*4], ecx)
AS2( mov [ebx+11*4], edx)
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
AS2( mov [WORD_REG(bx)+10*4], ecx)
AS2( mov [WORD_REG(bx)+11*4], edx)
AS1( pop esp)
AS1( pop ebp)
AS_POP( sp)
AS_POP( bp)
#ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;"
:
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
: "memory", "cc"
);
AS_POP( bx)
".att_syntax prefix;"
:
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
: "memory", "cc"
);
#endif
}
else
@ -593,17 +595,16 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#endif
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
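XMUX is a branch-free multiplexer: (0 - (c & 1)) is an all-ones mask when the low bit of c is set and zero otherwise, so the whole expression is x ^ y for an odd selector and plain x for an even one, with no conditional jump. The STEP macro below uses it to pick which state word feeds the FSM register update, keyed on the low bit of the other FSM register. A standalone check of just that trick (function and variable names here are illustrative, not from the library):

#include <cassert>
typedef unsigned int word32;

// Same expression as the XMUX macro above, written out as a function.
inline word32 Xmux(word32 c, word32 x, word32 y)
{
	return x ^ (y & (0 - (c & 1)));	// mask is all ones when c is odd, zero when even
}

int main()
{
	assert(Xmux(2, 0x1234, 0xABCD) == 0x1234);				// even selector: x
	assert(Xmux(3, 0x1234, 0xABCD) == (0x1234 ^ 0xABCD));	// odd selector: x ^ y
	return 0;
}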
#define r1(i) ((i%2) ? reg2 : reg1)
#define r2(i) ((i%2) ? reg1 : reg2)
#define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \
r2(x0) += XMUX(r1(x0), s##x1, s##x8);\
r1(x0) = rotlFixed(r1(x0) * 0x54655307, 7);\
v = s##x0;\
u = (s##x9 + r2(x0)) ^ r1(x0);\
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;
v = s##x0;\
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
#define SOSEMANUK_OUTPUT(x) \
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
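Throughout this commit the explicit 32-bit registers in shared asm (edi, eax, esp, ebp, ...) become WORD_REG(di), WORD_REG(ax), WORD_REG(sp), WORD_REG(bp), and the hard-coded 4 for a saved stack slot becomes WORD_SZ, so the same AS1/AS2 lines can assemble as either 32-bit or 64-bit code. The macros themselves are defined in cpu.h and are not part of this hunk; the following is only a reconstruction, written to make the renames readable, and may differ from the library's exact definitions:

// Assumed shape of the word-size macros used above (reconstruction, not the
// actual cpu.h contents).
#if CRYPTOPP_BOOL_X64
	#define WORD_SZ 8			// a stack slot / machine word is 8 bytes
	#define WORD_REG(x)	r##x	// WORD_REG(ax) -> rax, WORD_REG(sp) -> rsp
#else
	#define WORD_SZ 4			// 32-bit build: 4-byte words
	#define WORD_REG(x)	e##x	// WORD_REG(ax) -> eax, WORD_REG(sp) -> esp
#endif

Under that assumption, the old [esp+4] operands in the Whirlpool code further down become [WORD_REG(sp)+WORD_SZ], the first scratch slot past the saved stack pointer, in both builds.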

View File

@ -34,7 +34,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
void Tiger::Transform (word64 *digest, const word64 *X)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
if (HasSSE2())
{
#ifdef __GNUC__
@ -43,9 +43,14 @@ void Tiger::Transform (word64 *digest, const word64 *X)
".intel_syntax noprefix;"
AS1( push ebx)
#else
#if _MSC_VER < 1300
const word64 *t = table;
AS2( mov edx, t)
#else
AS2( lea edx, [table])
#endif
AS2( mov eax, digest)
AS2( mov esi, X)
AS2( lea edx, [table])
#endif
AS2( movq mm0, [eax])
AS2( movq mm1, [eax+1*8])
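The extra _MSC_VER < 1300 branch above loads the address of the static lookup table through a C++ pointer variable before entering the asm, instead of forming it with lea; presumably the VC 6 inline assembler cannot evaluate lea edx, [table] for this symbol (the commit itself does not say). The pattern, reduced to a standalone sketch that only builds with the 32-bit MSVC inline assembler (names illustrative):

// Illustrative only: the VC 6-era way to get a static array's address into a
// register, mirroring the #if branch above.
static const unsigned __int64 table[256] = { 0 };

void LoadTableAddress()
{
#if defined(_MSC_VER) && _MSC_VER < 1300
	const unsigned __int64 *t = table;	// materialize the address in C++ first
	__asm mov edx, t					// then load the pointer with a plain mov
#else
	__asm lea edx, [table]				// newer inline assemblers form the address directly
#endif
}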

View File

@ -390,7 +390,7 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTIO
// Whirlpool basic transformation. Transforms state based on block.
void Whirlpool::Transform(word64 *digest, const word64 *block)
{
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
if (HasMMX())
{
// MMX version has the same structure as C version below
@ -398,26 +398,29 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS1( push ebx)
AS2( mov ebx, eax)
AS_PUSH( bx)
AS2( mov WORD_REG(bx), WORD_REG(ax))
#else
AS2( lea ebx, [Whirlpool_C])
AS2( mov ecx, digest)
AS2( mov edx, block)
#if _MSC_VER < 1300
AS_PUSH( bx)
#endif
AS2( lea WORD_REG(bx), [Whirlpool_C])
AS2( mov WORD_REG(cx), digest)
AS2( mov WORD_REG(dx), block)
#endif
AS2( mov eax, esp)
AS2( and esp, 0xfffffff0)
AS2( sub esp, 16*8)
AS1( push eax)
AS2( mov WORD_REG(ax), WORD_REG(sp))
AS2( and WORD_REG(sp), -16)
AS2( sub WORD_REG(sp), 16*8)
AS_PUSH( ax)
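// the old stack pointer is saved in WORD_REG(ax), the stack is aligned down to a
// 16-byte boundary and 16*8 bytes are reserved for the key schedule (k) and state
// (s) working copies; writing the mask as -16 rather than 0xfffffff0 keeps the
// intent correct for both 32-bit and 64-bit stack pointers, and the value pushed
// here is popped back into the stack pointer by AS_POP(sp) at the end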
AS2( xor esi, esi)
ASL(0)
AS2( movq mm0, [ecx+8*esi])
AS2( movq [esp+4+8*esi], mm0) // k
AS2( pxor mm0, [edx+8*esi])
AS2( movq [esp+4+64+8*esi], mm0) // s
AS2( movq [ecx+8*esi], mm0)
AS1( inc esi)
AS2( cmp esi, 8)
AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k
AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s
AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0)
AS1( inc WORD_REG(si))
AS2( cmp WORD_REG(si), 8)
ASJ( jne, 0, b)
AS2( xor esi, esi)
@ -427,16 +430,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#define KSL1(a, b) AS2(pxor mm##a, b)
#define KSL(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+8*i])\
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\
AS2(movzx edi, al)\
KSL##op(a, [ebx+3*2048+8*edi])\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSL##op(b, [ebx+2*2048+8*edi])\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSL##op(c, [ebx+1*2048+8*edi])\
KSL##op(d, [ebx+0*2048+8*eax])
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define KSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
@ -445,57 +448,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
AS2(pxor mm##a, b)
#define KSH2(a, b) \
AS2(pxor mm##a, b)\
AS2(movq [esp+4+8*a], mm##a)
AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a)
#define KSH(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\
KSH##op(a, [ebx+3*2048+8*edi])\
KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSH##op(b, [ebx+2*2048+8*edi])\
KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSH##op(c, [ebx+1*2048+8*edi])\
KSH##op(d, [ebx+0*2048+8*eax])
KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define TSL(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+64+8*i])\
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\
AS2(movzx edi, al)\
KSL##op(a, [ebx+3*2048+8*edi])\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSL##op(b, [ebx+2*2048+8*edi])\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSL##op(c, [ebx+1*2048+8*edi])\
KSL##op(d, [ebx+0*2048+8*eax])
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
#define TSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
AS2(pxor mm##a, [esp+4+8*a])\
AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\
AS2(pxor mm##a, b)
#define TSH1(a, b) \
AS2(pxor mm##a, b)
#define TSH2(a, b) \
AS2(pxor mm##a, b)\
AS2(movq [esp+4+64+8*a], mm##a)
AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a)
#define TSH3(a, b) \
AS2(pxor mm##a, b)\
AS2(pxor mm##a, [ecx+8*a])\
AS2(movq [ecx+8*a], mm##a)
AS2(pxor mm##a, [WORD_REG(cx)+8*a])\
AS2(movq [WORD_REG(cx)+8*a], mm##a)
#define TSH(op, i, a, b, c, d) \
AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\
TSH##op(a, [ebx+3*2048+8*edi])\
TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
TSH##op(b, [ebx+2*2048+8*edi])\
TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
TSH##op(c, [ebx+1*2048+8*edi])\
TSH##op(d, [ebx+0*2048+8*eax])
TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
KSL(0, 4, 3, 2, 1, 0)
KSL(0, 0, 7, 6, 5, 4)
@ -514,8 +517,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
KSH(2, 3, 2, 1, 0, 7)
KSH(2, 7, 6, 5, 4, 3)
AS2( pxor mm0, [ebx + 8*1024 + esi*8])
AS2( movq [esp+4], mm0)
AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8])
AS2( movq [WORD_REG(sp)+WORD_SZ], mm0)
TSL(0, 4, 3, 2, 1, 0)
TSL(0, 0, 7, 6, 5, 4)
@ -532,8 +535,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
TSH(1, 5, 4, 3, 2, 1)
TSH(1, 6, 5, 4, 3, 2)
AS1( inc esi)
AS2( cmp esi, 10)
AS1( inc WORD_REG(si))
AS2( cmp WORD_REG(si), 10)
ASJ( je, 2, f)
TSH(2, 3, 2, 1, 0, 7)
@ -550,11 +553,13 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#undef TSL
#undef TSH
AS_POP( sp)
AS1( emms)
AS1( pop esp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
#endif
#ifdef __GNUC__
AS1( pop ebx)
".att_syntax prefix;"
:
: "a" (Whirlpool_C), "c" (digest), "d" (block)

View File

@ -1,80 +1,55 @@
PUBLIC Baseline_Add
PUBLIC Baseline_Sub
.CODE
ALIGN 8
ALIGN 8
Baseline_Add PROC
lea rdx, [rdx+8*rcx]
lea r8, [r8+8*rcx]
lea r9, [r9+8*rcx]
neg rcx ; rcx is negative index
test rcx, 2 ; this clears carry flag
jz $0@Baseline_Add
sub rcx, 2
jmp $1@Baseline_Add
$0@Baseline_Add:
jrcxz $2@Baseline_Add ; loop until rcx overflows and becomes zero
jz $1@Baseline_Add
mov rax,[r8+8*rcx]
adc rax,[r9+8*rcx]
add rax,[r9+8*rcx]
mov [rdx+8*rcx],rax
$0@Baseline_Add:
mov rax,[r8+8*rcx+8]
adc rax,[r9+8*rcx+8]
mov [rdx+8*rcx+8],rax
$1@Baseline_Add:
mov rax,[r8+8*rcx+16]
adc rax,[r9+8*rcx+16]
mov [rdx+8*rcx+16],rax
mov rax,[r8+8*rcx+24]
adc rax,[r9+8*rcx+24]
mov [rdx+8*rcx+24],rax
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero
mov rax,[r8+8*rcx]
adc rax,[r9+8*rcx]
mov [rdx+8*rcx],rax
jmp $0@Baseline_Add
$2@Baseline_Add:
$1@Baseline_Add:
mov rax, 0
setc al ; store carry into rax (return result register)
adc rax, rax ; store carry into rax (return result register)
ret
Baseline_Add ENDP
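Baseline_Add adds two arrays of 64-bit words, least significant word first, and returns the final carry. The arguments arrive in the Microsoft x64 argument registers used above (rcx = word count, rdx = result, r8 and r9 = the two summands). The carry lives in CF for the whole loop: the index advances with lea and the loop closes with jrcxz, neither of which touches the flags, so the adc chain is never broken, and the closing adc rax, rax on a zeroed rax turns CF into the return value. Reference semantics in portable C++ (signature and names are illustrative):

#include <cstddef>
#include <cstdint>

typedef uint64_t word;

// What Baseline_Add computes: C = A + B over N words, returning the final carry.
word BaselineAddRef(size_t N, word *C, const word *A, const word *B)
{
	word carry = 0;
	for (size_t i = 0; i < N; i++)
	{
		word a = A[i];
		word sum = a + B[i];
		word c1 = sum < a;				// carry out of the first addition
		word result = sum + carry;
		carry = c1 | (result < sum);	// carry out of either addition
		C[i] = result;
	}
	return carry;
}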
ALIGN 8
ALIGN 8
Baseline_Sub PROC
lea rdx, [rdx+8*rcx]
lea r8, [r8+8*rcx]
lea r9, [r9+8*rcx]
neg rcx ; rcx is negative index
test rcx, 2 ; this clears carry flag
jz $0@Baseline_Sub
sub rcx, 2
jmp $1@Baseline_Sub
$0@Baseline_Sub:
jrcxz $2@Baseline_Sub ; loop until rcx overflows and becomes zero
jz $1@Baseline_Sub
mov rax,[r8+8*rcx]
sbb rax,[r9+8*rcx]
sub rax,[r9+8*rcx]
mov [rdx+8*rcx],rax
$0@Baseline_Sub:
mov rax,[r8+8*rcx+8]
sbb rax,[r9+8*rcx+8]
mov [rdx+8*rcx+8],rax
$1@Baseline_Sub:
mov rax,[r8+8*rcx+16]
sbb rax,[r9+8*rcx+16]
mov [rdx+8*rcx+16],rax
mov rax,[r8+8*rcx+24]
sbb rax,[r9+8*rcx+24]
mov [rdx+8*rcx+24],rax
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero
mov rax,[r8+8*rcx]
sbb rax,[r9+8*rcx]
mov [rdx+8*rcx],rax
jmp $0@Baseline_Sub
$2@Baseline_Sub:
$1@Baseline_Sub:
mov rax, 0
setc al ; store carry into rax (return result register)
adc rax, rax ; store carry into rax (return result register)
ret
Baseline_Sub ENDP
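Baseline_Sub is the same loop built on sbb, and the final borrow (also held in CF) is returned the same way. The matching reference loop, again only as a sketch:

#include <cstddef>
#include <cstdint>

typedef uint64_t word;

// What Baseline_Sub computes: C = A - B over N words, returning the final borrow.
word BaselineSubRef(size_t N, word *C, const word *A, const word *B)
{
	word borrow = 0;
	for (size_t i = 0; i < N; i++)
	{
		word a = A[i];
		word diff = a - B[i];
		word b1 = diff > a;				// borrow out of the first subtraction
		word result = diff - borrow;
		borrow = b1 | (result > diff);	// borrow out of either subtraction
		C[i] = result;
	}
	return borrow;
}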