fix compile for x64, DLL and VC 6
parent
460c2d6c6a
commit
d2510f30c7
|
|
@ -228,7 +228,7 @@ void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBloc
|
|||
|
||||
SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1))
|
||||
SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3))
|
||||
for (unsigned int i = m_rounds-1; i > 0; --i)
|
||||
for (i = m_rounds-1; i > 0; --i)
|
||||
{
|
||||
DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
|
||||
DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3))
|
||||
|
|
|
|||
29
cpu.cpp
29
cpu.cpp
|
|
@ -1,8 +1,10 @@
|
|||
// cpu.cpp - written and placed in the public domain by Wei Dai
|
||||
|
||||
#include "pch.h"
|
||||
#include "cpu.h"
|
||||
|
||||
#ifndef CRYPTOPP_IMPORTS
|
||||
|
||||
#include "cpu.h"
|
||||
#include "misc.h"
|
||||
#include <algorithm>
|
||||
|
||||
|
|
@ -11,10 +13,15 @@
|
|||
#include <setjmp.h>
|
||||
#endif
|
||||
|
||||
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
|
||||
|
||||
#ifndef _MSC_VER
|
||||
typedef void (*SigHandler)(int);
|
||||
|
||||
static jmp_buf s_jmpNoCPUID;
|
||||
|
|
@ -22,6 +29,7 @@ static void SigIllHandlerCPUID(int)
|
|||
{
|
||||
longjmp(s_jmpNoCPUID, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
bool CpuId(word32 input, word32 *output)
|
||||
{
|
||||
|
|
@ -57,7 +65,11 @@ bool CpuId(word32 input, word32 *output)
|
|||
__asm__
|
||||
(
|
||||
// save ebx in case -fPIC is being used
|
||||
#if CRYPTOPP_BOOL_X86
|
||||
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
|
||||
#else
|
||||
"pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
|
||||
#endif
|
||||
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
|
||||
: "a" (input)
|
||||
);
|
||||
|
|
@ -84,22 +96,19 @@ bool CpuId(word32 input, word32 *output)
|
|||
return true;
|
||||
}
|
||||
|
||||
inline bool TrySSE2()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef CRYPTOPP_CPUID_AVAILABLE
|
||||
|
||||
static bool TrySSE2()
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
return true;
|
||||
#elif defined(_MSC_VER)
|
||||
__try
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
__asm por xmm0, xmm0 // executing SSE2 instruction
|
||||
AS2(por xmm0, xmm0) // executing SSE2 instruction
|
||||
#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
__m128i x = _mm_setzero_si128();	// all-zero SSE2 integer vector; __m128i is the type declared by <emmintrin.h>
|
||||
return _mm_cvtsi128_si32(x) == 0;
|
||||
|
|
@ -137,7 +146,7 @@ static bool TrySSE2()
|
|||
|
||||
bool g_x86DetectionDone = false;
|
||||
bool g_hasSSE2 = false, g_hasSSSE3 = false, g_hasMMX = false, g_isP4 = false;
|
||||
int g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
|
||||
word32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
|
||||
|
||||
void DetectX86Features()
|
||||
{
|
||||
|
|
@ -170,3 +179,5 @@ void DetectX86Features()
|
|||
#endif
|
||||
|
||||
NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
|||
33
cpu.h
33
cpu.h
|
|
@ -3,6 +3,10 @@
|
|||
|
||||
#include "config.h"
|
||||
|
||||
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
|
||||
|
|
@ -10,12 +14,15 @@ NAMESPACE_BEGIN(CryptoPP)
|
|||
#define CRYPTOPP_CPUID_AVAILABLE
|
||||
|
||||
// these should not be used directly
|
||||
extern bool g_x86DetectionDone;
|
||||
extern bool g_hasSSE2, g_hasMMX, g_hasSSSE3, g_isP4;
|
||||
extern int g_cacheLineSize;
|
||||
void DetectX86Features();
|
||||
extern CRYPTOPP_DLL bool g_x86DetectionDone;
|
||||
extern CRYPTOPP_DLL bool g_hasSSE2;
|
||||
extern CRYPTOPP_DLL bool g_hasMMX;
|
||||
extern CRYPTOPP_DLL bool g_hasSSSE3;
|
||||
extern CRYPTOPP_DLL bool g_isP4;
|
||||
extern CRYPTOPP_DLL word32 g_cacheLineSize;
|
||||
CRYPTOPP_DLL void DetectX86Features();
|
||||
|
||||
bool CpuId(word32 input, word32 *output);
|
||||
CRYPTOPP_DLL bool CpuId(word32 input, word32 *output);
|
||||
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
inline bool HasSSE2() {return true;}
|
||||
|
|
@ -94,6 +101,7 @@ inline bool HasMMX() {return false;}
|
|||
#define ASL(x) GNU_ASL(x)
|
||||
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
|
||||
#define ASC(x, y) #x " " #y ";"
|
||||
#define CRYPTOPP_NAKED
|
||||
#else
|
||||
#define AS1(x) __asm {x}
|
||||
#define AS2(x, y) __asm {x, y}
|
||||
|
|
@ -102,11 +110,26 @@ inline bool HasMMX() {return false;}
|
|||
#define ASL(x) __asm {label##x:}
|
||||
#define ASJ(x, y, z) __asm {x label##y}
|
||||
#define ASC(x, y) __asm {x label##y}
|
||||
#define CRYPTOPP_NAKED __declspec(naked)
|
||||
#endif
|
||||
|
||||
// GNU assembler doesn't seem to have mod operator
|
||||
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
|
||||
|
||||
#if CRYPTOPP_BOOL_X86
|
||||
#define WORD_SZ 4
|
||||
#define WORD_REG(x) e##x
|
||||
#define WORD_PTR DWORD PTR
|
||||
#define AS_PUSH(x) AS1(push e##x)
|
||||
#define AS_POP(x) AS1(pop e##x)
|
||||
#elif CRYPTOPP_BOOL_X64
|
||||
#define WORD_SZ 8
|
||||
#define WORD_REG(x) r##x
|
||||
#define WORD_PTR QWORD PTR
|
||||
#define AS_PUSH(x) AS1(pushq r##x)
|
||||
#define AS_POP(x) AS1(popq r##x)
|
||||
#endif
|
||||
|
||||
NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
|||
13
datatest.cpp
13
datatest.cpp
|
|
@ -5,14 +5,14 @@
|
|||
#include "randpool.h"
|
||||
#include "files.h"
|
||||
#include "trunhash.h"
|
||||
#include "queue.h"
|
||||
#include "validate.h"
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
USING_NAMESPACE(CryptoPP)
|
||||
USING_NAMESPACE(std)
|
||||
|
||||
RandomPool & GlobalRNG();
|
||||
|
||||
typedef std::map<std::string, std::string> TestData;
|
||||
|
||||
class TestFailure : public Exception
|
||||
|
|
@ -67,7 +67,7 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
|
|||
s1 = s1.substr(s1.find(' ')+1);
|
||||
}
|
||||
|
||||
s2.clear();
|
||||
s2 = ""; // MSVC 6 doesn't have clear();
|
||||
|
||||
if (s1[0] == '\"')
|
||||
{
|
||||
|
|
@ -85,8 +85,13 @@ void PutDecodedDatumInto(const TestData &data, const char *name, BufferedTransfo
|
|||
s1 = s1.substr(STDMIN(s1.find(' '), s1.length()));
|
||||
}
|
||||
|
||||
ByteQueue q;
|
||||
while (repeat--)
|
||||
target.Put((const byte *)s2.data(), s2.size());
|
||||
{
|
||||
q.Put((const byte *)s2.data(), s2.size());
|
||||
if (q.MaxRetrievable() > 4*1024 || repeat == 0)
|
||||
q.TransferTo(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
389
integer.cpp
389
integer.cpp
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
#include <iostream>
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
||||
#if _MSC_VER >= 1400
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
|
|
@ -30,6 +30,8 @@
|
|||
#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
|
||||
#endif
|
||||
|
||||
#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
|
||||
|
||||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
|
||||
|
|
@ -99,7 +101,36 @@ static word AtomicInverseModPower2(word A)
|
|||
|
||||
// ********************************************************
|
||||
|
||||
#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
|
||||
#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64
|
||||
#define Declare2Words(x) word x##0, x##1;
|
||||
#define AssignWord(a, b) a##0 = b; a##1 = 0;
|
||||
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
|
||||
#define LowWord(a) a##0
|
||||
#define HighWord(a) a##1
|
||||
#ifdef _MSC_VER
|
||||
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
|
||||
#define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
|
||||
#elif defined(__DECCXX)
|
||||
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
|
||||
#elif CRYPTOPP_BOOL_X64
|
||||
#define MultiplyWords(p, a, b) asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc");
|
||||
#define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
|
||||
#define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
|
||||
#define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
|
||||
#define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
|
||||
#define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
|
||||
#endif
|
||||
#ifndef Double3Words
|
||||
#define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
|
||||
#endif
|
||||
#ifndef Acc2WordsBy2
|
||||
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
|
||||
#endif
|
||||
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
|
||||
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
|
||||
#define GetCarry(u) u##1
|
||||
#define GetBorrow(u) u##1
|
||||
#else
|
||||
#define Declare2Words(x) dword x;
|
||||
#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
|
||||
#define MultiplyWords(p, a, b) p = __emulu(a, b);
|
||||
|
|
@ -108,34 +139,23 @@ static word AtomicInverseModPower2(word A)
|
|||
#endif
|
||||
#define AssignWord(a, b) a = b;
|
||||
#define Add2WordsBy1(a, b, c) a = b + c;
|
||||
#define Acc2WordsBy1(a, b) a += b;
|
||||
#define Acc2WordsBy2(a, b) a += b;
|
||||
#define LowWord(a) (word)a
|
||||
#define HighWord(a) (word)(a>>WORD_BITS)
|
||||
#define Double2Words(a) a += a;
|
||||
#define LowWord(a) word(a)
|
||||
#define HighWord(a) word(a>>WORD_BITS)
|
||||
#define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
|
||||
#define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
|
||||
#define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
|
||||
#define GetCarry(u) HighWord(u)
|
||||
#define GetBorrow(u) word(u>>(WORD_BITS*2-1))
|
||||
#else
|
||||
#define Declare2Words(x) word x##0, x##1;
|
||||
#define AssignWord(a, b) a##0 = b; a##1 = 0;
|
||||
#define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
|
||||
#endif
|
||||
#ifndef MulAcc
|
||||
#define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
|
||||
#endif
|
||||
#ifndef Acc2WordsBy1
|
||||
#define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
|
||||
#define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
|
||||
#define LowWord(a) a##0
|
||||
#define HighWord(a) a##1
|
||||
#ifdef _MSC_VER
|
||||
#define MultiplyWords(p, a, b) p##0 = _umul128(a, b, &p##1);
|
||||
#define Double2Words(a) a##1 = __shiftleft128(a##0, a##1, 1); a##0 += a##0;
|
||||
#elif defined(__DECCXX)
|
||||
#define MultiplyWords(p, a, b) p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
|
||||
#define Double2Words(a) a##1 = (a##1 + a##1) + (a##0 >> (WORD_BITS-1)); a##0 += a##0;
|
||||
#endif
|
||||
#define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
|
||||
#define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
|
||||
#define GetCarry(u) u##1
|
||||
#define GetBorrow(u) u##1
|
||||
#endif
|
||||
#ifndef Acc3WordsBy2
|
||||
#define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
|
||||
#endif
|
||||
|
||||
class DWord
|
||||
|
|
@ -411,9 +431,8 @@ inline word DWord::operator%(word a)
|
|||
|
||||
// use some tricks to share assembly code between MSVC and GCC
|
||||
#if defined(__GNUC__)
|
||||
#define CRYPTOPP_NAKED
|
||||
#define AddPrologue \
|
||||
word32 result; \
|
||||
word result; \
|
||||
__asm__ __volatile__ \
|
||||
( \
|
||||
".intel_syntax noprefix;"
|
||||
|
|
@ -454,7 +473,6 @@ inline word DWord::operator%(word a)
|
|||
: "memory", "cc" \
|
||||
);
|
||||
#else
|
||||
#define CRYPTOPP_NAKED __declspec(naked)
|
||||
#define AddPrologue \
|
||||
__asm push edi \
|
||||
__asm push esi \
|
||||
|
|
@ -464,33 +482,107 @@ inline word DWord::operator%(word a)
|
|||
__asm pop esi \
|
||||
__asm pop edi \
|
||||
__asm ret 8
|
||||
#if _MSC_VER < 1300
|
||||
#define SaveEBX __asm push ebx
|
||||
#define RestoreEBX __asm pop ebx
|
||||
#else
|
||||
#define SaveEBX
|
||||
#define RestoreEBX
|
||||
#endif
|
||||
#define SquPrologue \
|
||||
AS2( mov eax, A) \
|
||||
AS2( mov ecx, C) \
|
||||
SaveEBX \
|
||||
AS2( lea ebx, s_maskLow16)
|
||||
#define SquEpilogue
|
||||
#define MulPrologue \
|
||||
AS2( mov eax, A) \
|
||||
AS2( mov edi, B) \
|
||||
AS2( mov ecx, C) \
|
||||
SaveEBX \
|
||||
AS2( lea ebx, s_maskLow16)
|
||||
#define MulEpilogue
|
||||
#define TopPrologue \
|
||||
AS2( mov eax, A) \
|
||||
AS2( mov edi, B) \
|
||||
AS2( mov ecx, C) \
|
||||
AS2( mov esi, L) \
|
||||
SaveEBX \
|
||||
AS2( lea ebx, s_maskLow16)
|
||||
#define TopEpilogue
|
||||
#define SquEpilogue RestoreEBX
|
||||
#define MulEpilogue RestoreEBX
|
||||
#define TopEpilogue RestoreEBX
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && defined(_M_X64)
|
||||
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
|
||||
extern "C" {
|
||||
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
|
||||
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
|
||||
word Baseline_Add(size_t N, word *C, const word *A, const word *B);
|
||||
word Baseline_Sub(size_t N, word *C, const word *A, const word *B);
|
||||
}
|
||||
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE)
|
||||
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
|
||||
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
|
||||
// Adds the N-word integers A and B, stores the sum in C and returns the carry
// out of the most significant word (0 or 1).
// N must be even: the loop consumes two words per pass and only tests for
// completion after each pair.
word Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
	word result;
	__asm__ __volatile__
	(
	".intel_syntax;"
	// %1 = -N, used as an index that counts up to zero; %2..%4 point one
	// past the end of C, A and B, so [ptr+8*%1] walks the arrays forward
	AS1(	neg		%1)
	ASJ(	jz,		1, f)
	// first word uses a plain add so no stale carry is consumed
	AS2(	mov		%0,[%3+8*%1])
	AS2(	add		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASL(0)
	AS2(	mov		%0,[%3+8*%1+8])
	AS2(	adc		%0,[%4+8*%1+8])
	AS2(	mov		[%2+8*%1+8],%0)
	// lea and jrcxz do not touch the flags, so the carry survives the index update
	AS2(	lea		%1,[%1+2])
	ASJ(	jrcxz,	1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	adc		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASJ(	jmp,	0, b)
	ASL(1)
	// materialize the carry flag as 0 or 1 (mov leaves the flags alone)
	AS2(	mov		%0, 0)
	AS2(	adc		%0, %0)
	".att_syntax;"
	// N is read-write ("+c"): neg/lea overwrite its register, and jrcxz
	// requires it to live in rcx. Operand numbers are %0 result, %1 N,
	// %2 C+N, %3 A+N, %4 B+N.
	: "=&r" (result), "+c" (N)
	: "r" (C+N), "r" (A+N), "r" (B+N)
	: "memory", "cc"
	);
	return result;
}
|
||||
|
||||
// Subtracts the N-word integer B from A, stores the difference in C and
// returns the borrow out of the most significant word (0 or 1).
// N must be even: the loop consumes two words per pass and only tests for
// completion after each pair.
word Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	word result;
	__asm__ __volatile__
	(
	".intel_syntax;"
	// %1 = -N, used as an index that counts up to zero; %2..%4 point one
	// past the end of C, A and B, so [ptr+8*%1] walks the arrays forward
	AS1(	neg		%1)
	ASJ(	jz,		1, f)
	// first word uses a plain sub so no stale borrow is consumed
	AS2(	mov		%0,[%3+8*%1])
	AS2(	sub		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASL(0)
	AS2(	mov		%0,[%3+8*%1+8])
	AS2(	sbb		%0,[%4+8*%1+8])
	AS2(	mov		[%2+8*%1+8],%0)
	// lea and jrcxz do not touch the flags, so the borrow survives the index update
	AS2(	lea		%1,[%1+2])
	ASJ(	jrcxz,	1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	sbb		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASJ(	jmp,	0, b)
	ASL(1)
	// after sbb the carry flag holds the borrow; adc of 0+0 turns it into 0 or 1
	AS2(	mov		%0, 0)
	AS2(	adc		%0, %0)
	".att_syntax;"
	// N is read-write ("+c"): neg/lea overwrite its register, and jrcxz
	// requires it to live in rcx. Operand numbers are %0 result, %1 N,
	// %2 C+N, %3 A+N, %4 B+N.
	: "=&r" (result), "+c" (N)
	: "r" (C+N), "r" (A+N), "r" (B+N)
	: "memory", "cc"
	);
	return result;
}
|
||||
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
|
||||
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
AddPrologue
|
||||
|
||||
|
|
@ -531,7 +623,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word
|
|||
AddEpilogue
|
||||
}
|
||||
|
||||
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
AddPrologue
|
||||
|
||||
|
|
@ -572,8 +664,8 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word
|
|||
AddEpilogue
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
AddPrologue
|
||||
|
||||
|
|
@ -629,7 +721,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A,
|
|||
|
||||
AddEpilogue
|
||||
}
|
||||
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
CRYPTOPP_NAKED word CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
AddPrologue
|
||||
|
||||
|
|
@ -687,7 +779,7 @@ CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A,
|
|||
}
|
||||
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#else
|
||||
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
|
||||
word CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
assert (N%2 == 0);
|
||||
|
||||
|
|
@ -703,7 +795,7 @@ int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word
|
|||
return int(GetCarry(u));
|
||||
}
|
||||
|
||||
int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
word CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
|
||||
{
|
||||
assert (N%2 == 0);
|
||||
|
||||
|
|
@ -737,7 +829,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
#define Mul_2 \
|
||||
Mul_Begin(2) \
|
||||
Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
|
||||
Mul_End(2)
|
||||
Mul_End(1, 1)
|
||||
|
||||
#define Mul_4 \
|
||||
Mul_Begin(4) \
|
||||
|
|
@ -746,7 +838,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
|
||||
Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
|
||||
Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
|
||||
Mul_End(4)
|
||||
Mul_End(5, 3)
|
||||
|
||||
#define Mul_8 \
|
||||
Mul_Begin(8) \
|
||||
|
|
@ -763,7 +855,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
|
||||
Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
|
||||
Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
|
||||
Mul_End(8)
|
||||
Mul_End(13, 7)
|
||||
|
||||
#define Mul_16 \
|
||||
Mul_Begin(16) \
|
||||
|
|
@ -796,7 +888,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
|
||||
Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
|
||||
Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
|
||||
Mul_End(16)
|
||||
Mul_End(29, 15)
|
||||
|
||||
#define Squ_2 \
|
||||
Squ_Begin(2) \
|
||||
|
|
@ -900,6 +992,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
|
||||
Bot_End(16)
|
||||
|
||||
#if 0
|
||||
#define Mul_Begin(n) \
|
||||
Declare2Words(p) \
|
||||
Declare2Words(c) \
|
||||
|
|
@ -938,9 +1031,7 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
|
||||
#define Bot_End(n) \
|
||||
R[n-1] = e;
|
||||
|
||||
/*
|
||||
// this is slower on MSVC 2005 Win32
|
||||
#else
|
||||
#define Mul_Begin(n) \
|
||||
Declare2Words(p) \
|
||||
word c; \
|
||||
|
|
@ -950,25 +1041,20 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
AssignWord(d, HighWord(p))
|
||||
|
||||
#define Mul_Acc(i, j) \
|
||||
MultiplyWords(p, A[i], B[j]) \
|
||||
Acc2WordsBy1(p, c) \
|
||||
c = LowWord(p); \
|
||||
Acc2WordsBy1(d, HighWord(p))
|
||||
MulAcc(c, d, A[i], B[j])
|
||||
|
||||
#define Mul_SaveAcc(k, i, j) \
|
||||
R[k] = c; \
|
||||
MultiplyWords(p, A[i], B[j]) \
|
||||
Acc2WordsBy1(p, LowWord(d)) \
|
||||
c = LowWord(p); \
|
||||
c = LowWord(d); \
|
||||
AssignWord(d, HighWord(d)) \
|
||||
Acc2WordsBy1(d, HighWord(p))
|
||||
MulAcc(c, d, A[i], B[j])
|
||||
|
||||
#define Mul_End(n) \
|
||||
R[2*n-3] = c; \
|
||||
MultiplyWords(p, A[n-1], B[n-1])\
|
||||
Acc2WordsBy2(d, p) \
|
||||
R[2*n-2] = LowWord(d); \
|
||||
R[2*n-1] = HighWord(d);
|
||||
#define Mul_End(k, i) \
|
||||
R[k] = c; \
|
||||
MultiplyWords(p, A[i], B[i]) \
|
||||
Acc2WordsBy2(p, d) \
|
||||
R[k+1] = LowWord(p); \
|
||||
R[k+2] = HighWord(p);
|
||||
|
||||
#define Bot_SaveAcc(k, i, j) \
|
||||
R[k] = c; \
|
||||
|
|
@ -980,52 +1066,45 @@ static word LinearMultiply(word *C, const word *A, word B, size_t N)
|
|||
|
||||
#define Bot_End(n) \
|
||||
R[n-1] = c;
|
||||
*/
|
||||
#endif
|
||||
|
||||
#define Squ_Begin(n) \
|
||||
Declare2Words(p) \
|
||||
Declare2Words(c) \
|
||||
word c; \
|
||||
Declare2Words(d) \
|
||||
Declare2Words(e) \
|
||||
MultiplyWords(p, A[0], A[0]) \
|
||||
R[0] = LowWord(p); \
|
||||
AssignWord(e, HighWord(p)) \
|
||||
MultiplyWords(p, A[0], A[1]) \
|
||||
AssignWord(c, LowWord(p)) \
|
||||
c = LowWord(p); \
|
||||
AssignWord(d, HighWord(p)) \
|
||||
Squ_NonDiag \
|
||||
|
||||
#define Squ_NonDiag \
|
||||
Double2Words(c) \
|
||||
Double2Words(d) \
|
||||
Double3Words(c, d)
|
||||
|
||||
#define Squ_SaveAcc(k, i, j) \
|
||||
Acc2WordsBy2(c, e) \
|
||||
R[k] = LowWord(c); \
|
||||
Add2WordsBy1(e, d, HighWord(c)) \
|
||||
Acc3WordsBy2(c, d, e) \
|
||||
R[k] = c; \
|
||||
MultiplyWords(p, A[i], A[j]) \
|
||||
AssignWord(c, LowWord(p)) \
|
||||
c = LowWord(p); \
|
||||
AssignWord(d, HighWord(p)) \
|
||||
|
||||
#define Squ_Acc(i, j) \
|
||||
MultiplyWords(p, A[i], A[j]) \
|
||||
Acc2WordsBy1(c, LowWord(p)) \
|
||||
Acc2WordsBy1(d, HighWord(p))
|
||||
MulAcc(c, d, A[i], A[j])
|
||||
|
||||
#define Squ_Diag(i) \
|
||||
Squ_NonDiag \
|
||||
MultiplyWords(p, A[i], A[i]) \
|
||||
Acc2WordsBy1(c, LowWord(p)) \
|
||||
Acc2WordsBy1(d, HighWord(p)) \
|
||||
MulAcc(c, d, A[i], A[i])
|
||||
|
||||
#define Squ_End(n) \
|
||||
Acc2WordsBy2(c, e) \
|
||||
R[2*n-3] = LowWord(c); \
|
||||
Acc2WordsBy1(d, HighWord(c)) \
|
||||
Acc3WordsBy2(c, d, e) \
|
||||
R[2*n-3] = c; \
|
||||
MultiplyWords(p, A[n-1], A[n-1])\
|
||||
Acc2WordsBy2(d, p) \
|
||||
R[2*n-2] = LowWord(d); \
|
||||
R[2*n-1] = HighWord(d);
|
||||
Acc2WordsBy2(p, e) \
|
||||
R[2*n-2] = LowWord(p); \
|
||||
R[2*n-1] = HighWord(p);
|
||||
|
||||
void Baseline_Multiply2(word *R, const word *A, const word *B)
|
||||
{
|
||||
|
|
@ -1072,7 +1151,62 @@ void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
|
|||
Bot_8
|
||||
}
|
||||
|
||||
/*
|
||||
#define Top_Begin(n) \
|
||||
Declare2Words(p) \
|
||||
word c; \
|
||||
Declare2Words(d) \
|
||||
MultiplyWords(p, A[0], B[n-2]);\
|
||||
AssignWord(d, HighWord(p));
|
||||
|
||||
#define Top_Acc(i, j) \
|
||||
MultiplyWords(p, A[i], B[j]);\
|
||||
Acc2WordsBy1(d, HighWord(p));
|
||||
|
||||
#define Top_SaveAcc0(i, j) \
|
||||
c = LowWord(d); \
|
||||
AssignWord(d, HighWord(d)) \
|
||||
MulAcc(c, d, A[i], B[j])
|
||||
|
||||
#define Top_SaveAcc1(i, j) \
|
||||
c = L<c; \
|
||||
Acc2WordsBy1(d, c); \
|
||||
c = LowWord(d); \
|
||||
AssignWord(d, HighWord(d)) \
|
||||
MulAcc(c, d, A[i], B[j])
|
||||
|
||||
// Stores the two high-order words of the four-word product A*B in R.
// The full product is formed by Baseline_Multiply2; L is accepted only to
// match the signature of the other MultiplyTop routines and is not read.
void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
{
	word product[4];
	Baseline_Multiply2(product, A, B);

	const word *upperHalf = product + 2;
	R[0] = upperHalf[0];
	R[1] = upperHalf[1];
}
|
||||
|
||||
// Writes R[0..3] from a macro-expanded column schedule over A[0..3] and B[0..3].
// The schedule starts at the column i+j == 2, where Top_Begin/Top_Acc keep only the
// high word of each partial product; lower columns are never computed.
// L is consumed by Top_SaveAcc1, which compares it against the running low word.
// NOTE(review): presumably R receives the upper half of the 8-word product and L is the
// caller's already-known word used to recover the carry from the skipped columns -- confirm
// against MultiplyTop's call site.
// The trailing backslashes only splice the lines together; this is not a macro body.
void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
|
||||
{
|
||||
Top_Begin(4)
|
||||
Top_Acc(1, 1) Top_Acc(2, 0) \
|
||||
Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
|
||||
Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
|
||||
Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
|
||||
Mul_End(1, 3)
|
||||
}
|
||||
|
||||
// Writes R[0..7] from a macro-expanded column schedule over A[0..7] and B[0..7].
// The schedule starts at the column i+j == 6, where Top_Begin/Top_Acc keep only the
// high word of each partial product; lower columns are never computed.
// L is consumed by Top_SaveAcc1, which compares it against the running low word.
// NOTE(review): presumably R receives the upper half of the 16-word product and L is the
// caller's already-known word used to recover the carry from the skipped columns -- confirm
// against MultiplyTop's call site.
// The trailing backslashes only splice the lines together; this is not a macro body.
void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
|
||||
{
|
||||
Top_Begin(8)
|
||||
Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
|
||||
Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
|
||||
Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
|
||||
Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
|
||||
Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
|
||||
Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
|
||||
Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
|
||||
Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
|
||||
Mul_End(5, 7)
|
||||
}
|
||||
|
||||
#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
|
||||
void Baseline_Multiply16(word *R, const word *A, const word *B)
|
||||
{
|
||||
Mul_16
|
||||
|
|
@ -1087,16 +1221,40 @@ void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
|
|||
{
|
||||
Bot_16
|
||||
}
|
||||
*/
|
||||
|
||||
// Writes R[0..15] from a macro-expanded column schedule over A[0..15] and B[0..15].
// The schedule starts at the column i+j == 14, where Top_Begin/Top_Acc keep only the
// high word of each partial product; lower columns are never computed.
// L is consumed by Top_SaveAcc1, which compares it against the running low word.
// NOTE(review): presumably R receives the upper half of the 32-word product and L is the
// caller's already-known word used to recover the carry from the skipped columns -- confirm
// against MultiplyTop's call site.
// The trailing backslashes only splice the lines together; this is not a macro body.
void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
|
||||
{
|
||||
Top_Begin(16)
|
||||
Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
|
||||
Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
|
||||
Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
|
||||
Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
|
||||
Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
|
||||
Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
|
||||
Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
|
||||
Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
|
||||
Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
|
||||
Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
|
||||
Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
|
||||
Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
|
||||
Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
|
||||
Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
|
||||
Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
|
||||
Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
|
||||
Mul_End(13, 15)
|
||||
}
|
||||
#endif
|
||||
|
||||
// ********************************************************
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
|
||||
CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
|
||||
|
||||
#undef Mul_Begin
|
||||
#undef Mul_Acc
|
||||
#undef Top_Begin
|
||||
#undef Top_Acc
|
||||
#undef Squ_Acc
|
||||
#undef Squ_NonDiag
|
||||
#undef Squ_Diag
|
||||
|
|
@ -1760,33 +1918,35 @@ void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
|
|||
Top_End(8)
|
||||
}
|
||||
|
||||
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#endif // #if CRYPTOPP_INTEGER_SSE2
|
||||
|
||||
// ********************************************************
|
||||
|
||||
typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
|
||||
typedef word (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
|
||||
typedef void (* PMul)(word *C, const word *A, const word *B);
|
||||
typedef void (* PSqu)(word *C, const word *A);
|
||||
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
|
||||
static PMulTop s_pTop[3];
|
||||
static size_t s_recursionLimit = 8;
|
||||
#else
|
||||
static const size_t s_recursionLimit = 8;
|
||||
static const size_t s_recursionLimit = 16;
|
||||
#endif
|
||||
|
||||
static PMul s_pMul[9], s_pBot[9];
|
||||
static PSqu s_pSqu[9];
|
||||
static PMulTop s_pTop[9];
|
||||
|
||||
static void SetFunctionPointers()
|
||||
{
|
||||
s_pMul[0] = &Baseline_Multiply2;
|
||||
s_pBot[0] = &Baseline_MultiplyBottom2;
|
||||
s_pSqu[0] = &Baseline_Square2;
|
||||
s_pTop[0] = &Baseline_MultiplyTop2;
|
||||
s_pTop[1] = &Baseline_MultiplyTop4;
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
if (HasSSE2())
|
||||
{
|
||||
if (IsP4())
|
||||
|
|
@ -1812,39 +1972,45 @@ static void SetFunctionPointers()
|
|||
s_pSqu[4] = &SSE2_Square16;
|
||||
s_pSqu[8] = &SSE2_Square32;
|
||||
|
||||
s_pTop[0] = &SSE2_MultiplyTop8;
|
||||
s_pTop[1] = &SSE2_MultiplyTop16;
|
||||
s_pTop[2] = &SSE2_MultiplyTop32;
|
||||
s_pTop[2] = &SSE2_MultiplyTop8;
|
||||
s_pTop[4] = &SSE2_MultiplyTop16;
|
||||
s_pTop[8] = &SSE2_MultiplyTop32;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
s_pMul[1] = &Baseline_Multiply4;
|
||||
s_pMul[2] = &Baseline_Multiply8;
|
||||
// s_pMul[4] = &Baseline_Multiply16;
|
||||
|
||||
s_pBot[1] = &Baseline_MultiplyBottom4;
|
||||
s_pBot[2] = &Baseline_MultiplyBottom8;
|
||||
// s_pBot[4] = &Baseline_MultiplyBottom16;
|
||||
|
||||
s_pSqu[1] = &Baseline_Square4;
|
||||
s_pSqu[2] = &Baseline_Square8;
|
||||
// s_pSqu[4] = &Baseline_Square16;
|
||||
|
||||
s_pTop[2] = &Baseline_MultiplyTop8;
|
||||
|
||||
#if !CRYPTOPP_INTEGER_SSE2
|
||||
s_pMul[4] = &Baseline_Multiply16;
|
||||
s_pBot[4] = &Baseline_MultiplyBottom16;
|
||||
s_pSqu[4] = &Baseline_Square16;
|
||||
s_pTop[4] = &Baseline_MultiplyTop16;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
inline int Add(word *C, const word *A, const word *B, size_t N)
|
||||
inline word Add(word *C, const word *A, const word *B, size_t N)
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
return s_pAdd(N, C, A, B);
|
||||
#else
|
||||
return Baseline_Add(N, C, A, B);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline int Subtract(word *C, const word *A, const word *B, size_t N)
|
||||
inline word Subtract(word *C, const word *A, const word *B, size_t N)
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_INTEGER_SSE2
|
||||
return s_pSub(N, C, A, B);
|
||||
#else
|
||||
return Baseline_Sub(N, C, A, B);
|
||||
|
|
@ -1969,16 +2135,8 @@ void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B,
|
|||
{
|
||||
assert(N>=2 && N%2==0);
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
if (HasSSE2() && ((N>=8) & (N<=32)))
|
||||
s_pTop[N/16](R, A, B, L[N-1]);
|
||||
else
|
||||
#endif
|
||||
if (N<=4)
|
||||
{
|
||||
s_pMul[N/4](T, A, B);
|
||||
memcpy(R, T+N, N*WORD_SIZE);
|
||||
}
|
||||
if (N <= s_recursionLimit)
|
||||
s_pTop[N/4](R, A, B, L[N-1]);
|
||||
else
|
||||
{
|
||||
const size_t N2 = N/2;
|
||||
|
|
@ -3076,13 +3234,6 @@ public:
|
|||
memcpy(m_counterAndSeed + 4, seed, seedSize);
|
||||
}
|
||||
|
||||
byte GenerateByte()
|
||||
{
|
||||
byte b;
|
||||
GenerateBlock(&b, 1);
|
||||
return b;
|
||||
}
|
||||
|
||||
void GenerateBlock(byte *output, size_t size)
|
||||
{
|
||||
PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
|
||||
|
|
|
|||
137
panama.cpp
137
panama.cpp
|
|
@ -26,31 +26,31 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
|||
__asm__ __volatile__
|
||||
(
|
||||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
AS_PUSH( bx)
|
||||
#else
|
||||
AS2( mov ecx, count)
|
||||
AS2( mov esi, state)
|
||||
AS2( mov edi, z)
|
||||
AS2( mov edx, y)
|
||||
AS2( mov WORD_REG(cx), count)
|
||||
AS2( mov WORD_REG(si), state)
|
||||
AS2( mov WORD_REG(di), z)
|
||||
AS2( mov WORD_REG(dx), y)
|
||||
#endif
|
||||
AS2( shl ecx, 5)
|
||||
AS2( shl WORD_REG(cx), 5)
|
||||
ASJ( jz, 5, f)
|
||||
AS2( mov ebx, [esi+4*17])
|
||||
AS2( add ecx, ebx)
|
||||
AS2( mov ebx, [WORD_REG(si)+4*17])
|
||||
AS2( add WORD_REG(cx), WORD_REG(bx))
|
||||
|
||||
AS1( push ebp)
|
||||
AS1( push ecx)
|
||||
AS_PUSH( bp)
|
||||
AS_PUSH( cx)
|
||||
|
||||
AS2( movdqa xmm0, [esi+0*16])
|
||||
AS2( movdqa xmm1, [esi+1*16])
|
||||
AS2( movdqa xmm2, [esi+2*16])
|
||||
AS2( movdqa xmm3, [esi+3*16])
|
||||
AS2( mov eax, [esi+4*16])
|
||||
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
|
||||
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
|
||||
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
|
||||
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
|
||||
AS2( mov eax, [WORD_REG(si)+4*16])
|
||||
|
||||
ASL(4)
|
||||
// gamma and pi
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
AS2( test ebx, 1)
|
||||
AS2( test WORD_REG(bx), 1)
|
||||
ASJ( jnz, 6, f)
|
||||
#endif
|
||||
AS2( movdqa xmm6, xmm2)
|
||||
|
|
@ -81,7 +81,7 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
|||
#define pi(i) \
|
||||
AS2( movd ecx, xmm7)\
|
||||
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
|
||||
AS2( mov [esi+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
|
||||
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
|
||||
|
||||
#define pi4(x, y, z, a, b, c, d) \
|
||||
AS2( pcmpeqb xmm7, xmm7)\
|
||||
|
|
@ -110,65 +110,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
|||
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
|
||||
|
||||
// keystream
|
||||
AS2( test edi, edi)
|
||||
AS2( test WORD_REG(di), WORD_REG(di))
|
||||
ASJ( jz, 0, f)
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm2)
|
||||
AS2( punpckhqdq xmm6, xmm2)
|
||||
AS2( test edx, 0xf)
|
||||
AS2( test WORD_REG(dx), 0xf)
|
||||
ASJ( jnz, 2, f)
|
||||
AS2( test edx, edx)
|
||||
AS2( test WORD_REG(dx), WORD_REG(dx))
|
||||
ASJ( jz, 1, f)
|
||||
AS2( pxor xmm4, [edx])
|
||||
AS2( pxor xmm6, [edx+16])
|
||||
AS2( add edx, 32)
|
||||
AS2( pxor xmm4, [WORD_REG(dx)])
|
||||
AS2( pxor xmm6, [WORD_REG(dx)+16])
|
||||
AS2( add WORD_REG(dx), 32)
|
||||
ASJ( jmp, 1, f)
|
||||
ASL(2)
|
||||
AS2( movdqu xmm0, [edx])
|
||||
AS2( movdqu xmm2, [edx+16])
|
||||
AS2( movdqu xmm0, [WORD_REG(dx)])
|
||||
AS2( movdqu xmm2, [WORD_REG(dx)+16])
|
||||
AS2( pxor xmm4, xmm0)
|
||||
AS2( pxor xmm6, xmm2)
|
||||
AS2( add edx, 32)
|
||||
AS2( add WORD_REG(dx), 32)
|
||||
ASL(1)
|
||||
AS2( test edi, 0xf)
|
||||
AS2( test WORD_REG(di), 0xf)
|
||||
ASJ( jnz, 3, f)
|
||||
AS2( movdqa [edi], xmm4)
|
||||
AS2( movdqa [edi+16], xmm6)
|
||||
AS2( add edi, 32)
|
||||
AS2( movdqa [WORD_REG(di)], xmm4)
|
||||
AS2( movdqa [WORD_REG(di)+16], xmm6)
|
||||
AS2( add WORD_REG(di), 32)
|
||||
ASJ( jmp, 0, f)
|
||||
ASL(3)
|
||||
AS2( movdqu [edi], xmm4)
|
||||
AS2( movdqu [edi+16], xmm6)
|
||||
AS2( add edi, 32)
|
||||
AS2( movdqu [WORD_REG(di)], xmm4)
|
||||
AS2( movdqu [WORD_REG(di)+16], xmm6)
|
||||
AS2( add WORD_REG(di), 32)
|
||||
ASL(0)
|
||||
|
||||
// buffer update
|
||||
AS2( lea ecx, [ebx + 32])
|
||||
AS2( and ecx, 31*32)
|
||||
AS2( lea ebp, [ebx + (32-24)*32])
|
||||
AS2( and ebp, 31*32)
|
||||
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
|
||||
AS2( and WORD_REG(cx), 31*32)
|
||||
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
|
||||
AS2( and WORD_REG(bp), 31*32)
|
||||
|
||||
AS2( movdqa xmm0, [esi+20*4+ecx+0*8])
|
||||
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
|
||||
AS2( pxor xmm3, xmm0)
|
||||
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
|
||||
AS2( movdqa [esi+20*4+ecx+0*8], xmm3)
|
||||
AS2( pxor xmm0, [esi+20*4+ebp+2*8])
|
||||
AS2( movdqa [esi+20*4+ebp+2*8], xmm0)
|
||||
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
|
||||
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
|
||||
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+2*8])
|
||||
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
|
||||
AS2( pxor xmm1, xmm4)
|
||||
AS2( movdqa [esi+20*4+ecx+2*8], xmm1)
|
||||
AS2( pxor xmm4, [esi+20*4+ebp+0*8])
|
||||
AS2( movdqa [esi+20*4+ebp+0*8], xmm4)
|
||||
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
|
||||
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
|
||||
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
|
||||
|
||||
// theta
|
||||
AS2( movdqa xmm3, [esi+3*16])
|
||||
AS2( movdqa xmm2, [esi+2*16])
|
||||
AS2( movdqa xmm1, [esi+1*16])
|
||||
AS2( movdqa xmm0, [esi+0*16])
|
||||
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
|
||||
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
|
||||
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
|
||||
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
|
||||
|
||||
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
AS2( test ebx, 1)
|
||||
AS2( test WORD_REG(bx), 1)
|
||||
ASJ( jnz, 8, f)
|
||||
#endif
|
||||
AS2( movd xmm6, eax)
|
||||
|
|
@ -214,21 +214,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
|||
AS2( pxor xmm0, xmm4)
|
||||
|
||||
// sigma
|
||||
AS2( lea ecx, [ebx + (32-4)*32])
|
||||
AS2( and ecx, 31*32)
|
||||
AS2( lea ebp, [ebx + 16*32])
|
||||
AS2( and ebp, 31*32)
|
||||
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
|
||||
AS2( and WORD_REG(cx), 31*32)
|
||||
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
|
||||
AS2( and WORD_REG(bp), 31*32)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+0*16])
|
||||
AS2( movdqa xmm5, [esi+20*4+ebp+0*16])
|
||||
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
|
||||
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
AS2( pxor xmm3, xmm4)
|
||||
AS2( pxor xmm2, xmm6)
|
||||
|
||||
AS2( movdqa xmm4, [esi+20*4+ecx+1*16])
|
||||
AS2( movdqa xmm5, [esi+20*4+ebp+1*16])
|
||||
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
|
||||
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
|
||||
AS2( movdqa xmm6, xmm4)
|
||||
AS2( punpcklqdq xmm4, xmm5)
|
||||
AS2( punpckhqdq xmm6, xmm5)
|
||||
|
|
@ -236,23 +236,22 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
|
|||
AS2( pxor xmm0, xmm6)
|
||||
|
||||
// loop
|
||||
AS2( add ebx, 32)
|
||||
AS2( cmp ebx, [esp])
|
||||
AS2( add WORD_REG(bx), 32)
|
||||
AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
|
||||
ASJ( jne, 4, b)
|
||||
|
||||
// save state
|
||||
AS2( mov ebp, [esp+4])
|
||||
AS2( add esp, 8)
|
||||
AS2( mov [esi+4*17], ebx)
|
||||
AS2( mov [esi+4*16], eax)
|
||||
AS2( movdqa [esi+3*16], xmm3)
|
||||
AS2( movdqa [esi+2*16], xmm2)
|
||||
AS2( movdqa [esi+1*16], xmm1)
|
||||
AS2( movdqa [esi+0*16], xmm0)
|
||||
AS2( add WORD_REG(sp), WORD_SZ)
|
||||
AS_POP( bp)
|
||||
AS2( mov [WORD_REG(si)+4*16], eax)
|
||||
AS2( movdqa [WORD_REG(si)+3*16], xmm3)
|
||||
AS2( movdqa [WORD_REG(si)+2*16], xmm2)
|
||||
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
|
||||
AS2( movdqa [WORD_REG(si)+0*16], xmm0)
|
||||
ASL(5)
|
||||
|
||||
#ifdef __GNUC__
|
||||
AS1( pop ebx)
|
||||
AS_POP( bx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "c" (count), "S" (state), "D" (z), "d" (y)
|
||||
|
|
|
|||
252
rijndael.cpp
252
rijndael.cpp
|
|
@ -149,81 +149,133 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
|
|||
|
||||
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
{
|
||||
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
|
||||
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
|
||||
if (HasMMX())
|
||||
{
|
||||
const word32 *k = m_key;
|
||||
const word32 *kLoopEnd = k + m_rounds*4;
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
#define K_REG r8
|
||||
#define K_END_REG r9
|
||||
#define SAVE_K
|
||||
#define RESTORE_K
|
||||
#define RESTORE_K_END
|
||||
#define SAVE_0(x) AS2(mov r10d, x)
|
||||
#define SAVE_1(x) AS2(mov r11d, x)
|
||||
#define SAVE_2(x) AS2(mov r12d, x)
|
||||
#define RESTORE_0(x) AS2(mov x, r10d)
|
||||
#define RESTORE_1(x) AS2(mov x, r11d)
|
||||
#define RESTORE_2(x) AS2(mov x, r12d)
|
||||
#else
|
||||
#define K_REG esi
|
||||
#define K_END_REG edi
|
||||
#define SAVE_K AS2(movd mm4, esi)
|
||||
#define RESTORE_K AS2(movd esi, mm4)
|
||||
#define RESTORE_K_END AS2(movd edi, mm5)
|
||||
#define SAVE_0(x) AS2(movd mm0, x)
|
||||
#define SAVE_1(x) AS2(movd mm1, x)
|
||||
#define SAVE_2(x) AS2(movd mm2, x)
|
||||
#define RESTORE_0(x) AS2(movd x, mm0)
|
||||
#define RESTORE_1(x) AS2(movd x, mm1)
|
||||
#define RESTORE_2(x) AS2(movd x, mm2)
|
||||
#endif
|
||||
#ifdef __GNUC__
|
||||
word32 t0, t1, t2, t3;
|
||||
__asm__ __volatile__
|
||||
(
|
||||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
AS1( push ebp)
|
||||
AS2( mov ebp, eax)
|
||||
AS_PUSH( bx)
|
||||
AS_PUSH( bp)
|
||||
AS2( mov WORD_REG(bp), WORD_REG(ax))
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
|
||||
AS1( pushq K_REG)
|
||||
AS1( pushq K_END_REG)
|
||||
AS1( pushq r10)
|
||||
AS1( pushq r11)
|
||||
AS1( pushq r12)
|
||||
AS2( mov K_REG, rsi)
|
||||
AS2( mov K_END_REG, rcx)
|
||||
#else
|
||||
AS2( movd mm5, ecx)
|
||||
#endif
|
||||
#else
|
||||
#if _MSC_VER < 1300
|
||||
const word32 *t = Te;
|
||||
AS2( mov eax, t)
|
||||
#endif
|
||||
AS2( mov edx, g_cacheLineSize)
|
||||
AS2( mov edi, inBlock)
|
||||
AS2( mov esi, k)
|
||||
AS2( mov WORD_REG(di), inBlock)
|
||||
AS2( mov K_REG, k)
|
||||
AS2( movd mm5, kLoopEnd)
|
||||
AS1( push ebp)
|
||||
#if _MSC_VER < 1300
|
||||
AS_PUSH( bx)
|
||||
AS_PUSH( bp)
|
||||
AS2( mov ebp, eax)
|
||||
#else
|
||||
AS_PUSH( bp)
|
||||
AS2( lea ebp, Te)
|
||||
#endif
|
||||
#endif
|
||||
AS2( mov eax, [esi+0*4]) // s0
|
||||
AS2( xor eax, [edi+0*4])
|
||||
AS2( movd mm0, eax)
|
||||
AS2( mov ebx, [esi+1*4])
|
||||
AS2( xor ebx, [edi+1*4])
|
||||
AS2( movd mm1, ebx)
|
||||
AS2( mov eax, [K_REG+0*4]) // s0
|
||||
AS2( xor eax, [WORD_REG(di)+0*4])
|
||||
SAVE_0(eax)
|
||||
AS2( mov ebx, [K_REG+1*4])
|
||||
AS2( xor ebx, [WORD_REG(di)+1*4])
|
||||
SAVE_1(ebx)
|
||||
AS2( and ebx, eax)
|
||||
AS2( mov eax, [esi+2*4])
|
||||
AS2( xor eax, [edi+2*4])
|
||||
AS2( movd mm2, eax)
|
||||
AS2( mov eax, [K_REG+2*4])
|
||||
AS2( xor eax, [WORD_REG(di)+2*4])
|
||||
SAVE_2(eax)
|
||||
AS2( and ebx, eax)
|
||||
AS2( mov ecx, [esi+3*4])
|
||||
AS2( xor ecx, [edi+3*4])
|
||||
AS2( mov ecx, [K_REG+3*4])
|
||||
AS2( xor ecx, [WORD_REG(di)+3*4])
|
||||
AS2( and ebx, ecx)
|
||||
|
||||
// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
|
||||
AS2( and ebx, 0)
|
||||
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
|
||||
ASL(2)
|
||||
AS2( and ebx, [ebp+edi])
|
||||
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
|
||||
AS2( add edi, edx)
|
||||
AS2( and ebx, [ebp+edi])
|
||||
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
|
||||
AS2( add edi, edx)
|
||||
AS2( and ebx, [ebp+edi])
|
||||
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
|
||||
AS2( add edi, edx)
|
||||
AS2( and ebx, [ebp+edi])
|
||||
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
|
||||
AS2( add edi, edx)
|
||||
AS2( cmp edi, 1024)
|
||||
ASJ( jl, 2, b)
|
||||
AS2( and ebx, [ebp+1020])
|
||||
AS2( and ebx, [WORD_REG(bp)+1020])
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
AS2( xor r10d, ebx)
|
||||
AS2( xor r11d, ebx)
|
||||
AS2( xor r12d, ebx)
|
||||
#else
|
||||
AS2( movd mm6, ebx)
|
||||
AS2( pxor mm2, mm6)
|
||||
AS2( pxor mm1, mm6)
|
||||
AS2( pxor mm0, mm6)
|
||||
#endif
|
||||
AS2( xor ecx, ebx)
|
||||
|
||||
AS2( mov edi, [esi+4*4]) // t0
|
||||
AS2( mov eax, [esi+5*4])
|
||||
AS2( mov ebx, [esi+6*4])
|
||||
AS2( mov edx, [esi+7*4])
|
||||
AS2( add esi, 8*4)
|
||||
AS2( movd mm4, esi)
|
||||
AS2( mov edi, [K_REG+4*4]) // t0
|
||||
AS2( mov eax, [K_REG+5*4])
|
||||
AS2( mov ebx, [K_REG+6*4])
|
||||
AS2( mov edx, [K_REG+7*4])
|
||||
AS2( add K_REG, 8*4)
|
||||
SAVE_K
|
||||
|
||||
#define QUARTER_ROUND(t, a, b, c, d) \
|
||||
AS2(movzx esi, t##l)\
|
||||
AS2(d, [ebp+0*1024+4*esi])\
|
||||
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
|
||||
AS2(movzx esi, t##h)\
|
||||
AS2(c, [ebp+1*1024+4*esi])\
|
||||
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
|
||||
AS2(shr e##t##x, 16)\
|
||||
AS2(movzx esi, t##l)\
|
||||
AS2(b, [ebp+2*1024+4*esi])\
|
||||
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
|
||||
AS2(movzx esi, t##h)\
|
||||
AS2(a, [ebp+3*1024+4*esi])
|
||||
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
|
||||
|
||||
#define s0 xor edi
|
||||
#define s1 xor eax
|
||||
|
|
@ -235,69 +287,69 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
|||
#define t3 xor edx
|
||||
|
||||
QUARTER_ROUND(c, t0, t1, t2, t3)
|
||||
AS2( movd ecx, mm2)
|
||||
RESTORE_2(ecx)
|
||||
QUARTER_ROUND(c, t3, t0, t1, t2)
|
||||
AS2( movd ecx, mm1)
|
||||
RESTORE_1(ecx)
|
||||
QUARTER_ROUND(c, t2, t3, t0, t1)
|
||||
AS2( movd ecx, mm0)
|
||||
RESTORE_0(ecx)
|
||||
QUARTER_ROUND(c, t1, t2, t3, t0)
|
||||
AS2( movd mm2, ebx)
|
||||
AS2( movd mm1, eax)
|
||||
AS2( movd mm0, edi)
|
||||
SAVE_2(ebx)
|
||||
SAVE_1(eax)
|
||||
SAVE_0(edi)
|
||||
#undef QUARTER_ROUND
|
||||
|
||||
AS2( movd esi, mm4)
|
||||
RESTORE_K
|
||||
|
||||
ASL(0)
|
||||
AS2( mov edi, [esi+0*4])
|
||||
AS2( mov eax, [esi+1*4])
|
||||
AS2( mov ebx, [esi+2*4])
|
||||
AS2( mov ecx, [esi+3*4])
|
||||
AS2( mov edi, [K_REG+0*4])
|
||||
AS2( mov eax, [K_REG+1*4])
|
||||
AS2( mov ebx, [K_REG+2*4])
|
||||
AS2( mov ecx, [K_REG+3*4])
|
||||
|
||||
#define QUARTER_ROUND(t, a, b, c, d) \
|
||||
AS2(movzx esi, t##l)\
|
||||
AS2(a, [ebp+3*1024+4*esi])\
|
||||
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
|
||||
AS2(movzx esi, t##h)\
|
||||
AS2(b, [ebp+2*1024+4*esi])\
|
||||
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
|
||||
AS2(shr e##t##x, 16)\
|
||||
AS2(movzx esi, t##l)\
|
||||
AS2(c, [ebp+1*1024+4*esi])\
|
||||
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
|
||||
AS2(movzx esi, t##h)\
|
||||
AS2(d, [ebp+0*1024+4*esi])
|
||||
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
|
||||
|
||||
QUARTER_ROUND(d, s0, s1, s2, s3)
|
||||
AS2( movd edx, mm2)
|
||||
RESTORE_2(edx)
|
||||
QUARTER_ROUND(d, s3, s0, s1, s2)
|
||||
AS2( movd edx, mm1)
|
||||
RESTORE_1(edx)
|
||||
QUARTER_ROUND(d, s2, s3, s0, s1)
|
||||
AS2( movd edx, mm0)
|
||||
RESTORE_0(edx)
|
||||
QUARTER_ROUND(d, s1, s2, s3, s0)
|
||||
AS2( movd esi, mm4)
|
||||
AS2( movd mm2, ebx)
|
||||
AS2( movd mm1, eax)
|
||||
AS2( movd mm0, edi)
|
||||
RESTORE_K
|
||||
SAVE_2(ebx)
|
||||
SAVE_1(eax)
|
||||
SAVE_0(edi)
|
||||
|
||||
AS2( mov edi, [esi+4*4])
|
||||
AS2( mov eax, [esi+5*4])
|
||||
AS2( mov ebx, [esi+6*4])
|
||||
AS2( mov edx, [esi+7*4])
|
||||
AS2( mov edi, [K_REG+4*4])
|
||||
AS2( mov eax, [K_REG+5*4])
|
||||
AS2( mov ebx, [K_REG+6*4])
|
||||
AS2( mov edx, [K_REG+7*4])
|
||||
|
||||
QUARTER_ROUND(c, t0, t1, t2, t3)
|
||||
AS2( movd ecx, mm2)
|
||||
RESTORE_2(ecx)
|
||||
QUARTER_ROUND(c, t3, t0, t1, t2)
|
||||
AS2( movd ecx, mm1)
|
||||
RESTORE_1(ecx)
|
||||
QUARTER_ROUND(c, t2, t3, t0, t1)
|
||||
AS2( movd ecx, mm0)
|
||||
RESTORE_0(ecx)
|
||||
QUARTER_ROUND(c, t1, t2, t3, t0)
|
||||
AS2( movd mm2, ebx)
|
||||
AS2( movd mm1, eax)
|
||||
AS2( movd mm0, edi)
|
||||
SAVE_2(ebx)
|
||||
SAVE_1(eax)
|
||||
SAVE_0(edi)
|
||||
|
||||
AS2( movd esi, mm4)
|
||||
AS2( movd edi, mm5)
|
||||
AS2( add esi, 8*4)
|
||||
AS2( movd mm4, esi)
|
||||
AS2( cmp edi, esi)
|
||||
RESTORE_K
|
||||
RESTORE_K_END
|
||||
AS2( add K_REG, 8*4)
|
||||
SAVE_K
|
||||
AS2( cmp K_END_REG, K_REG)
|
||||
ASJ( jne, 0, b)
|
||||
|
||||
#undef QUARTER_ROUND
|
||||
|
|
@ -310,44 +362,54 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
|||
#undef t2
|
||||
#undef t3
|
||||
|
||||
AS2( mov eax, [edi+0*4])
|
||||
AS2( mov ecx, [edi+1*4])
|
||||
AS2( mov esi, [edi+2*4])
|
||||
AS2( mov edi, [edi+3*4])
|
||||
AS2( mov eax, [K_END_REG+0*4])
|
||||
AS2( mov ecx, [K_END_REG+1*4])
|
||||
AS2( mov esi, [K_END_REG+2*4])
|
||||
AS2( mov edi, [K_END_REG+3*4])
|
||||
|
||||
#define QUARTER_ROUND(a, b, c, d) \
|
||||
AS2( movzx ebx, dl)\
|
||||
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
|
||||
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
|
||||
AS2( shl ebx, 3*8)\
|
||||
AS2( xor a, ebx)\
|
||||
AS2( movzx ebx, dh)\
|
||||
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
|
||||
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
|
||||
AS2( shl ebx, 2*8)\
|
||||
AS2( xor b, ebx)\
|
||||
AS2( shr edx, 16)\
|
||||
AS2( movzx ebx, dl)\
|
||||
AS2( shr edx, 8)\
|
||||
AS2( movzx ebx, BYTE PTR [ebp+1+4*ebx])\
|
||||
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
|
||||
AS2( shl ebx, 1*8)\
|
||||
AS2( xor c, ebx)\
|
||||
AS2( movzx ebx, BYTE PTR [ebp+1+4*edx])\
|
||||
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
|
||||
AS2( xor d, ebx)
|
||||
|
||||
QUARTER_ROUND(eax, ecx, esi, edi)
|
||||
AS2( movd edx, mm2)
|
||||
RESTORE_2(edx)
|
||||
QUARTER_ROUND(edi, eax, ecx, esi)
|
||||
AS2( movd edx, mm1)
|
||||
RESTORE_1(edx)
|
||||
QUARTER_ROUND(esi, edi, eax, ecx)
|
||||
AS2( movd edx, mm0)
|
||||
RESTORE_0(edx)
|
||||
QUARTER_ROUND(ecx, esi, edi, eax)
|
||||
|
||||
#undef QUARTER_ROUND
|
||||
|
||||
AS1( pop ebp)
|
||||
AS1( emms)
|
||||
#if CRYPTOPP_BOOL_X64
|
||||
AS1(popq r12)
|
||||
AS1(popq r11)
|
||||
AS1(popq r10)
|
||||
AS1(popq K_END_REG)
|
||||
AS1(popq K_REG)
|
||||
#else
|
||||
AS1(emms)
|
||||
#endif
|
||||
AS_POP( bp)
|
||||
|
||||
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
|
||||
AS_POP( bx)
|
||||
#endif
|
||||
#ifdef __GNUC__
|
||||
AS1( pop ebx)
|
||||
".att_syntax prefix;"
|
||||
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
|
||||
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
|
||||
|
|
@ -366,19 +428,19 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
|||
((word32 *)outBlock)[2] = t2;
|
||||
((word32 *)outBlock)[3] = t3;
|
||||
#else
|
||||
AS2( mov ebx, xorBlock)
|
||||
AS2( test ebx, ebx)
|
||||
AS2( mov WORD_REG(bx), xorBlock)
|
||||
AS2( test WORD_REG(bx), WORD_REG(bx))
|
||||
ASJ( jz, 1, f)
|
||||
AS2( xor eax, [ebx+0*4])
|
||||
AS2( xor ecx, [ebx+1*4])
|
||||
AS2( xor esi, [ebx+2*4])
|
||||
AS2( xor edi, [ebx+3*4])
|
||||
AS2( xor eax, [WORD_REG(bx)+0*4])
|
||||
AS2( xor ecx, [WORD_REG(bx)+1*4])
|
||||
AS2( xor esi, [WORD_REG(bx)+2*4])
|
||||
AS2( xor edi, [WORD_REG(bx)+3*4])
|
||||
ASL(1)
|
||||
AS2( mov ebx, outBlock)
|
||||
AS2( mov [ebx+0*4], eax)
|
||||
AS2( mov [ebx+1*4], ecx)
|
||||
AS2( mov [ebx+2*4], esi)
|
||||
AS2( mov [ebx+3*4], edi)
|
||||
AS2( mov WORD_REG(bx), outBlock)
|
||||
AS2( mov [WORD_REG(bx)+0*4], eax)
|
||||
AS2( mov [WORD_REG(bx)+1*4], ecx)
|
||||
AS2( mov [WORD_REG(bx)+2*4], esi)
|
||||
AS2( mov [WORD_REG(bx)+3*4], edi)
|
||||
#endif
|
||||
}
|
||||
else
|
||||
|
|
|
|||
27
secblock.h
27
secblock.h
|
|
@ -130,10 +130,13 @@ public:
|
|||
#endif
|
||||
|
||||
assert(IsAlignedOn(p, 16));
|
||||
return (T*)p;
|
||||
return (pointer)p;
|
||||
}
|
||||
|
||||
return new T[n];
|
||||
pointer p;
|
||||
while (!(p = (pointer)malloc(sizeof(T)*n)))
|
||||
CallNewHandler();
|
||||
return p;
|
||||
}
|
||||
|
||||
void deallocate(void *p, size_type n)
|
||||
|
|
@ -153,7 +156,7 @@ public:
|
|||
return;
|
||||
}
|
||||
|
||||
delete [] (T *)p;
|
||||
free(p);
|
||||
}
|
||||
|
||||
pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
|
||||
|
|
@ -164,13 +167,19 @@ public:
|
|||
// VS.NET STL enforces the policy of "All STL-compliant allocators have to provide a
|
||||
// template class member called rebind".
|
||||
template <class U> struct rebind { typedef AllocatorWithCleanup<U, T_Align16> other; };
|
||||
#if _MSC_VER >= 1500
|
||||
AllocatorWithCleanup() {}
|
||||
template <class U, bool A> AllocatorWithCleanup(const AllocatorWithCleanup<U, A> &) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<byte>;
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word16>;
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word32>;
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word64>;
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86>; // for Integer
|
||||
#if CRYPTOPP_BOOL_X86
|
||||
CRYPTOPP_DLL_TEMPLATE_CLASS AllocatorWithCleanup<word, true>; // for Integer
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
class NullAllocator : public AllocatorBase<T>
|
||||
|
|
@ -260,7 +269,7 @@ public:
|
|||
size_type max_size() const {return STDMAX(m_fallbackAllocator.max_size(), S);}
|
||||
|
||||
private:
|
||||
T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(unsigned int)m_array)%16) : m_array;}
|
||||
T* GetAlignedArray() {return T_Align16 ? (T*)(((byte *)m_array) + (0-(size_t)m_array)%16) : m_array;}
|
||||
|
||||
CRYPTOPP_ALIGN_DATA(8) T m_array[T_Align16 ? S+8/sizeof(T) : S];
|
||||
A m_fallbackAllocator;
|
||||
|
|
@ -466,10 +475,10 @@ public:
|
|||
explicit SecBlockWithHint(size_t size) : SecBlock<T, A>(size) {}
|
||||
};
|
||||
|
||||
template<class T, class U>
|
||||
inline bool operator==(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (true);}
|
||||
template<class T, class U>
|
||||
inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T>&, const CryptoPP::AllocatorWithCleanup<U>&) {return (false);}
|
||||
template<class T, bool A, class U, bool B>
|
||||
inline bool operator==(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (true);}
|
||||
template<class T, bool A, class U, bool B>
|
||||
inline bool operator!=(const CryptoPP::AllocatorWithCleanup<T, A>&, const CryptoPP::AllocatorWithCleanup<U, B>&) {return (false);}
|
||||
|
||||
NAMESPACE_END
|
||||
|
||||
|
|
|
|||
21
sha.cpp
21
sha.cpp
|
|
@ -308,9 +308,9 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN1
|
|||
W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
|
||||
};
|
||||
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
|
||||
// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
|
||||
static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
|
||||
CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
__asm__ __volatile__
|
||||
|
|
@ -319,6 +319,9 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
|
|||
AS1( push ebx)
|
||||
AS2( mov ebx, eax)
|
||||
#else
|
||||
AS1( push ebx)
|
||||
AS1( push esi)
|
||||
AS1( push edi)
|
||||
AS2( lea ebx, SHA512_K)
|
||||
#endif
|
||||
|
||||
|
|
@ -486,22 +489,30 @@ static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64
|
|||
AS1( pop esp)
|
||||
AS1( emms)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#if defined(__GNUC__)
|
||||
AS1( pop ebx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "a" (SHA512_K), "c" (state), "d" (data)
|
||||
: "%esi", "%edi", "memory", "cc"
|
||||
);
|
||||
#else
|
||||
AS1( pop edi)
|
||||
AS1( pop esi)
|
||||
AS1( pop ebx)
|
||||
AS1( ret)
|
||||
#endif
|
||||
}
|
||||
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
|
||||
void SHA512::Transform(word64 *state, const word64 *data)
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
|
||||
if (HasSSE2())
|
||||
return SHA512_SSE2_Transform(state, data);
|
||||
{
|
||||
SHA512_SSE2_Transform(state, data);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
|
||||
|
|
|
|||
14
smartptr.h
14
smartptr.h
|
|
@ -189,21 +189,21 @@ template <class T> counted_ptr<T> & counted_ptr<T>::operator=(const counted_ptr<
|
|||
template <class T> class vector_member_ptrs
|
||||
{
|
||||
public:
|
||||
vector_member_ptrs(unsigned int size=0)
|
||||
vector_member_ptrs(size_t size=0)
|
||||
: m_size(size), m_ptr(new member_ptr<T>[size]) {}
|
||||
~vector_member_ptrs()
|
||||
{delete [] this->m_ptr;}
|
||||
|
||||
member_ptr<T>& operator[](unsigned int index)
|
||||
member_ptr<T>& operator[](size_t index)
|
||||
{assert(index<this->m_size); return this->m_ptr[index];}
|
||||
const member_ptr<T>& operator[](unsigned int index) const
|
||||
const member_ptr<T>& operator[](size_t index) const
|
||||
{assert(index<this->m_size); return this->m_ptr[index];}
|
||||
|
||||
unsigned int size() const {return this->m_size;}
|
||||
void resize(unsigned int newSize)
|
||||
size_t size() const {return this->m_size;}
|
||||
void resize(size_t newSize)
|
||||
{
|
||||
member_ptr<T> *newPtr = new member_ptr<T>[newSize];
|
||||
for (unsigned int i=0; i<this->m_size && i<newSize; i++)
|
||||
for (size_t i=0; i<this->m_size && i<newSize; i++)
|
||||
newPtr[i].reset(this->m_ptr[i].release());
|
||||
delete [] this->m_ptr;
|
||||
this->m_size = newSize;
|
||||
|
|
@ -214,7 +214,7 @@ private:
|
|||
vector_member_ptrs(const vector_member_ptrs<T> &c); // copy not allowed
|
||||
void operator=(const vector_member_ptrs<T> &x); // assignment not allowed
|
||||
|
||||
unsigned int m_size;
|
||||
size_t m_size;
|
||||
member_ptr<T> *m_ptr;
|
||||
};
|
||||
|
||||
|
|
|
|||
267
sosemanuk.cpp
267
sosemanuk.cpp
|
|
@ -68,6 +68,10 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
|
|||
m_state[1] = b;
|
||||
m_state[2] = e;
|
||||
m_state[3] = d;
|
||||
|
||||
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
|
||||
m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
|
||||
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
|
||||
}
|
||||
|
||||
static word32 s_mulTables[512] = {
|
||||
|
|
@ -282,10 +286,8 @@ unsigned int SosemanukPolicy::GetAlignment() const
|
|||
else
|
||||
#endif
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||
unsigned int SosemanukPolicy::GetOptimalBlockSize() const
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
|
|
@ -316,54 +318,54 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
__asm__ __volatile__
|
||||
(
|
||||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
AS_PUSH( bx)
|
||||
#else
|
||||
word32 *state = m_state;
|
||||
AS2( mov eax, state)
|
||||
AS2( mov edi, output)
|
||||
AS2( mov edx, input)
|
||||
AS2( mov ecx, iterationCount)
|
||||
AS2( mov WORD_REG(ax), state)
|
||||
AS2( mov WORD_REG(di), output)
|
||||
AS2( mov WORD_REG(dx), input)
|
||||
AS2( mov WORD_REG(cx), iterationCount)
|
||||
#endif
|
||||
|
||||
#define SSE2_output DWORD PTR [esp+1*4]
|
||||
#define SSE2_input DWORD PTR [esp+2*4]
|
||||
#define SSE2_wordsLeft DWORD PTR [esp+3*4]
|
||||
#define SSE2_ediEnd DWORD PTR [esp+4*4]
|
||||
#define SSE2_pMulTables DWORD PTR [esp+5*4]
|
||||
#define SSE2_state DWORD PTR [esp+6*4]
|
||||
#define SSE2_wordsLeft2 DWORD PTR [esp+7*4]
|
||||
#define SSE2_stateCopy esp + 8*4
|
||||
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
|
||||
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
|
||||
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
|
||||
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
|
||||
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
|
||||
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
|
||||
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
|
||||
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
|
||||
#define SSE2_uvStart SSE2_stateCopy + 12*4
|
||||
|
||||
AS1( push ebp)
|
||||
AS2( mov ebx, esp)
|
||||
AS2( and esp, 0xfffffff0)
|
||||
AS2( sub esp, 80*4*2+12*4+8*4) // 80 v's, 80 u's, 12 state, 8 locals
|
||||
AS2( mov [esp], ebx)
|
||||
AS2( mov SSE2_output, edi)
|
||||
AS2( mov SSE2_input, edx)
|
||||
AS2( mov SSE2_state, eax)
|
||||
AS_PUSH( bp)
|
||||
AS2( mov WORD_REG(bx), WORD_REG(sp))
|
||||
AS2( and WORD_REG(sp), -16)
|
||||
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
|
||||
AS2( mov [WORD_REG(sp)], WORD_REG(bx))
|
||||
AS2( mov SSE2_output, WORD_REG(di))
|
||||
AS2( mov SSE2_input, WORD_REG(dx))
|
||||
AS2( mov SSE2_state, WORD_REG(ax))
|
||||
#ifndef _MSC_VER
|
||||
AS2( mov SSE2_pMulTables, esi)
|
||||
AS2( mov SSE2_pMulTables, WORD_REG(si))
|
||||
#endif
|
||||
AS2( lea ecx, [4*ecx+ecx])
|
||||
AS2( lea esi, [4*ecx])
|
||||
AS2( mov SSE2_wordsLeft, esi)
|
||||
AS2( movdqa xmm0, [eax+0*16]) // copy state to stack to save a register
|
||||
AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
|
||||
AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
|
||||
AS2( mov SSE2_wordsLeft, WORD_REG(si))
|
||||
AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
|
||||
AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
|
||||
AS2( movdqa xmm0, [eax+1*16])
|
||||
AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
|
||||
AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
|
||||
AS2( movq xmm0, QWORD PTR [eax+2*16])
|
||||
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
|
||||
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
|
||||
AS2( psrlq xmm0, 32)
|
||||
AS2( movd ebx, xmm0) // s(9)
|
||||
AS2( mov ecx, [eax+10*4])
|
||||
AS2( mov edx, [eax+11*4])
|
||||
AS2( mov ecx, [WORD_REG(ax)+10*4])
|
||||
AS2( mov edx, [WORD_REG(ax)+11*4])
|
||||
AS2( pcmpeqb xmm7, xmm7) // all ones
|
||||
|
||||
#define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4
|
||||
#define u(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4
|
||||
#define v(j) edi + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
|
||||
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
|
||||
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
|
||||
|
||||
#define r10 ecx
|
||||
#define r11 edx
|
||||
|
|
@ -371,42 +373,42 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
#define r21 ecx
|
||||
|
||||
#define SSE2_STEP(i, j) \
|
||||
AS2( mov eax, [s(i+3)])\
|
||||
AS2( mov ebp, 1)\
|
||||
AS2( and ebp, r1##j)\
|
||||
AS1( neg ebp)\
|
||||
AS2( and ebp, [s(i+8)])\
|
||||
AS2( xor ebp, [s(i+1)])\
|
||||
AS2( add r2##j, ebp)\
|
||||
AS2( movzx ebp, al)\
|
||||
AS2( shr eax, 8)\
|
||||
AS2( xor eax, [esi+1024+ebp*4])\
|
||||
AS2( lea ebp, [ebx + r2##j])\
|
||||
AS2( xor ebx, eax)\
|
||||
AS2( imul r1##j, 0x54655307)\
|
||||
AS2( mov eax, [s(i+0)])\
|
||||
AS2( mov [v(i)], eax)\
|
||||
AS2( rol eax, 8)\
|
||||
AS2( xor ebx, eax)\
|
||||
AS2( movzx eax, al)\
|
||||
AS2( rol r1##j, 7)\
|
||||
AS2( xor ebx, [esi+eax*4])\
|
||||
AS2( lea ebp, [ebx + r2##j])\
|
||||
AS2( xor ebp, r1##j)\
|
||||
AS2( mov [u(i)], ebp)\
|
||||
AS2( mov ebp, 1)\
|
||||
AS2( and ebp, r2##j)\
|
||||
AS1( neg ebp)\
|
||||
AS2( and ebp, ebx)\
|
||||
AS2( xor ebx, eax)\
|
||||
AS2( movzx eax, al)\
|
||||
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
|
||||
AS2( mov eax, [s(i+3)])\
|
||||
AS2( xor ebp, [s(i+2)])\
|
||||
AS2( add r1##j, ebp)\
|
||||
AS2( movzx ebp, al)\
|
||||
AS2( shr eax, 8)\
|
||||
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
|
||||
AS2( xor ebx, eax)\
|
||||
AS2( imul r2##j, 0x54655307)\
|
||||
AS2( rol r2##j, 7)\
|
||||
AS2( mov [s(i+0)], ebx)\
|
||||
|
||||
ASL(2) // outer loop, each iteration of this processes 80 words
|
||||
AS2( lea edi, [SSE2_uvStart]) // start of v and u
|
||||
AS2( mov eax, 80)
|
||||
AS2( cmp esi, 80)
|
||||
AS2( cmovg esi, eax)
|
||||
AS2( mov SSE2_wordsLeft2, esi)
|
||||
AS2( lea esi, [edi+esi]) // use to first inner loop
|
||||
AS2( mov SSE2_ediEnd, esi)
|
||||
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
|
||||
AS2( mov WORD_REG(ax), 80)
|
||||
AS2( cmp WORD_REG(si), 80)
|
||||
AS2( cmovg WORD_REG(si), WORD_REG(ax))
|
||||
AS2( mov SSE2_wordsLeft2, WORD_REG(si))
|
||||
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
|
||||
AS2( mov SSE2_diEnd, WORD_REG(si))
|
||||
#ifdef _MSC_VER
|
||||
AS2( lea esi, s_mulTables)
|
||||
AS2( lea WORD_REG(si), s_mulTables)
|
||||
#else
|
||||
AS2( mov esi, SSE2_pMulTables)
|
||||
AS2( mov WORD_REG(si), SSE2_pMulTables)
|
||||
#endif
|
||||
|
||||
ASL(0) // first inner loop, 20 words each, 4 iterations
|
||||
|
|
@ -431,20 +433,20 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
SSE2_STEP(18, 0)
|
||||
SSE2_STEP(19, 1)
|
||||
// loop
|
||||
AS2( add edi, 5*4)
|
||||
AS2( cmp edi, SSE2_ediEnd)
|
||||
AS2( add WORD_REG(di), 5*4)
|
||||
AS2( cmp WORD_REG(di), SSE2_diEnd)
|
||||
ASJ( jne, 0, b)
|
||||
|
||||
AS2( mov eax, SSE2_input)
|
||||
AS2( mov ebp, SSE2_output)
|
||||
AS2( lea edi, [SSE2_uvStart]) // start of v and u
|
||||
AS2( mov esi, SSE2_wordsLeft2)
|
||||
AS2( mov WORD_REG(ax), SSE2_input)
|
||||
AS2( mov WORD_REG(bp), SSE2_output)
|
||||
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
|
||||
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
|
||||
|
||||
ASL(1) // second inner loop, 16 words each, 5 iterations
|
||||
AS2( movdqa xmm0, [edi+0*20*4])
|
||||
AS2( movdqa xmm1, [edi+1*20*4])
|
||||
AS2( movdqa xmm2, [edi+2*20*4])
|
||||
AS2( movdqa xmm3, [edi+3*20*4])
|
||||
AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
|
||||
AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
|
||||
AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
|
||||
AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
|
||||
// S2
|
||||
AS2( movdqa xmm4, xmm0)
|
||||
AS2( pand xmm0, xmm2)
|
||||
|
|
@ -463,13 +465,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
AS2( pxor xmm1, xmm4)
|
||||
AS2( pxor xmm4, xmm7)
|
||||
// xor with v
|
||||
AS2( pxor xmm2, [edi+80*4])
|
||||
AS2( pxor xmm3, [edi+80*5])
|
||||
AS2( pxor xmm1, [edi+80*6])
|
||||
AS2( pxor xmm4, [edi+80*7])
|
||||
AS2( pxor xmm2, [WORD_REG(di)+80*4])
|
||||
AS2( pxor xmm3, [WORD_REG(di)+80*5])
|
||||
AS2( pxor xmm1, [WORD_REG(di)+80*6])
|
||||
AS2( pxor xmm4, [WORD_REG(di)+80*7])
|
||||
// exit loop early if less than 16 words left to output
|
||||
// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
|
||||
AS2( cmp esi, 16)
|
||||
AS2( cmp WORD_REG(si), 16)
|
||||
ASJ( jl, 4, f)
|
||||
// unpack
|
||||
AS2( movdqa xmm6, xmm2)
|
||||
|
|
@ -485,75 +487,75 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
AS2( punpcklqdq xmm6, xmm5)
|
||||
AS2( punpckhqdq xmm3, xmm5)
|
||||
// output keystream
|
||||
AS2( test eax, eax)
|
||||
AS2( test WORD_REG(ax), WORD_REG(ax))
|
||||
ASJ( jz, 3, f)
|
||||
AS2( test eax, 0xf)
|
||||
ASJ( jnz, 7, f)
|
||||
AS2( pxor xmm2, [eax+0*16])
|
||||
AS2( pxor xmm0, [eax+1*16])
|
||||
AS2( pxor xmm6, [eax+2*16])
|
||||
AS2( pxor xmm3, [eax+3*16])
|
||||
AS2( add eax, 4*16)
|
||||
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
|
||||
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
|
||||
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
|
||||
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
|
||||
AS2( add WORD_REG(ax), 4*16)
|
||||
ASJ( jmp, 3, f)
|
||||
ASL(7)
|
||||
AS2( movdqu xmm1, [eax+0*16])
|
||||
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
|
||||
AS2( pxor xmm2, xmm1)
|
||||
AS2( movdqu xmm1, [eax+1*16])
|
||||
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
|
||||
AS2( pxor xmm0, xmm1)
|
||||
AS2( movdqu xmm1, [eax+2*16])
|
||||
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
|
||||
AS2( pxor xmm6, xmm1)
|
||||
AS2( movdqu xmm1, [eax+3*16])
|
||||
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
|
||||
AS2( pxor xmm3, xmm1)
|
||||
AS2( add eax, 4*16)
|
||||
AS2( add WORD_REG(ax), 4*16)
|
||||
ASL(3)
|
||||
AS2( test ebp, 0xf)
|
||||
ASJ( jnz, 8, f)
|
||||
AS2( movdqa [ebp+0*16], xmm2)
|
||||
AS2( movdqa [ebp+1*16], xmm0)
|
||||
AS2( movdqa [ebp+2*16], xmm6)
|
||||
AS2( movdqa [ebp+3*16], xmm3)
|
||||
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
|
||||
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
|
||||
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
|
||||
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
|
||||
ASJ( jmp, 9, f)
|
||||
ASL(8)
|
||||
AS2( movdqu [ebp+0*16], xmm2)
|
||||
AS2( movdqu [ebp+1*16], xmm0)
|
||||
AS2( movdqu [ebp+2*16], xmm6)
|
||||
AS2( movdqu [ebp+3*16], xmm3)
|
||||
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
|
||||
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
|
||||
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
|
||||
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
|
||||
ASL(9)
|
||||
// loop
|
||||
AS2( add edi, 4*4)
|
||||
AS2( add ebp, 4*16)
|
||||
AS2( sub esi, 16)
|
||||
AS2( add WORD_REG(di), 4*4)
|
||||
AS2( add WORD_REG(bp), 4*16)
|
||||
AS2( sub WORD_REG(si), 16)
|
||||
ASJ( jnz, 1, b)
|
||||
|
||||
// outer loop
|
||||
AS2( mov esi, SSE2_wordsLeft)
|
||||
AS2( sub esi, 80)
|
||||
AS2( mov WORD_REG(si), SSE2_wordsLeft)
|
||||
AS2( sub WORD_REG(si), 80)
|
||||
ASJ( jz, 6, f)
|
||||
AS2( mov SSE2_wordsLeft, esi)
|
||||
AS2( mov SSE2_input, eax)
|
||||
AS2( mov SSE2_output, ebp)
|
||||
AS2( mov SSE2_wordsLeft, WORD_REG(si))
|
||||
AS2( mov SSE2_input, WORD_REG(ax))
|
||||
AS2( mov SSE2_output, WORD_REG(bp))
|
||||
ASJ( jmp, 2, b)
|
||||
|
||||
ASL(4) // final output of less than 16 words
|
||||
AS2( test eax, eax)
|
||||
AS2( test WORD_REG(ax), WORD_REG(ax))
|
||||
ASJ( jz, 5, f)
|
||||
AS2( movd xmm0, [eax+0*4])
|
||||
AS2( movd xmm0, [WORD_REG(ax)+0*4])
|
||||
AS2( pxor xmm2, xmm0)
|
||||
AS2( movd xmm0, [eax+1*4])
|
||||
AS2( movd xmm0, [WORD_REG(ax)+1*4])
|
||||
AS2( pxor xmm3, xmm0)
|
||||
AS2( movd xmm0, [eax+2*4])
|
||||
AS2( movd xmm0, [WORD_REG(ax)+2*4])
|
||||
AS2( pxor xmm1, xmm0)
|
||||
AS2( movd xmm0, [eax+3*4])
|
||||
AS2( movd xmm0, [WORD_REG(ax)+3*4])
|
||||
AS2( pxor xmm4, xmm0)
|
||||
AS2( add eax, 16)
|
||||
AS2( add WORD_REG(ax), 16)
|
||||
ASL(5)
|
||||
AS2( movd [ebp+0*4], xmm2)
|
||||
AS2( movd [ebp+1*4], xmm3)
|
||||
AS2( movd [ebp+2*4], xmm1)
|
||||
AS2( movd [ebp+3*4], xmm4)
|
||||
AS2( sub esi, 4)
|
||||
AS2( movd [WORD_REG(bp)+0*4], xmm2)
|
||||
AS2( movd [WORD_REG(bp)+1*4], xmm3)
|
||||
AS2( movd [WORD_REG(bp)+2*4], xmm1)
|
||||
AS2( movd [WORD_REG(bp)+3*4], xmm4)
|
||||
AS2( sub WORD_REG(si), 4)
|
||||
ASJ( jz, 6, f)
|
||||
AS2( add ebp, 16)
|
||||
AS2( add WORD_REG(bp), 16)
|
||||
AS2( psrldq xmm2, 4)
|
||||
AS2( psrldq xmm3, 4)
|
||||
AS2( psrldq xmm1, 4)
|
||||
|
|
@ -561,26 +563,26 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
ASJ( jmp, 4, b)
|
||||
|
||||
ASL(6) // save state
|
||||
AS2( mov ebx, SSE2_state)
|
||||
AS2( mov WORD_REG(bx), SSE2_state)
|
||||
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
|
||||
AS2( movdqa [ebx+0*16], xmm0)
|
||||
AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
|
||||
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
|
||||
AS2( movdqa [ebx+1*16], xmm0)
|
||||
AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
|
||||
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
|
||||
AS2( movq QWORD PTR [ebx+2*16], xmm0)
|
||||
AS2( mov [ebx+10*4], ecx)
|
||||
AS2( mov [ebx+11*4], edx)
|
||||
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
|
||||
AS2( mov [WORD_REG(bx)+10*4], ecx)
|
||||
AS2( mov [WORD_REG(bx)+11*4], edx)
|
||||
|
||||
AS1( pop esp)
|
||||
AS1( pop ebp)
|
||||
AS_POP( sp)
|
||||
AS_POP( bp)
|
||||
|
||||
#ifdef __GNUC__
|
||||
AS1( pop ebx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
|
||||
: "memory", "cc"
|
||||
);
|
||||
AS_POP( bx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
|
||||
: "memory", "cc"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
|
|
@ -593,17 +595,16 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
|
|||
#endif
|
||||
|
||||
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
|
||||
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
|
||||
|
||||
#define r1(i) ((i%2) ? reg2 : reg1)
|
||||
#define r2(i) ((i%2) ? reg1 : reg2)
|
||||
|
||||
#define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \
|
||||
r2(x0) += XMUX(r1(x0), s##x1, s##x8);\
|
||||
r1(x0) = rotlFixed(r1(x0) * 0x54655307, 7);\
|
||||
v = s##x0;\
|
||||
u = (s##x9 + r2(x0)) ^ r1(x0);\
|
||||
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;
|
||||
v = s##x0;\
|
||||
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
|
||||
r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
|
||||
r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
|
||||
|
||||
#define SOSEMANUK_OUTPUT(x) \
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
|
|||
|
||||
void Tiger::Transform (word64 *digest, const word64 *X)
|
||||
{
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
|
||||
if (HasSSE2())
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
|
|
@ -43,9 +43,14 @@ void Tiger::Transform (word64 *digest, const word64 *X)
|
|||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
#else
|
||||
#if _MSC_VER < 1300
|
||||
const word64 *t = table;
|
||||
AS2( mov edx, t)
|
||||
#else
|
||||
AS2( lea edx, [table])
|
||||
#endif
|
||||
AS2( mov eax, digest)
|
||||
AS2( mov esi, X)
|
||||
AS2( lea edx, [table])
|
||||
#endif
|
||||
AS2( movq mm0, [eax])
|
||||
AS2( movq mm1, [eax+1*8])
|
||||
|
|
|
|||
101
whrlpool.cpp
101
whrlpool.cpp
|
|
@ -390,7 +390,7 @@ CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTIO
|
|||
// Whirlpool basic transformation. Transforms state based on block.
|
||||
void Whirlpool::Transform(word64 *digest, const word64 *block)
|
||||
{
|
||||
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
|
||||
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
|
||||
if (HasMMX())
|
||||
{
|
||||
// MMX version has the same structure as C version below
|
||||
|
|
@ -398,26 +398,29 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
__asm__ __volatile__
|
||||
(
|
||||
".intel_syntax noprefix;"
|
||||
AS1( push ebx)
|
||||
AS2( mov ebx, eax)
|
||||
AS_PUSH( bx)
|
||||
AS2( mov WORD_REG(bx), WORD_REG(ax))
|
||||
#else
|
||||
AS2( lea ebx, [Whirlpool_C])
|
||||
AS2( mov ecx, digest)
|
||||
AS2( mov edx, block)
|
||||
#if _MSC_VER < 1300
|
||||
AS_PUSH( bx)
|
||||
#endif
|
||||
AS2( lea WORD_REG(bx), [Whirlpool_C])
|
||||
AS2( mov WORD_REG(cx), digest)
|
||||
AS2( mov WORD_REG(dx), block)
|
||||
#endif
|
||||
AS2( mov eax, esp)
|
||||
AS2( and esp, 0xfffffff0)
|
||||
AS2( sub esp, 16*8)
|
||||
AS1( push eax)
|
||||
AS2( mov WORD_REG(ax), WORD_REG(sp))
|
||||
AS2( and WORD_REG(sp), -16)
|
||||
AS2( sub WORD_REG(sp), 16*8)
|
||||
AS_PUSH( ax)
|
||||
AS2( xor esi, esi)
|
||||
ASL(0)
|
||||
AS2( movq mm0, [ecx+8*esi])
|
||||
AS2( movq [esp+4+8*esi], mm0) // k
|
||||
AS2( pxor mm0, [edx+8*esi])
|
||||
AS2( movq [esp+4+64+8*esi], mm0) // s
|
||||
AS2( movq [ecx+8*esi], mm0)
|
||||
AS1( inc esi)
|
||||
AS2( cmp esi, 8)
|
||||
AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)])
|
||||
AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k
|
||||
AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)])
|
||||
AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s
|
||||
AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0)
|
||||
AS1( inc WORD_REG(si))
|
||||
AS2( cmp WORD_REG(si), 8)
|
||||
ASJ( jne, 0, b)
|
||||
|
||||
AS2( xor esi, esi)
|
||||
|
|
@ -427,16 +430,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
#define KSL1(a, b) AS2(pxor mm##a, b)
|
||||
|
||||
#define KSL(op, i, a, b, c, d) \
|
||||
AS2(mov eax, [esp+4+8*i])\
|
||||
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\
|
||||
AS2(movzx edi, al)\
|
||||
KSL##op(a, [ebx+3*2048+8*edi])\
|
||||
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
|
||||
AS2(movzx edi, ah)\
|
||||
KSL##op(b, [ebx+2*2048+8*edi])\
|
||||
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
|
||||
AS2(shr eax, 16)\
|
||||
AS2(movzx edi, al)\
|
||||
AS2(shr eax, 8)\
|
||||
KSL##op(c, [ebx+1*2048+8*edi])\
|
||||
KSL##op(d, [ebx+0*2048+8*eax])
|
||||
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
|
||||
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
|
||||
|
||||
#define KSH0(a, b) \
|
||||
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
|
||||
|
|
@ -445,57 +448,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
AS2(pxor mm##a, b)
|
||||
#define KSH2(a, b) \
|
||||
AS2(pxor mm##a, b)\
|
||||
AS2(movq [esp+4+8*a], mm##a)
|
||||
AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a)
|
||||
|
||||
#define KSH(op, i, a, b, c, d) \
|
||||
AS2(mov eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
|
||||
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\
|
||||
AS2(movzx edi, al)\
|
||||
KSH##op(a, [ebx+3*2048+8*edi])\
|
||||
KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
|
||||
AS2(movzx edi, ah)\
|
||||
KSH##op(b, [ebx+2*2048+8*edi])\
|
||||
KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
|
||||
AS2(shr eax, 16)\
|
||||
AS2(movzx edi, al)\
|
||||
AS2(shr eax, 8)\
|
||||
KSH##op(c, [ebx+1*2048+8*edi])\
|
||||
KSH##op(d, [ebx+0*2048+8*eax])
|
||||
KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
|
||||
KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
|
||||
|
||||
#define TSL(op, i, a, b, c, d) \
|
||||
AS2(mov eax, [esp+4+64+8*i])\
|
||||
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\
|
||||
AS2(movzx edi, al)\
|
||||
KSL##op(a, [ebx+3*2048+8*edi])\
|
||||
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
|
||||
AS2(movzx edi, ah)\
|
||||
KSL##op(b, [ebx+2*2048+8*edi])\
|
||||
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
|
||||
AS2(shr eax, 16)\
|
||||
AS2(movzx edi, al)\
|
||||
AS2(shr eax, 8)\
|
||||
KSL##op(c, [ebx+1*2048+8*edi])\
|
||||
KSL##op(d, [ebx+0*2048+8*eax])
|
||||
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
|
||||
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
|
||||
|
||||
#define TSH0(a, b) \
|
||||
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
|
||||
AS2(pxor mm##a, [esp+4+8*a])\
|
||||
AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\
|
||||
AS2(pxor mm##a, b)
|
||||
#define TSH1(a, b) \
|
||||
AS2(pxor mm##a, b)
|
||||
#define TSH2(a, b) \
|
||||
AS2(pxor mm##a, b)\
|
||||
AS2(movq [esp+4+64+8*a], mm##a)
|
||||
AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a)
|
||||
#define TSH3(a, b) \
|
||||
AS2(pxor mm##a, b)\
|
||||
AS2(pxor mm##a, [ecx+8*a])\
|
||||
AS2(movq [ecx+8*a], mm##a)
|
||||
AS2(pxor mm##a, [WORD_REG(cx)+8*a])\
|
||||
AS2(movq [WORD_REG(cx)+8*a], mm##a)
|
||||
|
||||
#define TSH(op, i, a, b, c, d) \
|
||||
AS2(mov eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
|
||||
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\
|
||||
AS2(movzx edi, al)\
|
||||
TSH##op(a, [ebx+3*2048+8*edi])\
|
||||
TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
|
||||
AS2(movzx edi, ah)\
|
||||
TSH##op(b, [ebx+2*2048+8*edi])\
|
||||
TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
|
||||
AS2(shr eax, 16)\
|
||||
AS2(movzx edi, al)\
|
||||
AS2(shr eax, 8)\
|
||||
TSH##op(c, [ebx+1*2048+8*edi])\
|
||||
TSH##op(d, [ebx+0*2048+8*eax])
|
||||
TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
|
||||
TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
|
||||
|
||||
KSL(0, 4, 3, 2, 1, 0)
|
||||
KSL(0, 0, 7, 6, 5, 4)
|
||||
|
|
@ -514,8 +517,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
KSH(2, 3, 2, 1, 0, 7)
|
||||
KSH(2, 7, 6, 5, 4, 3)
|
||||
|
||||
AS2( pxor mm0, [ebx + 8*1024 + esi*8])
|
||||
AS2( movq [esp+4], mm0)
|
||||
AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8])
|
||||
AS2( movq [WORD_REG(sp)+WORD_SZ], mm0)
|
||||
|
||||
TSL(0, 4, 3, 2, 1, 0)
|
||||
TSL(0, 0, 7, 6, 5, 4)
|
||||
|
|
@ -532,8 +535,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
TSH(1, 5, 4, 3, 2, 1)
|
||||
TSH(1, 6, 5, 4, 3, 2)
|
||||
|
||||
AS1( inc esi)
|
||||
AS2( cmp esi, 10)
|
||||
AS1( inc WORD_REG(si))
|
||||
AS2( cmp WORD_REG(si), 10)
|
||||
ASJ( je, 2, f)
|
||||
|
||||
TSH(2, 3, 2, 1, 0, 7)
|
||||
|
|
@ -550,11 +553,13 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
|
|||
#undef TSL
|
||||
#undef TSH
|
||||
|
||||
AS_POP( sp)
|
||||
AS1( emms)
|
||||
AS1( pop esp)
|
||||
|
||||
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
|
||||
AS_POP( bx)
|
||||
#endif
|
||||
#ifdef __GNUC__
|
||||
AS1( pop ebx)
|
||||
".att_syntax prefix;"
|
||||
:
|
||||
: "a" (Whirlpool_C), "c" (digest), "d" (block)
|
||||
|
|
|
|||
69
x64masm.asm
69
x64masm.asm
|
|
@ -1,80 +1,55 @@
|
|||
PUBLIC Baseline_Add
|
||||
PUBLIC Baseline_Sub
|
||||
.CODE
|
||||
ALIGN 8
|
||||
ALIGN 8
|
||||
Baseline_Add PROC
|
||||
|
||||
lea rdx, [rdx+8*rcx]
|
||||
lea r8, [r8+8*rcx]
|
||||
lea r9, [r9+8*rcx]
|
||||
|
||||
neg rcx ; rcx is negative index
|
||||
test rcx, 2 ; this clears carry flag
|
||||
jz $0@Baseline_Add
|
||||
sub rcx, 2
|
||||
jmp $1@Baseline_Add
|
||||
|
||||
$0@Baseline_Add:
|
||||
jrcxz $2@Baseline_Add ; loop until rcx overflows and becomes zero
|
||||
jz $1@Baseline_Add
|
||||
mov rax,[r8+8*rcx]
|
||||
adc rax,[r9+8*rcx]
|
||||
add rax,[r9+8*rcx]
|
||||
mov [rdx+8*rcx],rax
|
||||
$0@Baseline_Add:
|
||||
mov rax,[r8+8*rcx+8]
|
||||
adc rax,[r9+8*rcx+8]
|
||||
mov [rdx+8*rcx+8],rax
|
||||
$1@Baseline_Add:
|
||||
mov rax,[r8+8*rcx+16]
|
||||
adc rax,[r9+8*rcx+16]
|
||||
mov [rdx+8*rcx+16],rax
|
||||
mov rax,[r8+8*rcx+24]
|
||||
adc rax,[r9+8*rcx+24]
|
||||
mov [rdx+8*rcx+24],rax
|
||||
|
||||
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
|
||||
lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
|
||||
jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero
|
||||
mov rax,[r8+8*rcx]
|
||||
adc rax,[r9+8*rcx]
|
||||
mov [rdx+8*rcx],rax
|
||||
jmp $0@Baseline_Add
|
||||
|
||||
$2@Baseline_Add:
|
||||
$1@Baseline_Add:
|
||||
mov rax, 0
|
||||
setc al ; store carry into rax (return result register)
|
||||
|
||||
adc rax, rax ; store carry into rax (return result register)
|
||||
ret
|
||||
Baseline_Add ENDP
|
||||
|
||||
ALIGN 8
|
||||
ALIGN 8
|
||||
Baseline_Sub PROC
|
||||
|
||||
lea rdx, [rdx+8*rcx]
|
||||
lea r8, [r8+8*rcx]
|
||||
lea r9, [r9+8*rcx]
|
||||
|
||||
neg rcx ; rcx is negative index
|
||||
test rcx, 2 ; this clears carry flag
|
||||
jz $0@Baseline_Sub
|
||||
sub rcx, 2
|
||||
jmp $1@Baseline_Sub
|
||||
|
||||
$0@Baseline_Sub:
|
||||
jrcxz $2@Baseline_Sub ; loop until rcx overflows and becomes zero
|
||||
jz $1@Baseline_Sub
|
||||
mov rax,[r8+8*rcx]
|
||||
sbb rax,[r9+8*rcx]
|
||||
sub rax,[r9+8*rcx]
|
||||
mov [rdx+8*rcx],rax
|
||||
$0@Baseline_Sub:
|
||||
mov rax,[r8+8*rcx+8]
|
||||
sbb rax,[r9+8*rcx+8]
|
||||
mov [rdx+8*rcx+8],rax
|
||||
$1@Baseline_Sub:
|
||||
mov rax,[r8+8*rcx+16]
|
||||
sbb rax,[r9+8*rcx+16]
|
||||
mov [rdx+8*rcx+16],rax
|
||||
mov rax,[r8+8*rcx+24]
|
||||
sbb rax,[r9+8*rcx+24]
|
||||
mov [rdx+8*rcx+24],rax
|
||||
|
||||
lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2
|
||||
lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2
|
||||
jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero
|
||||
mov rax,[r8+8*rcx]
|
||||
sbb rax,[r9+8*rcx]
|
||||
mov [rdx+8*rcx],rax
|
||||
jmp $0@Baseline_Sub
|
||||
|
||||
$2@Baseline_Sub:
|
||||
$1@Baseline_Sub:
|
||||
mov rax, 0
|
||||
setc al ; store carry into rax (return result register)
|
||||
adc rax, rax ; store carry into rax (return result register)
|
||||
|
||||
ret
|
||||
Baseline_Sub ENDP
|
||||
|
|
|
|||
Loading…
Reference in New Issue