- port x64 assembly code to MASM

- improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly
pull/2/head
weidai 2007-09-24 00:43:57 +00:00
parent 1921a557dc
commit 23accd43c5
7 changed files with 2833 additions and 513 deletions

121
cpu.h
View File

@ -1,6 +1,15 @@
#ifndef CRYPTOPP_CPU_H #ifndef CRYPTOPP_CPU_H
#define CRYPTOPP_CPU_H #define CRYPTOPP_CPU_H
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define CRYPTOPP_X86_ASM_AVAILABLE
#define CRYPTOPP_BOOL_X64 1
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
#define NAMESPACE_END
#else
#include "config.h" #include "config.h"
#ifdef CRYPTOPP_MSVC6PP_OR_LATER #ifdef CRYPTOPP_MSVC6PP_OR_LATER
@ -98,7 +107,18 @@ inline bool HasMMX() {return false;}
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400 #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400
#if defined(__GNUC__) #endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
// Instruction wrappers used when generating MASM source. Unlike the inline
// assembly variants of these macros, each one expands to bare assembler text
// followed by the literal marker *newline*.
// NOTE(review): presumably a later text-processing step turns *newline* into a
// real line break; that step is not part of this header -- confirm.
// AS1/AS2/AS3 emit their arguments joined by ", ".
#define AS1(x) x*newline*
#define AS2(x, y) x, y*newline*
#define AS3(x, y, z) x, y, z*newline*
// ASS emits x, y and an immediate assembled from four fields as a*64+b*16+c*4+d.
#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
// ASL defines the label "label<x>:".
#define ASL(x) label##x:*newline*
// ASJ and ASC emit instruction x with "label<y>" as its target. The third
// argument of ASJ is accepted but not used in this variant.
#define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline*
// Hexadecimal literal written with a trailing "h" (the other variants of
// AS_HEX in this header use a leading "0x").
#define AS_HEX(y) y##h
#elif defined(__GNUC__)
// define these in two steps to allow arguments to be expanded // define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";" #define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";" #define GNU_AS2(x, y) #x ", " #y ";"
@ -113,6 +133,7 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) GNU_ASJ(x, y, z) #define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";" #define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED #define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y
#else #else
#define AS1(x) __asm {x} #define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y} #define AS2(x, y) __asm {x, y}
@ -122,25 +143,115 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) __asm {x label##y} #define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y} #define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked) #define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#endif #endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define ASM_MOD(x, y) ((x) MOD (y))
#else
// GNU assembler doesn't seem to have mod operator // GNU assembler doesn't seem to have mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) #define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
#endif
#if CRYPTOPP_BOOL_X86 #if CRYPTOPP_BOOL_X86
// 32-bit x86 register aliases. Each AS_REG_n is listed next to AS_REG_nd, its
// 32-bit view; on this target both names expand to the same register.
#define AS_REG_1  ecx
#define AS_REG_1d ecx
#define AS_REG_2  edx
#define AS_REG_2d edx
#define AS_REG_3  esi
#define AS_REG_3d esi
#define AS_REG_4  edi
#define AS_REG_4d edi
#define AS_REG_5  eax
#define AS_REG_5d eax
#define AS_REG_6  ebx
#define AS_REG_6d ebx
#define AS_REG_7  ebp
#define AS_REG_7d ebp
#define WORD_SZ 4 #define WORD_SZ 4
#define WORD_REG(x) e##x #define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR #define WORD_PTR DWORD PTR
#define AS_PUSH(x) AS1(push e##x) #define AS_PUSH_IF86(x) AS1(push e##x)
#define AS_POP(x) AS1(pop e##x) #define AS_POP_IF86(x) AS1(pop e##x)
#define AS_JCXZ jecxz
#elif CRYPTOPP_BOOL_X64 #elif CRYPTOPP_BOOL_X64
// 64-bit register aliases. AS_REG_n is the full 64-bit register and AS_REG_nd
// is the 32-bit view of the same register. Two numberings exist, selected by
// whether MASM source is being generated.
#ifndef CRYPTOPP_GENERATE_X64_MASM
// Inline-assembly build: AS_REG_1..4 are rdi, rsi, rdx, rcx (the order of the
// first integer argument registers in the System V AMD64 convention).
#define AS_REG_1  rdi
#define AS_REG_1d edi
#define AS_REG_2  rsi
#define AS_REG_2d esi
#define AS_REG_3  rdx
#define AS_REG_3d edx
#define AS_REG_4  rcx
#define AS_REG_4d ecx
#define AS_REG_5  r8
#define AS_REG_5d r8d
#define AS_REG_6  r9
#define AS_REG_6d r9d
#define AS_REG_7  r10
#define AS_REG_7d r10d
#else
// MASM build: AS_REG_1..4 are rcx, rdx, r8, r9 (the order of the integer
// argument registers in the Microsoft x64 convention).
#define AS_REG_1  rcx
#define AS_REG_1d ecx
#define AS_REG_2  rdx
#define AS_REG_2d edx
#define AS_REG_3  r8
#define AS_REG_3d r8d
#define AS_REG_4  r9
#define AS_REG_4d r9d
#define AS_REG_5  rax
#define AS_REG_5d eax
#define AS_REG_6  r10
#define AS_REG_6d r10d
#define AS_REG_7  r11
#define AS_REG_7d r11d
#endif
#define WORD_SZ 8 #define WORD_SZ 8
#define WORD_REG(x) r##x #define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR #define WORD_PTR QWORD PTR
#define AS_PUSH(x) AS1(pushq r##x) #define AS_PUSH_IF86(x)
#define AS_POP(x) AS1(popq r##x) #define AS_POP_IF86(x)
#define AS_JCXZ jrcxz
#endif #endif
// Helper macro for stream cipher output: optionally XORs four XMM registers
// with 16-byte blocks read from inputPtr, stores the four registers to
// outputPtr, and advances the pointers.
//   labelPrefix - pasted in front of 3, 7, 8 and 9 to form the labels used here
//   inputPtr    - register holding the input address; when it is zero the XOR
//                 step is skipped and inputPtr is left unchanged
//   outputPtr   - register holding the output address; always advanced
//   x0..x3      - XMM register numbers (pasted onto "xmm") holding the data
//   t           - XMM register number used as scratch on the unaligned-input path
//   p0..p3      - 16-byte slot index in input/output used for x0..x3 respectively
//   increment   - number of 16-byte slots the pointers are advanced by
// The low four bits of each pointer select the path: when they are zero the
// input is XORed directly from memory and the output is stored with movdqa,
// otherwise movdqu is used for the loads/stores.
#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\
AS2( test inputPtr, inputPtr)\
ASC( jz, labelPrefix##3)\
AS2( test inputPtr, 15)\
ASC( jnz, labelPrefix##7)\
AS2( pxor xmm##x0, [inputPtr+p0*16])\
AS2( pxor xmm##x1, [inputPtr+p1*16])\
AS2( pxor xmm##x2, [inputPtr+p2*16])\
AS2( pxor xmm##x3, [inputPtr+p3*16])\
AS2( add inputPtr, increment*16)\
ASC( jmp, labelPrefix##3)\
ASL(labelPrefix##7)\
AS2( movdqu xmm##t, [inputPtr+p0*16])\
AS2( pxor xmm##x0, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p1*16])\
AS2( pxor xmm##x1, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p2*16])\
AS2( pxor xmm##x2, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p3*16])\
AS2( pxor xmm##x3, xmm##t)\
AS2( add inputPtr, increment*16)\
ASL(labelPrefix##3)\
AS2( test outputPtr, 15)\
ASC( jnz, labelPrefix##8)\
AS2( movdqa [outputPtr+p0*16], xmm##x0)\
AS2( movdqa [outputPtr+p1*16], xmm##x1)\
AS2( movdqa [outputPtr+p2*16], xmm##x2)\
AS2( movdqa [outputPtr+p3*16], xmm##x3)\
ASC( jmp, labelPrefix##9)\
ASL(labelPrefix##8)\
AS2( movdqu [outputPtr+p0*16], xmm##x0)\
AS2( movdqu [outputPtr+p1*16], xmm##x1)\
AS2( movdqu [outputPtr+p2*16], xmm##x2)\
AS2( movdqu [outputPtr+p3*16], xmm##x3)\
ASL(labelPrefix##9)\
AS2( add outputPtr, increment*16)
NAMESPACE_END NAMESPACE_END
#endif #endif

View File

@ -1,6 +1,11 @@
// panama.cpp - written and placed in the public domain by Wei Dai // panama.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM panama.cpp" to generate MASM code
#include "pch.h" #include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "panama.h" #include "panama.h"
#include "misc.h" #include "misc.h"
#include "cpu.h" #include "cpu.h"
@ -16,41 +21,67 @@ void Panama<B>::Reset()
#endif #endif
} }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage declaration used when CRYPTOPP_X64_MASM_AVAILABLE is defined.
// NOTE(review): the definition is presumably the MASM procedure of the same
// name generated from this file (see the "cl /EP /P" note at the top) -- confirm.
//   count - iteration count; the assembly scales it by 32 (shl 5) to form the loop end
//   state - Panama state block, read and written back by the routine
//   z     - output pointer; tested against zero, and callers in this file pass NULL
//   y     - input pointer XORed into the output; tested against zero as well
extern "C" {
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y);
}
#elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
Panama_SSE2_Pull PROC FRAME
alloc_stack(2*16+8)
save_xmm128 xmm6, 0h
save_xmm128 xmm7, 10h
.endprolog
#else
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y) void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
{ {
#ifdef __GNUC__ #ifdef __GNUC__
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS_PUSH( bx) AS_POP_IF86( bx)
#else #else
AS2( mov WORD_REG(cx), count) AS2( mov AS_REG_1, count)
AS2( mov WORD_REG(si), state) AS2( mov AS_REG_2, state)
AS2( mov WORD_REG(di), z) AS2( mov AS_REG_3, z)
AS2( mov WORD_REG(dx), y) AS2( mov AS_REG_4, y)
#endif #endif
AS2( shl WORD_REG(cx), 5) #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
// REG_loopEnd names the location holding the loop-end value that the bottom of
// the main loop compares AS_REG_6 against.
#if CRYPTOPP_BOOL_X86
// x86: the value is pushed just before the loop, so it is read from the top of the stack.
#define REG_loopEnd [esp]
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
// NOTE(review): rdi is a callee-saved (nonvolatile) register in the Microsoft
// x64 calling convention, and the visible prolog/epilog of the MASM procedure
// only saves and restores xmm6/xmm7. Confirm rdi is preserved somewhere;
// otherwise the caller's rdi is clobbered.
#define REG_loopEnd rdi
#else
// x64 inline assembly: r8 (named in the asm clobber list).
#define REG_loopEnd r8
#endif
AS2( shl AS_REG_1, 5)
ASJ( jz, 5, f) ASJ( jz, 5, f)
AS2( mov ebx, [WORD_REG(si)+4*17]) AS2( mov AS_REG_6d, [AS_REG_2+4*17])
AS2( add WORD_REG(cx), WORD_REG(bx)) AS2( add AS_REG_1, AS_REG_6)
AS_PUSH( bp) #if CRYPTOPP_BOOL_X64
AS_PUSH( cx) AS2( mov REG_loopEnd, AS_REG_1)
#else
AS1( push ebp)
AS1( push AS_REG_1)
#endif
AS2( movdqa xmm0, [WORD_REG(si)+0*16]) AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16]) AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16]) AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16])
AS2( movdqa xmm3, [WORD_REG(si)+3*16]) AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16])
AS2( mov eax, [WORD_REG(si)+4*16]) AS2( mov eax, dword ptr [AS_REG_2+4*16])
ASL(4) ASL(4)
// gamma and pi // gamma and pi
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1) AS2( test AS_REG_6, 1)
ASJ( jnz, 6, f) ASJ( jnz, 6, f)
#endif #endif
AS2( movdqa xmm6, xmm2) AS2( movdqa xmm6, xmm2)
@ -70,18 +101,18 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
ASL(7) ASL(7)
#endif #endif
AS2( movd ecx, xmm2) AS2( movd AS_REG_1d, xmm2)
AS1( not ecx) AS1( not AS_REG_1d)
AS2( movd ebp, xmm3) AS2( movd AS_REG_7d, xmm3)
AS2( or ecx, ebp) AS2( or AS_REG_1d, AS_REG_7d)
AS2( xor eax, ecx) AS2( xor eax, AS_REG_1d)
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17) #define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
#define pi(i) \ #define pi(i) \
AS2( movd ecx, xmm7)\ AS2( movd AS_REG_1d, xmm7)\
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\ AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx) AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d)
#define pi4(x, y, z, a, b, c, d) \ #define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\ AS2( pcmpeqb xmm7, xmm7)\
@ -110,65 +141,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16 AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
// keystream // keystream
AS2( test WORD_REG(di), WORD_REG(di)) AS2( test AS_REG_3, AS_REG_3)
ASJ( jz, 0, f) ASJ( jz, 0, f)
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2) AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2) AS2( punpckhqdq xmm6, xmm2)
AS2( test WORD_REG(dx), 0xf) AS2( test AS_REG_4, 15)
ASJ( jnz, 2, f) ASJ( jnz, 2, f)
AS2( test WORD_REG(dx), WORD_REG(dx)) AS2( test AS_REG_4, AS_REG_4)
ASJ( jz, 1, f) ASJ( jz, 1, f)
AS2( pxor xmm4, [WORD_REG(dx)]) AS2( pxor xmm4, [AS_REG_4])
AS2( pxor xmm6, [WORD_REG(dx)+16]) AS2( pxor xmm6, [AS_REG_4+16])
AS2( add WORD_REG(dx), 32) AS2( add AS_REG_4, 32)
ASJ( jmp, 1, f) ASJ( jmp, 1, f)
ASL(2) ASL(2)
AS2( movdqu xmm0, [WORD_REG(dx)]) AS2( movdqu xmm0, [AS_REG_4])
AS2( movdqu xmm2, [WORD_REG(dx)+16]) AS2( movdqu xmm2, [AS_REG_4+16])
AS2( pxor xmm4, xmm0) AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2) AS2( pxor xmm6, xmm2)
AS2( add WORD_REG(dx), 32) AS2( add AS_REG_4, 32)
ASL(1) ASL(1)
AS2( test WORD_REG(di), 0xf) AS2( test AS_REG_3, 15)
ASJ( jnz, 3, f) ASJ( jnz, 3, f)
AS2( movdqa [WORD_REG(di)], xmm4) AS2( movdqa XMMWORD PTR [AS_REG_3], xmm4)
AS2( movdqa [WORD_REG(di)+16], xmm6) AS2( movdqa XMMWORD PTR [AS_REG_3+16], xmm6)
AS2( add WORD_REG(di), 32) AS2( add AS_REG_3, 32)
ASJ( jmp, 0, f) ASJ( jmp, 0, f)
ASL(3) ASL(3)
AS2( movdqu [WORD_REG(di)], xmm4) AS2( movdqu XMMWORD PTR [AS_REG_3], xmm4)
AS2( movdqu [WORD_REG(di)+16], xmm6) AS2( movdqu XMMWORD PTR [AS_REG_3+16], xmm6)
AS2( add WORD_REG(di), 32) AS2( add AS_REG_3, 32)
ASL(0) ASL(0)
// buffer update // buffer update
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32]) AS2( lea AS_REG_1, [AS_REG_6 + 32])
AS2( and WORD_REG(cx), 31*32) AS2( and AS_REG_1, 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32]) AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
AS2( and WORD_REG(bp), 31*32) AS2( and AS_REG_7, 31*32)
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8]) AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8])
AS2( pxor xmm3, xmm0) AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1) ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3) AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8]) AS2( pxor xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0) AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8]) AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8])
AS2( pxor xmm1, xmm4) AS2( pxor xmm1, xmm4)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1) AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8]) AS2( pxor xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4) AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
// theta // theta
AS2( movdqa xmm3, [WORD_REG(si)+3*16]) AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16]) AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16]) AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16])
AS2( movdqa xmm0, [WORD_REG(si)+0*16]) AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16])
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1) AS2( test AS_REG_6, 1)
ASJ( jnz, 8, f) ASJ( jnz, 8, f)
#endif #endif
AS2( movd xmm6, eax) AS2( movd xmm6, eax)
@ -199,10 +230,10 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
#endif #endif
AS2( xor eax, 1) AS2( xor eax, 1)
AS2( movd ecx, xmm0) AS2( movd AS_REG_1d, xmm0)
AS2( xor eax, ecx) AS2( xor eax, AS_REG_1d)
AS2( movd ecx, xmm3) AS2( movd AS_REG_1d, xmm3)
AS2( xor eax, ecx) AS2( xor eax, AS_REG_1d)
AS2( pxor xmm3, xmm2) AS2( pxor xmm3, xmm2)
AS2( pxor xmm2, xmm1) AS2( pxor xmm2, xmm1)
@ -214,21 +245,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm4) AS2( pxor xmm0, xmm4)
// sigma // sigma
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32]) AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32])
AS2( and WORD_REG(cx), 31*32) AS2( and AS_REG_1, 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32]) AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
AS2( and WORD_REG(bp), 31*32) AS2( and AS_REG_7, 31*32)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16]) AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16]) AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*16])
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5) AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5) AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4) AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6) AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16]) AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+1*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16]) AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+1*16])
AS2( movdqa xmm6, xmm4) AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5) AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5) AS2( punpckhqdq xmm6, xmm5)
@ -236,31 +267,48 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm6) AS2( pxor xmm0, xmm6)
// loop // loop
AS2( add WORD_REG(bx), 32) AS2( add AS_REG_6, 32)
AS2( cmp WORD_REG(bx), [WORD_REG(sp)]) AS2( cmp AS_REG_6, REG_loopEnd)
ASJ( jne, 4, b) ASJ( jne, 4, b)
// save state // save state
AS2( add WORD_REG(sp), WORD_SZ) AS2( mov [AS_REG_2+4*16], eax)
AS_POP( bp) AS2( movdqa XMMWORD PTR [AS_REG_2+3*16], xmm3)
AS2( mov [WORD_REG(si)+4*16], eax) AS2( movdqa XMMWORD PTR [AS_REG_2+2*16], xmm2)
AS2( movdqa [WORD_REG(si)+3*16], xmm3) AS2( movdqa XMMWORD PTR [AS_REG_2+1*16], xmm1)
AS2( movdqa [WORD_REG(si)+2*16], xmm2) AS2( movdqa XMMWORD PTR [AS_REG_2+0*16], xmm0)
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
AS2( movdqa [WORD_REG(si)+0*16], xmm0) #if CRYPTOPP_BOOL_X86
AS2( add esp, 4)
AS1( pop ebp)
#endif
ASL(5) ASL(5)
#ifdef __GNUC__ #ifdef __GNUC__
AS_POP( bx) AS_POP_IF86( bx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "c" (count), "S" (state), "D" (z), "d" (y) #if CRYPTOPP_BOOL_X64
: "%eax", "memory", "cc" : "D" (count), "S" (state), "d" (z), "c" (y)
: "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#else
: "c" (count), "d" (state), "S" (z), "D" (y)
: "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
); );
#endif #endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 0h]
movdqa xmm7, [rsp + 10h]
add rsp, 2*16+8
ret
Panama_SSE2_Pull ENDP
#else
} }
#endif #endif
#endif // #ifdef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifndef CRYPTOPP_GENERATE_X64_MASM
template <class B> template <class B>
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y) void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
@ -411,7 +459,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
this->Iterate(1, buf); this->Iterate(1, buf);
} }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
Panama_SSE2_Pull(32, this->m_state, NULL, NULL); Panama_SSE2_Pull(32, this->m_state, NULL, NULL);
else else
@ -423,7 +471,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
template <class B> template <class B>
unsigned int PanamaCipherPolicy<B>::GetAlignment() const unsigned int PanamaCipherPolicy<B>::GetAlignment() const
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
return 16; return 16;
else else
@ -435,7 +483,7 @@ unsigned int PanamaCipherPolicy<B>::GetAlignment() const
template <class B> template <class B>
void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2()) if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input); Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input);
else else
@ -453,3 +501,5 @@ template class PanamaCipherPolicy<BigEndian>;
template class PanamaCipherPolicy<LittleEndian>; template class PanamaCipherPolicy<LittleEndian>;
NAMESPACE_END NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

View File

@ -2,6 +2,8 @@
// and Wei Dai from Paulo Baretto's Rijndael implementation // and Wei Dai from Paulo Baretto's Rijndael implementation
// The original code and all modifications are in the public domain. // The original code and all modifications are in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/* /*
Defense against timing attacks was added in July 2006 by Wei Dai. Defense against timing attacks was added in July 2006 by Wei Dai.
@ -48,6 +50,7 @@ being unloaded from L1 cache, until that round is finished.
#include "pch.h" #include "pch.h"
#ifndef CRYPTOPP_IMPORTS #ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "rijndael.h" #include "rijndael.h"
#include "misc.h" #include "misc.h"
@ -145,27 +148,56 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
} }
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage declaration of the encryption routine used when
// CRYPTOPP_X64_MASM_AVAILABLE is defined. Rijndael::Enc::ProcessAndXorBlock
// forwards to it with (Te, g_cacheLineSize, m_key, m_key + m_rounds*4,
// inBlock, xorBlock, outBlock).
//   table         - lookup table base (Te at the call site)
//   cacheLineSize - g_cacheLineSize at the call site
//   k, kLoopEnd   - start and end of the round-key range
//   inBlock       - input block
//   xorBlock      - optional block XORed into the result; tested against zero
//   outBlock      - destination for the four result words
// The MASM procedure reads inBlock, xorBlock and outBlock from the stack.
extern "C" {
void Rijndael_Enc_ProcessAndXorBlock(const word32 *table, word32 cacheLineSize, const word32 *k, const word32 *kLoopEnd, const byte *inBlock, const byte *xorBlock, byte *outBlock);
}
#endif
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{ {
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Rijndael_Enc_ProcessAndXorBlock(Te, g_cacheLineSize, m_key, m_key + m_rounds*4, inBlock, xorBlock, outBlock);
return;
#endif
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) #if defined(CRYPTOPP_X86_ASM_AVAILABLE)
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Rijndael_Enc_ProcessAndXorBlock PROC FRAME
rex_push_reg rbx
push_reg rsi
push_reg rdi
push_reg r12
push_reg r13
push_reg r14
push_reg r15
.endprolog
mov AS_REG_7, rcx
mov rdi, [rsp + 5*8 + 7*8] ; inBlock
#else
if (HasMMX()) if (HasMMX())
{ {
const word32 *k = m_key; const word32 *k = m_key;
const word32 *kLoopEnd = k + m_rounds*4; const word32 *kLoopEnd = k + m_rounds*4;
#endif
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
#define K_REG r8 #define K_REG r8
#define K_END_REG r9 #define K_END_REG r9
#define SAVE_K #define SAVE_K
#define RESTORE_K #define RESTORE_K
#define RESTORE_K_END #define RESTORE_K_END
#define SAVE_0(x) AS2(mov r10d, x) #define SAVE_0(x) AS2(mov r13d, x)
#define SAVE_1(x) AS2(mov r11d, x) #define SAVE_1(x) AS2(mov r14d, x)
#define SAVE_2(x) AS2(mov r12d, x) #define SAVE_2(x) AS2(mov r15d, x)
#define RESTORE_0(x) AS2(mov x, r10d) #define RESTORE_0(x) AS2(mov x, r13d)
#define RESTORE_1(x) AS2(mov x, r11d) #define RESTORE_1(x) AS2(mov x, r14d)
#define RESTORE_2(x) AS2(mov x, r12d) #define RESTORE_2(x) AS2(mov x, r15d)
#else #else
#define K_REG esi #define K_REG esi
#define K_END_REG edi #define K_END_REG edi
@ -184,22 +216,16 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov WORD_REG(bp), WORD_REG(ax))
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
AS1( pushq K_REG)
AS1( pushq K_END_REG)
AS1( pushq r10)
AS1( pushq r11)
AS1( pushq r12)
AS2( mov K_REG, rsi) AS2( mov K_REG, rsi)
AS2( mov K_END_REG, rcx) AS2( mov K_END_REG, rcx)
#else #else
AS1( push ebx)
AS1( push ebp)
AS2( movd mm5, ecx) AS2( movd mm5, ecx)
#endif #endif
#else AS2( mov AS_REG_7, WORD_REG(ax))
#elif CRYPTOPP_BOOL_X86
#if _MSC_VER < 1300 #if _MSC_VER < 1300
const word32 *t = Te; const word32 *t = Te;
AS2( mov eax, t) AS2( mov eax, t)
@ -209,12 +235,12 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( mov K_REG, k) AS2( mov K_REG, k)
AS2( movd mm5, kLoopEnd) AS2( movd mm5, kLoopEnd)
#if _MSC_VER < 1300 #if _MSC_VER < 1300
AS_PUSH( bx) AS1( push ebx)
AS_PUSH( bp) AS1( push ebp)
AS2( mov ebp, eax) AS2( mov AS_REG_7, eax)
#else #else
AS_PUSH( bp) AS1( push ebp)
AS2( lea ebp, Te) AS2( lea AS_REG_7, Te)
#endif #endif
#endif #endif
AS2( mov eax, [K_REG+0*4]) // s0 AS2( mov eax, [K_REG+0*4]) // s0
@ -236,21 +262,21 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( and ebx, 0) AS2( and ebx, 0)
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
ASL(2) ASL(2)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)]) AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx) AS2( add edi, edx)
AS2( cmp edi, 1024) AS2( cmp edi, 1024)
ASJ( jl, 2, b) ASJ( jl, 2, b)
AS2( and ebx, [WORD_REG(bp)+1020]) AS2( and ebx, [AS_REG_7+1020])
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
AS2( xor r10d, ebx) AS2( xor r13d, ebx)
AS2( xor r11d, ebx) AS2( xor r14d, ebx)
AS2( xor r12d, ebx) AS2( xor r15d, ebx)
#else #else
AS2( movd mm6, ebx) AS2( movd mm6, ebx)
AS2( pxor mm2, mm6) AS2( pxor mm2, mm6)
@ -268,14 +294,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(t, a, b, c, d) \ #define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\ AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\ AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)]) AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])
#define s0 xor edi #define s0 xor edi
#define s1 xor eax #define s1 xor eax
@ -308,14 +334,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(t, a, b, c, d) \ #define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\ AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\ AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\ AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\ AS2(movzx esi, t##l)\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\ AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\ AS2(movzx esi, t##h)\
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)]) AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])
QUARTER_ROUND(d, s0, s1, s2, s3) QUARTER_ROUND(d, s0, s1, s2, s3)
RESTORE_2(edx) RESTORE_2(edx)
@ -369,20 +395,20 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(a, b, c, d) \ #define QUARTER_ROUND(a, b, c, d) \
AS2( movzx ebx, dl)\ AS2( movzx ebx, dl)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 3*8)\ AS2( shl ebx, 3*8)\
AS2( xor a, ebx)\ AS2( xor a, ebx)\
AS2( movzx ebx, dh)\ AS2( movzx ebx, dh)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 2*8)\ AS2( shl ebx, 2*8)\
AS2( xor b, ebx)\ AS2( xor b, ebx)\
AS2( shr edx, 16)\ AS2( shr edx, 16)\
AS2( movzx ebx, dl)\ AS2( movzx ebx, dl)\
AS2( shr edx, 8)\ AS2( shr edx, 8)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\ AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 1*8)\ AS2( shl ebx, 1*8)\
AS2( xor c, ebx)\ AS2( xor c, ebx)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\ AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(dx)])\
AS2( xor d, ebx) AS2( xor d, ebx)
QUARTER_ROUND(eax, ecx, esi, edi) QUARTER_ROUND(eax, ecx, esi, edi)
@ -395,25 +421,22 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#undef QUARTER_ROUND #undef QUARTER_ROUND
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X86
AS1(popq r12)
AS1(popq r11)
AS1(popq r10)
AS1(popq K_END_REG)
AS1(popq K_REG)
#else
AS1(emms) AS1(emms)
#endif AS1(pop ebp)
AS_POP( bp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx) AS1(pop ebx)
#endif #endif
#endif
#ifdef __GNUC__ #ifdef __GNUC__
".att_syntax prefix;" ".att_syntax prefix;"
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3) : "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize) : "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
: "memory", "cc" : "memory", "cc"
#if CRYPTOPP_BOOL_X64
, "%ebx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
#endif
); );
if (xorBlock) if (xorBlock)
@ -428,7 +451,11 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
((word32 *)outBlock)[2] = t2; ((word32 *)outBlock)[2] = t2;
((word32 *)outBlock)[3] = t3; ((word32 *)outBlock)[3] = t3;
#else #else
AS2( mov WORD_REG(bx), xorBlock) #if CRYPTOPP_BOOL_X64
mov rbx, [rsp + 6*8 + 7*8] ; xorBlock
#else
AS2( mov ebx, xorBlock)
#endif
AS2( test WORD_REG(bx), WORD_REG(bx)) AS2( test WORD_REG(bx), WORD_REG(bx))
ASJ( jz, 1, f) ASJ( jz, 1, f)
AS2( xor eax, [WORD_REG(bx)+0*4]) AS2( xor eax, [WORD_REG(bx)+0*4])
@ -436,15 +463,33 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( xor esi, [WORD_REG(bx)+2*4]) AS2( xor esi, [WORD_REG(bx)+2*4])
AS2( xor edi, [WORD_REG(bx)+3*4]) AS2( xor edi, [WORD_REG(bx)+3*4])
ASL(1) ASL(1)
AS2( mov WORD_REG(bx), outBlock) #if CRYPTOPP_BOOL_X64
mov rbx, [rsp + 7*8 + 7*8] ; outBlock
#else
AS2( mov ebx, outBlock)
#endif
AS2( mov [WORD_REG(bx)+0*4], eax) AS2( mov [WORD_REG(bx)+0*4], eax)
AS2( mov [WORD_REG(bx)+1*4], ecx) AS2( mov [WORD_REG(bx)+1*4], ecx)
AS2( mov [WORD_REG(bx)+2*4], esi) AS2( mov [WORD_REG(bx)+2*4], esi)
AS2( mov [WORD_REG(bx)+3*4], edi) AS2( mov [WORD_REG(bx)+3*4], edi)
#endif #endif
#if CRYPTOPP_GENERATE_X64_MASM
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbx
ret
Rijndael_Enc_ProcessAndXorBlock ENDP
#else
} }
else else
#endif
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE #endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE
#ifndef CRYPTOPP_GENERATE_X64_MASM
{ {
word32 s0, s1, s2, s3, t0, t1, t2, t3; word32 s0, s1, s2, s3, t0, t1, t2, t3;
const word32 *rk = m_key; const word32 *rk = m_key;
@ -674,3 +719,4 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
NAMESPACE_END NAMESPACE_END
#endif #endif
#endif

597
salsa.cpp
View File

@ -1,6 +1,11 @@
// salsa.cpp - written and placed in the public domain by Wei Dai // salsa.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
#include "pch.h" #include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "salsa.h" #include "salsa.h"
#include "misc.h" #include "misc.h"
#include "argnames.h" #include "argnames.h"
@ -53,7 +58,7 @@ void Salsa20_Policy::SeekToIteration(lword iterationCount)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int Salsa20_Policy::GetAlignment() const unsigned int Salsa20_Policy::GetAlignment() const
{ {
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2()) if (HasSSE2())
return 16; return 16;
else else
@ -63,7 +68,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
unsigned int Salsa20_Policy::GetOptimalBlockSize() const unsigned int Salsa20_Policy::GetOptimalBlockSize() const
{ {
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2()) if (HasSSE2())
return 4*BYTES_PER_ITERATION; return 4*BYTES_PER_ITERATION;
else else
@ -72,202 +77,421 @@ unsigned int Salsa20_Policy::GetOptimalBlockSize() const
} }
#endif #endif
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage declaration of the keystream routine used when
// CRYPTOPP_X64_MASM_AVAILABLE is defined. Salsa20_Policy::OperateKeystream
// forwards to it with (output, input, iterationCount, m_rounds, m_state.data()).
// In the MASM procedure the fifth argument (state) is read from the stack and
// rounds is taken from r9d.
extern "C" {
void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
}
#endif
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{ {
int i; #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#define SSE2_QUARTER_ROUND(a, b, d, i) {\
__m128i t = _mm_add_epi32(a, d); \
b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \
b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
return;
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Salsa20_OperateKeystream PROC FRAME
mov r10, [rsp + 5*8] ; state
alloc_stack(10*16 + 32*16 + 8)
save_xmm128 xmm6, 0200h
save_xmm128 xmm7, 0210h
save_xmm128 xmm8, 0220h
save_xmm128 xmm9, 0230h
save_xmm128 xmm10, 0240h
save_xmm128 xmm11, 0250h
save_xmm128 xmm12, 0260h
save_xmm128 xmm13, 0270h
save_xmm128 xmm14, 0280h
save_xmm128 xmm15, 0290h
.endprolog
#define REG_output rcx
#define REG_input rdx
#define REG_iterationCount r8
#define REG_state r10
#define REG_rounds eax
#define REG_temp32 r11d
#define REG_temp r11
#define SSE2_WORKSPACE rsp
#define SSE2_LOAD_ROUNDS mov eax, r9d
#else
if (HasSSE2()) if (HasSSE2())
{ {
__m128i *s = (__m128i *)m_state.data(); #if CRYPTOPP_BOOL_X64
#define REG_output %4
#define REG_input %1
#define REG_iterationCount %2
#define REG_state %3
#define REG_rounds eax
#define REG_temp32 edx
#define REG_temp rdx
#define SSE2_WORKSPACE %5
#define SSE2_LOAD_ROUNDS AS2(mov eax, %0)
#if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86) __m128i workspace[32];
// This code triggers an internal compiler error on MSVC 2005 when compiling #else
// for x64 with optimizations on. hopefully it will get fixed in the next release. #define REG_output edi
// A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123 #define REG_input eax
// Also, GCC 3.4.4 generates incorrect code for x86 at -O2. #define REG_iterationCount ecx
// GCC 4.1.1 generates incorrect code for x64 at -O2 #define REG_state esi
if (iterationCount >= 4) #define REG_rounds ebx
{ #define REG_temp32 edx
__m128i ss[16]; #define REG_temp edx
ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0)); #define SSE2_WORKSPACE esp + WORD_SZ
ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1)); #ifdef __GNUC__
ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2)); // this assumes that a frame pointer is used
ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3)); #define SSE2_LOAD_ROUNDS ".att_syntax prefix;movl %0, %%ebx;.intel_syntax noprefix;"
ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0)); #else
ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2)); #define SSE2_LOAD_ROUNDS AS2(mov REG_rounds, r)
ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3)); #endif
ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
do
{
word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
for (i=0; i<4; i++)
{
countersLo[i] = m_state[8];
countersHi[i] = m_state[5];
if (++m_state[8] == 0)
++m_state[5];
}
__m128i x0 = ss[0];
__m128i x1 = ss[1];
__m128i x2 = ss[2];
__m128i x3 = ss[3];
__m128i x4 = ss[4];
__m128i x5 = ss[5];
__m128i x6 = ss[6];
__m128i x7 = ss[7];
__m128i x8 = ss[8];
__m128i x9 = ss[9];
__m128i x10 = ss[10];
__m128i x11 = ss[11];
__m128i x12 = ss[12];
__m128i x13 = ss[13];
__m128i x14 = ss[14];
__m128i x15 = ss[15];
for (i=m_rounds; i>0; i-=2)
{
#define QUARTER_ROUND(a, b, c, d) \
SSE2_QUARTER_ROUND(a, b, d, 7) \
SSE2_QUARTER_ROUND(b, c, a, 9) \
SSE2_QUARTER_ROUND(c, d, b, 13) \
SSE2_QUARTER_ROUND(d, a, c, 18)
QUARTER_ROUND(x0, x4, x8, x12)
QUARTER_ROUND(x1, x5, x9, x13)
QUARTER_ROUND(x2, x6, x10, x14)
QUARTER_ROUND(x3, x7, x11, x15)
QUARTER_ROUND(x0, x13, x10, x7)
QUARTER_ROUND(x1, x14, x11, x4)
QUARTER_ROUND(x2, x15, x8, x5)
QUARTER_ROUND(x3, x12, x9, x6)
#undef QUARTER_ROUND
}
x0 = _mm_add_epi32(x0, ss[0]);
x1 = _mm_add_epi32(x1, ss[1]);
x2 = _mm_add_epi32(x2, ss[2]);
x3 = _mm_add_epi32(x3, ss[3]);
x4 = _mm_add_epi32(x4, ss[4]);
x5 = _mm_add_epi32(x5, ss[5]);
x6 = _mm_add_epi32(x6, ss[6]);
x7 = _mm_add_epi32(x7, ss[7]);
x8 = _mm_add_epi32(x8, ss[8]);
x9 = _mm_add_epi32(x9, ss[9]);
x10 = _mm_add_epi32(x10, ss[10]);
x11 = _mm_add_epi32(x11, ss[11]);
x12 = _mm_add_epi32(x12, ss[12]);
x13 = _mm_add_epi32(x13, ss[13]);
x14 = _mm_add_epi32(x14, ss[14]);
x15 = _mm_add_epi32(x15, ss[15]);
#define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\
__m128i t0 = _mm_unpacklo_epi32(a, b);\
__m128i t1 = _mm_unpacklo_epi32(c, d);\
__m128i t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
t0 = _mm_unpackhi_epi32(a, b);\
t1 = _mm_unpackhi_epi32(c, d);\
t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
#define SALSA_OUTPUT(x) \
OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
#undef SALSA_OUTPUT
} while ((iterationCount-=4) >= 4);
}
#endif #endif
if (!IsP4() && iterationCount > 0) word32 r = m_rounds;
{
const __m128i s_maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
const __m128i s_maskHi32 = _mm_slli_epi64(s_maskLo32, 32);
do #ifdef __GNUC__
{ __asm__ __volatile__
__m128i x0 = s[0]; (
__m128i x1 = s[1]; ".intel_syntax noprefix;"
__m128i x2 = s[2]; AS_PUSH_IF86( bx)
__m128i x3 = s[3]; #else
void *s = m_state.data();
for (i=m_rounds; i>0; i-=2) AS2( mov REG_iterationCount, iterationCount)
{ AS2( mov REG_state, s)
SSE2_QUARTER_ROUND(x0, x1, x3, 7) AS2( mov REG_input, input)
SSE2_QUARTER_ROUND(x1, x2, x0, 9) AS2( mov REG_output, output)
SSE2_QUARTER_ROUND(x2, x3, x1, 13) #endif
SSE2_QUARTER_ROUND(x3, x0, x2, 18) #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3)); AS2( cmp REG_iterationCount, 4)
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2)); ASJ( jl, 5, f)
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
SSE2_QUARTER_ROUND(x0, x3, x1, 7) #if CRYPTOPP_BOOL_X86
SSE2_QUARTER_ROUND(x3, x2, x0, 9) AS2( mov ebx, esp)
SSE2_QUARTER_ROUND(x2, x1, x3, 13) AS2( and esp, -16)
SSE2_QUARTER_ROUND(x1, x0, x2, 18) AS2( sub esp, 32*16)
AS1( push ebx)
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
}
x0 = _mm_add_epi32(x0, s[0]);
x1 = _mm_add_epi32(x1, s[1]);
x2 = _mm_add_epi32(x2, s[2]);
x3 = _mm_add_epi32(x3, s[3]);
if (++m_state[8] == 0)
++m_state[5];
__m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
__m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
__m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32));
__m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32));
__m128i k0 = _mm_unpackhi_epi64(k02, k20);
__m128i k1 = _mm_unpackhi_epi64(k13, k31);
__m128i k2 = _mm_unpacklo_epi64(k20, k02);
__m128i k3 = _mm_unpacklo_epi64(k31, k13);
#define SSE2_OUTPUT(x) {\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
}
while (--iterationCount);
}
}
#endif #endif
// SSE2_EXPAND_S(i, j): replicate 32-bit element j of xmm<i> into all four
// lanes of xmm4 (pshufd with the same selector j in every position) and store
// the result in 16-byte slot (i*4+j) of the workspace region that starts at
// byte offset 256 from SSE2_WORKSPACE. Clobbers xmm4; xmm<i> is left intact.
#define SSE2_EXPAND_S(i, j) \
ASS( pshufd xmm4, xmm##i, j, j, j, j) \
AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
AS2( movdqa xmm0, [REG_state + 0*16])
AS2( movdqa xmm1, [REG_state + 1*16])
AS2( movdqa xmm2, [REG_state + 2*16])
AS2( movdqa xmm3, [REG_state + 3*16])
SSE2_EXPAND_S(0, 0)
SSE2_EXPAND_S(0, 1)
SSE2_EXPAND_S(0, 2)
SSE2_EXPAND_S(0, 3)
SSE2_EXPAND_S(1, 0)
SSE2_EXPAND_S(1, 2)
SSE2_EXPAND_S(1, 3)
SSE2_EXPAND_S(2, 1)
SSE2_EXPAND_S(2, 2)
SSE2_EXPAND_S(2, 3)
SSE2_EXPAND_S(3, 0)
SSE2_EXPAND_S(3, 1)
SSE2_EXPAND_S(3, 2)
SSE2_EXPAND_S(3, 3)
// SSE2_EXPAND_S85(i): write the two 32-bit values currently held in REG_rounds
// and REG_temp32 into lane i of workspace slots 8 and 5 respectively (in the
// region at byte offset 256), then advance the pair as one 64-bit integer:
// add 1 to REG_rounds and propagate the carry into REG_temp32 with adc.
// REG_rounds serves only as a scratch register here; the macro does not load
// either register itself, so the caller must have set them up beforehand.
#define SSE2_EXPAND_S85(i) \
AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_rounds) \
AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
AS2( add REG_rounds, 1) \
AS2( adc REG_temp32, 0)
ASL(1)
AS2( mov REG_rounds, dword ptr [REG_state + 8*4])
AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
SSE2_EXPAND_S85(0)
SSE2_EXPAND_S85(1)
SSE2_EXPAND_S85(2)
SSE2_EXPAND_S85(3)
AS2( mov dword ptr [REG_state + 8*4], REG_rounds)
AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
// SSE2_QUARTER_ROUND(a, b, d, i): xmm<b> ^= rotl32(xmm<a> + xmm<d>, i), applied
// independently to each 32-bit lane. The rotation is assembled from a left
// shift by i and a right shift by 32-i; the two halves never overlap, so they
// are XORed into xmm<b> one after the other instead of being ORed together
// first. Clobbers xmm4 and xmm5; xmm<a> and xmm<d> are not modified.
#define SSE2_QUARTER_ROUND(a, b, d, i) \
AS2( movdqa xmm4, xmm##d) \
AS2( paddd xmm4, xmm##a) \
AS2( movdqa xmm5, xmm4) \
AS2( pslld xmm4, i) \
AS2( psrld xmm5, 32-i) \
AS2( pxor xmm##b, xmm4) \
AS2( pxor xmm##b, xmm5)
// L01..L32: the 32 individual instructions of one full quarter-round that
// operates on 16-byte workspace slots instead of keeping all values in
// registers. Each step is its own macro so that SSE2_QUARTER_ROUND_X8/X16 can
// emit the same step for several independent quarter-rounds back to back.
//   A, B, C, D - numbers of the four xmm registers used as scratch
//   a, b, c, d - workspace slot indices of the four operands (y0, y1, y2, y3)
//   i          - 0 or 1; every LOAD from the workspace is offset by i*256
//                bytes, while every STORE always goes to offset 0. With i == 1
//                the inputs are therefore read from the region at +256 and the
//                results land in the region at +0.
// The y*/z* names in the trailing comments denote a slot's value before (y)
// and after (z) the quarter-round.

// --- z1 = y1 ^ rotl32(y0 + y3, 7); stored to slot b ---
#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
// --- z2 = y2 ^ rotl32(z1 + y0, 9); stored to slot c (xmm<B> keeps z1) ---
#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
// --- z3 = y3 ^ rotl32(z2 + z1, 13); stored to slot d (xmm<D> keeps z2) ---
#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
// --- z0 = y0 ^ rotl32(z3 + z2, 18); stored to slot a (y0 is still in xmm<C>) ---
#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
// SSE2_QUARTER_ROUND_X8(i, a,b,c,d, e,f,g,h): perform two independent
// quarter-rounds in lock step. Every step L01..L32 is emitted twice in a row:
// first for workspace slots a,b,c,d with scratch registers xmm0-xmm3, then for
// slots e,f,g,h with scratch registers xmm4-xmm7. i is forwarded unchanged to
// every step (it selects the load offset, see L01..L32). Clobbers xmm0-xmm7.
// NOTE(review): the interleaving is presumably there to let the CPU overlap
// the two dependency chains - confirm with the author/benchmarks.
#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
// SSE2_QUARTER_ROUND_X16(i, a..d, e..h, A..D, E..H): perform four independent
// quarter-rounds in lock step. Every step L01..L32 is emitted four times in a
// row, once per slot group, with scratch registers xmm0-xmm3, xmm4-xmm7,
// xmm8-xmm11 and xmm12-xmm15 respectively. Because it names xmm8-xmm15 it is
// only usable where those registers exist; in this file it is selected under
// CRYPTOPP_BOOL_X64. i is forwarded unchanged to every step (load offset
// selector, see L01..L32). Clobbers xmm0-xmm15.
#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
#if CRYPTOPP_BOOL_X64
SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
#else
SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
#endif
SSE2_LOAD_ROUNDS
ASJ( jmp, 2, f)
// Assembly-local subroutine (entered via "call", left via "ret"; straight-line
// execution jumps over it). Input: four vectors in xmm4, xmm5, xmm6, xmm7.
// It transposes them as a 4x4 matrix of 32-bit words, so that each result
// vector holds the same lane taken from all four inputs:
//   xmm4 = lane 0 (e), xmm2 = lane 1 (f), xmm0 = lane 2 (g), xmm6 = lane 3 (h)
// and then hands those four registers to AS_XMM_OUTPUT4.
// Clobbers xmm0, xmm1, xmm2 in addition to the inputs.
// NOTE(review): AS_XMM_OUTPUT4 is defined outside this view; presumably it
// XORs with REG_input when present, writes to REG_output and advances the
// pointers - confirm against its definition.
ASL(SSE2_Salsa_Output)
AS2( movdqa xmm0, xmm4)
AS2( punpckldq xmm4, xmm5)
AS2( movdqa xmm1, xmm6)
AS2( punpckldq xmm6, xmm7)
AS2( movdqa xmm2, xmm4)
AS2( punpcklqdq xmm4, xmm6) // e: lane 0 of xmm4..xmm7
AS2( punpckhqdq xmm2, xmm6) // f: lane 1 of xmm4..xmm7
AS2( punpckhdq xmm0, xmm5)
AS2( punpckhdq xmm1, xmm7)
AS2( movdqa xmm6, xmm0)
AS2( punpcklqdq xmm0, xmm1) // g: lane 2 of xmm4..xmm7
AS2( punpckhqdq xmm6, xmm1) // h: lane 3 of xmm4..xmm7
AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
AS1( ret)
ASL(6)
#if CRYPTOPP_BOOL_X64
SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
ASL(2)
SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
#else
SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
ASL(2)
SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
#endif
AS2( sub REG_rounds, 2)
ASJ( jnz, 6, b)
// SSE2_OUTPUT_4(a, b, c, d): for each of the workspace slots a, b, c, d, load
// the 16-byte value stored in the region at byte offset +256 and add to it
// (packed 32-bit add) the value of the same slot in the region at offset 0,
// leaving the four sums in xmm4, xmm5, xmm6 and xmm7 in that order. Then call
// the routine at label SSE2_Salsa_Output, which consumes xmm4-xmm7.
// Clobbers xmm4-xmm7 plus whatever the called routine clobbers.
#define SSE2_OUTPUT_4(a, b, c, d) \
AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
ASC( call, SSE2_Salsa_Output)
SSE2_OUTPUT_4(0, 13, 10, 7)
SSE2_OUTPUT_4(4, 1, 14, 11)
SSE2_OUTPUT_4(8, 5, 2, 15)
SSE2_OUTPUT_4(12, 9, 6, 3)
AS2( test REG_input, REG_input)
ASJ( jz, 9, f)
AS2( add REG_input, 12*16)
ASL(9)
AS2( add REG_output, 12*16)
AS2( sub REG_iterationCount, 4)
AS2( cmp REG_iterationCount, 4)
ASJ( jge, 1, b)
AS_POP_IF86( sp)
ASL(5)
AS2( sub REG_iterationCount, 1)
ASJ( jl, 4, f)
AS2( movdqa xmm0, [REG_state + 0*16])
AS2( movdqa xmm1, [REG_state + 1*16])
AS2( movdqa xmm2, [REG_state + 2*16])
AS2( movdqa xmm3, [REG_state + 3*16])
SSE2_LOAD_ROUNDS
ASL(0)
SSE2_QUARTER_ROUND(0, 1, 3, 7)
SSE2_QUARTER_ROUND(1, 2, 0, 9)
SSE2_QUARTER_ROUND(2, 3, 1, 13)
SSE2_QUARTER_ROUND(3, 0, 2, 18)
ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
SSE2_QUARTER_ROUND(0, 3, 1, 7)
SSE2_QUARTER_ROUND(3, 2, 0, 9)
SSE2_QUARTER_ROUND(2, 1, 3, 13)
SSE2_QUARTER_ROUND(1, 0, 2, 18)
ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
AS2( sub REG_rounds, 2)
ASJ( jnz, 0, b)
AS2( paddd xmm0, [REG_state + 0*16])
AS2( paddd xmm1, [REG_state + 1*16])
AS2( paddd xmm2, [REG_state + 2*16])
AS2( paddd xmm3, [REG_state + 3*16])
AS2( add dword ptr [REG_state + 8*4], 1)
AS2( adc dword ptr [REG_state + 5*4], 0)
AS2( pcmpeqb xmm6, xmm6) // all ones
AS2( psrlq xmm6, 32) // lo32 mask
ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
AS2( movdqa xmm4, xmm0)
AS2( movdqa xmm5, xmm3)
AS2( pand xmm0, xmm7)
AS2( pand xmm4, xmm6)
AS2( pand xmm3, xmm6)
AS2( pand xmm5, xmm7)
AS2( por xmm4, xmm5) // 0,13,2,15
AS2( movdqa xmm5, xmm1)
AS2( pand xmm1, xmm7)
AS2( pand xmm5, xmm6)
AS2( por xmm0, xmm5) // 4,1,6,3
AS2( pand xmm6, xmm2)
AS2( pand xmm2, xmm7)
AS2( por xmm1, xmm6) // 8,5,10,7
AS2( por xmm2, xmm3) // 12,9,14,11
AS2( movdqa xmm5, xmm4)
AS2( movdqa xmm6, xmm0)
AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
// output keystream
AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
ASJ( jmp, 5, b)
ASL(4)
#ifdef __GNUC__
AS_POP_IF86( bx)
".att_syntax prefix;"
:
#if CRYPTOPP_BOOL_X64
: "r" (r), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace)
: "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
#else
: "m" (r), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
: "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 0200h]
movdqa xmm7, [rsp + 0210h]
movdqa xmm8, [rsp + 0220h]
movdqa xmm9, [rsp + 0230h]
movdqa xmm10, [rsp + 0240h]
movdqa xmm11, [rsp + 0250h]
movdqa xmm12, [rsp + 0260h]
movdqa xmm13, [rsp + 0270h]
movdqa xmm14, [rsp + 0280h]
movdqa xmm15, [rsp + 0290h]
add rsp, 10*16 + 32*16 + 8
ret
Salsa20_OperateKeystream ENDP
#else
}
else
#endif
#endif
#ifndef CRYPTOPP_GENERATE_X64_MASM
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
while (iterationCount--) while (iterationCount--)
@ -289,7 +513,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
x14 = m_state[14]; x14 = m_state[14];
x15 = m_state[15]; x15 = m_state[15];
for (i=m_rounds; i>0; i-=2) for (int i=m_rounds; i>0; i-=2)
{ {
#define QUARTER_ROUND(a, b, c, d) \ #define QUARTER_ROUND(a, b, c, d) \
b = b ^ rotlFixed(a + d, 7); \ b = b ^ rotlFixed(a + d, 7); \
@ -333,6 +557,9 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
if (++m_state[8] == 0) if (++m_state[8] == 0)
++m_state[5]; ++m_state[5];
} }
}
} // see comment above if an internal compiler error occurs here } // see comment above if an internal compiler error occurs here
NAMESPACE_END NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

View File

@ -1,12 +1,21 @@
// sosemanuk.cpp - written and placed in the public domain by Wei Dai // sosemanuk.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
#include "pch.h" #include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "sosemanuk.h" #include "sosemanuk.h"
#include "misc.h" #include "misc.h"
#include "cpu.h" #include "cpu.h"
#include "serpentp.h" #include "serpentp.h"
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen) void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
@ -74,7 +83,8 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7); m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
} }
static word32 s_mulTables[512] = { extern "C" {
word32 s_sosemanukMulTables[512] = {
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836, 0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E, 0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
@ -271,7 +281,7 @@ static word32 s_mulTables[512] = {
0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84, 0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2 0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
}; };
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int SosemanukPolicy::GetAlignment() const unsigned int SosemanukPolicy::GetAlignment() const
@ -303,11 +313,36 @@ unsigned int SosemanukPolicy::GetOptimalBlockSize() const
} }
#endif #endif
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
}
#endif
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{ {
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
return;
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Sosemanuk_OperateKeystream PROC FRAME
rex_push_reg rsi
push_reg rdi
alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
save_xmm128 xmm6, 02f0h
save_xmm128 xmm7, 0300h
.endprolog
mov rdi, r8
mov rax, r9
#else
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4 if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
#else #else
@ -315,10 +350,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#endif #endif
{ {
#ifdef __GNUC__ #ifdef __GNUC__
#if CRYPTOPP_BOOL_X64
__m128i workspace[(80*4*2+12*4+8*WORD_SZ)/16];
#endif
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS_PUSH( bx) AS_PUSH_IF86( bx)
#else #else
word32 *state = m_state; word32 *state = m_state;
AS2( mov WORD_REG(ax), state) AS2( mov WORD_REG(ax), state)
@ -326,22 +364,31 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( mov WORD_REG(dx), input) AS2( mov WORD_REG(dx), input)
AS2( mov WORD_REG(cx), iterationCount) AS2( mov WORD_REG(cx), iterationCount)
#endif #endif
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ] #if defined(__GNUC__) && CRYPTOPP_BOOL_X64
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ] #define SSE2_workspace %5
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ] #else
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ] #define SSE2_workspace WORD_REG(sp)
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ] #endif
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ] #define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ]
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ #define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ]
#define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ]
#define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ]
#define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ]
#define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ]
#define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ]
#define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ
#define SSE2_uvStart SSE2_stateCopy + 12*4 #define SSE2_uvStart SSE2_stateCopy + 12*4
AS_PUSH( bp) #if CRYPTOPP_BOOL_X86
AS2( mov WORD_REG(bx), WORD_REG(sp)) AS_PUSH_IF86( bp)
AS2( and WORD_REG(sp), -16) AS2( mov AS_REG_6, esp)
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals AS2( and esp, -16)
AS2( mov [WORD_REG(sp)], WORD_REG(bx)) AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [esp], AS_REG_6)
#endif
AS2( mov SSE2_output, WORD_REG(di)) AS2( mov SSE2_output, WORD_REG(di))
AS2( mov SSE2_input, WORD_REG(dx)) AS2( mov SSE2_input, WORD_REG(dx))
AS2( mov SSE2_state, WORD_REG(ax)) AS2( mov SSE2_state, WORD_REG(ax))
@ -358,7 +405,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16]) AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0) AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
AS2( psrlq xmm0, 32) AS2( psrlq xmm0, 32)
AS2( movd ebx, xmm0) // s(9) AS2( movd AS_REG_6d, xmm0) // s(9)
AS2( mov ecx, [WORD_REG(ax)+10*4]) AS2( mov ecx, [WORD_REG(ax)+10*4])
AS2( mov edx, [WORD_REG(ax)+11*4]) AS2( mov edx, [WORD_REG(ax)+11*4])
AS2( pcmpeqb xmm7, xmm7) // all ones AS2( pcmpeqb xmm7, xmm7) // all ones
@ -367,35 +414,35 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 #define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4 #define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
#define r10 ecx #define R10 ecx
#define r11 edx #define R11 edx
#define r20 edx #define R20 edx
#define r21 ecx #define R21 ecx
#define SSE2_STEP(i, j) \ #define SSE2_STEP(i, j) \
AS2( mov eax, [s(i+0)])\ AS2( mov eax, [s(i+0)])\
AS2( mov [v(i)], eax)\ AS2( mov [v(i)], eax)\
AS2( rol eax, 8)\ AS2( rol eax, 8)\
AS2( lea ebp, [ebx + r2##j])\ AS2( lea AS_REG_7d, [AS_REG_6d + R2##j])\
AS2( xor ebp, r1##j)\ AS2( xor AS_REG_7d, R1##j)\
AS2( mov [u(i)], ebp)\ AS2( mov [u(i)], AS_REG_7d)\
AS2( mov ebp, 1)\ AS2( mov AS_REG_7d, 1)\
AS2( and ebp, r2##j)\ AS2( and AS_REG_7d, R2##j)\
AS1( neg ebp)\ AS1( neg AS_REG_7d)\
AS2( and ebp, ebx)\ AS2( and AS_REG_7d, AS_REG_6d)\
AS2( xor ebx, eax)\ AS2( xor AS_REG_6d, eax)\
AS2( movzx eax, al)\ AS2( movzx eax, al)\
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\ AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
AS2( mov eax, [s(i+3)])\ AS2( mov eax, [s(i+3)])\
AS2( xor ebp, [s(i+2)])\ AS2( xor AS_REG_7d, [s(i+2)])\
AS2( add r1##j, ebp)\ AS2( add R1##j, AS_REG_7d)\
AS2( movzx ebp, al)\ AS2( movzx AS_REG_7d, al)\
AS2( shr eax, 8)\ AS2( shr eax, 8)\
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\ AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
AS2( xor ebx, eax)\ AS2( xor AS_REG_6d, eax)\
AS2( imul r2##j, 0x54655307)\ AS2( imul R2##j, AS_HEX(54655307))\
AS2( rol r2##j, 7)\ AS2( rol R2##j, 7)\
AS2( mov [s(i+0)], ebx)\ AS2( mov [s(i+0)], AS_REG_6d)\
ASL(2) // outer loop, each iteration of this processes 80 words ASL(2) // outer loop, each iteration of this processes 80 words
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
@ -406,7 +453,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
AS2( mov SSE2_diEnd, WORD_REG(si)) AS2( mov SSE2_diEnd, WORD_REG(si))
#ifdef _MSC_VER #ifdef _MSC_VER
AS2( lea WORD_REG(si), s_mulTables) AS2( lea WORD_REG(si), s_sosemanukMulTables)
#else #else
AS2( mov WORD_REG(si), SSE2_pMulTables) AS2( mov WORD_REG(si), SSE2_pMulTables)
#endif #endif
@ -438,7 +485,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jne, 0, b) ASJ( jne, 0, b)
AS2( mov WORD_REG(ax), SSE2_input) AS2( mov WORD_REG(ax), SSE2_input)
AS2( mov WORD_REG(bp), SSE2_output) AS2( mov AS_REG_7, SSE2_output)
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov WORD_REG(si), SSE2_wordsLeft2) AS2( mov WORD_REG(si), SSE2_wordsLeft2)
@ -487,43 +534,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( punpcklqdq xmm6, xmm5) AS2( punpcklqdq xmm6, xmm5)
AS2( punpckhqdq xmm3, xmm5) AS2( punpckhqdq xmm3, xmm5)
// output keystream // output keystream
AS2( test WORD_REG(ax), WORD_REG(ax)) AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
ASJ( jz, 3, f)
AS2( test eax, 0xf)
ASJ( jnz, 7, f)
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
AS2( add WORD_REG(ax), 4*16)
ASJ( jmp, 3, f)
ASL(7)
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
AS2( pxor xmm2, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
AS2( pxor xmm0, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
AS2( pxor xmm6, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
AS2( pxor xmm3, xmm1)
AS2( add WORD_REG(ax), 4*16)
ASL(3)
AS2( test ebp, 0xf)
ASJ( jnz, 8, f)
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
ASJ( jmp, 9, f)
ASL(8)
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
ASL(9)
// loop // loop
AS2( add WORD_REG(di), 4*4) AS2( add WORD_REG(di), 4*4)
AS2( add WORD_REG(bp), 4*16)
AS2( sub WORD_REG(si), 16) AS2( sub WORD_REG(si), 16)
ASJ( jnz, 1, b) ASJ( jnz, 1, b)
@ -533,29 +547,29 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jz, 6, f) ASJ( jz, 6, f)
AS2( mov SSE2_wordsLeft, WORD_REG(si)) AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( mov SSE2_input, WORD_REG(ax)) AS2( mov SSE2_input, WORD_REG(ax))
AS2( mov SSE2_output, WORD_REG(bp)) AS2( mov SSE2_output, AS_REG_7)
ASJ( jmp, 2, b) ASJ( jmp, 2, b)
ASL(4) // final output of less than 16 words ASL(4) // final output of less than 16 words
AS2( test WORD_REG(ax), WORD_REG(ax)) AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 5, f) ASJ( jz, 5, f)
AS2( movd xmm0, [WORD_REG(ax)+0*4]) AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4])
AS2( pxor xmm2, xmm0) AS2( pxor xmm2, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+1*4]) AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4])
AS2( pxor xmm3, xmm0) AS2( pxor xmm3, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+2*4]) AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4])
AS2( pxor xmm1, xmm0) AS2( pxor xmm1, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+3*4]) AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4])
AS2( pxor xmm4, xmm0) AS2( pxor xmm4, xmm0)
AS2( add WORD_REG(ax), 16) AS2( add WORD_REG(ax), 16)
ASL(5) ASL(5)
AS2( movd [WORD_REG(bp)+0*4], xmm2) AS2( movd dword ptr [AS_REG_7+0*4], xmm2)
AS2( movd [WORD_REG(bp)+1*4], xmm3) AS2( movd dword ptr [AS_REG_7+1*4], xmm3)
AS2( movd [WORD_REG(bp)+2*4], xmm1) AS2( movd dword ptr [AS_REG_7+2*4], xmm1)
AS2( movd [WORD_REG(bp)+3*4], xmm4) AS2( movd dword ptr [AS_REG_7+3*4], xmm4)
AS2( sub WORD_REG(si), 4) AS2( sub WORD_REG(si), 4)
ASJ( jz, 6, f) ASJ( jz, 6, f)
AS2( add WORD_REG(bp), 16) AS2( add AS_REG_7, 16)
AS2( psrldq xmm2, 4) AS2( psrldq xmm2, 4)
AS2( psrldq xmm3, 4) AS2( psrldq xmm3, 4)
AS2( psrldq xmm1, 4) AS2( psrldq xmm1, 4)
@ -563,38 +577,52 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jmp, 4, b) ASJ( jmp, 4, b)
ASL(6) // save state ASL(6) // save state
AS2( mov WORD_REG(bx), SSE2_state) AS2( mov AS_REG_6, SSE2_state)
AS2( movdqa xmm0, [SSE2_stateCopy+0*16]) AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
AS2( movdqa [WORD_REG(bx)+0*16], xmm0) AS2( movdqa [AS_REG_6+0*16], xmm0)
AS2( movdqa xmm0, [SSE2_stateCopy+1*16]) AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
AS2( movdqa [WORD_REG(bx)+1*16], xmm0) AS2( movdqa [AS_REG_6+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16]) AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0) AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
AS2( mov [WORD_REG(bx)+10*4], ecx) AS2( mov [AS_REG_6+10*4], ecx)
AS2( mov [WORD_REG(bx)+11*4], edx) AS2( mov [AS_REG_6+11*4], edx)
AS_POP( sp) AS_POP_IF86( sp)
AS_POP( bp) AS_POP_IF86( bp)
#ifdef __GNUC__ #ifdef __GNUC__
AS_POP( bx) AS_POP_IF86( bx)
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input) : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
: "memory", "cc" #if CRYPTOPP_BOOL_X64
, "r" (workspace)
#endif
: "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
); );
#endif #endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 02f0h]
movdqa xmm7, [rsp + 0300h]
add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
pop rdi
pop rsi
ret
Sosemanuk_OperateKeystream ENDP
#else
} }
else else
#endif #endif
#endif
#ifndef CRYPTOPP_GENERATE_X64_MASM
{ {
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)]) #define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
#else #else
#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24]) #define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
#endif #endif
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)]) #define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
#define r1(i) ((i%2) ? reg2 : reg1) #define r1(i) ((i%2) ? reg2 : reg1)
#define r2(i) ((i%2) ? reg1 : reg2) #define r2(i) ((i%2) ? reg1 : reg2)
@ -676,3 +704,5 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
} }
NAMESPACE_END NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

View File

@ -1,7 +1,7 @@
// whrlpool.cpp - originally modified by Kevin Springle from // whrlpool.cpp - originally modified by Kevin Springle from
// Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c. // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c.
// Updated to Whirlpool version 3.0, optimized and SSE version added by Wei Dai // Updated to Whirlpool version 3.0, optimized and SSE version added by Wei Dai
// Any modifications are placed in the public domain // All modifications are placed in the public domain
// This is the original introductory comment: // This is the original introductory comment:
@ -71,6 +71,10 @@
#include "misc.h" #include "misc.h"
#include "cpu.h" #include "cpu.h"
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
void Whirlpool_TestInstantiations() void Whirlpool_TestInstantiations()
@ -395,29 +399,37 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
{ {
// MMX version has the same structure as C version below // MMX version has the same structure as C version below
#ifdef __GNUC__ #ifdef __GNUC__
#if CRYPTOPP_BOOL_X64
__m128i workspace[8];
#endif
__asm__ __volatile__ __asm__ __volatile__
( (
".intel_syntax noprefix;" ".intel_syntax noprefix;"
AS_PUSH( bx) AS_PUSH_IF86( bx)
AS2( mov WORD_REG(bx), WORD_REG(ax)) AS2( mov AS_REG_6, WORD_REG(ax))
#else #else
#if _MSC_VER < 1300 #if _MSC_VER < 1300
AS_PUSH( bx) AS_PUSH_IF86( bx)
#endif #endif
AS2( lea WORD_REG(bx), [Whirlpool_C]) AS2( lea AS_REG_6, [Whirlpool_C])
AS2( mov WORD_REG(cx), digest) AS2( mov WORD_REG(cx), digest)
AS2( mov WORD_REG(dx), block) AS2( mov WORD_REG(dx), block)
#endif #endif
AS2( mov WORD_REG(ax), WORD_REG(sp)) #if CRYPTOPP_BOOL_X86
AS2( and WORD_REG(sp), -16) AS2( mov eax, esp)
AS2( sub WORD_REG(sp), 16*8) AS2( and esp, -16)
AS_PUSH( ax) AS2( sub esp, 16*8)
AS1( push eax)
#define SSE2_workspace esp+WORD_SZ
#else
#define SSE2_workspace %3
#endif
AS2( xor esi, esi) AS2( xor esi, esi)
ASL(0) ASL(0)
AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)]) AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k AS2( movq [SSE2_workspace+8*WORD_REG(si)], mm0) // k
AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)]) AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s AS2( movq [SSE2_workspace+64+8*WORD_REG(si)], mm0) // s
AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0) AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0)
AS1( inc WORD_REG(si)) AS1( inc WORD_REG(si))
AS2( cmp WORD_REG(si), 8) AS2( cmp WORD_REG(si), 8)
@ -430,16 +442,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#define KSL1(a, b) AS2(pxor mm##a, b) #define KSL1(a, b) AS2(pxor mm##a, b)
#define KSL(op, i, a, b, c, d) \ #define KSL(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\ AS2(mov eax, [SSE2_workspace+8*i])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
#define KSH0(a, b) \ #define KSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
@ -448,57 +460,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define KSH2(a, b) \ #define KSH2(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a) AS2(movq [SSE2_workspace+8*a], mm##a)
#define KSH(op, i, a, b, c, d) \ #define KSH(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\ AS2(mov eax, [SSE2_workspace+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ KSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ KSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ KSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) KSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
#define TSL(op, i, a, b, c, d) \ #define TSL(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\ AS2(mov eax, [SSE2_workspace+64+8*i])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
#define TSH0(a, b) \ #define TSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\ ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\ AS2(pxor mm##a, [SSE2_workspace+8*a])\
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define TSH1(a, b) \ #define TSH1(a, b) \
AS2(pxor mm##a, b) AS2(pxor mm##a, b)
#define TSH2(a, b) \ #define TSH2(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a) AS2(movq [SSE2_workspace+64+8*a], mm##a)
#define TSH3(a, b) \ #define TSH3(a, b) \
AS2(pxor mm##a, b)\ AS2(pxor mm##a, b)\
AS2(pxor mm##a, [WORD_REG(cx)+8*a])\ AS2(pxor mm##a, [WORD_REG(cx)+8*a])\
AS2(movq [WORD_REG(cx)+8*a], mm##a) AS2(movq [WORD_REG(cx)+8*a], mm##a)
#define TSH(op, i, a, b, c, d) \ #define TSH(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\ AS2(mov eax, [SSE2_workspace+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\ TSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\ AS2(movzx edi, ah)\
TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\ TSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\ AS2(shr eax, 16)\
AS2(movzx edi, al)\ AS2(movzx edi, al)\
AS2(shr eax, 8)\ AS2(shr eax, 8)\
TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\ TSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)]) TSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
KSL(0, 4, 3, 2, 1, 0) KSL(0, 4, 3, 2, 1, 0)
KSL(0, 0, 7, 6, 5, 4) KSL(0, 0, 7, 6, 5, 4)
@ -517,8 +529,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
KSH(2, 3, 2, 1, 0, 7) KSH(2, 3, 2, 1, 0, 7)
KSH(2, 7, 6, 5, 4, 3) KSH(2, 7, 6, 5, 4, 3)
AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8]) AS2( pxor mm0, [AS_REG_6 + 8*1024 + WORD_REG(si)*8])
AS2( movq [WORD_REG(sp)+WORD_SZ], mm0) AS2( movq [SSE2_workspace], mm0)
TSL(0, 4, 3, 2, 1, 0) TSL(0, 4, 3, 2, 1, 0)
TSL(0, 0, 7, 6, 5, 4) TSL(0, 0, 7, 6, 5, 4)
@ -553,17 +565,23 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#undef TSL #undef TSL
#undef TSH #undef TSH
AS_POP( sp) AS_POP_IF86( sp)
AS1( emms) AS1( emms)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300) #if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx) AS_POP_IF86( bx)
#endif #endif
#ifdef __GNUC__ #ifdef __GNUC__
".att_syntax prefix;" ".att_syntax prefix;"
: :
: "a" (Whirlpool_C), "c" (digest), "d" (block) : "a" (Whirlpool_C), "c" (digest), "d" (block)
#if CRYPTOPP_BOOL_X64
, "r" (workspace)
#endif
: "%esi", "%edi", "memory", "cc" : "%esi", "%edi", "memory", "cc"
#if CRYPTOPP_BOOL_X64
, "%r9"
#endif
); );
#endif #endif
} }

File diff suppressed because it is too large Load Diff