- port x64 assembly code to MASM

- improve stack unwindability on x64 for GCC by not modifying RBP/RSP registers in inline assembly
pull/2/head
weidai 2007-09-24 00:43:57 +00:00
parent 1921a557dc
commit 23accd43c5
7 changed files with 2833 additions and 513 deletions

cpu.h

@@ -1,6 +1,15 @@
#ifndef CRYPTOPP_CPU_H
#define CRYPTOPP_CPU_H
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define CRYPTOPP_X86_ASM_AVAILABLE
#define CRYPTOPP_BOOL_X64 1
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
#define NAMESPACE_END
#else
#include "config.h"
#ifdef CRYPTOPP_MSVC6PP_OR_LATER
@@ -98,7 +107,18 @@ inline bool HasMMX() {return false;}
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400
#if defined(__GNUC__)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define AS1(x) x*newline*
#define AS2(x, y) x, y*newline*
#define AS3(x, y, z) x, y, z*newline*
#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
#define ASL(x) label##x:*newline*
#define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline*
#define AS_HEX(y) y##h
#elif defined(__GNUC__)
// define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";"
@@ -113,6 +133,7 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y
#else
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
@@ -122,25 +143,115 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define ASM_MOD(x, y) ((x) MOD (y))
#else
// GNU assembler doesn't seem to have a mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
#endif
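To see the three-way macro scheme concretely, here is how a single shared source line fans out (an illustration of mine, not part of the commit; the *newline* token in the MASM branch is presumably replaced with real line breaks when the preprocessor output is post-processed into a .asm file):

// Shared source line:
//     AS2( mov AS_REG_5d, AS_HEX(1f))
// Under CRYPTOPP_GENERATE_X64_MASM, textual MASM output:
//     mov eax, 1fh*newline*
// Under GCC, a string pasted into the __asm__ body:
//     "mov eax, 0x1f;"
// Under MSVC inline assembly:
//     __asm { mov eax, 0x1f }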
#if CRYPTOPP_BOOL_X86
#define AS_REG_1 ecx
#define AS_REG_2 edx
#define AS_REG_3 esi
#define AS_REG_4 edi
#define AS_REG_5 eax
#define AS_REG_6 ebx
#define AS_REG_7 ebp
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d esi
#define AS_REG_4d edi
#define AS_REG_5d eax
#define AS_REG_6d ebx
#define AS_REG_7d ebp
#define WORD_SZ 4
#define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR
#define AS_PUSH(x) AS1(push e##x)
#define AS_POP(x) AS1(pop e##x)
#define AS_PUSH_IF86(x) AS1(push e##x)
#define AS_POP_IF86(x) AS1(pop e##x)
#define AS_JCXZ jecxz
#elif CRYPTOPP_BOOL_X64
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define AS_REG_1 rcx
#define AS_REG_2 rdx
#define AS_REG_3 r8
#define AS_REG_4 r9
#define AS_REG_5 rax
#define AS_REG_6 r10
#define AS_REG_7 r11
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d r8d
#define AS_REG_4d r9d
#define AS_REG_5d eax
#define AS_REG_6d r10d
#define AS_REG_7d r11d
#else
#define AS_REG_1 rdi
#define AS_REG_2 rsi
#define AS_REG_3 rdx
#define AS_REG_4 rcx
#define AS_REG_5 r8
#define AS_REG_6 r9
#define AS_REG_7 r10
#define AS_REG_1d edi
#define AS_REG_2d esi
#define AS_REG_3d edx
#define AS_REG_4d ecx
#define AS_REG_5d r8d
#define AS_REG_6d r9d
#define AS_REG_7d r10d
#endif
#define WORD_SZ 8
#define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR
#define AS_PUSH(x) AS1(pushq r##x)
#define AS_POP(x) AS1(popq r##x)
#define AS_PUSH_IF86(x)
#define AS_POP_IF86(x)
#define AS_JCXZ jrcxz
#endif
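The three register maps follow each platform's calling convention: the x64 MASM branch puts AS_REG_1..AS_REG_4 in the Microsoft x64 argument registers (rcx, rdx, r8, r9), the x64 GCC branch uses the System V argument registers (rdi, rsi, rdx, rcx, r8, r9), and AS_REG_5..AS_REG_7 are caller-saved scratch registers in both, so nothing needs to be pushed. That is also why AS_PUSH_IF86/AS_POP_IF86 expand to nothing on x64: per the commit message, the x64 paths now leave rsp and rbp alone inside the asm so the stack stays unwindable. A small sketch of the payoff (mine, not the commit's code):

// A hypothetical C-linkage function: 'count' arrives in rcx on Win64
// and in rdi on System V x86-64; in both cases that is AS_REG_1, so
// the shared asm body needs no per-ABI argument shuffling:
//     void f(size_t count, word32 *state);
//     AS2( shl AS_REG_1, 5)   // count *= 32, same text on either ABI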
// helper macro for stream cipher output
#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\
AS2( test inputPtr, inputPtr)\
ASC( jz, labelPrefix##3)\
AS2( test inputPtr, 15)\
ASC( jnz, labelPrefix##7)\
AS2( pxor xmm##x0, [inputPtr+p0*16])\
AS2( pxor xmm##x1, [inputPtr+p1*16])\
AS2( pxor xmm##x2, [inputPtr+p2*16])\
AS2( pxor xmm##x3, [inputPtr+p3*16])\
AS2( add inputPtr, increment*16)\
ASC( jmp, labelPrefix##3)\
ASL(labelPrefix##7)\
AS2( movdqu xmm##t, [inputPtr+p0*16])\
AS2( pxor xmm##x0, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p1*16])\
AS2( pxor xmm##x1, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p2*16])\
AS2( pxor xmm##x2, xmm##t)\
AS2( movdqu xmm##t, [inputPtr+p3*16])\
AS2( pxor xmm##x3, xmm##t)\
AS2( add inputPtr, increment*16)\
ASL(labelPrefix##3)\
AS2( test outputPtr, 15)\
ASC( jnz, labelPrefix##8)\
AS2( movdqa [outputPtr+p0*16], xmm##x0)\
AS2( movdqa [outputPtr+p1*16], xmm##x1)\
AS2( movdqa [outputPtr+p2*16], xmm##x2)\
AS2( movdqa [outputPtr+p3*16], xmm##x3)\
ASC( jmp, labelPrefix##9)\
ASL(labelPrefix##8)\
AS2( movdqu [outputPtr+p0*16], xmm##x0)\
AS2( movdqu [outputPtr+p1*16], xmm##x1)\
AS2( movdqu [outputPtr+p2*16], xmm##x2)\
AS2( movdqu [outputPtr+p3*16], xmm##x3)\
ASL(labelPrefix##9)\
AS2( add outputPtr, increment*16)
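In intrinsics terms, AS_XMM_OUTPUT4 implements the following logic (a simplified sketch of mine, with the block positions p0..p3 fixed to 0..3; the real macro tests pointer alignment at runtime to choose between movdqa and movdqu, while this sketch just uses unaligned accesses):

#include <emmintrin.h>
typedef unsigned char byte_t;   // stand-in for Crypto++'s byte
static inline void xmm_output4(const byte_t *&in, byte_t *&out,
    __m128i &x0, __m128i &x1, __m128i &x2, __m128i &x3, int increment)
{
    if (in)   // a null input pointer means plain keystream, no XOR
    {
        const __m128i *p = (const __m128i *)in;
        x0 = _mm_xor_si128(x0, _mm_loadu_si128(p+0));
        x1 = _mm_xor_si128(x1, _mm_loadu_si128(p+1));
        x2 = _mm_xor_si128(x2, _mm_loadu_si128(p+2));
        x3 = _mm_xor_si128(x3, _mm_loadu_si128(p+3));
        in += increment*16;
    }
    __m128i *q = (__m128i *)out;
    _mm_storeu_si128(q+0, x0);
    _mm_storeu_si128(q+1, x1);
    _mm_storeu_si128(q+2, x2);
    _mm_storeu_si128(q+3, x3);
    out += increment*16;
}

The Panama, Salsa20 and Sosemanuk keystream code in this commit invokes the macro with different register and block-position assignments.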
NAMESPACE_END
#endif

panama.cpp

@@ -1,6 +1,11 @@
// panama.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM panama.cpp" to generate MASM code
#include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "panama.h"
#include "misc.h"
#include "cpu.h"
@@ -16,41 +21,67 @@ void Panama<B>::Reset()
#endif
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y);
}
#elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
Panama_SSE2_Pull PROC FRAME
alloc_stack(2*16+8)
save_xmm128 xmm6, 0h
save_xmm128 xmm7, 10h
.endprolog
#else
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
{
#ifdef __GNUC__
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS_PUSH( bx)
AS_POP_IF86( bx)
#else
AS2( mov WORD_REG(cx), count)
AS2( mov WORD_REG(si), state)
AS2( mov WORD_REG(di), z)
AS2( mov WORD_REG(dx), y)
AS2( mov AS_REG_1, count)
AS2( mov AS_REG_2, state)
AS2( mov AS_REG_3, z)
AS2( mov AS_REG_4, y)
#endif
AS2( shl WORD_REG(cx), 5)
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
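The PROC FRAME block above is MASM's x64 unwind annotation: alloc_stack and save_xmm128 emit the actual prologue instructions and simultaneously record them in the .pdata/.xdata unwind tables, and .endprolog marks the end of the annotated prologue, so exceptions can unwind through the function. The size 2*16+8 is deliberate arithmetic: on entry rsp is 8 mod 16 (the call pushed an 8-byte return address), so subtracting 40 bytes realigns rsp to 16 and leaves two aligned 16-byte slots for the callee-saved xmm6 and xmm7 at [rsp+0h] and [rsp+10h]. The matching epilogue near the end of this file restores both registers, adds the same 40 bytes back, and returns.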
#if CRYPTOPP_BOOL_X86
#define REG_loopEnd [esp]
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
#define REG_loopEnd rdi
#else
#define REG_loopEnd r8
#endif
AS2( shl AS_REG_1, 5)
ASJ( jz, 5, f)
AS2( mov ebx, [WORD_REG(si)+4*17])
AS2( add WORD_REG(cx), WORD_REG(bx))
AS2( mov AS_REG_6d, [AS_REG_2+4*17])
AS2( add AS_REG_1, AS_REG_6)
AS_PUSH( bp)
AS_PUSH( cx)
#if CRYPTOPP_BOOL_X64
AS2( mov REG_loopEnd, AS_REG_1)
#else
AS1( push ebp)
AS1( push AS_REG_1)
#endif
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( mov eax, [WORD_REG(si)+4*16])
AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16])
AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16])
AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16])
AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16])
AS2( mov eax, dword ptr [AS_REG_2+4*16])
ASL(4)
// gamma and pi
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1)
AS2( test AS_REG_6, 1)
ASJ( jnz, 6, f)
#endif
AS2( movdqa xmm6, xmm2)
@@ -70,18 +101,18 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
ASL(7)
#endif
AS2( movd ecx, xmm2)
AS1( not ecx)
AS2( movd ebp, xmm3)
AS2( or ecx, ebp)
AS2( xor eax, ecx)
AS2( movd AS_REG_1d, xmm2)
AS1( not AS_REG_1d)
AS2( movd AS_REG_7d, xmm3)
AS2( or AS_REG_1d, AS_REG_7d)
AS2( xor eax, AS_REG_1d)
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
#define pi(i) \
AS2( movd ecx, xmm7)\
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
AS2( movd AS_REG_1d, xmm7)\
AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d)
#define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\
@@ -110,65 +141,65 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
// keystream
AS2( test WORD_REG(di), WORD_REG(di))
AS2( test AS_REG_3, AS_REG_3)
ASJ( jz, 0, f)
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2)
AS2( test WORD_REG(dx), 0xf)
AS2( test AS_REG_4, 15)
ASJ( jnz, 2, f)
AS2( test WORD_REG(dx), WORD_REG(dx))
AS2( test AS_REG_4, AS_REG_4)
ASJ( jz, 1, f)
AS2( pxor xmm4, [WORD_REG(dx)])
AS2( pxor xmm6, [WORD_REG(dx)+16])
AS2( add WORD_REG(dx), 32)
AS2( pxor xmm4, [AS_REG_4])
AS2( pxor xmm6, [AS_REG_4+16])
AS2( add AS_REG_4, 32)
ASJ( jmp, 1, f)
ASL(2)
AS2( movdqu xmm0, [WORD_REG(dx)])
AS2( movdqu xmm2, [WORD_REG(dx)+16])
AS2( movdqu xmm0, [AS_REG_4])
AS2( movdqu xmm2, [AS_REG_4+16])
AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2)
AS2( add WORD_REG(dx), 32)
AS2( add AS_REG_4, 32)
ASL(1)
AS2( test WORD_REG(di), 0xf)
AS2( test AS_REG_3, 15)
ASJ( jnz, 3, f)
AS2( movdqa [WORD_REG(di)], xmm4)
AS2( movdqa [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
AS2( movdqa XMMWORD PTR [AS_REG_3], xmm4)
AS2( movdqa XMMWORD PTR [AS_REG_3+16], xmm6)
AS2( add AS_REG_3, 32)
ASJ( jmp, 0, f)
ASL(3)
AS2( movdqu [WORD_REG(di)], xmm4)
AS2( movdqu [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
AS2( movdqu XMMWORD PTR [AS_REG_3], xmm4)
AS2( movdqu XMMWORD PTR [AS_REG_3+16], xmm6)
AS2( add AS_REG_3, 32)
ASL(0)
// buffer update
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
AS2( and WORD_REG(bp), 31*32)
AS2( lea AS_REG_1, [AS_REG_6 + 32])
AS2( and AS_REG_1, 31*32)
AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
AS2( and AS_REG_7, 31*32)
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
AS2( pxor xmm0, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8])
AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8])
AS2( pxor xmm1, xmm4)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
AS2( pxor xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8])
AS2( movdqa XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
// theta
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
AS2( movdqa xmm3, XMMWORD PTR [AS_REG_2+3*16])
AS2( movdqa xmm2, XMMWORD PTR [AS_REG_2+2*16])
AS2( movdqa xmm1, XMMWORD PTR [AS_REG_2+1*16])
AS2( movdqa xmm0, XMMWORD PTR [AS_REG_2+0*16])
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1)
AS2( test AS_REG_6, 1)
ASJ( jnz, 8, f)
#endif
AS2( movd xmm6, eax)
@@ -199,10 +230,10 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
#endif
AS2( xor eax, 1)
AS2( movd ecx, xmm0)
AS2( xor eax, ecx)
AS2( movd ecx, xmm3)
AS2( xor eax, ecx)
AS2( movd AS_REG_1d, xmm0)
AS2( xor eax, AS_REG_1d)
AS2( movd AS_REG_1d, xmm3)
AS2( xor eax, AS_REG_1d)
AS2( pxor xmm3, xmm2)
AS2( pxor xmm2, xmm1)
@@ -214,21 +245,21 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm4)
// sigma
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
AS2( and WORD_REG(bp), 31*32)
AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32])
AS2( and AS_REG_1, 31*32)
AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
AS2( and AS_REG_7, 31*32)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+0*16])
AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
AS2( movdqa xmm4, XMMWORD PTR [AS_REG_2+20*4+AS_REG_1+1*16])
AS2( movdqa xmm5, XMMWORD PTR [AS_REG_2+20*4+AS_REG_7+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
@@ -236,31 +267,48 @@ void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
AS2( pxor xmm0, xmm6)
// loop
AS2( add WORD_REG(bx), 32)
AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
AS2( add AS_REG_6, 32)
AS2( cmp AS_REG_6, REG_loopEnd)
ASJ( jne, 4, b)
// save state
AS2( add WORD_REG(sp), WORD_SZ)
AS_POP( bp)
AS2( mov [WORD_REG(si)+4*16], eax)
AS2( movdqa [WORD_REG(si)+3*16], xmm3)
AS2( movdqa [WORD_REG(si)+2*16], xmm2)
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
AS2( movdqa [WORD_REG(si)+0*16], xmm0)
AS2( mov [AS_REG_2+4*16], eax)
AS2( movdqa XMMWORD PTR [AS_REG_2+3*16], xmm3)
AS2( movdqa XMMWORD PTR [AS_REG_2+2*16], xmm2)
AS2( movdqa XMMWORD PTR [AS_REG_2+1*16], xmm1)
AS2( movdqa XMMWORD PTR [AS_REG_2+0*16], xmm0)
#if CRYPTOPP_BOOL_X86
AS2( add esp, 4)
AS1( pop ebp)
#endif
ASL(5)
#ifdef __GNUC__
AS_POP( bx)
AS_POP_IF86( bx)
".att_syntax prefix;"
:
: "c" (count), "S" (state), "D" (z), "d" (y)
: "%eax", "memory", "cc"
#if CRYPTOPP_BOOL_X64
: "D" (count), "S" (state), "d" (z), "c" (y)
: "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#else
: "c" (count), "d" (state), "S" (z), "D" (y)
: "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#endif
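For readers unfamiliar with GCC extended asm, the trailing colon-separated lists bind C expressions to registers (outputs, then inputs) and declare what the block clobbers; the x64 branch above pins the four arguments to the System V argument registers and lists the xmm registers as clobbered because the body uses them freely. A minimal standalone sketch of the syntax (mine, not the commit's code):

int dst, src = 42;
__asm__ __volatile__
(
    "mov %%eax, %%ecx;"     // AT&T order: copy eax into ecx
    : "=c" (dst)            // output: dst is read back from ecx
    : "a" (src)             // input: src is preloaded into eax
    : "memory", "cc"        // clobbers: don't cache memory or flags across this
);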
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 0h]
movdqa xmm7, [rsp + 10h]
add rsp, 2*16+8
ret
Panama_SSE2_Pull ENDP
#else
}
#endif
#endif // #ifdef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifndef CRYPTOPP_GENERATE_X64_MASM
template <class B>
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
@@ -411,7 +459,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
this->Iterate(1, buf);
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
Panama_SSE2_Pull(32, this->m_state, NULL, NULL);
else
@@ -423,7 +471,7 @@ void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byt
template <class B>
unsigned int PanamaCipherPolicy<B>::GetAlignment() const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
return 16;
else
@@ -435,7 +483,7 @@ unsigned int PanamaCipherPolicy<B>::GetAlignment() const
template <class B>
void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input);
else
@@ -453,3 +501,5 @@ template class PanamaCipherPolicy<BigEndian>;
template class PanamaCipherPolicy<LittleEndian>;
NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

rijndael.cpp

@@ -2,6 +2,8 @@
// and Wei Dai from Paulo Barreto's Rijndael implementation
// The original code and all modifications are in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/*
Defense against timing attacks was added in July 2006 by Wei Dai.
@@ -48,6 +50,7 @@ being unloaded from L1 cache, until that round is finished.
#include "pch.h"
#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "rijndael.h"
#include "misc.h"
@@ -145,27 +148,56 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_ProcessAndXorBlock(const word32 *table, word32 cacheLineSize, const word32 *k, const word32 *kLoopEnd, const byte *inBlock, const byte *xorBlock, byte *outBlock);
}
#endif
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Rijndael_Enc_ProcessAndXorBlock(Te, g_cacheLineSize, m_key, m_key + m_rounds*4, inBlock, xorBlock, outBlock);
return;
#endif
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Rijndael_Enc_ProcessAndXorBlock PROC FRAME
rex_push_reg rbx
push_reg rsi
push_reg rdi
push_reg r12
push_reg r13
push_reg r14
push_reg r15
.endprolog
mov AS_REG_7, rcx
mov rdi, [rsp + 5*8 + 7*8] ; inBlock
#else
if (HasMMX())
{
const word32 *k = m_key;
const word32 *kLoopEnd = k + m_rounds*4;
#endif
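Rijndael_Enc_ProcessAndXorBlock takes seven arguments, and the Microsoft x64 convention passes only the first four in registers (rcx, rdx, r8, r9); the rest live on the caller's stack. On entry the fifth argument sits at [rsp + 5*8], past the return address and the 32-byte shadow space reserved for the four register arguments. The prologue above pushes seven registers, lowering rsp by 7*8 bytes, which is why inBlock, xorBlock and outBlock are fetched at [rsp + 5*8 + 7*8], [rsp + 6*8 + 7*8] and [rsp + 7*8 + 7*8] respectively in this function.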
#if CRYPTOPP_BOOL_X64
#define K_REG r8
#define K_END_REG r9
#define SAVE_K
#define RESTORE_K
#define RESTORE_K_END
#define SAVE_0(x) AS2(mov r10d, x)
#define SAVE_1(x) AS2(mov r11d, x)
#define SAVE_2(x) AS2(mov r12d, x)
#define RESTORE_0(x) AS2(mov x, r10d)
#define RESTORE_1(x) AS2(mov x, r11d)
#define RESTORE_2(x) AS2(mov x, r12d)
#define SAVE_0(x) AS2(mov r13d, x)
#define SAVE_1(x) AS2(mov r14d, x)
#define SAVE_2(x) AS2(mov r15d, x)
#define RESTORE_0(x) AS2(mov x, r13d)
#define RESTORE_1(x) AS2(mov x, r14d)
#define RESTORE_2(x) AS2(mov x, r15d)
#else
#define K_REG esi
#define K_END_REG edi
@@ -184,22 +216,16 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov WORD_REG(bp), WORD_REG(ax))
#if CRYPTOPP_BOOL_X64
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
AS1( pushq K_REG)
AS1( pushq K_END_REG)
AS1( pushq r10)
AS1( pushq r11)
AS1( pushq r12)
AS2( mov K_REG, rsi)
AS2( mov K_END_REG, rcx)
#else
AS1( push ebx)
AS1( push ebp)
AS2( movd mm5, ecx)
#endif
#else
AS2( mov AS_REG_7, WORD_REG(ax))
#elif CRYPTOPP_BOOL_X86
#if _MSC_VER < 1300
const word32 *t = Te;
AS2( mov eax, t)
@@ -209,12 +235,12 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( mov K_REG, k)
AS2( movd mm5, kLoopEnd)
#if _MSC_VER < 1300
AS_PUSH( bx)
AS_PUSH( bp)
AS2( mov ebp, eax)
AS1( push ebx)
AS1( push ebp)
AS2( mov AS_REG_7, eax)
#else
AS_PUSH( bp)
AS2( lea ebp, Te)
AS1( push ebp)
AS2( lea AS_REG_7, Te)
#endif
#endif
AS2( mov eax, [K_REG+0*4]) // s0
@@ -236,21 +262,21 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( and ebx, 0)
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
ASL(2)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx)
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
AS2( and ebx, [AS_REG_7+WORD_REG(di)])
AS2( add edi, edx)
AS2( cmp edi, 1024)
ASJ( jl, 2, b)
AS2( and ebx, [WORD_REG(bp)+1020])
AS2( and ebx, [AS_REG_7+1020])
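This loop is the timing-attack defense described in the file's header comment: before the first key-dependent lookup it touches every cache line of the 1 KB table (striding by g_cacheLineSize, held in edx), and since ebx was just zeroed by `and ebx, 0`, the chain of `and` loads has no effect on the value; XORing the zero result into the state afterwards only creates a data dependency so the round's loads cannot start early, simulating lfence. The same idea in C, as a sketch of mine (a compiler would likely delete this as dead code, which is one reason it is done in asm):

word32 acc = 0;   // word32 is Crypto++'s 32-bit unsigned type
for (unsigned int ofs = 0; ofs < 1024; ofs += cacheLineSize)
    acc &= *(const word32 *)((const char *)table + ofs);  // touch each line
acc &= *(const word32 *)((const char *)table + 1020);     // and the last word
s1 ^= acc; s2 ^= acc; s3 ^= acc;   // acc is 0; this only enforces ordering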
#if CRYPTOPP_BOOL_X64
AS2( xor r10d, ebx)
AS2( xor r11d, ebx)
AS2( xor r12d, ebx)
AS2( xor r13d, ebx)
AS2( xor r14d, ebx)
AS2( xor r15d, ebx)
#else
AS2( movd mm6, ebx)
AS2( pxor mm2, mm6)
@@ -268,14 +294,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])
#define s0 xor edi
#define s1 xor eax
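In scalar terms each QUARTER_ROUND splits the 32-bit word t into its four bytes and folds one table lookup per byte into four different accumulators; a, b, c and d are bound to instruction fragments such as `xor edi` or `mov eax` by the #define s0 ... lines, so the same macro text both initializes and accumulates. Roughly (a sketch of mine, writing ^= for whichever operation is bound):

// Te holds four 256-entry word32 tables back to back, 1024 bytes
// apart, matching the +0*1024 .. +3*1024 offsets in the macro
d ^= Te[0*256 + (t & 0xff)];
c ^= Te[1*256 + ((t >> 8) & 0xff)];
b ^= Te[2*256 + ((t >> 16) & 0xff)];
a ^= Te[3*256 + (t >> 24)];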
@@ -308,14 +334,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(t, a, b, c, d) \
AS2(movzx esi, t##l)\
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
AS2(a, [AS_REG_7+3*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
AS2(b, [AS_REG_7+2*1024+4*WORD_REG(si)])\
AS2(shr e##t##x, 16)\
AS2(movzx esi, t##l)\
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
AS2(c, [AS_REG_7+1*1024+4*WORD_REG(si)])\
AS2(movzx esi, t##h)\
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
AS2(d, [AS_REG_7+0*1024+4*WORD_REG(si)])
QUARTER_ROUND(d, s0, s1, s2, s3)
RESTORE_2(edx)
@@ -369,20 +395,20 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#define QUARTER_ROUND(a, b, c, d) \
AS2( movzx ebx, dl)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 3*8)\
AS2( xor a, ebx)\
AS2( movzx ebx, dh)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 2*8)\
AS2( xor b, ebx)\
AS2( shr edx, 16)\
AS2( movzx ebx, dl)\
AS2( shr edx, 8)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(bx)])\
AS2( shl ebx, 1*8)\
AS2( xor c, ebx)\
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
AS2( movzx ebx, BYTE PTR [AS_REG_7+1+4*WORD_REG(dx)])\
AS2( xor d, ebx)
QUARTER_ROUND(eax, ecx, esi, edi)
@@ -395,25 +421,22 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
#undef QUARTER_ROUND
#if CRYPTOPP_BOOL_X64
AS1(popq r12)
AS1(popq r11)
AS1(popq r10)
AS1(popq K_END_REG)
AS1(popq K_REG)
#else
#if CRYPTOPP_BOOL_X86
AS1(emms)
AS1(pop ebp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS1(pop ebx)
#endif
#endif
AS_POP( bp)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
#endif
#ifdef __GNUC__
".att_syntax prefix;"
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
: "memory", "cc"
#if CRYPTOPP_BOOL_X64
, "%ebx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
#endif
);
if (xorBlock)
@@ -428,7 +451,11 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
((word32 *)outBlock)[2] = t2;
((word32 *)outBlock)[3] = t3;
#else
AS2( mov WORD_REG(bx), xorBlock)
#if CRYPTOPP_BOOL_X64
mov rbx, [rsp + 6*8 + 7*8] ; xorBlock
#else
AS2( mov ebx, xorBlock)
#endif
AS2( test WORD_REG(bx), WORD_REG(bx))
ASJ( jz, 1, f)
AS2( xor eax, [WORD_REG(bx)+0*4])
@@ -436,15 +463,33 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
AS2( xor esi, [WORD_REG(bx)+2*4])
AS2( xor edi, [WORD_REG(bx)+3*4])
ASL(1)
AS2( mov WORD_REG(bx), outBlock)
#if CRYPTOPP_BOOL_X64
mov rbx, [rsp + 7*8 + 7*8] ; outBlock
#else
AS2( mov ebx, outBlock)
#endif
AS2( mov [WORD_REG(bx)+0*4], eax)
AS2( mov [WORD_REG(bx)+1*4], ecx)
AS2( mov [WORD_REG(bx)+2*4], esi)
AS2( mov [WORD_REG(bx)+3*4], edi)
#endif
#if CRYPTOPP_GENERATE_X64_MASM
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbx
ret
Rijndael_Enc_ProcessAndXorBlock ENDP
#else
}
else
#endif
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE
#ifndef CRYPTOPP_GENERATE_X64_MASM
{
word32 s0, s1, s2, s3, t0, t1, t2, t3;
const word32 *rk = m_key;
@@ -674,3 +719,4 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
NAMESPACE_END
#endif
#endif

salsa.cpp

@@ -1,6 +1,11 @@
// salsa.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
#include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "salsa.h"
#include "misc.h"
#include "argnames.h"
@@ -53,7 +58,7 @@ void Salsa20_Policy::SeekToIteration(lword iterationCount)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int Salsa20_Policy::GetAlignment() const
{
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
return 16;
else
@@ -63,7 +68,7 @@ unsigned int Salsa20_Policy::GetAlignment() const
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
{
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
return 4*BYTES_PER_ITERATION;
else
@@ -72,202 +77,421 @@ unsigned int Salsa20_Policy::GetOptimalBlockSize() const
}
#endif
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
}
#endif
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
int i;
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#define SSE2_QUARTER_ROUND(a, b, d, i) {\
__m128i t = _mm_add_epi32(a, d); \
b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \
b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
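SSE2 has no packed-rotate instruction, so this macro synthesizes the Salsa20 operation b ^= rotl32(a + d, i) in each of the four 32-bit lanes from a shift pair; because the left-shifted and right-shifted halves have no overlapping bits, XORing both in is the same as XORing in the rotation. Per lane it computes:

word32 t = a + d;
b ^= (t << i) | (t >> (32 - i));   // i.e. b ^= rotl32(a + d, i)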
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
return;
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Salsa20_OperateKeystream PROC FRAME
mov r10, [rsp + 5*8] ; state
alloc_stack(10*16 + 32*16 + 8)
save_xmm128 xmm6, 0200h
save_xmm128 xmm7, 0210h
save_xmm128 xmm8, 0220h
save_xmm128 xmm9, 0230h
save_xmm128 xmm10, 0240h
save_xmm128 xmm11, 0250h
save_xmm128 xmm12, 0260h
save_xmm128 xmm13, 0270h
save_xmm128 xmm14, 0280h
save_xmm128 xmm15, 0290h
.endprolog
#define REG_output rcx
#define REG_input rdx
#define REG_iterationCount r8
#define REG_state r10
#define REG_rounds eax
#define REG_temp32 r11d
#define REG_temp r11
#define SSE2_WORKSPACE rsp
#define SSE2_LOAD_ROUNDS mov eax, r9d
#else
if (HasSSE2())
{
__m128i *s = (__m128i *)m_state.data();
#if CRYPTOPP_BOOL_X64
#define REG_output %4
#define REG_input %1
#define REG_iterationCount %2
#define REG_state %3
#define REG_rounds eax
#define REG_temp32 edx
#define REG_temp rdx
#define SSE2_WORKSPACE %5
#define SSE2_LOAD_ROUNDS AS2(mov eax, %0)
#if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86)
// This code triggers an internal compiler error on MSVC 2005 when compiling
// for x64 with optimizations on. Hopefully it will get fixed in the next release.
// A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123
// Also, GCC 3.4.4 generates incorrect code for x86 at -O2.
// GCC 4.1.1 generates incorrect code for x64 at -O2.
if (iterationCount >= 4)
{
__m128i ss[16];
ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
__m128i workspace[32];
#else
#define REG_output edi
#define REG_input eax
#define REG_iterationCount ecx
#define REG_state esi
#define REG_rounds ebx
#define REG_temp32 edx
#define REG_temp edx
#define SSE2_WORKSPACE esp + WORD_SZ
#ifdef __GNUC__
// this assumes that a frame pointer is used
#define SSE2_LOAD_ROUNDS ".att_syntax prefix;movl %0, %%ebx;.intel_syntax noprefix;"
#else
#define SSE2_LOAD_ROUNDS AS2(mov REG_rounds, r)
#endif
#endif
do
{
word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
for (i=0; i<4; i++)
{
countersLo[i] = m_state[8];
countersHi[i] = m_state[5];
if (++m_state[8] == 0)
++m_state[5];
}
word32 r = m_rounds;
__m128i x0 = ss[0];
__m128i x1 = ss[1];
__m128i x2 = ss[2];
__m128i x3 = ss[3];
__m128i x4 = ss[4];
__m128i x5 = ss[5];
__m128i x6 = ss[6];
__m128i x7 = ss[7];
__m128i x8 = ss[8];
__m128i x9 = ss[9];
__m128i x10 = ss[10];
__m128i x11 = ss[11];
__m128i x12 = ss[12];
__m128i x13 = ss[13];
__m128i x14 = ss[14];
__m128i x15 = ss[15];
#ifdef __GNUC__
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS_PUSH_IF86( bx)
#else
void *s = m_state.data();
for (i=m_rounds; i>0; i-=2)
{
#define QUARTER_ROUND(a, b, c, d) \
SSE2_QUARTER_ROUND(a, b, d, 7) \
SSE2_QUARTER_ROUND(b, c, a, 9) \
SSE2_QUARTER_ROUND(c, d, b, 13) \
SSE2_QUARTER_ROUND(d, a, c, 18)
AS2( mov REG_iterationCount, iterationCount)
AS2( mov REG_state, s)
AS2( mov REG_input, input)
AS2( mov REG_output, output)
#endif
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
QUARTER_ROUND(x0, x4, x8, x12)
QUARTER_ROUND(x1, x5, x9, x13)
QUARTER_ROUND(x2, x6, x10, x14)
QUARTER_ROUND(x3, x7, x11, x15)
AS2( cmp REG_iterationCount, 4)
ASJ( jl, 5, f)
QUARTER_ROUND(x0, x13, x10, x7)
QUARTER_ROUND(x1, x14, x11, x4)
QUARTER_ROUND(x2, x15, x8, x5)
QUARTER_ROUND(x3, x12, x9, x6)
#undef QUARTER_ROUND
}
x0 = _mm_add_epi32(x0, ss[0]);
x1 = _mm_add_epi32(x1, ss[1]);
x2 = _mm_add_epi32(x2, ss[2]);
x3 = _mm_add_epi32(x3, ss[3]);
x4 = _mm_add_epi32(x4, ss[4]);
x5 = _mm_add_epi32(x5, ss[5]);
x6 = _mm_add_epi32(x6, ss[6]);
x7 = _mm_add_epi32(x7, ss[7]);
x8 = _mm_add_epi32(x8, ss[8]);
x9 = _mm_add_epi32(x9, ss[9]);
x10 = _mm_add_epi32(x10, ss[10]);
x11 = _mm_add_epi32(x11, ss[11]);
x12 = _mm_add_epi32(x12, ss[12]);
x13 = _mm_add_epi32(x13, ss[13]);
x14 = _mm_add_epi32(x14, ss[14]);
x15 = _mm_add_epi32(x15, ss[15]);
#define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\
__m128i t0 = _mm_unpacklo_epi32(a, b);\
__m128i t1 = _mm_unpacklo_epi32(c, d);\
__m128i t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
t0 = _mm_unpackhi_epi32(a, b);\
t1 = _mm_unpackhi_epi32(c, d);\
t2 = _mm_unpacklo_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
t2 = _mm_unpackhi_epi64(t0, t1);\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
#define SALSA_OUTPUT(x) \
OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
#undef SALSA_OUTPUT
} while ((iterationCount-=4) >= 4);
}
#if CRYPTOPP_BOOL_X86
AS2( mov ebx, esp)
AS2( and esp, -16)
AS2( sub esp, 32*16)
AS1( push ebx)
#endif
if (!IsP4() && iterationCount > 0)
{
const __m128i s_maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
const __m128i s_maskHi32 = _mm_slli_epi64(s_maskLo32, 32);
#define SSE2_EXPAND_S(i, j) \
ASS( pshufd xmm4, xmm##i, j, j, j, j) \
AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
do
{
__m128i x0 = s[0];
__m128i x1 = s[1];
__m128i x2 = s[2];
__m128i x3 = s[3];
AS2( movdqa xmm0, [REG_state + 0*16])
AS2( movdqa xmm1, [REG_state + 1*16])
AS2( movdqa xmm2, [REG_state + 2*16])
AS2( movdqa xmm3, [REG_state + 3*16])
SSE2_EXPAND_S(0, 0)
SSE2_EXPAND_S(0, 1)
SSE2_EXPAND_S(0, 2)
SSE2_EXPAND_S(0, 3)
SSE2_EXPAND_S(1, 0)
SSE2_EXPAND_S(1, 2)
SSE2_EXPAND_S(1, 3)
SSE2_EXPAND_S(2, 1)
SSE2_EXPAND_S(2, 2)
SSE2_EXPAND_S(2, 3)
SSE2_EXPAND_S(3, 0)
SSE2_EXPAND_S(3, 1)
SSE2_EXPAND_S(3, 2)
SSE2_EXPAND_S(3, 3)
for (i=m_rounds; i>0; i-=2)
{
SSE2_QUARTER_ROUND(x0, x1, x3, 7)
SSE2_QUARTER_ROUND(x1, x2, x0, 9)
SSE2_QUARTER_ROUND(x2, x3, x1, 13)
SSE2_QUARTER_ROUND(x3, x0, x2, 18)
#define SSE2_EXPAND_S85(i) \
AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_rounds) \
AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
AS2( add REG_rounds, 1) \
AS2( adc REG_temp32, 0)
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
ASL(1)
AS2( mov REG_rounds, dword ptr [REG_state + 8*4])
AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
SSE2_EXPAND_S85(0)
SSE2_EXPAND_S85(1)
SSE2_EXPAND_S85(2)
SSE2_EXPAND_S85(3)
AS2( mov dword ptr [REG_state + 8*4], REG_rounds)
AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
SSE2_QUARTER_ROUND(x0, x3, x1, 7)
SSE2_QUARTER_ROUND(x3, x2, x0, 9)
SSE2_QUARTER_ROUND(x2, x1, x3, 13)
SSE2_QUARTER_ROUND(x1, x0, x2, 18)
#define SSE2_QUARTER_ROUND(a, b, d, i) \
AS2( movdqa xmm4, xmm##d) \
AS2( paddd xmm4, xmm##a) \
AS2( movdqa xmm5, xmm4) \
AS2( pslld xmm4, i) \
AS2( psrld xmm5, 32-i) \
AS2( pxor xmm##b, xmm4) \
AS2( pxor xmm##b, xmm5)
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
}
#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
#define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
#define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
#define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
#define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
#define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
#define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
#define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
#define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
#define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
#define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
#define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
#define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
#define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
#define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
#define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
#define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
#define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
#define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
#define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
#define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
#define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
#define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
x0 = _mm_add_epi32(x0, s[0]);
x1 = _mm_add_epi32(x1, s[1]);
x2 = _mm_add_epi32(x2, s[2]);
x3 = _mm_add_epi32(x3, s[3]);
#define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
if (++m_state[8] == 0)
++m_state[5];
#define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
__m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
__m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
__m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32));
__m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32));
__m128i k0 = _mm_unpackhi_epi64(k02, k20);
__m128i k1 = _mm_unpackhi_epi64(k13, k31);
__m128i k2 = _mm_unpacklo_epi64(k20, k02);
__m128i k3 = _mm_unpacklo_epi64(k31, k13);
#define SSE2_OUTPUT(x) {\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
}
while (--iterationCount);
}
}
#if CRYPTOPP_BOOL_X64
SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
#else
SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
#endif
SSE2_LOAD_ROUNDS
ASJ( jmp, 2, f)
ASL(SSE2_Salsa_Output)
AS2( movdqa xmm0, xmm4)
AS2( punpckldq xmm4, xmm5)
AS2( movdqa xmm1, xmm6)
AS2( punpckldq xmm6, xmm7)
AS2( movdqa xmm2, xmm4)
AS2( punpcklqdq xmm4, xmm6) // e
AS2( punpckhqdq xmm2, xmm6) // f
AS2( punpckhdq xmm0, xmm5)
AS2( punpckhdq xmm1, xmm7)
AS2( movdqa xmm6, xmm0)
AS2( punpcklqdq xmm0, xmm1) // g
AS2( punpckhqdq xmm6, xmm1) // h
AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
AS1( ret)
ASL(6)
#if CRYPTOPP_BOOL_X64
SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
ASL(2)
SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
#else
SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
ASL(2)
SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
#endif
AS2( sub REG_rounds, 2)
ASJ( jnz, 6, b)
#define SSE2_OUTPUT_4(a, b, c, d) \
AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
ASC( call, SSE2_Salsa_Output)
SSE2_OUTPUT_4(0, 13, 10, 7)
SSE2_OUTPUT_4(4, 1, 14, 11)
SSE2_OUTPUT_4(8, 5, 2, 15)
SSE2_OUTPUT_4(12, 9, 6, 3)
AS2( test REG_input, REG_input)
ASJ( jz, 9, f)
AS2( add REG_input, 12*16)
ASL(9)
AS2( add REG_output, 12*16)
AS2( sub REG_iterationCount, 4)
AS2( cmp REG_iterationCount, 4)
ASJ( jge, 1, b)
AS_POP_IF86( sp)
ASL(5)
AS2( sub REG_iterationCount, 1)
ASJ( jl, 4, f)
AS2( movdqa xmm0, [REG_state + 0*16])
AS2( movdqa xmm1, [REG_state + 1*16])
AS2( movdqa xmm2, [REG_state + 2*16])
AS2( movdqa xmm3, [REG_state + 3*16])
SSE2_LOAD_ROUNDS
ASL(0)
SSE2_QUARTER_ROUND(0, 1, 3, 7)
SSE2_QUARTER_ROUND(1, 2, 0, 9)
SSE2_QUARTER_ROUND(2, 3, 1, 13)
SSE2_QUARTER_ROUND(3, 0, 2, 18)
ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
SSE2_QUARTER_ROUND(0, 3, 1, 7)
SSE2_QUARTER_ROUND(3, 2, 0, 9)
SSE2_QUARTER_ROUND(2, 1, 3, 13)
SSE2_QUARTER_ROUND(1, 0, 2, 18)
ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
AS2( sub REG_rounds, 2)
ASJ( jnz, 0, b)
AS2( paddd xmm0, [REG_state + 0*16])
AS2( paddd xmm1, [REG_state + 1*16])
AS2( paddd xmm2, [REG_state + 2*16])
AS2( paddd xmm3, [REG_state + 3*16])
AS2( add dword ptr [REG_state + 8*4], 1)
AS2( adc dword ptr [REG_state + 5*4], 0)
AS2( pcmpeqb xmm6, xmm6) // all ones
AS2( psrlq xmm6, 32) // lo32 mask
ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
AS2( movdqa xmm4, xmm0)
AS2( movdqa xmm5, xmm3)
AS2( pand xmm0, xmm7)
AS2( pand xmm4, xmm6)
AS2( pand xmm3, xmm6)
AS2( pand xmm5, xmm7)
AS2( por xmm4, xmm5) // 0,13,2,15
AS2( movdqa xmm5, xmm1)
AS2( pand xmm1, xmm7)
AS2( pand xmm5, xmm6)
AS2( por xmm0, xmm5) // 4,1,6,3
AS2( pand xmm6, xmm2)
AS2( pand xmm2, xmm7)
AS2( por xmm1, xmm6) // 8,5,10,7
AS2( por xmm2, xmm3) // 12,9,14,11
AS2( movdqa xmm5, xmm4)
AS2( movdqa xmm6, xmm0)
AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
// output keystream
AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
ASJ( jmp, 5, b)
ASL(4)
#ifdef __GNUC__
AS_POP_IF86( bx)
".att_syntax prefix;"
:
#if CRYPTOPP_BOOL_X64
: "r" (r), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace)
: "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
#else
: "m" (r), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
: "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
#endif
);
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 0200h]
movdqa xmm7, [rsp + 0210h]
movdqa xmm8, [rsp + 0220h]
movdqa xmm9, [rsp + 0230h]
movdqa xmm10, [rsp + 0240h]
movdqa xmm11, [rsp + 0250h]
movdqa xmm12, [rsp + 0260h]
movdqa xmm13, [rsp + 0270h]
movdqa xmm14, [rsp + 0280h]
movdqa xmm15, [rsp + 0290h]
add rsp, 10*16 + 32*16 + 8
ret
Salsa20_OperateKeystream ENDP
#else
}
else
#endif
#endif
#ifndef CRYPTOPP_GENERATE_X64_MASM
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
while (iterationCount--)
@@ -289,7 +513,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
x14 = m_state[14];
x15 = m_state[15];
for (i=m_rounds; i>0; i-=2)
for (int i=m_rounds; i>0; i-=2)
{
#define QUARTER_ROUND(a, b, c, d) \
b = b ^ rotlFixed(a + d, 7); \
@@ -333,6 +557,9 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
if (++m_state[8] == 0)
++m_state[5];
}
}
} // see comment above if an internal compiler error occurs here
NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

sosemanuk.cpp

@@ -1,12 +1,21 @@
// sosemanuk.cpp - written and placed in the public domain by Wei Dai
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
#include "pch.h"
#ifndef CRYPTOPP_GENERATE_X64_MASM
#include "sosemanuk.h"
#include "misc.h"
#include "cpu.h"
#include "serpentp.h"
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
@@ -74,7 +83,8 @@ void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
}
static word32 s_mulTables[512] = {
extern "C" {
word32 s_sosemanukMulTables[512] = {
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
@@ -271,7 +281,7 @@ static word32 s_mulTables[512] = {
0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
};
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
unsigned int SosemanukPolicy::GetAlignment() const
@@ -303,11 +313,36 @@ unsigned int SosemanukPolicy::GetOptimalBlockSize() const
}
#endif
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
}
#endif
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
return;
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#ifdef CRYPTOPP_GENERATE_X64_MASM
ALIGN 8
Sosemanuk_OperateKeystream PROC FRAME
rex_push_reg rsi
push_reg rdi
alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
save_xmm128 xmm6, 02f0h
save_xmm128 xmm7, 0300h
.endprolog
mov rdi, r8
mov rax, r9
#else
#ifdef __INTEL_COMPILER
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
#else
@@ -315,10 +350,13 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#endif
{
#ifdef __GNUC__
#if CRYPTOPP_BOOL_X64
__m128i workspace[(80*4*2+12*4+8*WORD_SZ)/16];
#endif
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS_PUSH( bx)
AS_PUSH_IF86( bx)
#else
word32 *state = m_state;
AS2( mov WORD_REG(ax), state)
@@ -326,22 +364,31 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( mov WORD_REG(dx), input)
AS2( mov WORD_REG(cx), iterationCount)
#endif
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
#if defined(__GNUC__) && CRYPTOPP_BOOL_X64
#define SSE2_workspace %5
#else
#define SSE2_workspace WORD_REG(sp)
#endif
#define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ]
#define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ]
#define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ]
#define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ]
#define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ]
#define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ]
#define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ]
#define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ
#define SSE2_uvStart SSE2_stateCopy + 12*4
AS_PUSH( bp)
AS2( mov WORD_REG(bx), WORD_REG(sp))
AS2( and WORD_REG(sp), -16)
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [WORD_REG(sp)], WORD_REG(bx))
#if CRYPTOPP_BOOL_X86
AS_PUSH_IF86( bp)
AS2( mov AS_REG_6, esp)
AS2( and esp, -16)
AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
AS2( mov [esp], AS_REG_6)
#endif
AS2( mov SSE2_output, WORD_REG(di))
AS2( mov SSE2_input, WORD_REG(dx))
AS2( mov SSE2_state, WORD_REG(ax))
@@ -358,7 +405,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
AS2( psrlq xmm0, 32)
AS2( movd ebx, xmm0) // s(9)
AS2( movd AS_REG_6d, xmm0) // s(9)
AS2( mov ecx, [WORD_REG(ax)+10*4])
AS2( mov edx, [WORD_REG(ax)+11*4])
AS2( pcmpeqb xmm7, xmm7) // all ones
@@ -367,35 +414,35 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
#define r10 ecx
#define r11 edx
#define r20 edx
#define r21 ecx
#define R10 ecx
#define R11 edx
#define R20 edx
#define R21 ecx
#define SSE2_STEP(i, j) \
AS2( mov eax, [s(i+0)])\
AS2( mov [v(i)], eax)\
AS2( rol eax, 8)\
AS2( lea ebp, [ebx + r2##j])\
AS2( xor ebp, r1##j)\
AS2( mov [u(i)], ebp)\
AS2( mov ebp, 1)\
AS2( and ebp, r2##j)\
AS1( neg ebp)\
AS2( and ebp, ebx)\
AS2( xor ebx, eax)\
AS2( lea AS_REG_7d, [AS_REG_6d + R2##j])\
AS2( xor AS_REG_7d, R1##j)\
AS2( mov [u(i)], AS_REG_7d)\
AS2( mov AS_REG_7d, 1)\
AS2( and AS_REG_7d, R2##j)\
AS1( neg AS_REG_7d)\
AS2( and AS_REG_7d, AS_REG_6d)\
AS2( xor AS_REG_6d, eax)\
AS2( movzx eax, al)\
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
AS2( mov eax, [s(i+3)])\
AS2( xor ebp, [s(i+2)])\
AS2( add r1##j, ebp)\
AS2( movzx ebp, al)\
AS2( xor AS_REG_7d, [s(i+2)])\
AS2( add R1##j, AS_REG_7d)\
AS2( movzx AS_REG_7d, al)\
AS2( shr eax, 8)\
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
AS2( xor ebx, eax)\
AS2( imul r2##j, 0x54655307)\
AS2( rol r2##j, 7)\
AS2( mov [s(i+0)], ebx)\
AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
AS2( xor AS_REG_6d, eax)\
AS2( imul R2##j, AS_HEX(54655307))\
AS2( rol R2##j, 7)\
AS2( mov [s(i+0)], AS_REG_6d)\
ASL(2) // outer loop, each iteration of this processes 80 words
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
@@ -406,7 +453,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
AS2( mov SSE2_diEnd, WORD_REG(si))
#ifdef _MSC_VER
AS2( lea WORD_REG(si), s_mulTables)
AS2( lea WORD_REG(si), s_sosemanukMulTables)
#else
AS2( mov WORD_REG(si), SSE2_pMulTables)
#endif
@@ -438,7 +485,7 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jne, 0, b)
AS2( mov WORD_REG(ax), SSE2_input)
AS2( mov WORD_REG(bp), SSE2_output)
AS2( mov AS_REG_7, SSE2_output)
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
@@ -487,43 +534,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
AS2( punpcklqdq xmm6, xmm5)
AS2( punpckhqdq xmm3, xmm5)
// output keystream
AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 3, f)
AS2( test eax, 0xf)
ASJ( jnz, 7, f)
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
AS2( add WORD_REG(ax), 4*16)
ASJ( jmp, 3, f)
ASL(7)
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
AS2( pxor xmm2, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
AS2( pxor xmm0, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
AS2( pxor xmm6, xmm1)
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
AS2( pxor xmm3, xmm1)
AS2( add WORD_REG(ax), 4*16)
ASL(3)
AS2( test ebp, 0xf)
ASJ( jnz, 8, f)
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
ASJ( jmp, 9, f)
ASL(8)
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
ASL(9)
AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
// loop
AS2( add WORD_REG(di), 4*4)
AS2( add WORD_REG(bp), 4*16)
AS2( sub WORD_REG(si), 16)
ASJ( jnz, 1, b)
@@ -533,29 +547,29 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jz, 6, f)
AS2( mov SSE2_wordsLeft, WORD_REG(si))
AS2( mov SSE2_input, WORD_REG(ax))
AS2( mov SSE2_output, WORD_REG(bp))
AS2( mov SSE2_output, AS_REG_7)
ASJ( jmp, 2, b)
ASL(4) // final output of less than 16 words
AS2( test WORD_REG(ax), WORD_REG(ax))
ASJ( jz, 5, f)
AS2( movd xmm0, [WORD_REG(ax)+0*4])
AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4])
AS2( pxor xmm2, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+1*4])
AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4])
AS2( pxor xmm3, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+2*4])
AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4])
AS2( pxor xmm1, xmm0)
AS2( movd xmm0, [WORD_REG(ax)+3*4])
AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4])
AS2( pxor xmm4, xmm0)
AS2( add WORD_REG(ax), 16)
ASL(5)
AS2( movd [WORD_REG(bp)+0*4], xmm2)
AS2( movd [WORD_REG(bp)+1*4], xmm3)
AS2( movd [WORD_REG(bp)+2*4], xmm1)
AS2( movd [WORD_REG(bp)+3*4], xmm4)
AS2( movd dword ptr [AS_REG_7+0*4], xmm2)
AS2( movd dword ptr [AS_REG_7+1*4], xmm3)
AS2( movd dword ptr [AS_REG_7+2*4], xmm1)
AS2( movd dword ptr [AS_REG_7+3*4], xmm4)
AS2( sub WORD_REG(si), 4)
ASJ( jz, 6, f)
AS2( add WORD_REG(bp), 16)
AS2( add AS_REG_7, 16)
AS2( psrldq xmm2, 4)
AS2( psrldq xmm3, 4)
AS2( psrldq xmm1, 4)
@@ -563,38 +577,52 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
ASJ( jmp, 4, b)
ASL(6) // save state
AS2( mov WORD_REG(bx), SSE2_state)
AS2( mov AS_REG_6, SSE2_state)
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
AS2( movdqa [AS_REG_6+0*16], xmm0)
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
AS2( movdqa [AS_REG_6+1*16], xmm0)
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
AS2( mov [WORD_REG(bx)+10*4], ecx)
AS2( mov [WORD_REG(bx)+11*4], edx)
AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
AS2( mov [AS_REG_6+10*4], ecx)
AS2( mov [AS_REG_6+11*4], edx)
AS_POP( sp)
AS_POP( bp)
AS_POP_IF86( sp)
AS_POP_IF86( bp)
#ifdef __GNUC__
AS_POP( bx)
AS_POP_IF86( bx)
".att_syntax prefix;"
:
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
: "memory", "cc"
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
#if CRYPTOPP_BOOL_X64
, "r" (workspace)
#endif
: "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
#endif
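The constraint list above binds the state pointer, iteration count, multiplication tables, output, and input to fixed registers, and the x64 build now also passes a caller-allocated workspace and names the xmm registers as clobbers so GCC will not keep values live in them across the asm. A minimal, self-contained example of the same extended-asm pattern (toy body, not the cipher):

    #include <cstdint>

    // Toy illustration of register input constraints plus xmm clobbers;
    // the body just xors 16 bytes of 'out' with 16 bytes of 'in'.
    void XorBlock(unsigned char *out, const unsigned char *in)
    {
        __asm__ __volatile__
        (
            "movdqu (%1), %%xmm0;"
            "movdqu (%0), %%xmm1;"
            "pxor   %%xmm1, %%xmm0;"
            "movdqu %%xmm0, (%0);"
            :
            : "r" (out), "r" (in)
            : "memory", "cc", "%xmm0", "%xmm1"
        );
    }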
#ifdef CRYPTOPP_GENERATE_X64_MASM
movdqa xmm6, [rsp + 02f0h]
movdqa xmm7, [rsp + 0300h]
add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
pop rdi
pop rsi
ret
Sosemanuk_OperateKeystream ENDP
#else
}
else
#endif
#endif
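In the MASM epilogue above, xmm6 and xmm7 are reloaded before the stack is released because the Win64 ABI makes xmm6-xmm15 callee-saved, and rdi/rsi are popped for the same reason. The spill offsets are consistent with the frame-size expression; a sketch of the arithmetic (the component names are my reading of the layout, not from the source):

    // Hypothetical breakdown of "add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8".
    constexpr int WORD_SZ   = 8;          // x64
    constexpr int workspace = 80*4*2;     // 640: per-iteration scratch
    constexpr int locals    = 12*4;       //  48: dword-sized locals
    constexpr int regSlots  = 8*WORD_SZ;  //  64: eight pointer-sized slots
    constexpr int xmmSave   = 2*16;       //  32: xmm6/xmm7 spill area
    constexpr int padding   = 8;          // realignment after the two pushes
    static_assert(workspace + locals + regSlots == 0x2f0,
                  "xmm6 is reloaded from [rsp + 02f0h], just past the slots");
    static_assert(workspace + locals + regSlots + xmmSave + padding == 792,
                  "total matches the add rsp immediate");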
#ifndef CRYPTOPP_GENERATE_X64_MASM
{
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)])
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
#else
#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24])
#define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
#endif
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
#define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
#define r1(i) ((i%2) ? reg2 : reg1)
#define r2(i) ((i%2) ? reg1 : reg2)
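The renamed s_sosemanukMulTables drives the LFSR feedback: MUL_A shifts a word up one byte and folds the dropped top byte back in through the first 256 table entries, DIV_A shifts down and corrects through the next 256, and the x86/x64 variant phrases the shift as rotlFixed(x, 8) so the wrapped-around byte doubles as the table index (with the table entries presumably built to absorb it). The r1/r2 macros simply alternate two registers across unrolled iterations. A hedged C++ sketch of the portable forms (table contents omitted, layout inferred from the macros):

    #include <cstdint>

    // Sketch only: 256 alpha-multiply entries followed by 256
    // alpha-divide entries.
    extern const uint32_t s_sosemanukMulTables[512];

    inline uint32_t MulAlpha(uint32_t x)   // MUL_A: multiply by alpha
    {
        return (x << 8) ^ s_sosemanukMulTables[x >> 24];
    }

    inline uint32_t DivAlpha(uint32_t x)   // DIV_A: multiply by alpha^-1
    {
        return (x >> 8) ^ s_sosemanukMulTables[256 + (x & 0xff)];
    }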
@@ -676,3 +704,5 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *outpu
}
NAMESPACE_END
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

whrlpool.cpp

@@ -1,7 +1,7 @@
// whrlpool.cpp - originally modified by Kevin Springle from
// Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c.
// Updated to Whirlpool version 3.0, optimized and SSE version added by Wei Dai
// Any modifications are placed in the public domain
// All modifications are placed in the public domain
// This is the original introductory comment:
@@ -71,6 +71,10 @@
#include "misc.h"
#include "cpu.h"
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
#include <emmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
void Whirlpool_TestInstantiations()
@@ -395,29 +399,37 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
{
// MMX version has the same structure as the C version below
#ifdef __GNUC__
#if CRYPTOPP_BOOL_X64
__m128i workspace[8];
#endif
__asm__ __volatile__
(
".intel_syntax noprefix;"
AS_PUSH( bx)
AS2( mov WORD_REG(bx), WORD_REG(ax))
AS_PUSH_IF86( bx)
AS2( mov AS_REG_6, WORD_REG(ax))
#else
#if _MSC_VER < 1300
AS_PUSH( bx)
AS_PUSH_IF86( bx)
#endif
AS2( lea WORD_REG(bx), [Whirlpool_C])
AS2( lea AS_REG_6, [Whirlpool_C])
AS2( mov WORD_REG(cx), digest)
AS2( mov WORD_REG(dx), block)
#endif
AS2( mov WORD_REG(ax), WORD_REG(sp))
AS2( and WORD_REG(sp), -16)
AS2( sub WORD_REG(sp), 16*8)
AS_PUSH( ax)
#if CRYPTOPP_BOOL_X86
AS2( mov eax, esp)
AS2( and esp, -16)
AS2( sub esp, 16*8)
AS1( push eax)
#define SSE2_workspace esp+WORD_SZ
#else
#define SSE2_workspace %3
#endif
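On x86 the scratch area is carved out of the stack by hand: the old esp is parked in eax, esp is rounded down to a 16-byte boundary, 16*8 bytes are reserved, and the saved pointer is pushed so it can be restored at the end; on x64 SSE2_workspace is instead the compiler-aligned local passed in as operand %3, which leaves rsp untouched inside the asm. A C++ rendering of the same alignment idiom applied to a buffer (helper name invented):

    #include <cstdint>

    // Align16: round a pointer up to a 16-byte boundary. The asm rounds
    // esp *down* with "and esp, -16"; for a buffer we round up and
    // over-allocate 15 bytes of slack instead.
    inline void *Align16(void *p)
    {
        return (void *)(((uintptr_t)p + 15) & ~(uintptr_t)15);
    }

    unsigned char raw[16*8 + 15];   // 8 slots of 16 bytes, plus slack
    unsigned char *workspace = (unsigned char *)Align16(raw);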
AS2( xor esi, esi)
ASL(0)
AS2( movq mm0, [WORD_REG(cx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+8*WORD_REG(si)], mm0) // k
AS2( movq [SSE2_workspace+8*WORD_REG(si)], mm0) // k
AS2( pxor mm0, [WORD_REG(dx)+8*WORD_REG(si)])
AS2( movq [WORD_REG(sp)+WORD_SZ+64+8*WORD_REG(si)], mm0) // s
AS2( movq [SSE2_workspace+64+8*WORD_REG(si)], mm0) // s
AS2( movq [WORD_REG(cx)+8*WORD_REG(si)], mm0)
AS1( inc WORD_REG(si))
AS2( cmp WORD_REG(si), 8)
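Loop 0 above both builds the workspace and primes the feedforward: each 64-bit digest word is copied into the key area k, xored with the corresponding block word to form the state s, and that xor is written back into digest, so the final Miyaguchi-Preneel combine needs only one more xor per word (see the TSH3 step later). In C (a sketch; k/s naming follows the comments in the asm):

    typedef unsigned long long word64;

    // Loop 0: k = H, s = H ^ M, digest = H ^ M.
    void SetupRound(word64 *digest, const word64 *block,
                    word64 *k, word64 *s)
    {
        for (int i = 0; i < 8; i++)
        {
            k[i] = digest[i];             // movq mm0,[cx+8*si]; store to k
            s[i] = digest[i] ^ block[i];  // pxor mm0,[dx+8*si]; store to s
            digest[i] = s[i];             // movq [cx+8*si], mm0
        }
    }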
@@ -430,16 +442,16 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#define KSL1(a, b) AS2(pxor mm##a, b)
#define KSL(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*i])\
AS2(mov eax, [SSE2_workspace+8*i])\
AS2(movzx edi, al)\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
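KSL processes the low 32 bits of key-schedule word i: each successive byte (extracted via al, ah, then a shift) indexes one of four 2048-byte tables at AS_REG_6, i.e. 256 entries of 8 bytes each, and each lookup is xored into a different destination register a, b, c, d, which is Whirlpool's row-shifted table diffusion. KSH, below, does the same for the high half after a pshufw swap. A hedged C++ equivalent of one KSL step (treating Whirlpool_C as four concatenated 256-entry uint64 tables, as the offsets suggest):

    #include <cstdint>

    extern const uint64_t Whirlpool_C[4*256];   // layout inferred from offsets

    // One KSL step: bytes of the low half of src word i each feed a
    // different destination word (a,b,c,d come from the macro call).
    inline void KslStep(uint64_t dst[8], const uint64_t src[8],
                        int i, int a, int b, int c, int d)
    {
        uint32_t x = (uint32_t)src[i];
        dst[a] ^= Whirlpool_C[3*256 + (x & 0xff)];          // movzx edi, al
        dst[b] ^= Whirlpool_C[2*256 + ((x >> 8) & 0xff)];   // movzx edi, ah
        dst[c] ^= Whirlpool_C[1*256 + ((x >> 16) & 0xff)];  // shr; movzx
        dst[d] ^= Whirlpool_C[0*256 + (x >> 24)];           // remaining byte
    }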
#define KSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
@@ -448,57 +460,57 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
AS2(pxor mm##a, b)
#define KSH2(a, b) \
AS2(pxor mm##a, b)\
AS2(movq [WORD_REG(sp)+WORD_SZ+8*a], mm##a)
AS2(movq [SSE2_workspace+8*a], mm##a)
#define KSH(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+8*((i+4)-8*((i+4)/8))+4])\
AS2(mov eax, [SSE2_workspace+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\
KSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
KSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
KSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
KSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
#define TSL(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*i])\
AS2(mov eax, [SSE2_workspace+64+8*i])\
AS2(movzx edi, al)\
KSL##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
KSL##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
KSL##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
KSL##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
KSL##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
KSL##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
KSL##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
KSL##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
#define TSH0(a, b) \
ASS(pshufw mm##a, mm##a, 1, 0, 3, 2)\
AS2(pxor mm##a, [WORD_REG(sp)+WORD_SZ+8*a])\
AS2(pxor mm##a, [SSE2_workspace+8*a])\
AS2(pxor mm##a, b)
#define TSH1(a, b) \
AS2(pxor mm##a, b)
#define TSH2(a, b) \
AS2(pxor mm##a, b)\
AS2(movq [WORD_REG(sp)+WORD_SZ+64+8*a], mm##a)
AS2(movq [SSE2_workspace+64+8*a], mm##a)
#define TSH3(a, b) \
AS2(pxor mm##a, b)\
AS2(pxor mm##a, [WORD_REG(cx)+8*a])\
AS2(movq [WORD_REG(cx)+8*a], mm##a)
#define TSH(op, i, a, b, c, d) \
AS2(mov eax, [WORD_REG(sp)+WORD_SZ+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(mov eax, [SSE2_workspace+64+8*((i+4)-8*((i+4)/8))+4])\
AS2(movzx edi, al)\
TSH##op(a, [WORD_REG(bx)+3*2048+8*WORD_REG(di)])\
TSH##op(a, [AS_REG_6+3*2048+8*WORD_REG(di)])\
AS2(movzx edi, ah)\
TSH##op(b, [WORD_REG(bx)+2*2048+8*WORD_REG(di)])\
TSH##op(b, [AS_REG_6+2*2048+8*WORD_REG(di)])\
AS2(shr eax, 16)\
AS2(movzx edi, al)\
AS2(shr eax, 8)\
TSH##op(c, [WORD_REG(bx)+1*2048+8*WORD_REG(di)])\
TSH##op(d, [WORD_REG(bx)+0*2048+8*WORD_REG(ax)])
TSH##op(c, [AS_REG_6+1*2048+8*WORD_REG(di)])\
TSH##op(d, [AS_REG_6+0*2048+8*WORD_REG(ax)])
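TSH3 is the final table step for a state word: after the last xor it folds in the matching digest word and stores the result back. Because digest already holds H ^ M from loop 0, this single xor completes the Miyaguchi-Preneel output E_H(M) ^ M ^ H in one pass. Schematically:

    typedef unsigned long long word64;

    // TSH3 in C (sketch): 'cipher' is the encrypted state word;
    // digest[a] currently holds H ^ M, so the store leaves
    // E_H(M) ^ M ^ H behind.
    inline void Tsh3Step(word64 *digest, int a, word64 cipher)
    {
        digest[a] = cipher ^ digest[a];  // pxor mm##a,[cx+8*a]; movq back
    }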
KSL(0, 4, 3, 2, 1, 0)
KSL(0, 0, 7, 6, 5, 4)
@@ -517,8 +529,8 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
KSH(2, 3, 2, 1, 0, 7)
KSH(2, 7, 6, 5, 4, 3)
AS2( pxor mm0, [WORD_REG(bx) + 8*1024 + WORD_REG(si)*8])
AS2( movq [WORD_REG(sp)+WORD_SZ], mm0)
AS2( pxor mm0, [AS_REG_6 + 8*1024 + WORD_REG(si)*8])
AS2( movq [SSE2_workspace], mm0)
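The pxor at offset 8*1024 shows where the round constants live: immediately after the four 2048-byte tables, one 64-bit constant per round, indexed by the round counter in si and folded into key word 0 before it is respilled to the workspace. As a C sketch (assuming the table layout described above and Whirlpool's 10 rounds):

    #include <cstdint>

    extern const uint64_t Whirlpool_C[4*256 + 10];  // tables + round constants

    // Round-constant injection (sketch): r is the round number held in esi.
    inline void InjectRoundConstant(uint64_t k[8], int r)
    {
        k[0] ^= Whirlpool_C[4*256 + r];  // pxor mm0,[AS_REG_6+8*1024+si*8]
    }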
TSL(0, 4, 3, 2, 1, 0)
TSL(0, 0, 7, 6, 5, 4)
@@ -553,17 +565,23 @@ void Whirlpool::Transform(word64 *digest, const word64 *block)
#undef TSL
#undef TSH
AS_POP( sp)
AS_POP_IF86( sp)
AS1( emms)
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
AS_POP( bx)
AS_POP_IF86( bx)
#endif
#ifdef __GNUC__
".att_syntax prefix;"
:
: "a" (Whirlpool_C), "c" (digest), "d" (block)
#if CRYPTOPP_BOOL_X64
, "r" (workspace)
#endif
: "%esi", "%edi", "memory", "cc"
#if CRYPTOPP_BOOL_X64
, "%r9"
#endif
);
#endif
}

File diff suppressed because it is too large