Fix alignment on Win32 and Solaris Sparc (PR #709)

These fixes were interesting in a morbid sort of way. I thought the FixedSizeAllocatorWithCleanup specializations faithfully reproduced the original allocator's semantics, but I was wrong on Win32 and Sparc. Also see Commit e054d36dc8.

It seems there was another requirement or dependency that we missed, and it was not readily apparent. If I am parsing the results correctly (and I may not be), the bit twiddling that produces 8-byte alignment had more influence than I originally thought, based on the use of CRYPTOPP_BOOL_ALIGN16 and T_Align16. Or maybe the alignment attributes specified by CRYPTOPP_ALIGN_DATA are not being honored as they should be for stack allocations.
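If you want to probe that last guess yourself, here is a minimal sketch (my illustration, not part of this check-in). It uses standard alignas(16) as a stand-in for CRYPTOPP_ALIGN_DATA, which expands to a compiler-specific attribute like __attribute__((aligned(16))) or __declspec(align(16)):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Stand-in for CRYPTOPP_ALIGN_DATA(16) on a stack allocation
        alignas(16) unsigned char stack_array[64];

        // If the attribute is honored for stack allocations, the low
        // four bits of the address are zero and this prints 0
        const uintptr_t addr = reinterpret_cast<uintptr_t>(stack_array);
        std::printf("addr %% 16 = %u\n", static_cast<unsigned>(addr % 16));
        return 0;
    }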

This check-in avoids some uses of x86 movdqa (aligned) in favor of movdqu (unaligned). The uses were concentrated on memory operands that turned out to be 8-byte aligned rather than 16-byte aligned. It is not clear to me how the specializations lost 8 bytes of alignment. The check-in also enlists CRYPTOPP_ASSERT to tell us when there's a problem so we don't need to go hunting for bugs.
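The movdqa/movdqu distinction is easier to see in intrinsics form. Below is a sketch of the pattern the check-in adopts (the helper name is mine): _mm_load_si128 compiles to movdqa and faults on a misaligned address, while _mm_loadu_si128 compiles to movdqu and tolerates any address, so debug builds flag the misalignment while release builds survive it.

    #include <cassert>
    #include <cstdint>
    #include <emmintrin.h>  // SSE2

    static inline __m128i LoadBlock(const void* p)
    {
        // Tell us when there's a problem...
        assert(reinterpret_cast<uintptr_t>(p) % 16 == 0);
        // ...but load unaligned so a misaligned pointer costs a few
        // cycles instead of a SIGSEGV/SIGBUS
        return _mm_loadu_si128(static_cast<const __m128i*>(p));
    }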
pull/710/head
Jeffrey Walton 2018-08-23 14:42:29 -04:00 committed by GitHub
parent e054d36dc8
commit afbd3e60f6
7 changed files with 86 additions and 72 deletions

iterhash.h

@@ -152,8 +152,8 @@ public:
 	{
 		CRYPTOPP_ASSERT(in != NULLPTR);
 		CRYPTOPP_ASSERT(out != NULLPTR);
-		CRYPTOPP_ASSERT(IsAligned<T_HashWordType>(in));
-		CRYPTOPP_ASSERT(IsAligned<T_HashWordType>(out));
+		CRYPTOPP_ASSERT(IsAligned<T_HashWordType*>(in));
+		CRYPTOPP_ASSERT(IsAligned<T_HashWordType*>(out));
 		ConditionalByteReverse(T_Endianness::ToEnum(), out, in, byteCount);
 	}

misc.h

@@ -2043,7 +2043,7 @@ inline T ConditionalByteReverse(ByteOrder order, T value)
 /// not part of a full element. If T is int (and int is 4 bytes), then
 /// <tt>byteCount = 10</tt> means only the first 2 elements or 8 bytes are
 /// reversed.
-/// \details The follwoing program should help illustrate the behavior.
+/// \details The following program should help illustrate the behavior.
 /// <pre>vector<word32> v1, v2;
 ///
 /// v1.push_back(1);
@@ -2063,7 +2063,7 @@ inline T ConditionalByteReverse(ByteOrder order, T value)
 /// for(unsigned int i = 0; i < v2.size(); i++)
 ///   cout << std::hex << v2[i] << " ";
 /// cout << endl;</pre>
-/// The program above results in the follwoing output.
+/// The program above results in the following output.
 /// <pre>V1: 00000001 00000002 00000003 00000004
 /// V2: 01000000 02000000 03000000 04000000</pre>
 /// \sa ConditionalByteReverse
@@ -2072,8 +2072,8 @@ void ByteReverse(T *out, const T *in, size_t byteCount)
 {
 	// Alignment check due to Issues 690
 	CRYPTOPP_ASSERT(byteCount % sizeof(T) == 0);
-	CRYPTOPP_ASSERT(IsAligned<T>(in));
-	CRYPTOPP_ASSERT(IsAligned<T>(out));
+	//CRYPTOPP_ASSERT(IsAligned<T*>(in));
+	//CRYPTOPP_ASSERT(IsAligned<T*>(out));
 	size_t count = byteCount/sizeof(T);
 	for (size_t i=0; i<count; i++)
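For convenience, here is the program from the Doxygen comment above in compilable form (the includes and the exact header path are my assumptions; adjust for your install):

    #include <cryptopp/misc.h>   // ByteReverse, word32
    #include <iostream>
    #include <vector>

    int main()
    {
        using namespace CryptoPP;

        std::vector<word32> v1, v2;
        for (word32 i = 1; i <= 4; i++)
            v1.push_back(i);
        v2.resize(v1.size());

        // Reverse all 16 bytes; a smaller byteCount reverses only the
        // leading whole elements
        ByteReverse<word32>(&v2[0], &v1[0], 16);

        std::cout << "V1: ";
        for (unsigned int i = 0; i < v1.size(); i++)
            std::cout << std::hex << v1[i] << " ";
        std::cout << "\nV2: ";
        for (unsigned int i = 0; i < v2.size(); i++)
            std::cout << std::hex << v2[i] << " ";
        std::cout << std::endl;   // 1 2 3 4 and 1000000 2000000 3000000 4000000
        return 0;
    }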

panama.cpp

@@ -93,10 +93,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS_PUSH_IF86( cx)
 #endif
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
-	AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-	AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-	AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+	AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+	AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+	AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
 	AS2( mov eax, dword ptr [AS_REG_2+4*16])
 	ASL(4)
@@ -184,8 +184,8 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	ASL(1)
 	AS2( test AS_REG_3, 15)
 	ASJ( jnz, 3, f)
-	AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
-	AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
+	AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
+	AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
 	AS2( add AS_REG_3, 32)
 	ASJ( jmp, 0, f)
 	ASL(3)
@@ -200,24 +200,26 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
 	AS2( and AS_REG_7, 31*32)
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
 	AS2( pxor xmm3, xmm0)
 	ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
-	AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
+	AS2( pxor xmm0, xmm5)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
 	AS2( pxor xmm1, xmm4)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
-	AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
+	AS2( pxor xmm4, xmm5)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)

 	// theta
-	AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
-	AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-	AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+	AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+	AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+	AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])

 #if CRYPTOPP_SSSE3_ASM_AVAILABLE
 	AS2( test AS_REG_6, 1)
@@ -271,16 +273,16 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
 	AS2( and AS_REG_7, 31*32)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
-	AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
 	AS2( movdqa xmm6, xmm4)
 	AS2( punpcklqdq xmm4, xmm5)
 	AS2( punpckhqdq xmm6, xmm5)
 	AS2( pxor xmm3, xmm4)
 	AS2( pxor xmm2, xmm6)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
-	AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
 	AS2( movdqa xmm6, xmm4)
 	AS2( punpcklqdq xmm4, xmm5)
 	AS2( punpckhqdq xmm6, xmm5)
@@ -294,10 +296,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	// save state
 	AS2( mov [AS_REG_2+4*16], eax)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+3*16], xmm3)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+2*16], xmm2)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+1*16], xmm1)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+0*16], xmm0)

 #if CRYPTOPP_BOOL_X86
 	AS2( add esp, 4)
@@ -329,7 +331,7 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 #else
 }
 #endif
-#endif // #ifdef CRYPTOPP_SSE2_ASM_AVAILABLE
+#endif // CRYPTOPP_SSE2_ASM_AVAILABLE

 #ifndef CRYPTOPP_GENERATE_X64_MASM

salsa.cpp

@@ -265,12 +265,12 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define SSE2_EXPAND_S(i, j) \
 	ASS( pshufd xmm4, xmm##i, j, j, j, j) \
-	AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
+	AS2( movdqu [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)

-	AS2( movdqa xmm0, [REG_state + 0*16])
-	AS2( movdqa xmm1, [REG_state + 1*16])
-	AS2( movdqa xmm2, [REG_state + 2*16])
-	AS2( movdqa xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm0, [REG_state + 0*16])
+	AS2( movdqu xmm1, [REG_state + 1*16])
+	AS2( movdqu xmm2, [REG_state + 2*16])
+	AS2( movdqu xmm3, [REG_state + 3*16])
 	SSE2_EXPAND_S(0, 0)
 	SSE2_EXPAND_S(0, 1)
 	SSE2_EXPAND_S(0, 2)
@@ -311,15 +311,15 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( pxor xmm##b, xmm4) \
 	AS2( pxor xmm##b, xmm5)

-#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
-#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
+#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
+#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
-#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
+#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + b*16], xmm##A)
 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
@@ -327,7 +327,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
-#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
+#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + c*16], xmm##A)
 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
@@ -335,14 +335,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
-#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
+#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + d*16], xmm##A)
 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
-#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
+#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + a*16], xmm##A)

 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
 	L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
@@ -453,13 +453,13 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASJ( jnz, 6, b)

 #define SSE2_OUTPUT_4(a, b, c, d) \
-	AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
+	AS2( movdqu xmm4, [SSE2_WORKSPACE + a*16 + 256])\
 	AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
-	AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
+	AS2( movdqu xmm5, [SSE2_WORKSPACE + b*16 + 256])\
 	AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
-	AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
+	AS2( movdqu xmm6, [SSE2_WORKSPACE + c*16 + 256])\
 	AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
-	AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
+	AS2( movdqu xmm7, [SSE2_WORKSPACE + d*16 + 256])\
 	AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
 	ASC( call, SSE2_Salsa_Output)
@@ -480,10 +480,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASL(5)
 	AS2( sub REG_iterationCount, 1)
 	ASJ( jl, 4, f)
-	AS2( movdqa xmm0, [REG_state + 0*16])
-	AS2( movdqa xmm1, [REG_state + 1*16])
-	AS2( movdqa xmm2, [REG_state + 2*16])
-	AS2( movdqa xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm0, [REG_state + 0*16])
+	AS2( movdqu xmm1, [REG_state + 1*16])
+	AS2( movdqu xmm2, [REG_state + 2*16])
+	AS2( movdqu xmm3, [REG_state + 3*16])
 	AS2( mov REG_roundsLeft, REG_rounds)
 	ASL(0)
@@ -504,10 +504,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( sub REG_roundsLeft, 2)
 	ASJ( jnz, 0, b)
-	AS2( paddd xmm0, [REG_state + 0*16])
-	AS2( paddd xmm1, [REG_state + 1*16])
-	AS2( paddd xmm2, [REG_state + 2*16])
-	AS2( paddd xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm6, [REG_state + 0*16])
+	AS2( paddd xmm0, xmm6)
+	AS2( movdqu xmm7, [REG_state + 1*16])
+	AS2( paddd xmm1, xmm7)
+	AS2( movdqu xmm6, [REG_state + 2*16])
+	AS2( paddd xmm2, xmm6)
+	AS2( movdqu xmm7, [REG_state + 3*16])
+	AS2( paddd xmm3, xmm7)
 	AS2( add dword ptr [REG_state + 8*4], 1)
 	AS2( adc dword ptr [REG_state + 5*4], 0)

secblock.h

@@ -469,14 +469,21 @@ public:
 private:

 #if defined(CRYPTOPP_BOOL_ALIGN16) && defined(CRYPTOPP_ALIGN_ATTRIBUTE)
-	T* GetAlignedArray() {return m_array;}
+	T* GetAlignedArray() {
+		CRYPTOPP_ASSERT(IsAlignedOn(m_array, 16));
+		return m_array;
+	}
 	CRYPTOPP_ALIGN_DATA(16) T m_array[S];
 #elif defined(CRYPTOPP_BOOL_ALIGN16)
 	// There be demons here... Some platforms and small datatypes can
 	// make things go sideways. We experienced it on AIX with XLC. If
 	// we see anymore problems we should probably avoid the stack and
 	// move to aligned heap allocations.
-	T* GetAlignedArray() {return (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);}
+	T* GetAlignedArray() {
+		T* p_array = (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);
+		CRYPTOPP_ASSERT(IsAlignedOn(p_array, 16));
+		return p_array;
+	}
 	T m_array[S+8/sizeof(T)];
 #else
 	T* GetAlignedArray() {return m_array;}
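For anyone staring at the #elif branch, here is a standalone sketch of the rounding trick (my illustration, not library code). Under unsigned wraparound, (0 - addr) % 16 is the distance to the next 16-byte boundary, or 0 if the address is already aligned. Note the array carries only 8 bytes of slack (8/sizeof(T) elements), so the trick silently assumes the compiler already gives m_array at least 8-byte alignment; that may be exactly the kind of hidden requirement the commit message mentions.

    #include <cassert>
    #include <cstdint>

    unsigned char m_array[64 + 8];   // payload plus 8 bytes of slack

    unsigned char* GetAligned16()
    {
        // (0 - addr) % 16 == (16 - addr % 16) % 16: bytes to the next
        // 16-byte boundary. With only 8 bytes of slack, the bump must
        // not exceed 8, i.e. the base must already be 8-byte aligned.
        const uintptr_t addr = reinterpret_cast<uintptr_t>(m_array);
        unsigned char* p = m_array + (0 - addr) % 16;
        assert(reinterpret_cast<uintptr_t>(p) % 16 == 0);
        return p;
    }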

sha.cpp

@@ -1119,7 +1119,8 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
 #define SSE2_CombineState(i) \
 	AS2( movdqu xmm0, [edi+i*16])\
-	AS2( paddq xmm0, [ecx+i*16])\
+	AS2( movdqu xmm1, [ecx+i*16])\
+	AS2( paddq xmm0, xmm1)\
 	AS2( movdqu [ecx+i*16], xmm0)

 	SSE2_CombineState(0)
@@ -1147,7 +1148,7 @@
 // ANONYMOUS_NAMESPACE_END
 #endif // CRYPTOPP_SSE2_ASM_AVAILABLE

 ANONYMOUS_NAMESPACE_BEGIN
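The reason SSE2_CombineState needed the extra load: packed arithmetic like paddq has the same 16-byte alignment requirement on memory operands as movdqa, so paddq xmm0, [ecx+i*16] faults when [ecx] is only 8-byte aligned. Loading through movdqu first turns the add into a register-register operation with no alignment requirement. A rough intrinsics equivalent (the function name and pointer types are mine):

    #include <emmintrin.h>  // SSE2

    typedef unsigned long long word64;

    static inline void CombineState(word64* state, const word64* w)
    {
        // movdqu: unaligned 128-bit loads from each buffer
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(w));
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(state));
        // paddq in register form carries no alignment requirement
        b = _mm_add_epi64(b, a);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(state), b);
    }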

sosemanuk.cpp

@@ -412,10 +412,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
 	AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
 	AS2( mov SSE2_wordsLeft, WORD_REG(si))
-	AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
-	AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
-	AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
-	AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
+	AS2( movdqu xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
+	AS2( movdqu [SSE2_stateCopy+0*16], xmm0)
+	AS2( movdqu xmm0, [WORD_REG(ax)+1*16])
+	AS2( movdqu [SSE2_stateCopy+1*16], xmm0)
 	AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
 	AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
 	AS2( psrlq xmm0, 32)
@@ -507,10 +507,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( mov WORD_REG(si), SSE2_wordsLeft2)
 	ASL(1) // second inner loop, 16 words each, 5 iterations
-	AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
-	AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
-	AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
-	AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
+	AS2( movdqu xmm0, [WORD_REG(di)+0*20*4])
+	AS2( movdqu xmm2, [WORD_REG(di)+2*20*4])
+	AS2( movdqu xmm3, [WORD_REG(di)+3*20*4])
+	AS2( movdqu xmm1, [WORD_REG(di)+1*20*4])
 	// S2
 	AS2( movdqa xmm4, xmm0)
 	AS2( pand xmm0, xmm2)
@@ -596,10 +596,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASL(6) // save state
 	AS2( mov AS_REG_6, SSE2_state)
-	AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
-	AS2( movdqa [AS_REG_6+0*16], xmm0)
-	AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
-	AS2( movdqa [AS_REG_6+1*16], xmm0)
+	AS2( movdqu xmm0, [SSE2_stateCopy+0*16])
+	AS2( movdqu [AS_REG_6+0*16], xmm0)
+	AS2( movdqu xmm0, [SSE2_stateCopy+1*16])
+	AS2( movdqu [AS_REG_6+1*16], xmm0)
 	AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
 	AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
 	AS2( mov [AS_REG_6+10*4], ecx)