Fix alignment on Win32 and Solaris Sparc (PR #709)

These fixes were interesting in a morbid sort of way. I thought the FixedSizeAllocatorWithCleanup specializations faithfully reproduced the original allocator's semantics, but I was wrong on Win32 and Sparc. Also see Commit e054d36dc8.

It seems there was another requirement or dependency that we missed, and it was not readily apparent. If I am parsing the results correctly (and I may not be), the bit twiddling that produces 8-byte alignment had more influence than I originally thought, based on the use of CRYPTOPP_BOOL_ALIGN16 and T_Align16. Or maybe the alignment attributes specified by CRYPTOPP_ALIGN_DATA are not being honored as they should be for stack allocations.
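If you want to probe that last guess yourself, here is a minimal sketch (my illustration, not part of this check-in). It uses standard alignas(16) as a stand-in for CRYPTOPP_ALIGN_DATA, which expands to a compiler-specific attribute like __attribute__((aligned(16))) or __declspec(align(16)):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Stand-in for CRYPTOPP_ALIGN_DATA(16) on a stack allocation
        alignas(16) unsigned char stack_array[64];

        // If the attribute is honored for stack allocations, the low
        // four bits of the address are zero and this prints 0
        const uintptr_t addr = reinterpret_cast<uintptr_t>(stack_array);
        std::printf("addr %% 16 = %u\n", static_cast<unsigned>(addr % 16));
        return 0;
    }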

This check-in avoids some uses of x86 movdqa (aligned) in favor of movdqu (unaligned). The uses were concentrated on memory operands that turned out to be 8-byte aligned rather than 16-byte aligned. It is not clear to me how the specializations lost 8 bytes of alignment. The check-in also enlists CRYPTOPP_ASSERT to tell us when there's a problem so we don't need to go hunting for bugs.
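The movdqa/movdqu distinction is easier to see in intrinsics form. Below is a sketch of the pattern the check-in adopts (the helper name is mine): _mm_load_si128 compiles to movdqa and faults on a misaligned address, while _mm_loadu_si128 compiles to movdqu and tolerates any address, so debug builds flag the misalignment while release builds survive it.

    #include <cassert>
    #include <cstdint>
    #include <emmintrin.h>  // SSE2

    static inline __m128i LoadBlock(const void* p)
    {
        // Tell us when there's a problem...
        assert(reinterpret_cast<uintptr_t>(p) % 16 == 0);
        // ...but load unaligned so a misaligned pointer costs a few
        // cycles instead of a SIGSEGV/SIGBUS
        return _mm_loadu_si128(static_cast<const __m128i*>(p));
    }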
pull/710/head
Jeffrey Walton 2018-08-23 14:42:29 -04:00 committed by GitHub
parent e054d36dc8
commit afbd3e60f6
7 changed files with 86 additions and 72 deletions

iterhash.h

@@ -152,8 +152,8 @@ public:
 	{
 		CRYPTOPP_ASSERT(in != NULLPTR);
 		CRYPTOPP_ASSERT(out != NULLPTR);
-		CRYPTOPP_ASSERT(IsAligned<T_HashWordType>(in));
-		CRYPTOPP_ASSERT(IsAligned<T_HashWordType>(out));
+		CRYPTOPP_ASSERT(IsAligned<T_HashWordType*>(in));
+		CRYPTOPP_ASSERT(IsAligned<T_HashWordType*>(out));
 		ConditionalByteReverse(T_Endianness::ToEnum(), out, in, byteCount);
 	}

misc.h

@@ -2043,7 +2043,7 @@ inline T ConditionalByteReverse(ByteOrder order, T value)
 /// not part of a full element. If T is int (and int is 4 bytes), then
 /// <tt>byteCount = 10</tt> means only the first 2 elements or 8 bytes are
 /// reversed.
-/// \details The follwoing program should help illustrate the behavior.
+/// \details The following program should help illustrate the behavior.
 /// <pre>vector<word32> v1, v2;
 ///
 /// v1.push_back(1);
@@ -2063,7 +2063,7 @@ inline T ConditionalByteReverse(ByteOrder order, T value)
 /// for(unsigned int i = 0; i < v2.size(); i++)
 ///   cout << std::hex << v2[i] << " ";
 /// cout << endl;</pre>
-/// The program above results in the follwoing output.
+/// The program above results in the following output.
 /// <pre>V1: 00000001 00000002 00000003 00000004
 /// V2: 01000000 02000000 03000000 04000000</pre>
 /// \sa ConditionalByteReverse
@@ -2072,8 +2072,8 @@ void ByteReverse(T *out, const T *in, size_t byteCount)
 {
 	// Alignment check due to Issues 690
 	CRYPTOPP_ASSERT(byteCount % sizeof(T) == 0);
-	CRYPTOPP_ASSERT(IsAligned<T>(in));
-	CRYPTOPP_ASSERT(IsAligned<T>(out));
+	//CRYPTOPP_ASSERT(IsAligned<T*>(in));
+	//CRYPTOPP_ASSERT(IsAligned<T*>(out));
 	size_t count = byteCount/sizeof(T);
 	for (size_t i=0; i<count; i++)
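For convenience, here is the program from the Doxygen comment above in compilable form (the includes and the exact header path are my assumptions; adjust for your install):

    #include <cryptopp/misc.h>   // ByteReverse, word32
    #include <iostream>
    #include <vector>

    int main()
    {
        using namespace CryptoPP;

        std::vector<word32> v1, v2;
        for (word32 i = 1; i <= 4; i++)
            v1.push_back(i);
        v2.resize(v1.size());

        // Reverse all 16 bytes; a smaller byteCount reverses only the
        // leading whole elements
        ByteReverse<word32>(&v2[0], &v1[0], 16);

        std::cout << "V1: ";
        for (unsigned int i = 0; i < v1.size(); i++)
            std::cout << std::hex << v1[i] << " ";
        std::cout << "\nV2: ";
        for (unsigned int i = 0; i < v2.size(); i++)
            std::cout << std::hex << v2[i] << " ";
        std::cout << std::endl;   // 1 2 3 4 and 1000000 2000000 3000000 4000000
        return 0;
    }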

panama.cpp

@@ -93,10 +93,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS_PUSH_IF86( cx)
 #endif
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
-	AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-	AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-	AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+	AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+	AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+	AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
 	AS2( mov eax, dword ptr [AS_REG_2+4*16])
 	ASL(4)
@@ -184,8 +184,8 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	ASL(1)
 	AS2( test AS_REG_3, 15)
 	ASJ( jnz, 3, f)
-	AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
-	AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
+	AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
+	AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
 	AS2( add AS_REG_3, 32)
 	ASJ( jmp, 0, f)
 	ASL(3)
@@ -200,24 +200,26 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
 	AS2( and AS_REG_7, 31*32)
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
 	AS2( pxor xmm3, xmm0)
 	ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
-	AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
+	AS2( pxor xmm0, xmm5)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
 	AS2( pxor xmm1, xmm4)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
-	AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
+	AS2( pxor xmm4, xmm5)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)

 	// theta
-	AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
-	AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
-	AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
-	AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+	AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+	AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+	AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+	AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])

 #if CRYPTOPP_SSSE3_ASM_AVAILABLE
 	AS2( test AS_REG_6, 1)
@@ -271,16 +273,16 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
 	AS2( and AS_REG_7, 31*32)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
-	AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
 	AS2( movdqa xmm6, xmm4)
 	AS2( punpcklqdq xmm4, xmm5)
 	AS2( punpckhqdq xmm6, xmm5)
 	AS2( pxor xmm3, xmm4)
 	AS2( pxor xmm2, xmm6)
-	AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
-	AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
+	AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
+	AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
 	AS2( movdqa xmm6, xmm4)
 	AS2( punpcklqdq xmm4, xmm5)
 	AS2( punpckhqdq xmm6, xmm5)
@@ -294,10 +296,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 	// save state
 	AS2( mov [AS_REG_2+4*16], eax)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
-	AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+3*16], xmm3)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+2*16], xmm2)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+1*16], xmm1)
+	AS2( movdqu XMMWORD_PTR [AS_REG_2+0*16], xmm0)

 #if CRYPTOPP_BOOL_X86
 	AS2( add esp, 4)
@@ -329,7 +331,7 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
 #else
 }
 #endif
-#endif // #ifdef CRYPTOPP_SSE2_ASM_AVAILABLE
+#endif // CRYPTOPP_SSE2_ASM_AVAILABLE

 #ifndef CRYPTOPP_GENERATE_X64_MASM

salsa.cpp

@@ -265,12 +265,12 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define SSE2_EXPAND_S(i, j) \
 	ASS( pshufd xmm4, xmm##i, j, j, j, j) \
-	AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
+	AS2( movdqu [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)

-	AS2( movdqa xmm0, [REG_state + 0*16])
-	AS2( movdqa xmm1, [REG_state + 1*16])
-	AS2( movdqa xmm2, [REG_state + 2*16])
-	AS2( movdqa xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm0, [REG_state + 0*16])
+	AS2( movdqu xmm1, [REG_state + 1*16])
+	AS2( movdqu xmm2, [REG_state + 2*16])
+	AS2( movdqu xmm3, [REG_state + 3*16])
 	SSE2_EXPAND_S(0, 0)
 	SSE2_EXPAND_S(0, 1)
 	SSE2_EXPAND_S(0, 2)
@@ -311,15 +311,15 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( pxor xmm##b, xmm4) \
 	AS2( pxor xmm##b, xmm5)

-#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
-#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
+#define L01(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
+#define L02(A,B,C,D,a,b,c,d,i) AS2( movdqu xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
-#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
+#define L09(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + b*16], xmm##A)
 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
@@ -327,7 +327,7 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
-#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
+#define L17(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + c*16], xmm##A)
 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
@@ -335,14 +335,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
-#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
+#define L25(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + d*16], xmm##A)
 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
-#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
+#define L32(A,B,C,D,a,b,c,d,i) AS2( movdqu [SSE2_WORKSPACE + a*16], xmm##A)

 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
 	L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
@@ -453,13 +453,13 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASJ( jnz, 6, b)

 #define SSE2_OUTPUT_4(a, b, c, d) \
-	AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
+	AS2( movdqu xmm4, [SSE2_WORKSPACE + a*16 + 256])\
 	AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
-	AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
+	AS2( movdqu xmm5, [SSE2_WORKSPACE + b*16 + 256])\
 	AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
-	AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
+	AS2( movdqu xmm6, [SSE2_WORKSPACE + c*16 + 256])\
 	AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
-	AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
+	AS2( movdqu xmm7, [SSE2_WORKSPACE + d*16 + 256])\
 	AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
 	ASC( call, SSE2_Salsa_Output)
@@ -480,10 +480,10 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASL(5)
 	AS2( sub REG_iterationCount, 1)
 	ASJ( jl, 4, f)
-	AS2( movdqa xmm0, [REG_state + 0*16])
-	AS2( movdqa xmm1, [REG_state + 1*16])
-	AS2( movdqa xmm2, [REG_state + 2*16])
-	AS2( movdqa xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm0, [REG_state + 0*16])
+	AS2( movdqu xmm1, [REG_state + 1*16])
+	AS2( movdqu xmm2, [REG_state + 2*16])
+	AS2( movdqu xmm3, [REG_state + 3*16])
 	AS2( mov REG_roundsLeft, REG_rounds)
 	ASL(0)
@@ -504,10 +504,14 @@ void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( sub REG_roundsLeft, 2)
 	ASJ( jnz, 0, b)
-	AS2( paddd xmm0, [REG_state + 0*16])
-	AS2( paddd xmm1, [REG_state + 1*16])
-	AS2( paddd xmm2, [REG_state + 2*16])
-	AS2( paddd xmm3, [REG_state + 3*16])
+	AS2( movdqu xmm6, [REG_state + 0*16])
+	AS2( paddd xmm0, xmm6)
+	AS2( movdqu xmm7, [REG_state + 1*16])
+	AS2( paddd xmm1, xmm7)
+	AS2( movdqu xmm6, [REG_state + 2*16])
+	AS2( paddd xmm2, xmm6)
+	AS2( movdqu xmm7, [REG_state + 3*16])
+	AS2( paddd xmm3, xmm7)
 	AS2( add dword ptr [REG_state + 8*4], 1)
 	AS2( adc dword ptr [REG_state + 5*4], 0)

secblock.h

@@ -469,14 +469,21 @@ public:
 private:

 #if defined(CRYPTOPP_BOOL_ALIGN16) && defined(CRYPTOPP_ALIGN_ATTRIBUTE)
-	T* GetAlignedArray() {return m_array;}
+	T* GetAlignedArray() {
+		CRYPTOPP_ASSERT(IsAlignedOn(m_array, 16));
+		return m_array;
+	}
 	CRYPTOPP_ALIGN_DATA(16) T m_array[S];
 #elif defined(CRYPTOPP_BOOL_ALIGN16)
 	// There be demons here... Some platforms and small datatypes can
 	// make things go sideways. We experienced it on AIX with XLC. If
 	// we see anymore problems we should probably avoid the stack and
 	// move to aligned heap allocations.
-	T* GetAlignedArray() {return (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);}
+	T* GetAlignedArray() {
+		T* p_array = (T*)(void*)(((byte*)m_array) + (0-(size_t)m_array)%16);
+		CRYPTOPP_ASSERT(IsAlignedOn(p_array, 16));
+		return p_array;
+	}
 	T m_array[S+8/sizeof(T)];
 #else
 	T* GetAlignedArray() {return m_array;}
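For anyone staring at the #elif branch, here is a standalone sketch of the rounding trick (my illustration, not library code). Under unsigned wraparound, (0 - addr) % 16 is the distance to the next 16-byte boundary, or 0 if the address is already aligned. Note the array carries only 8 bytes of slack (8/sizeof(T) elements), so the trick silently assumes the compiler already gives m_array at least 8-byte alignment; that may be exactly the kind of hidden requirement the commit message mentions.

    #include <cassert>
    #include <cstdint>

    unsigned char m_array[64 + 8];   // payload plus 8 bytes of slack

    unsigned char* GetAligned16()
    {
        // (0 - addr) % 16 == (16 - addr % 16) % 16: bytes to the next
        // 16-byte boundary. With only 8 bytes of slack, the bump must
        // not exceed 8, i.e. the base must already be 8-byte aligned.
        const uintptr_t addr = reinterpret_cast<uintptr_t>(m_array);
        unsigned char* p = m_array + (0 - addr) % 16;
        assert(reinterpret_cast<uintptr_t>(p) % 16 == 0);
        return p;
    }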

sha.cpp

@@ -1119,7 +1119,8 @@ CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const
 #define SSE2_CombineState(i) \
 	AS2( movdqu xmm0, [edi+i*16])\
-	AS2( paddq xmm0, [ecx+i*16])\
+	AS2( movdqu xmm1, [ecx+i*16])\
+	AS2( paddq xmm0, xmm1)\
 	AS2( movdqu [ecx+i*16], xmm0)

 	SSE2_CombineState(0)
@@ -1147,7 +1148,7 @@
 // ANONYMOUS_NAMESPACE_END
 #endif // CRYPTOPP_SSE2_ASM_AVAILABLE

 ANONYMOUS_NAMESPACE_BEGIN
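The reason SSE2_CombineState needed the extra load: packed arithmetic like paddq has the same 16-byte alignment requirement on memory operands as movdqa, so paddq xmm0, [ecx+i*16] faults when [ecx] is only 8-byte aligned. Loading through movdqu first turns the add into a register-register operation with no alignment requirement. A rough intrinsics equivalent (the function name and pointer types are mine):

    #include <emmintrin.h>  // SSE2

    typedef unsigned long long word64;

    static inline void CombineState(word64* state, const word64* w)
    {
        // movdqu: unaligned 128-bit loads from each buffer
        __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(w));
        __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(state));
        // paddq in register form carries no alignment requirement
        b = _mm_add_epi64(b, a);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(state), b);
    }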

sosemanuk.cpp

@@ -412,10 +412,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
 	AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
 	AS2( mov SSE2_wordsLeft, WORD_REG(si))
-	AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
-	AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
-	AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
-	AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
+	AS2( movdqu xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
+	AS2( movdqu [SSE2_stateCopy+0*16], xmm0)
+	AS2( movdqu xmm0, [WORD_REG(ax)+1*16])
+	AS2( movdqu [SSE2_stateCopy+1*16], xmm0)
 	AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
 	AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
 	AS2( psrlq xmm0, 32)
@@ -507,10 +507,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	AS2( mov WORD_REG(si), SSE2_wordsLeft2)
 	ASL(1) // second inner loop, 16 words each, 5 iterations
-	AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
-	AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
-	AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
-	AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
+	AS2( movdqu xmm0, [WORD_REG(di)+0*20*4])
+	AS2( movdqu xmm2, [WORD_REG(di)+2*20*4])
+	AS2( movdqu xmm3, [WORD_REG(di)+3*20*4])
+	AS2( movdqu xmm1, [WORD_REG(di)+1*20*4])
 	// S2
 	AS2( movdqa xmm4, xmm0)
 	AS2( pand xmm0, xmm2)
@@ -596,10 +596,10 @@ void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output
 	ASL(6) // save state
 	AS2( mov AS_REG_6, SSE2_state)
-	AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
-	AS2( movdqa [AS_REG_6+0*16], xmm0)
-	AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
-	AS2( movdqa [AS_REG_6+1*16], xmm0)
+	AS2( movdqu xmm0, [SSE2_stateCopy+0*16])
+	AS2( movdqu [AS_REG_6+0*16], xmm0)
+	AS2( movdqu xmm0, [SSE2_stateCopy+1*16])
+	AS2( movdqu [AS_REG_6+1*16], xmm0)
 	AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
 	AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
 	AS2( mov [AS_REG_6+10*4], ecx)