MMX/SSE2 optimizations

2007-04-15 23:00:27 +00:00 · 2007-04-15 23:00:27 +00:00 · 643b302227
parent 3b89824be3
commit 643b302227
10 changed files with 2786 additions and 2177 deletions
--- a/integer.cpp
+++ b/integer.cpp
--- a/integer.h
+++ b/integer.h
@ -11,44 +11,13 @@

 NAMESPACE_BEGIN(CryptoPP)

-#if defined(SSE2_INTRINSICS_AVAILABLE)
-	template <class T>
-	class AlignedAllocator : public AllocatorBase<T>
-	{
-	public:
-		CRYPTOPP_INHERIT_ALLOCATOR_TYPES
-
-		pointer allocate(size_type n, const void *);
-		void deallocate(void *p, size_type n);
-		pointer reallocate(T *p, size_type oldSize, size_type newSize, bool preserve)
-		{
-			return StandardReallocate(*this, p, oldSize, newSize, preserve);
-		}
-
-	#if !(defined(CRYPTOPP_MALLOC_ALIGNMENT_IS_16) || defined(CRYPTOPP_MEMALIGN_AVAILABLE) || defined(CRYPTOPP_MM_MALLOC_AVAILABLE))
-	#define CRYPTOPP_NO_ALIGNED_ALLOC
-		AlignedAllocator() : m_pBlock(NULL) {}
-	protected:
-		void *m_pBlock;
-	#endif
-	};
-
-	#ifdef CRYPTOPP_IMPORTS
-		CRYPTOPP_DLL_TEMPLATE_CLASS AlignedAllocator<word>;
-	#endif
-
-	typedef SecBlock<word, AlignedAllocator<word> > SecAlignedWordBlock;
-#else
-	typedef SecWordBlock SecAlignedWordBlock;
-#endif
-
-void CRYPTOPP_DLL CRYPTOPP_API DisableSSE2();
-
 struct InitializeInteger	// used to initialize static variables
 {
 	InitializeInteger();
 };

+typedef SecBlock<word, AllocatorWithCleanup<word, CRYPTOPP_BOOL_X86> > IntegerSecBlock;	// storage type of Integer::reg; the CRYPTOPP_BOOL_X86 argument presumably requests aligned allocation on x86 -- TODO confirm in secblock.h
+
 //! multiple precision integer and basic arithmetics
 /*! This class can represent positive and negative integers
 	with absolute value less than (256**sizeof(word)) ** (256**sizeof(int)).
@ -406,7 +375,7 @@ private:
 	friend void PositiveMultiply(Integer &product, const Integer &a, const Integer &b);
 	friend void PositiveDivide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor);

-	SecAlignedWordBlock reg;
+	IntegerSecBlock reg;
 	Sign sign;
 };

--- a/rijndael.cpp
+++ b/rijndael.cpp
@ -51,10 +51,7 @@ being unloaded from L1 cache, until that round is finished.

 #include "rijndael.h"
 #include "misc.h"
-
-#ifdef CRYPTOPP_L1_CACHE_ALIGN_NOT_AVAILABLE
-#pragma message("Don't know how to align data on L1 cache boundary. Defense against AES timing attack may be affected.")
-#endif
+#include "cpu.h"

 NAMESPACE_BEGIN(CryptoPP)

@ -122,25 +119,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 		for (i = 1; i < m_rounds; i++) {
 			rk += 4;
 			rk[0] =
-				Td0[Se[GETBYTE(rk[0], 3)]] ^
-				Td1[Se[GETBYTE(rk[0], 2)]] ^
-				Td2[Se[GETBYTE(rk[0], 1)]] ^
-				Td3[Se[GETBYTE(rk[0], 0)]];
+				Td[0*256+Se[GETBYTE(rk[0], 3)]] ^	// Td is now a single 4*256-word array: Td[k*256+x] replaces the former Tdk[x]
+				Td[1*256+Se[GETBYTE(rk[0], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[0], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[0], 0)]];
 			rk[1] =
-				Td0[Se[GETBYTE(rk[1], 3)]] ^
-				Td1[Se[GETBYTE(rk[1], 2)]] ^
-				Td2[Se[GETBYTE(rk[1], 1)]] ^
-				Td3[Se[GETBYTE(rk[1], 0)]];
+				Td[0*256+Se[GETBYTE(rk[1], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[1], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[1], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[1], 0)]];
 			rk[2] =
-				Td0[Se[GETBYTE(rk[2], 3)]] ^
-				Td1[Se[GETBYTE(rk[2], 2)]] ^
-				Td2[Se[GETBYTE(rk[2], 1)]] ^
-				Td3[Se[GETBYTE(rk[2], 0)]];
+				Td[0*256+Se[GETBYTE(rk[2], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[2], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[2], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[2], 0)]];
 			rk[3] =
-				Td0[Se[GETBYTE(rk[3], 3)]] ^
-				Td1[Se[GETBYTE(rk[3], 2)]] ^
-				Td2[Se[GETBYTE(rk[3], 1)]] ^
-				Td3[Se[GETBYTE(rk[3], 0)]];
+				Td[0*256+Se[GETBYTE(rk[3], 3)]] ^
+				Td[1*256+Se[GETBYTE(rk[3], 2)]] ^
+				Td[2*256+Se[GETBYTE(rk[3], 1)]] ^
+				Td[3*256+Se[GETBYTE(rk[3], 0)]];
 		}
 	}

@ -148,14 +145,244 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
 	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
 }

-const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4;
-#ifdef IS_BIG_ENDIAN
-const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0;
-#else
-const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3;
-#endif
+#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code

 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	if (HasMMX())	// the assembly below parks state words, the key pointer and the loop bound in MMX registers
+	{
+		const word32 *k = m_key;
+		const word32 *kLoopEnd = k + m_rounds*4;	// the main loop stops when the key pointer reaches this address
+#ifdef __GNUC__
+		word32 t0, t1, t2, t3;	// receive the four result words through the asm output constraints
+		__asm__ __volatile__
+		(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)
+		AS1(	push	ebp)
+		AS2(	mov		ebp, eax)	// ebp = Te (arrives in eax via the input constraints)
+		AS2(	movd	mm5, ecx)	// mm5 = kLoopEnd (arrives in ecx)
+#else
+		AS2(	mov		edx, g_cacheLineSize)	// edx = stride of the preload loop
+		AS2(	mov		edi, inBlock)
+		AS2(	mov		esi, k)
+		AS2(	movd	mm5, kLoopEnd)
+		AS1(	push	ebp)
+		AS2(	lea		ebp, Te)
+#endif
+		AS2(	mov		eax, [esi+0*4])	// s0
+		AS2(	xor		eax, [edi+0*4])
+		AS2(	movd	mm0, eax)	// mm0 = s0
+		AS2(	mov		ebx, [esi+1*4])
+		AS2(	xor		ebx, [edi+1*4])
+		AS2(	movd	mm1, ebx)	// mm1 = s1
+		AS2(	and		ebx, eax)	// ebx accumulates the AND of all four state words; only the data dependency matters
+		AS2(	mov		eax, [esi+2*4])
+		AS2(	xor		eax, [edi+2*4])
+		AS2(	movd	mm2, eax)	// mm2 = s2
+		AS2(	and		ebx, eax)
+		AS2(	mov		ecx, [esi+3*4])	// s3 stays in ecx
+		AS2(	xor		ecx, [edi+3*4])
+		AS2(	and		ebx, ecx)

+		// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
+		AS2(	and		ebx, 0)	// ebx = 0, yet still dependent on the loads above
+		AS2(	mov		edi, ebx)	// make index depend on previous loads to simulate lfence
+		ASL(2)	// preload loop: four touches per pass, each edx bytes apart
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	and		ebx, [ebp+edi])
+		AS2(	add		edi, edx)
+		AS2(	cmp		edi, 1024)	// 1024 bytes = the first 256-word table
+		ASJ(	jl,		2, b)
+		AS2(	and		ebx, [ebp+1020])	// also touch the last word of that table
+		AS2(	movd	mm6, ebx)	// ebx is still 0: xoring it in ties the state to the preload without changing it
+		AS2(	pxor	mm2, mm6)
+		AS2(	pxor	mm1, mm6)
+		AS2(	pxor	mm0, mm6)
+		AS2(	xor		ecx, ebx)

+		AS2(	mov		edi, [esi+4*4])	// t0
+		AS2(	mov		eax, [esi+5*4])	// t1
+		AS2(	mov		ebx, [esi+6*4])	// t2
+		AS2(	mov		edx, [esi+7*4])	// t3
+		AS2(	add		esi, 8*4)	// step past the eight key words loaded so far
+		AS2(	movd	mm4, esi)	// mm4 = saved key pointer; esi is reused as a table index below
+
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(d, [ebp+0*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(a, [ebp+3*1024+4*esi])	// t is a register letter (tl/th/etx are pasted from it); a..d expand to "xor <reg>"

+#define s0		xor edi	// sN/tN expand to the xor instruction plus the register that accumulates that state word
+#define s1		xor eax
+#define s2		xor ebx
+#define s3		xor ecx
+#define t0		xor edi
+#define t1		xor eax
+#define t2		xor ebx
+#define t3		xor edx

+		QUARTER_ROUND(c, t0, t1, t2, t3)	// consumes s3, which is already in ecx
+		AS2(	movd	ecx, mm2)	// ecx = s2
+		QUARTER_ROUND(c, t3, t0, t1, t2)
+		AS2(	movd	ecx, mm1)	// ecx = s1
+		QUARTER_ROUND(c, t2, t3, t0, t1)
+		AS2(	movd	ecx, mm0)	// ecx = s0
+		QUARTER_ROUND(c, t1, t2, t3, t0)
+		AS2(	movd	mm2, ebx)	// park t2, t1, t0 in mm2, mm1, mm0; t3 stays in edx
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)
+#undef QUARTER_ROUND
+
+		AS2(	movd	esi, mm4)	// restore the key pointer (esi was used as a table index)

+		ASL(0)	// top of the main loop: each pass performs two rounds
+		AS2(	mov		edi, [esi+0*4])	// start s0..s3 from the next four key words
+		AS2(	mov		eax, [esi+1*4])
+		AS2(	mov		ebx, [esi+2*4])
+		AS2(	mov		ecx, [esi+3*4])

+#define QUARTER_ROUND(t, a, b, c, d)	\
+	AS2(movzx esi, t##l)\
+	AS2(a, [ebp+3*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(b, [ebp+2*1024+4*esi])\
+	AS2(shr e##t##x, 16)\
+	AS2(movzx esi, t##l)\
+	AS2(c, [ebp+1*1024+4*esi])\
+	AS2(movzx esi, t##h)\
+	AS2(d, [ebp+0*1024+4*esi])	// lowest byte of the source register indexes the table at +3*1024, highest byte the table at +0

+		QUARTER_ROUND(d, s0, s1, s2, s3)	// consumes t3, which is still in edx
+		AS2(	movd	edx, mm2)	// edx = t2
+		QUARTER_ROUND(d, s3, s0, s1, s2)
+		AS2(	movd	edx, mm1)	// edx = t1
+		QUARTER_ROUND(d, s2, s3, s0, s1)
+		AS2(	movd	edx, mm0)	// edx = t0
+		QUARTER_ROUND(d, s1, s2, s3, s0)
+		AS2(	movd	esi, mm4)	// QUARTER_ROUND overwrote esi; reload the key pointer
+		AS2(	movd	mm2, ebx)	// park s2, s1, s0 in mm2, mm1, mm0; s3 stays in ecx
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)

+		AS2(	mov		edi, [esi+4*4])	// start t0..t3 from the following four key words
+		AS2(	mov		eax, [esi+5*4])
+		AS2(	mov		ebx, [esi+6*4])
+		AS2(	mov		edx, [esi+7*4])

+		QUARTER_ROUND(c, t0, t1, t2, t3)	// consumes s3 from ecx
+		AS2(	movd	ecx, mm2)	// ecx = s2
+		QUARTER_ROUND(c, t3, t0, t1, t2)
+		AS2(	movd	ecx, mm1)	// ecx = s1
+		QUARTER_ROUND(c, t2, t3, t0, t1)
+		AS2(	movd	ecx, mm0)	// ecx = s0
+		QUARTER_ROUND(c, t1, t2, t3, t0)
+		AS2(	movd	mm2, ebx)	// park t2, t1, t0 again; t3 stays in edx
+		AS2(	movd	mm1, eax)
+		AS2(	movd	mm0, edi)

+		AS2(	movd	esi, mm4)
+		AS2(	movd	edi, mm5)	// edi = kLoopEnd
+		AS2(	add		esi, 8*4)	// advance the key pointer by the eight words used in this pass
+		AS2(	movd	mm4, esi)
+		AS2(	cmp		edi, esi)
+		ASJ(	jne,	0, b)	// repeat until the key pointer reaches kLoopEnd

+#undef QUARTER_ROUND
+#undef s0
+#undef s1
+#undef s2
+#undef s3
+#undef t0
+#undef t1
+#undef t2
+#undef t3
+
+		AS2(	mov		eax, [edi+0*4])	// edi equals kLoopEnd here: load the last four key words into eax, ecx, esi, edi
+		AS2(	mov		ecx, [edi+1*4])
+		AS2(	mov		esi, [edi+2*4])
+		AS2(	mov		edi, [edi+3*4])

+#define QUARTER_ROUND(a, b, c, d)	\
+	AS2(	movzx	ebx, dl)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 3*8)\
+	AS2(	xor		a, ebx)\
+	AS2(	movzx	ebx, dh)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 2*8)\
+	AS2(	xor		b, ebx)\
+	AS2(	shr		edx, 16)\
+	AS2(	movzx	ebx, dl)\
+	AS2(	shr		edx, 8)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*ebx])\
+	AS2(	shl		ebx, 1*8)\
+	AS2(	xor		c, ebx)\
+	AS2(	movzx	ebx, BYTE PTR [ebp+1+4*edx])\
+	AS2(	xor		d, ebx)	// reads the byte at offset 1 of each Te entry -- presumably the plain S-box value; TODO confirm against the table layout

+		QUARTER_ROUND(eax, ecx, esi, edi)	// consumes t3, which is still in edx
+		AS2(	movd	edx, mm2)	// edx = t2
+		QUARTER_ROUND(edi, eax, ecx, esi)
+		AS2(	movd	edx, mm1)	// edx = t1
+		QUARTER_ROUND(esi, edi, eax, ecx)
+		AS2(	movd	edx, mm0)	// edx = t0
+		QUARTER_ROUND(ecx, esi, edi, eax)

+#undef QUARTER_ROUND

+		AS1(	pop		ebp)	// restore the frame pointer saved before the table base was loaded into ebp
+		AS1(	emms)	// leave MMX state before returning to compiled code

+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)	// result words come back in eax, ecx, esi, edi
+			: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
+			: "memory", "cc"
+		);

+		if (xorBlock)	// xorBlock is optional; when present it is xored into the result before the store
+		{
+			t0 ^= ((const word32 *)xorBlock)[0];
+			t1 ^= ((const word32 *)xorBlock)[1];
+			t2 ^= ((const word32 *)xorBlock)[2];
+			t3 ^= ((const word32 *)xorBlock)[3];
+		}
+		((word32 *)outBlock)[0] = t0;
+		((word32 *)outBlock)[1] = t1;
+		((word32 *)outBlock)[2] = t2;
+		((word32 *)outBlock)[3] = t3;
+#else
+		AS2(	mov		ebx, xorBlock)	// non-GCC path: the optional xor and the store are done in assembly
+		AS2(	test	ebx, ebx)
+		ASJ(	jz,		1, f)	// skip the xor when xorBlock is null
+		AS2(	xor		eax, [ebx+0*4])
+		AS2(	xor		ecx, [ebx+1*4])
+		AS2(	xor		esi, [ebx+2*4])
+		AS2(	xor		edi, [ebx+3*4])
+		ASL(1)
+		AS2(	mov		ebx, outBlock)
+		AS2(	mov		[ebx+0*4], eax)
+		AS2(	mov		[ebx+1*4], ecx)
+		AS2(	mov		[ebx+2*4], esi)
+		AS2(	mov		[ebx+3*4], edi)
+#endif
+	}
+	else	// no MMX at run time: fall through to the portable C++ implementation
+#endif	// #ifdef CRYPTOPP_X86_ASM_AVAILABLE
 	{
 	word32 s0, s1, s2, s3, t0, t1, t2, t3;
 	const word32 *rk = m_key;
@ -171,95 +398,68 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	rk += 8;

 	// timing attack countermeasure. see comments at top for more details
+	const int cacheLineSize = GetCacheLineSize();	// stride of the table-touching loops; NOTE(review): the loops never advance if this is 0 -- confirm it cannot be
 	unsigned int i;
 	word32 u = 0;
-	for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]);
+	for (i=0; i<1024; i+=cacheLineSize)	// i is a byte offset: one load per cache line across the first 1024 bytes of Te
+		u &= *(const word32 *)(((const byte *)Te)+i);	// u starts at 0, so the AND leaves it 0; only the loads matter
+	u &= Te[255];	// last word of that range; presumably covers the final line when Te is not line-aligned
 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

 	// first round
-    t0 ^=
-        Te0[GETBYTE(s0, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s3, s_i0)], 24);
-    t1 ^=
-        Te0[GETBYTE(s1, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s0, s_i0)], 24);
-    t2 ^=
-        Te0[GETBYTE(s2, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s1, s_i0)], 24);
-    t3 ^=
-        Te0[GETBYTE(s3, s_i3)] ^
-        rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^
-        rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^
-        rotrFixed(Te0[GETBYTE(s2, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= rotrFixed(Te[byte(t)], 24);	t >>= 8;\
+		b ^= rotrFixed(Te[byte(t)], 16);	t >>= 8;\
+		c ^= rotrFixed(Te[byte(t)], 8);	t >>= 8;\
+		d ^= Te[t];	// after three shifts only the top byte of t is left, so no byte() cast is needed
+#else
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		d ^= Te[byte(t)];					t >>= 8;\
+		c ^= rotrFixed(Te[byte(t)], 8);	t >>= 8;\
+		b ^= rotrFixed(Te[byte(t)], 16);	t >>= 8;\
+		a ^= rotrFixed(Te[t], 24);	// same lookups as the big-endian variant with the rotation amounts reversed
+#endif

+	QUARTER_ROUND(s3, t0, t1, t2, t3)	// each call consumes one s word (destroying it) and xors one lookup into each of t0..t3
+	QUARTER_ROUND(s2, t3, t0, t1, t2)
+	QUARTER_ROUND(s1, t2, t3, t0, t1)
+	QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND

 	// Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
 	{
-        s0 =
-            Te0[GETBYTE(t0, 3)] ^
-            Te1[GETBYTE(t1, 2)] ^
-            Te2[GETBYTE(t2, 1)] ^
-            Te3[GETBYTE(t3, 0)] ^
-            rk[0];
-        s1 =
-            Te0[GETBYTE(t1, 3)] ^
-            Te1[GETBYTE(t2, 2)] ^
-            Te2[GETBYTE(t3, 1)] ^
-            Te3[GETBYTE(t0, 0)] ^
-            rk[1];
-        s2 =
-            Te0[GETBYTE(t2, 3)] ^
-            Te1[GETBYTE(t3, 2)] ^
-            Te2[GETBYTE(t0, 1)] ^
-            Te3[GETBYTE(t1, 0)] ^
-            rk[2];
-        s3 =
-            Te0[GETBYTE(t3, 3)] ^
-            Te1[GETBYTE(t0, 2)] ^
-            Te2[GETBYTE(t1, 1)] ^
-            Te3[GETBYTE(t2, 0)] ^
-            rk[3];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= Te[3*256+byte(t)]; t >>= 8;\
+		b ^= Te[2*256+byte(t)]; t >>= 8;\
+		c ^= Te[1*256+byte(t)]; t >>= 8;\
+		d ^= Te[t];

-        t0 =
-            Te0[GETBYTE(s0, 3)] ^
-            Te1[GETBYTE(s1, 2)] ^
-            Te2[GETBYTE(s2, 1)] ^
-            Te3[GETBYTE(s3, 0)] ^
-            rk[4];
-        t1 =
-            Te0[GETBYTE(s1, 3)] ^
-            Te1[GETBYTE(s2, 2)] ^
-            Te2[GETBYTE(s3, 1)] ^
-            Te3[GETBYTE(s0, 0)] ^
-            rk[5];
-        t2 =
-            Te0[GETBYTE(s2, 3)] ^
-            Te1[GETBYTE(s3, 2)] ^
-            Te2[GETBYTE(s0, 1)] ^
-            Te3[GETBYTE(s1, 0)] ^
-            rk[6];
-        t3 =
-            Te0[GETBYTE(s3, 3)] ^
-            Te1[GETBYTE(s0, 2)] ^
-            Te2[GETBYTE(s1, 1)] ^
-            Te3[GETBYTE(s2, 0)] ^
-            rk[7];
+		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+		QUARTER_ROUND(t3, s0, s1, s2, s3)
+		QUARTER_ROUND(t2, s3, s0, s1, s2)
+		QUARTER_ROUND(t1, s2, s3, s0, s1)
+		QUARTER_ROUND(t0, s1, s2, s3, s0)
+
+		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+		QUARTER_ROUND(s3, t0, t1, t2, t3)
+		QUARTER_ROUND(s2, t3, t0, t1, t2)
+		QUARTER_ROUND(s1, t2, t3, t0, t1)
+		QUARTER_ROUND(s0, t1, t2, t3, t0)
+#undef QUARTER_ROUND

        rk += 8;
    } while (--r);

 	// timing attack countermeasure. see comments at top for more details
 	u = 0;
-	for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]);
+	for (i=0; i<256; i+=cacheLineSize)
+		u &= *(const word32 *)(Se+i);
+	u &= *(const word32 *)(Se+252);
 	t0 |= u; t1 |= u; t2 |= u; t3 |= u;

 	word32 tbw[4];
@ -267,23 +467,17 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	word32 *const obw = (word32 *)outBlock;
 	const word32 *const xbw = (const word32 *)xorBlock;

-	// last round
-	tempBlock[0] = Se[GETBYTE(t0, 3)];
-	tempBlock[1] = Se[GETBYTE(t1, 2)];
-	tempBlock[2] = Se[GETBYTE(t2, 1)];
-	tempBlock[3] = Se[GETBYTE(t3, 0)];
-	tempBlock[4] = Se[GETBYTE(t1, 3)];
-	tempBlock[5] = Se[GETBYTE(t2, 2)];
-	tempBlock[6] = Se[GETBYTE(t3, 1)];
-	tempBlock[7] = Se[GETBYTE(t0, 0)];
-	tempBlock[8] = Se[GETBYTE(t2, 3)];
-	tempBlock[9] = Se[GETBYTE(t3, 2)];
-	tempBlock[10] = Se[GETBYTE(t0, 1)];
-	tempBlock[11] = Se[GETBYTE(t1, 0)];
-	tempBlock[12] = Se[GETBYTE(t3, 3)];
-	tempBlock[13] = Se[GETBYTE(t0, 2)];
-	tempBlock[14] = Se[GETBYTE(t1, 1)];
-	tempBlock[15] = Se[GETBYTE(t2, 0)];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	tempBlock[a] = Se[byte(t)]; t >>= 8;\
+	tempBlock[b] = Se[byte(t)]; t >>= 8;\
+	tempBlock[c] = Se[byte(t)]; t >>= 8;\
+	tempBlock[d] = Se[t];
+
+	QUARTER_ROUND(t2, 15, 2, 5, 8)
+	QUARTER_ROUND(t1, 11, 14, 1, 4)
+	QUARTER_ROUND(t0, 7, 10, 13, 0)
+	QUARTER_ROUND(t3, 3, 6, 9, 12)
+#undef QUARTER_ROUND

 	if (xbw)
 	{
@ -300,6 +494,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 		obw[3] = tbw[3] ^ rk[3];
 	}
 	}
+}

 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
@ -317,95 +512,68 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	rk += 8;

 	// timing attack countermeasure. see comments at top for more details
+	const int cacheLineSize = GetCacheLineSize();	// stride of the table-touching loops; NOTE(review): the loops never advance if this is 0 -- confirm it cannot be
 	unsigned int i;
 	word32 u = 0;
-	for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]);
+	for (i=0; i<1024; i+=cacheLineSize)	// i is a byte offset: one load per cache line across the first 1024 bytes of Td
+		u &= *(const word32 *)(((const byte *)Td)+i);	// u starts at 0, so the AND leaves it 0; only the loads matter
+	u &= Td[255];	// last word of that range; presumably covers the final line when Td is not line-aligned
 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

 	// first round
-    t0 ^=
-        Td0[GETBYTE(s0, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s1, s_i0)], 24);
-    t1 ^=
-        Td0[GETBYTE(s1, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s2, s_i0)], 24);
-    t2 ^=
-        Td0[GETBYTE(s2, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s3, s_i0)], 24);
-    t3 ^=
-        Td0[GETBYTE(s3, s_i3)] ^
-        rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^
-        rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^
-        rotrFixed(Td0[GETBYTE(s0, s_i0)], 24);
+#ifdef IS_BIG_ENDIAN
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= rotrFixed(Td[byte(t)], 24);	t >>= 8;\
+		b ^= rotrFixed(Td[byte(t)], 16);	t >>= 8;\
+		c ^= rotrFixed(Td[byte(t)], 8);		t >>= 8;\
+		d ^= Td[t];	// after three shifts only the top byte of t is left, so no byte() cast is needed
+#else
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		d ^= Td[byte(t)];					t >>= 8;\
+		c ^= rotrFixed(Td[byte(t)], 8);		t >>= 8;\
+		b ^= rotrFixed(Td[byte(t)], 16);	t >>= 8;\
+		a ^= rotrFixed(Td[t], 24);	// same lookups as the big-endian variant with the rotation amounts reversed
+#endif

+	QUARTER_ROUND(s3, t2, t1, t0, t3)	// each call consumes one s word (destroying it) and xors one lookup into each of t0..t3
+	QUARTER_ROUND(s2, t1, t0, t3, t2)
+	QUARTER_ROUND(s1, t0, t3, t2, t1)
+	QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND

 	// Nr - 2 full rounds:
    unsigned int r = m_rounds/2 - 1;
    do
 	{
-        s0 =
-            Td0[GETBYTE(t0, 3)] ^
-            Td1[GETBYTE(t3, 2)] ^
-            Td2[GETBYTE(t2, 1)] ^
-            Td3[GETBYTE(t1, 0)] ^
-            rk[0];
-        s1 =
-            Td0[GETBYTE(t1, 3)] ^
-            Td1[GETBYTE(t0, 2)] ^
-            Td2[GETBYTE(t3, 1)] ^
-            Td3[GETBYTE(t2, 0)] ^
-            rk[1];
-        s2 =
-            Td0[GETBYTE(t2, 3)] ^
-            Td1[GETBYTE(t1, 2)] ^
-            Td2[GETBYTE(t0, 1)] ^
-            Td3[GETBYTE(t3, 0)] ^
-            rk[2];
-        s3 =
-            Td0[GETBYTE(t3, 3)] ^
-            Td1[GETBYTE(t2, 2)] ^
-            Td2[GETBYTE(t1, 1)] ^
-            Td3[GETBYTE(t0, 0)] ^
-            rk[3];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+		a ^= Td[3*256+byte(t)]; t >>= 8;\
+		b ^= Td[2*256+byte(t)]; t >>= 8;\
+		c ^= Td[1*256+byte(t)]; t >>= 8;\
+		d ^= Td[t];

-        t0 =
-            Td0[GETBYTE(s0, 3)] ^
-            Td1[GETBYTE(s3, 2)] ^
-            Td2[GETBYTE(s2, 1)] ^
-            Td3[GETBYTE(s1, 0)] ^
-            rk[4];
-        t1 =
-            Td0[GETBYTE(s1, 3)] ^
-            Td1[GETBYTE(s0, 2)] ^
-            Td2[GETBYTE(s3, 1)] ^
-            Td3[GETBYTE(s2, 0)] ^
-            rk[5];
-        t2 =
-            Td0[GETBYTE(s2, 3)] ^
-            Td1[GETBYTE(s1, 2)] ^
-            Td2[GETBYTE(s0, 1)] ^
-            Td3[GETBYTE(s3, 0)] ^
-            rk[6];
-        t3 =
-            Td0[GETBYTE(s3, 3)] ^
-            Td1[GETBYTE(s2, 2)] ^
-            Td2[GETBYTE(s1, 1)] ^
-            Td3[GETBYTE(s0, 0)] ^
-            rk[7];
+		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
+
+		QUARTER_ROUND(t3, s2, s1, s0, s3)
+		QUARTER_ROUND(t2, s1, s0, s3, s2)
+		QUARTER_ROUND(t1, s0, s3, s2, s1)
+		QUARTER_ROUND(t0, s3, s2, s1, s0)
+
+		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
+
+		QUARTER_ROUND(s3, t2, t1, t0, t3)
+		QUARTER_ROUND(s2, t1, t0, t3, t2)
+		QUARTER_ROUND(s1, t0, t3, t2, t1)
+		QUARTER_ROUND(s0, t3, t2, t1, t0)
+#undef QUARTER_ROUND

        rk += 8;
    } while (--r);

 	// timing attack countermeasure. see comments at top for more details
 	u = 0;
-	for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
-		u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]);
+	for (i=0; i<256; i+=cacheLineSize)
+		u &= *(const word32 *)(Sd+i);
+	u &= *(const word32 *)(Sd+252);
 	t0 |= u; t1 |= u; t2 |= u; t3 |= u;

 	word32 tbw[4];
@ -413,23 +581,17 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 	word32 *const obw = (word32 *)outBlock;
 	const word32 *const xbw = (const word32 *)xorBlock;

-	// last round
-	tempBlock[0] = Sd[GETBYTE(t0, 3)];
-	tempBlock[1] = Sd[GETBYTE(t3, 2)];
-	tempBlock[2] = Sd[GETBYTE(t2, 1)];
-	tempBlock[3] = Sd[GETBYTE(t1, 0)];
-	tempBlock[4] = Sd[GETBYTE(t1, 3)];
-	tempBlock[5] = Sd[GETBYTE(t0, 2)];
-	tempBlock[6] = Sd[GETBYTE(t3, 1)];
-	tempBlock[7] = Sd[GETBYTE(t2, 0)];
-	tempBlock[8] = Sd[GETBYTE(t2, 3)];
-	tempBlock[9] = Sd[GETBYTE(t1, 2)];
-	tempBlock[10] = Sd[GETBYTE(t0, 1)];
-	tempBlock[11] = Sd[GETBYTE(t3, 0)];
-	tempBlock[12] = Sd[GETBYTE(t3, 3)];
-	tempBlock[13] = Sd[GETBYTE(t2, 2)];
-	tempBlock[14] = Sd[GETBYTE(t1, 1)];
-	tempBlock[15] = Sd[GETBYTE(t0, 0)];
+#define QUARTER_ROUND(t, a, b, c, d)	\
+	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
+	tempBlock[d] = Sd[t];
+
+	QUARTER_ROUND(t2, 7, 2, 13, 8)
+	QUARTER_ROUND(t1, 3, 14, 9, 4)
+	QUARTER_ROUND(t0, 15, 10, 5, 0)
+	QUARTER_ROUND(t3, 11, 6, 1, 12)
+#undef QUARTER_ROUND

 	if (xbw)
 	{
--- a/rijndael.h
+++ b/rijndael.h
@ -25,16 +25,10 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat

 	protected:
 		// VS2005 workaround: have to put these on seperate lines, or error C2487 is triggered in DLL build
-		CRYPTOPP_L1_CACHE_ALIGN(static const byte Se[256]);
-		CRYPTOPP_L1_CACHE_ALIGN(static const byte Sd[256]);
-		CRYPTOPP_L1_CACHE_ALIGN(static const word32 Te0[256]);
-		static const word32 Te1[256];
-		static const word32 Te2[256];
-		static const word32 Te3[256];
-		CRYPTOPP_L1_CACHE_ALIGN(static const word32 Td0[256]);
-		static const word32 Td1[256];
-		static const word32 Td2[256];
-		static const word32 Td3[256];
+		static const byte Se[256];	// byte table read by the key schedule and the last encryption round; no longer declared cache-line aligned
+		static const byte Sd[256];	// byte table read by the last decryption round
+		static const word32 Te[4*256];	// the four former Te0..Te3 tables stored back to back: Te[k*256+x]
+		static const word32 Td[4*256];	// the four former Td0..Td3 tables stored back to back: Td[k*256+x]

 		static const word32 rcon[];

@ -52,6 +46,7 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
 	{
 	public:
 		void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+		void ProcessAndXorBlock_Old(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;	// NOTE(review): no definition is visible in this change -- confirm this declaration is still needed
 	};

 public:
--- a/sha.cpp
+++ b/sha.cpp
@ -9,6 +9,7 @@

 #include "sha.h"
 #include "misc.h"
+#include "cpu.h"

 NAMESPACE_BEGIN(CryptoPP)

@ -74,27 +75,43 @@ void SHA1::Transform(word32 *state, const word32 *data)
    state[2] += c;
    state[3] += d;
    state[4] += e;
-    /* Wipe variables */
-    a = b = c = d = e = 0;
-	memset(W, 0, sizeof(W));
 }

 // end of Steve Reid's code

 // *************************************************************

+void SHA224::InitState(HashWordType *state)	// resets state[0..7] to the SHA-224 initial hash value
+{
+	static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};	// same eight words the previous per-element version assigned
+	memcpy(state, s, sizeof(s));	// copies sizeof(s) = 32 bytes; assumes HashWordType is word32 -- TODO confirm in sha.h
+}
+
 void SHA256::InitState(HashWordType *state)
 {
-	state[0] = 0x6a09e667;
-	state[1] = 0xbb67ae85;
-	state[2] = 0x3c6ef372;
-	state[3] = 0xa54ff53a;
-	state[4] = 0x510e527f;
-	state[5] = 0x9b05688c;
-	state[6] = 0x1f83d9ab;
-	state[7] = 0x5be0cd19;
+	static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};	// same eight initial words as the removed per-element assignments
+	memcpy(state, s, sizeof(s));	// copies sizeof(s) = 32 bytes; assumes HashWordType is word32 -- TODO confirm in sha.h
 }

+static const word32 SHA256_K[64] = {	// file-local round constants, read by the R() macro as SHA256_K[i+j]; replaces the former SHA256::K member
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
 #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))

 #define Ch(x,y,z) (z^(x&(y^z)))
@ -109,7 +126,7 @@ void SHA256::InitState(HashWordType *state)
 #define g(i) T[(6-i)&7]
 #define h(i) T[(7-i)&7]

-#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
 	d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

 // for SHA256
@ -141,98 +158,114 @@ void SHA256::Transform(word32 *state, const word32 *data)
    state[5] += f(0);
    state[6] += g(0);
    state[7] += h(0);
-    /* Wipe variables */
-	memset(W, 0, sizeof(W));
-	memset(T, 0, sizeof(T));
 }

+/* 
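+// smaller but slower alternative, kept for reference only (disabled by the enclosing comment); K below refers to the table now named SHA256_K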
+void SHA256_Transform(word32 *state, const word32 *data)
+{
+	word32 T[20];
+	word32 W[32];
+	unsigned int i = 0, j = 0;
+	word32 *t = T+8;
+
+	memcpy(t, state, 8*4);
+	word32 e = t[4], a = t[0];
+
+	do 
+	{
+		word32 w = data[j];
+		W[j] = w;
+		w += K[j];
+		w += t[7];
+		w += S1(e);
+		w += Ch(e, t[5], t[6]);
+		e = t[3] + w;
+		t[3] = t[3+8] = e;
+		w += S0(t[0]);
+		a = w + Maj(a, t[1], t[2]);
+		t[-1] = t[7] = a;
+		--t;
+		++j;
+		if (j%8 == 0)
+			t += 8;
+	} while (j<16);
+
+	do
+	{
+		i = j&0xf;
+		word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
+		W[i+16] = W[i] = w;
+		w += K[j];
+		w += t[7];
+		w += S1(e);
+		w += Ch(e, t[5], t[6]);
+		e = t[3] + w;
+		t[3] = t[3+8] = e;
+		w += S0(t[0]);
+		a = w + Maj(a, t[1], t[2]);
+		t[-1] = t[7] = a;
+
+		w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
+		W[(i+1)+16] = W[(i+1)] = w;
+		w += K[j+1];
+		w += (t-1)[7];
+		w += S1(e);
+		w += Ch(e, (t-1)[5], (t-1)[6]);
+		e = (t-1)[3] + w;
+		(t-1)[3] = (t-1)[3+8] = e;
+		w += S0((t-1)[0]);
+		a = w + Maj(a, (t-1)[1], (t-1)[2]);
+		(t-1)[-1] = (t-1)[7] = a;
+
+		t-=2;
+		j+=2;
+		if (j%8 == 0)
+			t += 8;
+	} while (j<64);
+
+    state[0] += a;
+    state[1] += t[1];
+    state[2] += t[2];
+    state[3] += t[3];
+    state[4] += e;
+    state[5] += t[5];
+    state[6] += t[6];
+    state[7] += t[7];
+}
+*/
+
 #undef S0
 #undef S1
 #undef s0
 #undef s1
-
-const word32 SHA256::K[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-void SHA224::InitState(HashWordType *state)
-{
-	state[0] = 0xc1059ed8;
-	state[1] = 0x367cd507;
-	state[2] = 0x3070dd17;
-	state[3] = 0xf70e5939;
-	state[4] = 0xffc00b31;
-	state[5] = 0x68581511;
-	state[6] = 0x64f98fa7;
-	state[7] = 0xbefa4fa4;
-}
+#undef R

 // *************************************************************

 #ifdef WORD64_AVAILABLE

+void SHA384::InitState(HashWordType *state)	// resets state[0..7] to the SHA-384 initial hash value
+{
+	static const word64 s[8] = {	// same eight words the previous per-element version assigned
+		W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
+		W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
+		W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
+		W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
+	memcpy(state, s, sizeof(s));	// copies sizeof(s) = 64 bytes; assumes HashWordType is word64 -- TODO confirm in sha.h
+}
+
 void SHA512::InitState(HashWordType *state)
 {
-	state[0] = W64LIT(0x6a09e667f3bcc908);
-	state[1] = W64LIT(0xbb67ae8584caa73b);
-	state[2] = W64LIT(0x3c6ef372fe94f82b);
-	state[3] = W64LIT(0xa54ff53a5f1d36f1);
-	state[4] = W64LIT(0x510e527fade682d1);
-	state[5] = W64LIT(0x9b05688c2b3e6c1f);
-	state[6] = W64LIT(0x1f83d9abfb41bd6b);
-	state[7] = W64LIT(0x5be0cd19137e2179);
+	static const word64 s[8] = {	// same eight initial words as the removed per-element assignments
+		W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
+		W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
+		W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
+		W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
+	memcpy(state, s, sizeof(s));	// copies sizeof(s) = 64 bytes; assumes HashWordType is word64 -- TODO confirm in sha.h
 }

-// for SHA512
-#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
-#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
-#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
-#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
-
-void SHA512::Transform(word64 *state, const word64 *data)
-{
-	word64 W[16];
-	word64 T[8];
-    /* Copy context->state[] to working vars */
-	memcpy(T, state, sizeof(T));
-    /* 80 operations, partially loop unrolled */
-	for (unsigned int j=0; j<80; j+=16)
-	{
-		R( 0); R( 1); R( 2); R( 3);
-		R( 4); R( 5); R( 6); R( 7);
-		R( 8); R( 9); R(10); R(11);
-		R(12); R(13); R(14); R(15);
-	}
-    /* Add the working vars back into context.state[] */
-    state[0] += a(0);
-    state[1] += b(0);
-    state[2] += c(0);
-    state[3] += d(0);
-    state[4] += e(0);
-    state[5] += f(0);
-    state[6] += g(0);
-    state[7] += h(0);
-    /* Wipe variables */
-	memset(W, 0, sizeof(W));
-	memset(T, 0, sizeof(T));
-}
-
-const word64 SHA512::K[80] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
 	W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
 	W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
 	W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
@ -275,16 +308,231 @@ const word64 SHA512::K[80] = {
 	W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
 };

-void SHA384::InitState(HashWordType *state)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
+static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
 {
-	state[0] = W64LIT(0xcbbb9d5dc1059ed8);
-	state[1] = W64LIT(0x629a292a367cd507);
-	state[2] = W64LIT(0x9159015a3070dd17);
-	state[3] = W64LIT(0x152fecd8f70e5939);
-	state[4] = W64LIT(0x67332667ffc00b31);
-	state[5] = W64LIT(0x8eb44a8768581511);
-	state[6] = W64LIT(0xdb0c2e0d64f98fa7);
-	state[7] = W64LIT(0x47b5481dbefa4fa4);
+#ifdef __GNUC__
+	__asm__ __volatile__
+	(
+		".intel_syntax noprefix;"
+	AS1(	push	ebx)
+	AS2(	mov		ebx, eax)
+#else
+	AS2(	lea		ebx, SHA512_K)
+#endif
+
+	AS2(	mov		eax, esp)
+	AS2(	and		esp, 0xfffffff0)
+	AS2(	sub		esp, 27*16)				// 17*16 for expanded data, 20*8 for state
+	AS1(	push	eax)
+	AS2(	xor		eax, eax)
+	AS2(	lea		edi, [esp+4+8*8])		// start at middle of state buffer. will decrement pointer each round to avoid copying
+	AS2(	lea		esi, [esp+4+20*8+8])	// 16-byte alignment, then add 8
+
+	AS2(	movq	mm4, [ecx+0*8])
+	AS2(	movq	[edi+0*8], mm4)
+	AS2(	movq	mm0, [ecx+1*8])
+	AS2(	movq	[edi+1*8], mm0)
+	AS2(	movq	mm0, [ecx+2*8])
+	AS2(	movq	[edi+2*8], mm0)
+	AS2(	movq	mm0, [ecx+3*8])
+	AS2(	movq	[edi+3*8], mm0)
+	AS2(	movq	mm5, [ecx+4*8])
+	AS2(	movq	[edi+4*8], mm5)
+	AS2(	movq	mm0, [ecx+5*8])
+	AS2(	movq	[edi+5*8], mm0)
+	AS2(	movq	mm0, [ecx+6*8])
+	AS2(	movq	[edi+6*8], mm0)
+	AS2(	movq	mm0, [ecx+7*8])
+	AS2(	movq	[edi+7*8], mm0)
+	ASJ(	jmp,	0, f)
+
+#define SSE2_S0_S1(r, a, b, c)	\
+	AS2(	movq	mm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movq	mm7, r)\
+	AS2(	psllq	mm6, 64-c)\
+	AS2(	pxor	mm7, mm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	mm7, r)\
+	AS2(	psllq	mm6, c-b)\
+	AS2(	pxor	mm7, mm6)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, mm7)\
+	AS2(	psllq	mm6, b-a)\
+	AS2(	pxor	r, mm6)
+
+#define SSE2_s0(r, a, b, c)	\
+	AS2(	movdqa	xmm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movdqa	xmm7, r)\
+	AS2(	psllq	xmm6, 64-c)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	xmm7, r)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, xmm7)\
+	AS2(	psllq	xmm6, c-a)\
+	AS2(	pxor	r, xmm6)
+
+#define SSE2_s1(r, a, b, c)	\
+	AS2(	movdqa	xmm6, r)\
+	AS2(	psrlq	r, a)\
+	AS2(	movdqa	xmm7, r)\
+	AS2(	psllq	xmm6, 64-c)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, b-a)\
+	AS2(	pxor	xmm7, r)\
+	AS2(	psllq	xmm6, c-b)\
+	AS2(	pxor	xmm7, xmm6)\
+	AS2(	psrlq	r, c-b)\
+	AS2(	pxor	r, xmm7)
+
+	ASL(SHA512_Round)
+	// k + w is in mm0, a is in mm4, e is in mm5
+	AS2(	paddq	mm0, [edi+7*8])		// h
+	AS2(	movq	mm2, [edi+5*8])		// f
+	AS2(	movq	mm3, [edi+6*8])		// g
+	AS2(	pxor	mm2, mm3)
+	AS2(	pand	mm2, mm5)
+	SSE2_S0_S1(mm5,14,18,41)
+	AS2(	pxor	mm2, mm3)
+	AS2(	paddq	mm0, mm2)			// h += Ch(e,f,g)
+	AS2(	paddq	mm5, mm0)			// mm5 = S1(e) + h; h is carried in mm5 from here on
+	AS2(	movq	mm2, [edi+1*8])		// b
+	AS2(	movq	mm1, mm2)
+	AS2(	por		mm2, mm4)
+	AS2(	pand	mm2, [edi+2*8])		// c
+	AS2(	pand	mm1, mm4)
+	AS2(	por		mm1, mm2)
+	AS2(	paddq	mm1, mm5)			// temp = h + Maj(a,b,c)
+	AS2(	paddq	mm5, [edi+3*8])		// e = d + h
+	AS2(	movq	[edi+3*8], mm5)
+	AS2(	movq	[edi+11*8], mm5)
+	SSE2_S0_S1(mm4,28,34,39)			// S0(a)
+	AS2(	paddq	mm4, mm1)			// a = temp + S0(a)
+	AS2(	movq	[edi-8], mm4)
+	AS2(	movq	[edi+7*8], mm4)
+	AS1(	ret)
+
+	// first 16 rounds
+	ASL(0)
+	AS2(	movq	mm0, [edx+eax*8])
+	AS2(	movq	[esi+eax*8], mm0)
+	AS2(	movq	[esi+eax*8+16*8], mm0)
+	AS2(	paddq	mm0, [ebx+eax*8])
+	ASC(	call,	SHA512_Round)
+	AS1(	inc		eax)
+	AS2(	sub		edi, 8)
+	AS2(	test	eax, 7)
+	ASJ(	jnz,	0, b)
+	AS2(	add		edi, 8*8)
+	AS2(	cmp		eax, 16)
+	ASJ(	jne,	0, b)
+
+	// rest of the rounds
+	AS2(	movdqu	xmm0, [esi+(16-2)*8])
+	ASL(1)
+	// data expansion, W[i-2] already in xmm0
+	AS2(	movdqu	xmm3, [esi])
+	AS2(	paddq	xmm3, [esi+(16-7)*8])
+	AS2(	movdqa	xmm2, [esi+(16-15)*8])
+	SSE2_s1(xmm0, 6, 19, 61)
+	AS2(	paddq	xmm0, xmm3)
+	SSE2_s0(xmm2, 1, 7, 8)
+	AS2(	paddq	xmm0, xmm2)
+	AS2(	movdq2q	mm0, xmm0)
+	AS2(	movhlps	xmm1, xmm0)
+	AS2(	paddq	mm0, [ebx+eax*8])
+	AS2(	movlps	[esi], xmm0)
+	AS2(	movlps	[esi+8], xmm1)
+	AS2(	movlps	[esi+8*16], xmm0)
+	AS2(	movlps	[esi+8*17], xmm1)
+	// 2 rounds
+	ASC(	call,	SHA512_Round)
+	AS2(	sub		edi, 8)
+	AS2(	movdq2q	mm0, xmm1)
+	AS2(	paddq	mm0, [ebx+eax*8+8])
+	ASC(	call,	SHA512_Round)
+	// update indices and loop
+	AS2(	add		esi, 16)
+	AS2(	add		eax, 2)
+	AS2(	sub		edi, 8)
+	AS2(	test	eax, 7)
+	ASJ(	jnz,	1, b)
+	// do housekeeping every 8 rounds
+	AS2(	mov		esi, 0xf)
+	AS2(	and		esi, eax)
+	AS2(	lea		esi, [esp+4+20*8+8+esi*8])
+	AS2(	add		edi, 8*8)
+	AS2(	cmp		eax, 80)
+	ASJ(	jne,	1, b)
+
+#define SSE2_CombineState(i)	\
+	AS2(	movq	mm0, [edi+i*8])\
+	AS2(	paddq	mm0, [ecx+i*8])\
+	AS2(	movq	[ecx+i*8], mm0)
+
+	SSE2_CombineState(0)
+	SSE2_CombineState(1)
+	SSE2_CombineState(2)
+	SSE2_CombineState(3)
+	SSE2_CombineState(4)
+	SSE2_CombineState(5)
+	SSE2_CombineState(6)
+	SSE2_CombineState(7)
+
+	AS1(	pop		esp)
+	AS1(	emms)
+
+#ifdef __GNUC__
+	AS1(	pop		ebx)
+	".att_syntax prefix;"
+		:
+		: "a" (SHA512_K), "c" (state), "d" (data)
+		: "%esi", "%edi", "memory", "cc"
+	);
+#endif
+}
+#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+
+void SHA512::Transform(word64 *state, const word64 *data)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2())
+		return SHA512_SSE2_Transform(state, data);
+#endif
+
+#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
+#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
+#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
+#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
+
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
+	d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
+
+	word64 W[16];
+	word64 T[8];
+    /* Copy state[] to working vars */
+	memcpy(T, state, sizeof(T));
+    /* 80 operations, partially loop unrolled */
+	for (unsigned int j=0; j<80; j+=16)
+	{
+		R( 0); R( 1); R( 2); R( 3);
+		R( 4); R( 5); R( 6); R( 7);
+		R( 8); R( 9); R(10); R(11);
+		R(12); R(13); R(14); R(15);
+	}
+    /* Add the working vars back into state[] */
+    state[0] += a(0);
+    state[1] += b(0);
+    state[2] += c(0);
+    state[3] += d(0);
+    state[4] += e(0);
+    state[5] += f(0);
+    state[6] += g(0);
+    state[7] += h(0);
 }

 #endif
--- a/sha.h
+++ b/sha.h
@ -23,9 +23,6 @@ public:
 	static void CRYPTOPP_API InitState(HashWordType *state);
 	static void CRYPTOPP_API Transform(word32 *digest, const word32 *data);
 	static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";}
-
-protected:
-	static const word32 K[64];
 };

 //! implements the SHA-224 standard
@ -46,9 +43,6 @@ public:
 	static void CRYPTOPP_API InitState(HashWordType *state);
 	static void CRYPTOPP_API Transform(word64 *digest, const word64 *data);
 	static const char * CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";}
-
-protected:
-	static const word64 K[80];
 };

 //! implements the SHA-384 standard
--- a/tiger.cpp
+++ b/tiger.cpp
@ -3,6 +3,7 @@
 #include "pch.h"
 #include "tiger.h"
 #include "misc.h"
+#include "cpu.h"

 #ifdef WORD64_AVAILABLE

@ -24,13 +25,187 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)

 	m_data[7] = GetBitCountLo();

-	Transform(m_digest, m_data);
-	CorrectEndianess(m_digest, m_digest, DigestSize());
-	memcpy(hash, m_digest, size);
+	Transform(m_state, m_data);
+	CorrectEndianess(m_state, m_state, DigestSize());
+	memcpy(hash, m_state, size);

 	Restart();		// reinit for next use
 }

+void Tiger::Transform (word64 *digest, const word64 *X)
+{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+	if (HasSSE2())
+	{
+#ifdef __GNUC__
+		__asm__ __volatile__
+		(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)
+#else
+		AS2(	mov		eax, digest)
+		AS2(	mov		esi, X)
+		AS2(	lea		edx, [table])
+#endif
+		AS2(	movq	mm0, [eax])
+		AS2(	movq	mm1, [eax+1*8])
+		AS2(	movq	mm5, mm1)
+		AS2(	movq	mm2, [eax+2*8])
+		AS2(	movq	mm7, [edx+4*2048+0*8])
+		AS2(	movq	mm6, [edx+4*2048+1*8])
+		AS2(	mov		ecx, esp)
+		AS2(	and		esp, 0xfffffff0)
+		AS2(	sub		esp, 8*8)
+		AS1(	push	ecx)
+
+#define SSE2_round(a,b,c,x,mul) \
+		AS2(	pxor	c, [x])\
+		AS2(	movd	ecx, c)\
+		AS2(	movzx	edi, cl)\
+		AS2(	movq	mm3, [edx+0*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	movq	mm4, [edx+3*2048+edi*8])\
+		AS2(	shr		ecx, 16)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+1*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+2*2048+edi*8])\
+		AS3(	pextrw	ecx, c, 2)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+2*2048+edi*8])\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+1*2048+edi*8])\
+		AS3(	pextrw	ecx, c, 3)\
+		AS2(	movzx	edi, cl)\
+		AS2(	pxor	mm3, [edx+3*2048+edi*8])\
+		AS2(	psubq	a, mm3)\
+		AS2(	movzx	edi, ch)\
+		AS2(	pxor	mm4, [edx+0*2048+edi*8])\
+		AS2(	paddq	b, mm4)\
+		SSE2_mul_##mul(b)
+
+#define SSE2_mul_5(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 2)\
+		AS2(	paddq	b, mm3)
+
+#define SSE2_mul_7(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 3)\
+		AS2(	psubq	b, mm3)
+
+#define SSE2_mul_9(b)	\
+		AS2(	movq	mm3, b)\
+		AS2(	psllq	b, 3)\
+		AS2(	paddq	b, mm3)
+
+#define label2_5 1
+#define label2_7 2
+#define label2_9 3
+
+#define SSE2_pass(A,B,C,mul,X)	\
+		AS2(	xor		ebx, ebx)\
+		ASL(mul)\
+		SSE2_round(A,B,C,X+0*8+ebx,mul)\
+		SSE2_round(B,C,A,X+1*8+ebx,mul)\
+		AS2(	cmp		ebx, 6*8)\
+		ASJ(	je,		label2_##mul, f)\
+		SSE2_round(C,A,B,X+2*8+ebx,mul)\
+		AS2(	add		ebx, 3*8)\
+		ASJ(	jmp,	mul, b)\
+		ASL(label2_##mul)
+
+#define SSE2_key_schedule(Y,X) \
+		AS2(	movq	mm3, [X+7*8])\
+		AS2(	pxor	mm3, mm6)\
+		AS2(	movq	mm4, [X+0*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+0*8], mm4)\
+		AS2(	pxor	mm4, [X+1*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+1*8], mm4)\
+		AS2(	paddq	mm4, [X+2*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psllq	mm3, 19)\
+		AS2(	movq	[Y+2*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [X+3*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+3*8], mm4)\
+		AS2(	pxor	mm4, [X+4*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+4*8], mm4)\
+		AS2(	paddq	mm4, [X+5*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psrlq	mm3, 23)\
+		AS2(	movq	[Y+5*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [X+6*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+6*8], mm4)\
+		AS2(	pxor	mm4, [X+7*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+7*8], mm4)\
+		AS2(	paddq	mm4, [Y+0*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psllq	mm3, 19)\
+		AS2(	movq	[Y+0*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [Y+1*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+1*8], mm4)\
+		AS2(	pxor	mm4, [Y+2*8])\
+		AS2(	movq	mm3, mm4)\
+		AS2(	movq	[Y+2*8], mm4)\
+		AS2(	paddq	mm4, [Y+3*8])\
+		AS2(	pxor	mm3, mm7)\
+		AS2(	psrlq	mm3, 23)\
+		AS2(	movq	[Y+3*8], mm4)\
+		AS2(	pxor	mm3, mm4)\
+		AS2(	movq	mm4, [Y+4*8])\
+		AS2(	psubq	mm4, mm3)\
+		AS2(	movq	[Y+4*8], mm4)\
+		AS2(	pxor	mm4, [Y+5*8])\
+		AS2(	movq	[Y+5*8], mm4)\
+		AS2(	paddq	mm4, [Y+6*8])\
+		AS2(	movq	[Y+6*8], mm4)\
+		AS2(	pxor	mm4, [edx+4*2048+2*8])\
+		AS2(	movq	mm3, [Y+7*8])\
+		AS2(	psubq	mm3, mm4)\
+		AS2(	movq	[Y+7*8], mm3)
+
+		SSE2_pass(mm0, mm1, mm2, 5, esi)
+		SSE2_key_schedule(esp+4, esi)
+		SSE2_pass(mm2, mm0, mm1, 7, esp+4)
+		SSE2_key_schedule(esp+4, esp+4)
+		SSE2_pass(mm1, mm2, mm0, 9, esp+4)
+
+		AS2(	pxor	mm0, [eax+0*8])
+		AS2(	movq	[eax+0*8], mm0)
+		AS2(	psubq	mm1, mm5)
+		AS2(	movq	[eax+1*8], mm1)
+		AS2(	paddq	mm2, [eax+2*8])
+		AS2(	movq	[eax+2*8], mm2)
+
+		AS1(	pop		esp)
+		AS1(	emms)
+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			:
+			: "a" (digest), "S" (X), "d" (table)
+			: "%ecx", "%edi", "memory", "cc"
+		);
+#endif
+	}
+	else
+#endif
+	{
+		word64 a = digest[0];
+		word64 b = digest[1];
+		word64 c = digest[2];
+		word64 Y[8];
+
 #define t1 (table)
 #define t2 (table+256)
 #define t3 (table+256*2)
@ -42,15 +217,17 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 	b += t4[GETBYTE(c,1)] ^ t3[GETBYTE(c,3)] ^ t2[GETBYTE(c,5)] ^ t1[GETBYTE(c,7)]; \
 	b *= mul

-#define pass(a,b,c,mul,X) \
-	round(a,b,c,X[0],mul); \
-	round(b,c,a,X[1],mul); \
-	round(c,a,b,X[2],mul); \
-	round(a,b,c,X[3],mul); \
-	round(b,c,a,X[4],mul); \
-	round(c,a,b,X[5],mul); \
-	round(a,b,c,X[6],mul); \
-	round(b,c,a,X[7],mul)
+#define pass(a,b,c,mul,X) {\
+	int i=0;\
+	while (true)\
+	{\
+		round(a,b,c,X[i+0],mul); \
+		round(b,c,a,X[i+1],mul); \
+		if (i==6)\
+			break;\
+		round(c,a,b,X[i+2],mul); \
+		i+=3;\
+	}}

 #define key_schedule(Y,X) \
 	Y[0] = X[0] - (X[7]^W64LIT(0xA5A5A5A5A5A5A5A5)); \
@ -70,13 +247,6 @@ void Tiger::TruncatedFinal(byte *hash, size_t size)
 	Y[6] += Y[5]; \
 	Y[7] -= Y[6] ^ W64LIT(0x0123456789ABCDEF)

-void Tiger::Transform (word64 *digest, const word64 *X)
-{
-	word64 a = digest[0];
-	word64 b = digest[1];
-	word64 c = digest[2];
-	word64 Y[8];
-
 		pass(a,b,c,5,X);
 		key_schedule(Y,X);
 		pass(c,a,b,7,Y);
@ -86,8 +256,7 @@ void Tiger::Transform (word64 *digest, const word64 *X)
 		digest[0] = a ^ digest[0];
 		digest[1] = b - digest[1];
 		digest[2] = c + digest[2];
-
-	memset(Y, 0, sizeof(Y));
+	}
 }

 NAMESPACE_END
--- a/tiger.h
+++ b/tiger.h
@ -9,7 +9,7 @@

 NAMESPACE_BEGIN(CryptoPP)

-/// <a href="http://www.weidai.com/scan-mirror/md.html#Tiger">Tiger</a>
+/// <a href="http://www.cryptolounge.org/wiki/Tiger">Tiger</a>
 class Tiger : public IteratedHashWithStaticTransform<word64, LittleEndian, 64, 24, Tiger>
 {
 public:
@ -19,7 +19,7 @@ public:
 	static const char * StaticAlgorithmName() {return "Tiger";}

 protected:
-	static const word64 table[4*256];
+	static const word64 table[4*256+3];
 };

 NAMESPACE_END
--- a/whrlpool.cpp
+++ b/whrlpool.cpp
@ -1,7 +1,7 @@
-// Whrlpool.cpp - modified by Kevin Springle from
+// whrlpool.cpp - originally modified by Kevin Springle from
 // Paulo Barreto and Vincent Rijmen's public domain code, whirlpool.c.
+// Updated to Whirlpool version 3.0, optimized and MMX version added by Wei Dai
 // Any modifications are placed in the public domain
-// Updated to Whirlpool version 3.0 by Wei Dai

 // This is the original introductory comment:

@ -69,6 +69,7 @@

 #include "whrlpool.h"
 #include "misc.h"
+#include "cpu.h"

 NAMESPACE_BEGIN(CryptoPP)

@ -94,9 +95,9 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
 	m_data[m_data.size()-2] = GetBitCountHi();
 	m_data[m_data.size()-1] = GetBitCountLo();

-	Transform(m_digest, m_data);
-	CorrectEndianess(m_digest, m_digest, DigestSize());
-	memcpy(hash, m_digest, size);
+	Transform(m_state, m_data);
+	CorrectEndianess(m_state, m_state, DigestSize());
+	memcpy(hash, m_state, size);

 	Restart();		// reinit for next use
 }
@ -113,7 +114,7 @@ void Whirlpool::TruncatedFinal(byte *hash, size_t size)
 * employed).
 */

-static const word64 C0[256] = {
+CRYPTOPP_ALIGN_DATA(16) static const word64 Whirlpool_C[4*256+R] CRYPTOPP_SECTION_ALIGN16 = {
    W64LIT(0x18186018c07830d8), W64LIT(0x23238c2305af4626), W64LIT(0xc6c63fc67ef991b8), W64LIT(0xe8e887e8136fcdfb),
    W64LIT(0x878726874ca113cb), W64LIT(0xb8b8dab8a9626d11), W64LIT(0x0101040108050209), W64LIT(0x4f4f214f426e9e0d),
    W64LIT(0x3636d836adee6c9b), W64LIT(0xa6a6a2a6590451ff), W64LIT(0xd2d26fd2debdb90c), W64LIT(0xf5f5f3f5fb06f70e),
@ -178,9 +179,7 @@ static const word64 C0[256] = {
    W64LIT(0x7070dd70a7ade0d7), W64LIT(0xb6b6e2b6d954716f), W64LIT(0xd0d067d0ceb7bd1e), W64LIT(0xeded93ed3b7ec7d6),
    W64LIT(0xcccc17cc2edb85e2), W64LIT(0x424215422a578468), W64LIT(0x98985a98b4c22d2c), W64LIT(0xa4a4aaa4490e55ed),
 	W64LIT(0x2828a0285d885075), W64LIT(0x5c5c6d5cda31b886), W64LIT(0xf8f8c7f8933fed6b), W64LIT(0x8686228644a411c2),
-};

-static const word64 C1[256] = {
 	W64LIT(0xd818186018c07830), W64LIT(0x2623238c2305af46), W64LIT(0xb8c6c63fc67ef991), W64LIT(0xfbe8e887e8136fcd),
    W64LIT(0xcb878726874ca113), W64LIT(0x11b8b8dab8a9626d), W64LIT(0x0901010401080502), W64LIT(0x0d4f4f214f426e9e),
    W64LIT(0x9b3636d836adee6c), W64LIT(0xffa6a6a2a6590451), W64LIT(0x0cd2d26fd2debdb9), W64LIT(0x0ef5f5f3f5fb06f7),
@ -245,9 +244,7 @@ static const word64 C1[256] = {
    W64LIT(0xd77070dd70a7ade0), W64LIT(0x6fb6b6e2b6d95471), W64LIT(0x1ed0d067d0ceb7bd), W64LIT(0xd6eded93ed3b7ec7),
    W64LIT(0xe2cccc17cc2edb85), W64LIT(0x68424215422a5784), W64LIT(0x2c98985a98b4c22d), W64LIT(0xeda4a4aaa4490e55),
    W64LIT(0x752828a0285d8850), W64LIT(0x865c5c6d5cda31b8), W64LIT(0x6bf8f8c7f8933fed), W64LIT(0xc28686228644a411),
-};

-static const word64 C2[256] = {
 	W64LIT(0x30d818186018c078), W64LIT(0x462623238c2305af), W64LIT(0x91b8c6c63fc67ef9), W64LIT(0xcdfbe8e887e8136f),
    W64LIT(0x13cb878726874ca1), W64LIT(0x6d11b8b8dab8a962), W64LIT(0x0209010104010805), W64LIT(0x9e0d4f4f214f426e),
    W64LIT(0x6c9b3636d836adee), W64LIT(0x51ffa6a6a2a65904), W64LIT(0xb90cd2d26fd2debd), W64LIT(0xf70ef5f5f3f5fb06),
@ -312,9 +309,7 @@ static const word64 C2[256] = {
    W64LIT(0xe0d77070dd70a7ad), W64LIT(0x716fb6b6e2b6d954), W64LIT(0xbd1ed0d067d0ceb7), W64LIT(0xc7d6eded93ed3b7e),
    W64LIT(0x85e2cccc17cc2edb), W64LIT(0x8468424215422a57), W64LIT(0x2d2c98985a98b4c2), W64LIT(0x55eda4a4aaa4490e),
    W64LIT(0x50752828a0285d88), W64LIT(0xb8865c5c6d5cda31), W64LIT(0xed6bf8f8c7f8933f), W64LIT(0x11c28686228644a4),
-};

-static const word64 C3[256] = {
 	W64LIT(0x7830d818186018c0), W64LIT(0xaf462623238c2305), W64LIT(0xf991b8c6c63fc67e), W64LIT(0x6fcdfbe8e887e813),
    W64LIT(0xa113cb878726874c), W64LIT(0x626d11b8b8dab8a9), W64LIT(0x0502090101040108), W64LIT(0x6e9e0d4f4f214f42),
    W64LIT(0xee6c9b3636d836ad), W64LIT(0x0451ffa6a6a2a659), W64LIT(0xbdb90cd2d26fd2de), W64LIT(0x06f70ef5f5f3f5fb),
@ -379,9 +374,7 @@ static const word64 C3[256] = {
    W64LIT(0xade0d77070dd70a7), W64LIT(0x54716fb6b6e2b6d9), W64LIT(0xb7bd1ed0d067d0ce), W64LIT(0x7ec7d6eded93ed3b),
    W64LIT(0xdb85e2cccc17cc2e), W64LIT(0x578468424215422a), W64LIT(0xc22d2c98985a98b4), W64LIT(0x0e55eda4a4aaa449),
    W64LIT(0x8850752828a0285d), W64LIT(0x31b8865c5c6d5cda), W64LIT(0x3fed6bf8f8c7f893), W64LIT(0xa411c28686228644),
-};

-static const word64 rc[R] = {
 	W64LIT(0x1823c6e887b8014f),
 	W64LIT(0x36a6d2f5796f9152),
 	W64LIT(0x60bc9b8ea30c7b35),
@ -396,56 +389,293 @@ static const word64 rc[R] = {

 // Whirlpool basic transformation. Transforms state based on block.
 void Whirlpool::Transform(word64 *digest, const word64 *block)
+{
+#ifdef CRYPTOPP_X86_ASM_AVAILABLE
+	if (HasMMX())
+	{
+		// MMX version has the same structure as C version below
+#ifdef __GNUC__
+	__asm__ __volatile__
+	(
+		".intel_syntax noprefix;"
+		AS1(	push	ebx)
+		AS2(	mov		ebx, eax)
+#else
+		AS2(	lea		ebx, [Whirlpool_C])
+		AS2(	mov		ecx, digest)
+		AS2(	mov		edx, block)
+#endif
+		AS2(	mov		eax, esp)
+		AS2(	and		esp, 0xfffffff0)
+		AS2(	sub		esp, 16*8)
+		AS1(	push	eax)
+		AS2(	xor		esi, esi)
+		ASL(0)
+		AS2(	movq	mm0, [ecx+8*esi])
+		AS2(	movq	[esp+4+8*esi], mm0)		// k
+		AS2(	pxor	mm0, [edx+8*esi])
+		AS2(	movq	[esp+4+64+8*esi], mm0)	// s
+		AS2(	movq	[ecx+8*esi], mm0)
+		AS1(	inc		esi)
+		AS2(	cmp		esi, 8)
+		ASJ(	jne,	0, b)
+
+		AS2(	xor		esi, esi)
+		ASL(1)
+
+#define KSL0(a, b)	AS2(movq	mm##a, b)
+#define KSL1(a, b)	AS2(pxor	mm##a, b)
+
+#define KSL(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+8*i])\
+	AS2(movzx	edi, al)\
+	KSL##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSL##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSL##op(c, [ebx+1*2048+8*edi])\
+	KSL##op(d, [ebx+0*2048+8*eax])
+
+#define KSH0(a, b)	\
+	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
+	AS2(pxor	mm##a, b)
+#define KSH1(a, b)	\
+	AS2(pxor	mm##a, b)
+#define KSH2(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(movq	[esp+4+8*a], mm##a)
+
+#define KSH(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(movzx	edi, al)\
+	KSH##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSH##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSH##op(c, [ebx+1*2048+8*edi])\
+	KSH##op(d, [ebx+0*2048+8*eax])
+
+#define TSL(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+64+8*i])\
+	AS2(movzx	edi, al)\
+	KSL##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	KSL##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	KSL##op(c, [ebx+1*2048+8*edi])\
+	KSL##op(d, [ebx+0*2048+8*eax])
+
+#define TSH0(a, b)	\
+	ASS(pshufw	mm##a, mm##a, 1, 0, 3, 2)\
+	AS2(pxor	mm##a, [esp+4+8*a])\
+	AS2(pxor	mm##a, b)
+#define TSH1(a, b)	\
+	AS2(pxor	mm##a, b)
+#define TSH2(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(movq	[esp+4+64+8*a], mm##a)
+#define TSH3(a, b)	\
+	AS2(pxor	mm##a, b)\
+	AS2(pxor	mm##a, [ecx+8*a])\
+	AS2(movq	[ecx+8*a], mm##a)
+
+#define TSH(op, i, a, b, c, d)	\
+	AS2(mov		eax, [esp+4+64+8*((i+4)-8*((i+4)/8))+4])\
+	AS2(movzx	edi, al)\
+	TSH##op(a, [ebx+3*2048+8*edi])\
+	AS2(movzx	edi, ah)\
+	TSH##op(b, [ebx+2*2048+8*edi])\
+	AS2(shr		eax, 16)\
+	AS2(movzx	edi, al)\
+	AS2(shr		eax, 8)\
+	TSH##op(c, [ebx+1*2048+8*edi])\
+	TSH##op(d, [ebx+0*2048+8*eax])
+
+		KSL(0, 4, 3, 2, 1, 0)
+		KSL(0, 0, 7, 6, 5, 4)
+		KSL(1, 1, 0, 7, 6, 5)
+		KSL(1, 2, 1, 0, 7, 6)
+		KSL(1, 3, 2, 1, 0, 7)
+		KSL(1, 5, 4, 3, 2, 1)
+		KSL(1, 6, 5, 4, 3, 2)
+		KSL(1, 7, 6, 5, 4, 3)
+		KSH(0, 0, 7, 6, 5, 4)
+		KSH(0, 4, 3, 2, 1, 0)
+		KSH(1, 1, 0, 7, 6, 5)
+		KSH(1, 2, 1, 0, 7, 6)
+		KSH(1, 5, 4, 3, 2, 1)
+		KSH(1, 6, 5, 4, 3, 2)
+		KSH(2, 3, 2, 1, 0, 7)
+		KSH(2, 7, 6, 5, 4, 3)
+
+		AS2(	pxor	mm0, [ebx + 8*1024 + esi*8])
+		AS2(	movq	[esp+4], mm0)
+
+		TSL(0, 4, 3, 2, 1, 0)
+		TSL(0, 0, 7, 6, 5, 4)
+		TSL(1, 1, 0, 7, 6, 5)
+		TSL(1, 2, 1, 0, 7, 6)
+		TSL(1, 3, 2, 1, 0, 7)
+		TSL(1, 5, 4, 3, 2, 1)
+		TSL(1, 6, 5, 4, 3, 2)
+		TSL(1, 7, 6, 5, 4, 3)
+		TSH(0, 0, 7, 6, 5, 4)
+		TSH(0, 4, 3, 2, 1, 0)
+		TSH(1, 1, 0, 7, 6, 5)
+		TSH(1, 2, 1, 0, 7, 6)
+		TSH(1, 5, 4, 3, 2, 1)
+		TSH(1, 6, 5, 4, 3, 2)
+
+		AS1(	inc		esi)
+		AS2(	cmp		esi, 10)
+		ASJ(	je,		2, f)
+
+		TSH(2, 3, 2, 1, 0, 7)
+		TSH(2, 7, 6, 5, 4, 3)
+
+		ASJ(	jmp,	1, b)
+		ASL(2)
+
+		TSH(3, 3, 2, 1, 0, 7)
+		TSH(3, 7, 6, 5, 4, 3)
+
+#undef KSL
+#undef KSH
+#undef TSL
+#undef TSH
+
+		AS1(	emms)
+		AS1(	pop		esp)
+
+#ifdef __GNUC__
+		AS1(	pop		ebx)
+		".att_syntax prefix;"
+			:
+			: "a" (Whirlpool_C), "c" (digest), "d" (block)
+			: "%esi", "%edi", "memory", "cc"
+		);
+#endif
+	}
+	else
+#endif		// #ifdef CRYPTOPP_X86_ASM_AVAILABLE
 	{
 	word64 s[8];	// the cipher state
 	word64 k[8];	// the round key

 	// Compute and apply K^0 to the cipher state
 	// Also apply part of the Miyaguchi-Preneel compression function
-	digest[0] = s[0] = block[0] ^ (k[0] = digest[0]);
-	digest[1] = s[1] = block[1] ^ (k[1] = digest[1]);
-	digest[2] = s[2] = block[2] ^ (k[2] = digest[2]);
-	digest[3] = s[3] = block[3] ^ (k[3] = digest[3]);
-	digest[4] = s[4] = block[4] ^ (k[4] = digest[4]);
-	digest[5] = s[5] = block[5] ^ (k[5] = digest[5]);
-	digest[6] = s[6] = block[6] ^ (k[6] = digest[6]);
-	digest[7] = s[7] = block[7] ^ (k[7] = digest[7]);
+	for (int i=0; i<8; i++)
+		digest[i] = s[i] = block[i] ^ (k[i] = digest[i]);
+
+#define KSL(op, i, a, b, c, d)	\
+	t = (word32)k[i];\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : 0);
+
+#define KSH(op, i, a, b, c, d)	\
+	t = (word32)(k[(i+4)%8]>>32);\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32));\
+	if (op==2) k[a] = w##a;\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : rotrFixed(w##b, 32));\
+	if (op==2) k[b] = w##b;\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : rotrFixed(w##c, 32));\
+	if (op==2) k[c] = w##c;\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : rotrFixed(w##d, 32));\
+	if (op==2) k[d] = w##d;\
+
+#define TSL(op, i, a, b, c, d)	\
+	t = (word32)s[i];\
+	w##a = Whirlpool_C[3*256 + (byte)t] ^ (op ? w##a : 0);\
+	t >>= 8;\
+	w##b = Whirlpool_C[2*256 + (byte)t] ^ (op ? w##b : 0);\
+	t >>= 8;\
+	w##c = Whirlpool_C[1*256 + (byte)t] ^ (op ? w##c : 0);\
+	t >>= 8;\
+	w##d = Whirlpool_C[0*256 + t]       ^ (op ? w##d : 0);
+
+#define TSH_OP(op, a, b)	\
+	w##a = Whirlpool_C[b*256 + (byte)t] ^ (op ? w##a : rotrFixed(w##a, 32) ^ k[a]);\
+	if (op==2) s[a] = w##a;\
+	if (op==3) digest[a] ^= w##a;\
+
+#define TSH(op, i, a, b, c, d)	\
+	t = (word32)(s[(i+4)%8]>>32);\
+	TSH_OP(op, a, 3);\
+	t >>= 8;\
+	TSH_OP(op, b, 2);\
+	t >>= 8;\
+	TSH_OP(op, c, 1);\
+	t >>= 8;\
+	TSH_OP(op, d, 0);\

 	// Iterate over all rounds:
-	for (int r = 0; r < R; r++)
+	int r=0;
+	while (true)
 	{
 		word64 w0, w1, w2, w3, w4, w5, w6, w7;	// temporary storage
-		word64 t;
+		word32 t;

-		// Compute K^r from K^{r-1}:
-#define K(i,j) GETBYTE(k[(i+j+1)%8], j)
-#define KS(i) \
-	t = C0[K(i,3)] ^ C1[K(i,2)] ^ C2[K(i,1)] ^ C3[K(i,0)]; \
-	w##i = rotrFixed(t, 32) ^ C0[K(i,7)] ^ C1[K(i,6)] ^ C2[K(i,5)] ^ C3[K(i,4)];
+		KSL(0, 4, 3, 2, 1, 0)
+		KSL(0, 0, 7, 6, 5, 4)
+		KSL(1, 1, 0, 7, 6, 5)
+		KSL(1, 2, 1, 0, 7, 6)
+		KSL(1, 3, 2, 1, 0, 7)
+		KSL(1, 5, 4, 3, 2, 1)
+		KSL(1, 6, 5, 4, 3, 2)
+		KSL(1, 7, 6, 5, 4, 3)
+		KSH(0, 0, 7, 6, 5, 4)
+		KSH(0, 4, 3, 2, 1, 0)
+		KSH(1, 1, 0, 7, 6, 5)
+		KSH(1, 2, 1, 0, 7, 6)
+		KSH(1, 5, 4, 3, 2, 1)
+		KSH(1, 6, 5, 4, 3, 2)
+		KSH(2, 3, 2, 1, 0, 7)
+		KSH(2, 7, 6, 5, 4, 3)

-		KS(0); KS(1); KS(2); KS(3); KS(4); KS(5); KS(6); KS(7);
-		k[0] = w0 ^ rc[r];
-		k[1] = w1; k[2] = w2; k[3] = w3; k[4] = w4; k[5] = w5; k[6] = w6; k[7] = w7;
+		k[0] ^= Whirlpool_C[1024+r];

-		// Apply the r-th round transformation:
-#define S(i,j) GETBYTE(s[(i+j+1)%8], j)
-#define TS(i) \
-	t = C0[S(i,3)] ^ C1[S(i,2)] ^ C2[S(i,1)] ^ C3[S(i,0)]; \
-	w##i = rotrFixed(t, 32) ^ C0[S(i,7)] ^ C1[S(i,6)] ^ C2[S(i,5)] ^ C3[S(i,4)] ^ k[i];
+		TSL(0, 4, 3, 2, 1, 0)
+		TSL(0, 0, 7, 6, 5, 4)
+		TSL(1, 1, 0, 7, 6, 5)
+		TSL(1, 2, 1, 0, 7, 6)
+		TSL(1, 3, 2, 1, 0, 7)
+		TSL(1, 5, 4, 3, 2, 1)
+		TSL(1, 6, 5, 4, 3, 2)
+		TSL(1, 7, 6, 5, 4, 3)
+		TSH(0, 0, 7, 6, 5, 4)
+		TSH(0, 4, 3, 2, 1, 0)
+		TSH(1, 1, 0, 7, 6, 5)
+		TSH(1, 2, 1, 0, 7, 6)
+		TSH(1, 5, 4, 3, 2, 1)
+		TSH(1, 6, 5, 4, 3, 2)

-		TS(0); TS(1); TS(2); TS(3); TS(4); TS(5); TS(6); TS(7);
-		s[0] = w0; s[1] = w1; s[2] = w2; s[3] = w3; s[4] = w4; s[5] = w5; s[6] = w6; s[7] = w7;
+		if (++r < R)
+		{
+			TSH(2, 3, 2, 1, 0, 7)
+			TSH(2, 7, 6, 5, 4, 3)
+		}
+		else
+		{
+			TSH(3, 3, 2, 1, 0, 7)
+			TSH(3, 7, 6, 5, 4, 3)
+			break;
+		}
+	}
 	}
-
-	// Apply the rest of the Miyaguchi-Preneel compression function:
-	digest[0] ^= s[0];
-	digest[1] ^= s[1];
-	digest[2] ^= s[2];
-	digest[3] ^= s[3];
-	digest[4] ^= s[4];
-	digest[5] ^= s[5];
-	digest[6] ^= s[6];
-	digest[7] ^= s[7];
 }

 NAMESPACE_END
--- a/whrlpool.h
+++ b/whrlpool.h
@ -9,8 +9,7 @@

 NAMESPACE_BEGIN(CryptoPP)

-//! <a href="http://www.weidai.com/scan-mirror/md.html#Whirlpool">Whirlpool</a>
-/*! 512 Bit Hash */
+//! <a href="http://www.cryptolounge.org/wiki/Whirlpool">Whirlpool</a>
 class Whirlpool : public IteratedHashWithStaticTransform<word64, BigEndian, 64, 64, Whirlpool>
 {
 public: