diff --git a/aria.cpp b/aria.cpp
index 53b39a9f..764cfea7 100644
--- a/aria.cpp
+++ b/aria.cpp
@@ -14,7 +14,11 @@
 #include "misc.h"
 #include "cpu.h"
 
-#include
+// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
+// to 200 cycles. GCC does fine on its own, and it slows things down a small bit.
+#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
+# define CRYPTOPP_ENABLE_ARIA_INTRINSICS 1
+#endif
 
 ANONYMOUS_NAMESPACE_BEGIN
 
@@ -313,46 +317,148 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
 	// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
 	word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
 
-	w0[0] = LoadWord(mk,0); w0[1] = LoadWord(mk,1);
-	w0[2] = LoadWord(mk,2); w0[3] = LoadWord(mk,3);
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	if (HasSSSE3())
+	{
+		// 7 SSE instructions
+		const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
+		const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
+		_mm_store_si128((__m128i*)w0, w);
 
-	t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
-	t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
+		_mm_store_si128((__m128i*)t, _mm_xor_si128(w,
+			_mm_load_si128((const __m128i*)KRK[q])));
+	}
+	else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	{
+		// 27 integer instructions
+		w0[0] = LoadWord(mk,0); w0[1] = LoadWord(mk,1);
+		w0[2] = LoadWord(mk,2); w0[3] = LoadWord(mk,3);
+
+		t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1];
+		t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3];
+	}
+
+	// 24 integer instructions
 	ARIA_FO;
 
-	if (keyBits > 128)
+	if (keyBits == 256)
 	{
-		w1[0] = LoadWord(mk,4);
-		w1[1] = LoadWord(mk,5);
-
-		if (keyBits > 192)
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+		if (HasSSSE3())
 		{
+			// 3 SSE instructions
+			const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
+			_mm_store_si128(reinterpret_cast<__m128i*>(w1),
+				_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
+		}
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+		{
+			// 14 integer instructions
+			w1[0] = LoadWord(mk,4);
+			w1[1] = LoadWord(mk,5);
 			w1[2] = LoadWord(mk,6);
 			w1[3] = LoadWord(mk,7);
 		}
-		else
-		{
-			w1[2]=w1[3]=0;
-		}
+	}
+	else if (keyBits == 192)
+	{
+		w1[0] = LoadWord(mk,4);
+		w1[1] = LoadWord(mk,5);
+		w1[2] = w1[3] = 0;
 	}
 	else
 	{
-		w1[0]=w1[1]=w1[2]=w1[3]=0;
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+		if (HasSSE2())
+		{
+			_mm_store_si128(reinterpret_cast<__m128i*>(w1), _mm_setzero_si128());
+		}
+		else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+		{
+			w1[0]=w1[1]=w1[2]=w1[3]=0;
+		}
 	}
 
-	w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
-	t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	if (HasSSSE3())
+	{
+		// 4 integer, 7 SSE instructions
+		const __m128i x = _mm_xor_si128(
+			_mm_load_si128((const __m128i*)(w1)),
+			_mm_load_si128((const __m128i*)(t)));
+
+		_mm_store_si128((__m128i*)(w1), x);
+		_mm_store_si128((__m128i*)(t), x);
+
+		q = (q==2) ? 0 : (q+1);
+		const __m128i y = _mm_xor_si128(
+			_mm_load_si128((const __m128i*)(t)),
+			_mm_load_si128((const __m128i*)(KRK[q])));
+
+		_mm_store_si128((__m128i*)(t), y);
+	}
+	else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	{
+		// 23 integer instructions
+		w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
+		t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
+
+		q = (q==2) ? 0 : (q+1);
+		t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+	}
 
-	q = (q==2) ? 0 : (q+1);
-	t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
 	ARIA_FE;
 
-	t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
-	w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
-	q = (q==2) ? 0 : (q+1);
-	t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	if (HasSSSE3())
+	{
+		// 4 integer, 7 SSE instructions
+		const __m128i x = _mm_xor_si128(
+			_mm_load_si128((const __m128i*)(w0)),
+			_mm_load_si128((const __m128i*)(t)));
+
+		_mm_store_si128((__m128i*)(w2), x);
+		_mm_store_si128((__m128i*)(t), x);
+
+		q = (q==2) ? 0 : (q+1);
+		const __m128i y = _mm_xor_si128(
+			_mm_load_si128((const __m128i*)(t)),
+			_mm_load_si128((const __m128i*)(KRK[q])));
+
+		_mm_store_si128((__m128i*)(t), y);
+	}
+	else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	{
+		// 23 integer instructions
+		t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
+		w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
+
+		q = (q==2) ? 0 : (q+1);
+		t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
+	}
+
 	ARIA_FO;
 
-	w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
+
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	if (HasSSSE3())
+	{
+		// 3 SSE instructions
+		const __m128i x = _mm_xor_si128(
+			_mm_load_si128((const __m128i*)(w1)),
+			_mm_load_si128((const __m128i*)(t)));
+
+		_mm_store_si128((__m128i*)(w3), x);
+	}
+	else
+#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+	{
+		// 14 integer instructions
+		w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3];
+	}
 
 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
 	if (HasNEON())
@@ -384,7 +490,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
 		}
 	}
 	else
-#endif
+#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
 	{
 		ARIA_GSRK<19>(w0, w1, rk + 0);
 		ARIA_GSRK<19>(w1, w2, rk + 16);
@@ -423,6 +529,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
 		rk = m_rk.data();
 		r = R; q = Q;
 
+		// 32 integer instructions
 		a=reinterpret_cast<word32*>(rk); z=a+r*4;
 		t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
 		a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
@@ -432,89 +539,92 @@
 	for (; a(m_rk.data());
 	word32 *t = const_cast<word32*>(m_w.data()+20);
 
-	t[0] = LoadWord(i,0); t[1] = LoadWord(i,1);
-	t[2] = LoadWord(i,2); t[3] = LoadWord(i,3);
+	// Visual Studio is generating bad code within the SSSE3 code block. It is
+	// providing a NULL pointer or a pointer set to a constant like 0x1000.
+	// It looks like some leftover garbage in the XMM register rather than
+	// the pointer loaded into the integer register for the non-SSE code path.
+	t[0] = LoadWord(inBlock,0); t[1] = LoadWord(inBlock,1);
+	t[2] = LoadWord(inBlock,2); t[3] = LoadWord(inBlock,3);
 
 	if (m_rounds > 12) {
-		ARIA_KXL rk+= 16; ARIA_FO
-		ARIA_KXL rk+= 16; ARIA_FE
+		ARIA_KXL; rk+= 16; ARIA_FO;
+		ARIA_KXL; rk+= 16; ARIA_FE;
 	}
 
 	if (m_rounds > 14) {
-		ARIA_KXL rk+= 16; ARIA_FO
-		ARIA_KXL rk+= 16; ARIA_FE
+		ARIA_KXL; rk+= 16; ARIA_FO;
+		ARIA_KXL; rk+= 16; ARIA_FE;
 	}
 
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE
-	ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE;
+	ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16;
 
 #ifdef IS_LITTLE_ENDIAN
-	o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   ) ^ rk[ 3];
-	o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
-	o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   ) ^ rk[ 1];
-	o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   ) ^ rk[ 0];
-	o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   ) ^ rk[ 7];
-	o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
-	o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   ) ^ rk[ 5];
-	o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   ) ^ rk[ 4];
-	o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   ) ^ rk[11];
-	o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
-	o[10] = (byte)(S1[ARIA_BRF(t[2],1)]   ) ^ rk[ 9];
-	o[11] = (byte)(S2[ARIA_BRF(t[2],0)]   ) ^ rk[ 8];
-	o[12] = (byte)(X1[ARIA_BRF(t[3],3)]   ) ^ rk[15];
-	o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
-	o[14] = (byte)(S1[ARIA_BRF(t[3],1)]   ) ^ rk[13];
-	o[15] = (byte)(S2[ARIA_BRF(t[3],0)]   ) ^ rk[12];
+	outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   ) ^ rk[ 3];
+	outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2];
+	outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   ) ^ rk[ 1];
+	outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   ) ^ rk[ 0];
+	outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   ) ^ rk[ 7];
+	outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6];
+	outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   ) ^ rk[ 5];
+	outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   ) ^ rk[ 4];
+	outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   ) ^ rk[11];
+	outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10];
+	outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)]   ) ^ rk[ 9];
+	outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)]   ) ^ rk[ 8];
+	outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)]   ) ^ rk[15];
+	outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14];
+	outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)]   ) ^ rk[13];
+	outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)]   ) ^ rk[12];
 #else
 #define ARIA_WORD(X,Y) (((word32 *)(X))[Y])
-	o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   );
-	o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
-	o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   );
-	o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   );
-	o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   );
-	o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
-	o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   );
-	o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   );
-	o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   );
-	o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
-	o[10] = (byte)(S1[ARIA_BRF(t[2],1)]   );
-	o[11] = (byte)(S2[ARIA_BRF(t[2],0)]   );
-	o[12] = (byte)(X1[ARIA_BRF(t[3],3)]   );
-	o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
-	o[14] = (byte)(S1[ARIA_BRF(t[3],1)]   );
-	o[15] = (byte)(S2[ARIA_BRF(t[3],0)]   );
-	ARIA_WORD(o,0)^=LoadWord(rk,0); ARIA_WORD(o,1)^=LoadWord(rk,1);
-	ARIA_WORD(o,2)^=LoadWord(rk,2); ARIA_WORD(o,3)^=LoadWord(rk,3);
+	outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   );
+	outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
+	outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   );
+	outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   );
+	outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   );
+	outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
+	outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   );
+	outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   );
+	outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   );
+	outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
+	outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)]   );
+	outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)]   );
+	outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)]   );
+	outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
+	outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)]   );
+	outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)]   );
+	ARIA_WORD(outBlock,0)^=LoadWord(rk,0);
+	ARIA_WORD(outBlock,1)^=LoadWord(rk,1);
+	ARIA_WORD(outBlock,2)^=LoadWord(rk,2);
+	ARIA_WORD(outBlock,3)^=LoadWord(rk,3);
 #endif
 
-	if (x)
+	if (xorBlock)
 		for (unsigned int n=0; n<16; ++n)
-			o[n] ^= x[n];
+			outBlock[n] ^= xorBlock[n];
 }
 
 NAMESPACE_END
diff --git a/config.h b/config.h
index 1d96fbd4..3109d56c 100644
--- a/config.h
+++ b/config.h
@@ -402,7 +402,7 @@ NAMESPACE_END
 	#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
 #endif
 
-	#if !defined(CRYPTOPP_DISABLE_SSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
+	#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
 	#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
 #else
 	#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
@@ -423,6 +423,12 @@ NAMESPACE_END
 	#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
 #endif
 
+#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__GNUC__) || (defined(__SSSE3__) && defined(__SSSE3__)))
+	#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1
+#else
+	#define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0
+#endif
+
 // Intrinsics availible in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and
 // MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx)
 // SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.
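
For readers who want to see the core SSSE3 trick from the aria.cpp hunks above in isolation: the standalone sketch below (not part of this patch; the helper names and the sample constants are made up for illustration) loads 16 key bytes, applies the same _mm_shuffle_epi8 mask the diff uses to reverse the bytes within each 32-bit lane — the SIMD equivalent of four big-endian word loads — and XORs in a 16-byte round-constant block, with a scalar fallback equivalent to the integer path. It uses unaligned loads so it does not depend on Crypto++'s aligned SecBlock storage; the library code above can use the aligned _mm_load_si128 forms and additionally gates the path behind a runtime HasSSSE3() check.

// demo_aria_swap.cpp -- illustrative sketch only, not Crypto++ code.
#include <cstdint>
#include <cstdio>

#if defined(__SSSE3__) || (defined(_MSC_VER) && _MSC_VER >= 1500)
# include <tmmintrin.h>
# define DEMO_HAVE_SSSE3 1
#endif

// Portable fallback: big-endian load of one 32-bit word.
static uint32_t load_be32(const unsigned char* p)
{
	return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
	       (uint32_t(p[2]) <<  8) |  uint32_t(p[3]);
}

// out[i] = big-endian word i of key[0..15], XORed with krk[i].
static void demo_key_xor(const unsigned char key[16], const uint32_t krk[4], uint32_t out[4])
{
#if defined(DEMO_HAVE_SSSE3)
	// Same shuffle mask as the patch: byte 0 of each result lane comes from
	// source byte 3, byte 1 from byte 2, and so on -- a per-lane byte reverse.
	const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
	const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)key), m);
	const __m128i x = _mm_xor_si128(w, _mm_loadu_si128((const __m128i*)krk));
	_mm_storeu_si128((__m128i*)out, x);
#else
	for (int i = 0; i < 4; ++i)
		out[i] = load_be32(key + 4*i) ^ krk[i];
#endif
}

int main()
{
	const unsigned char key[16] = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
	// Placeholder constants standing in for a KRK[q] block.
	const uint32_t krk[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
	uint32_t out[4];
	demo_key_xor(key, krk, out);
	std::printf("%08x %08x %08x %08x\n",
		(unsigned)out[0], (unsigned)out[1], (unsigned)out[2], (unsigned)out[3]);
	return 0;
}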