From 59767be52e2537bf8460f211f57b4739c9b02dcb Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 12 Apr 2017 23:28:41 -0400 Subject: [PATCH] Add Intel and ARM intrinsics Win32 and Win64 benefited from the Intel intrinsics. A32 and Aarch64 benefited from the ARM intrinsics. The intrinsics shaved 150 to 350 cycles from key setup. The intrinsics slowed modern GCC down a small bit, and did not appear to affect old GCC. As such, Intel intrinsics were only enabled for Microsoft compilers. We were not able to improve encryption and decryption. In fact, some of the attempted macro conversions and intrinsics attempts slowed things down considerably. For example, GCC 5.4 on x86_64 went from 120 MB/s to about 70 MB/s when we tried to improve code around the Key XOR Layer (ARIA_KXL). --- aria.cpp | 268 +++++++++++++++++++++++++++++++++++++++---------------- config.h | 8 +- 2 files changed, 196 insertions(+), 80 deletions(-) diff --git a/aria.cpp b/aria.cpp index 53b39a9f..764cfea7 100644 --- a/aria.cpp +++ b/aria.cpp @@ -14,7 +14,11 @@ #include "misc.h" #include "cpu.h" -#include +// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150 +// to 200 cycles. GCC does fine on its own, and it slows things down a small bit. +#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER +# define CRYPTOPP_ENABLE_ARIA_INTRINSICS 1 +#endif ANONYMOUS_NAMESPACE_BEGIN @@ -313,46 +317,148 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam // w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area. 
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20; - w0[0] = LoadWord(mk,0); w0[1] = LoadWord(mk,1); - w0[2] = LoadWord(mk,2); w0[3] = LoadWord(mk,3); +#if CRYPTOPP_ENABLE_ARIA_INTRINSICS + if (HasSSSE3()) + { + // 7 SSE instructions + const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); + const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m); + _mm_store_si128((__m128i*)w0, w); - t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1]; - t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3]; + _mm_store_si128((__m128i*)t, _mm_xor_si128(w, + _mm_load_si128((const __m128i*)KRK[q]))); + } + else +#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS + { + // 27 integer instructions + w0[0] = LoadWord(mk,0); w0[1] = LoadWord(mk,1); + w0[2] = LoadWord(mk,2); w0[3] = LoadWord(mk,3); + + t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1]; + t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3]; + } + + // 24 integer instructions ARIA_FO; - if (keyBits > 128) + if (keyBits == 256) { - w1[0] = LoadWord(mk,4); - w1[1] = LoadWord(mk,5); - - if (keyBits > 192) +#if CRYPTOPP_ENABLE_ARIA_INTRINSICS + if (HasSSSE3()) { + // 3 SSE instructions + const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); + _mm_store_si128(reinterpret_cast<__m128i*>(w1), + _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m)); + } +#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS + { + // 14 integer instructions + w1[0] = LoadWord(mk,4); + w1[1] = LoadWord(mk,5); w1[2] = LoadWord(mk,6); w1[3] = LoadWord(mk,7); } - else - { - w1[2]=w1[3]=0; - } + } + else if (keyBits == 192) + { + w1[0] = LoadWord(mk,4); + w1[1] = LoadWord(mk,5); + w1[2] = w1[3] = 0; } else { - w1[0]=w1[1]=w1[2]=w1[3]=0; +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + if (HasSSE2()) + { + _mm_store_si128(reinterpret_cast<__m128i*>(w1), _mm_setzero_si128()); + } + else +#endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + { + w1[0]=w1[1]=w1[2]=w1[3]=0; + } } - w1[0]^=t[0]; w1[1]^=t[1];
w1[2]^=t[2]; w1[3]^=t[3]; - t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3]; +#if CRYPTOPP_ENABLE_ARIA_INTRINSICS + if (HasSSSE3()) + { + // 4 integer, 7 SSE instructions + const __m128i x = _mm_xor_si128( + _mm_load_si128((const __m128i*)(w1)), + _mm_load_si128((const __m128i*)(t))); + + _mm_store_si128((__m128i*)(w1), x); + _mm_store_si128((__m128i*)(t), x); + + q = (q==2) ? 0 : (q+1); + const __m128i y = _mm_xor_si128( + _mm_load_si128((const __m128i*)(t)), + _mm_load_si128((const __m128i*)(KRK[q]))); + + _mm_store_si128((__m128i*)(t), y); + } + else +#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS + { + // 23 integer instructions + w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3]; + t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3]; + + q = (q==2) ? 0 : (q+1); + t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; + } - q = (q==2) ? 0 : (q+1); - t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; ARIA_FE; - t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3]; - w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3]; - q = (q==2) ? 0 : (q+1); - t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; +#if CRYPTOPP_ENABLE_ARIA_INTRINSICS + if (HasSSSE3()) + { + // 4 integer, 7 SSE instructions + const __m128i x = _mm_xor_si128( + _mm_load_si128((const __m128i*)(w0)), + _mm_load_si128((const __m128i*)(t))); + + _mm_store_si128((__m128i*)(w2), x); + _mm_store_si128((__m128i*)(t), x); + + q = (q==2) ? 0 : (q+1); + const __m128i y = _mm_xor_si128( + _mm_load_si128((const __m128i*)(t)), + _mm_load_si128((const __m128i*)(KRK[q]))); + + _mm_store_si128((__m128i*)(t), y); + } + else +#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS + { + // 23 integer instructions + t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3]; + w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3]; + + q = (q==2) ? 
0 : (q+1); + t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; + } + ARIA_FO; - w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3]; + +#if CRYPTOPP_ENABLE_ARIA_INTRINSICS + if (HasSSSE3()) + { + // 3 SSE instructions + const __m128i x = _mm_xor_si128( + _mm_load_si128((const __m128i*)(w1)), + _mm_load_si128((const __m128i*)(t))); + + _mm_store_si128((__m128i*)(w3), x); + } + else +#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS + { + // 14 integer instructions + w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3]; + } #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE if (HasNEON()) @@ -384,7 +490,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam } } else -#endif +#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE { ARIA_GSRK<19>(w0, w1, rk + 0); ARIA_GSRK<19>(w1, w2, rk + 16); @@ -423,6 +529,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam rk = m_rk.data(); r = R; q = Q; + // 32 integer instructions a=reinterpret_cast(rk); z=a+r*4; t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3]; a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3]; @@ -432,89 +539,92 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam for (; a(m_rk.data()); word32 *t = const_cast(m_w.data()+20); - t[0] = LoadWord(i,0); t[1] = LoadWord(i,1); - t[2] = LoadWord(i,2); t[3] = LoadWord(i,3); + // Visual Studio is generating bad code within the SSSE3 code block. It is + // providing a NULL pointer or a pointer set to a constant like 0x1000. + // It looks like some leftover garbage in the XMM register rather than + // the pointer loaded into the integer register for the non-SSE code path.
+ t[0] = LoadWord(inBlock,0); t[1] = LoadWord(inBlock,1); + t[2] = LoadWord(inBlock,2); t[3] = LoadWord(inBlock,3); if (m_rounds > 12) { - ARIA_KXL rk+= 16; ARIA_FO - ARIA_KXL rk+= 16; ARIA_FE + ARIA_KXL; rk+= 16; ARIA_FO; + ARIA_KXL; rk+= 16; ARIA_FE; } if (m_rounds > 14) { - ARIA_KXL rk+= 16; ARIA_FO - ARIA_KXL rk+= 16; ARIA_FE + ARIA_KXL; rk+= 16; ARIA_FO; + ARIA_KXL; rk+= 16; ARIA_FE; } - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; ARIA_FE - ARIA_KXL rk+= 16; ARIA_FO ARIA_KXL rk+= 16; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; ARIA_FE; + ARIA_KXL; rk+= 16; ARIA_FO; ARIA_KXL; rk+= 16; #ifdef IS_LITTLE_ENDIAN - o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3]; - o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2]; - o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1]; - o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0]; - o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7]; - o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6]; - o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5]; - o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4]; - o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11]; - o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10]; - o[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9]; - o[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8]; - o[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15]; - o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14]; - o[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13]; - o[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12]; + outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3]; + outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8) ^ rk[ 2]; + 
outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ) ^ rk[ 1]; + outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ) ^ rk[ 0]; + outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ) ^ rk[ 7]; + outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8) ^ rk[ 6]; + outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ) ^ rk[ 5]; + outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ) ^ rk[ 4]; + outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ) ^ rk[11]; + outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8) ^ rk[10]; + outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ) ^ rk[ 9]; + outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ) ^ rk[ 8]; + outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ) ^ rk[15]; + outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8) ^ rk[14]; + outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ) ^ rk[13]; + outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ) ^ rk[12]; #else #define ARIA_WORD(X,Y) (((word32 *)(X))[Y]) - o[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); - o[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); - o[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); - o[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); - o[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); - o[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); - o[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); - o[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); - o[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); - o[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); - o[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); - o[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); - o[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); - o[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); - o[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); - o[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); - ARIA_WORD(o,0)^=LoadWord(rk,0); ARIA_WORD(o,1)^=LoadWord(rk,1); - ARIA_WORD(o,2)^=LoadWord(rk,2); ARIA_WORD(o,3)^=LoadWord(rk,3); + outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); + outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); + outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); + outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); + outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); + outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); + outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); + outBlock[ 
7] = (byte)(S2[ARIA_BRF(t[1],0)] ); + outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); + outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); + outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); + outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); + outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); + outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); + outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); + outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); + ARIA_WORD(outBlock,0)^=LoadWord(rk,0); + ARIA_WORD(outBlock,1)^=LoadWord(rk,1); + ARIA_WORD(outBlock,2)^=LoadWord(rk,2); + ARIA_WORD(outBlock,3)^=LoadWord(rk,3); #endif - if (x) + if (xorBlock) for (unsigned int n=0; n<16; ++n) - o[n] ^= x[n]; + outBlock[n] ^= xorBlock[n]; } NAMESPACE_END diff --git a/config.h b/config.h index 1d96fbd4..3109d56c 100644 --- a/config.h +++ b/config.h @@ -402,7 +402,7 @@ NAMESPACE_END #define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0 #endif - #if !defined(CRYPTOPP_DISABLE_SSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__))) + #if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__))) #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1 #else #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0 @@ -423,6 +423,12 @@ NAMESPACE_END #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 #endif +#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__GNUC__) || (defined(__SSSE3__) && defined(__SSSE3__))) + #define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1 +#else + #define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0 +#endif + // Intrinsics availible in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and // MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx) // SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4.