From babdf8b38b2b01b7c9691ec95fd19b3551fb857b Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Wed, 24 Oct 2018 17:12:03 -0400
Subject: [PATCH] Add XOP aware CHAM and LEA

---
 chacha-simd.cpp |  4 ++--
 cham-simd.cpp   | 32 ++++++++++++++++++++++++++++++++
 lea-simd.cpp    | 20 ++++++++++++++++++++
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/chacha-simd.cpp b/chacha-simd.cpp
index 21782164..7b58b8a3 100644
--- a/chacha-simd.cpp
+++ b/chacha-simd.cpp
@@ -63,7 +63,7 @@ template <>
 inline __m128i RotateLeft<8>(const __m128i val)
 {
 #ifdef __XOP__
-    return _mm_roti_epi32(val, R);
+    return _mm_roti_epi32(val, 8);
 #else
     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
     return _mm_shuffle_epi8(val, mask);
@@ -74,7 +74,7 @@ template <>
 inline __m128i RotateLeft<16>(const __m128i val)
 {
 #ifdef __XOP__
-    return _mm_roti_epi32(val, R);
+    return _mm_roti_epi32(val, 16);
 #else
     const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
     return _mm_shuffle_epi8(val, mask);
diff --git a/cham-simd.cpp b/cham-simd.cpp
index a1bb5e85..8f3d0b5b 100644
--- a/cham-simd.cpp
+++ b/cham-simd.cpp
@@ -22,6 +22,10 @@
 # include <tmmintrin.h>
 #endif
 
+#if defined(__XOP__)
+# include <ammintrin.h>
+#endif
+
 #if defined(__AVX512F__) && defined(__AVX512VL__)
 # define CRYPTOPP_AVX512_ROTATE 1
 # include <immintrin.h>
@@ -44,31 +48,47 @@ NAMESPACE_BEGIN(W16)  // CHAM64, 16-bit word size
 template <unsigned int R>
 inline __m128i RotateLeft16(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
+#endif
 }
 
 template <unsigned int R>
 inline __m128i RotateRight16(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 16-R);
+#else
     return _mm_or_si128(
         _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateLeft16<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 8);
+#else
     const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight16<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi16(val, 16-8);
+#else
     const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 template
@@ -626,6 +646,8 @@ inline __m128i RotateLeft32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
     return _mm_rol_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, R);
 #else
     return _mm_or_si128(
         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
@@ -637,6 +659,8 @@ inline __m128i RotateRight32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
     return _mm_ror_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, 32-R);
 #else
     return _mm_or_si128(
         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
@@ -647,16 +671,24 @@ inline __m128i RotateRight32(const __m128i& val)
 template <>
 inline __m128i RotateLeft32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 8);
+#else
     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-8);
+#else
     const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 template
diff --git a/lea-simd.cpp b/lea-simd.cpp
index d49dfa82..c05d7505 100644
--- a/lea-simd.cpp
+++ b/lea-simd.cpp
@@ -22,6 +22,10 @@
 # include <tmmintrin.h>
 #endif
 
+#if defined(__XOP__)
+# include <ammintrin.h>
+#endif
+
 #if defined(__AVX512F__) && defined(__AVX512VL__)
 # define CRYPTOPP_AVX512_ROTATE 1
 # include <immintrin.h>
@@ -279,31 +283,47 @@ inline __m128i Sub(const __m128i& a, const __m128i& b)
 template <unsigned int R>
 inline __m128i RotateLeft(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+#endif
 }
 
 template <unsigned int R>
 inline __m128i RotateRight(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-R);
+#else
     return _mm_or_si128(
         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+#endif
 }
 
 // Faster than two Shifts and an Or.
 template <>
 inline __m128i RotateLeft<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 8);
+#else
     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 // Faster than two Shifts and an Or.
 template <>
 inline __m128i RotateRight<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-8);
+#else
     const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 template
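Note: the dispatch pattern this patch adds can be exercised outside the library with a
small standalone program. The sketch below is not part of the patch; it mirrors the
RotateLeft dispatch above under the assumption of a GCC- or Clang-style toolchain
(-mxop enables the XOP path; without it the portable SSE2 fallback compiles). The
file name rotl-xop.cpp and the demo main() are illustrative only. _mm_roti_epi32
rotates each 32-bit lane by an immediate in a single instruction, but it only runs
on AMD Bulldozer-family CPUs.

    // rotl-xop.cpp - standalone sketch, not part of the patch.
    // Build: g++ -O2 -mxop rotl-xop.cpp   (XOP path, Bulldozer-family CPUs)
    //        g++ -O2 -msse2 rotl-xop.cpp  (portable SSE2 fallback)
    #include <cstdio>
    #include <cstdint>
    #include <emmintrin.h>     // SSE2: shifts, OR, set/store
    #if defined(__XOP__)
    # include <ammintrin.h>    // XOP: _mm_roti_epi32
    #endif

    template <unsigned int R>
    inline __m128i RotateLeft32(const __m128i val)
    {
    #if defined(__XOP__)
        // XOP rotate-with-immediate: one instruction for all four lanes.
        return _mm_roti_epi32(val, R);
    #else
        // Baseline: two shifts and an OR per rotate.
        return _mm_or_si128(
            _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
    #endif
    }

    int main()
    {
        const __m128i x = _mm_set1_epi32(0x01234567);
        const __m128i y = RotateLeft32<8>(x);

        uint32_t out[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), y);
        std::printf("0x%08x\n", out[0]);   // 0x01234567 rotl 8 -> 0x23456701
        return 0;
    }

Both builds print the same value, which is a quick way to confirm the XOP path is a
drop-in replacement for the shift-and-OR (and shuffle-based) rotates it guards.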