diff --git a/simeck-simd.cpp b/simeck-simd.cpp
index 87c02eea..099b8731 100644
--- a/simeck-simd.cpp
+++ b/simeck-simd.cpp
@@ -22,6 +22,10 @@
 # include <tmmintrin.h>
 #endif
 
+#if defined(__XOP__)
+# include <ammintrin.h>
+#endif
+
 #if defined(__AVX512F__) && defined(__AVX512VL__)
 # define CRYPTOPP_AVX512_ROTATE 1
 # include <immintrin.h>
@@ -44,6 +48,8 @@ inline __m128i RotateLeft32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
     return _mm_rol_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, R);
 #else
     return _mm_or_si128(
         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
@@ -55,6 +61,8 @@ inline __m128i RotateRight32(const __m128i& val)
 {
 #if defined(CRYPTOPP_AVX512_ROTATE)
     return _mm_ror_epi32(val, R);
+#elif defined(__XOP__)
+    return _mm_roti_epi32(val, 32-R);
 #else
     return _mm_or_si128(
         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
@@ -65,16 +73,24 @@ inline __m128i RotateRight32(const __m128i& val)
 template <>
 inline __m128i RotateLeft32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 8);
+#else
     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight32<8>(const __m128i& val)
 {
+#if defined(__XOP__)
+    return _mm_roti_epi32(val, 32-8);
+#else
     const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 /// \brief Unpack XMM words