From ed4d57cecbd4a0a8a2a58742881c7af10e1be938 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Wed, 24 Oct 2018 16:15:13 -0400
Subject: [PATCH] Add XOP aware ChaCha

ChaCha is about 50% faster using XOP for the rotates on AMD machines
---
Review notes (this area is not part of the commit message):

* The RotateLeft<8> and RotateLeft<16> specializations passed `R` to
  _mm_roti_epi32, but no `R` is declared in an explicit specialization
  (`template <>`), so a build with both __XOP__ and __SSSE3__ defined
  would not compile. They now pass the literal rotate counts 8 and 16.
* The closing guard comment read "SSE3"; the guard is __SSSE3__.
* NOTE(review): the header names in the #include lines, the template
  parameter list, blank lines and indentation were missing from the copy
  of this patch under review. They were restored as <tmmintrin.h>,
  <ammintrin.h>, <arm_neon.h>, <unsigned int R> and four-space indents.
  Confirm them against chacha-simd.cpp before applying.
* NOTE(review): the author's e-mail address is missing from the From:
  header, and the post-image blob id on the index line predates these
  edits.

 chacha-simd.cpp | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/chacha-simd.cpp b/chacha-simd.cpp
index ce409173..21782164 100644
--- a/chacha-simd.cpp
+++ b/chacha-simd.cpp
@@ -26,6 +26,10 @@
 # include <tmmintrin.h>
 #endif
 
+#ifdef __XOP__
+# include <ammintrin.h>
+#endif
+
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
 # include <arm_neon.h>
 #endif
@@ -47,24 +51,39 @@ ANONYMOUS_NAMESPACE_BEGIN
 template <unsigned int R>
 inline __m128i RotateLeft(const __m128i val)
 {
+#ifdef __XOP__
+    // XOP supplies a packed 32-bit rotate, replacing the shift/shift/or sequence.
+    return _mm_roti_epi32(val, R);
+#else
     return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+#endif
 }
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 template <>
 inline __m128i RotateLeft<8>(const __m128i val)
 {
+#ifdef __XOP__
+    // No R is declared in an explicit specialization; pass the literal count.
+    return _mm_roti_epi32(val, 8);
+#else
     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
     return _mm_shuffle_epi8(val, mask);
+#endif
 }
 
 template <>
 inline __m128i RotateLeft<16>(const __m128i val)
 {
+#ifdef __XOP__
+    // No R is declared in an explicit specialization; pass the literal count.
+    return _mm_roti_epi32(val, 16);
+#else
     const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
     return _mm_shuffle_epi8(val, mask);
-}
 #endif
+}
+#endif // SSSE3
 
 #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
 