Add XOP aware ChaCha
ChaCha is about 50% faster when XOP is used for the rotates on AMD machines.
pull/730/head
parent
b4c4c5aa14
commit
ed4d57cecb
|
|
@ -26,6 +26,10 @@
|
|||
# include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __XOP__
|
||||
# include <ammintrin.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
|
@ -47,24 +51,36 @@ ANONYMOUS_NAMESPACE_BEGIN
|
|||
// Rotates each 32-bit lane of a 128-bit SSE value left by a compile-time
// constant number of bits.
//
// R   - rotate amount in bits, fixed at compile time.
// val - 128-bit value treated as four independent 32-bit lanes.
//
// Returns val with every 32-bit lane rotated left by R bits.
template <unsigned int R>
inline __m128i RotateLeft(const __m128i val)
{
#ifdef __XOP__
    // XOP builds use the dedicated per-lane rotate-by-immediate intrinsic.
    return _mm_roti_epi32(val, R);
#else
    // Without XOP the rotate is composed from a left shift by R and a
    // right shift by 32-R, combined with a bitwise OR.
    return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}
|
||||
|
||||
#ifdef __SSSE3__
|
||||
#if defined(__SSSE3__)
|
||||
template <>
|
||||
inline __m128i RotateLeft<8>(const __m128i val)
|
||||
{
|
||||
#ifdef __XOP__
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline __m128i RotateLeft<16>(const __m128i val)
|
||||
{
|
||||
#ifdef __XOP__
|
||||
return _mm_roti_epi32(val, R);
|
||||
#else
|
||||
const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
|
||||
return _mm_shuffle_epi8(val, mask);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif // SSSE3
|
||||
|
||||
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue