Add SSSE3 rotates when available

This change obtains the remaining 0.1 to 0.15 cpb. It should be engaged with -march=native
pull/730/head
Jeffrey Walton 2018-10-24 15:34:54 -04:00
parent c43c47e590
commit b4c4c5aa14
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 22 additions and 11 deletions

View File

@ -9,7 +9,7 @@
// SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// ARMv8 Power7 is upcoming.
// NEON and Power7 is upcoming.
#include "pch.h"
#include "config.h"
@ -22,6 +22,10 @@
# include <emmintrin.h>
#endif
#if (CRYPTOPP_SSSE3_INTRIN_AVAILABLE || CRYPTOPP_SSSE3_ASM_AVAILABLE)
# include <tmmintrin.h>
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif
@ -46,6 +50,22 @@ inline __m128i RotateLeft(const __m128i val)
return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
}
#ifdef __SSSE3__
template <>
inline __m128i RotateLeft<8>(const __m128i val)
{
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask);
}
template <>
inline __m128i RotateLeft<16>(const __m128i val)
{
const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
return _mm_shuffle_epi8(val, mask);
}
#endif
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
ANONYMOUS_NAMESPACE_END

View File

@ -33,10 +33,6 @@ std::string ChaCha_Policy::AlgorithmProvider() const
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
return "SSE2";
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
#endif
return "C++";
}
@ -95,11 +91,6 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
if (HasSSE2())
return 4*BYTES_PER_ITERATION;
else
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return 4*BYTES_PER_ITERATION;
else
#endif
return BYTES_PER_ITERATION;
}
@ -122,7 +113,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
if (m_state[12] < 4)
m_state[13]++;
input += 4*BYTES_PER_ITERATION;
input += !!xorInput*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}