Add CRYPTOPP_SLOW_ARMV8_SHIFT for Aarch32 and Aarch64

Both BLAKE2 and SPECK slow down when using NEON/ASIMD. When just BLAKE2 experienced the issue, it was a one-off problem. It's now wider than a one-off, so add the formal define.
pull/548/head
Jeffrey Walton 2017-11-23 02:22:27 -05:00
parent 78ec2aa5f4
commit b08596da44
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 24 additions and 8 deletions

View File

@ -16,6 +16,12 @@
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif
#if !(defined(__ARM_NEON) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif
@ -43,7 +49,7 @@ NAMESPACE_BEGIN(CryptoPP)
// Builds an __m128i from two 64-bit words by copying them out of a temporary
// array. 'b' is placed in the first array element and 'a' in the second, and
// sizeof(t) bytes are then copied into the result, which is returned by value.
inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
{
const word64 t[2] = {b,a}; __m128i r;
// NOTE(review): this diff view lists the old (::memcpy) and the new
// (std::memcpy) line back to back; presumably only the std::memcpy call
// remains in the committed file -- confirm against the repository.
::memcpy(&r, t, sizeof(t));
std::memcpy(&r, t, sizeof(t));
return r;
}
#else
@ -1600,8 +1606,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state
}
#endif // CRYPTOPP_SSE41_AVAILABLE
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state)
{
#define BLAKE2S_LOAD_MSG_0_1(buf) \
@ -2179,6 +2184,6 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
}
#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
NAMESPACE_END

View File

@ -17,6 +17,12 @@ NAMESPACE_BEGIN(CryptoPP)
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif
void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State<word32, false>& state);
void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State<word64, true>& state);
@ -25,8 +31,7 @@ extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false
extern void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state);
#endif
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state);
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state);
#endif
@ -93,7 +98,7 @@ pfnCompress64 InitializeCompress64Fn()
#if CRYPTOPP_SSE41_AVAILABLE
HasSSE41() ? &BLAKE2_Compress64_SSE4 :
#endif
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
HasNEON() ? &BLAKE2_Compress64_NEON :
#endif
&BLAKE2_Compress64_CXX;
@ -105,7 +110,7 @@ pfnCompress32 InitializeCompress32Fn()
#if CRYPTOPP_SSE41_AVAILABLE
HasSSE41() ? &BLAKE2_Compress32_SSE4 :
#endif
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
HasNEON() ? &BLAKE2_Compress32_NEON :
#endif
&BLAKE2_Compress32_CXX;

View File

@ -118,6 +118,12 @@
// of 'b', 'o', 'h' or '.' (the last for decimal).
// #define CRYPTOPP_USE_STD_SHOWBASE
// Define this if ARMv8 shifts are slow. ARM Cortex-A53 and Cortex-A57 shift
// operations perform poorly, so NEON and ASIMD code that relies on shifts
// or rotates often performs worse than regular C/C++ code. Also see
// http://github.com/weidai11/cryptopp/issues/367.
#define CRYPTOPP_SLOW_ARMV8_SHIFT 1
// Define this if you want to decouple AlgorithmParameters and Integer
// The decoupling should make it easier for the linker to remove Integer
// related code for those who do not need Integer, and avoid a potential