Add CRYPTOPP_SLOW_ARMV8_SHIFT for Aarch32 and Aarch64
Both BLAKE2 and SPECK slow down when using NEON/ASIMD. When just BLAKE2 experienced the issue, it was a one-off problem. It's now wider than a one-off, so add the formal define.
pull/548/head
parent
78ec2aa5f4
commit
b08596da44
|
|
@ -16,6 +16,12 @@
|
|||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#endif
|
||||
|
||||
#if !(defined(__ARM_NEON) || defined(_MSC_VER))
|
||||
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#endif
|
||||
|
|
@ -43,7 +49,7 @@ NAMESPACE_BEGIN(CryptoPP)
|
|||
inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
|
||||
{
|
||||
const word64 t[2] = {b,a}; __m128i r;
|
||||
::memcpy(&r, t, sizeof(t));
|
||||
std::memcpy(&r, t, sizeof(t));
|
||||
return r;
|
||||
}
|
||||
#else
|
||||
|
|
@ -1600,8 +1606,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state
|
|||
}
|
||||
#endif // CRYPTOPP_SSE41_AVAILABLE
|
||||
|
||||
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
|
||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state)
|
||||
{
|
||||
#define BLAKE2S_LOAD_MSG_0_1(buf) \
|
||||
|
|
@ -2179,6 +2184,6 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
|
|||
vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
|
||||
vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
|
||||
}
|
||||
#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
NAMESPACE_END
|
||||
|
|
|
|||
13
blake2.cpp
13
blake2.cpp
|
|
@ -17,6 +17,12 @@ NAMESPACE_BEGIN(CryptoPP)
|
|||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#endif
|
||||
|
||||
void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State<word32, false>& state);
|
||||
void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State<word64, true>& state);
|
||||
|
||||
|
|
@ -25,8 +31,7 @@ extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false
|
|||
extern void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state);
|
||||
#endif
|
||||
|
||||
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
|
||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state);
|
||||
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state);
|
||||
#endif
|
||||
|
|
@ -93,7 +98,7 @@ pfnCompress64 InitializeCompress64Fn()
|
|||
#if CRYPTOPP_SSE41_AVAILABLE
|
||||
HasSSE41() ? &BLAKE2_Compress64_SSE4 :
|
||||
#endif
|
||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
HasNEON() ? &BLAKE2_Compress64_NEON :
|
||||
#endif
|
||||
&BLAKE2_Compress64_CXX;
|
||||
|
|
@ -105,7 +110,7 @@ pfnCompress32 InitializeCompress32Fn()
|
|||
#if CRYPTOPP_SSE41_AVAILABLE
|
||||
HasSSE41() ? &BLAKE2_Compress32_SSE4 :
|
||||
#endif
|
||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
HasNEON() ? &BLAKE2_Compress32_NEON :
|
||||
#endif
|
||||
&BLAKE2_Compress32_CXX;
|
||||
|
|
|
|||
6
config.h
6
config.h
|
|
@ -118,6 +118,12 @@
|
|||
// of 'b', 'o', 'h' or '.' (the last for decimal).
|
||||
// #define CRYPTOPP_USE_STD_SHOWBASE
|
||||
|
||||
// Define this if ARMv8 shifts are slow. ARM Cortex-A53 and Cortex-A57 shift
|
||||
// operations perform poorly, so NEON and ASIMD code that relies on shifts
|
||||
// or rotates often performs worse than regular C/C++ code. Also see
|
||||
// http://github.com/weidai11/cryptopp/issues/367.
|
||||
#define CRYPTOPP_SLOW_ARMV8_SHIFT 1
|
||||
|
||||
// Define this if you want to decouple AlgorithmParameters and Integer
|
||||
// The decoupling should make it easier for the linker to remove Integer
|
||||
// related code for those who do not need Integer, and avoid a potential
|
||||
|
|
|
|||
Loading…
Reference in New Issue