Add CRYPTOPP_SLOW_ARMV8_SHIFT for Aarch32 and Aarch64
Both BLAKE2 and SPECK slow down when using NEON/ASIMD. When only BLAKE2 experienced the issue, it was a one-off problem. It's now wider than a one-off, so add the formal define.
pull/548/head
parent
78ec2aa5f4
commit
b08596da44
|
|
@ -16,6 +16,12 @@
|
||||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||||
|
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !(defined(__ARM_NEON) || defined(_MSC_VER))
|
#if !(defined(__ARM_NEON) || defined(_MSC_VER))
|
||||||
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -43,7 +49,7 @@ NAMESPACE_BEGIN(CryptoPP)
|
||||||
inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
|
inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
|
||||||
{
|
{
|
||||||
const word64 t[2] = {b,a}; __m128i r;
|
const word64 t[2] = {b,a}; __m128i r;
|
||||||
::memcpy(&r, t, sizeof(t));
|
std::memcpy(&r, t, sizeof(t));
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
@ -1600,8 +1606,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_SSE41_AVAILABLE
|
#endif // CRYPTOPP_SSE41_AVAILABLE
|
||||||
|
|
||||||
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
|
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
|
||||||
void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state)
|
void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state)
|
||||||
{
|
{
|
||||||
#define BLAKE2S_LOAD_MSG_0_1(buf) \
|
#define BLAKE2S_LOAD_MSG_0_1(buf) \
|
||||||
|
|
@ -2179,6 +2184,6 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state
|
||||||
vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
|
vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
|
||||||
vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
|
vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
13
blake2.cpp
13
blake2.cpp
|
|
@ -17,6 +17,12 @@ NAMESPACE_BEGIN(CryptoPP)
|
||||||
// #undef CRYPTOPP_SSE41_AVAILABLE
|
// #undef CRYPTOPP_SSE41_AVAILABLE
|
||||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||||
|
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State<word32, false>& state);
|
void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State<word32, false>& state);
|
||||||
void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State<word64, true>& state);
|
void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State<word64, true>& state);
|
||||||
|
|
||||||
|
|
@ -25,8 +31,7 @@ extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false
|
||||||
extern void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state);
|
extern void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State<word64, true>& state);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
|
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
|
||||||
extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state);
|
extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state);
|
||||||
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state);
|
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State<word64, true>& state);
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -93,7 +98,7 @@ pfnCompress64 InitializeCompress64Fn()
|
||||||
#if CRYPTOPP_SSE41_AVAILABLE
|
#if CRYPTOPP_SSE41_AVAILABLE
|
||||||
HasSSE41() ? &BLAKE2_Compress64_SSE4 :
|
HasSSE41() ? &BLAKE2_Compress64_SSE4 :
|
||||||
#endif
|
#endif
|
||||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
HasNEON() ? &BLAKE2_Compress64_NEON :
|
HasNEON() ? &BLAKE2_Compress64_NEON :
|
||||||
#endif
|
#endif
|
||||||
&BLAKE2_Compress64_CXX;
|
&BLAKE2_Compress64_CXX;
|
||||||
|
|
@ -105,7 +110,7 @@ pfnCompress32 InitializeCompress32Fn()
|
||||||
#if CRYPTOPP_SSE41_AVAILABLE
|
#if CRYPTOPP_SSE41_AVAILABLE
|
||||||
HasSSE41() ? &BLAKE2_Compress32_SSE4 :
|
HasSSE41() ? &BLAKE2_Compress32_SSE4 :
|
||||||
#endif
|
#endif
|
||||||
#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE
|
#if CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
HasNEON() ? &BLAKE2_Compress32_NEON :
|
HasNEON() ? &BLAKE2_Compress32_NEON :
|
||||||
#endif
|
#endif
|
||||||
&BLAKE2_Compress32_CXX;
|
&BLAKE2_Compress32_CXX;
|
||||||
|
|
|
||||||
6
config.h
6
config.h
|
|
@ -118,6 +118,12 @@
|
||||||
// of 'b', 'o', 'h' or '.' (the last for decimal).
|
// of 'b', 'o', 'h' or '.' (the last for decimal).
|
||||||
// #define CRYPTOPP_USE_STD_SHOWBASE
|
// #define CRYPTOPP_USE_STD_SHOWBASE
|
||||||
|
|
||||||
|
// Define this if ARMv8 shifts are slow. ARM Cortex-A53 and Cortex-A57 shift
|
||||||
|
// operations perform poorly, so NEON and ASIMD code that relies on shifts
|
||||||
|
// or rotates often performs worse than regular C/C++ code. Also see
|
||||||
|
// http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#define CRYPTOPP_SLOW_ARMV8_SHIFT 1
|
||||||
|
|
||||||
// Define this if you want to decouple AlgorithmParameters and Integer
|
// Define this if you want to decouple AlgorithmParameters and Integer
|
||||||
// The decoupling should make it easier for the linker to remove Integer
|
// The decoupling should make it easier for the linker to remove Integer
|
||||||
// related code for those who do not need Integer, and avoid a potential
|
// related code for those who do not need Integer, and avoid a potential
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue