diff --git a/blake2-simd.cpp b/blake2-simd.cpp index 54108c44..d3bae574 100644 --- a/blake2-simd.cpp +++ b/blake2-simd.cpp @@ -16,6 +16,12 @@ // #undef CRYPTOPP_SSE41_AVAILABLE // #undef CRYPTOPP_ARM_NEON_AVAILABLE +// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about +// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367. +#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT) +# undef CRYPTOPP_ARM_NEON_AVAILABLE +#endif + #if !(defined(__ARM_NEON) || defined(_MSC_VER)) # undef CRYPTOPP_ARM_NEON_AVAILABLE #endif @@ -43,7 +49,7 @@ NAMESPACE_BEGIN(CryptoPP) inline __m128i MM_SET_EPI64X(const word64 a, const word64 b) { const word64 t[2] = {b,a}; __m128i r; - ::memcpy(&r, t, sizeof(t)); + std::memcpy(&r, t, sizeof(t)); return r; } #else @@ -1600,8 +1606,7 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state } #endif // CRYPTOPP_SSE41_AVAILABLE -// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367 -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#if CRYPTOPP_ARM_NEON_AVAILABLE void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state) { #define BLAKE2S_LOAD_MSG_0_1(buf) \ @@ -2179,6 +2184,6 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l))); vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h))); } -#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#endif // CRYPTOPP_ARM_NEON_AVAILABLE NAMESPACE_END diff --git a/blake2.cpp b/blake2.cpp index 83e1bdaf..a54902c9 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -17,6 +17,12 @@ NAMESPACE_BEGIN(CryptoPP) // #undef CRYPTOPP_SSE41_AVAILABLE // #undef CRYPTOPP_ARM_NEON_AVAILABLE +// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about +// 3 cpb faster than NEON/ASIMD. 
Also see http://github.com/weidai11/cryptopp/issues/367. +#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT) +# undef CRYPTOPP_ARM_NEON_AVAILABLE +#endif + void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state); void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state); @@ -25,8 +31,7 @@ extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state); #endif -// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367 -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#if CRYPTOPP_ARM_NEON_AVAILABLE extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state); extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state); #endif @@ -93,7 +98,7 @@ pfnCompress64 InitializeCompress64Fn() #if CRYPTOPP_SSE41_AVAILABLE HasSSE41() ? &BLAKE2_Compress64_SSE4 : #endif -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#if CRYPTOPP_ARM_NEON_AVAILABLE HasNEON() ? &BLAKE2_Compress64_NEON : #endif &BLAKE2_Compress64_CXX; @@ -105,7 +110,7 @@ pfnCompress32 InitializeCompress32Fn() #if CRYPTOPP_SSE41_AVAILABLE HasSSE41() ? &BLAKE2_Compress32_SSE4 : #endif -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +#if CRYPTOPP_ARM_NEON_AVAILABLE HasNEON() ? &BLAKE2_Compress32_NEON : #endif &BLAKE2_Compress32_CXX; diff --git a/config.h b/config.h index 94a678c8..36cd7dce 100644 --- a/config.h +++ b/config.h @@ -118,6 +118,12 @@ // of 'b', 'o', 'h' or '.' (the last for decimal). // #define CRYPTOPP_USE_STD_SHOWBASE +// Define this if ARMv8 shifts are slow. ARM Cortex-A53 and Cortex-A57 shift +// operations perform poorly, so NEON and ASIMD code that relies on shifts +// or rotates often performs worse than regular C/C++ code. Also see +// http://github.com/weidai11/cryptopp/issues/367. 
+#define CRYPTOPP_SLOW_ARMV8_SHIFT 1 + // Define this if you want to decouple AlgorithmParameters and Integer // The decoupling should make it easier for the linker to remove Integer // related code for those who do not need Integer, and avoid a potential