From 404e6cfae33c8185ef8752c7d62ce111583d4829 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sat, 23 Jun 2018 20:55:17 -0400 Subject: [PATCH] Remove CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS GCC 8 was producing bad decryption results for CBC mode on x86. NEON and Aarch64 was fine. We lose 0.6 cpb so LEA runs around 3.5 cpb instead of 2.9 cpb. It would be nice to pinpoint the GCC issue but it is kind of difficult at the moment. --- lea-simd.cpp | 30 ++---------------------------- lea.cpp | 20 -------------------- lea.h | 10 +--------- 3 files changed, 3 insertions(+), 57 deletions(-) diff --git a/lea-simd.cpp b/lea-simd.cpp index 703930c9..75e710b7 100644 --- a/lea-simd.cpp +++ b/lea-simd.cpp @@ -140,11 +140,7 @@ uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b) template inline uint32x4_t LoadKey(const word32 rkey[]) { -#if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS) - return vld1q_u32(&rkey[IDX*4]); -#else return vdupq_n_u32(rkey[IDX]); -#endif } template @@ -599,7 +595,8 @@ inline __m128i RotateRight<8>(const __m128i& val) template inline __m128i LoadKey(const word32 rkey[]) { - return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]); + float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); + return _mm_castps_si128(_mm_load_ps1(&rk)); } template @@ -989,17 +986,6 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) #if defined(CRYPTOPP_SSSE3_AVAILABLE) -void LEA_SplatKeys_SSSE3(SecBlock& rkeys) -{ - SecBlock temp(rkeys.size() * 4); - for (size_t i=0, j=0; i& rkeys) -{ - SecBlock temp(rkeys.size() * 4); - for (size_t i=0, j=0; i& rkeys); - extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -567,10 +565,6 @@ extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t # endif # if (CRYPTOPP_ARM_NEON_AVAILABLE) -# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS) -extern void LEA_SplatKeys_NEON(SecBlock& rkeys); -# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS - extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -609,20 +603,6 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con default: CRYPTOPP_ASSERT(0);; } - - // If we pre-splat the round keys at setup then we avoid a shuffle - // at runtime for each subkey used during encryption and decryption. - // Pre-splatting saves about 0.7 to 1.0 cpb at the cost of 4x storage. -#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE) - if (HasSSSE3()) - LEA_SplatKeys_SSSE3(m_rkey); -#endif -#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_ARM_NEON_AVAILABLE) -# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS) - if (HasNEON()) - LEA_SplatKeys_NEON(m_rkey); -# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS -#endif } void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const diff --git a/lea.h b/lea.h index 5b9f74b0..d8d95700 100644 --- a/lea.h +++ b/lea.h @@ -19,19 +19,11 @@ # define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1 #endif -// Define this if you want to pre-splat the round key table -// for NEON and Aarch64. Pre-splatting the round key increases -// performance by about 0.7 cpb on ARM server boards like an -// AMD Opteron A1100. However, it crushes performance on ARM -// dev-boards like LeMaker HiKey and Pine64. HiKey and Pine64 -// run about 8 cpb slower when pre-splatting the round keys. -// # define CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS 1 - NAMESPACE_BEGIN(CryptoPP) /// \brief LEA block cipher information /// \since Crypto++ 7.1 -struct LEA_Info : public FixedBlockSize<16>, VariableKeyLength<16,16,32,8> +struct LEA_Info : public FixedBlockSize<16>, public VariableKeyLength<16,16,32,8> { static const std::string StaticAlgorithmName() {