diff --git a/lea-simd.cpp b/lea-simd.cpp
index 703930c9..75e710b7 100644
--- a/lea-simd.cpp
+++ b/lea-simd.cpp
@@ -140,11 +140,7 @@ uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
 template <unsigned int IDX>
 inline uint32x4_t LoadKey(const word32 rkey[])
 {
-#if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
-    return vld1q_u32(&rkey[IDX*4]);
-#else
     return vdupq_n_u32(rkey[IDX]);
-#endif
 }
 
 template <unsigned int IDX>
@@ -599,7 +595,8 @@ inline __m128i RotateRight<8>(const __m128i& val)
 template <unsigned int IDX>
 inline __m128i LoadKey(const word32 rkey[])
 {
-    return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]);
+    float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
+    return _mm_castps_si128(_mm_load_ps1(&rk));
 }
 
 template <unsigned int IDX>
@@ -989,17 +986,6 @@ ANONYMOUS_NAMESPACE_END
 NAMESPACE_BEGIN(CryptoPP)
 
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
-void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys)
-{
-    SecBlock<word32> temp(rkeys.size() * 4);
-    for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
-    {
-        temp[j+0] = temp[j+1] = rkeys[i];
-        temp[j+2] = temp[j+3] = rkeys[i];
-    }
-    std::swap(rkeys, temp);
-}
-
 size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
@@ -1047,19 +1033,6 @@
 #endif  // CRYPTOPP_SSSE3_AVAILABLE
 
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
-#if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
-void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys)
-{
-    SecBlock<word32> temp(rkeys.size() * 4);
-    for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
-    {
-        temp[j+0] = temp[j+1] = rkeys[i];
-        temp[j+2] = temp[j+3] = rkeys[i];
-    }
-    std::swap(rkeys, temp);
-}
-#endif  // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
-
 size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
diff --git a/lea.cpp b/lea.cpp
--- a/lea.cpp
+++ b/lea.cpp
@@ -556,8 +556,6 @@
 
 #if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS)
 # if (CRYPTOPP_SSSE3_AVAILABLE)
-extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
-
 extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 
@@ -567,10 +565,6 @@ extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t
 # endif
 
 # if (CRYPTOPP_ARM_NEON_AVAILABLE)
-# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
-extern void LEA_SplatKeys_NEON(SecBlock<word32>& rkeys);
-# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
-
 extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 
@@ -609,20 +603,6 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
     default:
         CRYPTOPP_ASSERT(0);;
     }
-
-    // If we pre-splat the round keys at setup then we avoid a shuffle
-    // at runtime for each subkey used during encryption and decryption.
-    // Pre-splatting saves about 0.7 to 1.0 cpb at the cost of 4x storage.
-#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE)
-    if (HasSSSE3())
-        LEA_SplatKeys_SSSE3(m_rkey);
-#endif
-#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_ARM_NEON_AVAILABLE)
-# if (CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS)
-    if (HasNEON())
-        LEA_SplatKeys_NEON(m_rkey);
-# endif // CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS
-#endif
 }
 
 void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
diff --git a/lea.h b/lea.h
index 5b9f74b0..d8d95700 100644
--- a/lea.h
+++ b/lea.h
@@ -19,19 +19,11 @@
 # define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
 #endif
 
-// Define this if you want to pre-splat the round key table
-// for NEON and Aarch64. Pre-splatting the round key increases
-// performance by about 0.7 cpb on ARM server boards like an
-// AMD Opteron A1100. However, it crushes performance on ARM
-// dev-boards like LeMaker HiKey and Pine64. HiKey and Pine64
-// run about 8 cpb slower when pre-splatting the round keys.
-// # define CRYPTOPP_LEA_ARM_SPLAT_ROUNDKEYS 1
-
 NAMESPACE_BEGIN(CryptoPP)
 
 /// \brief LEA block cipher information
 /// \since Crypto++ 7.1
-struct LEA_Info : public FixedBlockSize<16>, VariableKeyLength<16,16,32,8>
+struct LEA_Info : public FixedBlockSize<16>, public VariableKeyLength<16,16,32,8>
 {
     static const std::string StaticAlgorithmName()
     {
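
Note on the new SSSE3 LoadKey path: with the pre-splatted key table gone, LoadKey broadcasts a single 32-bit round key into all four lanes at runtime, which is functionally the same as _mm_set1_epi32(rkey[IDX]); the float/memcpy/_mm_load_ps1 form is presumably there to encourage the compiler to emit a single broadcast load rather than a scalar load plus shuffle. Below is a minimal standalone sketch (not part of the patch; BroadcastKey and the sample key values are hypothetical) that checks the equivalence on any SSE2 machine.

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <emmintrin.h>   // SSE2: _mm_set1_epi32, _mm_storeu_si128, _mm_castps_si128
    #include <xmmintrin.h>   // SSE:  _mm_load_ps1

    // Hypothetical helper mirroring the patched LoadKey<IDX> pattern:
    // broadcast one 32-bit round key into all four lanes of an XMM register.
    static __m128i BroadcastKey(const uint32_t rkey[], size_t idx)
    {
        float rk; std::memcpy(&rk, rkey + idx, sizeof(rk));
        return _mm_castps_si128(_mm_load_ps1(&rk));
    }

    int main()
    {
        const uint32_t rkey[4] = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };

        const __m128i a = BroadcastKey(rkey, 2);
        const __m128i b = _mm_set1_epi32(static_cast<int>(rkey[2]));  // reference broadcast

        uint32_t va[4], vb[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(va), a);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(vb), b);

        for (int i = 0; i < 4; ++i)
            assert(va[i] == vb[i]);   // all four lanes hold rkey[2]
        return 0;
    }

The bit pattern survives the float round-trip because the value is only copied, loaded, and cast, never used in floating-point arithmetic.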