From 2c18fe8af8b4e7108a3b504b14863341ade8b040 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 18 Sep 2017 18:15:25 -0400 Subject: [PATCH] Refactor LoadT() and StoreT(). Add separate ReverseT() for little endian machines The refactoring has no effect on little endian machines. However, on big endian GCC119 using GCC 7.1 the performance improved by 2.5x for ECB and CTR modes: BEFORE: AES/CTR (128-bit key) 2723 1.4 0.163 670 AES/CTR (192-bit key) 2560 1.5 0.175 719 AES/CTR (256-bit key) 2728 1.4 0.183 749 AES/CBC (128-bit key) 1204 3.2 0.135 554 AES/CBC (192-bit key) 1066 3.7 0.148 605 AES/CBC (256-bit key) 948 4.1 0.155 635 AES/OFB (128-bit key) 1019 3.8 0.158 648 AES/CFB (128-bit key) 949 4.1 0.192 787 AES/ECB (128-bit key) 3564 1.1 0.082 337 AFTER: AES/CTR (128-bit key) 6484 0.6 0.163 677 AES/CTR (192-bit key) 5641 0.7 0.176 728 AES/CTR (256-bit key) 5005 0.8 0.183 761 AES/CBC (128-bit key) 1223 3.2 0.135 559 AES/CBC (192-bit key) 1080 3.7 0.147 611 AES/CBC (256-bit key) 966 4.1 0.155 642 AES/OFB (128-bit key) 1057 3.7 0.158 656 AES/CFB (128-bit key) 1217 3.3 0.186 774 AES/ECB (128-bit key) 7289 0.5 0.082 342 --- rijndael-simd.cpp | 77 +++++++++++++++++++++++------------------------ rijndael.cpp | 4 +-- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index e1bda7b4..06c65ea0 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -771,22 +771,7 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds) typedef __vector unsigned char uint8x16_p8; typedef __vector unsigned long long uint64x2_p8; -/* Reverses a 16-byte array as needed */ -void ByteReverseArrayLE(byte dest[16], const byte src[16]) -{ -#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN) - vec_st(vec_reve(vec_ld(0, src)), 0, dest); -#elif defined(IS_LITTLE_ENDIAN) - const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p8 zero = {0}; - vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, dest); -#else - if (src != dest) - std::memcpy(dest, src, 16); 
-#endif -} - -void ByteReverseArrayLE(byte src[16]) +void ByteReverseArray(byte src[16]) { #if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN) vec_st(vec_reve(vec_ld(0, src)), 0, src); @@ -797,78 +782,92 @@ void ByteReverseArrayLE(byte src[16]) #endif } -uint8x16_p8 Load8x16(const uint8_t src[16]) +static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src) +{ + const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + const uint8x16_p8 zero = {0}; + return vec_perm(src, zero, mask); +} + +static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src) +{ + const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + const uint8x16_p8 zero = {0}; + return (uint64x2_p8)vec_perm((uint8x16_p8)src, zero, mask); +} + +static inline uint8x16_p8 Load8x16(const uint8_t src[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ return vec_xl_be(0, (uint8_t*)src); #else - return (uint8x16_p8)vec_vsx_ld(0, src); +# if defined(IS_LITTLE_ENDIAN) + return Reverse8x16(vec_vsx_ld(0, src)); +# else + return vec_vsx_ld(0, src); +# endif #endif } -uint8x16_p8 Load8x16(int off, const uint8_t src[16]) +static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ return vec_xl_be(off, (uint8_t*)src); #else - return (uint8x16_p8)vec_vsx_ld(off, src); +# if defined(IS_LITTLE_ENDIAN) + return Reverse8x16(vec_vsx_ld(off, src)); +# else + return vec_vsx_ld(off, src); +# endif #endif } -void Store8x16(const uint8x16_p8 src, uint8_t dest[16]) +static inline void Store8x16(const uint8x16_p8 src, uint8_t dest[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ vec_xst_be(src, 0, (uint8_t*)dest); #else +# if defined(IS_LITTLE_ENDIAN) + vec_vsx_st(Reverse8x16(src), 0, dest); +# else vec_vsx_st(src, 0, dest); +# endif #endif } -uint64x2_p8 Load64x2(const uint8_t src[16]) +static inline uint64x2_p8 
Load64x2(const uint8_t src[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src); #else # if defined(IS_LITTLE_ENDIAN) - const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p8 zero = {0}; - return (uint64x2_p8)vec_perm(vec_vsx_ld(0, src), zero, mask); + return Reverse64x2((uint64x2_p8)vec_vsx_ld(0, src)); # else return (uint64x2_p8)vec_vsx_ld(0, src); # endif #endif } -uint64x2_p8 Load64x2(int off, const uint8_t src[16]) +static inline uint64x2_p8 Load64x2(int off, const uint8_t src[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ return (uint64x2_p8)vec_xl_be(off, (uint8_t*)src); #else # if defined(IS_LITTLE_ENDIAN) - const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p8 zero = {0}; - return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask); + return Reverse64x2((uint64x2_p8)vec_vsx_ld(off, src)); # else return (uint64x2_p8)vec_vsx_ld(off, src); # endif #endif } -void Store64x2(const uint64x2_p8 src, uint8_t dest[16]) +static inline void Store64x2(const uint64x2_p8 src, uint8_t dest[16]) { #if defined(CRYPTOPP_XLC_VERSION) - /* http://stackoverflow.com/q/46124383/608639 */ vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest); #else # if defined(IS_LITTLE_ENDIAN) - const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p8 zero = {0}; - vec_vsx_st(vec_perm((uint8x16_p8)src, zero, mask), 0, dest); + vec_vsx_st((uint8x16_p8)Reverse64x2(src), 0, dest); # else vec_vsx_st((uint8x16_p8)src, 0, dest); # endif diff --git a/rijndael.cpp b/rijndael.cpp index 43c7fa80..94085ac1 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -251,7 +251,7 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si #endif #if (CRYPTOPP_POWER8_AES_AVAILABLE) -extern void ByteReverseArrayLE(byte src[16]); +extern void ByteReverseArray(byte 
src[16]); extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -329,7 +329,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c // reversed on little-endian systems to ensure it loads properly. byte * ptr = reinterpret_cast(rk); for (unsigned int i=0; i<=m_rounds; i++) - ByteReverseArrayLE(ptr+i*16); + ByteReverseArray(ptr+i*16); #endif // IS_LITTLE_ENDIAN return;