From c5a427d69086f9e2388cf6697c4b20046574306a Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Wed, 20 Sep 2017 08:57:53 -0400
Subject: [PATCH] Add PowerPC VectorLoadKeyUnaligned for AES-192

Make internal functions static. We get better optimizations despite
using unnamed namespaces.

Add PowerPC uint32x4 functions for handling 32-bit rcon and mask
---
 rijndael-simd.cpp | 133 +++++++++++++++++++++++++++++++---------------
 1 file changed, 91 insertions(+), 42 deletions(-)

diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index 75cb779a..b39e6e29 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -11,8 +11,8 @@
 // mbedTLS under a debugger was helped for us to determine problems
 // with our subkey generation and scheduling.
 //
-// AltiVec and Power8 code based on http://github.com/noloader/AES-Power8
-//
+// AltiVec and Power8 code based on http://github.com/noloader/AES-Intrinsics and
+// http://www.ibm.com/developerworks/library/se-power8-in-core-cryptography/
 
 #include "pch.h"
 #include "config.h"
@@ -161,7 +161,7 @@ const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t
 const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t
 #endif
 
-inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(subkeys);
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -189,7 +189,7 @@ inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
     block = veorq_u8(block, vld1q_u8(keys+rounds*16));
 }
 
-inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
     uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5, const word32 *subkeys,
     unsigned int rounds)
 {
@@ -245,7 +245,7 @@ inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
     block5 = veorq_u8(block5, key);
 }
 
-inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(subkeys);
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -273,7 +273,7 @@ inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
     block = veorq_u8(block, vld1q_u8(keys+rounds*16));
 }
 
-inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
     uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5, const word32 *subkeys,
     unsigned int rounds)
 {
@@ -476,7 +476,7 @@ const word32 s_rconLE[] = {
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
 };
 
-inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
+static inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
 {
     block = _mm_xor_si128(block, subkeys[0]);
     for (unsigned int i=1; i<rounds-1; i+=2)

[...]

 template <typename F1, typename F4>
-inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
+static inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
     MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
     const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
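Note: the hunks between AESNI_Enc_Block's loop and Rijndael_AdvancedProcessBlocks_AESNI are lost to truncation (the [...] above). For orientation only, a single-block AES-NI encryption conventionally whitens the state with subkeys[0], applies _mm_aesenc_si128 for the middle rounds, and finishes with _mm_aesenclast_si128. A minimal self-contained sketch of that pattern, not the exact loop in this file:

    #include <wmmintrin.h>  // AES-NI intrinsics

    // Sketch: encrypt one 16-byte block; rounds is 10/12/14 for AES-128/192/256.
    static inline __m128i AesniEncryptSketch(__m128i block, const __m128i* subkeys, unsigned int rounds)
    {
        block = _mm_xor_si128(block, subkeys[0]);            // initial AddRoundKey
        for (unsigned int i = 1; i < rounds; ++i)
            block = _mm_aesenc_si128(block, subkeys[i]);     // full middle rounds
        return _mm_aesenclast_si128(block, subkeys[rounds]); // final round, no MixColumns
    }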
@@ -788,6 +788,13 @@ static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
     return vec_perm(src, zero, mask);
 }
 
+static inline uint32x4_p8 Reverse8x16(const uint32x4_p8& src)
+{
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    return (uint32x4_p8)vec_perm((uint8x16_p8)src, zero, mask);
+}
+
 static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
 {
     const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
@@ -821,6 +828,19 @@ static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
 #endif
 }
 
+static inline uint32x4_p8 Load32x4(const uint32_t src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return vec_xl_be(0, (uint32_t*)src);
+#else
+# if defined(IS_LITTLE_ENDIAN)
+    return Reverse8x16(vec_vsx_ld(0, src));
+# else
+    return vec_vsx_ld(0, src);
+# endif
+#endif
+}
+
 static inline void Store8x16(const uint8x16_p8& src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
@@ -884,7 +904,7 @@ static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16])
 #endif
 
 // Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(const byte src[16])
+static inline VectorType VectorLoad(const byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return Load8x16(src);
@@ -894,7 +914,7 @@ inline VectorType VectorLoad(const byte src[16])
 }
 
 // Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(int off, const byte src[16])
+static inline VectorType VectorLoad(int off, const byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return Load8x16(off, src);
@@ -903,13 +923,28 @@ inline VectorType VectorLoad(int off, const byte src[16])
 #endif
 }
 
+// Loads a mis-aligned byte array, performs an endian conversion.
+static inline VectorType VectorLoad(const word32 src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (VectorType)Load32x4((uint32_t*)src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return (VectorType)Load32x4((uint32_t*)src);
+#endif
+}
+
 // Loads an aligned byte array, does not perform an endian conversion.
 // This function presumes the subkey table is correct endianess.
-inline VectorType VectorLoadKey(const byte src[16])
+static inline VectorType VectorLoadKey(const byte src[16])
 {
-    if (IsAlignedOn(src, 16))
-        return (VectorType)vec_ld(0, (uint8_t*)src);
+    CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
+    return (VectorType)vec_ld(0, (uint8_t*)src);
+}
 
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is correct endianess.
+static inline VectorType VectorLoadKeyUnaligned(const byte src[16])
+{
     const uint8x16_p8 perm = vec_lvsl(0, src);
     const uint8x16_p8 low = vec_ld(0, src);
     const uint8x16_p8 high = vec_ld(15, src);
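Note: VectorLoadKeyUnaligned above is the classic pre-VSX AltiVec unaligned load. vec_ld silently rounds its effective address down to a 16-byte boundary, so two aligned loads bracket the wanted data, and vec_perm, steered by the shift pattern from vec_lvsl, splices out the 16 bytes actually requested. A standalone sketch of the idiom (the helper name here is ours, not the patch's):

    #include <altivec.h>

    // Sketch: load 16 bytes from an address with no alignment guarantee.
    static inline vector unsigned char LoadUnalignedSketch(const unsigned char* src)
    {
        // Permute control derived from the low 4 bits of the address.
        vector unsigned char perm = vec_lvsl(0, src);
        // vec_ld(ofs, p) loads 16 bytes from ((p + ofs) & ~15).
        vector unsigned char low  = vec_ld(0, src);   // aligned block holding src[0]
        vector unsigned char high = vec_ld(15, src);  // aligned block holding src[15]
        return vec_perm(low, high, perm);             // splice the 16 wanted bytes
    }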
@@ -918,11 +953,16 @@ inline VectorType VectorLoadKey(const byte src[16])
 
 // Loads an aligned byte array, does not perform an endian conversion.
 // This function presumes the subkey table is correct endianess.
-inline VectorType VectorLoadKey(const word32 src[4])
+static inline VectorType VectorLoadKey(const word32 src[4])
 {
-    if (IsAlignedOn(src, 16))
-        return (VectorType)vec_ld(0, (uint8_t*)src);
+    CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
+    return (VectorType)vec_ld(0, (uint8_t*)src);
+}
 
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is correct endianess.
+static inline VectorType VectorLoadKeyUnaligned(const word32 src[4])
+{
     const uint8x16_p8 perm = vec_lvsl(0, (uint8_t*)src);
     const uint8x16_p8 low = vec_ld(0, (uint8_t*)src);
     const uint8x16_p8 high = vec_ld(15, (uint8_t*)src);
@@ -931,11 +971,16 @@ inline VectorType VectorLoadKey(const word32 src[4])
 
 // Loads an aligned byte array, does not perform an endian conversion.
 // This function presumes the subkey table is correct endianess.
-inline VectorType VectorLoadKey(int off, const byte src[16])
+static inline VectorType VectorLoadKey(int off, const byte src[16])
 {
-    if (IsAlignedOn(src, 16))
-        return (VectorType)vec_ld(off, (uint8_t*)src);
+    CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
+    return (VectorType)vec_ld(off, (uint8_t*)src);
+}
 
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is correct endianess.
+static inline VectorType VectorLoadKeyUnaligned(int off, const byte src[16])
+{
     const uint8x16_p8 perm = vec_lvsl(off, src);
     const uint8x16_p8 low = vec_ld(off, src);
     const uint8x16_p8 high = vec_ld(off+15, src);
@@ -943,7 +988,7 @@ inline VectorType VectorLoadKey(int off, const byte src[16])
 }
 
 // Stores to a mis-aligned byte array, performs an endian conversion.
-inline void VectorStore(const VectorType& src, byte dest[16])
+static inline void VectorStore(const VectorType& src, byte dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return Store8x16(src, dest);
@@ -953,19 +998,19 @@ inline void VectorStore(const VectorType& src, byte dest[16])
 }
 
 template <class T1, class T2>
-inline T1 VectorXor(const T1& vec1, const T2& vec2)
+static inline T1 VectorXor(const T1& vec1, const T2& vec2)
 {
     return (T1)vec_xor(vec1, (T1)vec2);
 }
 
 template <class T1, class T2>
-inline T1 VectorAdd(const T1& vec1, const T2& vec2)
+static inline T1 VectorAdd(const T1& vec1, const T2& vec2)
 {
     return (T1)vec_add(vec1, (T1)vec2);
 }
 
 template <class T1, class T2>
-inline T1 VectorEncrypt(const T1& state, const T2& key)
+static inline T1 VectorEncrypt(const T1& state, const T2& key)
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return (T1)__vcipher((VectorType)state, (VectorType)key);
@@ -977,7 +1022,7 @@ inline T1 VectorEncrypt(const T1& state, const T2& key)
 }
 
 template <class T1, class T2>
-inline T1 VectorEncryptLast(const T1& state, const T2& key)
+static inline T1 VectorEncryptLast(const T1& state, const T2& key)
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return (T1)__vcipherlast((VectorType)state, (VectorType)key);
@@ -989,7 +1034,7 @@ inline T1 VectorEncryptLast(const T1& state, const T2& key)
 }
 
 template <class T1, class T2>
-inline T1 VectorDecrypt(const T1& state, const T2& key)
+static inline T1 VectorDecrypt(const T1& state, const T2& key)
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return (T1)__vncipher((VectorType)state, (VectorType)key);
@@ -1001,7 +1046,7 @@ inline T1 VectorDecrypt(const T1& state, const T2& key)
 }
 
 template <class T1, class T2>
-inline T1 VectorDecryptLast(const T1& state, const T2& key)
+static inline T1 VectorDecryptLast(const T1& state, const T2& key)
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     return (T1)__vncipherlast((VectorType)state, (VectorType)key);
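Note: the VectorEncrypt/VectorDecrypt templates above are thin casting wrappers over the POWER8 in-core AES instructions: __vcipher/__vcipherlast under XL C/C++, __builtin_crypto_vcipher/__builtin_crypto_vcipherlast under GCC. A whole-block encryption is then just the key-whitened state pushed through rounds-1 vcipher steps and one vcipherlast. A sketch with the GCC built-ins (assumes a POWER8 target and a key schedule already in vector form):

    #include <altivec.h>

    typedef __vector unsigned long long u64x2;

    // Sketch: one-block AES encryption via GCC's POWER8 crypto built-ins.
    static inline u64x2 Power8EncryptSketch(u64x2 state, const u64x2* subkeys, unsigned int rounds)
    {
        state = vec_xor(state, subkeys[0]);                      // initial AddRoundKey
        for (unsigned int i = 1; i < rounds; ++i)
            state = __builtin_crypto_vcipher(state, subkeys[i]); // one full AES round
        return __builtin_crypto_vcipherlast(state, subkeys[rounds]); // last round, no MixColumns
    }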
@@ -1015,18 +1060,22 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
 
 //////////////////////////////////////////////////////////////////
 
 /* Round constants */
-CRYPTOPP_ALIGN_DATA(16)
-const uint32_t s_rcon[3][4] = {
-    0x01<<24,0x01<<24,0x01<<24,0x01<<24, /* 1 */
-    0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24, /* 9 */
-    0x36<<24,0x36<<24,0x36<<24,0x36<<24  /* 10 */
+static const uint32_t s_rcon[3][4] = {
+    {0x01<<24,0x01<<24,0x01<<24,0x01<<24}, /* 1 */
+    {0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24}, /* 9 */
+    {0x36<<24,0x36<<24,0x36<<24,0x36<<24}  /* 10 */
+};
+
+static const uint32_t s_mask[4] = {
+    0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c
 };
 
 static inline uint8x16_p8
-Rijndael_Subkey_POWER8(uint8x16_p8 r1, uint8x16_p8 r4)
+Rijndael_Subkey_POWER8(uint8x16_p8 r1, const uint8x16_p8 r4)
 {
-    const uint8x16_p8 r5 = (uint8x16_p8)((uint32x4_p8){0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c});
+    const uint8x16_p8 r5 = (uint8x16_p8)VectorLoad(s_mask);
     const uint8x16_p8 r0 = {0};
+
     uint8x16_p8 r3, r6;
 
     r3 = vec_perm(r1, r1, r5);     /* line 1 */
@@ -1051,12 +1100,12 @@ void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc
     const byte* Se, unsigned int rounds)
 {
 #if defined(IS_BIG_ENDIAN)
-    // Testing shows this is about 150 to 300 cycles faster.
+    // Testing shows this is about 125 to 275 cycles faster.
     if (keyLen == 16)
     {
         uint8_t* skptr = (uint8_t*)rk;
         uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((uint8_t*)skptr);
-        uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]);
+        uint8x16_p8 r4 = (uint8x16_p8)VectorLoad(s_rcon[0]);
         for (unsigned int i=0; i<8; i++)

[...]

-inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1157,7 +1206,7 @@ inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned
     block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
 }
 
-inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
     VectorType &block2, VectorType &block3, VectorType &block4,
     VectorType &block5, const word32 *subkeys, unsigned int rounds)
 {
@@ -1192,7 +1241,7 @@ inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
     block5 = VectorEncryptLast(block5, k);
 }
 
-inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
 {
     CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1210,7 +1259,7 @@ inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned
     block = VectorDecryptLast(block, VectorLoadKey(0, keys));
 }
 
-inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
     VectorType &block2, VectorType &block3, VectorType &block4,
     VectorType &block5, const word32 *subkeys, unsigned int rounds)
 {
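Note on why VectorLoadKeyUnaligned matters for AES-192, since the scheduling hunk itself is truncated above: AES-128 and AES-256 expand their keys in 4- and 8-word steps, so every 16-byte round key row stays 16-byte aligned, but AES-192 advances 6 words (24 bytes) per step, leaving every other row straddling a 16-byte boundary where a plain vec_ld would silently fetch the wrong bytes. The expected call pattern is along these lines (a sketch against the patch's API; the loop bound is illustrative, not taken from the elided hunk):

    // Sketch: AES-192 writes its schedule in 24-byte strides, so reads at
    // rk+6, rk+18, ... are not 16-byte aligned and need the unaligned path.
    static void TouchAes192RowsSketch(const word32* rk)
    {
        for (unsigned int i = 0; i < 8; ++i)
        {
            // Byte offset i*24 is 16-byte aligned only for even i;
            // the lvsl/ld/perm path handles both cases uniformly.
            VectorType row = VectorLoadKeyUnaligned(rk + i*6);
            (void)row;  /* ...feed into the next scheduling step... */
        }
    }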