diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 018d4f5e..e7f0acbe 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -767,6 +767,7 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds) #if (CRYPTOPP_POWER8_AES_AVAILABLE) typedef __vector unsigned char uint8x16_p8; +typedef __vector unsigned int uint32x4_p8; typedef __vector unsigned long long uint64x2_p8; void ReverseByteArrayLE(byte src[16]) @@ -906,16 +907,39 @@ inline VectorType VectorLoad(int off, const byte src[16]) // This function presumes the subkey table is correct endianess. inline VectorType VectorLoadKey(const byte src[16]) { - CRYPTOPP_ASSERT(IsAlignedOn(src, 16)); - return (VectorType)vec_ld(0, src); + if (IsAlignedOn(src, 16)) + return (VectorType)vec_ld(0, (uint8_t*)src); + + const uint8x16_p8 perm = vec_lvsl(0, src); + const uint8x16_p8 low = vec_ld(0, src); + const uint8x16_p8 high = vec_ld(15, src); + return (VectorType)vec_perm(low, high, perm); +} + +// Loads an aligned byte array, does not perform an endian conversion. +// This function presumes the subkey table is correct endianess. +inline VectorType VectorLoadKey(const word32 src[4]) +{ + if (IsAlignedOn(src, 16)) + return (VectorType)vec_ld(0, (uint8_t*)src); + + const uint8x16_p8 perm = vec_lvsl(0, (uint8_t*)src); + const uint8x16_p8 low = vec_ld(0, (uint8_t*)src); + const uint8x16_p8 high = vec_ld(15, (uint8_t*)src); + return (VectorType)vec_perm(low, high, perm); } // Loads an aligned byte array, does not perform an endian conversion. // This function presumes the subkey table is correct endianess. inline VectorType VectorLoadKey(int off, const byte src[16]) { - CRYPTOPP_ASSERT(IsAlignedOn(src, 16)); - return (VectorType)vec_ld(off, src); + if (IsAlignedOn(src, 16)) + return (VectorType)vec_ld(off, (uint8_t*)src); + + const uint8x16_p8 perm = vec_lvsl(off, src); + const uint8x16_p8 low = vec_ld(off, src); + const uint8x16_p8 high = vec_ld(off+15, src); + return (VectorType)vec_perm(low, high, perm); } // Stores to a mis-aligned byte array, performs an endian conversion. @@ -944,9 +968,9 @@ template inline T1 VectorEncrypt(const T1& state, const T2& key) { #if defined(CRYPTOPP_XLC_VERSION) - return (T1)__vcipher(state, (T1)key); + return (T1)__vcipher((VectorType)state, (VectorType)key); #elif defined(CRYPTOPP_GCC_VERSION) - return (T1)__builtin_crypto_vcipher(state, (T1)key); + return (T1)__builtin_crypto_vcipher((VectorType)state, (VectorType)key); #else CRYPTOPP_ASSERT(0); #endif @@ -956,9 +980,9 @@ template inline T1 VectorEncryptLast(const T1& state, const T2& key) { #if defined(CRYPTOPP_XLC_VERSION) - return (T1)__vcipherlast(state, (T1)key); + return (T1)__vcipherlast((VectorType)state, (VectorType)key); #elif defined(CRYPTOPP_GCC_VERSION) - return (T1)__builtin_crypto_vcipherlast(state, (T1)key); + return (T1)__builtin_crypto_vcipherlast((VectorType)state, (VectorType)key); #else CRYPTOPP_ASSERT(0); #endif @@ -968,9 +992,9 @@ template inline T1 VectorDecrypt(const T1& state, const T2& key) { #if defined(CRYPTOPP_XLC_VERSION) - return (T1)__vncipher(state, (T1)key); + return (T1)__vncipher((VectorType)state, (VectorType)key); #elif defined(CRYPTOPP_GCC_VERSION) - return (T1)__builtin_crypto_vncipher(state, (T1)key); + return (T1)__builtin_crypto_vncipher((VectorType)state, (VectorType)key); #else CRYPTOPP_ASSERT(0); #endif @@ -980,9 +1004,9 @@ template inline T1 VectorDecryptLast(const T1& state, const T2& key) { #if defined(CRYPTOPP_XLC_VERSION) - return (T1)__vncipherlast(state, (T1)key); + return (T1)__vncipherlast((VectorType)state, (VectorType)key); #elif defined(CRYPTOPP_GCC_VERSION) - return (T1)__builtin_crypto_vncipherlast(state, (T1)key); + return (T1)__builtin_crypto_vncipherlast((VectorType)state, (VectorType)key); #else CRYPTOPP_ASSERT(0); #endif @@ -990,56 +1014,124 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key) ////////////////////////////////////////////////////////////////// +/* Round constants */ +CRYPTOPP_ALIGN_DATA(16) +const uint32_t s_rcon[3][4] = { + 0x01<<24,0x01<<24,0x01<<24,0x01<<24, /* 1 */ + 0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24, /* 9 */ + 0x36<<24,0x36<<24,0x36<<24,0x36<<24 /* 10 */ +}; + +static inline uint8x16_p8 +Rijndael_Subkey_POWER8(uint8x16_p8 r1, uint8x16_p8 r4, uint8_t subkey[16]) +{ + const uint8x16_p8 r5 = (uint8x16_p8)((uint32x4_p8){0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c}); + const uint8x16_p8 r0 = {0}; + uint8x16_p8 r3, r6; + + r3 = vec_perm(r1, r1, r5); /* line 1 */ + r6 = vec_sld(r0, r1, 12); /* line 2 */ + r3 = VectorEncryptLast(r3, r4); /* line 3 */ + + r1 = vec_xor(r1, r6); /* line 4 */ + r6 = vec_sld(r0, r6, 12); /* line 5 */ + r1 = vec_xor(r1, r6); /* line 6 */ + r6 = vec_sld(r0, r6, 12); /* line 7 */ + r1 = vec_xor(r1, r6); /* line 8 */ + + // Caller handles r4 addition + // r4 = vec_add(r4, r4); /* line 9 */ + + r1 = vec_xor(r1, r3); /* line 10 */ + + const VectorType t = (VectorType)r1; + VectorStore(t, subkey); + + // r1 is ready for next round + return r1; +} + void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc, const byte* Se, unsigned int rounds) { - word32 *rk_saved = rk, temp; - - // keySize: m_key allocates 4*(rounds+1) word32's. - const size_t keySize = 4*(rounds+1); - const word32* end = rk + keySize; - - while (true) +#if defined(IS_BIG_ENDIAN) + // Testing shows this is about 150 to 300 cycles faster. + if (keyLen == 16) { - temp = rk[keyLen/4-1]; - word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ - (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; - rk[keyLen/4] = rk[0] ^ x ^ *(rc++); - rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; - rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; - rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; + uint8_t* skptr = (uint8_t*)rk; + uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((uint8_t*)skptr); + uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]); - if (rk + keyLen/4 + 4 == end) - break; + for (unsigned int i=0; i(rk); - for (unsigned int i=0; i<=rounds; i++) - ReverseByteArrayLE(ptr+i*16); + // VSX registers are big-endian. The entire subkey table must be byte + // reversed on little-endian systems to ensure it loads properly. + byte * ptr = reinterpret_cast(rk); + for (unsigned int i=0; i<=rounds; i++) + ReverseByteArrayLE(ptr+i*16); #endif // IS_LITTLE_ENDIAN + } } inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)