Add PowerPC VectorLoadKeyUnaligned for AES-192

Make internal functions static. We get better optimizations despite using unnamed namespaces
Add PowerPC uint32x4 functions for handling 32-bit rcon and mask
pull/484/merge
Jeffrey Walton 2017-09-20 08:57:53 -04:00
parent c94d076aa1
commit c5a427d690
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
1 changed file with 91 additions and 42 deletions

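Among other changes, this commit adds VectorLoadKeyUnaligned, which reads 16 bytes from an address that may not be 16-byte aligned using the classic AltiVec two-load-and-permute idiom. A minimal standalone sketch of that idiom (illustrative only; the helper name LoadUnaligned16 is hypothetical, and it assumes <altivec.h> with an AltiVec-enabled compiler):

#include <altivec.h>
#include <stdint.h>

// Sketch: load 16 bytes from a possibly unaligned address. vec_lvsl
// derives a permute mask from the low-order bits of the address, the
// two vec_ld calls fetch the aligned 16-byte blocks straddling src,
// and vec_perm shifts the desired bytes into a single register.
static inline __vector unsigned char LoadUnaligned16(const uint8_t* src)
{
    const __vector unsigned char perm = vec_lvsl(0, src);
    const __vector unsigned char low  = vec_ld(0, src);
    const __vector unsigned char high = vec_ld(15, src);
    return vec_perm(low, high, perm);
}

The same perm/low/high sequence appears in the VectorLoadKeyUnaligned bodies in the diff below.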

@@ -11,8 +11,8 @@
// mbedTLS under a debugger helped us to determine problems
// with our subkey generation and scheduling.
//
-// AltiVec and Power8 code based on http://github.com/noloader/AES-Power8
-//
+// AltiVec and Power8 code based on http://github.com/noloader/AES-Intrinsics and
+// http://www.ibm.com/developerworks/library/se-power8-in-core-cryptography/
#include "pch.h"
#include "config.h"
@@ -161,7 +161,7 @@ const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t
const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t
#endif
-inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -189,7 +189,7 @@ inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
block = veorq_u8(block, vld1q_u8(keys+rounds*16));
}
-inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5,
const word32 *subkeys, unsigned int rounds)
{
@@ -245,7 +245,7 @@ inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
block5 = veorq_u8(block5, key);
}
-inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -273,7 +273,7 @@ inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
block = veorq_u8(block, vld1q_u8(keys+rounds*16));
}
-inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5,
const word32 *subkeys, unsigned int rounds)
{
@@ -476,7 +476,7 @@ const word32 s_rconLE[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
};
-inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
+static inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
@@ -488,7 +488,7 @@ inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsign
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
-inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
+static inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
@@ -511,7 +511,7 @@ inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
block3 = _mm_aesenclast_si128(block3, rk);
}
-inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
+static inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
@@ -523,7 +523,7 @@ inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsign
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
-inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
+static inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
@@ -547,7 +547,7 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
}
template <typename F1, typename F4>
-inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
+static inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
@@ -788,6 +788,13 @@ static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
return vec_perm(src, zero, mask);
}
+static inline uint32x4_p8 Reverse8x16(const uint32x4_p8& src)
+{
+const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+const uint8x16_p8 zero = {0};
+return (uint32x4_p8)vec_perm((uint8x16_p8)src, zero, mask);
+}
static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
{
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
@@ -821,6 +828,19 @@ static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
#endif
}
+static inline uint32x4_p8 Load32x4(const uint32_t src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+return vec_xl_be(0, (uint32_t*)src);
+#else
+# if defined(IS_LITTLE_ENDIAN)
+return Reverse8x16(vec_vsx_ld(0, src));
+# else
+return vec_vsx_ld(0, src);
+# endif
+#endif
+}
static inline void Store8x16(const uint8x16_p8& src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -884,7 +904,7 @@ static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16])
#endif
// Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(const byte src[16])
+static inline VectorType VectorLoad(const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(src);
@@ -894,7 +914,7 @@ inline VectorType VectorLoad(const byte src[16])
}
// Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(int off, const byte src[16])
+static inline VectorType VectorLoad(int off, const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(off, src);
@@ -903,13 +923,28 @@ inline VectorType VectorLoad(int off, const byte src[16])
#endif
}
+// Loads a mis-aligned byte array, performs an endian conversion.
+static inline VectorType VectorLoad(const word32 src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+return (VectorType)Load32x4((uint32_t*)src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+return (VectorType)Load32x4((uint32_t*)src);
+#endif
+}
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(const byte src[16])
+static inline VectorType VectorLoadKey(const byte src[16])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(0, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(const byte src[16])
+{
+const uint8x16_p8 perm = vec_lvsl(0, src);
+const uint8x16_p8 low = vec_ld(0, src);
+const uint8x16_p8 high = vec_ld(15, src);
@@ -918,11 +953,16 @@ inline VectorType VectorLoadKey(const byte src[16])
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(const word32 src[4])
+static inline VectorType VectorLoadKey(const word32 src[4])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(0, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(const word32 src[4])
+{
+const uint8x16_p8 perm = vec_lvsl(0, (uint8_t*)src);
+const uint8x16_p8 low = vec_ld(0, (uint8_t*)src);
+const uint8x16_p8 high = vec_ld(15, (uint8_t*)src);
@@ -931,11 +971,16 @@ inline VectorType VectorLoadKey(const word32 src[4])
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(int off, const byte src[16])
+static inline VectorType VectorLoadKey(int off, const byte src[16])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(off, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(int off, const byte src[16])
+{
+const uint8x16_p8 perm = vec_lvsl(off, src);
+const uint8x16_p8 low = vec_ld(off, src);
+const uint8x16_p8 high = vec_ld(off+15, src);
@@ -943,7 +988,7 @@ inline VectorType VectorLoadKey(int off, const byte src[16])
}
// Stores to a mis-aligned byte array, performs an endian conversion.
-inline void VectorStore(const VectorType& src, byte dest[16])
+static inline void VectorStore(const VectorType& src, byte dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Store8x16(src, dest);
@@ -953,19 +998,19 @@ inline void VectorStore(const VectorType& src, byte dest[16])
}
template <class T1, class T2>
-inline T1 VectorXor(const T1& vec1, const T2& vec2)
+static inline T1 VectorXor(const T1& vec1, const T2& vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
template <class T1, class T2>
-inline T1 VectorAdd(const T1& vec1, const T2& vec2)
+static inline T1 VectorAdd(const T1& vec1, const T2& vec2)
{
return (T1)vec_add(vec1, (T1)vec2);
}
template <class T1, class T2>
-inline T1 VectorEncrypt(const T1& state, const T2& key)
+static inline T1 VectorEncrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipher((VectorType)state, (VectorType)key);
@@ -977,7 +1022,7 @@ inline T1 VectorEncrypt(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorEncryptLast(const T1& state, const T2& key)
+static inline T1 VectorEncryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipherlast((VectorType)state, (VectorType)key);
@@ -989,7 +1034,7 @@ inline T1 VectorEncryptLast(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorDecrypt(const T1& state, const T2& key)
+static inline T1 VectorDecrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipher((VectorType)state, (VectorType)key);
@@ -1001,7 +1046,7 @@ inline T1 VectorDecrypt(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorDecryptLast(const T1& state, const T2& key)
+static inline T1 VectorDecryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipherlast((VectorType)state, (VectorType)key);
@@ -1015,18 +1060,22 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
//////////////////////////////////////////////////////////////////
/* Round constants */
CRYPTOPP_ALIGN_DATA(16)
-const uint32_t s_rcon[3][4] = {
-0x01<<24,0x01<<24,0x01<<24,0x01<<24, /* 1 */
-0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24, /* 9 */
-0x36<<24,0x36<<24,0x36<<24,0x36<<24 /* 10 */
+static const uint32_t s_rcon[3][4] = {
+{0x01<<24,0x01<<24,0x01<<24,0x01<<24}, /* 1 */
+{0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24}, /* 9 */
+{0x36<<24,0x36<<24,0x36<<24,0x36<<24} /* 10 */
};
+static const uint32_t s_mask[4] = {
+0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c
+};
static inline uint8x16_p8
-Rijndael_Subkey_POWER8(uint8x16_p8 r1, uint8x16_p8 r4)
+Rijndael_Subkey_POWER8(uint8x16_p8 r1, const uint8x16_p8 r4)
{
-const uint8x16_p8 r5 = (uint8x16_p8)((uint32x4_p8){0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c});
+const uint8x16_p8 r5 = (uint8x16_p8)VectorLoad(s_mask);
const uint8x16_p8 r0 = {0};
uint8x16_p8 r3, r6;
r3 = vec_perm(r1, r1, r5); /* line 1 */
@@ -1051,12 +1100,12 @@ void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc
const byte* Se, unsigned int rounds)
{
#if defined(IS_BIG_ENDIAN)
-// Testing shows this is about 150 to 300 cycles faster.
+// Testing shows this is about 125 to 275 cycles faster.
if (keyLen == 16)
{
uint8_t* skptr = (uint8_t*)rk;
uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((uint8_t*)skptr);
-uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]);
+uint8x16_p8 r4 = (uint8x16_p8)VectorLoad(s_rcon[0]);
for (unsigned int i=0; i<rounds-2; ++i)
{
@@ -1139,7 +1188,7 @@ void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc
}
}
-inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1157,7 +1206,7 @@ inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned
block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
}
-inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
{
@@ -1192,7 +1241,7 @@ inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
block5 = VectorEncryptLast(block5, k);
}
-inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1210,7 +1259,7 @@ inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned
block = VectorDecryptLast(block, VectorLoadKey(0, keys));
}
-inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
{