Add Power8 AES encryption

This adds the forward direction (encryption) only. Crypto++ uses the "Equivalent Inverse Cipher" (FIPS-197, Section 5.3.5, p. 23), and that key schedule is not compatible with IBM hardware. The library will need to rework its decryption key scheduling routines. (We may be able to work around it another way, but I have not investigated it.)

parent 9c9d5ebe87
commit 7fb34e9b08
ref pull/507/head
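For background, FIPS-197's Equivalent Inverse Cipher pre-applies InvMixColumns to round keys 1 through Nr-1 of the encryption schedule so that decryption can reuse the encryption round structure. The following minimal sketch is illustrative only (not Crypto++ code; gf_mul, inv_mix_column and make_equivalent_inverse_schedule are invented names) and shows the transform, and therefore why a subkey table stored in this form cannot be handed as-is to hardware that expects the plain schedule:

    // Illustrative sketch only (not Crypto++ code). FIPS-197, Section 5.3.5:
    // the Equivalent Inverse Cipher applies InvMixColumns to round keys
    // 1..Nr-1 of the encryption schedule.
    #include <cstdint>

    // GF(2^8) multiplication modulo the AES polynomial x^8 + x^4 + x^3 + x + 1.
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        for (int i = 0; i < 8; ++i) {
            if (b & 1) p ^= a;
            const uint8_t hi = a & 0x80;
            a = static_cast<uint8_t>(a << 1);
            if (hi) a ^= 0x1B;
            b >>= 1;
        }
        return p;
    }

    // InvMixColumns applied to one 4-byte column of a round key.
    static void inv_mix_column(uint8_t c[4])
    {
        const uint8_t r0 = gf_mul(c[0],0x0E) ^ gf_mul(c[1],0x0B) ^ gf_mul(c[2],0x0D) ^ gf_mul(c[3],0x09);
        const uint8_t r1 = gf_mul(c[0],0x09) ^ gf_mul(c[1],0x0E) ^ gf_mul(c[2],0x0B) ^ gf_mul(c[3],0x0D);
        const uint8_t r2 = gf_mul(c[0],0x0D) ^ gf_mul(c[1],0x09) ^ gf_mul(c[2],0x0E) ^ gf_mul(c[3],0x0B);
        const uint8_t r3 = gf_mul(c[0],0x0B) ^ gf_mul(c[1],0x0D) ^ gf_mul(c[2],0x09) ^ gf_mul(c[3],0x0E);
        c[0] = r0; c[1] = r1; c[2] = r2; c[3] = r3;
    }

    // Turn a plain AES key schedule into the Equivalent Inverse Cipher schedule
    // by transforming the inner round keys (rounds 1..rounds-1) in place.
    // A library that stores only this transformed table would have to undo the
    // transform, or rebuild the schedule, before feeding hardware decryption rounds.
    void make_equivalent_inverse_schedule(uint8_t round_keys[][16], unsigned int rounds)
    {
        for (unsigned int r = 1; r < rounds; ++r)
            for (unsigned int col = 0; col < 4; ++col)
                inv_mix_column(&round_keys[r][4 * col]);
    }

Undoing the transform would mean applying MixColumns (its inverse) to those round keys, or simply re-running the plain key schedule.
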
ppc-simd.cpp (10 changed lines)
@@ -24,7 +24,7 @@
 #endif

 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
-# include "altivec.h"
+# include <altivec.h>
 # undef vector
 # undef pixel
 # undef bool
@@ -33,11 +33,11 @@
 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
 # if defined(CRYPTOPP_XLC_VERSION)
 // #include <builtins.h>
-typedef vector unsigned char uint8x16_p8;
-typedef vector unsigned long long uint64x2_p8;
+typedef __vector unsigned char uint8x16_p8;
+typedef __vector unsigned long long uint64x2_p8;
 #elif defined(CRYPTOPP_GCC_VERSION)
-typedef vector unsigned char uint8x16_p8;
-typedef vector unsigned long long uint64x2_p8;
+typedef __vector unsigned char uint8x16_p8;
+typedef __vector unsigned long long uint64x2_p8;
 #endif
 #endif

rijndael-simd.cpp
@@ -10,34 +10,51 @@
 // Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping
 // mbedTLS under a debugger helped us to determine problems
 // with our subkey generation and scheduling.
+//
+// AltiVec and Power8 code based on http://github.com/noloader/AES-Power8
+//

 #include "pch.h"
 #include "config.h"
 #include "misc.h"

-// Clang and GCC hoops...
+// We set CRYPTOPP_ARM_AES_AVAILABLE based on compiler version.
+// If the crypto is not available, then we have to disable it here.
 #if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
 # undef CRYPTOPP_ARM_AES_AVAILABLE
 #endif

+// We set CRYPTOPP_POWER8_CRYPTO_AVAILABLE based on compiler version.
+// If the crypto is not available, then we have to disable it here.
+#if !(defined(__CRYPTO) || defined(_ARCH_PWR8) || defined(_ARCH_PWR9))
+# undef CRYPTOPP_POWER8_CRYPTO_AVAILABLE
+#endif
+
 #if (CRYPTOPP_AESNI_AVAILABLE)
 // Hack... We are supposed to use <nmmintrin.h>. GCC 4.8, LLVM Clang 3.5
 // and Apple Clang 6.0 conflate SSE4.1 and SSE4.2. If we use <nmmintrin.h>
 // then compile fails with "SSE4.2 instruction set not enabled". Also see
-// https://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html.
-# include "smmintrin.h"
-# include "wmmintrin.h"
+// http://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html.
+# include <smmintrin.h>
+# include <wmmintrin.h>
 #endif

 #if (CRYPTOPP_ARM_AES_AVAILABLE)
-# include "arm_neon.h"
+# include <arm_neon.h>
 #endif

 // Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
 // fail to compile with <arm_acle.h> included. Later Apple compilers compile
 // intrinsics without <arm_acle.h> included.
 #if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
-# include "arm_acle.h"
+# include <arm_acle.h>
 #endif

+#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
+# include <altivec.h>
+# undef vector
+# undef pixel
+# undef bool
+#endif
+
 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
@@ -141,6 +158,8 @@ bool CPU_ProbeAES()
 }
 #endif // ARM32 or ARM64

+// ***************************** ARMv8 ***************************** //
+
 #if (CRYPTOPP_ARM_AES_AVAILABLE)
 inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
 {
@@ -306,6 +325,13 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_

 const word32 s_one[] = {0, 0, 0, 1<<24};

+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+const word32 rcon[] = {
+    0x01, 0x02, 0x04, 0x08,
+    0x10, 0x20, 0x40, 0x80,
+    0x1B, 0x36
+};
+
 template <typename F1, typename F4>
 size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -438,6 +464,8 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t ro

 #endif // CRYPTOPP_ARM_AES_AVAILABLE

+// ***************************** AES-NI ***************************** //
+
 #if (CRYPTOPP_AESNI_AVAILABLE)
 inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
 {
@@ -734,4 +762,271 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds)
 }
 #endif // CRYPTOPP_AESNI_AVAILABLE

+// ***************************** Power 8 ***************************** //
+
+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
+
+#if defined(CRYPTOPP_XLC_VERSION)
+// #include <builtins.h>
+typedef __vector unsigned char uint8x16_p8;
+typedef __vector unsigned long long uint64x2_p8;
+#elif defined(CRYPTOPP_GCC_VERSION)
+typedef __vector unsigned char uint8x16_p8;
+typedef __vector unsigned long long uint64x2_p8;
+#endif
+
+/* Reverses a 16-byte array as needed */
+void ByteReverseArrayLE(byte dest[16], const byte src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
+    vec_st(vec_reve(vec_ld(0, src)), 0, dest);
+#elif defined(IS_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, dest);
+#else
+    if (src != dest)
+        std::memcpy(dest, src, 16);
+#endif
+}
+
+void ByteReverseArrayLE(byte src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
+    vec_st(vec_reve(vec_ld(0, src)), 0, src);
+#elif defined(IS_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src);
+#endif
+}
+
+uint8x16_p8 Load8x16(const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* http://stackoverflow.com/q/46124383/608639 */
+    uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+    return vec_xl_be(0, s);
+# else
+    return vec_xl(0, s);
+# endif
+#else
+    /* GCC, Clang, etc */
+    return (uint8x16_p8)vec_vsx_ld(0, src);
+#endif
+}
+
+void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* IBM XL C/C++ compiler */
+# if defined(IS_LITTLE_ENDIAN)
+    vec_xst_be(src, 0, dest);
+# else
+    vec_xst(src, 0, dest);
+# endif
+#else
+    /* GCC, Clang, etc */
+    vec_vsx_st(src, 0, dest);
+#endif
+}
+
+uint64x2_p8 Load64x2(const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* http://stackoverflow.com/q/46124383/608639 */
+    uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+    return (uint64x2_p8)vec_xl_be(0, s);
+# else
+    return (uint64x2_p8)vec_xl(0, s);
+# endif
+#else
+    /* GCC, Clang, etc */
+# if defined(IS_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    return (uint64x2_p8)vec_perm(vec_vsx_ld(0, src), zero, mask);
+# else
+    return (uint64x2_p8)vec_vsx_ld(0, src);
+# endif
+#endif
+}
+
+void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    /* IBM XL C/C++ compiler */
+# if defined(IS_LITTLE_ENDIAN)
+    vec_xst_be((uint8x16_p8)src, 0, dest);
+# else
+    vec_xst((uint8x16_p8)src, 0, dest);
+# endif
+#else
+    /* GCC, Clang, etc */
+# if defined(IS_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    vec_vsx_st(vec_perm((uint8x16_p8)src, zero, mask), 0, dest);
+# else
+    vec_vsx_st((uint8x16_p8)src, 0, dest);
+# endif
+#endif
+}
+
+//////////////////////////////////////////////////////////////////
+
+#if defined(CRYPTOPP_XLC_VERSION)
+typedef uint8x16_p8 VectorType;
+#elif defined(CRYPTOPP_GCC_VERSION)
+typedef uint64x2_p8 VectorType;
+#else
+CRYPTOPP_ASSERT(0);
+#endif
+
+inline VectorType VectorLoad(const byte src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return Load8x16(src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return Load64x2(src);
+#endif
+}
+
+inline VectorType VectorLoadAligned(const byte vec[16])
+{
+    return (VectorType)vec_ld(0, vec);
+}
+
+inline VectorType VectorLoadAligned(int off, const byte vec[16])
+{
+    return (VectorType)vec_ld(off, vec);
+}
+
+inline void VectorStore(const VectorType& src, byte dest[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return Store8x16(src, dest);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return Store64x2(src, dest);
+#endif
+}
+
+template <class T1, class T2>
+inline T1 VectorXor(const T1& vec1, const T2& vec2)
+{
+    return (T1)vec_xor(vec1, (T1)vec2);
+}
+
+template <class T1, class T2>
+inline T1 VectorAdd(const T1& vec1, const T2& vec2)
+{
+    return (T1)vec_add(vec1, (T1)vec2);
+}
+
+template <class T1, class T2>
+inline T1 VectorEncrypt(const T1& state, const T2& key)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (T1)__vcipher(state, key);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return __builtin_crypto_vcipher(state, (T1)key);
+#else
+    CRYPTOPP_ASSERT(0);
+#endif
+}
+
+template <class T1, class T2>
+inline T1 VectorEncryptLast(const T1& state, const T2& key)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (T1)__vcipherlast(state, key);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return __builtin_crypto_vcipherlast(state, (T1)key);
+#else
+    CRYPTOPP_ASSERT(0);
+#endif
+}
+
+template <class T1, class T2>
+inline T1 VectorDecrypt(const T1& state, const T2& key)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (T1)__vncipher(state, key);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return __builtin_crypto_vncipher(state, (T1)key);
+#else
+    CRYPTOPP_ASSERT(0);
+#endif
+}
+
+template <class T1, class T2>
+inline T1 VectorDecryptLast(const T1& state, const T2& key)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (T1)__vncipherlast(state, key);
+#elif defined(CRYPTOPP_GCC_VERSION)
+    return __builtin_crypto_vncipherlast(state, (T1)key);
+#else
+    CRYPTOPP_ASSERT(0);
+#endif
+}
+
+//////////////////////////////////////////////////////////////////
+
+void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+{
+    CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
+    const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+    VectorType s = VectorLoad(inBlock);
+    VectorType k = VectorLoadAligned(keys);
+
+    s = VectorXor(s, k);
+    for (size_t i=1; i<rounds-1; i+=2)
+    {
+        s = VectorEncrypt(s, VectorLoadAligned(  i*16,   keys));
+        s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
+    }
+
+    s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
+    s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
+
+    // According to benchmarks this is a tad bit slower
+    // if (xorBlock)
+    //     s = VectorXor(s, VectorLoad(xorBlock));
+
+    VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
+    s = VectorXor(s, x);
+
+    VectorStore(s, outBlock);
+}
+
+void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlock, const byte *xorBlock, byte *outBlock)
+{
+    CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
+    const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+    VectorType s = VectorLoad(inBlock);
+    VectorType k = VectorLoadAligned(keys);
+
+    s = VectorXor(s, k);
+    for (size_t i=1; i<rounds-1; i+=2)
+    {
+        s = VectorDecrypt(s, VectorLoadAligned(  i*16,   keys));
+        s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
+    }
+
+    s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
+    s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
+
+    VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
+    s = VectorXor(s, x);
+
+    VectorStore(s, outBlock);
+}
+#endif // CRYPTOPP_POWER8_AES_AVAILABLE
 NAMESPACE_END

rijndael.cpp (52 changed lines)
@@ -237,6 +237,17 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif

+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
+extern void ByteReverseArrayLE(byte src[16]);
+
+extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
+
+extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlock, const byte *xorBlock, byte *outBlock);
+extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
+        const byte *inBlock, const byte *xorBlock, byte *outBlock);
+#endif
+
 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
 {
     AssertValidKeyLength(keyLen);
@@ -267,7 +278,8 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     while (true)
     {
         temp = rk[keyLen/4-1];
-        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
+        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
+                   (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
         rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
         rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
         rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@@ -307,10 +319,11 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     if (!s_TdFilled)
         FillDecTable();

+#define InverseMixColumn(x) \
+    TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
+    TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
+
     unsigned int i, j;
-
-#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
-
     for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
     {
         temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
@@ -338,6 +351,21 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     if (HasAES())
         ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
 #endif
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+    if (IsForwardTransformation() && HasAES())
+    {
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
+
+        // VSX registers are big-endian. The entire subkey table must be byte
+        // reversed on little-endian systems to ensure it loads properly.
+        // I believe we should do this when msr.le=1, but I can't find an
+        // intrinsic to access the machine status register. In the meantime
+        // we will do it anytime IS_LITTLE_ENDIAN is true.
+        byte * ptr = reinterpret_cast<byte*>(rk);
+        for (unsigned int i=0; i<=m_rounds; i++)
+            ByteReverseArrayLE(ptr+i*16);
+    }
+#endif
 }

 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -362,6 +390,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     }
 #endif

+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
+    if (HasAES())
+    {
+        (void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+        return;
+    }
+#endif
+
     typedef BlockGetAndPut<word32, NativeByteOrder> Block;

     word32 s0, s1, s2, s3, t0, t1, t2, t3;
@@ -448,6 +484,14 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     }
 #endif

+#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
+    if (HasAES())
+    {
+        (void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
+        return;
+    }
+#endif
+
     typedef BlockGetAndPut<word32, NativeByteOrder> Block;

     word32 s0, s1, s2, s3, t0, t1, t2, t3;
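
Usage note: the new path is selected at run time through HasAES(), so callers do not change. A minimal sketch using the standard Crypto++ API (key and data values are placeholders only) that would exercise the POWER8 encryption path on hardware where HasAES() reports true:

    #include "aes.h"
    #include "modes.h"

    int main()
    {
        using namespace CryptoPP;

        byte key[AES::DEFAULT_KEYLENGTH] = {0};   // placeholder key
        byte in[AES::BLOCKSIZE] = {0};            // placeholder plaintext block
        byte out[AES::BLOCKSIZE];

        // One-block ECB encryption; on POWER8 the HasAES()-guarded
        // Rijndael_Enc_ProcessAndXorBlock_POWER8 path above handles the block.
        ECB_Mode<AES>::Encryption enc(key, sizeof(key));
        enc.ProcessData(out, in, sizeof(in));

        return 0;
    }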