Add Power8 AES encryption

This is the forward (encryption) direction only. Crypto++ uses the "Equivalent Inverse Cipher" (FIPS-197, Section 5.3.5, p. 23), and it is not compatible with IBM hardware. The library will need to rework the decryption key scheduling routines. (We may be able to work around it another way, but I have not investigated it.)
pull/507/head
Jeffrey Walton 2017-09-11 22:52:22 -04:00
parent 9c9d5ebe87
commit 7fb34e9b08
GPG Key ID: B36AB348921B1838
3 changed files with 358 additions and 19 deletions
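For context, FIPS-197's "Equivalent Inverse Cipher" folds InvMixColumns into the middle decryption round keys, while IBM's vncipher instruction expects the plain forward schedule, which is why the two key schedules disagree. A standalone sketch of the transform in question (our illustration, not Crypto++ code):

#include <cstdint>

// Multiply in GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11B).
static uint8_t gmul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;
    for (int i = 0; i < 8; ++i) {
        if (b & 1) p ^= a;
        const uint8_t hi = a & 0x80;
        a = (uint8_t)(a << 1);
        if (hi) a ^= 0x1B;
        b >>= 1;
    }
    return p;
}

// InvMixColumns on one 4-byte column. The Equivalent Inverse Cipher
// applies this to rounds 1..Nr-1 of the decryption key schedule;
// vncipher-based decryption would need the round keys without it.
static void InvMixColumn(uint8_t c[4])
{
    const uint8_t a0 = c[0], a1 = c[1], a2 = c[2], a3 = c[3];
    c[0] = gmul(a0,14) ^ gmul(a1,11) ^ gmul(a2,13) ^ gmul(a3, 9);
    c[1] = gmul(a0, 9) ^ gmul(a1,14) ^ gmul(a2,11) ^ gmul(a3,13);
    c[2] = gmul(a0,13) ^ gmul(a1, 9) ^ gmul(a2,14) ^ gmul(a3,11);
    c[3] = gmul(a0,11) ^ gmul(a1,13) ^ gmul(a2, 9) ^ gmul(a3,14);
}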

View File

@@ -24,7 +24,7 @@
#endif
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include <altivec.h>
# undef vector
# undef pixel
# undef bool
@@ -33,11 +33,11 @@
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# if defined(CRYPTOPP_XLC_VERSION)
// #include <builtins.h>
typedef __vector unsigned char uint8x16_p8;
typedef __vector unsigned long long uint64x2_p8;
#elif defined(CRYPTOPP_GCC_VERSION)
typedef __vector unsigned char uint8x16_p8;
typedef __vector unsigned long long uint64x2_p8;
#endif
#endif
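<altivec.h> can define vector, pixel and bool as macros, which collides with C++ proper (std::vector, the bool keyword); hence the #undef trio above, with the reserved __vector spelling still usable afterwards. A hedged standalone illustration:

#include <altivec.h>
#undef vector   // restore std::vector and friends
#undef pixel
#undef bool
#include <vector>

__vector unsigned char vc = {0};  // AltiVec type via the reserved spelling
std::vector<int> sv;              // standard container, no longer shadowed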

View File

@@ -10,34 +10,51 @@
// Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping
// mbedTLS under a debugger helped us determine problems with our
// subkey generation and scheduling.
//
// AltiVec and Power8 code based on http://github.com/noloader/AES-Power8
//
#include "pch.h" #include "pch.h"
#include "config.h" #include "config.h"
#include "misc.h" #include "misc.h"
// Clang and GCC hoops... // We set CRYPTOPP_ARM_AES_AVAILABLE based on compiler version.
// If the crypto is not available, then we have to disable it here.
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER)) #if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER))
# undef CRYPTOPP_ARM_AES_AVAILABLE # undef CRYPTOPP_ARM_AES_AVAILABLE
#endif #endif
// We set CRYPTOPP_POWER8_CRYPTO_AVAILABLE based on compiler version.
// If the crypto is not available, then we have to disable it here.
#if !(defined(__CRYPTO) || defined(_ARCH_PWR8) || defined(_ARCH_PWR9))
# undef CRYPTOPP_POWER8_CRYPTO_AVAILABLE
#endif
#if (CRYPTOPP_AESNI_AVAILABLE)
// Hack... We are supposed to use <nmmintrin.h>. GCC 4.8, LLVM Clang 3.5
// and Apple Clang 6.0 conflate SSE4.1 and SSE4.2. If we use <nmmintrin.h>
// then the compile fails with "SSE4.2 instruction set not enabled". Also see
// http://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html.
# include <smmintrin.h>
# include <wmmintrin.h>
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
# include <arm_neon.h>
#endif
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
// intrinsics without <arm_acle.h> included.
#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include <altivec.h>
# undef vector
# undef pixel
# undef bool
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
@@ -141,6 +158,8 @@ bool CPU_ProbeAES()
}
#endif // ARM32 or ARM64
// ***************************** ARMv8 ***************************** //
#if (CRYPTOPP_ARM_AES_AVAILABLE)
inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
@@ -306,6 +325,13 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
const word32 s_one[] = {0, 0, 0, 1<<24};
/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
const word32 rcon[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36
};
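As a sanity check (ours, not part of the patch), each rcon entry is the previous one doubled in GF(2^8), reduced by the AES polynomial 0x11B when the high bit overflows:

inline bool CheckRcon()
{
    word32 r = 1;
    for (unsigned int i = 0; i < 10; ++i)
    {
        if (rcon[i] != r) return false;
        r = (r & 0x80) ? ((r << 1) ^ 0x11B) : (r << 1);
    }
    return true;
}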
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -438,6 +464,8 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subKeys, size_t ro
#endif // CRYPTOPP_ARM_AES_AVAILABLE
// ***************************** AES-NI ***************************** //
#if (CRYPTOPP_AESNI_AVAILABLE)
inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
@@ -634,8 +662,8 @@ size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
{
// SunCC workaround
MAYBE_CONST word32* sk = MAYBE_UNCONST_CAST(word32*, subKeys);
MAYBE_CONST byte* ib = MAYBE_UNCONST_CAST(byte*, inBlocks);
MAYBE_CONST byte* xb = MAYBE_UNCONST_CAST(byte*, xorBlocks);
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Enc_Block, AESNI_Enc_4_Blocks,
sk, rounds, ib, xb, outBlocks, length, flags);
@@ -645,8 +673,8 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
MAYBE_CONST word32* sk = MAYBE_UNCONST_CAST(word32*, subKeys);
MAYBE_CONST byte* ib = MAYBE_UNCONST_CAST(byte*, inBlocks);
MAYBE_CONST byte* xb = MAYBE_UNCONST_CAST(byte*, xorBlocks);
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Dec_Block, AESNI_Dec_4_Blocks,
sk, rounds, ib, xb, outBlocks, length, flags);
@@ -734,4 +762,271 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds)
}
#endif // CRYPTOPP_AESNI_AVAILABLE
// ***************************** Power 8 ***************************** //
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
#if defined(CRYPTOPP_XLC_VERSION)
// #include <builtins.h>
typedef __vector unsigned char uint8x16_p8;
typedef __vector unsigned long long uint64x2_p8;
#elif defined(CRYPTOPP_GCC_VERSION)
typedef __vector unsigned char uint8x16_p8;
typedef __vector unsigned long long uint64x2_p8;
#endif
/* Reverses a 16-byte array as needed */
void ByteReverseArrayLE(byte dest[16], const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
vec_st(vec_reve(vec_ld(0, src)), 0, dest);
#elif defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, dest);
#else
if (src != dest)
std::memcpy(dest, src, 16);
#endif
}
void ByteReverseArrayLE(byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
vec_st(vec_reve(vec_ld(0, src)), 0, src);
#elif defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, src);
#endif
}
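On big-endian targets the first overload degenerates to a copy; on little-endian it is a full 16-byte reversal. A portable scalar equivalent of the little-endian path (illustration only):

inline void ByteReverse16_Scalar(byte dest[16], const byte src[16])
{
    for (unsigned int i = 0; i < 16; ++i)
        dest[i] = src[15 - i];
}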
uint8x16_p8 Load8x16(const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return vec_xl_be(0, s);
# else
return vec_xl(0, s);
# endif
#else
/* GCC, Clang, etc */
return (uint8x16_p8)vec_vsx_ld(0, src);
#endif
}
void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* IBM XL C/C++ compiler */
# if defined(IS_LITTLE_ENDIAN)
vec_xst_be(src, 0, dest);
# else
vec_xst(src, 0, dest);
# endif
#else
/* GCC, Clang, etc */
vec_vsx_st(src, 0, dest);
#endif
}
uint64x2_p8 Load64x2(const uint8_t src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
uint8_t* s = (uint8_t*)src;
# if defined(IS_LITTLE_ENDIAN)
return (uint64x2_p8)vec_xl_be(0, s);
# else
return (uint64x2_p8)vec_xl(0, s);
# endif
#else
/* GCC, Clang, etc */
# if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm(vec_vsx_ld(0, src), zero, mask);
# else
return (uint64x2_p8)vec_vsx_ld(0, src);
# endif
#endif
}
void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
/* IBM XL C/C++ compiler */
# if defined(IS_LITTLE_ENDIAN)
vec_xst_be((uint8x16_p8)src, 0, dest);
# else
vec_xst((uint8x16_p8)src, 0, dest);
# endif
#else
/* GCC, Clang, etc */
# if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
vec_vsx_st(vec_perm((uint8x16_p8)src, zero, mask), 0, dest);
# else
vec_vsx_st((uint8x16_p8)src, 0, dest);
# endif
#endif
}
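The loads and stores are designed to invert each other regardless of endianness, which a quick smoke test can confirm (test sketch, not library code; assumes <cstring> is available via the headers above):

inline bool TestLoadStore64x2()
{
    byte in[16], out[16];
    for (unsigned int i = 0; i < 16; ++i)
        in[i] = static_cast<byte>(i);
    Store64x2(Load64x2(in), out);   // should round-trip to the identity
    return std::memcmp(in, out, 16) == 0;
}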
//////////////////////////////////////////////////////////////////
#if defined(CRYPTOPP_XLC_VERSION)
typedef uint8x16_p8 VectorType;
#elif defined(CRYPTOPP_GCC_VERSION)
typedef uint64x2_p8 VectorType;
#else
# error "Unsupported compiler: need CRYPTOPP_XLC_VERSION or CRYPTOPP_GCC_VERSION"
#endif
inline VectorType VectorLoad(const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(src);
#elif defined(CRYPTOPP_GCC_VERSION)
return Load64x2(src);
#endif
}
inline VectorType VectorLoadAligned(const byte vec[16])
{
return (VectorType)vec_ld(0, vec);
}
inline VectorType VectorLoadAligned(int off, const byte vec[16])
{
return (VectorType)vec_ld(off, vec);
}
inline void VectorStore(const VectorType& src, byte dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Store8x16(src, dest);
#elif defined(CRYPTOPP_GCC_VERSION)
return Store64x2(src, dest);
#endif
}
template <class T1, class T2>
inline T1 VectorXor(const T1& vec1, const T2& vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
template <class T1, class T2>
inline T1 VectorAdd(const T1& vec1, const T2& vec2)
{
return (T1)vec_add(vec1, (T1)vec2);
}
template <class T1, class T2>
inline T1 VectorEncrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipher(state, key);
#elif defined(CRYPTOPP_GCC_VERSION)
return __builtin_crypto_vcipher(state, (T1)key);
#else
CRYPTOPP_ASSERT(0);
#endif
}
template <class T1, class T2>
inline T1 VectorEncryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipherlast(state, key);
#elif defined(CRYPTOPP_GCC_VERSION)
return __builtin_crypto_vcipherlast(state, (T1)key);
#else
CRYPTOPP_ASSERT(0);
#endif
}
template <class T1, class T2>
inline T1 VectorDecrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipher(state, key);
#elif defined(CRYPTOPP_GCC_VERSION)
return __builtin_crypto_vncipher(state, (T1)key);
#else
CRYPTOPP_ASSERT(0);
#endif
}
template <class T1, class T2>
inline T1 VectorDecryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipherlast(state, key);
#elif defined(CRYPTOPP_GCC_VERSION)
return __builtin_crypto_vncipherlast(state, (T1)key);
#else
CRYPTOPP_ASSERT(0);
#endif
}
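One vcipher round computes AddRoundKey(MixColumns(ShiftRows(SubBytes(state))), key), and vcipherlast omits MixColumns, exactly matching AES's final round. A small helper showing how the wrappers chain (our sketch; the block routines below inline this tail directly):

template <class T>
inline T FinalTwoRounds(T state, const T& rk_n1, const T& rk_n)
{
    state = VectorEncrypt(state, rk_n1);    // full round, with MixColumns
    return VectorEncryptLast(state, rk_n);  // final round, no MixColumns
}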
//////////////////////////////////////////////////////////////////
void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
{
s = VectorEncrypt(s, VectorLoadAligned( i*16, keys));
s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
}
s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
// According to benchmarks this is a tad bit slower
// if (xorBlock)
// s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
}
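A hypothetical caller (sketch only): given a 16-byte-aligned key schedule that has already been byte-reversed on little-endian systems (see the UncheckedSetKey change below), single-block encryption reduces to:

inline void EncryptOneBlock_POWER8(const word32 *subkeys, size_t rounds, byte block[16])
{
    // NULLPTR for xorBlock selects the zero-vector path above.
    Rijndael_Enc_ProcessAndXorBlock_POWER8(subkeys, rounds, block, NULLPTR, block);
}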
void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
{
s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
}
s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
VectorStore(s, outBlock);
}
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
NAMESPACE_END

View File

@@ -237,6 +237,17 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]);
extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
#endif
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
{
AssertValidKeyLength(keyLen);
@@ -267,7 +278,8 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
while (true)
{
temp = rk[keyLen/4-1];
word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
(word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@@ -307,10 +319,11 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
if (!s_TdFilled)
FillDecTable();
#define InverseMixColumn(x) \
TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
unsigned int i, j;
for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
{
temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
@@ -338,6 +351,21 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
if (HasAES())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
if (IsForwardTransformation() && HasAES())
{
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
// VSX registers are big-endian. The entire subkey table must be byte
// reversed on little-endian systems to ensure it loads properly.
// I believe we should do this when msr.le=1, but I can't find an
// intrinsic to access the machine status register. In the meantime
// we will do it anytime IS_LITTLE_ENDIAN is true.
byte * ptr = reinterpret_cast<byte*>(rk);
for (unsigned int i=0; i<=m_rounds; i++)
ByteReverseArrayLE(ptr+i*16);
}
#endif
}
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -362,6 +390,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
}
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Enc_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
return;
}
#endif
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3;
@@ -448,6 +484,14 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
}
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
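// Note: decryption stays disabled (the "&& 0") until the Equivalent
// Inverse Cipher key schedule issue noted in the commit message is
// reworked; vncipher needs the straight forward schedule.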
if (HasAES())
{
(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
return;
}
#endif
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3;